diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24849 @@ +{ + "best_global_step": 18000, + "best_metric": 87.30510951216766, + "best_model_checkpoint": "checkpoints_7B_lora/en-chv-final/checkpoint-18000", + "epoch": 1.1691608765366115, + "eval_steps": 1000, + "global_step": 35000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00033404596472474615, + "grad_norm": 1.7456389665603638, + "learning_rate": 4.008016032064128e-07, + "loss": 2.9531, + "step": 10 + }, + { + "epoch": 0.0006680919294494923, + "grad_norm": 1.855299472808838, + "learning_rate": 8.461367178802049e-07, + "loss": 2.9181, + "step": 20 + }, + { + "epoch": 0.0010021378941742383, + "grad_norm": 2.413423776626587, + "learning_rate": 1.291471832553997e-06, + "loss": 2.9701, + "step": 30 + }, + { + "epoch": 0.0013361838588989846, + "grad_norm": 2.179948568344116, + "learning_rate": 1.7368069472277891e-06, + "loss": 2.9165, + "step": 40 + }, + { + "epoch": 0.0016702298236237307, + "grad_norm": 2.240615129470825, + "learning_rate": 2.182142061901581e-06, + "loss": 2.8418, + "step": 50 + }, + { + "epoch": 0.0020042757883484766, + "grad_norm": 2.843095541000366, + "learning_rate": 2.627477176575373e-06, + "loss": 2.9464, + "step": 60 + }, + { + "epoch": 0.0023383217530732227, + "grad_norm": 2.6982319355010986, + "learning_rate": 3.072812291249165e-06, + "loss": 2.8672, + "step": 70 + }, + { + "epoch": 0.002672367717797969, + "grad_norm": 2.2579667568206787, + "learning_rate": 3.5181474059229573e-06, + "loss": 2.794, + "step": 80 + }, + { + "epoch": 0.0030064136825227153, + "grad_norm": 2.052929401397705, + "learning_rate": 3.9634825205967495e-06, + "loss": 2.6249, + "step": 90 + }, + { + "epoch": 0.0033404596472474614, + "grad_norm": 1.9322367906570435, + "learning_rate": 4.408817635270541e-06, + "loss": 2.5383, + "step": 100 + }, + { + "epoch": 0.0036745056119722075, + "grad_norm": 1.8608211278915405, + "learning_rate": 4.854152749944333e-06, + "loss": 2.439, + "step": 110 + }, + { + "epoch": 0.004008551576696953, + "grad_norm": 1.3612754344940186, + "learning_rate": 5.2994878646181255e-06, + "loss": 2.1968, + "step": 120 + }, + { + "epoch": 0.004342597541421699, + "grad_norm": 1.087594985961914, + "learning_rate": 5.744822979291917e-06, + "loss": 2.1285, + "step": 130 + }, + { + "epoch": 0.004676643506146445, + "grad_norm": 0.9390897750854492, + "learning_rate": 6.19015809396571e-06, + "loss": 1.9934, + "step": 140 + }, + { + "epoch": 0.0050106894708711915, + "grad_norm": 1.0713378190994263, + "learning_rate": 6.635493208639501e-06, + "loss": 1.9341, + "step": 150 + }, + { + "epoch": 0.005344735435595938, + "grad_norm": 0.7520629167556763, + "learning_rate": 7.080828323313293e-06, + "loss": 1.7827, + "step": 160 + }, + { + "epoch": 0.0056787814003206845, + "grad_norm": 0.6523362994194031, + "learning_rate": 7.526163437987085e-06, + "loss": 1.6917, + "step": 170 + }, + { + "epoch": 0.006012827365045431, + "grad_norm": 0.5774423480033875, + "learning_rate": 7.971498552660878e-06, + "loss": 1.7051, + "step": 180 + }, + { + "epoch": 0.006346873329770177, + "grad_norm": 0.6960276961326599, + "learning_rate": 8.41683366733467e-06, + "loss": 1.6031, + "step": 190 + }, + { + "epoch": 0.006680919294494923, + "grad_norm": 0.49967119097709656, + "learning_rate": 8.862168782008463e-06, + "loss": 1.5597, + "step": 200 + }, + { + "epoch": 0.007014965259219669, + "grad_norm": 0.4586268663406372, + "learning_rate": 9.307503896682255e-06, + "loss": 1.5679, + "step": 210 + }, + { + "epoch": 0.007349011223944415, + "grad_norm": 0.5391744375228882, + "learning_rate": 9.752839011356046e-06, + "loss": 1.5083, + "step": 220 + }, + { + "epoch": 0.007683057188669161, + "grad_norm": 0.5901408791542053, + "learning_rate": 1.0198174126029838e-05, + "loss": 1.5092, + "step": 230 + }, + { + "epoch": 0.008017103153393906, + "grad_norm": 0.6198647618293762, + "learning_rate": 1.064350924070363e-05, + "loss": 1.461, + "step": 240 + }, + { + "epoch": 0.008351149118118652, + "grad_norm": 0.7310040593147278, + "learning_rate": 1.1088844355377421e-05, + "loss": 1.4221, + "step": 250 + }, + { + "epoch": 0.008685195082843399, + "grad_norm": 0.6236663460731506, + "learning_rate": 1.1534179470051213e-05, + "loss": 1.442, + "step": 260 + }, + { + "epoch": 0.009019241047568145, + "grad_norm": 0.6342867016792297, + "learning_rate": 1.1979514584725007e-05, + "loss": 1.3693, + "step": 270 + }, + { + "epoch": 0.00935328701229289, + "grad_norm": 0.7100450396537781, + "learning_rate": 1.2424849699398798e-05, + "loss": 1.3837, + "step": 280 + }, + { + "epoch": 0.009687332977017637, + "grad_norm": 0.6451166868209839, + "learning_rate": 1.2870184814072588e-05, + "loss": 1.3974, + "step": 290 + }, + { + "epoch": 0.010021378941742383, + "grad_norm": 0.870101809501648, + "learning_rate": 1.3315519928746382e-05, + "loss": 1.3877, + "step": 300 + }, + { + "epoch": 0.010355424906467129, + "grad_norm": 0.8575422763824463, + "learning_rate": 1.3760855043420174e-05, + "loss": 1.4335, + "step": 310 + }, + { + "epoch": 0.010689470871191877, + "grad_norm": 0.8088720440864563, + "learning_rate": 1.4206190158093965e-05, + "loss": 1.3586, + "step": 320 + }, + { + "epoch": 0.011023516835916623, + "grad_norm": 0.8774932622909546, + "learning_rate": 1.4651525272767757e-05, + "loss": 1.3121, + "step": 330 + }, + { + "epoch": 0.011357562800641369, + "grad_norm": 0.8251705765724182, + "learning_rate": 1.509686038744155e-05, + "loss": 1.313, + "step": 340 + }, + { + "epoch": 0.011691608765366115, + "grad_norm": 0.7873732447624207, + "learning_rate": 1.5542195502115342e-05, + "loss": 1.3467, + "step": 350 + }, + { + "epoch": 0.012025654730090861, + "grad_norm": 0.8205036520957947, + "learning_rate": 1.5987530616789136e-05, + "loss": 1.3221, + "step": 360 + }, + { + "epoch": 0.012359700694815607, + "grad_norm": 1.0820122957229614, + "learning_rate": 1.6432865731462926e-05, + "loss": 1.2746, + "step": 370 + }, + { + "epoch": 0.012693746659540353, + "grad_norm": 1.0945748090744019, + "learning_rate": 1.687820084613672e-05, + "loss": 1.2637, + "step": 380 + }, + { + "epoch": 0.0130277926242651, + "grad_norm": 1.1396582126617432, + "learning_rate": 1.732353596081051e-05, + "loss": 1.2608, + "step": 390 + }, + { + "epoch": 0.013361838588989846, + "grad_norm": 0.9956055283546448, + "learning_rate": 1.7768871075484303e-05, + "loss": 1.3235, + "step": 400 + }, + { + "epoch": 0.013695884553714592, + "grad_norm": 1.1046253442764282, + "learning_rate": 1.8214206190158096e-05, + "loss": 1.2938, + "step": 410 + }, + { + "epoch": 0.014029930518439338, + "grad_norm": 1.444706916809082, + "learning_rate": 1.8659541304831886e-05, + "loss": 1.2908, + "step": 420 + }, + { + "epoch": 0.014363976483164084, + "grad_norm": 1.29774808883667, + "learning_rate": 1.910487641950568e-05, + "loss": 1.236, + "step": 430 + }, + { + "epoch": 0.01469802244788883, + "grad_norm": 1.5169700384140015, + "learning_rate": 1.9550211534179473e-05, + "loss": 1.2691, + "step": 440 + }, + { + "epoch": 0.015032068412613576, + "grad_norm": 1.3256111145019531, + "learning_rate": 1.9995546648853263e-05, + "loss": 1.2475, + "step": 450 + }, + { + "epoch": 0.015366114377338322, + "grad_norm": 1.3810324668884277, + "learning_rate": 2.0440881763527056e-05, + "loss": 1.2547, + "step": 460 + }, + { + "epoch": 0.015700160342063067, + "grad_norm": 1.5545682907104492, + "learning_rate": 2.088621687820085e-05, + "loss": 1.2229, + "step": 470 + }, + { + "epoch": 0.016034206306787813, + "grad_norm": 1.5830481052398682, + "learning_rate": 2.133155199287464e-05, + "loss": 1.2297, + "step": 480 + }, + { + "epoch": 0.01636825227151256, + "grad_norm": 1.501063585281372, + "learning_rate": 2.177688710754843e-05, + "loss": 1.2041, + "step": 490 + }, + { + "epoch": 0.016702298236237305, + "grad_norm": 1.4076383113861084, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.1938, + "step": 500 + }, + { + "epoch": 0.01703634420096205, + "grad_norm": 1.6274583339691162, + "learning_rate": 2.2667557336896013e-05, + "loss": 1.2074, + "step": 510 + }, + { + "epoch": 0.017370390165686797, + "grad_norm": 1.5699708461761475, + "learning_rate": 2.3112892451569807e-05, + "loss": 1.1618, + "step": 520 + }, + { + "epoch": 0.017704436130411543, + "grad_norm": 1.7313867807388306, + "learning_rate": 2.35582275662436e-05, + "loss": 1.1554, + "step": 530 + }, + { + "epoch": 0.01803848209513629, + "grad_norm": 1.3827382326126099, + "learning_rate": 2.400356268091739e-05, + "loss": 1.1924, + "step": 540 + }, + { + "epoch": 0.018372528059861035, + "grad_norm": 1.7067400217056274, + "learning_rate": 2.4448897795591184e-05, + "loss": 1.1563, + "step": 550 + }, + { + "epoch": 0.01870657402458578, + "grad_norm": 1.6879206895828247, + "learning_rate": 2.4894232910264974e-05, + "loss": 1.1471, + "step": 560 + }, + { + "epoch": 0.019040619989310528, + "grad_norm": 1.8333022594451904, + "learning_rate": 2.533956802493877e-05, + "loss": 1.1571, + "step": 570 + }, + { + "epoch": 0.019374665954035274, + "grad_norm": 1.7436424493789673, + "learning_rate": 2.578490313961256e-05, + "loss": 1.1277, + "step": 580 + }, + { + "epoch": 0.01970871191876002, + "grad_norm": 1.9212511777877808, + "learning_rate": 2.6230238254286354e-05, + "loss": 1.1528, + "step": 590 + }, + { + "epoch": 0.020042757883484766, + "grad_norm": 1.8021934032440186, + "learning_rate": 2.6675573368960144e-05, + "loss": 1.1608, + "step": 600 + }, + { + "epoch": 0.020376803848209512, + "grad_norm": 1.930381178855896, + "learning_rate": 2.7120908483633934e-05, + "loss": 1.1979, + "step": 610 + }, + { + "epoch": 0.020710849812934258, + "grad_norm": 1.9796295166015625, + "learning_rate": 2.7566243598307727e-05, + "loss": 1.1191, + "step": 620 + }, + { + "epoch": 0.021044895777659008, + "grad_norm": 1.8207927942276, + "learning_rate": 2.8011578712981518e-05, + "loss": 1.0647, + "step": 630 + }, + { + "epoch": 0.021378941742383754, + "grad_norm": 2.017171859741211, + "learning_rate": 2.8456913827655314e-05, + "loss": 1.0555, + "step": 640 + }, + { + "epoch": 0.0217129877071085, + "grad_norm": 2.401942729949951, + "learning_rate": 2.89022489423291e-05, + "loss": 1.1129, + "step": 650 + }, + { + "epoch": 0.022047033671833246, + "grad_norm": 1.6404247283935547, + "learning_rate": 2.9347584057002898e-05, + "loss": 1.1042, + "step": 660 + }, + { + "epoch": 0.022381079636557992, + "grad_norm": 1.8265182971954346, + "learning_rate": 2.9792919171676688e-05, + "loss": 1.126, + "step": 670 + }, + { + "epoch": 0.022715125601282738, + "grad_norm": 2.1850333213806152, + "learning_rate": 3.023825428635048e-05, + "loss": 1.1551, + "step": 680 + }, + { + "epoch": 0.023049171566007484, + "grad_norm": 2.050244092941284, + "learning_rate": 3.068358940102427e-05, + "loss": 1.0831, + "step": 690 + }, + { + "epoch": 0.02338321753073223, + "grad_norm": 2.1816062927246094, + "learning_rate": 3.1128924515698065e-05, + "loss": 1.1301, + "step": 700 + }, + { + "epoch": 0.023717263495456976, + "grad_norm": 2.1626248359680176, + "learning_rate": 3.1574259630371855e-05, + "loss": 1.1301, + "step": 710 + }, + { + "epoch": 0.024051309460181722, + "grad_norm": 2.2363715171813965, + "learning_rate": 3.201959474504565e-05, + "loss": 1.0605, + "step": 720 + }, + { + "epoch": 0.02438535542490647, + "grad_norm": 2.54128360748291, + "learning_rate": 3.246492985971944e-05, + "loss": 1.0583, + "step": 730 + }, + { + "epoch": 0.024719401389631215, + "grad_norm": 2.2511777877807617, + "learning_rate": 3.291026497439323e-05, + "loss": 1.0946, + "step": 740 + }, + { + "epoch": 0.02505344735435596, + "grad_norm": 2.0335311889648438, + "learning_rate": 3.335560008906702e-05, + "loss": 1.0852, + "step": 750 + }, + { + "epoch": 0.025387493319080707, + "grad_norm": 2.362734317779541, + "learning_rate": 3.380093520374082e-05, + "loss": 1.0618, + "step": 760 + }, + { + "epoch": 0.025721539283805453, + "grad_norm": 2.2724108695983887, + "learning_rate": 3.424627031841461e-05, + "loss": 1.0633, + "step": 770 + }, + { + "epoch": 0.0260555852485302, + "grad_norm": 2.307868003845215, + "learning_rate": 3.4691605433088405e-05, + "loss": 1.0844, + "step": 780 + }, + { + "epoch": 0.026389631213254945, + "grad_norm": 2.148146152496338, + "learning_rate": 3.5136940547762195e-05, + "loss": 1.0754, + "step": 790 + }, + { + "epoch": 0.02672367717797969, + "grad_norm": 2.7365825176239014, + "learning_rate": 3.5582275662435985e-05, + "loss": 1.0358, + "step": 800 + }, + { + "epoch": 0.027057723142704437, + "grad_norm": 2.3447327613830566, + "learning_rate": 3.6027610777109776e-05, + "loss": 1.058, + "step": 810 + }, + { + "epoch": 0.027391769107429183, + "grad_norm": 2.7216796875, + "learning_rate": 3.6472945891783566e-05, + "loss": 1.0919, + "step": 820 + }, + { + "epoch": 0.02772581507215393, + "grad_norm": 2.737053632736206, + "learning_rate": 3.691828100645736e-05, + "loss": 1.0519, + "step": 830 + }, + { + "epoch": 0.028059861036878676, + "grad_norm": 2.32270884513855, + "learning_rate": 3.736361612113115e-05, + "loss": 1.0188, + "step": 840 + }, + { + "epoch": 0.028393907001603422, + "grad_norm": 2.63076114654541, + "learning_rate": 3.780895123580494e-05, + "loss": 1.0016, + "step": 850 + }, + { + "epoch": 0.028727952966328168, + "grad_norm": 2.236703634262085, + "learning_rate": 3.825428635047873e-05, + "loss": 1.0372, + "step": 860 + }, + { + "epoch": 0.029061998931052914, + "grad_norm": 2.6094136238098145, + "learning_rate": 3.869962146515253e-05, + "loss": 1.0713, + "step": 870 + }, + { + "epoch": 0.02939604489577766, + "grad_norm": 2.1688013076782227, + "learning_rate": 3.914495657982632e-05, + "loss": 1.0809, + "step": 880 + }, + { + "epoch": 0.029730090860502406, + "grad_norm": 2.5162503719329834, + "learning_rate": 3.9590291694500116e-05, + "loss": 1.0224, + "step": 890 + }, + { + "epoch": 0.030064136825227152, + "grad_norm": 2.48787260055542, + "learning_rate": 4.0035626809173906e-05, + "loss": 0.9487, + "step": 900 + }, + { + "epoch": 0.0303981827899519, + "grad_norm": 2.6829824447631836, + "learning_rate": 4.0480961923847696e-05, + "loss": 1.0133, + "step": 910 + }, + { + "epoch": 0.030732228754676644, + "grad_norm": 2.332667589187622, + "learning_rate": 4.0926297038521486e-05, + "loss": 1.0457, + "step": 920 + }, + { + "epoch": 0.03106627471940139, + "grad_norm": 2.3953633308410645, + "learning_rate": 4.137163215319528e-05, + "loss": 0.9893, + "step": 930 + }, + { + "epoch": 0.03140032068412613, + "grad_norm": 2.1512844562530518, + "learning_rate": 4.181696726786907e-05, + "loss": 0.9658, + "step": 940 + }, + { + "epoch": 0.03173436664885088, + "grad_norm": 2.264373540878296, + "learning_rate": 4.226230238254287e-05, + "loss": 1.0072, + "step": 950 + }, + { + "epoch": 0.032068412613575625, + "grad_norm": 2.703538179397583, + "learning_rate": 4.270763749721666e-05, + "loss": 0.991, + "step": 960 + }, + { + "epoch": 0.032402458578300375, + "grad_norm": 2.6473569869995117, + "learning_rate": 4.315297261189045e-05, + "loss": 0.9911, + "step": 970 + }, + { + "epoch": 0.03273650454302512, + "grad_norm": 2.614718198776245, + "learning_rate": 4.359830772656424e-05, + "loss": 0.9665, + "step": 980 + }, + { + "epoch": 0.03307055050774987, + "grad_norm": 2.466975212097168, + "learning_rate": 4.404364284123804e-05, + "loss": 1.03, + "step": 990 + }, + { + "epoch": 0.03340459647247461, + "grad_norm": 3.7919394969940186, + "learning_rate": 4.448897795591183e-05, + "loss": 0.9568, + "step": 1000 + }, + { + "epoch": 0.03340459647247461, + "eval_chrf": 76.56471572800406, + "eval_loss": 1.65837562084198, + "eval_runtime": 313.0724, + "eval_samples_per_second": 0.319, + "eval_steps_per_second": 0.013, + "step": 1000 + }, + { + "epoch": 0.03373864243719936, + "grad_norm": 2.6076509952545166, + "learning_rate": 4.493431307058562e-05, + "loss": 1.0077, + "step": 1010 + }, + { + "epoch": 0.0340726884019241, + "grad_norm": 2.3477859497070312, + "learning_rate": 4.537964818525941e-05, + "loss": 0.992, + "step": 1020 + }, + { + "epoch": 0.03440673436664885, + "grad_norm": 2.399259328842163, + "learning_rate": 4.58249832999332e-05, + "loss": 0.9813, + "step": 1030 + }, + { + "epoch": 0.034740780331373594, + "grad_norm": 2.2360477447509766, + "learning_rate": 4.6270318414606994e-05, + "loss": 0.9827, + "step": 1040 + }, + { + "epoch": 0.035074826296098344, + "grad_norm": 2.8738391399383545, + "learning_rate": 4.6715653529280784e-05, + "loss": 0.9825, + "step": 1050 + }, + { + "epoch": 0.035408872260823086, + "grad_norm": 2.431370258331299, + "learning_rate": 4.716098864395458e-05, + "loss": 0.9684, + "step": 1060 + }, + { + "epoch": 0.035742918225547836, + "grad_norm": 2.0929181575775146, + "learning_rate": 4.760632375862837e-05, + "loss": 0.9599, + "step": 1070 + }, + { + "epoch": 0.03607696419027258, + "grad_norm": 2.4185545444488525, + "learning_rate": 4.805165887330216e-05, + "loss": 0.9642, + "step": 1080 + }, + { + "epoch": 0.03641101015499733, + "grad_norm": 2.3755533695220947, + "learning_rate": 4.849699398797595e-05, + "loss": 0.9737, + "step": 1090 + }, + { + "epoch": 0.03674505611972207, + "grad_norm": 2.5884833335876465, + "learning_rate": 4.894232910264975e-05, + "loss": 0.9899, + "step": 1100 + }, + { + "epoch": 0.03707910208444682, + "grad_norm": 2.278193473815918, + "learning_rate": 4.938766421732354e-05, + "loss": 0.9289, + "step": 1110 + }, + { + "epoch": 0.03741314804917156, + "grad_norm": 2.4764041900634766, + "learning_rate": 4.9832999331997335e-05, + "loss": 0.9536, + "step": 1120 + }, + { + "epoch": 0.03774719401389631, + "grad_norm": 2.2487101554870605, + "learning_rate": 5.0278334446671125e-05, + "loss": 0.9953, + "step": 1130 + }, + { + "epoch": 0.038081239978621055, + "grad_norm": 2.2460713386535645, + "learning_rate": 5.072366956134491e-05, + "loss": 0.9296, + "step": 1140 + }, + { + "epoch": 0.038415285943345805, + "grad_norm": 2.527865171432495, + "learning_rate": 5.1169004676018705e-05, + "loss": 0.9411, + "step": 1150 + }, + { + "epoch": 0.03874933190807055, + "grad_norm": 2.611863851547241, + "learning_rate": 5.16143397906925e-05, + "loss": 0.948, + "step": 1160 + }, + { + "epoch": 0.0390833778727953, + "grad_norm": 3.1292872428894043, + "learning_rate": 5.20596749053663e-05, + "loss": 0.9529, + "step": 1170 + }, + { + "epoch": 0.03941742383752004, + "grad_norm": 3.00462007522583, + "learning_rate": 5.250501002004008e-05, + "loss": 0.9436, + "step": 1180 + }, + { + "epoch": 0.03975146980224479, + "grad_norm": 2.4933409690856934, + "learning_rate": 5.295034513471387e-05, + "loss": 0.9406, + "step": 1190 + }, + { + "epoch": 0.04008551576696953, + "grad_norm": 2.521202325820923, + "learning_rate": 5.339568024938767e-05, + "loss": 0.9494, + "step": 1200 + }, + { + "epoch": 0.04041956173169428, + "grad_norm": 2.6349973678588867, + "learning_rate": 5.384101536406145e-05, + "loss": 0.9171, + "step": 1210 + }, + { + "epoch": 0.040753607696419024, + "grad_norm": 2.2925546169281006, + "learning_rate": 5.428635047873525e-05, + "loss": 0.9408, + "step": 1220 + }, + { + "epoch": 0.04108765366114377, + "grad_norm": 2.7914302349090576, + "learning_rate": 5.4731685593409045e-05, + "loss": 0.9335, + "step": 1230 + }, + { + "epoch": 0.041421699625868516, + "grad_norm": 2.853599786758423, + "learning_rate": 5.5177020708082835e-05, + "loss": 0.9252, + "step": 1240 + }, + { + "epoch": 0.041755745590593266, + "grad_norm": 2.796250581741333, + "learning_rate": 5.5622355822756625e-05, + "loss": 0.9353, + "step": 1250 + }, + { + "epoch": 0.042089791555318015, + "grad_norm": 2.809563636779785, + "learning_rate": 5.6067690937430415e-05, + "loss": 0.9531, + "step": 1260 + }, + { + "epoch": 0.04242383752004276, + "grad_norm": 3.1351115703582764, + "learning_rate": 5.651302605210421e-05, + "loss": 0.9046, + "step": 1270 + }, + { + "epoch": 0.04275788348476751, + "grad_norm": 3.0374112129211426, + "learning_rate": 5.695836116677801e-05, + "loss": 0.916, + "step": 1280 + }, + { + "epoch": 0.04309192944949225, + "grad_norm": 2.6092092990875244, + "learning_rate": 5.740369628145179e-05, + "loss": 0.8688, + "step": 1290 + }, + { + "epoch": 0.043425975414217, + "grad_norm": 2.3511579036712646, + "learning_rate": 5.784903139612559e-05, + "loss": 0.939, + "step": 1300 + }, + { + "epoch": 0.04376002137894174, + "grad_norm": 2.560506582260132, + "learning_rate": 5.829436651079938e-05, + "loss": 0.9376, + "step": 1310 + }, + { + "epoch": 0.04409406734366649, + "grad_norm": 2.682102918624878, + "learning_rate": 5.8739701625473176e-05, + "loss": 0.9214, + "step": 1320 + }, + { + "epoch": 0.044428113308391234, + "grad_norm": 3.231112241744995, + "learning_rate": 5.918503674014696e-05, + "loss": 0.9441, + "step": 1330 + }, + { + "epoch": 0.044762159273115984, + "grad_norm": 2.4802374839782715, + "learning_rate": 5.9630371854820756e-05, + "loss": 0.9523, + "step": 1340 + }, + { + "epoch": 0.04509620523784073, + "grad_norm": 2.7626960277557373, + "learning_rate": 6.007570696949455e-05, + "loss": 0.9365, + "step": 1350 + }, + { + "epoch": 0.045430251202565476, + "grad_norm": 2.0990278720855713, + "learning_rate": 6.052104208416834e-05, + "loss": 0.9103, + "step": 1360 + }, + { + "epoch": 0.04576429716729022, + "grad_norm": 2.1792502403259277, + "learning_rate": 6.0966377198842126e-05, + "loss": 0.8962, + "step": 1370 + }, + { + "epoch": 0.04609834313201497, + "grad_norm": 2.1209568977355957, + "learning_rate": 6.141171231351592e-05, + "loss": 0.8699, + "step": 1380 + }, + { + "epoch": 0.04643238909673971, + "grad_norm": 2.8365020751953125, + "learning_rate": 6.185704742818971e-05, + "loss": 0.9099, + "step": 1390 + }, + { + "epoch": 0.04676643506146446, + "grad_norm": 2.3414621353149414, + "learning_rate": 6.230238254286352e-05, + "loss": 0.9177, + "step": 1400 + }, + { + "epoch": 0.0471004810261892, + "grad_norm": 2.220017194747925, + "learning_rate": 6.274771765753729e-05, + "loss": 0.9613, + "step": 1410 + }, + { + "epoch": 0.04743452699091395, + "grad_norm": 3.4616196155548096, + "learning_rate": 6.31930527722111e-05, + "loss": 0.8601, + "step": 1420 + }, + { + "epoch": 0.047768572955638695, + "grad_norm": 2.476670980453491, + "learning_rate": 6.363838788688489e-05, + "loss": 0.9241, + "step": 1430 + }, + { + "epoch": 0.048102618920363445, + "grad_norm": 2.929867744445801, + "learning_rate": 6.408372300155868e-05, + "loss": 0.9246, + "step": 1440 + }, + { + "epoch": 0.04843666488508819, + "grad_norm": 2.9315266609191895, + "learning_rate": 6.452905811623247e-05, + "loss": 0.8946, + "step": 1450 + }, + { + "epoch": 0.04877071084981294, + "grad_norm": 2.541630506515503, + "learning_rate": 6.497439323090626e-05, + "loss": 0.8523, + "step": 1460 + }, + { + "epoch": 0.04910475681453768, + "grad_norm": 2.6911709308624268, + "learning_rate": 6.541972834558006e-05, + "loss": 0.8573, + "step": 1470 + }, + { + "epoch": 0.04943880277926243, + "grad_norm": 3.1001124382019043, + "learning_rate": 6.586506346025384e-05, + "loss": 0.9516, + "step": 1480 + }, + { + "epoch": 0.04977284874398717, + "grad_norm": 2.4601104259490967, + "learning_rate": 6.631039857492764e-05, + "loss": 0.9187, + "step": 1490 + }, + { + "epoch": 0.05010689470871192, + "grad_norm": 2.3063952922821045, + "learning_rate": 6.675573368960143e-05, + "loss": 0.9136, + "step": 1500 + }, + { + "epoch": 0.050440940673436664, + "grad_norm": 2.1239025592803955, + "learning_rate": 6.720106880427522e-05, + "loss": 0.8608, + "step": 1510 + }, + { + "epoch": 0.050774986638161414, + "grad_norm": 2.236781597137451, + "learning_rate": 6.764640391894901e-05, + "loss": 0.9345, + "step": 1520 + }, + { + "epoch": 0.051109032602886156, + "grad_norm": 2.49877667427063, + "learning_rate": 6.80917390336228e-05, + "loss": 0.9189, + "step": 1530 + }, + { + "epoch": 0.051443078567610906, + "grad_norm": 2.7318644523620605, + "learning_rate": 6.85370741482966e-05, + "loss": 0.8923, + "step": 1540 + }, + { + "epoch": 0.05177712453233565, + "grad_norm": 2.471806764602661, + "learning_rate": 6.89824092629704e-05, + "loss": 0.8623, + "step": 1550 + }, + { + "epoch": 0.0521111704970604, + "grad_norm": 2.5699281692504883, + "learning_rate": 6.942774437764417e-05, + "loss": 0.8853, + "step": 1560 + }, + { + "epoch": 0.05244521646178514, + "grad_norm": 2.2799735069274902, + "learning_rate": 6.987307949231797e-05, + "loss": 0.8627, + "step": 1570 + }, + { + "epoch": 0.05277926242650989, + "grad_norm": 2.1041269302368164, + "learning_rate": 7.031841460699176e-05, + "loss": 0.8781, + "step": 1580 + }, + { + "epoch": 0.05311330839123463, + "grad_norm": 2.4859321117401123, + "learning_rate": 7.076374972166557e-05, + "loss": 0.8618, + "step": 1590 + }, + { + "epoch": 0.05344735435595938, + "grad_norm": 2.2047078609466553, + "learning_rate": 7.120908483633934e-05, + "loss": 0.8951, + "step": 1600 + }, + { + "epoch": 0.053781400320684125, + "grad_norm": 2.176332712173462, + "learning_rate": 7.165441995101313e-05, + "loss": 0.8875, + "step": 1610 + }, + { + "epoch": 0.054115446285408875, + "grad_norm": 2.4692375659942627, + "learning_rate": 7.209975506568694e-05, + "loss": 0.9085, + "step": 1620 + }, + { + "epoch": 0.05444949225013362, + "grad_norm": 2.164307117462158, + "learning_rate": 7.254509018036071e-05, + "loss": 0.8616, + "step": 1630 + }, + { + "epoch": 0.05478353821485837, + "grad_norm": 2.1938083171844482, + "learning_rate": 7.299042529503452e-05, + "loss": 0.8633, + "step": 1640 + }, + { + "epoch": 0.05511758417958311, + "grad_norm": 2.409367561340332, + "learning_rate": 7.343576040970831e-05, + "loss": 0.9029, + "step": 1650 + }, + { + "epoch": 0.05545163014430786, + "grad_norm": 2.4179251194000244, + "learning_rate": 7.38810955243821e-05, + "loss": 0.8956, + "step": 1660 + }, + { + "epoch": 0.0557856761090326, + "grad_norm": 2.190316915512085, + "learning_rate": 7.432643063905589e-05, + "loss": 0.9018, + "step": 1670 + }, + { + "epoch": 0.05611972207375735, + "grad_norm": 2.1865241527557373, + "learning_rate": 7.477176575372968e-05, + "loss": 0.8903, + "step": 1680 + }, + { + "epoch": 0.056453768038482094, + "grad_norm": 2.434356927871704, + "learning_rate": 7.521710086840348e-05, + "loss": 0.8687, + "step": 1690 + }, + { + "epoch": 0.056787814003206843, + "grad_norm": 2.286177158355713, + "learning_rate": 7.566243598307727e-05, + "loss": 0.856, + "step": 1700 + }, + { + "epoch": 0.057121859967931586, + "grad_norm": 2.1842358112335205, + "learning_rate": 7.610777109775106e-05, + "loss": 0.8251, + "step": 1710 + }, + { + "epoch": 0.057455905932656336, + "grad_norm": 2.371933698654175, + "learning_rate": 7.655310621242485e-05, + "loss": 0.853, + "step": 1720 + }, + { + "epoch": 0.05778995189738108, + "grad_norm": 2.2381701469421387, + "learning_rate": 7.699844132709864e-05, + "loss": 0.8564, + "step": 1730 + }, + { + "epoch": 0.05812399786210583, + "grad_norm": 2.1184046268463135, + "learning_rate": 7.744377644177245e-05, + "loss": 0.8404, + "step": 1740 + }, + { + "epoch": 0.05845804382683057, + "grad_norm": 2.310429334640503, + "learning_rate": 7.788911155644622e-05, + "loss": 0.8329, + "step": 1750 + }, + { + "epoch": 0.05879208979155532, + "grad_norm": 2.403240442276001, + "learning_rate": 7.833444667112003e-05, + "loss": 0.8435, + "step": 1760 + }, + { + "epoch": 0.05912613575628006, + "grad_norm": 2.22519850730896, + "learning_rate": 7.877978178579382e-05, + "loss": 0.7946, + "step": 1770 + }, + { + "epoch": 0.05946018172100481, + "grad_norm": 2.157273769378662, + "learning_rate": 7.92251169004676e-05, + "loss": 0.8083, + "step": 1780 + }, + { + "epoch": 0.059794227685729555, + "grad_norm": 2.591182231903076, + "learning_rate": 7.96704520151414e-05, + "loss": 0.8578, + "step": 1790 + }, + { + "epoch": 0.060128273650454304, + "grad_norm": 2.351850986480713, + "learning_rate": 8.011578712981519e-05, + "loss": 0.8303, + "step": 1800 + }, + { + "epoch": 0.06046231961517905, + "grad_norm": 2.2491748332977295, + "learning_rate": 8.056112224448899e-05, + "loss": 0.8945, + "step": 1810 + }, + { + "epoch": 0.0607963655799038, + "grad_norm": 2.570669412612915, + "learning_rate": 8.100645735916277e-05, + "loss": 0.8125, + "step": 1820 + }, + { + "epoch": 0.06113041154462854, + "grad_norm": 2.3529012203216553, + "learning_rate": 8.145179247383657e-05, + "loss": 0.8115, + "step": 1830 + }, + { + "epoch": 0.06146445750935329, + "grad_norm": 2.171473264694214, + "learning_rate": 8.189712758851036e-05, + "loss": 0.8224, + "step": 1840 + }, + { + "epoch": 0.06179850347407803, + "grad_norm": 2.134735345840454, + "learning_rate": 8.234246270318415e-05, + "loss": 0.8331, + "step": 1850 + }, + { + "epoch": 0.06213254943880278, + "grad_norm": 2.0588860511779785, + "learning_rate": 8.278779781785794e-05, + "loss": 0.7995, + "step": 1860 + }, + { + "epoch": 0.062466595403527524, + "grad_norm": 2.1666712760925293, + "learning_rate": 8.323313293253173e-05, + "loss": 0.8601, + "step": 1870 + }, + { + "epoch": 0.06280064136825227, + "grad_norm": 2.316314458847046, + "learning_rate": 8.367846804720553e-05, + "loss": 0.8737, + "step": 1880 + }, + { + "epoch": 0.06313468733297702, + "grad_norm": 2.1228513717651367, + "learning_rate": 8.412380316187932e-05, + "loss": 0.8196, + "step": 1890 + }, + { + "epoch": 0.06346873329770177, + "grad_norm": 1.805198073387146, + "learning_rate": 8.45691382765531e-05, + "loss": 0.8138, + "step": 1900 + }, + { + "epoch": 0.06380277926242651, + "grad_norm": 2.633667469024658, + "learning_rate": 8.50144733912269e-05, + "loss": 0.839, + "step": 1910 + }, + { + "epoch": 0.06413682522715125, + "grad_norm": 2.720019578933716, + "learning_rate": 8.54598085059007e-05, + "loss": 0.8179, + "step": 1920 + }, + { + "epoch": 0.064470871191876, + "grad_norm": 2.1298136711120605, + "learning_rate": 8.59051436205745e-05, + "loss": 0.8183, + "step": 1930 + }, + { + "epoch": 0.06480491715660075, + "grad_norm": 1.7326240539550781, + "learning_rate": 8.635047873524827e-05, + "loss": 0.8591, + "step": 1940 + }, + { + "epoch": 0.0651389631213255, + "grad_norm": 1.9519201517105103, + "learning_rate": 8.679581384992206e-05, + "loss": 0.8095, + "step": 1950 + }, + { + "epoch": 0.06547300908605024, + "grad_norm": 1.9712485074996948, + "learning_rate": 8.724114896459587e-05, + "loss": 0.825, + "step": 1960 + }, + { + "epoch": 0.06580705505077498, + "grad_norm": 2.3491787910461426, + "learning_rate": 8.768648407926966e-05, + "loss": 0.7994, + "step": 1970 + }, + { + "epoch": 0.06614110101549973, + "grad_norm": 2.047110080718994, + "learning_rate": 8.813181919394345e-05, + "loss": 0.7873, + "step": 1980 + }, + { + "epoch": 0.06647514698022448, + "grad_norm": 2.2620859146118164, + "learning_rate": 8.857715430861724e-05, + "loss": 0.8018, + "step": 1990 + }, + { + "epoch": 0.06680919294494922, + "grad_norm": 1.9324913024902344, + "learning_rate": 8.902248942329103e-05, + "loss": 0.8575, + "step": 2000 + }, + { + "epoch": 0.06680919294494922, + "eval_chrf": 79.51184204559038, + "eval_loss": 1.3767728805541992, + "eval_runtime": 302.9112, + "eval_samples_per_second": 0.33, + "eval_steps_per_second": 0.013, + "step": 2000 + }, + { + "epoch": 0.06714323890967397, + "grad_norm": 1.9122954607009888, + "learning_rate": 8.946782453796483e-05, + "loss": 0.791, + "step": 2010 + }, + { + "epoch": 0.06747728487439872, + "grad_norm": 2.108290195465088, + "learning_rate": 8.991315965263861e-05, + "loss": 0.8515, + "step": 2020 + }, + { + "epoch": 0.06781133083912347, + "grad_norm": 1.9099751710891724, + "learning_rate": 9.035849476731241e-05, + "loss": 0.8705, + "step": 2030 + }, + { + "epoch": 0.0681453768038482, + "grad_norm": 2.2484171390533447, + "learning_rate": 9.08038298819862e-05, + "loss": 0.7471, + "step": 2040 + }, + { + "epoch": 0.06847942276857295, + "grad_norm": 2.1186323165893555, + "learning_rate": 9.124916499665999e-05, + "loss": 0.8351, + "step": 2050 + }, + { + "epoch": 0.0688134687332977, + "grad_norm": 2.340143918991089, + "learning_rate": 9.169450011133378e-05, + "loss": 0.8308, + "step": 2060 + }, + { + "epoch": 0.06914751469802245, + "grad_norm": 2.2099223136901855, + "learning_rate": 9.213983522600757e-05, + "loss": 0.8004, + "step": 2070 + }, + { + "epoch": 0.06948156066274719, + "grad_norm": 1.9246491193771362, + "learning_rate": 9.258517034068137e-05, + "loss": 0.7605, + "step": 2080 + }, + { + "epoch": 0.06981560662747194, + "grad_norm": 2.3552334308624268, + "learning_rate": 9.303050545535515e-05, + "loss": 0.7771, + "step": 2090 + }, + { + "epoch": 0.07014965259219669, + "grad_norm": 2.0112524032592773, + "learning_rate": 9.347584057002895e-05, + "loss": 0.7893, + "step": 2100 + }, + { + "epoch": 0.07048369855692144, + "grad_norm": 1.9687466621398926, + "learning_rate": 9.392117568470274e-05, + "loss": 0.7866, + "step": 2110 + }, + { + "epoch": 0.07081774452164617, + "grad_norm": 2.2704460620880127, + "learning_rate": 9.436651079937654e-05, + "loss": 0.7623, + "step": 2120 + }, + { + "epoch": 0.07115179048637092, + "grad_norm": 2.038999319076538, + "learning_rate": 9.481184591405033e-05, + "loss": 0.7742, + "step": 2130 + }, + { + "epoch": 0.07148583645109567, + "grad_norm": 1.9171621799468994, + "learning_rate": 9.525718102872412e-05, + "loss": 0.7569, + "step": 2140 + }, + { + "epoch": 0.07181988241582042, + "grad_norm": 2.1922760009765625, + "learning_rate": 9.570251614339792e-05, + "loss": 0.7782, + "step": 2150 + }, + { + "epoch": 0.07215392838054516, + "grad_norm": 1.8142212629318237, + "learning_rate": 9.614785125807171e-05, + "loss": 0.776, + "step": 2160 + }, + { + "epoch": 0.0724879743452699, + "grad_norm": 2.166292190551758, + "learning_rate": 9.65931863727455e-05, + "loss": 0.8273, + "step": 2170 + }, + { + "epoch": 0.07282202030999466, + "grad_norm": 1.9324548244476318, + "learning_rate": 9.703852148741929e-05, + "loss": 0.7776, + "step": 2180 + }, + { + "epoch": 0.0731560662747194, + "grad_norm": 2.334383487701416, + "learning_rate": 9.748385660209308e-05, + "loss": 0.8133, + "step": 2190 + }, + { + "epoch": 0.07349011223944414, + "grad_norm": 1.8159606456756592, + "learning_rate": 9.792919171676688e-05, + "loss": 0.7694, + "step": 2200 + }, + { + "epoch": 0.07382415820416889, + "grad_norm": 2.1432340145111084, + "learning_rate": 9.837452683144066e-05, + "loss": 0.758, + "step": 2210 + }, + { + "epoch": 0.07415820416889364, + "grad_norm": 2.457943916320801, + "learning_rate": 9.881986194611446e-05, + "loss": 0.7649, + "step": 2220 + }, + { + "epoch": 0.07449225013361839, + "grad_norm": 2.1346874237060547, + "learning_rate": 9.926519706078825e-05, + "loss": 0.8073, + "step": 2230 + }, + { + "epoch": 0.07482629609834313, + "grad_norm": 1.9563961029052734, + "learning_rate": 9.971053217546203e-05, + "loss": 0.8385, + "step": 2240 + }, + { + "epoch": 0.07516034206306788, + "grad_norm": 2.0002083778381348, + "learning_rate": 0.00010015586729013585, + "loss": 0.7826, + "step": 2250 + }, + { + "epoch": 0.07549438802779262, + "grad_norm": 1.8347655534744263, + "learning_rate": 0.00010060120240480961, + "loss": 0.7461, + "step": 2260 + }, + { + "epoch": 0.07582843399251737, + "grad_norm": 1.9708894491195679, + "learning_rate": 0.00010104653751948341, + "loss": 0.821, + "step": 2270 + }, + { + "epoch": 0.07616247995724211, + "grad_norm": 1.7556084394454956, + "learning_rate": 0.0001014918726341572, + "loss": 0.7393, + "step": 2280 + }, + { + "epoch": 0.07649652592196686, + "grad_norm": 1.8862894773483276, + "learning_rate": 0.00010193720774883099, + "loss": 0.7678, + "step": 2290 + }, + { + "epoch": 0.07683057188669161, + "grad_norm": 2.139695882797241, + "learning_rate": 0.0001023825428635048, + "loss": 0.8105, + "step": 2300 + }, + { + "epoch": 0.07716461785141636, + "grad_norm": 1.6069021224975586, + "learning_rate": 0.00010282787797817859, + "loss": 0.7865, + "step": 2310 + }, + { + "epoch": 0.0774986638161411, + "grad_norm": 2.0706965923309326, + "learning_rate": 0.00010327321309285239, + "loss": 0.7631, + "step": 2320 + }, + { + "epoch": 0.07783270978086584, + "grad_norm": 1.7655831575393677, + "learning_rate": 0.00010371854820752618, + "loss": 0.7499, + "step": 2330 + }, + { + "epoch": 0.0781667557455906, + "grad_norm": 1.8685438632965088, + "learning_rate": 0.00010416388332219996, + "loss": 0.8085, + "step": 2340 + }, + { + "epoch": 0.07850080171031534, + "grad_norm": 1.7118691205978394, + "learning_rate": 0.00010460921843687375, + "loss": 0.7293, + "step": 2350 + }, + { + "epoch": 0.07883484767504008, + "grad_norm": 1.8911796808242798, + "learning_rate": 0.00010505455355154754, + "loss": 0.8152, + "step": 2360 + }, + { + "epoch": 0.07916889363976483, + "grad_norm": 1.8626192808151245, + "learning_rate": 0.00010549988866622134, + "loss": 0.735, + "step": 2370 + }, + { + "epoch": 0.07950293960448958, + "grad_norm": 2.0868899822235107, + "learning_rate": 0.00010594522378089513, + "loss": 0.7364, + "step": 2380 + }, + { + "epoch": 0.07983698556921433, + "grad_norm": 1.7112470865249634, + "learning_rate": 0.00010639055889556892, + "loss": 0.7486, + "step": 2390 + }, + { + "epoch": 0.08017103153393906, + "grad_norm": 1.7827435731887817, + "learning_rate": 0.00010683589401024272, + "loss": 0.7558, + "step": 2400 + }, + { + "epoch": 0.08050507749866381, + "grad_norm": 1.9360101222991943, + "learning_rate": 0.00010728122912491651, + "loss": 0.7547, + "step": 2410 + }, + { + "epoch": 0.08083912346338856, + "grad_norm": 1.6501215696334839, + "learning_rate": 0.00010772656423959029, + "loss": 0.7204, + "step": 2420 + }, + { + "epoch": 0.08117316942811331, + "grad_norm": 1.7264634370803833, + "learning_rate": 0.00010817189935426408, + "loss": 0.7687, + "step": 2430 + }, + { + "epoch": 0.08150721539283805, + "grad_norm": 1.6958173513412476, + "learning_rate": 0.00010861723446893788, + "loss": 0.7698, + "step": 2440 + }, + { + "epoch": 0.0818412613575628, + "grad_norm": 2.0443594455718994, + "learning_rate": 0.00010906256958361167, + "loss": 0.7468, + "step": 2450 + }, + { + "epoch": 0.08217530732228755, + "grad_norm": 1.800091028213501, + "learning_rate": 0.00010950790469828546, + "loss": 0.7689, + "step": 2460 + }, + { + "epoch": 0.0825093532870123, + "grad_norm": 1.5781229734420776, + "learning_rate": 0.00010995323981295927, + "loss": 0.7153, + "step": 2470 + }, + { + "epoch": 0.08284339925173703, + "grad_norm": 1.7734858989715576, + "learning_rate": 0.00011039857492763306, + "loss": 0.7622, + "step": 2480 + }, + { + "epoch": 0.08317744521646178, + "grad_norm": 1.8399907350540161, + "learning_rate": 0.00011084391004230683, + "loss": 0.7394, + "step": 2490 + }, + { + "epoch": 0.08351149118118653, + "grad_norm": 1.4120903015136719, + "learning_rate": 0.00011128924515698062, + "loss": 0.7699, + "step": 2500 + }, + { + "epoch": 0.08384553714591128, + "grad_norm": 1.7722898721694946, + "learning_rate": 0.00011173458027165441, + "loss": 0.7475, + "step": 2510 + }, + { + "epoch": 0.08417958311063603, + "grad_norm": 1.637759804725647, + "learning_rate": 0.00011217991538632822, + "loss": 0.7686, + "step": 2520 + }, + { + "epoch": 0.08451362907536077, + "grad_norm": 1.8854811191558838, + "learning_rate": 0.00011262525050100201, + "loss": 0.7612, + "step": 2530 + }, + { + "epoch": 0.08484767504008552, + "grad_norm": 1.4870004653930664, + "learning_rate": 0.00011307058561567581, + "loss": 0.6999, + "step": 2540 + }, + { + "epoch": 0.08518172100481027, + "grad_norm": 1.5495158433914185, + "learning_rate": 0.0001135159207303496, + "loss": 0.6626, + "step": 2550 + }, + { + "epoch": 0.08551576696953501, + "grad_norm": 1.5705488920211792, + "learning_rate": 0.00011396125584502339, + "loss": 0.7489, + "step": 2560 + }, + { + "epoch": 0.08584981293425975, + "grad_norm": 1.6735090017318726, + "learning_rate": 0.00011440659095969717, + "loss": 0.7786, + "step": 2570 + }, + { + "epoch": 0.0861838588989845, + "grad_norm": 1.5239033699035645, + "learning_rate": 0.00011485192607437096, + "loss": 0.7212, + "step": 2580 + }, + { + "epoch": 0.08651790486370925, + "grad_norm": 1.3365514278411865, + "learning_rate": 0.00011529726118904476, + "loss": 0.7223, + "step": 2590 + }, + { + "epoch": 0.086851950828434, + "grad_norm": 1.522454857826233, + "learning_rate": 0.00011574259630371855, + "loss": 0.723, + "step": 2600 + }, + { + "epoch": 0.08718599679315873, + "grad_norm": 1.4313466548919678, + "learning_rate": 0.00011618793141839234, + "loss": 0.7539, + "step": 2610 + }, + { + "epoch": 0.08752004275788348, + "grad_norm": 1.6229764223098755, + "learning_rate": 0.00011663326653306615, + "loss": 0.7079, + "step": 2620 + }, + { + "epoch": 0.08785408872260823, + "grad_norm": 1.4668691158294678, + "learning_rate": 0.00011707860164773994, + "loss": 0.7579, + "step": 2630 + }, + { + "epoch": 0.08818813468733298, + "grad_norm": 1.3906711339950562, + "learning_rate": 0.00011752393676241371, + "loss": 0.7551, + "step": 2640 + }, + { + "epoch": 0.08852218065205772, + "grad_norm": 1.4407272338867188, + "learning_rate": 0.0001179692718770875, + "loss": 0.6967, + "step": 2650 + }, + { + "epoch": 0.08885622661678247, + "grad_norm": 1.7735347747802734, + "learning_rate": 0.0001184146069917613, + "loss": 0.723, + "step": 2660 + }, + { + "epoch": 0.08919027258150722, + "grad_norm": 1.603212594985962, + "learning_rate": 0.0001188599421064351, + "loss": 0.724, + "step": 2670 + }, + { + "epoch": 0.08952431854623197, + "grad_norm": 1.766235113143921, + "learning_rate": 0.00011930527722110889, + "loss": 0.7462, + "step": 2680 + }, + { + "epoch": 0.0898583645109567, + "grad_norm": 1.6130450963974, + "learning_rate": 0.00011975061233578269, + "loss": 0.7816, + "step": 2690 + }, + { + "epoch": 0.09019241047568145, + "grad_norm": 1.6365820169448853, + "learning_rate": 0.00012019594745045648, + "loss": 0.7303, + "step": 2700 + }, + { + "epoch": 0.0905264564404062, + "grad_norm": 1.8331228494644165, + "learning_rate": 0.00012064128256513027, + "loss": 0.7433, + "step": 2710 + }, + { + "epoch": 0.09086050240513095, + "grad_norm": 1.6143665313720703, + "learning_rate": 0.00012108661767980405, + "loss": 0.7204, + "step": 2720 + }, + { + "epoch": 0.09119454836985569, + "grad_norm": 1.546567440032959, + "learning_rate": 0.00012153195279447785, + "loss": 0.7346, + "step": 2730 + }, + { + "epoch": 0.09152859433458044, + "grad_norm": 1.5030059814453125, + "learning_rate": 0.00012197728790915164, + "loss": 0.761, + "step": 2740 + }, + { + "epoch": 0.09186264029930519, + "grad_norm": 1.7012742757797241, + "learning_rate": 0.00012242262302382544, + "loss": 0.7423, + "step": 2750 + }, + { + "epoch": 0.09219668626402994, + "grad_norm": 1.9118249416351318, + "learning_rate": 0.00012286795813849923, + "loss": 0.7103, + "step": 2760 + }, + { + "epoch": 0.09253073222875467, + "grad_norm": 1.5764511823654175, + "learning_rate": 0.00012331329325317302, + "loss": 0.7166, + "step": 2770 + }, + { + "epoch": 0.09286477819347942, + "grad_norm": 1.4239466190338135, + "learning_rate": 0.0001237586283678468, + "loss": 0.7198, + "step": 2780 + }, + { + "epoch": 0.09319882415820417, + "grad_norm": 1.3562862873077393, + "learning_rate": 0.0001242039634825206, + "loss": 0.6711, + "step": 2790 + }, + { + "epoch": 0.09353287012292892, + "grad_norm": 1.3958638906478882, + "learning_rate": 0.0001246492985971944, + "loss": 0.6995, + "step": 2800 + }, + { + "epoch": 0.09386691608765366, + "grad_norm": 1.7698390483856201, + "learning_rate": 0.00012509463371186818, + "loss": 0.7451, + "step": 2810 + }, + { + "epoch": 0.0942009620523784, + "grad_norm": 1.6088200807571411, + "learning_rate": 0.00012553996882654197, + "loss": 0.7049, + "step": 2820 + }, + { + "epoch": 0.09453500801710316, + "grad_norm": 1.7149405479431152, + "learning_rate": 0.00012598530394121576, + "loss": 0.7493, + "step": 2830 + }, + { + "epoch": 0.0948690539818279, + "grad_norm": 1.5805022716522217, + "learning_rate": 0.00012643063905588955, + "loss": 0.6925, + "step": 2840 + }, + { + "epoch": 0.09520309994655264, + "grad_norm": 1.4247678518295288, + "learning_rate": 0.00012687597417056337, + "loss": 0.723, + "step": 2850 + }, + { + "epoch": 0.09553714591127739, + "grad_norm": 1.9117188453674316, + "learning_rate": 0.00012732130928523716, + "loss": 0.6818, + "step": 2860 + }, + { + "epoch": 0.09587119187600214, + "grad_norm": 1.3899047374725342, + "learning_rate": 0.00012776664439991092, + "loss": 0.7112, + "step": 2870 + }, + { + "epoch": 0.09620523784072689, + "grad_norm": 1.3593915700912476, + "learning_rate": 0.00012821197951458471, + "loss": 0.7018, + "step": 2880 + }, + { + "epoch": 0.09653928380545163, + "grad_norm": 1.659005880355835, + "learning_rate": 0.00012865731462925853, + "loss": 0.7104, + "step": 2890 + }, + { + "epoch": 0.09687332977017638, + "grad_norm": 1.5677987337112427, + "learning_rate": 0.00012910264974393232, + "loss": 0.7405, + "step": 2900 + }, + { + "epoch": 0.09720737573490112, + "grad_norm": 1.8666208982467651, + "learning_rate": 0.0001295479848586061, + "loss": 0.6841, + "step": 2910 + }, + { + "epoch": 0.09754142169962587, + "grad_norm": 1.3272771835327148, + "learning_rate": 0.0001299933199732799, + "loss": 0.6983, + "step": 2920 + }, + { + "epoch": 0.09787546766435061, + "grad_norm": 1.8849602937698364, + "learning_rate": 0.0001304386550879537, + "loss": 0.7232, + "step": 2930 + }, + { + "epoch": 0.09820951362907536, + "grad_norm": 1.7835630178451538, + "learning_rate": 0.00013088399020262748, + "loss": 0.6833, + "step": 2940 + }, + { + "epoch": 0.09854355959380011, + "grad_norm": 1.4135321378707886, + "learning_rate": 0.00013132932531730127, + "loss": 0.7, + "step": 2950 + }, + { + "epoch": 0.09887760555852486, + "grad_norm": 1.4186408519744873, + "learning_rate": 0.00013177466043197506, + "loss": 0.712, + "step": 2960 + }, + { + "epoch": 0.0992116515232496, + "grad_norm": 1.5348856449127197, + "learning_rate": 0.00013221999554664885, + "loss": 0.7161, + "step": 2970 + }, + { + "epoch": 0.09954569748797434, + "grad_norm": 1.5909215211868286, + "learning_rate": 0.00013266533066132264, + "loss": 0.6714, + "step": 2980 + }, + { + "epoch": 0.0998797434526991, + "grad_norm": 1.5100351572036743, + "learning_rate": 0.00013311066577599646, + "loss": 0.6995, + "step": 2990 + }, + { + "epoch": 0.10021378941742384, + "grad_norm": 1.5203381776809692, + "learning_rate": 0.00013355600089067025, + "loss": 0.6841, + "step": 3000 + }, + { + "epoch": 0.10021378941742384, + "eval_chrf": 81.68236949992944, + "eval_loss": 1.1669803857803345, + "eval_runtime": 236.737, + "eval_samples_per_second": 0.422, + "eval_steps_per_second": 0.017, + "step": 3000 + }, + { + "epoch": 0.10054783538214858, + "grad_norm": 1.8751347064971924, + "learning_rate": 0.00013400133600534404, + "loss": 0.629, + "step": 3010 + }, + { + "epoch": 0.10088188134687333, + "grad_norm": 1.4191454648971558, + "learning_rate": 0.00013444667112001783, + "loss": 0.7296, + "step": 3020 + }, + { + "epoch": 0.10121592731159808, + "grad_norm": 1.384886384010315, + "learning_rate": 0.0001348920062346916, + "loss": 0.6812, + "step": 3030 + }, + { + "epoch": 0.10154997327632283, + "grad_norm": 1.2771599292755127, + "learning_rate": 0.0001353373413493654, + "loss": 0.7205, + "step": 3040 + }, + { + "epoch": 0.10188401924104756, + "grad_norm": 1.4225149154663086, + "learning_rate": 0.0001357826764640392, + "loss": 0.7094, + "step": 3050 + }, + { + "epoch": 0.10221806520577231, + "grad_norm": 1.3800084590911865, + "learning_rate": 0.000136228011578713, + "loss": 0.679, + "step": 3060 + }, + { + "epoch": 0.10255211117049706, + "grad_norm": 1.4997609853744507, + "learning_rate": 0.00013667334669338678, + "loss": 0.681, + "step": 3070 + }, + { + "epoch": 0.10288615713522181, + "grad_norm": 1.7790265083312988, + "learning_rate": 0.00013711868180806057, + "loss": 0.6732, + "step": 3080 + }, + { + "epoch": 0.10322020309994655, + "grad_norm": 1.4879150390625, + "learning_rate": 0.00013756401692273439, + "loss": 0.713, + "step": 3090 + }, + { + "epoch": 0.1035542490646713, + "grad_norm": 1.507973313331604, + "learning_rate": 0.00013800935203740815, + "loss": 0.7039, + "step": 3100 + }, + { + "epoch": 0.10388829502939605, + "grad_norm": 1.496062159538269, + "learning_rate": 0.00013845468715208194, + "loss": 0.6679, + "step": 3110 + }, + { + "epoch": 0.1042223409941208, + "grad_norm": 1.4169977903366089, + "learning_rate": 0.00013890002226675573, + "loss": 0.7237, + "step": 3120 + }, + { + "epoch": 0.10455638695884553, + "grad_norm": 1.4140968322753906, + "learning_rate": 0.00013934535738142952, + "loss": 0.7068, + "step": 3130 + }, + { + "epoch": 0.10489043292357028, + "grad_norm": 1.6793572902679443, + "learning_rate": 0.00013979069249610334, + "loss": 0.6649, + "step": 3140 + }, + { + "epoch": 0.10522447888829503, + "grad_norm": 1.4770041704177856, + "learning_rate": 0.00014023602761077713, + "loss": 0.6755, + "step": 3150 + }, + { + "epoch": 0.10555852485301978, + "grad_norm": 1.315122365951538, + "learning_rate": 0.00014068136272545092, + "loss": 0.677, + "step": 3160 + }, + { + "epoch": 0.10589257081774452, + "grad_norm": 1.4143348932266235, + "learning_rate": 0.0001411266978401247, + "loss": 0.6864, + "step": 3170 + }, + { + "epoch": 0.10622661678246927, + "grad_norm": 1.435580849647522, + "learning_rate": 0.0001415720329547985, + "loss": 0.6825, + "step": 3180 + }, + { + "epoch": 0.10656066274719402, + "grad_norm": 1.4847464561462402, + "learning_rate": 0.00014201736806947229, + "loss": 0.6859, + "step": 3190 + }, + { + "epoch": 0.10689470871191876, + "grad_norm": 1.3119359016418457, + "learning_rate": 0.00014246270318414608, + "loss": 0.655, + "step": 3200 + }, + { + "epoch": 0.1072287546766435, + "grad_norm": 1.453138828277588, + "learning_rate": 0.00014290803829881987, + "loss": 0.6936, + "step": 3210 + }, + { + "epoch": 0.10756280064136825, + "grad_norm": 1.5562044382095337, + "learning_rate": 0.00014335337341349366, + "loss": 0.6491, + "step": 3220 + }, + { + "epoch": 0.107896846606093, + "grad_norm": 1.3904789686203003, + "learning_rate": 0.00014379870852816745, + "loss": 0.6639, + "step": 3230 + }, + { + "epoch": 0.10823089257081775, + "grad_norm": 1.4318666458129883, + "learning_rate": 0.00014424404364284126, + "loss": 0.6237, + "step": 3240 + }, + { + "epoch": 0.10856493853554249, + "grad_norm": 1.2185957431793213, + "learning_rate": 0.00014468937875751503, + "loss": 0.6614, + "step": 3250 + }, + { + "epoch": 0.10889898450026723, + "grad_norm": 1.6321661472320557, + "learning_rate": 0.00014513471387218882, + "loss": 0.6603, + "step": 3260 + }, + { + "epoch": 0.10923303046499198, + "grad_norm": 1.3932006359100342, + "learning_rate": 0.0001455800489868626, + "loss": 0.617, + "step": 3270 + }, + { + "epoch": 0.10956707642971673, + "grad_norm": 1.306261658668518, + "learning_rate": 0.00014602538410153642, + "loss": 0.6518, + "step": 3280 + }, + { + "epoch": 0.10990112239444147, + "grad_norm": 1.289352297782898, + "learning_rate": 0.00014647071921621021, + "loss": 0.6983, + "step": 3290 + }, + { + "epoch": 0.11023516835916622, + "grad_norm": 1.3088525533676147, + "learning_rate": 0.000146916054330884, + "loss": 0.7082, + "step": 3300 + }, + { + "epoch": 0.11056921432389097, + "grad_norm": 1.3640263080596924, + "learning_rate": 0.0001473613894455578, + "loss": 0.6162, + "step": 3310 + }, + { + "epoch": 0.11090326028861572, + "grad_norm": 1.3889405727386475, + "learning_rate": 0.00014780672456023158, + "loss": 0.6115, + "step": 3320 + }, + { + "epoch": 0.11123730625334045, + "grad_norm": 1.4052640199661255, + "learning_rate": 0.00014825205967490537, + "loss": 0.6547, + "step": 3330 + }, + { + "epoch": 0.1115713522180652, + "grad_norm": 1.4788811206817627, + "learning_rate": 0.00014869739478957916, + "loss": 0.6254, + "step": 3340 + }, + { + "epoch": 0.11190539818278995, + "grad_norm": 1.3913187980651855, + "learning_rate": 0.00014914272990425295, + "loss": 0.6172, + "step": 3350 + }, + { + "epoch": 0.1122394441475147, + "grad_norm": 1.3309648036956787, + "learning_rate": 0.00014958806501892674, + "loss": 0.6622, + "step": 3360 + }, + { + "epoch": 0.11257349011223944, + "grad_norm": 1.0699092149734497, + "learning_rate": 0.00015003340013360053, + "loss": 0.6279, + "step": 3370 + }, + { + "epoch": 0.11290753607696419, + "grad_norm": 1.2862350940704346, + "learning_rate": 0.00015047873524827435, + "loss": 0.6824, + "step": 3380 + }, + { + "epoch": 0.11324158204168894, + "grad_norm": 1.335388422012329, + "learning_rate": 0.00015092407036294814, + "loss": 0.6742, + "step": 3390 + }, + { + "epoch": 0.11357562800641369, + "grad_norm": 1.2487215995788574, + "learning_rate": 0.00015136940547762193, + "loss": 0.63, + "step": 3400 + }, + { + "epoch": 0.11390967397113842, + "grad_norm": 1.3137410879135132, + "learning_rate": 0.0001518147405922957, + "loss": 0.6686, + "step": 3410 + }, + { + "epoch": 0.11424371993586317, + "grad_norm": 1.2096271514892578, + "learning_rate": 0.00015226007570696948, + "loss": 0.6389, + "step": 3420 + }, + { + "epoch": 0.11457776590058792, + "grad_norm": 1.2942216396331787, + "learning_rate": 0.0001527054108216433, + "loss": 0.6375, + "step": 3430 + }, + { + "epoch": 0.11491181186531267, + "grad_norm": 1.3269033432006836, + "learning_rate": 0.0001531507459363171, + "loss": 0.7073, + "step": 3440 + }, + { + "epoch": 0.11524585783003741, + "grad_norm": 1.2560580968856812, + "learning_rate": 0.00015359608105099088, + "loss": 0.6926, + "step": 3450 + }, + { + "epoch": 0.11557990379476216, + "grad_norm": 1.2922520637512207, + "learning_rate": 0.00015404141616566467, + "loss": 0.6188, + "step": 3460 + }, + { + "epoch": 0.1159139497594869, + "grad_norm": 1.1848070621490479, + "learning_rate": 0.00015448675128033846, + "loss": 0.6348, + "step": 3470 + }, + { + "epoch": 0.11624799572421166, + "grad_norm": 1.3866795301437378, + "learning_rate": 0.00015493208639501225, + "loss": 0.6957, + "step": 3480 + }, + { + "epoch": 0.11658204168893639, + "grad_norm": 1.3343369960784912, + "learning_rate": 0.00015537742150968604, + "loss": 0.6323, + "step": 3490 + }, + { + "epoch": 0.11691608765366114, + "grad_norm": 1.3747074604034424, + "learning_rate": 0.00015582275662435983, + "loss": 0.6147, + "step": 3500 + }, + { + "epoch": 0.11725013361838589, + "grad_norm": 1.5618665218353271, + "learning_rate": 0.00015626809173903362, + "loss": 0.688, + "step": 3510 + }, + { + "epoch": 0.11758417958311064, + "grad_norm": 1.4151391983032227, + "learning_rate": 0.0001567134268537074, + "loss": 0.6323, + "step": 3520 + }, + { + "epoch": 0.11791822554783538, + "grad_norm": 1.304462194442749, + "learning_rate": 0.00015715876196838123, + "loss": 0.6618, + "step": 3530 + }, + { + "epoch": 0.11825227151256013, + "grad_norm": 1.333306074142456, + "learning_rate": 0.00015760409708305502, + "loss": 0.6652, + "step": 3540 + }, + { + "epoch": 0.11858631747728487, + "grad_norm": 1.1493788957595825, + "learning_rate": 0.0001580494321977288, + "loss": 0.6595, + "step": 3550 + }, + { + "epoch": 0.11892036344200962, + "grad_norm": 1.2303088903427124, + "learning_rate": 0.00015849476731240257, + "loss": 0.6433, + "step": 3560 + }, + { + "epoch": 0.11925440940673436, + "grad_norm": 1.3749808073043823, + "learning_rate": 0.0001589401024270764, + "loss": 0.6507, + "step": 3570 + }, + { + "epoch": 0.11958845537145911, + "grad_norm": 1.376664161682129, + "learning_rate": 0.00015938543754175018, + "loss": 0.6371, + "step": 3580 + }, + { + "epoch": 0.11992250133618386, + "grad_norm": 1.1200063228607178, + "learning_rate": 0.00015983077265642397, + "loss": 0.6484, + "step": 3590 + }, + { + "epoch": 0.12025654730090861, + "grad_norm": 1.3652095794677734, + "learning_rate": 0.00016027610777109776, + "loss": 0.587, + "step": 3600 + }, + { + "epoch": 0.12059059326563334, + "grad_norm": 1.2116578817367554, + "learning_rate": 0.00016072144288577155, + "loss": 0.633, + "step": 3610 + }, + { + "epoch": 0.1209246392303581, + "grad_norm": 1.4503707885742188, + "learning_rate": 0.00016116677800044534, + "loss": 0.6724, + "step": 3620 + }, + { + "epoch": 0.12125868519508284, + "grad_norm": 1.283900260925293, + "learning_rate": 0.00016161211311511916, + "loss": 0.6391, + "step": 3630 + }, + { + "epoch": 0.1215927311598076, + "grad_norm": 1.2724339962005615, + "learning_rate": 0.00016205744822979292, + "loss": 0.6082, + "step": 3640 + }, + { + "epoch": 0.12192677712453233, + "grad_norm": 1.3354536294937134, + "learning_rate": 0.0001625027833444667, + "loss": 0.6412, + "step": 3650 + }, + { + "epoch": 0.12226082308925708, + "grad_norm": 1.176488995552063, + "learning_rate": 0.0001629481184591405, + "loss": 0.6314, + "step": 3660 + }, + { + "epoch": 0.12259486905398183, + "grad_norm": 1.230231523513794, + "learning_rate": 0.00016339345357381432, + "loss": 0.6403, + "step": 3670 + }, + { + "epoch": 0.12292891501870658, + "grad_norm": 1.1203300952911377, + "learning_rate": 0.0001638387886884881, + "loss": 0.6443, + "step": 3680 + }, + { + "epoch": 0.12326296098343131, + "grad_norm": 1.3921400308609009, + "learning_rate": 0.0001642841238031619, + "loss": 0.6293, + "step": 3690 + }, + { + "epoch": 0.12359700694815606, + "grad_norm": 1.3587327003479004, + "learning_rate": 0.0001647294589178357, + "loss": 0.6357, + "step": 3700 + }, + { + "epoch": 0.12393105291288081, + "grad_norm": 1.3276653289794922, + "learning_rate": 0.00016517479403250945, + "loss": 0.6235, + "step": 3710 + }, + { + "epoch": 0.12426509887760556, + "grad_norm": 1.4590165615081787, + "learning_rate": 0.00016562012914718327, + "loss": 0.6012, + "step": 3720 + }, + { + "epoch": 0.1245991448423303, + "grad_norm": 1.1540123224258423, + "learning_rate": 0.00016606546426185706, + "loss": 0.6321, + "step": 3730 + }, + { + "epoch": 0.12493319080705505, + "grad_norm": 1.3575044870376587, + "learning_rate": 0.00016651079937653085, + "loss": 0.5948, + "step": 3740 + }, + { + "epoch": 0.1252672367717798, + "grad_norm": 1.0691931247711182, + "learning_rate": 0.00016695613449120464, + "loss": 0.648, + "step": 3750 + }, + { + "epoch": 0.12560128273650453, + "grad_norm": 1.2354623079299927, + "learning_rate": 0.00016740146960587843, + "loss": 0.6148, + "step": 3760 + }, + { + "epoch": 0.12593532870122928, + "grad_norm": 1.1887400150299072, + "learning_rate": 0.00016784680472055224, + "loss": 0.5976, + "step": 3770 + }, + { + "epoch": 0.12626937466595403, + "grad_norm": 1.2322620153427124, + "learning_rate": 0.00016829213983522603, + "loss": 0.6582, + "step": 3780 + }, + { + "epoch": 0.12660342063067878, + "grad_norm": 1.2120084762573242, + "learning_rate": 0.0001687374749498998, + "loss": 0.6436, + "step": 3790 + }, + { + "epoch": 0.12693746659540353, + "grad_norm": 1.3957724571228027, + "learning_rate": 0.0001691828100645736, + "loss": 0.6262, + "step": 3800 + }, + { + "epoch": 0.12727151256012828, + "grad_norm": 1.153637170791626, + "learning_rate": 0.00016962814517924738, + "loss": 0.607, + "step": 3810 + }, + { + "epoch": 0.12760555852485303, + "grad_norm": 1.2310709953308105, + "learning_rate": 0.0001700734802939212, + "loss": 0.6462, + "step": 3820 + }, + { + "epoch": 0.12793960448957778, + "grad_norm": 1.1885381937026978, + "learning_rate": 0.00017051881540859498, + "loss": 0.6315, + "step": 3830 + }, + { + "epoch": 0.1282736504543025, + "grad_norm": 1.0722419023513794, + "learning_rate": 0.00017096415052326877, + "loss": 0.6211, + "step": 3840 + }, + { + "epoch": 0.12860769641902725, + "grad_norm": 1.1849323511123657, + "learning_rate": 0.00017140948563794256, + "loss": 0.6345, + "step": 3850 + }, + { + "epoch": 0.128941742383752, + "grad_norm": 1.3174560070037842, + "learning_rate": 0.00017185482075261635, + "loss": 0.6278, + "step": 3860 + }, + { + "epoch": 0.12927578834847675, + "grad_norm": 1.1463559865951538, + "learning_rate": 0.00017230015586729014, + "loss": 0.5878, + "step": 3870 + }, + { + "epoch": 0.1296098343132015, + "grad_norm": 1.139553189277649, + "learning_rate": 0.00017274549098196393, + "loss": 0.604, + "step": 3880 + }, + { + "epoch": 0.12994388027792625, + "grad_norm": 1.143943428993225, + "learning_rate": 0.00017319082609663772, + "loss": 0.6335, + "step": 3890 + }, + { + "epoch": 0.130277926242651, + "grad_norm": 1.0977891683578491, + "learning_rate": 0.00017363616121131151, + "loss": 0.6037, + "step": 3900 + }, + { + "epoch": 0.13061197220737575, + "grad_norm": 1.188076138496399, + "learning_rate": 0.0001740814963259853, + "loss": 0.6678, + "step": 3910 + }, + { + "epoch": 0.13094601817210047, + "grad_norm": 1.1466950178146362, + "learning_rate": 0.00017452683144065912, + "loss": 0.6508, + "step": 3920 + }, + { + "epoch": 0.13128006413682522, + "grad_norm": 1.1387861967086792, + "learning_rate": 0.0001749721665553329, + "loss": 0.626, + "step": 3930 + }, + { + "epoch": 0.13161411010154997, + "grad_norm": 1.3562359809875488, + "learning_rate": 0.00017541750167000667, + "loss": 0.6452, + "step": 3940 + }, + { + "epoch": 0.13194815606627472, + "grad_norm": 1.1286985874176025, + "learning_rate": 0.00017586283678468046, + "loss": 0.6128, + "step": 3950 + }, + { + "epoch": 0.13228220203099947, + "grad_norm": 1.0472478866577148, + "learning_rate": 0.00017630817189935428, + "loss": 0.596, + "step": 3960 + }, + { + "epoch": 0.13261624799572422, + "grad_norm": 1.1177164316177368, + "learning_rate": 0.00017675350701402807, + "loss": 0.6162, + "step": 3970 + }, + { + "epoch": 0.13295029396044897, + "grad_norm": 1.1572797298431396, + "learning_rate": 0.00017719884212870186, + "loss": 0.6438, + "step": 3980 + }, + { + "epoch": 0.13328433992517372, + "grad_norm": 1.0362883806228638, + "learning_rate": 0.00017764417724337565, + "loss": 0.631, + "step": 3990 + }, + { + "epoch": 0.13361838588989844, + "grad_norm": 1.0047307014465332, + "learning_rate": 0.00017808951235804944, + "loss": 0.6142, + "step": 4000 + }, + { + "epoch": 0.13361838588989844, + "eval_chrf": 85.29107079043794, + "eval_loss": 1.060824990272522, + "eval_runtime": 104.0865, + "eval_samples_per_second": 0.961, + "eval_steps_per_second": 0.038, + "step": 4000 + }, + { + "epoch": 0.1339524318546232, + "grad_norm": 1.2960996627807617, + "learning_rate": 0.00017853484747272323, + "loss": 0.5885, + "step": 4010 + }, + { + "epoch": 0.13428647781934794, + "grad_norm": 1.0989959239959717, + "learning_rate": 0.00017898018258739702, + "loss": 0.6607, + "step": 4020 + }, + { + "epoch": 0.1346205237840727, + "grad_norm": 1.2292121648788452, + "learning_rate": 0.0001794255177020708, + "loss": 0.6022, + "step": 4030 + }, + { + "epoch": 0.13495456974879744, + "grad_norm": 1.1720715761184692, + "learning_rate": 0.0001798708528167446, + "loss": 0.6172, + "step": 4040 + }, + { + "epoch": 0.1352886157135222, + "grad_norm": 1.184891700744629, + "learning_rate": 0.0001803161879314184, + "loss": 0.6161, + "step": 4050 + }, + { + "epoch": 0.13562266167824694, + "grad_norm": 1.1360478401184082, + "learning_rate": 0.0001807615230460922, + "loss": 0.6368, + "step": 4060 + }, + { + "epoch": 0.13595670764297169, + "grad_norm": 1.2550835609436035, + "learning_rate": 0.000181206858160766, + "loss": 0.5977, + "step": 4070 + }, + { + "epoch": 0.1362907536076964, + "grad_norm": 1.3404250144958496, + "learning_rate": 0.0001816521932754398, + "loss": 0.6582, + "step": 4080 + }, + { + "epoch": 0.13662479957242116, + "grad_norm": 1.1586620807647705, + "learning_rate": 0.00018209752839011355, + "loss": 0.6393, + "step": 4090 + }, + { + "epoch": 0.1369588455371459, + "grad_norm": 1.208372712135315, + "learning_rate": 0.00018254286350478734, + "loss": 0.5957, + "step": 4100 + }, + { + "epoch": 0.13729289150187066, + "grad_norm": 1.071280598640442, + "learning_rate": 0.00018298819861946116, + "loss": 0.619, + "step": 4110 + }, + { + "epoch": 0.1376269374665954, + "grad_norm": 1.352867603302002, + "learning_rate": 0.00018343353373413495, + "loss": 0.586, + "step": 4120 + }, + { + "epoch": 0.13796098343132016, + "grad_norm": 1.2540074586868286, + "learning_rate": 0.00018387886884880874, + "loss": 0.6092, + "step": 4130 + }, + { + "epoch": 0.1382950293960449, + "grad_norm": 1.0467684268951416, + "learning_rate": 0.00018432420396348253, + "loss": 0.6159, + "step": 4140 + }, + { + "epoch": 0.13862907536076965, + "grad_norm": 1.2519170045852661, + "learning_rate": 0.00018476953907815632, + "loss": 0.6411, + "step": 4150 + }, + { + "epoch": 0.13896312132549438, + "grad_norm": 1.1897696256637573, + "learning_rate": 0.00018521487419283014, + "loss": 0.5859, + "step": 4160 + }, + { + "epoch": 0.13929716729021913, + "grad_norm": 1.3099178075790405, + "learning_rate": 0.0001856602093075039, + "loss": 0.6, + "step": 4170 + }, + { + "epoch": 0.13963121325494388, + "grad_norm": 1.1583127975463867, + "learning_rate": 0.0001861055444221777, + "loss": 0.6008, + "step": 4180 + }, + { + "epoch": 0.13996525921966863, + "grad_norm": 1.1292494535446167, + "learning_rate": 0.00018655087953685148, + "loss": 0.5952, + "step": 4190 + }, + { + "epoch": 0.14029930518439337, + "grad_norm": 1.2014209032058716, + "learning_rate": 0.00018699621465152527, + "loss": 0.5792, + "step": 4200 + }, + { + "epoch": 0.14063335114911812, + "grad_norm": 1.2924737930297852, + "learning_rate": 0.0001874415497661991, + "loss": 0.5496, + "step": 4210 + }, + { + "epoch": 0.14096739711384287, + "grad_norm": 1.189109444618225, + "learning_rate": 0.00018788688488087288, + "loss": 0.6066, + "step": 4220 + }, + { + "epoch": 0.14130144307856762, + "grad_norm": 1.0685985088348389, + "learning_rate": 0.00018833221999554667, + "loss": 0.5854, + "step": 4230 + }, + { + "epoch": 0.14163548904329235, + "grad_norm": 1.2437845468521118, + "learning_rate": 0.00018877755511022046, + "loss": 0.6447, + "step": 4240 + }, + { + "epoch": 0.1419695350080171, + "grad_norm": 1.2956081628799438, + "learning_rate": 0.00018922289022489422, + "loss": 0.6203, + "step": 4250 + }, + { + "epoch": 0.14230358097274184, + "grad_norm": 1.3054507970809937, + "learning_rate": 0.00018966822533956804, + "loss": 0.6061, + "step": 4260 + }, + { + "epoch": 0.1426376269374666, + "grad_norm": 1.2583873271942139, + "learning_rate": 0.00019011356045424183, + "loss": 0.5955, + "step": 4270 + }, + { + "epoch": 0.14297167290219134, + "grad_norm": 1.1779587268829346, + "learning_rate": 0.00019055889556891562, + "loss": 0.6261, + "step": 4280 + }, + { + "epoch": 0.1433057188669161, + "grad_norm": 1.0547831058502197, + "learning_rate": 0.0001910042306835894, + "loss": 0.6012, + "step": 4290 + }, + { + "epoch": 0.14363976483164084, + "grad_norm": 1.3617991209030151, + "learning_rate": 0.0001914495657982632, + "loss": 0.5849, + "step": 4300 + }, + { + "epoch": 0.1439738107963656, + "grad_norm": 1.0399178266525269, + "learning_rate": 0.00019189490091293701, + "loss": 0.6133, + "step": 4310 + }, + { + "epoch": 0.14430785676109031, + "grad_norm": 1.041013240814209, + "learning_rate": 0.00019234023602761078, + "loss": 0.5896, + "step": 4320 + }, + { + "epoch": 0.14464190272581506, + "grad_norm": 1.0102481842041016, + "learning_rate": 0.00019278557114228457, + "loss": 0.6086, + "step": 4330 + }, + { + "epoch": 0.1449759486905398, + "grad_norm": 1.1365938186645508, + "learning_rate": 0.00019323090625695836, + "loss": 0.6502, + "step": 4340 + }, + { + "epoch": 0.14530999465526456, + "grad_norm": 1.2727710008621216, + "learning_rate": 0.00019367624137163215, + "loss": 0.6, + "step": 4350 + }, + { + "epoch": 0.1456440406199893, + "grad_norm": 1.1044787168502808, + "learning_rate": 0.00019412157648630596, + "loss": 0.6011, + "step": 4360 + }, + { + "epoch": 0.14597808658471406, + "grad_norm": 1.1466782093048096, + "learning_rate": 0.00019456691160097975, + "loss": 0.6189, + "step": 4370 + }, + { + "epoch": 0.1463121325494388, + "grad_norm": 1.013142466545105, + "learning_rate": 0.00019501224671565354, + "loss": 0.5852, + "step": 4380 + }, + { + "epoch": 0.14664617851416356, + "grad_norm": 1.051682949066162, + "learning_rate": 0.00019545758183032733, + "loss": 0.6128, + "step": 4390 + }, + { + "epoch": 0.14698022447888828, + "grad_norm": 1.015570044517517, + "learning_rate": 0.00019590291694500112, + "loss": 0.6142, + "step": 4400 + }, + { + "epoch": 0.14731427044361303, + "grad_norm": 1.095427393913269, + "learning_rate": 0.00019634825205967491, + "loss": 0.6395, + "step": 4410 + }, + { + "epoch": 0.14764831640833778, + "grad_norm": 1.1889519691467285, + "learning_rate": 0.0001967935871743487, + "loss": 0.5913, + "step": 4420 + }, + { + "epoch": 0.14798236237306253, + "grad_norm": 1.0186047554016113, + "learning_rate": 0.0001972389222890225, + "loss": 0.5878, + "step": 4430 + }, + { + "epoch": 0.14831640833778728, + "grad_norm": 0.9655874967575073, + "learning_rate": 0.00019768425740369629, + "loss": 0.5803, + "step": 4440 + }, + { + "epoch": 0.14865045430251203, + "grad_norm": 1.0319541692733765, + "learning_rate": 0.00019812959251837008, + "loss": 0.5941, + "step": 4450 + }, + { + "epoch": 0.14898450026723678, + "grad_norm": 0.8864169120788574, + "learning_rate": 0.0001985749276330439, + "loss": 0.5884, + "step": 4460 + }, + { + "epoch": 0.14931854623196153, + "grad_norm": 1.0686206817626953, + "learning_rate": 0.00019902026274771766, + "loss": 0.621, + "step": 4470 + }, + { + "epoch": 0.14965259219668625, + "grad_norm": 1.072313666343689, + "learning_rate": 0.00019946559786239145, + "loss": 0.5607, + "step": 4480 + }, + { + "epoch": 0.149986638161411, + "grad_norm": 1.0095263719558716, + "learning_rate": 0.00019991093297706524, + "loss": 0.562, + "step": 4490 + }, + { + "epoch": 0.15032068412613575, + "grad_norm": 0.9576295018196106, + "learning_rate": 0.0001999999956611107, + "loss": 0.5754, + "step": 4500 + }, + { + "epoch": 0.1506547300908605, + "grad_norm": 1.245842456817627, + "learning_rate": 0.00019999997803437362, + "loss": 0.6064, + "step": 4510 + }, + { + "epoch": 0.15098877605558525, + "grad_norm": 0.9597447514533997, + "learning_rate": 0.00019999994684861055, + "loss": 0.5867, + "step": 4520 + }, + { + "epoch": 0.15132282202031, + "grad_norm": 1.0598183870315552, + "learning_rate": 0.00019999990210382574, + "loss": 0.5915, + "step": 4530 + }, + { + "epoch": 0.15165686798503475, + "grad_norm": 1.1677656173706055, + "learning_rate": 0.00019999984380002518, + "loss": 0.5914, + "step": 4540 + }, + { + "epoch": 0.1519909139497595, + "grad_norm": 1.1287896633148193, + "learning_rate": 0.0001999997719372169, + "loss": 0.5773, + "step": 4550 + }, + { + "epoch": 0.15232495991448422, + "grad_norm": 1.0660572052001953, + "learning_rate": 0.0001999996865154105, + "loss": 0.574, + "step": 4560 + }, + { + "epoch": 0.15265900587920897, + "grad_norm": 1.0176984071731567, + "learning_rate": 0.00019999958753461769, + "loss": 0.5698, + "step": 4570 + }, + { + "epoch": 0.15299305184393372, + "grad_norm": 1.0862360000610352, + "learning_rate": 0.00019999947499485182, + "loss": 0.5984, + "step": 4580 + }, + { + "epoch": 0.15332709780865847, + "grad_norm": 0.9049597382545471, + "learning_rate": 0.00019999934889612818, + "loss": 0.5936, + "step": 4590 + }, + { + "epoch": 0.15366114377338322, + "grad_norm": 1.052592158317566, + "learning_rate": 0.00019999920923846384, + "loss": 0.5884, + "step": 4600 + }, + { + "epoch": 0.15399518973810797, + "grad_norm": 1.175820231437683, + "learning_rate": 0.00019999905602187775, + "loss": 0.6228, + "step": 4610 + }, + { + "epoch": 0.15432923570283272, + "grad_norm": 1.0966057777404785, + "learning_rate": 0.00019999888924639072, + "loss": 0.6058, + "step": 4620 + }, + { + "epoch": 0.15466328166755747, + "grad_norm": 1.0478767156600952, + "learning_rate": 0.0001999987089120253, + "loss": 0.5556, + "step": 4630 + }, + { + "epoch": 0.1549973276322822, + "grad_norm": 1.0936956405639648, + "learning_rate": 0.000199998515018806, + "loss": 0.6202, + "step": 4640 + }, + { + "epoch": 0.15533137359700694, + "grad_norm": 1.0156654119491577, + "learning_rate": 0.00019999830756675904, + "loss": 0.5723, + "step": 4650 + }, + { + "epoch": 0.1556654195617317, + "grad_norm": 0.9821091890335083, + "learning_rate": 0.00019999808655591262, + "loss": 0.5849, + "step": 4660 + }, + { + "epoch": 0.15599946552645644, + "grad_norm": 1.1164886951446533, + "learning_rate": 0.00019999785198629666, + "loss": 0.5476, + "step": 4670 + }, + { + "epoch": 0.1563335114911812, + "grad_norm": 1.1116862297058105, + "learning_rate": 0.000199997603857943, + "loss": 0.6208, + "step": 4680 + }, + { + "epoch": 0.15666755745590594, + "grad_norm": 0.995680034160614, + "learning_rate": 0.00019999734217088526, + "loss": 0.6024, + "step": 4690 + }, + { + "epoch": 0.1570016034206307, + "grad_norm": 1.1392767429351807, + "learning_rate": 0.00019999706692515893, + "loss": 0.6187, + "step": 4700 + }, + { + "epoch": 0.15733564938535544, + "grad_norm": 1.1127604246139526, + "learning_rate": 0.0001999967781208013, + "loss": 0.5411, + "step": 4710 + }, + { + "epoch": 0.15766969535008016, + "grad_norm": 0.9843993186950684, + "learning_rate": 0.0001999964757578516, + "loss": 0.5783, + "step": 4720 + }, + { + "epoch": 0.1580037413148049, + "grad_norm": 1.1337679624557495, + "learning_rate": 0.00019999615983635073, + "loss": 0.5833, + "step": 4730 + }, + { + "epoch": 0.15833778727952966, + "grad_norm": 1.0033507347106934, + "learning_rate": 0.0001999958303563416, + "loss": 0.6314, + "step": 4740 + }, + { + "epoch": 0.1586718332442544, + "grad_norm": 1.0843076705932617, + "learning_rate": 0.00019999548731786893, + "loss": 0.5639, + "step": 4750 + }, + { + "epoch": 0.15900587920897916, + "grad_norm": 1.109703540802002, + "learning_rate": 0.00019999513072097908, + "loss": 0.5779, + "step": 4760 + }, + { + "epoch": 0.1593399251737039, + "grad_norm": 1.0227696895599365, + "learning_rate": 0.00019999476056572056, + "loss": 0.5848, + "step": 4770 + }, + { + "epoch": 0.15967397113842866, + "grad_norm": 1.0500551462173462, + "learning_rate": 0.00019999437685214345, + "loss": 0.5946, + "step": 4780 + }, + { + "epoch": 0.1600080171031534, + "grad_norm": 1.1449147462844849, + "learning_rate": 0.0001999939795802998, + "loss": 0.5726, + "step": 4790 + }, + { + "epoch": 0.16034206306787813, + "grad_norm": 1.1066560745239258, + "learning_rate": 0.00019999356875024352, + "loss": 0.5848, + "step": 4800 + }, + { + "epoch": 0.16067610903260288, + "grad_norm": 1.0621087551116943, + "learning_rate": 0.00019999314436203029, + "loss": 0.5599, + "step": 4810 + }, + { + "epoch": 0.16101015499732763, + "grad_norm": 0.9491170644760132, + "learning_rate": 0.00019999270641571766, + "loss": 0.5997, + "step": 4820 + }, + { + "epoch": 0.16134420096205238, + "grad_norm": 0.9694833159446716, + "learning_rate": 0.00019999225491136498, + "loss": 0.6331, + "step": 4830 + }, + { + "epoch": 0.16167824692677712, + "grad_norm": 1.0441334247589111, + "learning_rate": 0.0001999917898490335, + "loss": 0.6238, + "step": 4840 + }, + { + "epoch": 0.16201229289150187, + "grad_norm": 0.9293340444564819, + "learning_rate": 0.00019999131122878624, + "loss": 0.5743, + "step": 4850 + }, + { + "epoch": 0.16234633885622662, + "grad_norm": 1.018986463546753, + "learning_rate": 0.00019999081905068818, + "loss": 0.5573, + "step": 4860 + }, + { + "epoch": 0.16268038482095137, + "grad_norm": 0.9239473938941956, + "learning_rate": 0.00019999031331480595, + "loss": 0.5601, + "step": 4870 + }, + { + "epoch": 0.1630144307856761, + "grad_norm": 0.9369543194770813, + "learning_rate": 0.0001999897940212082, + "loss": 0.5643, + "step": 4880 + }, + { + "epoch": 0.16334847675040085, + "grad_norm": 1.019205927848816, + "learning_rate": 0.00019998926116996532, + "loss": 0.6029, + "step": 4890 + }, + { + "epoch": 0.1636825227151256, + "grad_norm": 0.9439253807067871, + "learning_rate": 0.00019998871476114953, + "loss": 0.5879, + "step": 4900 + }, + { + "epoch": 0.16401656867985034, + "grad_norm": 0.9986618161201477, + "learning_rate": 0.00019998815479483497, + "loss": 0.5556, + "step": 4910 + }, + { + "epoch": 0.1643506146445751, + "grad_norm": 0.9808622002601624, + "learning_rate": 0.00019998758127109752, + "loss": 0.5678, + "step": 4920 + }, + { + "epoch": 0.16468466060929984, + "grad_norm": 0.9147446751594543, + "learning_rate": 0.00019998699419001493, + "loss": 0.5776, + "step": 4930 + }, + { + "epoch": 0.1650187065740246, + "grad_norm": 1.0323752164840698, + "learning_rate": 0.0001999863935516669, + "loss": 0.5727, + "step": 4940 + }, + { + "epoch": 0.16535275253874934, + "grad_norm": 0.905082106590271, + "learning_rate": 0.00019998577935613477, + "loss": 0.5802, + "step": 4950 + }, + { + "epoch": 0.16568679850347406, + "grad_norm": 0.9409524202346802, + "learning_rate": 0.00019998515160350185, + "loss": 0.5724, + "step": 4960 + }, + { + "epoch": 0.1660208444681988, + "grad_norm": 1.1164180040359497, + "learning_rate": 0.00019998451029385326, + "loss": 0.6219, + "step": 4970 + }, + { + "epoch": 0.16635489043292356, + "grad_norm": 0.9687106013298035, + "learning_rate": 0.00019998385542727596, + "loss": 0.5906, + "step": 4980 + }, + { + "epoch": 0.1666889363976483, + "grad_norm": 1.0654252767562866, + "learning_rate": 0.00019998318700385873, + "loss": 0.5879, + "step": 4990 + }, + { + "epoch": 0.16702298236237306, + "grad_norm": 0.9698986411094666, + "learning_rate": 0.00019998250502369221, + "loss": 0.5473, + "step": 5000 + }, + { + "epoch": 0.16702298236237306, + "eval_chrf": 83.75091757219782, + "eval_loss": 0.9881706833839417, + "eval_runtime": 184.3466, + "eval_samples_per_second": 0.542, + "eval_steps_per_second": 0.022, + "step": 5000 + }, + { + "epoch": 0.1673570283270978, + "grad_norm": 1.0220621824264526, + "learning_rate": 0.00019998180948686887, + "loss": 0.5524, + "step": 5010 + }, + { + "epoch": 0.16769107429182256, + "grad_norm": 1.177868366241455, + "learning_rate": 0.00019998110039348306, + "loss": 0.608, + "step": 5020 + }, + { + "epoch": 0.1680251202565473, + "grad_norm": 0.9059855341911316, + "learning_rate": 0.00019998037774363086, + "loss": 0.6355, + "step": 5030 + }, + { + "epoch": 0.16835916622127206, + "grad_norm": 0.9896267056465149, + "learning_rate": 0.00019997964153741028, + "loss": 0.5777, + "step": 5040 + }, + { + "epoch": 0.16869321218599678, + "grad_norm": 0.9868308305740356, + "learning_rate": 0.00019997889177492115, + "loss": 0.6189, + "step": 5050 + }, + { + "epoch": 0.16902725815072153, + "grad_norm": 1.078275203704834, + "learning_rate": 0.0001999781284562651, + "loss": 0.5811, + "step": 5060 + }, + { + "epoch": 0.16936130411544628, + "grad_norm": 0.9873348474502563, + "learning_rate": 0.00019997735158154564, + "loss": 0.5628, + "step": 5070 + }, + { + "epoch": 0.16969535008017103, + "grad_norm": 0.8548519015312195, + "learning_rate": 0.00019997656115086816, + "loss": 0.564, + "step": 5080 + }, + { + "epoch": 0.17002939604489578, + "grad_norm": 1.1249057054519653, + "learning_rate": 0.0001999757571643398, + "loss": 0.55, + "step": 5090 + }, + { + "epoch": 0.17036344200962053, + "grad_norm": 0.9400342106819153, + "learning_rate": 0.0001999749396220695, + "loss": 0.5623, + "step": 5100 + }, + { + "epoch": 0.17069748797434528, + "grad_norm": 0.9439047574996948, + "learning_rate": 0.00019997410852416823, + "loss": 0.5948, + "step": 5110 + }, + { + "epoch": 0.17103153393907003, + "grad_norm": 0.8374404311180115, + "learning_rate": 0.0001999732638707486, + "loss": 0.5565, + "step": 5120 + }, + { + "epoch": 0.17136557990379475, + "grad_norm": 0.9575369358062744, + "learning_rate": 0.00019997240566192517, + "loss": 0.5747, + "step": 5130 + }, + { + "epoch": 0.1716996258685195, + "grad_norm": 1.0019299983978271, + "learning_rate": 0.00019997153389781432, + "loss": 0.6342, + "step": 5140 + }, + { + "epoch": 0.17203367183324425, + "grad_norm": 0.938758373260498, + "learning_rate": 0.0001999706485785342, + "loss": 0.5926, + "step": 5150 + }, + { + "epoch": 0.172367717797969, + "grad_norm": 1.0580792427062988, + "learning_rate": 0.00019996974970420487, + "loss": 0.5533, + "step": 5160 + }, + { + "epoch": 0.17270176376269375, + "grad_norm": 0.8537624478340149, + "learning_rate": 0.00019996883727494823, + "loss": 0.5468, + "step": 5170 + }, + { + "epoch": 0.1730358097274185, + "grad_norm": 0.9305787682533264, + "learning_rate": 0.000199967911290888, + "loss": 0.5711, + "step": 5180 + }, + { + "epoch": 0.17336985569214325, + "grad_norm": 0.9401747584342957, + "learning_rate": 0.00019996697175214974, + "loss": 0.5289, + "step": 5190 + }, + { + "epoch": 0.173703901656868, + "grad_norm": 0.9479665160179138, + "learning_rate": 0.00019996601865886077, + "loss": 0.5602, + "step": 5200 + }, + { + "epoch": 0.17403794762159272, + "grad_norm": 0.8901471495628357, + "learning_rate": 0.0001999650520111504, + "loss": 0.5567, + "step": 5210 + }, + { + "epoch": 0.17437199358631747, + "grad_norm": 0.8577725887298584, + "learning_rate": 0.00019996407180914968, + "loss": 0.5838, + "step": 5220 + }, + { + "epoch": 0.17470603955104222, + "grad_norm": 0.9094706773757935, + "learning_rate": 0.00019996307805299148, + "loss": 0.5209, + "step": 5230 + }, + { + "epoch": 0.17504008551576697, + "grad_norm": 0.9517655968666077, + "learning_rate": 0.0001999620707428106, + "loss": 0.5742, + "step": 5240 + }, + { + "epoch": 0.17537413148049172, + "grad_norm": 1.0879467725753784, + "learning_rate": 0.00019996104987874354, + "loss": 0.5472, + "step": 5250 + }, + { + "epoch": 0.17570817744521647, + "grad_norm": 0.9377973675727844, + "learning_rate": 0.00019996001546092882, + "loss": 0.5485, + "step": 5260 + }, + { + "epoch": 0.17604222340994122, + "grad_norm": 0.9744547009468079, + "learning_rate": 0.00019995896748950664, + "loss": 0.5578, + "step": 5270 + }, + { + "epoch": 0.17637626937466597, + "grad_norm": 0.9136465787887573, + "learning_rate": 0.0001999579059646191, + "loss": 0.5955, + "step": 5280 + }, + { + "epoch": 0.1767103153393907, + "grad_norm": 0.935205340385437, + "learning_rate": 0.0001999568308864101, + "loss": 0.5221, + "step": 5290 + }, + { + "epoch": 0.17704436130411544, + "grad_norm": 0.8779012560844421, + "learning_rate": 0.0001999557422550255, + "loss": 0.5904, + "step": 5300 + }, + { + "epoch": 0.1773784072688402, + "grad_norm": 0.916634202003479, + "learning_rate": 0.00019995464007061282, + "loss": 0.5627, + "step": 5310 + }, + { + "epoch": 0.17771245323356494, + "grad_norm": 0.8924671411514282, + "learning_rate": 0.00019995352433332155, + "loss": 0.5468, + "step": 5320 + }, + { + "epoch": 0.1780464991982897, + "grad_norm": 1.0858147144317627, + "learning_rate": 0.00019995239504330296, + "loss": 0.5349, + "step": 5330 + }, + { + "epoch": 0.17838054516301444, + "grad_norm": 0.9470742344856262, + "learning_rate": 0.00019995125220071016, + "loss": 0.5424, + "step": 5340 + }, + { + "epoch": 0.1787145911277392, + "grad_norm": 0.9459046125411987, + "learning_rate": 0.00019995009580569813, + "loss": 0.523, + "step": 5350 + }, + { + "epoch": 0.17904863709246394, + "grad_norm": 0.8642439246177673, + "learning_rate": 0.00019994892585842365, + "loss": 0.5266, + "step": 5360 + }, + { + "epoch": 0.17938268305718866, + "grad_norm": 0.9070957899093628, + "learning_rate": 0.00019994774235904537, + "loss": 0.5358, + "step": 5370 + }, + { + "epoch": 0.1797167290219134, + "grad_norm": 0.9734053015708923, + "learning_rate": 0.00019994654530772375, + "loss": 0.5604, + "step": 5380 + }, + { + "epoch": 0.18005077498663816, + "grad_norm": 1.0013736486434937, + "learning_rate": 0.0001999453347046211, + "loss": 0.5655, + "step": 5390 + }, + { + "epoch": 0.1803848209513629, + "grad_norm": 1.005748987197876, + "learning_rate": 0.00019994411054990153, + "loss": 0.5487, + "step": 5400 + }, + { + "epoch": 0.18071886691608766, + "grad_norm": 1.0022681951522827, + "learning_rate": 0.00019994287284373112, + "loss": 0.5489, + "step": 5410 + }, + { + "epoch": 0.1810529128808124, + "grad_norm": 1.0808484554290771, + "learning_rate": 0.0001999416215862776, + "loss": 0.6072, + "step": 5420 + }, + { + "epoch": 0.18138695884553716, + "grad_norm": 0.9586472511291504, + "learning_rate": 0.00019994035677771066, + "loss": 0.5337, + "step": 5430 + }, + { + "epoch": 0.1817210048102619, + "grad_norm": 0.9273195266723633, + "learning_rate": 0.00019993907841820178, + "loss": 0.5596, + "step": 5440 + }, + { + "epoch": 0.18205505077498663, + "grad_norm": 0.9311820864677429, + "learning_rate": 0.00019993778650792435, + "loss": 0.5586, + "step": 5450 + }, + { + "epoch": 0.18238909673971138, + "grad_norm": 1.0979118347167969, + "learning_rate": 0.00019993648104705346, + "loss": 0.5245, + "step": 5460 + }, + { + "epoch": 0.18272314270443613, + "grad_norm": 1.0274357795715332, + "learning_rate": 0.00019993516203576617, + "loss": 0.51, + "step": 5470 + }, + { + "epoch": 0.18305718866916088, + "grad_norm": 0.9717475175857544, + "learning_rate": 0.00019993382947424128, + "loss": 0.5451, + "step": 5480 + }, + { + "epoch": 0.18339123463388562, + "grad_norm": 1.0376917123794556, + "learning_rate": 0.00019993248336265955, + "loss": 0.5123, + "step": 5490 + }, + { + "epoch": 0.18372528059861037, + "grad_norm": 0.9874864816665649, + "learning_rate": 0.00019993112370120342, + "loss": 0.5437, + "step": 5500 + }, + { + "epoch": 0.18405932656333512, + "grad_norm": 0.8851433992385864, + "learning_rate": 0.00019992975049005728, + "loss": 0.5898, + "step": 5510 + }, + { + "epoch": 0.18439337252805987, + "grad_norm": 0.9328639507293701, + "learning_rate": 0.00019992836372940734, + "loss": 0.5679, + "step": 5520 + }, + { + "epoch": 0.1847274184927846, + "grad_norm": 0.9938822388648987, + "learning_rate": 0.00019992696341944162, + "loss": 0.518, + "step": 5530 + }, + { + "epoch": 0.18506146445750934, + "grad_norm": 0.9014500975608826, + "learning_rate": 0.00019992554956034994, + "loss": 0.5888, + "step": 5540 + }, + { + "epoch": 0.1853955104222341, + "grad_norm": 0.9075184464454651, + "learning_rate": 0.0001999241221523241, + "loss": 0.5753, + "step": 5550 + }, + { + "epoch": 0.18572955638695884, + "grad_norm": 1.113573431968689, + "learning_rate": 0.00019992268119555756, + "loss": 0.5503, + "step": 5560 + }, + { + "epoch": 0.1860636023516836, + "grad_norm": 0.8532381653785706, + "learning_rate": 0.00019992122669024574, + "loss": 0.5435, + "step": 5570 + }, + { + "epoch": 0.18639764831640834, + "grad_norm": 0.9219514727592468, + "learning_rate": 0.00019991975863658588, + "loss": 0.6068, + "step": 5580 + }, + { + "epoch": 0.1867316942811331, + "grad_norm": 0.9973188638687134, + "learning_rate": 0.00019991827703477697, + "loss": 0.547, + "step": 5590 + }, + { + "epoch": 0.18706574024585784, + "grad_norm": 0.8391416072845459, + "learning_rate": 0.00019991678188501993, + "loss": 0.5904, + "step": 5600 + }, + { + "epoch": 0.18739978621058256, + "grad_norm": 0.8709014654159546, + "learning_rate": 0.0001999152731875175, + "loss": 0.5222, + "step": 5610 + }, + { + "epoch": 0.1877338321753073, + "grad_norm": 0.8123546838760376, + "learning_rate": 0.00019991375094247421, + "loss": 0.5837, + "step": 5620 + }, + { + "epoch": 0.18806787814003206, + "grad_norm": 0.9549054503440857, + "learning_rate": 0.00019991221515009656, + "loss": 0.5702, + "step": 5630 + }, + { + "epoch": 0.1884019241047568, + "grad_norm": 1.1201049089431763, + "learning_rate": 0.00019991066581059267, + "loss": 0.5996, + "step": 5640 + }, + { + "epoch": 0.18873597006948156, + "grad_norm": 1.0529861450195312, + "learning_rate": 0.00019990910292417264, + "loss": 0.5494, + "step": 5650 + }, + { + "epoch": 0.1890700160342063, + "grad_norm": 1.0167558193206787, + "learning_rate": 0.0001999075264910484, + "loss": 0.5724, + "step": 5660 + }, + { + "epoch": 0.18940406199893106, + "grad_norm": 0.892126202583313, + "learning_rate": 0.00019990593651143376, + "loss": 0.5592, + "step": 5670 + }, + { + "epoch": 0.1897381079636558, + "grad_norm": 0.9634349346160889, + "learning_rate": 0.00019990433298554418, + "loss": 0.5325, + "step": 5680 + }, + { + "epoch": 0.19007215392838053, + "grad_norm": 1.014253854751587, + "learning_rate": 0.0001999027159135972, + "loss": 0.5643, + "step": 5690 + }, + { + "epoch": 0.19040619989310528, + "grad_norm": 0.95102459192276, + "learning_rate": 0.000199901085295812, + "loss": 0.549, + "step": 5700 + }, + { + "epoch": 0.19074024585783003, + "grad_norm": 0.9939152002334595, + "learning_rate": 0.00019989944113240973, + "loss": 0.5239, + "step": 5710 + }, + { + "epoch": 0.19107429182255478, + "grad_norm": 0.9889900088310242, + "learning_rate": 0.00019989778342361327, + "loss": 0.5632, + "step": 5720 + }, + { + "epoch": 0.19140833778727953, + "grad_norm": 0.9129937887191772, + "learning_rate": 0.00019989611216964743, + "loss": 0.5376, + "step": 5730 + }, + { + "epoch": 0.19174238375200428, + "grad_norm": 0.9177786111831665, + "learning_rate": 0.0001998944273707388, + "loss": 0.5344, + "step": 5740 + }, + { + "epoch": 0.19207642971672903, + "grad_norm": 0.8723755478858948, + "learning_rate": 0.00019989272902711585, + "loss": 0.4998, + "step": 5750 + }, + { + "epoch": 0.19241047568145378, + "grad_norm": 0.9387904405593872, + "learning_rate": 0.0001998910171390088, + "loss": 0.5727, + "step": 5760 + }, + { + "epoch": 0.1927445216461785, + "grad_norm": 1.2100673913955688, + "learning_rate": 0.0001998892917066498, + "loss": 0.5318, + "step": 5770 + }, + { + "epoch": 0.19307856761090325, + "grad_norm": 1.0420572757720947, + "learning_rate": 0.0001998875527302728, + "loss": 0.5429, + "step": 5780 + }, + { + "epoch": 0.193412613575628, + "grad_norm": 1.0020521879196167, + "learning_rate": 0.00019988580021011365, + "loss": 0.557, + "step": 5790 + }, + { + "epoch": 0.19374665954035275, + "grad_norm": 0.9074615240097046, + "learning_rate": 0.00019988403414640987, + "loss": 0.5702, + "step": 5800 + }, + { + "epoch": 0.1940807055050775, + "grad_norm": 0.9029930233955383, + "learning_rate": 0.00019988225453940097, + "loss": 0.5444, + "step": 5810 + }, + { + "epoch": 0.19441475146980225, + "grad_norm": 0.9554100036621094, + "learning_rate": 0.00019988046138932824, + "loss": 0.5326, + "step": 5820 + }, + { + "epoch": 0.194748797434527, + "grad_norm": 1.1509687900543213, + "learning_rate": 0.0001998786546964348, + "loss": 0.5732, + "step": 5830 + }, + { + "epoch": 0.19508284339925175, + "grad_norm": 1.0018824338912964, + "learning_rate": 0.00019987683446096567, + "loss": 0.5149, + "step": 5840 + }, + { + "epoch": 0.19541688936397647, + "grad_norm": 0.9956066012382507, + "learning_rate": 0.0001998750006831676, + "loss": 0.5439, + "step": 5850 + }, + { + "epoch": 0.19575093532870122, + "grad_norm": 1.0206962823867798, + "learning_rate": 0.0001998731533632893, + "loss": 0.5577, + "step": 5860 + }, + { + "epoch": 0.19608498129342597, + "grad_norm": 1.0022388696670532, + "learning_rate": 0.00019987129250158115, + "loss": 0.5597, + "step": 5870 + }, + { + "epoch": 0.19641902725815072, + "grad_norm": 0.8789455890655518, + "learning_rate": 0.00019986941809829555, + "loss": 0.5594, + "step": 5880 + }, + { + "epoch": 0.19675307322287547, + "grad_norm": 0.8609839677810669, + "learning_rate": 0.0001998675301536866, + "loss": 0.5458, + "step": 5890 + }, + { + "epoch": 0.19708711918760022, + "grad_norm": 0.9823794364929199, + "learning_rate": 0.00019986562866801033, + "loss": 0.5506, + "step": 5900 + }, + { + "epoch": 0.19742116515232497, + "grad_norm": 0.9742714166641235, + "learning_rate": 0.00019986371364152453, + "loss": 0.4888, + "step": 5910 + }, + { + "epoch": 0.19775521111704972, + "grad_norm": 1.0452237129211426, + "learning_rate": 0.00019986178507448886, + "loss": 0.5619, + "step": 5920 + }, + { + "epoch": 0.19808925708177444, + "grad_norm": 0.8462994694709778, + "learning_rate": 0.00019985984296716483, + "loss": 0.5362, + "step": 5930 + }, + { + "epoch": 0.1984233030464992, + "grad_norm": 0.8899021744728088, + "learning_rate": 0.00019985788731981575, + "loss": 0.5783, + "step": 5940 + }, + { + "epoch": 0.19875734901122394, + "grad_norm": 1.1785120964050293, + "learning_rate": 0.00019985591813270683, + "loss": 0.5323, + "step": 5950 + }, + { + "epoch": 0.1990913949759487, + "grad_norm": 0.8845856785774231, + "learning_rate": 0.00019985393540610503, + "loss": 0.5742, + "step": 5960 + }, + { + "epoch": 0.19942544094067344, + "grad_norm": 0.9637285470962524, + "learning_rate": 0.0001998519391402792, + "loss": 0.5706, + "step": 5970 + }, + { + "epoch": 0.1997594869053982, + "grad_norm": 0.955951452255249, + "learning_rate": 0.0001998499293355, + "loss": 0.5418, + "step": 5980 + }, + { + "epoch": 0.20009353287012294, + "grad_norm": 1.1062010526657104, + "learning_rate": 0.00019984790599204, + "loss": 0.5045, + "step": 5990 + }, + { + "epoch": 0.20042757883484769, + "grad_norm": 0.9472284317016602, + "learning_rate": 0.00019984586911017349, + "loss": 0.5515, + "step": 6000 + }, + { + "epoch": 0.20042757883484769, + "eval_chrf": 83.38033080513087, + "eval_loss": 0.9294881224632263, + "eval_runtime": 174.2986, + "eval_samples_per_second": 0.574, + "eval_steps_per_second": 0.023, + "step": 6000 + }, + { + "epoch": 0.2007616247995724, + "grad_norm": 0.8701378703117371, + "learning_rate": 0.00019984381869017666, + "loss": 0.5367, + "step": 6010 + }, + { + "epoch": 0.20109567076429716, + "grad_norm": 0.8307036757469177, + "learning_rate": 0.00019984175473232752, + "loss": 0.5283, + "step": 6020 + }, + { + "epoch": 0.2014297167290219, + "grad_norm": 0.9423578977584839, + "learning_rate": 0.00019983967723690595, + "loss": 0.5367, + "step": 6030 + }, + { + "epoch": 0.20176376269374666, + "grad_norm": 0.9621785879135132, + "learning_rate": 0.0001998375862041936, + "loss": 0.5826, + "step": 6040 + }, + { + "epoch": 0.2020978086584714, + "grad_norm": 0.9973872900009155, + "learning_rate": 0.00019983548163447404, + "loss": 0.5461, + "step": 6050 + }, + { + "epoch": 0.20243185462319616, + "grad_norm": 1.0017222166061401, + "learning_rate": 0.00019983336352803258, + "loss": 0.5259, + "step": 6060 + }, + { + "epoch": 0.2027659005879209, + "grad_norm": 0.8502051830291748, + "learning_rate": 0.00019983123188515645, + "loss": 0.5497, + "step": 6070 + }, + { + "epoch": 0.20309994655264565, + "grad_norm": 0.9297699332237244, + "learning_rate": 0.0001998290867061347, + "loss": 0.5371, + "step": 6080 + }, + { + "epoch": 0.20343399251737038, + "grad_norm": 0.9416469931602478, + "learning_rate": 0.00019982692799125814, + "loss": 0.5294, + "step": 6090 + }, + { + "epoch": 0.20376803848209513, + "grad_norm": 0.8366502523422241, + "learning_rate": 0.00019982475574081948, + "loss": 0.5359, + "step": 6100 + }, + { + "epoch": 0.20410208444681988, + "grad_norm": 0.8927239775657654, + "learning_rate": 0.00019982256995511332, + "loss": 0.5276, + "step": 6110 + }, + { + "epoch": 0.20443613041154463, + "grad_norm": 0.8132355809211731, + "learning_rate": 0.00019982037063443595, + "loss": 0.5224, + "step": 6120 + }, + { + "epoch": 0.20477017637626937, + "grad_norm": 0.9435505270957947, + "learning_rate": 0.0001998181577790856, + "loss": 0.5468, + "step": 6130 + }, + { + "epoch": 0.20510422234099412, + "grad_norm": 0.984864354133606, + "learning_rate": 0.00019981593138936233, + "loss": 0.5377, + "step": 6140 + }, + { + "epoch": 0.20543826830571887, + "grad_norm": 0.8152579069137573, + "learning_rate": 0.00019981369146556804, + "loss": 0.5376, + "step": 6150 + }, + { + "epoch": 0.20577231427044362, + "grad_norm": 1.0017338991165161, + "learning_rate": 0.00019981143800800637, + "loss": 0.5162, + "step": 6160 + }, + { + "epoch": 0.20610636023516835, + "grad_norm": 0.8106871843338013, + "learning_rate": 0.00019980917101698292, + "loss": 0.4926, + "step": 6170 + }, + { + "epoch": 0.2064404061998931, + "grad_norm": 0.9428711533546448, + "learning_rate": 0.00019980689049280506, + "loss": 0.5411, + "step": 6180 + }, + { + "epoch": 0.20677445216461784, + "grad_norm": 0.870675265789032, + "learning_rate": 0.00019980459643578202, + "loss": 0.5507, + "step": 6190 + }, + { + "epoch": 0.2071084981293426, + "grad_norm": 0.965762734413147, + "learning_rate": 0.0001998022888462248, + "loss": 0.5979, + "step": 6200 + }, + { + "epoch": 0.20744254409406734, + "grad_norm": 1.1723816394805908, + "learning_rate": 0.00019979996772444636, + "loss": 0.5356, + "step": 6210 + }, + { + "epoch": 0.2077765900587921, + "grad_norm": 1.036665916442871, + "learning_rate": 0.00019979763307076138, + "loss": 0.5672, + "step": 6220 + }, + { + "epoch": 0.20811063602351684, + "grad_norm": 0.6806384325027466, + "learning_rate": 0.0001997952848854864, + "loss": 0.5186, + "step": 6230 + }, + { + "epoch": 0.2084446819882416, + "grad_norm": 1.0470300912857056, + "learning_rate": 0.00019979292316893988, + "loss": 0.5466, + "step": 6240 + }, + { + "epoch": 0.20877872795296634, + "grad_norm": 0.8822453022003174, + "learning_rate": 0.00019979054792144195, + "loss": 0.5188, + "step": 6250 + }, + { + "epoch": 0.20911277391769106, + "grad_norm": 0.9346125721931458, + "learning_rate": 0.00019978815914331474, + "loss": 0.5531, + "step": 6260 + }, + { + "epoch": 0.2094468198824158, + "grad_norm": 0.9940763711929321, + "learning_rate": 0.00019978575683488213, + "loss": 0.5513, + "step": 6270 + }, + { + "epoch": 0.20978086584714056, + "grad_norm": 1.0559648275375366, + "learning_rate": 0.00019978334099646987, + "loss": 0.5548, + "step": 6280 + }, + { + "epoch": 0.2101149118118653, + "grad_norm": 1.1641331911087036, + "learning_rate": 0.00019978091162840546, + "loss": 0.5489, + "step": 6290 + }, + { + "epoch": 0.21044895777659006, + "grad_norm": 0.8385947346687317, + "learning_rate": 0.00019977846873101837, + "loss": 0.5614, + "step": 6300 + }, + { + "epoch": 0.2107830037413148, + "grad_norm": 0.8233581185340881, + "learning_rate": 0.00019977601230463976, + "loss": 0.5331, + "step": 6310 + }, + { + "epoch": 0.21111704970603956, + "grad_norm": 0.847217857837677, + "learning_rate": 0.00019977354234960277, + "loss": 0.5175, + "step": 6320 + }, + { + "epoch": 0.2114510956707643, + "grad_norm": 1.0424152612686157, + "learning_rate": 0.00019977105886624227, + "loss": 0.5802, + "step": 6330 + }, + { + "epoch": 0.21178514163548903, + "grad_norm": 1.016667127609253, + "learning_rate": 0.00019976856185489498, + "loss": 0.5554, + "step": 6340 + }, + { + "epoch": 0.21211918760021378, + "grad_norm": 1.1013553142547607, + "learning_rate": 0.00019976605131589948, + "loss": 0.5562, + "step": 6350 + }, + { + "epoch": 0.21245323356493853, + "grad_norm": 0.9689114689826965, + "learning_rate": 0.00019976352724959621, + "loss": 0.536, + "step": 6360 + }, + { + "epoch": 0.21278727952966328, + "grad_norm": 1.0057661533355713, + "learning_rate": 0.00019976098965632737, + "loss": 0.5259, + "step": 6370 + }, + { + "epoch": 0.21312132549438803, + "grad_norm": 0.8176321983337402, + "learning_rate": 0.00019975843853643706, + "loss": 0.5202, + "step": 6380 + }, + { + "epoch": 0.21345537145911278, + "grad_norm": 0.8424526453018188, + "learning_rate": 0.00019975587389027116, + "loss": 0.5257, + "step": 6390 + }, + { + "epoch": 0.21378941742383753, + "grad_norm": 1.1306337118148804, + "learning_rate": 0.0001997532957181774, + "loss": 0.5392, + "step": 6400 + }, + { + "epoch": 0.21412346338856228, + "grad_norm": 0.8378580212593079, + "learning_rate": 0.0001997507040205054, + "loss": 0.5141, + "step": 6410 + }, + { + "epoch": 0.214457509353287, + "grad_norm": 0.9622635245323181, + "learning_rate": 0.00019974809879760654, + "loss": 0.5369, + "step": 6420 + }, + { + "epoch": 0.21479155531801175, + "grad_norm": 0.9061731696128845, + "learning_rate": 0.0001997454800498341, + "loss": 0.5528, + "step": 6430 + }, + { + "epoch": 0.2151256012827365, + "grad_norm": 1.0778069496154785, + "learning_rate": 0.00019974284777754306, + "loss": 0.547, + "step": 6440 + }, + { + "epoch": 0.21545964724746125, + "grad_norm": 0.9020702838897705, + "learning_rate": 0.00019974020198109044, + "loss": 0.5354, + "step": 6450 + }, + { + "epoch": 0.215793693212186, + "grad_norm": 0.840465784072876, + "learning_rate": 0.00019973754266083496, + "loss": 0.5129, + "step": 6460 + }, + { + "epoch": 0.21612773917691075, + "grad_norm": 0.9829301834106445, + "learning_rate": 0.00019973486981713712, + "loss": 0.5074, + "step": 6470 + }, + { + "epoch": 0.2164617851416355, + "grad_norm": 0.9750456213951111, + "learning_rate": 0.00019973218345035941, + "loss": 0.5057, + "step": 6480 + }, + { + "epoch": 0.21679583110636025, + "grad_norm": 0.9136343002319336, + "learning_rate": 0.0001997294835608661, + "loss": 0.5076, + "step": 6490 + }, + { + "epoch": 0.21712987707108497, + "grad_norm": 0.7194660902023315, + "learning_rate": 0.0001997267701490232, + "loss": 0.5283, + "step": 6500 + }, + { + "epoch": 0.21746392303580972, + "grad_norm": 0.9144142270088196, + "learning_rate": 0.00019972404321519865, + "loss": 0.5345, + "step": 6510 + }, + { + "epoch": 0.21779796900053447, + "grad_norm": 0.9766556620597839, + "learning_rate": 0.00019972130275976215, + "loss": 0.5373, + "step": 6520 + }, + { + "epoch": 0.21813201496525922, + "grad_norm": 0.8629313111305237, + "learning_rate": 0.00019971854878308537, + "loss": 0.5456, + "step": 6530 + }, + { + "epoch": 0.21846606092998397, + "grad_norm": 0.8549830317497253, + "learning_rate": 0.00019971578128554165, + "loss": 0.5117, + "step": 6540 + }, + { + "epoch": 0.21880010689470872, + "grad_norm": 0.8491371273994446, + "learning_rate": 0.00019971300026750628, + "loss": 0.5277, + "step": 6550 + }, + { + "epoch": 0.21913415285943347, + "grad_norm": 0.829200267791748, + "learning_rate": 0.00019971020572935632, + "loss": 0.5563, + "step": 6560 + }, + { + "epoch": 0.21946819882415822, + "grad_norm": 0.8701381087303162, + "learning_rate": 0.00019970739767147065, + "loss": 0.5028, + "step": 6570 + }, + { + "epoch": 0.21980224478888294, + "grad_norm": 0.8372440338134766, + "learning_rate": 0.00019970457609423008, + "loss": 0.5107, + "step": 6580 + }, + { + "epoch": 0.2201362907536077, + "grad_norm": 0.9000784754753113, + "learning_rate": 0.00019970174099801714, + "loss": 0.499, + "step": 6590 + }, + { + "epoch": 0.22047033671833244, + "grad_norm": 1.0054694414138794, + "learning_rate": 0.00019969889238321626, + "loss": 0.4963, + "step": 6600 + }, + { + "epoch": 0.2208043826830572, + "grad_norm": 0.9125935435295105, + "learning_rate": 0.00019969603025021368, + "loss": 0.5281, + "step": 6610 + }, + { + "epoch": 0.22113842864778194, + "grad_norm": 0.9492520093917847, + "learning_rate": 0.0001996931545993975, + "loss": 0.5054, + "step": 6620 + }, + { + "epoch": 0.2214724746125067, + "grad_norm": 0.9592035412788391, + "learning_rate": 0.00019969026543115756, + "loss": 0.5563, + "step": 6630 + }, + { + "epoch": 0.22180652057723144, + "grad_norm": 0.8529420495033264, + "learning_rate": 0.0001996873627458857, + "loss": 0.538, + "step": 6640 + }, + { + "epoch": 0.22214056654195619, + "grad_norm": 0.8216513991355896, + "learning_rate": 0.00019968444654397543, + "loss": 0.5149, + "step": 6650 + }, + { + "epoch": 0.2224746125066809, + "grad_norm": 0.8260053396224976, + "learning_rate": 0.00019968151682582218, + "loss": 0.5194, + "step": 6660 + }, + { + "epoch": 0.22280865847140566, + "grad_norm": 1.0038799047470093, + "learning_rate": 0.0001996785735918232, + "loss": 0.4978, + "step": 6670 + }, + { + "epoch": 0.2231427044361304, + "grad_norm": 0.7800386548042297, + "learning_rate": 0.0001996756168423775, + "loss": 0.5398, + "step": 6680 + }, + { + "epoch": 0.22347675040085516, + "grad_norm": 0.8242761492729187, + "learning_rate": 0.0001996726465778861, + "loss": 0.4902, + "step": 6690 + }, + { + "epoch": 0.2238107963655799, + "grad_norm": 0.8866536617279053, + "learning_rate": 0.00019966966279875162, + "loss": 0.5093, + "step": 6700 + }, + { + "epoch": 0.22414484233030466, + "grad_norm": 0.954418420791626, + "learning_rate": 0.00019966666550537876, + "loss": 0.5853, + "step": 6710 + }, + { + "epoch": 0.2244788882950294, + "grad_norm": 0.9801604151725769, + "learning_rate": 0.0001996636546981738, + "loss": 0.4999, + "step": 6720 + }, + { + "epoch": 0.22481293425975415, + "grad_norm": 0.7978387475013733, + "learning_rate": 0.00019966063037754505, + "loss": 0.4994, + "step": 6730 + }, + { + "epoch": 0.22514698022447888, + "grad_norm": 0.8911777138710022, + "learning_rate": 0.00019965759254390254, + "loss": 0.5127, + "step": 6740 + }, + { + "epoch": 0.22548102618920363, + "grad_norm": 0.8677434921264648, + "learning_rate": 0.0001996545411976582, + "loss": 0.5291, + "step": 6750 + }, + { + "epoch": 0.22581507215392838, + "grad_norm": 1.0539984703063965, + "learning_rate": 0.00019965147633922578, + "loss": 0.5357, + "step": 6760 + }, + { + "epoch": 0.22614911811865313, + "grad_norm": 0.8764503598213196, + "learning_rate": 0.00019964839796902077, + "loss": 0.5119, + "step": 6770 + }, + { + "epoch": 0.22648316408337787, + "grad_norm": 1.0507168769836426, + "learning_rate": 0.00019964530608746064, + "loss": 0.5364, + "step": 6780 + }, + { + "epoch": 0.22681721004810262, + "grad_norm": 0.9103027582168579, + "learning_rate": 0.00019964220069496456, + "loss": 0.518, + "step": 6790 + }, + { + "epoch": 0.22715125601282737, + "grad_norm": 0.9063647389411926, + "learning_rate": 0.00019963908179195368, + "loss": 0.5473, + "step": 6800 + }, + { + "epoch": 0.22748530197755212, + "grad_norm": 0.915732741355896, + "learning_rate": 0.0001996359493788508, + "loss": 0.5528, + "step": 6810 + }, + { + "epoch": 0.22781934794227685, + "grad_norm": 0.7278220653533936, + "learning_rate": 0.00019963280345608068, + "loss": 0.5252, + "step": 6820 + }, + { + "epoch": 0.2281533939070016, + "grad_norm": 0.9504566192626953, + "learning_rate": 0.00019962964402406988, + "loss": 0.4757, + "step": 6830 + }, + { + "epoch": 0.22848743987172634, + "grad_norm": 0.9354764819145203, + "learning_rate": 0.00019962647108324682, + "loss": 0.5412, + "step": 6840 + }, + { + "epoch": 0.2288214858364511, + "grad_norm": 0.9815635085105896, + "learning_rate": 0.00019962328463404166, + "loss": 0.5458, + "step": 6850 + }, + { + "epoch": 0.22915553180117584, + "grad_norm": 0.8239994645118713, + "learning_rate": 0.00019962008467688645, + "loss": 0.5167, + "step": 6860 + }, + { + "epoch": 0.2294895777659006, + "grad_norm": 0.8384312391281128, + "learning_rate": 0.00019961687121221514, + "loss": 0.5428, + "step": 6870 + }, + { + "epoch": 0.22982362373062534, + "grad_norm": 0.8935899138450623, + "learning_rate": 0.00019961364424046337, + "loss": 0.5134, + "step": 6880 + }, + { + "epoch": 0.2301576696953501, + "grad_norm": 0.8609983325004578, + "learning_rate": 0.00019961040376206878, + "loss": 0.5201, + "step": 6890 + }, + { + "epoch": 0.23049171566007481, + "grad_norm": 0.8071815967559814, + "learning_rate": 0.00019960714977747065, + "loss": 0.558, + "step": 6900 + }, + { + "epoch": 0.23082576162479956, + "grad_norm": 0.9722113013267517, + "learning_rate": 0.00019960388228711023, + "loss": 0.5061, + "step": 6910 + }, + { + "epoch": 0.2311598075895243, + "grad_norm": 0.9357370138168335, + "learning_rate": 0.00019960060129143055, + "loss": 0.5215, + "step": 6920 + }, + { + "epoch": 0.23149385355424906, + "grad_norm": 0.8626118898391724, + "learning_rate": 0.00019959730679087648, + "loss": 0.5058, + "step": 6930 + }, + { + "epoch": 0.2318278995189738, + "grad_norm": 0.910269558429718, + "learning_rate": 0.00019959399878589475, + "loss": 0.5281, + "step": 6940 + }, + { + "epoch": 0.23216194548369856, + "grad_norm": 0.9381657242774963, + "learning_rate": 0.0001995906772769339, + "loss": 0.4996, + "step": 6950 + }, + { + "epoch": 0.2324959914484233, + "grad_norm": 0.9199655652046204, + "learning_rate": 0.00019958734226444423, + "loss": 0.4995, + "step": 6960 + }, + { + "epoch": 0.23283003741314806, + "grad_norm": 0.855209231376648, + "learning_rate": 0.00019958399374887798, + "loss": 0.5313, + "step": 6970 + }, + { + "epoch": 0.23316408337787278, + "grad_norm": 0.9168945550918579, + "learning_rate": 0.0001995806317306892, + "loss": 0.5001, + "step": 6980 + }, + { + "epoch": 0.23349812934259753, + "grad_norm": 0.895139753818512, + "learning_rate": 0.00019957725621033367, + "loss": 0.5261, + "step": 6990 + }, + { + "epoch": 0.23383217530732228, + "grad_norm": 0.8300134539604187, + "learning_rate": 0.00019957386718826915, + "loss": 0.5281, + "step": 7000 + }, + { + "epoch": 0.23383217530732228, + "eval_chrf": 82.26555621342898, + "eval_loss": 0.8867863416671753, + "eval_runtime": 302.4853, + "eval_samples_per_second": 0.331, + "eval_steps_per_second": 0.013, + "step": 7000 + }, + { + "epoch": 0.23416622127204703, + "grad_norm": 0.9571975469589233, + "learning_rate": 0.00019957046466495516, + "loss": 0.5358, + "step": 7010 + }, + { + "epoch": 0.23450026723677178, + "grad_norm": 1.0828726291656494, + "learning_rate": 0.00019956704864085298, + "loss": 0.565, + "step": 7020 + }, + { + "epoch": 0.23483431320149653, + "grad_norm": 0.9249463677406311, + "learning_rate": 0.00019956361911642584, + "loss": 0.5264, + "step": 7030 + }, + { + "epoch": 0.23516835916622128, + "grad_norm": 0.9125162363052368, + "learning_rate": 0.00019956017609213875, + "loss": 0.5387, + "step": 7040 + }, + { + "epoch": 0.23550240513094603, + "grad_norm": 0.9537795186042786, + "learning_rate": 0.00019955671956845857, + "loss": 0.5111, + "step": 7050 + }, + { + "epoch": 0.23583645109567075, + "grad_norm": 0.931634247303009, + "learning_rate": 0.00019955324954585392, + "loss": 0.5091, + "step": 7060 + }, + { + "epoch": 0.2361704970603955, + "grad_norm": 1.0084316730499268, + "learning_rate": 0.0001995497660247953, + "loss": 0.5204, + "step": 7070 + }, + { + "epoch": 0.23650454302512025, + "grad_norm": 0.7660313248634338, + "learning_rate": 0.0001995462690057551, + "loss": 0.5507, + "step": 7080 + }, + { + "epoch": 0.236838588989845, + "grad_norm": 1.102610468864441, + "learning_rate": 0.00019954275848920746, + "loss": 0.5208, + "step": 7090 + }, + { + "epoch": 0.23717263495456975, + "grad_norm": 0.8453715443611145, + "learning_rate": 0.00019953923447562835, + "loss": 0.4757, + "step": 7100 + }, + { + "epoch": 0.2375066809192945, + "grad_norm": 0.9575565457344055, + "learning_rate": 0.00019953569696549558, + "loss": 0.4775, + "step": 7110 + }, + { + "epoch": 0.23784072688401925, + "grad_norm": 0.9582642316818237, + "learning_rate": 0.00019953214595928883, + "loss": 0.5327, + "step": 7120 + }, + { + "epoch": 0.238174772848744, + "grad_norm": 0.8354542851448059, + "learning_rate": 0.0001995285814574896, + "loss": 0.5135, + "step": 7130 + }, + { + "epoch": 0.23850881881346872, + "grad_norm": 0.9941251873970032, + "learning_rate": 0.00019952500346058117, + "loss": 0.5287, + "step": 7140 + }, + { + "epoch": 0.23884286477819347, + "grad_norm": 0.8007171154022217, + "learning_rate": 0.00019952141196904867, + "loss": 0.5269, + "step": 7150 + }, + { + "epoch": 0.23917691074291822, + "grad_norm": 0.8905311226844788, + "learning_rate": 0.00019951780698337915, + "loss": 0.5618, + "step": 7160 + }, + { + "epoch": 0.23951095670764297, + "grad_norm": 0.7775999307632446, + "learning_rate": 0.0001995141885040613, + "loss": 0.4816, + "step": 7170 + }, + { + "epoch": 0.23984500267236772, + "grad_norm": 0.8820524215698242, + "learning_rate": 0.0001995105565315858, + "loss": 0.4873, + "step": 7180 + }, + { + "epoch": 0.24017904863709247, + "grad_norm": 0.925364077091217, + "learning_rate": 0.00019950691106644514, + "loss": 0.519, + "step": 7190 + }, + { + "epoch": 0.24051309460181722, + "grad_norm": 0.8523632287979126, + "learning_rate": 0.00019950325210913355, + "loss": 0.5518, + "step": 7200 + }, + { + "epoch": 0.24084714056654197, + "grad_norm": 0.8461365103721619, + "learning_rate": 0.00019949957966014723, + "loss": 0.5225, + "step": 7210 + }, + { + "epoch": 0.2411811865312667, + "grad_norm": 0.7926530838012695, + "learning_rate": 0.00019949589371998406, + "loss": 0.5111, + "step": 7220 + }, + { + "epoch": 0.24151523249599144, + "grad_norm": 0.8698731660842896, + "learning_rate": 0.00019949219428914386, + "loss": 0.4795, + "step": 7230 + }, + { + "epoch": 0.2418492784607162, + "grad_norm": 0.6940591335296631, + "learning_rate": 0.0001994884813681282, + "loss": 0.5522, + "step": 7240 + }, + { + "epoch": 0.24218332442544094, + "grad_norm": 0.8640892505645752, + "learning_rate": 0.0001994847549574405, + "loss": 0.5187, + "step": 7250 + }, + { + "epoch": 0.2425173703901657, + "grad_norm": 0.7517714500427246, + "learning_rate": 0.0001994810150575861, + "loss": 0.4939, + "step": 7260 + }, + { + "epoch": 0.24285141635489044, + "grad_norm": 0.9727846384048462, + "learning_rate": 0.000199477261669072, + "loss": 0.5264, + "step": 7270 + }, + { + "epoch": 0.2431854623196152, + "grad_norm": 0.7612695693969727, + "learning_rate": 0.00019947349479240725, + "loss": 0.5478, + "step": 7280 + }, + { + "epoch": 0.24351950828433994, + "grad_norm": 0.9633700847625732, + "learning_rate": 0.00019946971442810245, + "loss": 0.4749, + "step": 7290 + }, + { + "epoch": 0.24385355424906466, + "grad_norm": 0.9143096804618835, + "learning_rate": 0.0001994659205766703, + "loss": 0.529, + "step": 7300 + }, + { + "epoch": 0.2441876002137894, + "grad_norm": 0.8851656913757324, + "learning_rate": 0.00019946211323862515, + "loss": 0.4996, + "step": 7310 + }, + { + "epoch": 0.24452164617851416, + "grad_norm": 0.8721536993980408, + "learning_rate": 0.00019945829241448327, + "loss": 0.5154, + "step": 7320 + }, + { + "epoch": 0.2448556921432389, + "grad_norm": 0.6937206983566284, + "learning_rate": 0.0001994544581047627, + "loss": 0.5255, + "step": 7330 + }, + { + "epoch": 0.24518973810796366, + "grad_norm": 0.8057401180267334, + "learning_rate": 0.00019945061030998335, + "loss": 0.499, + "step": 7340 + }, + { + "epoch": 0.2455237840726884, + "grad_norm": 1.0023798942565918, + "learning_rate": 0.00019944674903066693, + "loss": 0.5246, + "step": 7350 + }, + { + "epoch": 0.24585783003741316, + "grad_norm": 0.8678934574127197, + "learning_rate": 0.000199442874267337, + "loss": 0.5173, + "step": 7360 + }, + { + "epoch": 0.2461918760021379, + "grad_norm": 0.9343276619911194, + "learning_rate": 0.00019943898602051897, + "loss": 0.5033, + "step": 7370 + }, + { + "epoch": 0.24652592196686263, + "grad_norm": 0.7583227753639221, + "learning_rate": 0.00019943508429074, + "loss": 0.5298, + "step": 7380 + }, + { + "epoch": 0.24685996793158738, + "grad_norm": 0.9024145007133484, + "learning_rate": 0.00019943116907852914, + "loss": 0.5045, + "step": 7390 + }, + { + "epoch": 0.24719401389631213, + "grad_norm": 0.8382844924926758, + "learning_rate": 0.0001994272403844173, + "loss": 0.4987, + "step": 7400 + }, + { + "epoch": 0.24752805986103688, + "grad_norm": 0.9915279746055603, + "learning_rate": 0.00019942329820893707, + "loss": 0.5004, + "step": 7410 + }, + { + "epoch": 0.24786210582576162, + "grad_norm": 0.9732442498207092, + "learning_rate": 0.0001994193425526231, + "loss": 0.5292, + "step": 7420 + }, + { + "epoch": 0.24819615179048637, + "grad_norm": 0.8704445362091064, + "learning_rate": 0.00019941537341601161, + "loss": 0.5081, + "step": 7430 + }, + { + "epoch": 0.24853019775521112, + "grad_norm": 1.026589035987854, + "learning_rate": 0.0001994113907996409, + "loss": 0.4825, + "step": 7440 + }, + { + "epoch": 0.24886424371993587, + "grad_norm": 1.0495578050613403, + "learning_rate": 0.00019940739470405088, + "loss": 0.5363, + "step": 7450 + }, + { + "epoch": 0.2491982896846606, + "grad_norm": 0.7614114284515381, + "learning_rate": 0.0001994033851297834, + "loss": 0.469, + "step": 7460 + }, + { + "epoch": 0.24953233564938535, + "grad_norm": 0.9039644598960876, + "learning_rate": 0.0001993993620773822, + "loss": 0.5318, + "step": 7470 + }, + { + "epoch": 0.2498663816141101, + "grad_norm": 0.7976282238960266, + "learning_rate": 0.00019939532554739264, + "loss": 0.5105, + "step": 7480 + }, + { + "epoch": 0.25020042757883487, + "grad_norm": 0.7901231646537781, + "learning_rate": 0.0001993912755403621, + "loss": 0.531, + "step": 7490 + }, + { + "epoch": 0.2505344735435596, + "grad_norm": 1.0002256631851196, + "learning_rate": 0.00019938721205683976, + "loss": 0.519, + "step": 7500 + }, + { + "epoch": 0.2508685195082843, + "grad_norm": 0.7858566641807556, + "learning_rate": 0.0001993831350973765, + "loss": 0.5267, + "step": 7510 + }, + { + "epoch": 0.25120256547300907, + "grad_norm": 0.9473349452018738, + "learning_rate": 0.00019937904466252521, + "loss": 0.5271, + "step": 7520 + }, + { + "epoch": 0.2515366114377338, + "grad_norm": 0.7414092421531677, + "learning_rate": 0.00019937494075284042, + "loss": 0.5194, + "step": 7530 + }, + { + "epoch": 0.25187065740245856, + "grad_norm": 0.9927947521209717, + "learning_rate": 0.00019937082336887868, + "loss": 0.471, + "step": 7540 + }, + { + "epoch": 0.2522047033671833, + "grad_norm": 0.7895140051841736, + "learning_rate": 0.00019936669251119818, + "loss": 0.4954, + "step": 7550 + }, + { + "epoch": 0.25253874933190806, + "grad_norm": 0.9787417650222778, + "learning_rate": 0.00019936254818035906, + "loss": 0.5239, + "step": 7560 + }, + { + "epoch": 0.2528727952966328, + "grad_norm": 0.9364962577819824, + "learning_rate": 0.00019935839037692326, + "loss": 0.4562, + "step": 7570 + }, + { + "epoch": 0.25320684126135756, + "grad_norm": 0.8818072080612183, + "learning_rate": 0.00019935421910145448, + "loss": 0.5295, + "step": 7580 + }, + { + "epoch": 0.2535408872260823, + "grad_norm": 0.8475042581558228, + "learning_rate": 0.0001993500343545184, + "loss": 0.5077, + "step": 7590 + }, + { + "epoch": 0.25387493319080706, + "grad_norm": 0.864146888256073, + "learning_rate": 0.00019934583613668238, + "loss": 0.5017, + "step": 7600 + }, + { + "epoch": 0.2542089791555318, + "grad_norm": 0.7740395665168762, + "learning_rate": 0.00019934162444851564, + "loss": 0.4829, + "step": 7610 + }, + { + "epoch": 0.25454302512025656, + "grad_norm": 1.0291764736175537, + "learning_rate": 0.0001993373992905893, + "loss": 0.5354, + "step": 7620 + }, + { + "epoch": 0.2548770710849813, + "grad_norm": 0.7710726857185364, + "learning_rate": 0.00019933316066347617, + "loss": 0.5305, + "step": 7630 + }, + { + "epoch": 0.25521111704970606, + "grad_norm": 0.8995339274406433, + "learning_rate": 0.00019932890856775104, + "loss": 0.5187, + "step": 7640 + }, + { + "epoch": 0.2555451630144308, + "grad_norm": 0.9670824408531189, + "learning_rate": 0.00019932464300399043, + "loss": 0.5343, + "step": 7650 + }, + { + "epoch": 0.25587920897915556, + "grad_norm": 0.7237956523895264, + "learning_rate": 0.00019932036397277267, + "loss": 0.498, + "step": 7660 + }, + { + "epoch": 0.25621325494388025, + "grad_norm": 0.820739209651947, + "learning_rate": 0.000199316071474678, + "loss": 0.5059, + "step": 7670 + }, + { + "epoch": 0.256547300908605, + "grad_norm": 0.8891921043395996, + "learning_rate": 0.00019931176551028846, + "loss": 0.5019, + "step": 7680 + }, + { + "epoch": 0.25688134687332975, + "grad_norm": 0.8104463219642639, + "learning_rate": 0.00019930744608018784, + "loss": 0.5125, + "step": 7690 + }, + { + "epoch": 0.2572153928380545, + "grad_norm": 0.7966371178627014, + "learning_rate": 0.00019930311318496182, + "loss": 0.5024, + "step": 7700 + }, + { + "epoch": 0.25754943880277925, + "grad_norm": 0.8550522327423096, + "learning_rate": 0.00019929876682519795, + "loss": 0.5351, + "step": 7710 + }, + { + "epoch": 0.257883484767504, + "grad_norm": 0.8938041925430298, + "learning_rate": 0.0001992944070014855, + "loss": 0.509, + "step": 7720 + }, + { + "epoch": 0.25821753073222875, + "grad_norm": 0.7290399670600891, + "learning_rate": 0.00019929003371441567, + "loss": 0.5034, + "step": 7730 + }, + { + "epoch": 0.2585515766969535, + "grad_norm": 0.7694828510284424, + "learning_rate": 0.00019928564696458138, + "loss": 0.4937, + "step": 7740 + }, + { + "epoch": 0.25888562266167825, + "grad_norm": 0.9401078224182129, + "learning_rate": 0.00019928124675257746, + "loss": 0.5112, + "step": 7750 + }, + { + "epoch": 0.259219668626403, + "grad_norm": 0.9294880032539368, + "learning_rate": 0.00019927683307900055, + "loss": 0.4867, + "step": 7760 + }, + { + "epoch": 0.25955371459112775, + "grad_norm": 0.84788978099823, + "learning_rate": 0.00019927240594444908, + "loss": 0.5285, + "step": 7770 + }, + { + "epoch": 0.2598877605558525, + "grad_norm": 1.0445371866226196, + "learning_rate": 0.0001992679653495233, + "loss": 0.5083, + "step": 7780 + }, + { + "epoch": 0.26022180652057725, + "grad_norm": 0.7452255487442017, + "learning_rate": 0.0001992635112948254, + "loss": 0.4974, + "step": 7790 + }, + { + "epoch": 0.260555852485302, + "grad_norm": 0.9368339776992798, + "learning_rate": 0.00019925904378095922, + "loss": 0.4874, + "step": 7800 + }, + { + "epoch": 0.26088989845002675, + "grad_norm": 0.8699080348014832, + "learning_rate": 0.0001992545628085305, + "loss": 0.5432, + "step": 7810 + }, + { + "epoch": 0.2612239444147515, + "grad_norm": 0.817009449005127, + "learning_rate": 0.0001992500683781469, + "loss": 0.5097, + "step": 7820 + }, + { + "epoch": 0.2615579903794762, + "grad_norm": 0.9508885741233826, + "learning_rate": 0.00019924556049041777, + "loss": 0.5103, + "step": 7830 + }, + { + "epoch": 0.26189203634420094, + "grad_norm": 0.7698647975921631, + "learning_rate": 0.00019924103914595437, + "loss": 0.4712, + "step": 7840 + }, + { + "epoch": 0.2622260823089257, + "grad_norm": 0.7984238266944885, + "learning_rate": 0.00019923650434536968, + "loss": 0.5192, + "step": 7850 + }, + { + "epoch": 0.26256012827365044, + "grad_norm": 0.8053947687149048, + "learning_rate": 0.00019923195608927865, + "loss": 0.5099, + "step": 7860 + }, + { + "epoch": 0.2628941742383752, + "grad_norm": 1.1834499835968018, + "learning_rate": 0.00019922739437829793, + "loss": 0.4799, + "step": 7870 + }, + { + "epoch": 0.26322822020309994, + "grad_norm": 0.8690808415412903, + "learning_rate": 0.0001992228192130461, + "loss": 0.5369, + "step": 7880 + }, + { + "epoch": 0.2635622661678247, + "grad_norm": 0.8662399649620056, + "learning_rate": 0.00019921823059414342, + "loss": 0.5356, + "step": 7890 + }, + { + "epoch": 0.26389631213254944, + "grad_norm": 0.8665481805801392, + "learning_rate": 0.00019921362852221215, + "loss": 0.531, + "step": 7900 + }, + { + "epoch": 0.2642303580972742, + "grad_norm": 0.9517328143119812, + "learning_rate": 0.0001992090129978762, + "loss": 0.5003, + "step": 7910 + }, + { + "epoch": 0.26456440406199894, + "grad_norm": 0.849541962146759, + "learning_rate": 0.0001992043840217615, + "loss": 0.5016, + "step": 7920 + }, + { + "epoch": 0.2648984500267237, + "grad_norm": 0.9600797891616821, + "learning_rate": 0.00019919974159449558, + "loss": 0.5025, + "step": 7930 + }, + { + "epoch": 0.26523249599144844, + "grad_norm": 0.7836249470710754, + "learning_rate": 0.000199195085716708, + "loss": 0.4831, + "step": 7940 + }, + { + "epoch": 0.2655665419561732, + "grad_norm": 0.9801962375640869, + "learning_rate": 0.00019919041638903, + "loss": 0.5318, + "step": 7950 + }, + { + "epoch": 0.26590058792089794, + "grad_norm": 0.7875173687934875, + "learning_rate": 0.0001991857336120947, + "loss": 0.4829, + "step": 7960 + }, + { + "epoch": 0.2662346338856227, + "grad_norm": 0.7537250518798828, + "learning_rate": 0.00019918103738653705, + "loss": 0.5239, + "step": 7970 + }, + { + "epoch": 0.26656867985034743, + "grad_norm": 0.8993843793869019, + "learning_rate": 0.00019917632771299383, + "loss": 0.5169, + "step": 7980 + }, + { + "epoch": 0.26690272581507213, + "grad_norm": 0.9689232707023621, + "learning_rate": 0.0001991716045921036, + "loss": 0.5117, + "step": 7990 + }, + { + "epoch": 0.2672367717797969, + "grad_norm": 1.0239914655685425, + "learning_rate": 0.00019916686802450676, + "loss": 0.5309, + "step": 8000 + }, + { + "epoch": 0.2672367717797969, + "eval_chrf": 86.55305745975566, + "eval_loss": 0.8682594299316406, + "eval_runtime": 145.5747, + "eval_samples_per_second": 0.687, + "eval_steps_per_second": 0.027, + "step": 8000 + }, + { + "epoch": 0.2675708177445216, + "grad_norm": 0.7286977171897888, + "learning_rate": 0.00019916211801084558, + "loss": 0.5138, + "step": 8010 + }, + { + "epoch": 0.2679048637092464, + "grad_norm": 0.9438340067863464, + "learning_rate": 0.00019915735455176408, + "loss": 0.5215, + "step": 8020 + }, + { + "epoch": 0.2682389096739711, + "grad_norm": 0.848320722579956, + "learning_rate": 0.00019915257764790812, + "loss": 0.5108, + "step": 8030 + }, + { + "epoch": 0.2685729556386959, + "grad_norm": 0.9466085433959961, + "learning_rate": 0.0001991477872999255, + "loss": 0.4575, + "step": 8040 + }, + { + "epoch": 0.2689070016034206, + "grad_norm": 0.9775509238243103, + "learning_rate": 0.00019914298350846564, + "loss": 0.4558, + "step": 8050 + }, + { + "epoch": 0.2692410475681454, + "grad_norm": 0.8090218901634216, + "learning_rate": 0.00019913816627417993, + "loss": 0.5207, + "step": 8060 + }, + { + "epoch": 0.2695750935328701, + "grad_norm": 0.971305251121521, + "learning_rate": 0.00019913333559772153, + "loss": 0.4807, + "step": 8070 + }, + { + "epoch": 0.2699091394975949, + "grad_norm": 0.7601242065429688, + "learning_rate": 0.00019912849147974546, + "loss": 0.5007, + "step": 8080 + }, + { + "epoch": 0.2702431854623196, + "grad_norm": 0.9221014380455017, + "learning_rate": 0.00019912363392090852, + "loss": 0.504, + "step": 8090 + }, + { + "epoch": 0.2705772314270444, + "grad_norm": 0.8465068936347961, + "learning_rate": 0.00019911876292186932, + "loss": 0.5201, + "step": 8100 + }, + { + "epoch": 0.2709112773917691, + "grad_norm": 0.7881593704223633, + "learning_rate": 0.00019911387848328834, + "loss": 0.5081, + "step": 8110 + }, + { + "epoch": 0.2712453233564939, + "grad_norm": 0.8457317352294922, + "learning_rate": 0.0001991089806058279, + "loss": 0.5172, + "step": 8120 + }, + { + "epoch": 0.2715793693212186, + "grad_norm": 0.8767427206039429, + "learning_rate": 0.00019910406929015204, + "loss": 0.4831, + "step": 8130 + }, + { + "epoch": 0.27191341528594337, + "grad_norm": 0.9221076965332031, + "learning_rate": 0.0001990991445369267, + "loss": 0.5161, + "step": 8140 + }, + { + "epoch": 0.27224746125066807, + "grad_norm": 0.7781730890274048, + "learning_rate": 0.0001990942063468197, + "loss": 0.4628, + "step": 8150 + }, + { + "epoch": 0.2725815072153928, + "grad_norm": 0.8241472244262695, + "learning_rate": 0.0001990892547205005, + "loss": 0.478, + "step": 8160 + }, + { + "epoch": 0.27291555318011756, + "grad_norm": 0.8273789882659912, + "learning_rate": 0.00019908428965864059, + "loss": 0.5173, + "step": 8170 + }, + { + "epoch": 0.2732495991448423, + "grad_norm": 0.9277275204658508, + "learning_rate": 0.00019907931116191314, + "loss": 0.4832, + "step": 8180 + }, + { + "epoch": 0.27358364510956706, + "grad_norm": 0.7659791111946106, + "learning_rate": 0.00019907431923099316, + "loss": 0.4928, + "step": 8190 + }, + { + "epoch": 0.2739176910742918, + "grad_norm": 0.8846988081932068, + "learning_rate": 0.00019906931386655754, + "loss": 0.5157, + "step": 8200 + }, + { + "epoch": 0.27425173703901656, + "grad_norm": 0.9565207958221436, + "learning_rate": 0.00019906429506928497, + "loss": 0.478, + "step": 8210 + }, + { + "epoch": 0.2745857830037413, + "grad_norm": 0.6910601854324341, + "learning_rate": 0.00019905926283985594, + "loss": 0.501, + "step": 8220 + }, + { + "epoch": 0.27491982896846606, + "grad_norm": 1.0224066972732544, + "learning_rate": 0.00019905421717895277, + "loss": 0.5158, + "step": 8230 + }, + { + "epoch": 0.2752538749331908, + "grad_norm": 0.7154384851455688, + "learning_rate": 0.0001990491580872596, + "loss": 0.4926, + "step": 8240 + }, + { + "epoch": 0.27558792089791556, + "grad_norm": 0.78072589635849, + "learning_rate": 0.00019904408556546238, + "loss": 0.4999, + "step": 8250 + }, + { + "epoch": 0.2759219668626403, + "grad_norm": 0.7550773024559021, + "learning_rate": 0.00019903899961424887, + "loss": 0.4779, + "step": 8260 + }, + { + "epoch": 0.27625601282736506, + "grad_norm": 0.9067159295082092, + "learning_rate": 0.00019903390023430877, + "loss": 0.4742, + "step": 8270 + }, + { + "epoch": 0.2765900587920898, + "grad_norm": 0.7890802621841431, + "learning_rate": 0.0001990287874263334, + "loss": 0.45, + "step": 8280 + }, + { + "epoch": 0.27692410475681456, + "grad_norm": 0.8711549043655396, + "learning_rate": 0.00019902366119101613, + "loss": 0.4915, + "step": 8290 + }, + { + "epoch": 0.2772581507215393, + "grad_norm": 0.7940702438354492, + "learning_rate": 0.00019901852152905188, + "loss": 0.492, + "step": 8300 + }, + { + "epoch": 0.277592196686264, + "grad_norm": 0.9088519811630249, + "learning_rate": 0.00019901336844113765, + "loss": 0.4961, + "step": 8310 + }, + { + "epoch": 0.27792624265098875, + "grad_norm": 0.7419559955596924, + "learning_rate": 0.00019900820192797212, + "loss": 0.4465, + "step": 8320 + }, + { + "epoch": 0.2782602886157135, + "grad_norm": 0.7227708697319031, + "learning_rate": 0.00019900302199025579, + "loss": 0.471, + "step": 8330 + }, + { + "epoch": 0.27859433458043825, + "grad_norm": 0.7825828790664673, + "learning_rate": 0.00019899782862869102, + "loss": 0.5209, + "step": 8340 + }, + { + "epoch": 0.278928380545163, + "grad_norm": 0.7921634316444397, + "learning_rate": 0.000198992621843982, + "loss": 0.5132, + "step": 8350 + }, + { + "epoch": 0.27926242650988775, + "grad_norm": 0.7269680500030518, + "learning_rate": 0.0001989874016368347, + "loss": 0.5307, + "step": 8360 + }, + { + "epoch": 0.2795964724746125, + "grad_norm": 0.7700535655021667, + "learning_rate": 0.00019898216800795697, + "loss": 0.4941, + "step": 8370 + }, + { + "epoch": 0.27993051843933725, + "grad_norm": 0.8676790595054626, + "learning_rate": 0.00019897692095805836, + "loss": 0.5182, + "step": 8380 + }, + { + "epoch": 0.280264564404062, + "grad_norm": 0.9337838888168335, + "learning_rate": 0.0001989716604878504, + "loss": 0.5071, + "step": 8390 + }, + { + "epoch": 0.28059861036878675, + "grad_norm": 0.7741159200668335, + "learning_rate": 0.00019896638659804632, + "loss": 0.5001, + "step": 8400 + }, + { + "epoch": 0.2809326563335115, + "grad_norm": 0.7444791793823242, + "learning_rate": 0.00019896109928936123, + "loss": 0.4956, + "step": 8410 + }, + { + "epoch": 0.28126670229823625, + "grad_norm": 0.8651930689811707, + "learning_rate": 0.000198955798562512, + "loss": 0.4859, + "step": 8420 + }, + { + "epoch": 0.281600748262961, + "grad_norm": 0.6539933085441589, + "learning_rate": 0.00019895048441821736, + "loss": 0.5042, + "step": 8430 + }, + { + "epoch": 0.28193479422768575, + "grad_norm": 0.8455032110214233, + "learning_rate": 0.00019894515685719787, + "loss": 0.4933, + "step": 8440 + }, + { + "epoch": 0.2822688401924105, + "grad_norm": 0.8024274110794067, + "learning_rate": 0.00019893981588017595, + "loss": 0.4766, + "step": 8450 + }, + { + "epoch": 0.28260288615713525, + "grad_norm": 0.9455446600914001, + "learning_rate": 0.0001989344614878757, + "loss": 0.5171, + "step": 8460 + }, + { + "epoch": 0.28293693212185994, + "grad_norm": 0.956288754940033, + "learning_rate": 0.00019892909368102318, + "loss": 0.468, + "step": 8470 + }, + { + "epoch": 0.2832709780865847, + "grad_norm": 0.7613013386726379, + "learning_rate": 0.00019892371246034616, + "loss": 0.5235, + "step": 8480 + }, + { + "epoch": 0.28360502405130944, + "grad_norm": 0.8916621208190918, + "learning_rate": 0.00019891831782657433, + "loss": 0.5061, + "step": 8490 + }, + { + "epoch": 0.2839390700160342, + "grad_norm": 0.7866721749305725, + "learning_rate": 0.00019891290978043912, + "loss": 0.46, + "step": 8500 + }, + { + "epoch": 0.28427311598075894, + "grad_norm": 1.1454440355300903, + "learning_rate": 0.0001989074883226738, + "loss": 0.4989, + "step": 8510 + }, + { + "epoch": 0.2846071619454837, + "grad_norm": 0.8369997143745422, + "learning_rate": 0.00019890205345401353, + "loss": 0.4725, + "step": 8520 + }, + { + "epoch": 0.28494120791020844, + "grad_norm": 0.8037457466125488, + "learning_rate": 0.00019889660517519517, + "loss": 0.4695, + "step": 8530 + }, + { + "epoch": 0.2852752538749332, + "grad_norm": 0.7570744156837463, + "learning_rate": 0.00019889114348695745, + "loss": 0.5117, + "step": 8540 + }, + { + "epoch": 0.28560929983965794, + "grad_norm": 0.8357149958610535, + "learning_rate": 0.00019888566839004098, + "loss": 0.4983, + "step": 8550 + }, + { + "epoch": 0.2859433458043827, + "grad_norm": 0.9159808158874512, + "learning_rate": 0.00019888017988518804, + "loss": 0.5155, + "step": 8560 + }, + { + "epoch": 0.28627739176910744, + "grad_norm": 0.9858645796775818, + "learning_rate": 0.00019887467797314288, + "loss": 0.4625, + "step": 8570 + }, + { + "epoch": 0.2866114377338322, + "grad_norm": 0.8820253014564514, + "learning_rate": 0.0001988691626546515, + "loss": 0.5105, + "step": 8580 + }, + { + "epoch": 0.28694548369855694, + "grad_norm": 0.7162189483642578, + "learning_rate": 0.0001988636339304617, + "loss": 0.5139, + "step": 8590 + }, + { + "epoch": 0.2872795296632817, + "grad_norm": 0.7859961986541748, + "learning_rate": 0.00019885809180132316, + "loss": 0.5074, + "step": 8600 + }, + { + "epoch": 0.28761357562800643, + "grad_norm": 0.8652927875518799, + "learning_rate": 0.0001988525362679873, + "loss": 0.4939, + "step": 8610 + }, + { + "epoch": 0.2879476215927312, + "grad_norm": 0.9597941040992737, + "learning_rate": 0.00019884696733120746, + "loss": 0.4947, + "step": 8620 + }, + { + "epoch": 0.2882816675574559, + "grad_norm": 0.9023726582527161, + "learning_rate": 0.0001988413849917386, + "loss": 0.4982, + "step": 8630 + }, + { + "epoch": 0.28861571352218063, + "grad_norm": 0.8418779969215393, + "learning_rate": 0.00019883578925033776, + "loss": 0.5203, + "step": 8640 + }, + { + "epoch": 0.2889497594869054, + "grad_norm": 0.7766163349151611, + "learning_rate": 0.00019883018010776363, + "loss": 0.4994, + "step": 8650 + }, + { + "epoch": 0.2892838054516301, + "grad_norm": 0.9431635141372681, + "learning_rate": 0.00019882455756477677, + "loss": 0.4611, + "step": 8660 + }, + { + "epoch": 0.2896178514163549, + "grad_norm": 0.8124434947967529, + "learning_rate": 0.0001988189216221395, + "loss": 0.4985, + "step": 8670 + }, + { + "epoch": 0.2899518973810796, + "grad_norm": 0.7580332159996033, + "learning_rate": 0.00019881327228061602, + "loss": 0.5261, + "step": 8680 + }, + { + "epoch": 0.2902859433458044, + "grad_norm": 0.8329998850822449, + "learning_rate": 0.0001988076095409723, + "loss": 0.5036, + "step": 8690 + }, + { + "epoch": 0.2906199893105291, + "grad_norm": 0.934085488319397, + "learning_rate": 0.0001988019334039762, + "loss": 0.5027, + "step": 8700 + }, + { + "epoch": 0.2909540352752539, + "grad_norm": 0.7838440537452698, + "learning_rate": 0.00019879624387039737, + "loss": 0.4881, + "step": 8710 + }, + { + "epoch": 0.2912880812399786, + "grad_norm": 0.9331871867179871, + "learning_rate": 0.00019879054094100717, + "loss": 0.4802, + "step": 8720 + }, + { + "epoch": 0.2916221272047034, + "grad_norm": 0.9774669408798218, + "learning_rate": 0.00019878482461657893, + "loss": 0.5115, + "step": 8730 + }, + { + "epoch": 0.2919561731694281, + "grad_norm": 0.8607262372970581, + "learning_rate": 0.00019877909489788767, + "loss": 0.4904, + "step": 8740 + }, + { + "epoch": 0.2922902191341529, + "grad_norm": 1.0534628629684448, + "learning_rate": 0.00019877335178571036, + "loss": 0.5113, + "step": 8750 + }, + { + "epoch": 0.2926242650988776, + "grad_norm": 1.017336130142212, + "learning_rate": 0.0001987675952808256, + "loss": 0.4774, + "step": 8760 + }, + { + "epoch": 0.2929583110636024, + "grad_norm": 0.8141566514968872, + "learning_rate": 0.00019876182538401408, + "loss": 0.4981, + "step": 8770 + }, + { + "epoch": 0.2932923570283271, + "grad_norm": 0.7687285542488098, + "learning_rate": 0.000198756042096058, + "loss": 0.487, + "step": 8780 + }, + { + "epoch": 0.29362640299305187, + "grad_norm": 0.7673447132110596, + "learning_rate": 0.00019875024541774156, + "loss": 0.5102, + "step": 8790 + }, + { + "epoch": 0.29396044895777657, + "grad_norm": 0.8718093633651733, + "learning_rate": 0.00019874443534985075, + "loss": 0.497, + "step": 8800 + }, + { + "epoch": 0.2942944949225013, + "grad_norm": 0.8940958976745605, + "learning_rate": 0.00019873861189317334, + "loss": 0.4964, + "step": 8810 + }, + { + "epoch": 0.29462854088722606, + "grad_norm": 0.8976048827171326, + "learning_rate": 0.00019873277504849895, + "loss": 0.4734, + "step": 8820 + }, + { + "epoch": 0.2949625868519508, + "grad_norm": 0.7147444486618042, + "learning_rate": 0.000198726924816619, + "loss": 0.4534, + "step": 8830 + }, + { + "epoch": 0.29529663281667556, + "grad_norm": 0.7496196031570435, + "learning_rate": 0.0001987210611983267, + "loss": 0.4566, + "step": 8840 + }, + { + "epoch": 0.2956306787814003, + "grad_norm": 0.7494941353797913, + "learning_rate": 0.0001987151841944171, + "loss": 0.4749, + "step": 8850 + }, + { + "epoch": 0.29596472474612506, + "grad_norm": 0.9968093633651733, + "learning_rate": 0.0001987092938056871, + "loss": 0.51, + "step": 8860 + }, + { + "epoch": 0.2962987707108498, + "grad_norm": 0.7799782156944275, + "learning_rate": 0.00019870339003293534, + "loss": 0.4803, + "step": 8870 + }, + { + "epoch": 0.29663281667557456, + "grad_norm": 0.9433035850524902, + "learning_rate": 0.00019869747287696234, + "loss": 0.4867, + "step": 8880 + }, + { + "epoch": 0.2969668626402993, + "grad_norm": 0.7455233931541443, + "learning_rate": 0.0001986915423385704, + "loss": 0.4982, + "step": 8890 + }, + { + "epoch": 0.29730090860502406, + "grad_norm": 0.8817819952964783, + "learning_rate": 0.00019868559841856368, + "loss": 0.466, + "step": 8900 + }, + { + "epoch": 0.2976349545697488, + "grad_norm": 0.8005492687225342, + "learning_rate": 0.00019867964111774806, + "loss": 0.4957, + "step": 8910 + }, + { + "epoch": 0.29796900053447356, + "grad_norm": 0.8101239204406738, + "learning_rate": 0.0001986736704369313, + "loss": 0.4748, + "step": 8920 + }, + { + "epoch": 0.2983030464991983, + "grad_norm": 0.7476082444190979, + "learning_rate": 0.00019866768637692302, + "loss": 0.5156, + "step": 8930 + }, + { + "epoch": 0.29863709246392306, + "grad_norm": 0.833126962184906, + "learning_rate": 0.00019866168893853452, + "loss": 0.502, + "step": 8940 + }, + { + "epoch": 0.2989711384286478, + "grad_norm": 0.7939196825027466, + "learning_rate": 0.00019865567812257906, + "loss": 0.4685, + "step": 8950 + }, + { + "epoch": 0.2993051843933725, + "grad_norm": 0.7889593243598938, + "learning_rate": 0.00019864965392987164, + "loss": 0.4932, + "step": 8960 + }, + { + "epoch": 0.29963923035809725, + "grad_norm": 0.6903104782104492, + "learning_rate": 0.00019864361636122904, + "loss": 0.4842, + "step": 8970 + }, + { + "epoch": 0.299973276322822, + "grad_norm": 0.7354331612586975, + "learning_rate": 0.0001986375654174699, + "loss": 0.4805, + "step": 8980 + }, + { + "epoch": 0.30030732228754675, + "grad_norm": 0.6792007088661194, + "learning_rate": 0.00019863150109941473, + "loss": 0.4913, + "step": 8990 + }, + { + "epoch": 0.3006413682522715, + "grad_norm": 0.9036393761634827, + "learning_rate": 0.00019862542340788575, + "loss": 0.4934, + "step": 9000 + }, + { + "epoch": 0.3006413682522715, + "eval_chrf": 86.221461518433, + "eval_loss": 0.8669318556785583, + "eval_runtime": 101.2172, + "eval_samples_per_second": 0.988, + "eval_steps_per_second": 0.04, + "step": 9000 + }, + { + "epoch": 0.30097541421699625, + "grad_norm": 0.980208158493042, + "learning_rate": 0.00019861933234370706, + "loss": 0.4864, + "step": 9010 + }, + { + "epoch": 0.301309460181721, + "grad_norm": 0.7595743536949158, + "learning_rate": 0.0001986132279077045, + "loss": 0.4976, + "step": 9020 + }, + { + "epoch": 0.30164350614644575, + "grad_norm": 0.740927517414093, + "learning_rate": 0.0001986071101007058, + "loss": 0.5137, + "step": 9030 + }, + { + "epoch": 0.3019775521111705, + "grad_norm": 0.7829098701477051, + "learning_rate": 0.00019860097892354048, + "loss": 0.4879, + "step": 9040 + }, + { + "epoch": 0.30231159807589525, + "grad_norm": 0.719974935054779, + "learning_rate": 0.00019859483437703986, + "loss": 0.4716, + "step": 9050 + }, + { + "epoch": 0.30264564404062, + "grad_norm": 0.8702359795570374, + "learning_rate": 0.0001985886764620371, + "loss": 0.4722, + "step": 9060 + }, + { + "epoch": 0.30297969000534475, + "grad_norm": 0.7944677472114563, + "learning_rate": 0.00019858250517936713, + "loss": 0.4844, + "step": 9070 + }, + { + "epoch": 0.3033137359700695, + "grad_norm": 0.7130064368247986, + "learning_rate": 0.00019857632052986673, + "loss": 0.4818, + "step": 9080 + }, + { + "epoch": 0.30364778193479425, + "grad_norm": 0.8654322028160095, + "learning_rate": 0.0001985701225143745, + "loss": 0.4566, + "step": 9090 + }, + { + "epoch": 0.303981827899519, + "grad_norm": 0.7713915705680847, + "learning_rate": 0.0001985639111337308, + "loss": 0.4914, + "step": 9100 + }, + { + "epoch": 0.30431587386424375, + "grad_norm": 0.7690659165382385, + "learning_rate": 0.0001985576863887778, + "loss": 0.491, + "step": 9110 + }, + { + "epoch": 0.30464991982896844, + "grad_norm": 0.7633639574050903, + "learning_rate": 0.00019855144828035957, + "loss": 0.514, + "step": 9120 + }, + { + "epoch": 0.3049839657936932, + "grad_norm": 0.7922964096069336, + "learning_rate": 0.00019854519680932194, + "loss": 0.4657, + "step": 9130 + }, + { + "epoch": 0.30531801175841794, + "grad_norm": 0.8954222202301025, + "learning_rate": 0.00019853893197651249, + "loss": 0.4885, + "step": 9140 + }, + { + "epoch": 0.3056520577231427, + "grad_norm": 0.8895009756088257, + "learning_rate": 0.00019853265378278075, + "loss": 0.4996, + "step": 9150 + }, + { + "epoch": 0.30598610368786744, + "grad_norm": 0.6855996251106262, + "learning_rate": 0.0001985263622289779, + "loss": 0.5058, + "step": 9160 + }, + { + "epoch": 0.3063201496525922, + "grad_norm": 0.7712249755859375, + "learning_rate": 0.0001985200573159571, + "loss": 0.4998, + "step": 9170 + }, + { + "epoch": 0.30665419561731694, + "grad_norm": 0.8835926651954651, + "learning_rate": 0.00019851373904457317, + "loss": 0.4866, + "step": 9180 + }, + { + "epoch": 0.3069882415820417, + "grad_norm": 0.6903185248374939, + "learning_rate": 0.0001985074074156828, + "loss": 0.4945, + "step": 9190 + }, + { + "epoch": 0.30732228754676644, + "grad_norm": 0.9075086116790771, + "learning_rate": 0.00019850106243014454, + "loss": 0.4933, + "step": 9200 + }, + { + "epoch": 0.3076563335114912, + "grad_norm": 0.7682658433914185, + "learning_rate": 0.00019849470408881873, + "loss": 0.5147, + "step": 9210 + }, + { + "epoch": 0.30799037947621594, + "grad_norm": 0.7736578583717346, + "learning_rate": 0.00019848833239256738, + "loss": 0.4799, + "step": 9220 + }, + { + "epoch": 0.3083244254409407, + "grad_norm": 0.9277499318122864, + "learning_rate": 0.00019848194734225458, + "loss": 0.5026, + "step": 9230 + }, + { + "epoch": 0.30865847140566544, + "grad_norm": 0.8002655506134033, + "learning_rate": 0.00019847554893874598, + "loss": 0.5175, + "step": 9240 + }, + { + "epoch": 0.3089925173703902, + "grad_norm": 0.8736346960067749, + "learning_rate": 0.00019846913718290918, + "loss": 0.4688, + "step": 9250 + }, + { + "epoch": 0.30932656333511493, + "grad_norm": 0.7674985527992249, + "learning_rate": 0.00019846271207561356, + "loss": 0.4525, + "step": 9260 + }, + { + "epoch": 0.3096606092998397, + "grad_norm": 0.7126444578170776, + "learning_rate": 0.00019845627361773026, + "loss": 0.4805, + "step": 9270 + }, + { + "epoch": 0.3099946552645644, + "grad_norm": 0.6991499066352844, + "learning_rate": 0.00019844982181013233, + "loss": 0.5041, + "step": 9280 + }, + { + "epoch": 0.31032870122928913, + "grad_norm": 0.8597789406776428, + "learning_rate": 0.0001984433566536945, + "loss": 0.4809, + "step": 9290 + }, + { + "epoch": 0.3106627471940139, + "grad_norm": 0.7841350436210632, + "learning_rate": 0.00019843687814929348, + "loss": 0.4854, + "step": 9300 + }, + { + "epoch": 0.3109967931587386, + "grad_norm": 0.9168494939804077, + "learning_rate": 0.0001984303862978076, + "loss": 0.5006, + "step": 9310 + }, + { + "epoch": 0.3113308391234634, + "grad_norm": 0.7907804250717163, + "learning_rate": 0.00019842388110011716, + "loss": 0.485, + "step": 9320 + }, + { + "epoch": 0.3116648850881881, + "grad_norm": 0.8104248046875, + "learning_rate": 0.00019841736255710415, + "loss": 0.481, + "step": 9330 + }, + { + "epoch": 0.3119989310529129, + "grad_norm": 1.2834248542785645, + "learning_rate": 0.0001984108306696524, + "loss": 0.479, + "step": 9340 + }, + { + "epoch": 0.3123329770176376, + "grad_norm": 1.0437935590744019, + "learning_rate": 0.0001984042854386477, + "loss": 0.4821, + "step": 9350 + }, + { + "epoch": 0.3126670229823624, + "grad_norm": 0.6233859062194824, + "learning_rate": 0.00019839772686497736, + "loss": 0.5369, + "step": 9360 + }, + { + "epoch": 0.3130010689470871, + "grad_norm": 0.8385756611824036, + "learning_rate": 0.00019839115494953076, + "loss": 0.5239, + "step": 9370 + }, + { + "epoch": 0.3133351149118119, + "grad_norm": 0.8494948744773865, + "learning_rate": 0.00019838456969319896, + "loss": 0.5054, + "step": 9380 + }, + { + "epoch": 0.3136691608765366, + "grad_norm": 0.8146253228187561, + "learning_rate": 0.00019837797109687483, + "loss": 0.4805, + "step": 9390 + }, + { + "epoch": 0.3140032068412614, + "grad_norm": 0.9986990094184875, + "learning_rate": 0.00019837135916145313, + "loss": 0.4977, + "step": 9400 + }, + { + "epoch": 0.3143372528059861, + "grad_norm": 0.9305341243743896, + "learning_rate": 0.00019836473388783034, + "loss": 0.4476, + "step": 9410 + }, + { + "epoch": 0.31467129877071087, + "grad_norm": 0.7608397006988525, + "learning_rate": 0.00019835809527690476, + "loss": 0.4683, + "step": 9420 + }, + { + "epoch": 0.3150053447354356, + "grad_norm": 0.7416658997535706, + "learning_rate": 0.0001983514433295766, + "loss": 0.4731, + "step": 9430 + }, + { + "epoch": 0.3153393907001603, + "grad_norm": 0.8297653794288635, + "learning_rate": 0.00019834477804674773, + "loss": 0.4768, + "step": 9440 + }, + { + "epoch": 0.31567343666488507, + "grad_norm": 0.7217274308204651, + "learning_rate": 0.0001983380994293219, + "loss": 0.4762, + "step": 9450 + }, + { + "epoch": 0.3160074826296098, + "grad_norm": 0.6445144414901733, + "learning_rate": 0.00019833140747820468, + "loss": 0.4658, + "step": 9460 + }, + { + "epoch": 0.31634152859433456, + "grad_norm": 0.8053786158561707, + "learning_rate": 0.00019832470219430346, + "loss": 0.4562, + "step": 9470 + }, + { + "epoch": 0.3166755745590593, + "grad_norm": 1.0634390115737915, + "learning_rate": 0.00019831798357852737, + "loss": 0.508, + "step": 9480 + }, + { + "epoch": 0.31700962052378406, + "grad_norm": 0.8144078850746155, + "learning_rate": 0.0001983112516317874, + "loss": 0.4695, + "step": 9490 + }, + { + "epoch": 0.3173436664885088, + "grad_norm": 0.9061560034751892, + "learning_rate": 0.00019830450635499635, + "loss": 0.4489, + "step": 9500 + }, + { + "epoch": 0.31767771245323356, + "grad_norm": 1.2146528959274292, + "learning_rate": 0.00019829774774906883, + "loss": 0.4987, + "step": 9510 + }, + { + "epoch": 0.3180117584179583, + "grad_norm": 0.8559467792510986, + "learning_rate": 0.00019829097581492116, + "loss": 0.5084, + "step": 9520 + }, + { + "epoch": 0.31834580438268306, + "grad_norm": 0.7655954360961914, + "learning_rate": 0.00019828419055347167, + "loss": 0.4785, + "step": 9530 + }, + { + "epoch": 0.3186798503474078, + "grad_norm": 0.7611229419708252, + "learning_rate": 0.00019827739196564027, + "loss": 0.5197, + "step": 9540 + }, + { + "epoch": 0.31901389631213256, + "grad_norm": 0.8146008849143982, + "learning_rate": 0.00019827058005234886, + "loss": 0.4938, + "step": 9550 + }, + { + "epoch": 0.3193479422768573, + "grad_norm": 0.8250485062599182, + "learning_rate": 0.000198263754814521, + "loss": 0.4444, + "step": 9560 + }, + { + "epoch": 0.31968198824158206, + "grad_norm": 1.0367143154144287, + "learning_rate": 0.00019825691625308218, + "loss": 0.4882, + "step": 9570 + }, + { + "epoch": 0.3200160342063068, + "grad_norm": 0.6693322062492371, + "learning_rate": 0.00019825006436895962, + "loss": 0.4903, + "step": 9580 + }, + { + "epoch": 0.32035008017103156, + "grad_norm": 0.966487467288971, + "learning_rate": 0.0001982431991630824, + "loss": 0.4614, + "step": 9590 + }, + { + "epoch": 0.32068412613575625, + "grad_norm": 0.764102041721344, + "learning_rate": 0.0001982363206363813, + "loss": 0.4565, + "step": 9600 + }, + { + "epoch": 0.321018172100481, + "grad_norm": 0.9750432372093201, + "learning_rate": 0.00019822942878978904, + "loss": 0.4683, + "step": 9610 + }, + { + "epoch": 0.32135221806520575, + "grad_norm": 0.7986359000205994, + "learning_rate": 0.00019822252362424006, + "loss": 0.5335, + "step": 9620 + }, + { + "epoch": 0.3216862640299305, + "grad_norm": 0.7218362092971802, + "learning_rate": 0.00019821560514067069, + "loss": 0.4905, + "step": 9630 + }, + { + "epoch": 0.32202030999465525, + "grad_norm": 0.8194231390953064, + "learning_rate": 0.00019820867334001892, + "loss": 0.4884, + "step": 9640 + }, + { + "epoch": 0.32235435595938, + "grad_norm": 0.7414090633392334, + "learning_rate": 0.00019820172822322473, + "loss": 0.5028, + "step": 9650 + }, + { + "epoch": 0.32268840192410475, + "grad_norm": 0.8972886204719543, + "learning_rate": 0.00019819476979122974, + "loss": 0.4974, + "step": 9660 + }, + { + "epoch": 0.3230224478888295, + "grad_norm": 0.9556971788406372, + "learning_rate": 0.00019818779804497746, + "loss": 0.4696, + "step": 9670 + }, + { + "epoch": 0.32335649385355425, + "grad_norm": 0.7380297780036926, + "learning_rate": 0.00019818081298541323, + "loss": 0.4927, + "step": 9680 + }, + { + "epoch": 0.323690539818279, + "grad_norm": 0.7318816184997559, + "learning_rate": 0.0001981738146134841, + "loss": 0.5087, + "step": 9690 + }, + { + "epoch": 0.32402458578300375, + "grad_norm": 0.9152219295501709, + "learning_rate": 0.00019816680293013902, + "loss": 0.4701, + "step": 9700 + }, + { + "epoch": 0.3243586317477285, + "grad_norm": 0.8081129193305969, + "learning_rate": 0.0001981597779363287, + "loss": 0.4777, + "step": 9710 + }, + { + "epoch": 0.32469267771245325, + "grad_norm": 0.8119087219238281, + "learning_rate": 0.00019815273963300564, + "loss": 0.4795, + "step": 9720 + }, + { + "epoch": 0.325026723677178, + "grad_norm": 0.5850111246109009, + "learning_rate": 0.0001981456880211242, + "loss": 0.4943, + "step": 9730 + }, + { + "epoch": 0.32536076964190275, + "grad_norm": 0.7698376774787903, + "learning_rate": 0.00019813862310164045, + "loss": 0.4785, + "step": 9740 + }, + { + "epoch": 0.3256948156066275, + "grad_norm": 0.8136736154556274, + "learning_rate": 0.00019813154487551243, + "loss": 0.4686, + "step": 9750 + }, + { + "epoch": 0.3260288615713522, + "grad_norm": 0.7792925238609314, + "learning_rate": 0.00019812445334369975, + "loss": 0.4808, + "step": 9760 + }, + { + "epoch": 0.32636290753607694, + "grad_norm": 0.7545470595359802, + "learning_rate": 0.00019811734850716403, + "loss": 0.4632, + "step": 9770 + }, + { + "epoch": 0.3266969535008017, + "grad_norm": 0.8896690011024475, + "learning_rate": 0.00019811023036686863, + "loss": 0.4855, + "step": 9780 + }, + { + "epoch": 0.32703099946552644, + "grad_norm": 0.9172888994216919, + "learning_rate": 0.00019810309892377866, + "loss": 0.4492, + "step": 9790 + }, + { + "epoch": 0.3273650454302512, + "grad_norm": 0.7512381076812744, + "learning_rate": 0.00019809595417886108, + "loss": 0.4442, + "step": 9800 + }, + { + "epoch": 0.32769909139497594, + "grad_norm": 0.8864928483963013, + "learning_rate": 0.0001980887961330847, + "loss": 0.5004, + "step": 9810 + }, + { + "epoch": 0.3280331373597007, + "grad_norm": 0.7057550549507141, + "learning_rate": 0.00019808162478741998, + "loss": 0.4792, + "step": 9820 + }, + { + "epoch": 0.32836718332442544, + "grad_norm": 0.7389547228813171, + "learning_rate": 0.00019807444014283937, + "loss": 0.4987, + "step": 9830 + }, + { + "epoch": 0.3287012292891502, + "grad_norm": 0.7675162553787231, + "learning_rate": 0.000198067242200317, + "loss": 0.4555, + "step": 9840 + }, + { + "epoch": 0.32903527525387494, + "grad_norm": 0.7330090999603271, + "learning_rate": 0.00019806003096082888, + "loss": 0.4751, + "step": 9850 + }, + { + "epoch": 0.3293693212185997, + "grad_norm": 0.8917701840400696, + "learning_rate": 0.00019805280642535273, + "loss": 0.4727, + "step": 9860 + }, + { + "epoch": 0.32970336718332444, + "grad_norm": 0.9137223958969116, + "learning_rate": 0.00019804556859486815, + "loss": 0.4871, + "step": 9870 + }, + { + "epoch": 0.3300374131480492, + "grad_norm": 0.6934738755226135, + "learning_rate": 0.00019803831747035655, + "loss": 0.4867, + "step": 9880 + }, + { + "epoch": 0.33037145911277394, + "grad_norm": 0.7302271127700806, + "learning_rate": 0.00019803105305280107, + "loss": 0.4806, + "step": 9890 + }, + { + "epoch": 0.3307055050774987, + "grad_norm": 0.8141774535179138, + "learning_rate": 0.0001980237753431867, + "loss": 0.4857, + "step": 9900 + }, + { + "epoch": 0.33103955104222343, + "grad_norm": 0.8890634775161743, + "learning_rate": 0.00019801648434250027, + "loss": 0.4832, + "step": 9910 + }, + { + "epoch": 0.33137359700694813, + "grad_norm": 0.7255607843399048, + "learning_rate": 0.0001980091800517303, + "loss": 0.4958, + "step": 9920 + }, + { + "epoch": 0.3317076429716729, + "grad_norm": 0.7126867175102234, + "learning_rate": 0.00019800186247186723, + "loss": 0.4963, + "step": 9930 + }, + { + "epoch": 0.3320416889363976, + "grad_norm": 0.8589699864387512, + "learning_rate": 0.00019799453160390323, + "loss": 0.5077, + "step": 9940 + }, + { + "epoch": 0.3323757349011224, + "grad_norm": 0.8465405702590942, + "learning_rate": 0.00019798718744883228, + "loss": 0.5075, + "step": 9950 + }, + { + "epoch": 0.3327097808658471, + "grad_norm": 0.7663049697875977, + "learning_rate": 0.00019797983000765027, + "loss": 0.4656, + "step": 9960 + }, + { + "epoch": 0.3330438268305719, + "grad_norm": 1.0316894054412842, + "learning_rate": 0.00019797245928135466, + "loss": 0.4934, + "step": 9970 + }, + { + "epoch": 0.3333778727952966, + "grad_norm": 0.8598639965057373, + "learning_rate": 0.00019796507527094495, + "loss": 0.4866, + "step": 9980 + }, + { + "epoch": 0.3337119187600214, + "grad_norm": 0.7342830896377563, + "learning_rate": 0.0001979576779774223, + "loss": 0.4882, + "step": 9990 + }, + { + "epoch": 0.3340459647247461, + "grad_norm": 0.6758005619049072, + "learning_rate": 0.0001979502674017897, + "loss": 0.4598, + "step": 10000 + }, + { + "epoch": 0.3340459647247461, + "eval_chrf": 84.27446855235347, + "eval_loss": 0.8406010270118713, + "eval_runtime": 154.983, + "eval_samples_per_second": 0.645, + "eval_steps_per_second": 0.026, + "step": 10000 + }, + { + "epoch": 0.3343800106894709, + "grad_norm": 0.8404979109764099, + "learning_rate": 0.00019794284354505198, + "loss": 0.4916, + "step": 10010 + }, + { + "epoch": 0.3347140566541956, + "grad_norm": 0.9548724293708801, + "learning_rate": 0.00019793540640821572, + "loss": 0.4976, + "step": 10020 + }, + { + "epoch": 0.3350481026189204, + "grad_norm": 0.8668383955955505, + "learning_rate": 0.00019792795599228935, + "loss": 0.4647, + "step": 10030 + }, + { + "epoch": 0.3353821485836451, + "grad_norm": 0.9449284076690674, + "learning_rate": 0.00019792049229828303, + "loss": 0.481, + "step": 10040 + }, + { + "epoch": 0.3357161945483699, + "grad_norm": 0.6947341561317444, + "learning_rate": 0.00019791301532720885, + "loss": 0.4626, + "step": 10050 + }, + { + "epoch": 0.3360502405130946, + "grad_norm": 0.7421325445175171, + "learning_rate": 0.00019790552508008053, + "loss": 0.4717, + "step": 10060 + }, + { + "epoch": 0.33638428647781937, + "grad_norm": 0.7840331792831421, + "learning_rate": 0.0001978980215579137, + "loss": 0.4974, + "step": 10070 + }, + { + "epoch": 0.3367183324425441, + "grad_norm": 0.734192967414856, + "learning_rate": 0.00019789050476172576, + "loss": 0.4681, + "step": 10080 + }, + { + "epoch": 0.3370523784072688, + "grad_norm": 0.9255924224853516, + "learning_rate": 0.00019788297469253597, + "loss": 0.4689, + "step": 10090 + }, + { + "epoch": 0.33738642437199357, + "grad_norm": 0.7588878273963928, + "learning_rate": 0.00019787543135136524, + "loss": 0.4266, + "step": 10100 + }, + { + "epoch": 0.3377204703367183, + "grad_norm": 0.8314864039421082, + "learning_rate": 0.00019786787473923643, + "loss": 0.4432, + "step": 10110 + }, + { + "epoch": 0.33805451630144306, + "grad_norm": 0.7125697731971741, + "learning_rate": 0.00019786030485717412, + "loss": 0.4951, + "step": 10120 + }, + { + "epoch": 0.3383885622661678, + "grad_norm": 0.8328582644462585, + "learning_rate": 0.00019785272170620474, + "loss": 0.5071, + "step": 10130 + }, + { + "epoch": 0.33872260823089256, + "grad_norm": 0.833241879940033, + "learning_rate": 0.0001978451252873565, + "loss": 0.4578, + "step": 10140 + }, + { + "epoch": 0.3390566541956173, + "grad_norm": 0.7744841575622559, + "learning_rate": 0.00019783751560165935, + "loss": 0.4804, + "step": 10150 + }, + { + "epoch": 0.33939070016034206, + "grad_norm": 0.7624791860580444, + "learning_rate": 0.00019782989265014513, + "loss": 0.4728, + "step": 10160 + }, + { + "epoch": 0.3397247461250668, + "grad_norm": 0.775273323059082, + "learning_rate": 0.00019782225643384741, + "loss": 0.5213, + "step": 10170 + }, + { + "epoch": 0.34005879208979156, + "grad_norm": 0.8155893087387085, + "learning_rate": 0.00019781460695380163, + "loss": 0.4991, + "step": 10180 + }, + { + "epoch": 0.3403928380545163, + "grad_norm": 0.7629421353340149, + "learning_rate": 0.00019780694421104496, + "loss": 0.4906, + "step": 10190 + }, + { + "epoch": 0.34072688401924106, + "grad_norm": 0.8506101369857788, + "learning_rate": 0.00019779926820661636, + "loss": 0.5128, + "step": 10200 + }, + { + "epoch": 0.3410609299839658, + "grad_norm": 1.0900532007217407, + "learning_rate": 0.00019779157894155667, + "loss": 0.4756, + "step": 10210 + }, + { + "epoch": 0.34139497594869056, + "grad_norm": 0.7372317910194397, + "learning_rate": 0.0001977838764169085, + "loss": 0.4706, + "step": 10220 + }, + { + "epoch": 0.3417290219134153, + "grad_norm": 0.7971315383911133, + "learning_rate": 0.00019777616063371617, + "loss": 0.4792, + "step": 10230 + }, + { + "epoch": 0.34206306787814006, + "grad_norm": 0.94187331199646, + "learning_rate": 0.00019776843159302585, + "loss": 0.4756, + "step": 10240 + }, + { + "epoch": 0.34239711384286475, + "grad_norm": 0.8116739392280579, + "learning_rate": 0.00019776068929588562, + "loss": 0.484, + "step": 10250 + }, + { + "epoch": 0.3427311598075895, + "grad_norm": 0.7088529467582703, + "learning_rate": 0.0001977529337433452, + "loss": 0.4663, + "step": 10260 + }, + { + "epoch": 0.34306520577231425, + "grad_norm": 0.8465521931648254, + "learning_rate": 0.0001977451649364562, + "loss": 0.4857, + "step": 10270 + }, + { + "epoch": 0.343399251737039, + "grad_norm": 0.6388803124427795, + "learning_rate": 0.00019773738287627194, + "loss": 0.476, + "step": 10280 + }, + { + "epoch": 0.34373329770176375, + "grad_norm": 0.9249483346939087, + "learning_rate": 0.00019772958756384762, + "loss": 0.4596, + "step": 10290 + }, + { + "epoch": 0.3440673436664885, + "grad_norm": 0.9700983166694641, + "learning_rate": 0.00019772177900024022, + "loss": 0.5097, + "step": 10300 + }, + { + "epoch": 0.34440138963121325, + "grad_norm": 0.9682155251502991, + "learning_rate": 0.00019771395718650852, + "loss": 0.493, + "step": 10310 + }, + { + "epoch": 0.344735435595938, + "grad_norm": 0.7454023361206055, + "learning_rate": 0.00019770612212371305, + "loss": 0.4846, + "step": 10320 + }, + { + "epoch": 0.34506948156066275, + "grad_norm": 0.7793940901756287, + "learning_rate": 0.00019769827381291616, + "loss": 0.4743, + "step": 10330 + }, + { + "epoch": 0.3454035275253875, + "grad_norm": 0.7634183764457703, + "learning_rate": 0.00019769041225518204, + "loss": 0.4808, + "step": 10340 + }, + { + "epoch": 0.34573757349011225, + "grad_norm": 0.713284432888031, + "learning_rate": 0.00019768253745157662, + "loss": 0.5079, + "step": 10350 + }, + { + "epoch": 0.346071619454837, + "grad_norm": 0.8389780521392822, + "learning_rate": 0.00019767464940316764, + "loss": 0.4769, + "step": 10360 + }, + { + "epoch": 0.34640566541956175, + "grad_norm": 0.7174515128135681, + "learning_rate": 0.00019766674811102467, + "loss": 0.4642, + "step": 10370 + }, + { + "epoch": 0.3467397113842865, + "grad_norm": 0.6691192388534546, + "learning_rate": 0.000197658833576219, + "loss": 0.4532, + "step": 10380 + }, + { + "epoch": 0.34707375734901125, + "grad_norm": 0.7878669500350952, + "learning_rate": 0.00019765090579982383, + "loss": 0.4813, + "step": 10390 + }, + { + "epoch": 0.347407803313736, + "grad_norm": 0.7515242695808411, + "learning_rate": 0.00019764296478291406, + "loss": 0.4515, + "step": 10400 + }, + { + "epoch": 0.3477418492784607, + "grad_norm": 0.8122110366821289, + "learning_rate": 0.0001976350105265664, + "loss": 0.4817, + "step": 10410 + }, + { + "epoch": 0.34807589524318544, + "grad_norm": 0.7072919607162476, + "learning_rate": 0.00019762704303185938, + "loss": 0.4527, + "step": 10420 + }, + { + "epoch": 0.3484099412079102, + "grad_norm": 0.7258039116859436, + "learning_rate": 0.0001976190622998733, + "loss": 0.4571, + "step": 10430 + }, + { + "epoch": 0.34874398717263494, + "grad_norm": 1.1901227235794067, + "learning_rate": 0.00019761106833169028, + "loss": 0.4674, + "step": 10440 + }, + { + "epoch": 0.3490780331373597, + "grad_norm": 0.7902029752731323, + "learning_rate": 0.00019760306112839423, + "loss": 0.5125, + "step": 10450 + }, + { + "epoch": 0.34941207910208444, + "grad_norm": 0.9555513262748718, + "learning_rate": 0.00019759504069107088, + "loss": 0.4788, + "step": 10460 + }, + { + "epoch": 0.3497461250668092, + "grad_norm": 0.6862301826477051, + "learning_rate": 0.00019758700702080767, + "loss": 0.4748, + "step": 10470 + }, + { + "epoch": 0.35008017103153394, + "grad_norm": 0.8117475509643555, + "learning_rate": 0.00019757896011869392, + "loss": 0.5169, + "step": 10480 + }, + { + "epoch": 0.3504142169962587, + "grad_norm": 0.7458468675613403, + "learning_rate": 0.0001975708999858207, + "loss": 0.4603, + "step": 10490 + }, + { + "epoch": 0.35074826296098344, + "grad_norm": 0.8273071646690369, + "learning_rate": 0.00019756282662328087, + "loss": 0.5139, + "step": 10500 + }, + { + "epoch": 0.3510823089257082, + "grad_norm": 0.6612288951873779, + "learning_rate": 0.00019755474003216913, + "loss": 0.4498, + "step": 10510 + }, + { + "epoch": 0.35141635489043294, + "grad_norm": 0.7159819006919861, + "learning_rate": 0.00019754664021358193, + "loss": 0.4894, + "step": 10520 + }, + { + "epoch": 0.3517504008551577, + "grad_norm": 0.7718474864959717, + "learning_rate": 0.00019753852716861752, + "loss": 0.4923, + "step": 10530 + }, + { + "epoch": 0.35208444681988244, + "grad_norm": 0.8232442736625671, + "learning_rate": 0.00019753040089837595, + "loss": 0.4582, + "step": 10540 + }, + { + "epoch": 0.3524184927846072, + "grad_norm": 0.8357414603233337, + "learning_rate": 0.00019752226140395908, + "loss": 0.4868, + "step": 10550 + }, + { + "epoch": 0.35275253874933193, + "grad_norm": 0.7636700868606567, + "learning_rate": 0.00019751410868647056, + "loss": 0.4636, + "step": 10560 + }, + { + "epoch": 0.35308658471405663, + "grad_norm": 0.6893655061721802, + "learning_rate": 0.00019750594274701576, + "loss": 0.4936, + "step": 10570 + }, + { + "epoch": 0.3534206306787814, + "grad_norm": 0.7135987877845764, + "learning_rate": 0.00019749776358670196, + "loss": 0.487, + "step": 10580 + }, + { + "epoch": 0.3537546766435061, + "grad_norm": 0.7857927680015564, + "learning_rate": 0.00019748957120663816, + "loss": 0.4718, + "step": 10590 + }, + { + "epoch": 0.3540887226082309, + "grad_norm": 0.7815923690795898, + "learning_rate": 0.00019748136560793512, + "loss": 0.4583, + "step": 10600 + }, + { + "epoch": 0.3544227685729556, + "grad_norm": 0.8372818827629089, + "learning_rate": 0.0001974731467917055, + "loss": 0.4417, + "step": 10610 + }, + { + "epoch": 0.3547568145376804, + "grad_norm": 0.7350714802742004, + "learning_rate": 0.00019746491475906367, + "loss": 0.5004, + "step": 10620 + }, + { + "epoch": 0.3550908605024051, + "grad_norm": 0.8285728693008423, + "learning_rate": 0.00019745666951112583, + "loss": 0.4555, + "step": 10630 + }, + { + "epoch": 0.3554249064671299, + "grad_norm": 0.7722315192222595, + "learning_rate": 0.00019744841104900993, + "loss": 0.536, + "step": 10640 + }, + { + "epoch": 0.3557589524318546, + "grad_norm": 0.7042340636253357, + "learning_rate": 0.00019744013937383575, + "loss": 0.4787, + "step": 10650 + }, + { + "epoch": 0.3560929983965794, + "grad_norm": 0.9074069857597351, + "learning_rate": 0.00019743185448672484, + "loss": 0.4865, + "step": 10660 + }, + { + "epoch": 0.3564270443613041, + "grad_norm": 0.7449846863746643, + "learning_rate": 0.00019742355638880055, + "loss": 0.4454, + "step": 10670 + }, + { + "epoch": 0.3567610903260289, + "grad_norm": 0.7971091270446777, + "learning_rate": 0.00019741524508118803, + "loss": 0.4688, + "step": 10680 + }, + { + "epoch": 0.3570951362907536, + "grad_norm": 0.825592041015625, + "learning_rate": 0.00019740692056501423, + "loss": 0.465, + "step": 10690 + }, + { + "epoch": 0.3574291822554784, + "grad_norm": 0.7828197479248047, + "learning_rate": 0.0001973985828414078, + "loss": 0.4575, + "step": 10700 + }, + { + "epoch": 0.3577632282202031, + "grad_norm": 1.059834599494934, + "learning_rate": 0.0001973902319114994, + "loss": 0.4897, + "step": 10710 + }, + { + "epoch": 0.35809727418492787, + "grad_norm": 0.6886934638023376, + "learning_rate": 0.00019738186777642117, + "loss": 0.457, + "step": 10720 + }, + { + "epoch": 0.35843132014965257, + "grad_norm": 0.7650066018104553, + "learning_rate": 0.00019737349043730728, + "loss": 0.4918, + "step": 10730 + }, + { + "epoch": 0.3587653661143773, + "grad_norm": 1.0661060810089111, + "learning_rate": 0.00019736509989529366, + "loss": 0.4616, + "step": 10740 + }, + { + "epoch": 0.35909941207910206, + "grad_norm": 0.7165107727050781, + "learning_rate": 0.0001973566961515179, + "loss": 0.4772, + "step": 10750 + }, + { + "epoch": 0.3594334580438268, + "grad_norm": 0.8998333811759949, + "learning_rate": 0.00019734827920711952, + "loss": 0.4953, + "step": 10760 + }, + { + "epoch": 0.35976750400855156, + "grad_norm": 0.7248699069023132, + "learning_rate": 0.0001973398490632398, + "loss": 0.4792, + "step": 10770 + }, + { + "epoch": 0.3601015499732763, + "grad_norm": 0.9567614793777466, + "learning_rate": 0.0001973314057210217, + "loss": 0.4762, + "step": 10780 + }, + { + "epoch": 0.36043559593800106, + "grad_norm": 0.7169862985610962, + "learning_rate": 0.00019732294918161012, + "loss": 0.4703, + "step": 10790 + }, + { + "epoch": 0.3607696419027258, + "grad_norm": 0.7044866681098938, + "learning_rate": 0.00019731447944615167, + "loss": 0.4982, + "step": 10800 + }, + { + "epoch": 0.36110368786745056, + "grad_norm": 0.7856030464172363, + "learning_rate": 0.00019730599651579476, + "loss": 0.4519, + "step": 10810 + }, + { + "epoch": 0.3614377338321753, + "grad_norm": 0.7583265900611877, + "learning_rate": 0.00019729750039168957, + "loss": 0.4817, + "step": 10820 + }, + { + "epoch": 0.36177177979690006, + "grad_norm": 0.8426154851913452, + "learning_rate": 0.00019728899107498816, + "loss": 0.4948, + "step": 10830 + }, + { + "epoch": 0.3621058257616248, + "grad_norm": 0.9635867476463318, + "learning_rate": 0.00019728046856684424, + "loss": 0.4822, + "step": 10840 + }, + { + "epoch": 0.36243987172634956, + "grad_norm": 0.853259801864624, + "learning_rate": 0.0001972719328684134, + "loss": 0.486, + "step": 10850 + }, + { + "epoch": 0.3627739176910743, + "grad_norm": 0.7423056364059448, + "learning_rate": 0.00019726338398085304, + "loss": 0.4758, + "step": 10860 + }, + { + "epoch": 0.36310796365579906, + "grad_norm": 0.8231603503227234, + "learning_rate": 0.00019725482190532226, + "loss": 0.4752, + "step": 10870 + }, + { + "epoch": 0.3634420096205238, + "grad_norm": 0.9050951600074768, + "learning_rate": 0.000197246246642982, + "loss": 0.5056, + "step": 10880 + }, + { + "epoch": 0.3637760555852485, + "grad_norm": 0.7685396671295166, + "learning_rate": 0.00019723765819499496, + "loss": 0.4655, + "step": 10890 + }, + { + "epoch": 0.36411010154997325, + "grad_norm": 0.8198825716972351, + "learning_rate": 0.0001972290565625257, + "loss": 0.4372, + "step": 10900 + }, + { + "epoch": 0.364444147514698, + "grad_norm": 0.8989099264144897, + "learning_rate": 0.0001972204417467405, + "loss": 0.4891, + "step": 10910 + }, + { + "epoch": 0.36477819347942275, + "grad_norm": 0.8434351086616516, + "learning_rate": 0.00019721181374880744, + "loss": 0.4638, + "step": 10920 + }, + { + "epoch": 0.3651122394441475, + "grad_norm": 0.8260385394096375, + "learning_rate": 0.0001972031725698964, + "loss": 0.5134, + "step": 10930 + }, + { + "epoch": 0.36544628540887225, + "grad_norm": 0.9877833724021912, + "learning_rate": 0.00019719451821117903, + "loss": 0.4791, + "step": 10940 + }, + { + "epoch": 0.365780331373597, + "grad_norm": 0.8119081258773804, + "learning_rate": 0.00019718585067382875, + "loss": 0.4864, + "step": 10950 + }, + { + "epoch": 0.36611437733832175, + "grad_norm": 0.7405663132667542, + "learning_rate": 0.00019717716995902086, + "loss": 0.4935, + "step": 10960 + }, + { + "epoch": 0.3664484233030465, + "grad_norm": 0.6708000302314758, + "learning_rate": 0.00019716847606793234, + "loss": 0.4649, + "step": 10970 + }, + { + "epoch": 0.36678246926777125, + "grad_norm": 0.7842719554901123, + "learning_rate": 0.00019715976900174203, + "loss": 0.4957, + "step": 10980 + }, + { + "epoch": 0.367116515232496, + "grad_norm": 0.8582305908203125, + "learning_rate": 0.00019715104876163043, + "loss": 0.4732, + "step": 10990 + }, + { + "epoch": 0.36745056119722075, + "grad_norm": 0.8141451478004456, + "learning_rate": 0.00019714231534878005, + "loss": 0.4781, + "step": 11000 + }, + { + "epoch": 0.36745056119722075, + "eval_chrf": 83.882648472551, + "eval_loss": 0.8297273516654968, + "eval_runtime": 102.7082, + "eval_samples_per_second": 0.974, + "eval_steps_per_second": 0.039, + "step": 11000 + }, + { + "epoch": 0.3677846071619455, + "grad_norm": 0.8461644053459167, + "learning_rate": 0.00019713356876437495, + "loss": 0.4845, + "step": 11010 + }, + { + "epoch": 0.36811865312667025, + "grad_norm": 0.8959101438522339, + "learning_rate": 0.00019712480900960114, + "loss": 0.4936, + "step": 11020 + }, + { + "epoch": 0.368452699091395, + "grad_norm": 0.7106905579566956, + "learning_rate": 0.00019711603608564633, + "loss": 0.4747, + "step": 11030 + }, + { + "epoch": 0.36878674505611975, + "grad_norm": 0.7683181166648865, + "learning_rate": 0.00019710724999370008, + "loss": 0.486, + "step": 11040 + }, + { + "epoch": 0.36912079102084444, + "grad_norm": 0.6132196187973022, + "learning_rate": 0.00019709845073495366, + "loss": 0.4491, + "step": 11050 + }, + { + "epoch": 0.3694548369855692, + "grad_norm": 0.7045121788978577, + "learning_rate": 0.00019708963831060016, + "loss": 0.4426, + "step": 11060 + }, + { + "epoch": 0.36978888295029394, + "grad_norm": 0.7871989607810974, + "learning_rate": 0.0001970808127218345, + "loss": 0.4627, + "step": 11070 + }, + { + "epoch": 0.3701229289150187, + "grad_norm": 0.683876633644104, + "learning_rate": 0.0001970719739698533, + "loss": 0.4455, + "step": 11080 + }, + { + "epoch": 0.37045697487974344, + "grad_norm": 0.9440566301345825, + "learning_rate": 0.00019706312205585507, + "loss": 0.4616, + "step": 11090 + }, + { + "epoch": 0.3707910208444682, + "grad_norm": 0.7363137006759644, + "learning_rate": 0.00019705425698103996, + "loss": 0.4227, + "step": 11100 + }, + { + "epoch": 0.37112506680919294, + "grad_norm": 0.7593136429786682, + "learning_rate": 0.00019704537874661005, + "loss": 0.4977, + "step": 11110 + }, + { + "epoch": 0.3714591127739177, + "grad_norm": 0.7085240483283997, + "learning_rate": 0.00019703648735376912, + "loss": 0.4787, + "step": 11120 + }, + { + "epoch": 0.37179315873864244, + "grad_norm": 0.7872964143753052, + "learning_rate": 0.00019702758280372277, + "loss": 0.4512, + "step": 11130 + }, + { + "epoch": 0.3721272047033672, + "grad_norm": 0.8554701805114746, + "learning_rate": 0.00019701866509767835, + "loss": 0.4689, + "step": 11140 + }, + { + "epoch": 0.37246125066809194, + "grad_norm": 0.7175393104553223, + "learning_rate": 0.00019700973423684504, + "loss": 0.4855, + "step": 11150 + }, + { + "epoch": 0.3727952966328167, + "grad_norm": 0.7222559452056885, + "learning_rate": 0.00019700079022243373, + "loss": 0.5323, + "step": 11160 + }, + { + "epoch": 0.37312934259754144, + "grad_norm": 0.8892595171928406, + "learning_rate": 0.00019699183305565724, + "loss": 0.4887, + "step": 11170 + }, + { + "epoch": 0.3734633885622662, + "grad_norm": 0.7408721446990967, + "learning_rate": 0.00019698286273772992, + "loss": 0.4843, + "step": 11180 + }, + { + "epoch": 0.37379743452699093, + "grad_norm": 0.8000454306602478, + "learning_rate": 0.0001969738792698682, + "loss": 0.4672, + "step": 11190 + }, + { + "epoch": 0.3741314804917157, + "grad_norm": 0.7798967361450195, + "learning_rate": 0.0001969648826532901, + "loss": 0.4757, + "step": 11200 + }, + { + "epoch": 0.3744655264564404, + "grad_norm": 0.7103991508483887, + "learning_rate": 0.00019695587288921549, + "loss": 0.4665, + "step": 11210 + }, + { + "epoch": 0.37479957242116513, + "grad_norm": 0.6947950720787048, + "learning_rate": 0.00019694684997886594, + "loss": 0.4724, + "step": 11220 + }, + { + "epoch": 0.3751336183858899, + "grad_norm": 0.8962208032608032, + "learning_rate": 0.00019693781392346495, + "loss": 0.4688, + "step": 11230 + }, + { + "epoch": 0.3754676643506146, + "grad_norm": 0.8613168001174927, + "learning_rate": 0.00019692876472423768, + "loss": 0.4999, + "step": 11240 + }, + { + "epoch": 0.3758017103153394, + "grad_norm": 0.7403880953788757, + "learning_rate": 0.00019691970238241112, + "loss": 0.4669, + "step": 11250 + }, + { + "epoch": 0.3761357562800641, + "grad_norm": 0.739456832408905, + "learning_rate": 0.00019691062689921406, + "loss": 0.4717, + "step": 11260 + }, + { + "epoch": 0.3764698022447889, + "grad_norm": 0.8237618207931519, + "learning_rate": 0.000196901538275877, + "loss": 0.4495, + "step": 11270 + }, + { + "epoch": 0.3768038482095136, + "grad_norm": 0.7873772382736206, + "learning_rate": 0.0001968924365136323, + "loss": 0.4638, + "step": 11280 + }, + { + "epoch": 0.3771378941742384, + "grad_norm": 0.7746085524559021, + "learning_rate": 0.0001968833216137141, + "loss": 0.4528, + "step": 11290 + }, + { + "epoch": 0.3774719401389631, + "grad_norm": 0.8668926954269409, + "learning_rate": 0.0001968741935773582, + "loss": 0.4963, + "step": 11300 + }, + { + "epoch": 0.3778059861036879, + "grad_norm": 0.7093120217323303, + "learning_rate": 0.0001968650524058024, + "loss": 0.4581, + "step": 11310 + }, + { + "epoch": 0.3781400320684126, + "grad_norm": 0.743602454662323, + "learning_rate": 0.00019685589810028602, + "loss": 0.4729, + "step": 11320 + }, + { + "epoch": 0.3784740780331374, + "grad_norm": 0.7020447850227356, + "learning_rate": 0.00019684673066205037, + "loss": 0.4163, + "step": 11330 + }, + { + "epoch": 0.3788081239978621, + "grad_norm": 0.6761562824249268, + "learning_rate": 0.0001968375500923385, + "loss": 0.4405, + "step": 11340 + }, + { + "epoch": 0.3791421699625869, + "grad_norm": 0.9481543898582458, + "learning_rate": 0.0001968283563923951, + "loss": 0.4824, + "step": 11350 + }, + { + "epoch": 0.3794762159273116, + "grad_norm": 0.6668941378593445, + "learning_rate": 0.00019681914956346685, + "loss": 0.4607, + "step": 11360 + }, + { + "epoch": 0.37981026189203637, + "grad_norm": 0.836795449256897, + "learning_rate": 0.00019680992960680204, + "loss": 0.4807, + "step": 11370 + }, + { + "epoch": 0.38014430785676107, + "grad_norm": 0.8149944543838501, + "learning_rate": 0.00019680069652365086, + "loss": 0.4416, + "step": 11380 + }, + { + "epoch": 0.3804783538214858, + "grad_norm": 0.8936391472816467, + "learning_rate": 0.00019679145031526517, + "loss": 0.4694, + "step": 11390 + }, + { + "epoch": 0.38081239978621056, + "grad_norm": 0.8575438857078552, + "learning_rate": 0.0001967821909828987, + "loss": 0.4428, + "step": 11400 + }, + { + "epoch": 0.3811464457509353, + "grad_norm": 0.7331439256668091, + "learning_rate": 0.00019677291852780693, + "loss": 0.4984, + "step": 11410 + }, + { + "epoch": 0.38148049171566006, + "grad_norm": 0.7884340286254883, + "learning_rate": 0.0001967636329512471, + "loss": 0.4497, + "step": 11420 + }, + { + "epoch": 0.3818145376803848, + "grad_norm": 0.914576530456543, + "learning_rate": 0.00019675433425447826, + "loss": 0.4932, + "step": 11430 + }, + { + "epoch": 0.38214858364510956, + "grad_norm": 0.7630864381790161, + "learning_rate": 0.0001967450224387612, + "loss": 0.4138, + "step": 11440 + }, + { + "epoch": 0.3824826296098343, + "grad_norm": 0.7330807447433472, + "learning_rate": 0.00019673569750535853, + "loss": 0.4752, + "step": 11450 + }, + { + "epoch": 0.38281667557455906, + "grad_norm": 0.7038103938102722, + "learning_rate": 0.0001967263594555346, + "loss": 0.447, + "step": 11460 + }, + { + "epoch": 0.3831507215392838, + "grad_norm": 0.8009968400001526, + "learning_rate": 0.00019671700829055556, + "loss": 0.4667, + "step": 11470 + }, + { + "epoch": 0.38348476750400856, + "grad_norm": 0.8041735291481018, + "learning_rate": 0.00019670764401168937, + "loss": 0.4663, + "step": 11480 + }, + { + "epoch": 0.3838188134687333, + "grad_norm": 0.7537304759025574, + "learning_rate": 0.00019669826662020568, + "loss": 0.489, + "step": 11490 + }, + { + "epoch": 0.38415285943345806, + "grad_norm": 0.7765626311302185, + "learning_rate": 0.00019668887611737606, + "loss": 0.4597, + "step": 11500 + }, + { + "epoch": 0.3844869053981828, + "grad_norm": 0.6865957379341125, + "learning_rate": 0.0001966794725044737, + "loss": 0.4505, + "step": 11510 + }, + { + "epoch": 0.38482095136290756, + "grad_norm": 0.6744564771652222, + "learning_rate": 0.00019667005578277364, + "loss": 0.4999, + "step": 11520 + }, + { + "epoch": 0.3851549973276323, + "grad_norm": 0.6161285638809204, + "learning_rate": 0.00019666062595355275, + "loss": 0.4783, + "step": 11530 + }, + { + "epoch": 0.385489043292357, + "grad_norm": 0.7809021472930908, + "learning_rate": 0.00019665118301808953, + "loss": 0.4634, + "step": 11540 + }, + { + "epoch": 0.38582308925708175, + "grad_norm": 0.8055886626243591, + "learning_rate": 0.00019664172697766446, + "loss": 0.4651, + "step": 11550 + }, + { + "epoch": 0.3861571352218065, + "grad_norm": 0.7690239548683167, + "learning_rate": 0.00019663225783355962, + "loss": 0.4678, + "step": 11560 + }, + { + "epoch": 0.38649118118653125, + "grad_norm": 0.9547606110572815, + "learning_rate": 0.00019662277558705893, + "loss": 0.4816, + "step": 11570 + }, + { + "epoch": 0.386825227151256, + "grad_norm": 0.9427857398986816, + "learning_rate": 0.00019661328023944813, + "loss": 0.4736, + "step": 11580 + }, + { + "epoch": 0.38715927311598075, + "grad_norm": 0.7548187971115112, + "learning_rate": 0.00019660377179201466, + "loss": 0.4747, + "step": 11590 + }, + { + "epoch": 0.3874933190807055, + "grad_norm": 0.799413800239563, + "learning_rate": 0.00019659425024604778, + "loss": 0.4785, + "step": 11600 + }, + { + "epoch": 0.38782736504543025, + "grad_norm": 0.7871565222740173, + "learning_rate": 0.00019658471560283854, + "loss": 0.5036, + "step": 11610 + }, + { + "epoch": 0.388161411010155, + "grad_norm": 0.7192272543907166, + "learning_rate": 0.00019657516786367976, + "loss": 0.4798, + "step": 11620 + }, + { + "epoch": 0.38849545697487975, + "grad_norm": 0.759269118309021, + "learning_rate": 0.00019656560702986596, + "loss": 0.4634, + "step": 11630 + }, + { + "epoch": 0.3888295029396045, + "grad_norm": 0.7818036079406738, + "learning_rate": 0.00019655603310269354, + "loss": 0.456, + "step": 11640 + }, + { + "epoch": 0.38916354890432925, + "grad_norm": 0.7934758067131042, + "learning_rate": 0.00019654644608346064, + "loss": 0.4774, + "step": 11650 + }, + { + "epoch": 0.389497594869054, + "grad_norm": 0.9040003418922424, + "learning_rate": 0.00019653684597346712, + "loss": 0.4494, + "step": 11660 + }, + { + "epoch": 0.38983164083377875, + "grad_norm": 0.6384785771369934, + "learning_rate": 0.00019652723277401471, + "loss": 0.4656, + "step": 11670 + }, + { + "epoch": 0.3901656867985035, + "grad_norm": 0.8215344548225403, + "learning_rate": 0.00019651760648640683, + "loss": 0.4883, + "step": 11680 + }, + { + "epoch": 0.39049973276322825, + "grad_norm": 0.6444737315177917, + "learning_rate": 0.00019650796711194875, + "loss": 0.489, + "step": 11690 + }, + { + "epoch": 0.39083377872795294, + "grad_norm": 0.8934856653213501, + "learning_rate": 0.0001964983146519474, + "loss": 0.4504, + "step": 11700 + }, + { + "epoch": 0.3911678246926777, + "grad_norm": 0.9299535751342773, + "learning_rate": 0.00019648864910771168, + "loss": 0.4872, + "step": 11710 + }, + { + "epoch": 0.39150187065740244, + "grad_norm": 0.7235091328620911, + "learning_rate": 0.00019647897048055203, + "loss": 0.4804, + "step": 11720 + }, + { + "epoch": 0.3918359166221272, + "grad_norm": 0.661304771900177, + "learning_rate": 0.00019646927877178084, + "loss": 0.4344, + "step": 11730 + }, + { + "epoch": 0.39216996258685194, + "grad_norm": 0.8083805441856384, + "learning_rate": 0.0001964595739827122, + "loss": 0.4547, + "step": 11740 + }, + { + "epoch": 0.3925040085515767, + "grad_norm": 0.7910724878311157, + "learning_rate": 0.00019644985611466198, + "loss": 0.4843, + "step": 11750 + }, + { + "epoch": 0.39283805451630144, + "grad_norm": 0.7472886443138123, + "learning_rate": 0.00019644012516894783, + "loss": 0.4827, + "step": 11760 + }, + { + "epoch": 0.3931721004810262, + "grad_norm": 0.9335936307907104, + "learning_rate": 0.00019643038114688918, + "loss": 0.4572, + "step": 11770 + }, + { + "epoch": 0.39350614644575094, + "grad_norm": 0.8157020807266235, + "learning_rate": 0.0001964206240498072, + "loss": 0.4712, + "step": 11780 + }, + { + "epoch": 0.3938401924104757, + "grad_norm": 0.8066133856773376, + "learning_rate": 0.00019641085387902489, + "loss": 0.4816, + "step": 11790 + }, + { + "epoch": 0.39417423837520044, + "grad_norm": 0.7861907482147217, + "learning_rate": 0.000196401070635867, + "loss": 0.4791, + "step": 11800 + }, + { + "epoch": 0.3945082843399252, + "grad_norm": 0.7700847387313843, + "learning_rate": 0.00019639127432165995, + "loss": 0.494, + "step": 11810 + }, + { + "epoch": 0.39484233030464994, + "grad_norm": 0.7726387977600098, + "learning_rate": 0.00019638146493773214, + "loss": 0.4658, + "step": 11820 + }, + { + "epoch": 0.3951763762693747, + "grad_norm": 0.7228941321372986, + "learning_rate": 0.00019637164248541357, + "loss": 0.4839, + "step": 11830 + }, + { + "epoch": 0.39551042223409943, + "grad_norm": 0.658534049987793, + "learning_rate": 0.00019636180696603608, + "loss": 0.4738, + "step": 11840 + }, + { + "epoch": 0.3958444681988242, + "grad_norm": 0.7263990640640259, + "learning_rate": 0.00019635195838093328, + "loss": 0.4411, + "step": 11850 + }, + { + "epoch": 0.3961785141635489, + "grad_norm": 1.0063343048095703, + "learning_rate": 0.0001963420967314405, + "loss": 0.4824, + "step": 11860 + }, + { + "epoch": 0.39651256012827363, + "grad_norm": 0.8860673308372498, + "learning_rate": 0.00019633222201889493, + "loss": 0.5134, + "step": 11870 + }, + { + "epoch": 0.3968466060929984, + "grad_norm": 0.8006879687309265, + "learning_rate": 0.00019632233424463546, + "loss": 0.4779, + "step": 11880 + }, + { + "epoch": 0.3971806520577231, + "grad_norm": 0.8487799167633057, + "learning_rate": 0.00019631243341000282, + "loss": 0.4484, + "step": 11890 + }, + { + "epoch": 0.3975146980224479, + "grad_norm": 0.7340543866157532, + "learning_rate": 0.00019630251951633943, + "loss": 0.4936, + "step": 11900 + }, + { + "epoch": 0.3978487439871726, + "grad_norm": 0.8068410754203796, + "learning_rate": 0.0001962925925649895, + "loss": 0.4577, + "step": 11910 + }, + { + "epoch": 0.3981827899518974, + "grad_norm": 0.7553281784057617, + "learning_rate": 0.00019628265255729907, + "loss": 0.4689, + "step": 11920 + }, + { + "epoch": 0.3985168359166221, + "grad_norm": 0.906579315662384, + "learning_rate": 0.0001962726994946159, + "loss": 0.4783, + "step": 11930 + }, + { + "epoch": 0.3988508818813469, + "grad_norm": 0.741832435131073, + "learning_rate": 0.00019626273337828947, + "loss": 0.4423, + "step": 11940 + }, + { + "epoch": 0.3991849278460716, + "grad_norm": 0.6025128960609436, + "learning_rate": 0.00019625275420967117, + "loss": 0.4093, + "step": 11950 + }, + { + "epoch": 0.3995189738107964, + "grad_norm": 0.7121363282203674, + "learning_rate": 0.00019624276199011403, + "loss": 0.4734, + "step": 11960 + }, + { + "epoch": 0.3998530197755211, + "grad_norm": 0.9440948963165283, + "learning_rate": 0.00019623275672097293, + "loss": 0.4497, + "step": 11970 + }, + { + "epoch": 0.4001870657402459, + "grad_norm": 0.75343257188797, + "learning_rate": 0.00019622273840360442, + "loss": 0.4635, + "step": 11980 + }, + { + "epoch": 0.4005211117049706, + "grad_norm": 0.8060387969017029, + "learning_rate": 0.000196212707039367, + "loss": 0.47, + "step": 11990 + }, + { + "epoch": 0.40085515766969537, + "grad_norm": 0.6871958374977112, + "learning_rate": 0.00019620266262962072, + "loss": 0.509, + "step": 12000 + }, + { + "epoch": 0.40085515766969537, + "eval_chrf": 84.12509562563687, + "eval_loss": 0.8187453150749207, + "eval_runtime": 170.4993, + "eval_samples_per_second": 0.587, + "eval_steps_per_second": 0.023, + "step": 12000 + }, + { + "epoch": 0.4011892036344201, + "grad_norm": 0.7877377271652222, + "learning_rate": 0.0001961926051757276, + "loss": 0.4619, + "step": 12010 + }, + { + "epoch": 0.4015232495991448, + "grad_norm": 0.7216804027557373, + "learning_rate": 0.0001961825346790512, + "loss": 0.4952, + "step": 12020 + }, + { + "epoch": 0.40185729556386957, + "grad_norm": 0.6721543073654175, + "learning_rate": 0.00019617245114095716, + "loss": 0.4634, + "step": 12030 + }, + { + "epoch": 0.4021913415285943, + "grad_norm": 0.8537166714668274, + "learning_rate": 0.00019616235456281254, + "loss": 0.4921, + "step": 12040 + }, + { + "epoch": 0.40252538749331906, + "grad_norm": 0.6567972898483276, + "learning_rate": 0.0001961522449459864, + "loss": 0.4463, + "step": 12050 + }, + { + "epoch": 0.4028594334580438, + "grad_norm": 0.8174808621406555, + "learning_rate": 0.00019614212229184953, + "loss": 0.471, + "step": 12060 + }, + { + "epoch": 0.40319347942276856, + "grad_norm": 0.6819205284118652, + "learning_rate": 0.00019613198660177448, + "loss": 0.4543, + "step": 12070 + }, + { + "epoch": 0.4035275253874933, + "grad_norm": 0.7450578808784485, + "learning_rate": 0.00019612183787713549, + "loss": 0.4477, + "step": 12080 + }, + { + "epoch": 0.40386157135221806, + "grad_norm": 0.7166303992271423, + "learning_rate": 0.00019611167611930869, + "loss": 0.473, + "step": 12090 + }, + { + "epoch": 0.4041956173169428, + "grad_norm": 0.7241618633270264, + "learning_rate": 0.00019610150132967185, + "loss": 0.4927, + "step": 12100 + }, + { + "epoch": 0.40452966328166756, + "grad_norm": 0.6643677353858948, + "learning_rate": 0.0001960913135096046, + "loss": 0.4498, + "step": 12110 + }, + { + "epoch": 0.4048637092463923, + "grad_norm": 0.777229905128479, + "learning_rate": 0.00019608111266048833, + "loss": 0.4472, + "step": 12120 + }, + { + "epoch": 0.40519775521111706, + "grad_norm": 0.7487637996673584, + "learning_rate": 0.00019607089878370616, + "loss": 0.4696, + "step": 12130 + }, + { + "epoch": 0.4055318011758418, + "grad_norm": 0.7533994317054749, + "learning_rate": 0.00019606067188064302, + "loss": 0.4546, + "step": 12140 + }, + { + "epoch": 0.40586584714056656, + "grad_norm": 0.8595882654190063, + "learning_rate": 0.0001960504319526855, + "loss": 0.4977, + "step": 12150 + }, + { + "epoch": 0.4061998931052913, + "grad_norm": 0.6456552147865295, + "learning_rate": 0.00019604017900122213, + "loss": 0.5018, + "step": 12160 + }, + { + "epoch": 0.40653393907001606, + "grad_norm": 0.7625018954277039, + "learning_rate": 0.00019602991302764306, + "loss": 0.4311, + "step": 12170 + }, + { + "epoch": 0.40686798503474075, + "grad_norm": 0.7633054256439209, + "learning_rate": 0.00019601963403334028, + "loss": 0.4525, + "step": 12180 + }, + { + "epoch": 0.4072020309994655, + "grad_norm": 0.7970126271247864, + "learning_rate": 0.00019600934201970748, + "loss": 0.4939, + "step": 12190 + }, + { + "epoch": 0.40753607696419025, + "grad_norm": 0.6416233777999878, + "learning_rate": 0.0001959990369881402, + "loss": 0.4612, + "step": 12200 + }, + { + "epoch": 0.407870122928915, + "grad_norm": 0.9063084125518799, + "learning_rate": 0.00019598871894003565, + "loss": 0.473, + "step": 12210 + }, + { + "epoch": 0.40820416889363975, + "grad_norm": 0.7767049074172974, + "learning_rate": 0.00019597838787679294, + "loss": 0.4387, + "step": 12220 + }, + { + "epoch": 0.4085382148583645, + "grad_norm": 0.8792232871055603, + "learning_rate": 0.0001959680437998128, + "loss": 0.4641, + "step": 12230 + }, + { + "epoch": 0.40887226082308925, + "grad_norm": 0.6900570392608643, + "learning_rate": 0.00019595768671049779, + "loss": 0.4321, + "step": 12240 + }, + { + "epoch": 0.409206306787814, + "grad_norm": 0.7847850918769836, + "learning_rate": 0.00019594731661025226, + "loss": 0.4756, + "step": 12250 + }, + { + "epoch": 0.40954035275253875, + "grad_norm": 0.6564018130302429, + "learning_rate": 0.00019593693350048225, + "loss": 0.4421, + "step": 12260 + }, + { + "epoch": 0.4098743987172635, + "grad_norm": 0.7897236347198486, + "learning_rate": 0.00019592653738259566, + "loss": 0.4904, + "step": 12270 + }, + { + "epoch": 0.41020844468198825, + "grad_norm": 0.6749510169029236, + "learning_rate": 0.00019591612825800206, + "loss": 0.4589, + "step": 12280 + }, + { + "epoch": 0.410542490646713, + "grad_norm": 0.8254622220993042, + "learning_rate": 0.00019590570612811287, + "loss": 0.4939, + "step": 12290 + }, + { + "epoch": 0.41087653661143775, + "grad_norm": 0.8289754390716553, + "learning_rate": 0.0001958952709943412, + "loss": 0.4826, + "step": 12300 + }, + { + "epoch": 0.4112105825761625, + "grad_norm": 0.8133805990219116, + "learning_rate": 0.00019588482285810195, + "loss": 0.4354, + "step": 12310 + }, + { + "epoch": 0.41154462854088725, + "grad_norm": 0.6556638479232788, + "learning_rate": 0.00019587436172081177, + "loss": 0.4521, + "step": 12320 + }, + { + "epoch": 0.411878674505612, + "grad_norm": 0.681178092956543, + "learning_rate": 0.00019586388758388915, + "loss": 0.4517, + "step": 12330 + }, + { + "epoch": 0.4122127204703367, + "grad_norm": 0.7689962983131409, + "learning_rate": 0.00019585340044875422, + "loss": 0.5099, + "step": 12340 + }, + { + "epoch": 0.41254676643506144, + "grad_norm": 0.6675758361816406, + "learning_rate": 0.00019584290031682898, + "loss": 0.4818, + "step": 12350 + }, + { + "epoch": 0.4128808123997862, + "grad_norm": 0.9746050238609314, + "learning_rate": 0.00019583238718953708, + "loss": 0.4355, + "step": 12360 + }, + { + "epoch": 0.41321485836451094, + "grad_norm": 0.7204445600509644, + "learning_rate": 0.00019582186106830406, + "loss": 0.4568, + "step": 12370 + }, + { + "epoch": 0.4135489043292357, + "grad_norm": 0.8911823034286499, + "learning_rate": 0.00019581132195455717, + "loss": 0.4496, + "step": 12380 + }, + { + "epoch": 0.41388295029396044, + "grad_norm": 0.8366311192512512, + "learning_rate": 0.00019580076984972538, + "loss": 0.4734, + "step": 12390 + }, + { + "epoch": 0.4142169962586852, + "grad_norm": 0.9188138842582703, + "learning_rate": 0.00019579020475523942, + "loss": 0.4488, + "step": 12400 + }, + { + "epoch": 0.41455104222340994, + "grad_norm": 0.6592729091644287, + "learning_rate": 0.00019577962667253186, + "loss": 0.4553, + "step": 12410 + }, + { + "epoch": 0.4148850881881347, + "grad_norm": 0.6868622303009033, + "learning_rate": 0.00019576903560303697, + "loss": 0.4358, + "step": 12420 + }, + { + "epoch": 0.41521913415285944, + "grad_norm": 0.7919336557388306, + "learning_rate": 0.0001957584315481908, + "loss": 0.4969, + "step": 12430 + }, + { + "epoch": 0.4155531801175842, + "grad_norm": 0.8055655360221863, + "learning_rate": 0.00019574781450943117, + "loss": 0.4564, + "step": 12440 + }, + { + "epoch": 0.41588722608230894, + "grad_norm": 0.782390832901001, + "learning_rate": 0.0001957371844881976, + "loss": 0.5138, + "step": 12450 + }, + { + "epoch": 0.4162212720470337, + "grad_norm": 0.7484316825866699, + "learning_rate": 0.0001957265414859315, + "loss": 0.4416, + "step": 12460 + }, + { + "epoch": 0.41655531801175844, + "grad_norm": 0.8447122573852539, + "learning_rate": 0.00019571588550407591, + "loss": 0.4558, + "step": 12470 + }, + { + "epoch": 0.4168893639764832, + "grad_norm": 0.6710695028305054, + "learning_rate": 0.00019570521654407565, + "loss": 0.4592, + "step": 12480 + }, + { + "epoch": 0.41722340994120793, + "grad_norm": 0.9042978882789612, + "learning_rate": 0.00019569453460737738, + "loss": 0.4551, + "step": 12490 + }, + { + "epoch": 0.4175574559059327, + "grad_norm": 0.626265823841095, + "learning_rate": 0.00019568383969542942, + "loss": 0.4535, + "step": 12500 + }, + { + "epoch": 0.4178915018706574, + "grad_norm": 0.6871891021728516, + "learning_rate": 0.00019567313180968192, + "loss": 0.45, + "step": 12510 + }, + { + "epoch": 0.4182255478353821, + "grad_norm": 0.8078727722167969, + "learning_rate": 0.00019566241095158676, + "loss": 0.4446, + "step": 12520 + }, + { + "epoch": 0.4185595938001069, + "grad_norm": 0.7571625709533691, + "learning_rate": 0.00019565167712259763, + "loss": 0.4707, + "step": 12530 + }, + { + "epoch": 0.4188936397648316, + "grad_norm": 0.8624815344810486, + "learning_rate": 0.00019564093032416982, + "loss": 0.461, + "step": 12540 + }, + { + "epoch": 0.4192276857295564, + "grad_norm": 0.7415806651115417, + "learning_rate": 0.00019563017055776062, + "loss": 0.4424, + "step": 12550 + }, + { + "epoch": 0.4195617316942811, + "grad_norm": 0.6654296517372131, + "learning_rate": 0.00019561939782482886, + "loss": 0.4747, + "step": 12560 + }, + { + "epoch": 0.4198957776590059, + "grad_norm": 0.763261079788208, + "learning_rate": 0.00019560861212683528, + "loss": 0.4703, + "step": 12570 + }, + { + "epoch": 0.4202298236237306, + "grad_norm": 0.7310267686843872, + "learning_rate": 0.00019559781346524226, + "loss": 0.4201, + "step": 12580 + }, + { + "epoch": 0.4205638695884554, + "grad_norm": 0.6698771119117737, + "learning_rate": 0.00019558700184151405, + "loss": 0.4283, + "step": 12590 + }, + { + "epoch": 0.4208979155531801, + "grad_norm": 0.7101729512214661, + "learning_rate": 0.00019557617725711655, + "loss": 0.464, + "step": 12600 + }, + { + "epoch": 0.4212319615179049, + "grad_norm": 0.6976768374443054, + "learning_rate": 0.0001955653397135175, + "loss": 0.4499, + "step": 12610 + }, + { + "epoch": 0.4215660074826296, + "grad_norm": 0.6799318790435791, + "learning_rate": 0.00019555448921218632, + "loss": 0.4629, + "step": 12620 + }, + { + "epoch": 0.4219000534473544, + "grad_norm": 0.8444611430168152, + "learning_rate": 0.0001955436257545943, + "loss": 0.456, + "step": 12630 + }, + { + "epoch": 0.4222340994120791, + "grad_norm": 0.7331224679946899, + "learning_rate": 0.00019553274934221436, + "loss": 0.4843, + "step": 12640 + }, + { + "epoch": 0.42256814537680387, + "grad_norm": 0.8414499163627625, + "learning_rate": 0.0001955218599765213, + "loss": 0.4973, + "step": 12650 + }, + { + "epoch": 0.4229021913415286, + "grad_norm": 0.8668429255485535, + "learning_rate": 0.00019551095765899158, + "loss": 0.4856, + "step": 12660 + }, + { + "epoch": 0.4232362373062533, + "grad_norm": 0.8337170481681824, + "learning_rate": 0.0001955000423911034, + "loss": 0.4684, + "step": 12670 + }, + { + "epoch": 0.42357028327097807, + "grad_norm": 0.5983981490135193, + "learning_rate": 0.0001954891141743368, + "loss": 0.4781, + "step": 12680 + }, + { + "epoch": 0.4239043292357028, + "grad_norm": 0.7948489785194397, + "learning_rate": 0.0001954781730101736, + "loss": 0.4567, + "step": 12690 + }, + { + "epoch": 0.42423837520042756, + "grad_norm": 0.7609530091285706, + "learning_rate": 0.00019546721890009724, + "loss": 0.4474, + "step": 12700 + }, + { + "epoch": 0.4245724211651523, + "grad_norm": 0.6856061220169067, + "learning_rate": 0.00019545625184559304, + "loss": 0.4876, + "step": 12710 + }, + { + "epoch": 0.42490646712987706, + "grad_norm": 0.70638108253479, + "learning_rate": 0.00019544527184814795, + "loss": 0.4699, + "step": 12720 + }, + { + "epoch": 0.4252405130946018, + "grad_norm": 0.8413442373275757, + "learning_rate": 0.00019543427890925084, + "loss": 0.4509, + "step": 12730 + }, + { + "epoch": 0.42557455905932656, + "grad_norm": 0.735463559627533, + "learning_rate": 0.0001954232730303922, + "loss": 0.4206, + "step": 12740 + }, + { + "epoch": 0.4259086050240513, + "grad_norm": 0.7356415390968323, + "learning_rate": 0.00019541225421306431, + "loss": 0.4793, + "step": 12750 + }, + { + "epoch": 0.42624265098877606, + "grad_norm": 0.7593218088150024, + "learning_rate": 0.00019540122245876126, + "loss": 0.4593, + "step": 12760 + }, + { + "epoch": 0.4265766969535008, + "grad_norm": 0.6642664074897766, + "learning_rate": 0.0001953901777689788, + "loss": 0.4605, + "step": 12770 + }, + { + "epoch": 0.42691074291822556, + "grad_norm": 0.6838988661766052, + "learning_rate": 0.00019537912014521452, + "loss": 0.4566, + "step": 12780 + }, + { + "epoch": 0.4272447888829503, + "grad_norm": 0.9228357672691345, + "learning_rate": 0.00019536804958896772, + "loss": 0.4521, + "step": 12790 + }, + { + "epoch": 0.42757883484767506, + "grad_norm": 0.8001843690872192, + "learning_rate": 0.00019535696610173942, + "loss": 0.4282, + "step": 12800 + }, + { + "epoch": 0.4279128808123998, + "grad_norm": 0.748106062412262, + "learning_rate": 0.00019534586968503247, + "loss": 0.4426, + "step": 12810 + }, + { + "epoch": 0.42824692677712456, + "grad_norm": 0.8272425532341003, + "learning_rate": 0.00019533476034035144, + "loss": 0.4676, + "step": 12820 + }, + { + "epoch": 0.42858097274184925, + "grad_norm": 0.7360121011734009, + "learning_rate": 0.00019532363806920263, + "loss": 0.4522, + "step": 12830 + }, + { + "epoch": 0.428915018706574, + "grad_norm": 0.8063158988952637, + "learning_rate": 0.00019531250287309413, + "loss": 0.4501, + "step": 12840 + }, + { + "epoch": 0.42924906467129875, + "grad_norm": 0.7268065214157104, + "learning_rate": 0.00019530135475353576, + "loss": 0.4499, + "step": 12850 + }, + { + "epoch": 0.4295831106360235, + "grad_norm": 3.9996092319488525, + "learning_rate": 0.0001952901937120391, + "loss": 0.4614, + "step": 12860 + }, + { + "epoch": 0.42991715660074825, + "grad_norm": 0.7722535133361816, + "learning_rate": 0.00019527901975011745, + "loss": 0.4606, + "step": 12870 + }, + { + "epoch": 0.430251202565473, + "grad_norm": 0.7522767186164856, + "learning_rate": 0.0001952678328692859, + "loss": 0.4403, + "step": 12880 + }, + { + "epoch": 0.43058524853019775, + "grad_norm": 0.7003090381622314, + "learning_rate": 0.00019525663307106133, + "loss": 0.444, + "step": 12890 + }, + { + "epoch": 0.4309192944949225, + "grad_norm": 0.8167745471000671, + "learning_rate": 0.00019524542035696224, + "loss": 0.4832, + "step": 12900 + }, + { + "epoch": 0.43125334045964725, + "grad_norm": 0.8354787826538086, + "learning_rate": 0.00019523419472850906, + "loss": 0.489, + "step": 12910 + }, + { + "epoch": 0.431587386424372, + "grad_norm": 0.7630041837692261, + "learning_rate": 0.00019522295618722382, + "loss": 0.4567, + "step": 12920 + }, + { + "epoch": 0.43192143238909675, + "grad_norm": 0.8161671161651611, + "learning_rate": 0.00019521170473463036, + "loss": 0.4737, + "step": 12930 + }, + { + "epoch": 0.4322554783538215, + "grad_norm": 0.8538644313812256, + "learning_rate": 0.00019520044037225426, + "loss": 0.5028, + "step": 12940 + }, + { + "epoch": 0.43258952431854625, + "grad_norm": 0.78462153673172, + "learning_rate": 0.00019518916310162288, + "loss": 0.4681, + "step": 12950 + }, + { + "epoch": 0.432923570283271, + "grad_norm": 0.7666411995887756, + "learning_rate": 0.00019517787292426536, + "loss": 0.4406, + "step": 12960 + }, + { + "epoch": 0.43325761624799575, + "grad_norm": 0.8464723229408264, + "learning_rate": 0.0001951665698417124, + "loss": 0.4272, + "step": 12970 + }, + { + "epoch": 0.4335916622127205, + "grad_norm": 0.9603878259658813, + "learning_rate": 0.00019515525385549668, + "loss": 0.4738, + "step": 12980 + }, + { + "epoch": 0.4339257081774452, + "grad_norm": 0.738681435585022, + "learning_rate": 0.00019514392496715254, + "loss": 0.4712, + "step": 12990 + }, + { + "epoch": 0.43425975414216994, + "grad_norm": 0.722231924533844, + "learning_rate": 0.00019513258317821607, + "loss": 0.5025, + "step": 13000 + }, + { + "epoch": 0.43425975414216994, + "eval_chrf": 84.76571814479664, + "eval_loss": 0.8131721019744873, + "eval_runtime": 178.5629, + "eval_samples_per_second": 0.56, + "eval_steps_per_second": 0.022, + "step": 13000 + }, + { + "epoch": 0.4345938001068947, + "grad_norm": 0.7434185147285461, + "learning_rate": 0.00019512122849022506, + "loss": 0.4503, + "step": 13010 + }, + { + "epoch": 0.43492784607161944, + "grad_norm": 0.6826658844947815, + "learning_rate": 0.00019510986090471915, + "loss": 0.4512, + "step": 13020 + }, + { + "epoch": 0.4352618920363442, + "grad_norm": 0.7430537343025208, + "learning_rate": 0.00019509848042323965, + "loss": 0.4854, + "step": 13030 + }, + { + "epoch": 0.43559593800106894, + "grad_norm": 0.625129222869873, + "learning_rate": 0.00019508708704732964, + "loss": 0.4575, + "step": 13040 + }, + { + "epoch": 0.4359299839657937, + "grad_norm": 0.7724032402038574, + "learning_rate": 0.00019507568077853393, + "loss": 0.4836, + "step": 13050 + }, + { + "epoch": 0.43626402993051844, + "grad_norm": 0.7185297012329102, + "learning_rate": 0.00019506426161839912, + "loss": 0.455, + "step": 13060 + }, + { + "epoch": 0.4365980758952432, + "grad_norm": 0.7680572271347046, + "learning_rate": 0.00019505282956847357, + "loss": 0.4361, + "step": 13070 + }, + { + "epoch": 0.43693212185996794, + "grad_norm": 0.7837376594543457, + "learning_rate": 0.0001950413846303073, + "loss": 0.4525, + "step": 13080 + }, + { + "epoch": 0.4372661678246927, + "grad_norm": 0.7476522922515869, + "learning_rate": 0.00019502992680545215, + "loss": 0.4666, + "step": 13090 + }, + { + "epoch": 0.43760021378941744, + "grad_norm": 0.8905794024467468, + "learning_rate": 0.00019501845609546173, + "loss": 0.4786, + "step": 13100 + }, + { + "epoch": 0.4379342597541422, + "grad_norm": 0.7740911841392517, + "learning_rate": 0.0001950069725018913, + "loss": 0.4545, + "step": 13110 + }, + { + "epoch": 0.43826830571886694, + "grad_norm": 0.8797983527183533, + "learning_rate": 0.00019499547602629794, + "loss": 0.478, + "step": 13120 + }, + { + "epoch": 0.4386023516835917, + "grad_norm": 0.8270354270935059, + "learning_rate": 0.0001949839666702405, + "loss": 0.4464, + "step": 13130 + }, + { + "epoch": 0.43893639764831643, + "grad_norm": 0.6720091700553894, + "learning_rate": 0.00019497244443527947, + "loss": 0.468, + "step": 13140 + }, + { + "epoch": 0.43927044361304113, + "grad_norm": 0.7950276136398315, + "learning_rate": 0.0001949609093229772, + "loss": 0.444, + "step": 13150 + }, + { + "epoch": 0.4396044895777659, + "grad_norm": 0.7679458856582642, + "learning_rate": 0.00019494936133489772, + "loss": 0.4847, + "step": 13160 + }, + { + "epoch": 0.4399385355424906, + "grad_norm": 0.7334076762199402, + "learning_rate": 0.00019493780047260683, + "loss": 0.4749, + "step": 13170 + }, + { + "epoch": 0.4402725815072154, + "grad_norm": 0.7785823941230774, + "learning_rate": 0.0001949262267376721, + "loss": 0.4725, + "step": 13180 + }, + { + "epoch": 0.4406066274719401, + "grad_norm": 0.7163248062133789, + "learning_rate": 0.00019491464013166276, + "loss": 0.4362, + "step": 13190 + }, + { + "epoch": 0.4409406734366649, + "grad_norm": 0.8560178279876709, + "learning_rate": 0.00019490304065614987, + "loss": 0.4537, + "step": 13200 + }, + { + "epoch": 0.4412747194013896, + "grad_norm": 0.7184076309204102, + "learning_rate": 0.00019489142831270623, + "loss": 0.458, + "step": 13210 + }, + { + "epoch": 0.4416087653661144, + "grad_norm": 0.6946830153465271, + "learning_rate": 0.00019487980310290632, + "loss": 0.4411, + "step": 13220 + }, + { + "epoch": 0.4419428113308391, + "grad_norm": 0.7297454476356506, + "learning_rate": 0.00019486816502832643, + "loss": 0.4769, + "step": 13230 + }, + { + "epoch": 0.4422768572955639, + "grad_norm": 0.6790744662284851, + "learning_rate": 0.00019485651409054455, + "loss": 0.4489, + "step": 13240 + }, + { + "epoch": 0.4426109032602886, + "grad_norm": 0.7886067628860474, + "learning_rate": 0.00019484485029114046, + "loss": 0.4547, + "step": 13250 + }, + { + "epoch": 0.4429449492250134, + "grad_norm": 0.6971169114112854, + "learning_rate": 0.00019483317363169563, + "loss": 0.4811, + "step": 13260 + }, + { + "epoch": 0.4432789951897381, + "grad_norm": 0.6011500954627991, + "learning_rate": 0.00019482148411379332, + "loss": 0.479, + "step": 13270 + }, + { + "epoch": 0.4436130411544629, + "grad_norm": 0.7637840509414673, + "learning_rate": 0.00019480978173901857, + "loss": 0.4621, + "step": 13280 + }, + { + "epoch": 0.4439470871191876, + "grad_norm": 0.7926741242408752, + "learning_rate": 0.00019479806650895797, + "loss": 0.5132, + "step": 13290 + }, + { + "epoch": 0.44428113308391237, + "grad_norm": 0.7452704310417175, + "learning_rate": 0.0001947863384252001, + "loss": 0.4421, + "step": 13300 + }, + { + "epoch": 0.44461517904863707, + "grad_norm": 0.7788531184196472, + "learning_rate": 0.00019477459748933512, + "loss": 0.471, + "step": 13310 + }, + { + "epoch": 0.4449492250133618, + "grad_norm": 0.6357733607292175, + "learning_rate": 0.00019476284370295506, + "loss": 0.5045, + "step": 13320 + }, + { + "epoch": 0.44528327097808656, + "grad_norm": 1.0999094247817993, + "learning_rate": 0.00019475107706765353, + "loss": 0.4825, + "step": 13330 + }, + { + "epoch": 0.4456173169428113, + "grad_norm": 0.7688634991645813, + "learning_rate": 0.00019473929758502605, + "loss": 0.473, + "step": 13340 + }, + { + "epoch": 0.44595136290753606, + "grad_norm": 0.6906781196594238, + "learning_rate": 0.00019472750525666976, + "loss": 0.4666, + "step": 13350 + }, + { + "epoch": 0.4462854088722608, + "grad_norm": 0.7467180490493774, + "learning_rate": 0.00019471570008418359, + "loss": 0.4437, + "step": 13360 + }, + { + "epoch": 0.44661945483698556, + "grad_norm": 0.6687162518501282, + "learning_rate": 0.00019470388206916818, + "loss": 0.4165, + "step": 13370 + }, + { + "epoch": 0.4469535008017103, + "grad_norm": 0.7441921234130859, + "learning_rate": 0.000194692051213226, + "loss": 0.485, + "step": 13380 + }, + { + "epoch": 0.44728754676643506, + "grad_norm": 0.7496967315673828, + "learning_rate": 0.00019468020751796115, + "loss": 0.4605, + "step": 13390 + }, + { + "epoch": 0.4476215927311598, + "grad_norm": 0.6787314414978027, + "learning_rate": 0.00019466835098497952, + "loss": 0.4692, + "step": 13400 + }, + { + "epoch": 0.44795563869588456, + "grad_norm": 0.8261398077011108, + "learning_rate": 0.0001946564816158888, + "loss": 0.4469, + "step": 13410 + }, + { + "epoch": 0.4482896846606093, + "grad_norm": 0.6962042450904846, + "learning_rate": 0.00019464459941229828, + "loss": 0.4434, + "step": 13420 + }, + { + "epoch": 0.44862373062533406, + "grad_norm": 0.773162841796875, + "learning_rate": 0.00019463270437581912, + "loss": 0.4789, + "step": 13430 + }, + { + "epoch": 0.4489577765900588, + "grad_norm": 0.6836844086647034, + "learning_rate": 0.00019462079650806418, + "loss": 0.4481, + "step": 13440 + }, + { + "epoch": 0.44929182255478356, + "grad_norm": 0.6728940010070801, + "learning_rate": 0.000194608875810648, + "loss": 0.4594, + "step": 13450 + }, + { + "epoch": 0.4496258685195083, + "grad_norm": 0.8977975845336914, + "learning_rate": 0.00019459694228518698, + "loss": 0.4514, + "step": 13460 + }, + { + "epoch": 0.449959914484233, + "grad_norm": 0.7738216519355774, + "learning_rate": 0.00019458499593329913, + "loss": 0.4623, + "step": 13470 + }, + { + "epoch": 0.45029396044895775, + "grad_norm": 0.7506988048553467, + "learning_rate": 0.00019457303675660427, + "loss": 0.4437, + "step": 13480 + }, + { + "epoch": 0.4506280064136825, + "grad_norm": 0.6900798082351685, + "learning_rate": 0.00019456106475672399, + "loss": 0.4748, + "step": 13490 + }, + { + "epoch": 0.45096205237840725, + "grad_norm": 0.8436459898948669, + "learning_rate": 0.0001945490799352815, + "loss": 0.4399, + "step": 13500 + }, + { + "epoch": 0.451296098343132, + "grad_norm": 0.6166313290596008, + "learning_rate": 0.00019453708229390194, + "loss": 0.4601, + "step": 13510 + }, + { + "epoch": 0.45163014430785675, + "grad_norm": 0.592606246471405, + "learning_rate": 0.00019452507183421197, + "loss": 0.4414, + "step": 13520 + }, + { + "epoch": 0.4519641902725815, + "grad_norm": 0.8229379057884216, + "learning_rate": 0.00019451304855784012, + "loss": 0.476, + "step": 13530 + }, + { + "epoch": 0.45229823623730625, + "grad_norm": 0.8291531801223755, + "learning_rate": 0.00019450101246641665, + "loss": 0.4672, + "step": 13540 + }, + { + "epoch": 0.452632282202031, + "grad_norm": 0.8077791929244995, + "learning_rate": 0.0001944889635615735, + "loss": 0.4635, + "step": 13550 + }, + { + "epoch": 0.45296632816675575, + "grad_norm": 0.7168554067611694, + "learning_rate": 0.00019447690184494444, + "loss": 0.4636, + "step": 13560 + }, + { + "epoch": 0.4533003741314805, + "grad_norm": 0.8836997151374817, + "learning_rate": 0.0001944648273181649, + "loss": 0.4945, + "step": 13570 + }, + { + "epoch": 0.45363442009620525, + "grad_norm": 0.6797115206718445, + "learning_rate": 0.00019445273998287203, + "loss": 0.4343, + "step": 13580 + }, + { + "epoch": 0.45396846606093, + "grad_norm": 0.8722407817840576, + "learning_rate": 0.0001944406398407048, + "loss": 0.4854, + "step": 13590 + }, + { + "epoch": 0.45430251202565475, + "grad_norm": 0.8164008259773254, + "learning_rate": 0.00019442852689330385, + "loss": 0.4562, + "step": 13600 + }, + { + "epoch": 0.4546365579903795, + "grad_norm": 0.7439300417900085, + "learning_rate": 0.00019441640114231158, + "loss": 0.4767, + "step": 13610 + }, + { + "epoch": 0.45497060395510425, + "grad_norm": 0.732358455657959, + "learning_rate": 0.00019440426258937214, + "loss": 0.4662, + "step": 13620 + }, + { + "epoch": 0.45530464991982894, + "grad_norm": 0.7565233707427979, + "learning_rate": 0.00019439211123613138, + "loss": 0.4867, + "step": 13630 + }, + { + "epoch": 0.4556386958845537, + "grad_norm": 0.7915957570075989, + "learning_rate": 0.00019437994708423694, + "loss": 0.4471, + "step": 13640 + }, + { + "epoch": 0.45597274184927844, + "grad_norm": 0.712871253490448, + "learning_rate": 0.0001943677701353381, + "loss": 0.472, + "step": 13650 + }, + { + "epoch": 0.4563067878140032, + "grad_norm": 0.811211109161377, + "learning_rate": 0.000194355580391086, + "loss": 0.4669, + "step": 13660 + }, + { + "epoch": 0.45664083377872794, + "grad_norm": 0.7226788401603699, + "learning_rate": 0.00019434337785313342, + "loss": 0.417, + "step": 13670 + }, + { + "epoch": 0.4569748797434527, + "grad_norm": 0.7336011528968811, + "learning_rate": 0.0001943311625231349, + "loss": 0.4643, + "step": 13680 + }, + { + "epoch": 0.45730892570817744, + "grad_norm": 0.721524715423584, + "learning_rate": 0.0001943189344027467, + "loss": 0.4808, + "step": 13690 + }, + { + "epoch": 0.4576429716729022, + "grad_norm": 0.7216586470603943, + "learning_rate": 0.00019430669349362688, + "loss": 0.469, + "step": 13700 + }, + { + "epoch": 0.45797701763762694, + "grad_norm": 0.7798793315887451, + "learning_rate": 0.0001942944397974352, + "loss": 0.4405, + "step": 13710 + }, + { + "epoch": 0.4583110636023517, + "grad_norm": 0.7729379534721375, + "learning_rate": 0.0001942821733158331, + "loss": 0.4704, + "step": 13720 + }, + { + "epoch": 0.45864510956707644, + "grad_norm": 0.6545955538749695, + "learning_rate": 0.0001942698940504838, + "loss": 0.4766, + "step": 13730 + }, + { + "epoch": 0.4589791555318012, + "grad_norm": 0.7565506100654602, + "learning_rate": 0.00019425760200305226, + "loss": 0.438, + "step": 13740 + }, + { + "epoch": 0.45931320149652594, + "grad_norm": 0.7113173604011536, + "learning_rate": 0.00019424529717520513, + "loss": 0.4799, + "step": 13750 + }, + { + "epoch": 0.4596472474612507, + "grad_norm": 0.6819316744804382, + "learning_rate": 0.0001942329795686109, + "loss": 0.442, + "step": 13760 + }, + { + "epoch": 0.45998129342597543, + "grad_norm": 0.6948909759521484, + "learning_rate": 0.00019422064918493966, + "loss": 0.4104, + "step": 13770 + }, + { + "epoch": 0.4603153393907002, + "grad_norm": 1.2363849878311157, + "learning_rate": 0.0001942083060258633, + "loss": 0.4606, + "step": 13780 + }, + { + "epoch": 0.46064938535542493, + "grad_norm": 0.7254630923271179, + "learning_rate": 0.00019419595009305542, + "loss": 0.4192, + "step": 13790 + }, + { + "epoch": 0.46098343132014963, + "grad_norm": 0.6580613851547241, + "learning_rate": 0.00019418358138819142, + "loss": 0.4693, + "step": 13800 + }, + { + "epoch": 0.4613174772848744, + "grad_norm": 0.7714186906814575, + "learning_rate": 0.0001941711999129483, + "loss": 0.4321, + "step": 13810 + }, + { + "epoch": 0.4616515232495991, + "grad_norm": 0.7866247296333313, + "learning_rate": 0.00019415880566900492, + "loss": 0.4615, + "step": 13820 + }, + { + "epoch": 0.4619855692143239, + "grad_norm": 0.66412353515625, + "learning_rate": 0.0001941463986580418, + "loss": 0.4498, + "step": 13830 + }, + { + "epoch": 0.4623196151790486, + "grad_norm": 0.6872178912162781, + "learning_rate": 0.00019413397888174121, + "loss": 0.4659, + "step": 13840 + }, + { + "epoch": 0.4626536611437734, + "grad_norm": 0.7506560683250427, + "learning_rate": 0.0001941215463417872, + "loss": 0.4674, + "step": 13850 + }, + { + "epoch": 0.4629877071084981, + "grad_norm": 0.711233913898468, + "learning_rate": 0.0001941091010398654, + "loss": 0.4735, + "step": 13860 + }, + { + "epoch": 0.4633217530732229, + "grad_norm": 0.6963764429092407, + "learning_rate": 0.00019409664297766335, + "loss": 0.4546, + "step": 13870 + }, + { + "epoch": 0.4636557990379476, + "grad_norm": 0.7346891164779663, + "learning_rate": 0.00019408417215687021, + "loss": 0.3976, + "step": 13880 + }, + { + "epoch": 0.4639898450026724, + "grad_norm": 0.8031058311462402, + "learning_rate": 0.0001940716885791769, + "loss": 0.4761, + "step": 13890 + }, + { + "epoch": 0.4643238909673971, + "grad_norm": 0.6883862018585205, + "learning_rate": 0.00019405919224627614, + "loss": 0.4501, + "step": 13900 + }, + { + "epoch": 0.4646579369321219, + "grad_norm": 0.8204253911972046, + "learning_rate": 0.0001940466831598622, + "loss": 0.4448, + "step": 13910 + }, + { + "epoch": 0.4649919828968466, + "grad_norm": 0.7701585292816162, + "learning_rate": 0.00019403416132163128, + "loss": 0.4143, + "step": 13920 + }, + { + "epoch": 0.4653260288615714, + "grad_norm": 0.6740711331367493, + "learning_rate": 0.00019402162673328116, + "loss": 0.4453, + "step": 13930 + }, + { + "epoch": 0.4656600748262961, + "grad_norm": 0.6129841208457947, + "learning_rate": 0.00019400907939651148, + "loss": 0.4307, + "step": 13940 + }, + { + "epoch": 0.46599412079102087, + "grad_norm": 0.6782830357551575, + "learning_rate": 0.00019399651931302346, + "loss": 0.4842, + "step": 13950 + }, + { + "epoch": 0.46632816675574557, + "grad_norm": 0.7075527906417847, + "learning_rate": 0.00019398394648452016, + "loss": 0.4321, + "step": 13960 + }, + { + "epoch": 0.4666622127204703, + "grad_norm": 0.7873295545578003, + "learning_rate": 0.00019397136091270633, + "loss": 0.5237, + "step": 13970 + }, + { + "epoch": 0.46699625868519506, + "grad_norm": 0.7533546686172485, + "learning_rate": 0.00019395876259928845, + "loss": 0.4659, + "step": 13980 + }, + { + "epoch": 0.4673303046499198, + "grad_norm": 0.7151362895965576, + "learning_rate": 0.00019394615154597476, + "loss": 0.4742, + "step": 13990 + }, + { + "epoch": 0.46766435061464456, + "grad_norm": 0.6300932765007019, + "learning_rate": 0.0001939335277544751, + "loss": 0.4385, + "step": 14000 + }, + { + "epoch": 0.46766435061464456, + "eval_chrf": 85.82839144223566, + "eval_loss": 0.8308346271514893, + "eval_runtime": 91.9528, + "eval_samples_per_second": 1.088, + "eval_steps_per_second": 0.044, + "step": 14000 + }, + { + "epoch": 0.4679983965793693, + "grad_norm": 0.6977421045303345, + "learning_rate": 0.00019392089122650128, + "loss": 0.4635, + "step": 14010 + }, + { + "epoch": 0.46833244254409406, + "grad_norm": 0.7821333408355713, + "learning_rate": 0.00019390824196376656, + "loss": 0.4366, + "step": 14020 + }, + { + "epoch": 0.4686664885088188, + "grad_norm": 0.7272844910621643, + "learning_rate": 0.00019389557996798612, + "loss": 0.4405, + "step": 14030 + }, + { + "epoch": 0.46900053447354356, + "grad_norm": 0.7079196572303772, + "learning_rate": 0.00019388290524087682, + "loss": 0.4545, + "step": 14040 + }, + { + "epoch": 0.4693345804382683, + "grad_norm": 0.8419969081878662, + "learning_rate": 0.0001938702177841572, + "loss": 0.4312, + "step": 14050 + }, + { + "epoch": 0.46966862640299306, + "grad_norm": 0.7684766054153442, + "learning_rate": 0.00019385751759954753, + "loss": 0.5103, + "step": 14060 + }, + { + "epoch": 0.4700026723677178, + "grad_norm": 0.6765429377555847, + "learning_rate": 0.00019384480468876986, + "loss": 0.4513, + "step": 14070 + }, + { + "epoch": 0.47033671833244256, + "grad_norm": 0.7268145680427551, + "learning_rate": 0.00019383207905354795, + "loss": 0.4597, + "step": 14080 + }, + { + "epoch": 0.4706707642971673, + "grad_norm": 0.7273085713386536, + "learning_rate": 0.00019381934069560725, + "loss": 0.4484, + "step": 14090 + }, + { + "epoch": 0.47100481026189206, + "grad_norm": 0.6785349249839783, + "learning_rate": 0.000193806589616675, + "loss": 0.4533, + "step": 14100 + }, + { + "epoch": 0.4713388562266168, + "grad_norm": 0.665161669254303, + "learning_rate": 0.00019379382581848008, + "loss": 0.454, + "step": 14110 + }, + { + "epoch": 0.4716729021913415, + "grad_norm": 0.7695482969284058, + "learning_rate": 0.00019378104930275312, + "loss": 0.456, + "step": 14120 + }, + { + "epoch": 0.47200694815606625, + "grad_norm": 0.8090707659721375, + "learning_rate": 0.00019376826007122652, + "loss": 0.4526, + "step": 14130 + }, + { + "epoch": 0.472340994120791, + "grad_norm": 0.651022732257843, + "learning_rate": 0.0001937554581256344, + "loss": 0.4622, + "step": 14140 + }, + { + "epoch": 0.47267504008551575, + "grad_norm": 0.7614033818244934, + "learning_rate": 0.00019374264346771255, + "loss": 0.4412, + "step": 14150 + }, + { + "epoch": 0.4730090860502405, + "grad_norm": 0.7826064825057983, + "learning_rate": 0.0001937298160991985, + "loss": 0.4936, + "step": 14160 + }, + { + "epoch": 0.47334313201496525, + "grad_norm": 0.6058046221733093, + "learning_rate": 0.00019371697602183155, + "loss": 0.4872, + "step": 14170 + }, + { + "epoch": 0.47367717797969, + "grad_norm": 0.7066707015037537, + "learning_rate": 0.00019370412323735264, + "loss": 0.4531, + "step": 14180 + }, + { + "epoch": 0.47401122394441475, + "grad_norm": 0.7087106108665466, + "learning_rate": 0.00019369125774750456, + "loss": 0.4752, + "step": 14190 + }, + { + "epoch": 0.4743452699091395, + "grad_norm": 0.70475172996521, + "learning_rate": 0.0001936783795540317, + "loss": 0.4841, + "step": 14200 + }, + { + "epoch": 0.47467931587386425, + "grad_norm": 0.7501519918441772, + "learning_rate": 0.0001936654886586802, + "loss": 0.4647, + "step": 14210 + }, + { + "epoch": 0.475013361838589, + "grad_norm": 0.7163450717926025, + "learning_rate": 0.00019365258506319797, + "loss": 0.4411, + "step": 14220 + }, + { + "epoch": 0.47534740780331375, + "grad_norm": 0.8123687505722046, + "learning_rate": 0.0001936396687693346, + "loss": 0.4766, + "step": 14230 + }, + { + "epoch": 0.4756814537680385, + "grad_norm": 0.6750186085700989, + "learning_rate": 0.0001936267397788414, + "loss": 0.4619, + "step": 14240 + }, + { + "epoch": 0.47601549973276325, + "grad_norm": 0.6711135506629944, + "learning_rate": 0.00019361379809347144, + "loss": 0.425, + "step": 14250 + }, + { + "epoch": 0.476349545697488, + "grad_norm": 0.9062971472740173, + "learning_rate": 0.0001936008437149795, + "loss": 0.4723, + "step": 14260 + }, + { + "epoch": 0.47668359166221275, + "grad_norm": 0.6887251138687134, + "learning_rate": 0.000193587876645122, + "loss": 0.4531, + "step": 14270 + }, + { + "epoch": 0.47701763762693744, + "grad_norm": 0.8026403784751892, + "learning_rate": 0.00019357489688565725, + "loss": 0.5053, + "step": 14280 + }, + { + "epoch": 0.4773516835916622, + "grad_norm": 0.7014697790145874, + "learning_rate": 0.00019356190443834513, + "loss": 0.5089, + "step": 14290 + }, + { + "epoch": 0.47768572955638694, + "grad_norm": 0.7091430425643921, + "learning_rate": 0.00019354889930494725, + "loss": 0.4198, + "step": 14300 + }, + { + "epoch": 0.4780197755211117, + "grad_norm": 0.7694460153579712, + "learning_rate": 0.000193535881487227, + "loss": 0.474, + "step": 14310 + }, + { + "epoch": 0.47835382148583644, + "grad_norm": 0.7265195250511169, + "learning_rate": 0.0001935228509869495, + "loss": 0.4338, + "step": 14320 + }, + { + "epoch": 0.4786878674505612, + "grad_norm": 0.7469212412834167, + "learning_rate": 0.00019350980780588156, + "loss": 0.4727, + "step": 14330 + }, + { + "epoch": 0.47902191341528594, + "grad_norm": 0.7850968241691589, + "learning_rate": 0.00019349675194579166, + "loss": 0.442, + "step": 14340 + }, + { + "epoch": 0.4793559593800107, + "grad_norm": 0.7723694443702698, + "learning_rate": 0.0001934836834084501, + "loss": 0.4658, + "step": 14350 + }, + { + "epoch": 0.47969000534473544, + "grad_norm": 0.6611987352371216, + "learning_rate": 0.00019347060219562881, + "loss": 0.472, + "step": 14360 + }, + { + "epoch": 0.4800240513094602, + "grad_norm": 0.7426055073738098, + "learning_rate": 0.00019345750830910153, + "loss": 0.4565, + "step": 14370 + }, + { + "epoch": 0.48035809727418494, + "grad_norm": 0.7147203087806702, + "learning_rate": 0.0001934444017506436, + "loss": 0.4574, + "step": 14380 + }, + { + "epoch": 0.4806921432389097, + "grad_norm": 0.6980175971984863, + "learning_rate": 0.00019343128252203218, + "loss": 0.478, + "step": 14390 + }, + { + "epoch": 0.48102618920363444, + "grad_norm": 0.6491966247558594, + "learning_rate": 0.0001934181506250461, + "loss": 0.471, + "step": 14400 + }, + { + "epoch": 0.4813602351683592, + "grad_norm": 0.7207120656967163, + "learning_rate": 0.0001934050060614659, + "loss": 0.4419, + "step": 14410 + }, + { + "epoch": 0.48169428113308393, + "grad_norm": 0.8163737654685974, + "learning_rate": 0.00019339184883307388, + "loss": 0.4526, + "step": 14420 + }, + { + "epoch": 0.4820283270978087, + "grad_norm": 0.7266551852226257, + "learning_rate": 0.00019337867894165403, + "loss": 0.4594, + "step": 14430 + }, + { + "epoch": 0.4823623730625334, + "grad_norm": 0.7418677806854248, + "learning_rate": 0.000193365496388992, + "loss": 0.449, + "step": 14440 + }, + { + "epoch": 0.48269641902725813, + "grad_norm": 0.5463441014289856, + "learning_rate": 0.00019335230117687534, + "loss": 0.4628, + "step": 14450 + }, + { + "epoch": 0.4830304649919829, + "grad_norm": 0.7502585053443909, + "learning_rate": 0.0001933390933070931, + "loss": 0.4943, + "step": 14460 + }, + { + "epoch": 0.4833645109567076, + "grad_norm": 0.6910586953163147, + "learning_rate": 0.00019332587278143615, + "loss": 0.4342, + "step": 14470 + }, + { + "epoch": 0.4836985569214324, + "grad_norm": 0.7622266411781311, + "learning_rate": 0.00019331263960169708, + "loss": 0.4581, + "step": 14480 + }, + { + "epoch": 0.4840326028861571, + "grad_norm": 0.7573332786560059, + "learning_rate": 0.00019329939376967016, + "loss": 0.4694, + "step": 14490 + }, + { + "epoch": 0.4843666488508819, + "grad_norm": 0.768525242805481, + "learning_rate": 0.00019328613528715141, + "loss": 0.503, + "step": 14500 + }, + { + "epoch": 0.4847006948156066, + "grad_norm": 0.8716367483139038, + "learning_rate": 0.0001932728641559386, + "loss": 0.4631, + "step": 14510 + }, + { + "epoch": 0.4850347407803314, + "grad_norm": 0.7156944870948792, + "learning_rate": 0.00019325958037783107, + "loss": 0.4445, + "step": 14520 + }, + { + "epoch": 0.4853687867450561, + "grad_norm": 0.852107584476471, + "learning_rate": 0.00019324628395463007, + "loss": 0.4674, + "step": 14530 + }, + { + "epoch": 0.4857028327097809, + "grad_norm": 0.638319730758667, + "learning_rate": 0.0001932329748881384, + "loss": 0.4807, + "step": 14540 + }, + { + "epoch": 0.4860368786745056, + "grad_norm": 0.7308068871498108, + "learning_rate": 0.00019321965318016063, + "loss": 0.4789, + "step": 14550 + }, + { + "epoch": 0.4863709246392304, + "grad_norm": 0.7397300601005554, + "learning_rate": 0.00019320631883250312, + "loss": 0.4634, + "step": 14560 + }, + { + "epoch": 0.4867049706039551, + "grad_norm": 0.8097351789474487, + "learning_rate": 0.0001931929718469738, + "loss": 0.4779, + "step": 14570 + }, + { + "epoch": 0.48703901656867987, + "grad_norm": 0.6819607615470886, + "learning_rate": 0.00019317961222538243, + "loss": 0.4499, + "step": 14580 + }, + { + "epoch": 0.4873730625334046, + "grad_norm": 0.8648117780685425, + "learning_rate": 0.00019316623996954045, + "loss": 0.4759, + "step": 14590 + }, + { + "epoch": 0.4877071084981293, + "grad_norm": 0.8369916677474976, + "learning_rate": 0.00019315285508126104, + "loss": 0.4479, + "step": 14600 + }, + { + "epoch": 0.48804115446285407, + "grad_norm": 0.7152202725410461, + "learning_rate": 0.00019313945756235904, + "loss": 0.4736, + "step": 14610 + }, + { + "epoch": 0.4883752004275788, + "grad_norm": 0.6876022219657898, + "learning_rate": 0.00019312604741465093, + "loss": 0.4407, + "step": 14620 + }, + { + "epoch": 0.48870924639230356, + "grad_norm": 0.7280433773994446, + "learning_rate": 0.00019311262463995513, + "loss": 0.4527, + "step": 14630 + }, + { + "epoch": 0.4890432923570283, + "grad_norm": 0.7263306379318237, + "learning_rate": 0.00019309918924009158, + "loss": 0.4685, + "step": 14640 + }, + { + "epoch": 0.48937733832175306, + "grad_norm": 0.8482736945152283, + "learning_rate": 0.000193085741216882, + "loss": 0.4699, + "step": 14650 + }, + { + "epoch": 0.4897113842864778, + "grad_norm": 0.6924654245376587, + "learning_rate": 0.00019307228057214977, + "loss": 0.4457, + "step": 14660 + }, + { + "epoch": 0.49004543025120256, + "grad_norm": 0.7689080238342285, + "learning_rate": 0.00019305880730772004, + "loss": 0.4351, + "step": 14670 + }, + { + "epoch": 0.4903794762159273, + "grad_norm": 0.7761452794075012, + "learning_rate": 0.00019304532142541973, + "loss": 0.449, + "step": 14680 + }, + { + "epoch": 0.49071352218065206, + "grad_norm": 0.7022421956062317, + "learning_rate": 0.00019303182292707734, + "loss": 0.4456, + "step": 14690 + }, + { + "epoch": 0.4910475681453768, + "grad_norm": 0.7701778411865234, + "learning_rate": 0.00019301831181452308, + "loss": 0.4531, + "step": 14700 + }, + { + "epoch": 0.49138161411010156, + "grad_norm": 0.798927366733551, + "learning_rate": 0.000193004788089589, + "loss": 0.4624, + "step": 14710 + }, + { + "epoch": 0.4917156600748263, + "grad_norm": 0.5867473483085632, + "learning_rate": 0.00019299125175410874, + "loss": 0.4273, + "step": 14720 + }, + { + "epoch": 0.49204970603955106, + "grad_norm": 0.6370320916175842, + "learning_rate": 0.00019297770280991773, + "loss": 0.4303, + "step": 14730 + }, + { + "epoch": 0.4923837520042758, + "grad_norm": 0.6570526957511902, + "learning_rate": 0.00019296414125885306, + "loss": 0.4283, + "step": 14740 + }, + { + "epoch": 0.49271779796900056, + "grad_norm": 0.7586194276809692, + "learning_rate": 0.00019295056710275357, + "loss": 0.4591, + "step": 14750 + }, + { + "epoch": 0.49305184393372525, + "grad_norm": 0.6709245443344116, + "learning_rate": 0.00019293698034345976, + "loss": 0.462, + "step": 14760 + }, + { + "epoch": 0.49338588989845, + "grad_norm": 0.6543884873390198, + "learning_rate": 0.00019292338098281383, + "loss": 0.411, + "step": 14770 + }, + { + "epoch": 0.49371993586317475, + "grad_norm": 0.7150288820266724, + "learning_rate": 0.0001929097690226598, + "loss": 0.4531, + "step": 14780 + }, + { + "epoch": 0.4940539818278995, + "grad_norm": 0.6725692749023438, + "learning_rate": 0.0001928961444648432, + "loss": 0.4647, + "step": 14790 + }, + { + "epoch": 0.49438802779262425, + "grad_norm": 0.7736278772354126, + "learning_rate": 0.00019288250731121153, + "loss": 0.4491, + "step": 14800 + }, + { + "epoch": 0.494722073757349, + "grad_norm": 0.6484153866767883, + "learning_rate": 0.00019286885756361377, + "loss": 0.4481, + "step": 14810 + }, + { + "epoch": 0.49505611972207375, + "grad_norm": 0.6895773410797119, + "learning_rate": 0.0001928551952239007, + "loss": 0.448, + "step": 14820 + }, + { + "epoch": 0.4953901656867985, + "grad_norm": 0.671870231628418, + "learning_rate": 0.00019284152029392483, + "loss": 0.4292, + "step": 14830 + }, + { + "epoch": 0.49572421165152325, + "grad_norm": 0.7321531772613525, + "learning_rate": 0.0001928278327755403, + "loss": 0.4497, + "step": 14840 + }, + { + "epoch": 0.496058257616248, + "grad_norm": 0.7683451771736145, + "learning_rate": 0.00019281413267060307, + "loss": 0.4272, + "step": 14850 + }, + { + "epoch": 0.49639230358097275, + "grad_norm": 0.5988258719444275, + "learning_rate": 0.00019280041998097068, + "loss": 0.4101, + "step": 14860 + }, + { + "epoch": 0.4967263495456975, + "grad_norm": 0.8403514623641968, + "learning_rate": 0.00019278669470850247, + "loss": 0.4952, + "step": 14870 + }, + { + "epoch": 0.49706039551042225, + "grad_norm": 0.628643274307251, + "learning_rate": 0.00019277295685505945, + "loss": 0.4329, + "step": 14880 + }, + { + "epoch": 0.497394441475147, + "grad_norm": 0.695465087890625, + "learning_rate": 0.00019275920642250434, + "loss": 0.4512, + "step": 14890 + }, + { + "epoch": 0.49772848743987175, + "grad_norm": 0.7788556814193726, + "learning_rate": 0.00019274544341270155, + "loss": 0.4778, + "step": 14900 + }, + { + "epoch": 0.4980625334045965, + "grad_norm": 0.8619930148124695, + "learning_rate": 0.00019273166782751724, + "loss": 0.4256, + "step": 14910 + }, + { + "epoch": 0.4983965793693212, + "grad_norm": 0.691524088382721, + "learning_rate": 0.0001927178796688192, + "loss": 0.4441, + "step": 14920 + }, + { + "epoch": 0.49873062533404594, + "grad_norm": 0.7380512952804565, + "learning_rate": 0.000192704078938477, + "loss": 0.4389, + "step": 14930 + }, + { + "epoch": 0.4990646712987707, + "grad_norm": 0.7537968754768372, + "learning_rate": 0.0001926902656383619, + "loss": 0.4788, + "step": 14940 + }, + { + "epoch": 0.49939871726349544, + "grad_norm": 0.7822479605674744, + "learning_rate": 0.00019267643977034684, + "loss": 0.4656, + "step": 14950 + }, + { + "epoch": 0.4997327632282202, + "grad_norm": 0.789779782295227, + "learning_rate": 0.00019266260133630643, + "loss": 0.4909, + "step": 14960 + }, + { + "epoch": 0.500066809192945, + "grad_norm": 0.6278780698776245, + "learning_rate": 0.0001926487503381171, + "loss": 0.4604, + "step": 14970 + }, + { + "epoch": 0.5004008551576697, + "grad_norm": 0.8889648914337158, + "learning_rate": 0.00019263488677765685, + "loss": 0.4136, + "step": 14980 + }, + { + "epoch": 0.5007349011223945, + "grad_norm": 0.7694296836853027, + "learning_rate": 0.00019262101065680546, + "loss": 0.4478, + "step": 14990 + }, + { + "epoch": 0.5010689470871192, + "grad_norm": 0.786186158657074, + "learning_rate": 0.00019260712197744444, + "loss": 0.4403, + "step": 15000 + }, + { + "epoch": 0.5010689470871192, + "eval_chrf": 86.46629411219712, + "eval_loss": 0.7965089678764343, + "eval_runtime": 99.5988, + "eval_samples_per_second": 1.004, + "eval_steps_per_second": 0.04, + "step": 15000 + }, + { + "epoch": 0.5014029930518439, + "grad_norm": 0.8116645812988281, + "learning_rate": 0.0001925932207414569, + "loss": 0.4801, + "step": 15010 + }, + { + "epoch": 0.5017370390165686, + "grad_norm": 0.8164967894554138, + "learning_rate": 0.00019257930695072773, + "loss": 0.4299, + "step": 15020 + }, + { + "epoch": 0.5020710849812934, + "grad_norm": 0.7406567335128784, + "learning_rate": 0.00019256538060714352, + "loss": 0.4643, + "step": 15030 + }, + { + "epoch": 0.5024051309460181, + "grad_norm": 0.788263738155365, + "learning_rate": 0.00019255144171259256, + "loss": 0.4783, + "step": 15040 + }, + { + "epoch": 0.5027391769107429, + "grad_norm": 0.6406670212745667, + "learning_rate": 0.00019253749026896478, + "loss": 0.4294, + "step": 15050 + }, + { + "epoch": 0.5030732228754676, + "grad_norm": 0.786417543888092, + "learning_rate": 0.00019252352627815192, + "loss": 0.4703, + "step": 15060 + }, + { + "epoch": 0.5034072688401924, + "grad_norm": 0.6029480695724487, + "learning_rate": 0.0001925095497420473, + "loss": 0.4673, + "step": 15070 + }, + { + "epoch": 0.5037413148049171, + "grad_norm": 0.8151570558547974, + "learning_rate": 0.00019249556066254603, + "loss": 0.4164, + "step": 15080 + }, + { + "epoch": 0.5040753607696419, + "grad_norm": 0.6242373585700989, + "learning_rate": 0.00019248155904154492, + "loss": 0.4308, + "step": 15090 + }, + { + "epoch": 0.5044094067343666, + "grad_norm": 0.7308436036109924, + "learning_rate": 0.0001924675448809424, + "loss": 0.474, + "step": 15100 + }, + { + "epoch": 0.5047434526990914, + "grad_norm": 0.679319441318512, + "learning_rate": 0.0001924535181826387, + "loss": 0.4712, + "step": 15110 + }, + { + "epoch": 0.5050774986638161, + "grad_norm": 0.6975399255752563, + "learning_rate": 0.0001924394789485357, + "loss": 0.442, + "step": 15120 + }, + { + "epoch": 0.5054115446285409, + "grad_norm": 0.7881934642791748, + "learning_rate": 0.00019242542718053698, + "loss": 0.4608, + "step": 15130 + }, + { + "epoch": 0.5057455905932656, + "grad_norm": 0.7533766627311707, + "learning_rate": 0.00019241136288054777, + "loss": 0.4609, + "step": 15140 + }, + { + "epoch": 0.5060796365579904, + "grad_norm": 0.6878604292869568, + "learning_rate": 0.00019239728605047513, + "loss": 0.4568, + "step": 15150 + }, + { + "epoch": 0.5064136825227151, + "grad_norm": 0.7600917220115662, + "learning_rate": 0.0001923831966922277, + "loss": 0.4294, + "step": 15160 + }, + { + "epoch": 0.5067477284874399, + "grad_norm": 0.8359717130661011, + "learning_rate": 0.00019236909480771586, + "loss": 0.4413, + "step": 15170 + }, + { + "epoch": 0.5070817744521646, + "grad_norm": 0.7284037470817566, + "learning_rate": 0.00019235498039885172, + "loss": 0.439, + "step": 15180 + }, + { + "epoch": 0.5074158204168894, + "grad_norm": 0.7746601104736328, + "learning_rate": 0.00019234085346754904, + "loss": 0.462, + "step": 15190 + }, + { + "epoch": 0.5077498663816141, + "grad_norm": 0.6840062737464905, + "learning_rate": 0.00019232671401572327, + "loss": 0.4253, + "step": 15200 + }, + { + "epoch": 0.5080839123463389, + "grad_norm": 0.7358014583587646, + "learning_rate": 0.0001923125620452916, + "loss": 0.4353, + "step": 15210 + }, + { + "epoch": 0.5084179583110636, + "grad_norm": 0.6194219589233398, + "learning_rate": 0.0001922983975581729, + "loss": 0.436, + "step": 15220 + }, + { + "epoch": 0.5087520042757884, + "grad_norm": 0.6819415092468262, + "learning_rate": 0.00019228422055628776, + "loss": 0.4463, + "step": 15230 + }, + { + "epoch": 0.5090860502405131, + "grad_norm": 0.8110389113426208, + "learning_rate": 0.00019227003104155843, + "loss": 0.4319, + "step": 15240 + }, + { + "epoch": 0.5094200962052379, + "grad_norm": 0.7222191691398621, + "learning_rate": 0.00019225582901590883, + "loss": 0.4591, + "step": 15250 + }, + { + "epoch": 0.5097541421699626, + "grad_norm": 0.6072009205818176, + "learning_rate": 0.00019224161448126465, + "loss": 0.4434, + "step": 15260 + }, + { + "epoch": 0.5100881881346874, + "grad_norm": 0.7633602619171143, + "learning_rate": 0.00019222738743955326, + "loss": 0.4484, + "step": 15270 + }, + { + "epoch": 0.5104222340994121, + "grad_norm": 0.6734861135482788, + "learning_rate": 0.00019221314789270367, + "loss": 0.4901, + "step": 15280 + }, + { + "epoch": 0.5107562800641369, + "grad_norm": 0.8658114671707153, + "learning_rate": 0.00019219889584264665, + "loss": 0.4359, + "step": 15290 + }, + { + "epoch": 0.5110903260288616, + "grad_norm": 0.7052868604660034, + "learning_rate": 0.00019218463129131466, + "loss": 0.4385, + "step": 15300 + }, + { + "epoch": 0.5114243719935864, + "grad_norm": 0.8116106390953064, + "learning_rate": 0.0001921703542406418, + "loss": 0.4308, + "step": 15310 + }, + { + "epoch": 0.5117584179583111, + "grad_norm": 0.6368309855461121, + "learning_rate": 0.00019215606469256388, + "loss": 0.4244, + "step": 15320 + }, + { + "epoch": 0.5120924639230358, + "grad_norm": 0.8387051820755005, + "learning_rate": 0.0001921417626490185, + "loss": 0.4454, + "step": 15330 + }, + { + "epoch": 0.5124265098877605, + "grad_norm": 0.7951540350914001, + "learning_rate": 0.0001921274481119448, + "loss": 0.4264, + "step": 15340 + }, + { + "epoch": 0.5127605558524853, + "grad_norm": 0.6789252161979675, + "learning_rate": 0.00019211312108328375, + "loss": 0.4343, + "step": 15350 + }, + { + "epoch": 0.51309460181721, + "grad_norm": 0.677387535572052, + "learning_rate": 0.0001920987815649779, + "loss": 0.4547, + "step": 15360 + }, + { + "epoch": 0.5134286477819348, + "grad_norm": 0.668830394744873, + "learning_rate": 0.00019208442955897156, + "loss": 0.4479, + "step": 15370 + }, + { + "epoch": 0.5137626937466595, + "grad_norm": 0.8084149956703186, + "learning_rate": 0.00019207006506721078, + "loss": 0.4516, + "step": 15380 + }, + { + "epoch": 0.5140967397113843, + "grad_norm": 0.7563090920448303, + "learning_rate": 0.00019205568809164318, + "loss": 0.4604, + "step": 15390 + }, + { + "epoch": 0.514430785676109, + "grad_norm": 0.7330135107040405, + "learning_rate": 0.0001920412986342182, + "loss": 0.4355, + "step": 15400 + }, + { + "epoch": 0.5147648316408338, + "grad_norm": 0.6771528720855713, + "learning_rate": 0.00019202689669688686, + "loss": 0.4398, + "step": 15410 + }, + { + "epoch": 0.5150988776055585, + "grad_norm": 0.5608977675437927, + "learning_rate": 0.00019201248228160195, + "loss": 0.4746, + "step": 15420 + }, + { + "epoch": 0.5154329235702833, + "grad_norm": 0.7341822385787964, + "learning_rate": 0.00019199805539031788, + "loss": 0.4537, + "step": 15430 + }, + { + "epoch": 0.515766969535008, + "grad_norm": 0.7937577366828918, + "learning_rate": 0.00019198361602499087, + "loss": 0.4712, + "step": 15440 + }, + { + "epoch": 0.5161010154997328, + "grad_norm": 0.681412398815155, + "learning_rate": 0.00019196916418757866, + "loss": 0.4801, + "step": 15450 + }, + { + "epoch": 0.5164350614644575, + "grad_norm": 0.7065302729606628, + "learning_rate": 0.0001919546998800409, + "loss": 0.4253, + "step": 15460 + }, + { + "epoch": 0.5167691074291823, + "grad_norm": 0.659758985042572, + "learning_rate": 0.0001919402231043387, + "loss": 0.4528, + "step": 15470 + }, + { + "epoch": 0.517103153393907, + "grad_norm": 0.8960694074630737, + "learning_rate": 0.00019192573386243503, + "loss": 0.4481, + "step": 15480 + }, + { + "epoch": 0.5174371993586317, + "grad_norm": 0.6747415065765381, + "learning_rate": 0.0001919112321562945, + "loss": 0.404, + "step": 15490 + }, + { + "epoch": 0.5177712453233565, + "grad_norm": 0.6414953470230103, + "learning_rate": 0.00019189671798788336, + "loss": 0.429, + "step": 15500 + }, + { + "epoch": 0.5181052912880812, + "grad_norm": 0.7728968262672424, + "learning_rate": 0.0001918821913591696, + "loss": 0.4734, + "step": 15510 + }, + { + "epoch": 0.518439337252806, + "grad_norm": 0.6479413509368896, + "learning_rate": 0.00019186765227212292, + "loss": 0.4352, + "step": 15520 + }, + { + "epoch": 0.5187733832175307, + "grad_norm": 0.6454426646232605, + "learning_rate": 0.00019185310072871464, + "loss": 0.4518, + "step": 15530 + }, + { + "epoch": 0.5191074291822555, + "grad_norm": 0.7421649694442749, + "learning_rate": 0.0001918385367309178, + "loss": 0.45, + "step": 15540 + }, + { + "epoch": 0.5194414751469802, + "grad_norm": 0.637956976890564, + "learning_rate": 0.0001918239602807072, + "loss": 0.4472, + "step": 15550 + }, + { + "epoch": 0.519775521111705, + "grad_norm": 0.8598539233207703, + "learning_rate": 0.00019180937138005923, + "loss": 0.4737, + "step": 15560 + }, + { + "epoch": 0.5201095670764297, + "grad_norm": 0.7481197714805603, + "learning_rate": 0.00019179477003095197, + "loss": 0.4574, + "step": 15570 + }, + { + "epoch": 0.5204436130411545, + "grad_norm": 0.8257797360420227, + "learning_rate": 0.00019178015623536526, + "loss": 0.4223, + "step": 15580 + }, + { + "epoch": 0.5207776590058792, + "grad_norm": 0.6709581017494202, + "learning_rate": 0.00019176552999528062, + "loss": 0.4244, + "step": 15590 + }, + { + "epoch": 0.521111704970604, + "grad_norm": 0.9369125366210938, + "learning_rate": 0.00019175089131268113, + "loss": 0.4399, + "step": 15600 + }, + { + "epoch": 0.5214457509353287, + "grad_norm": 0.7677419781684875, + "learning_rate": 0.00019173624018955173, + "loss": 0.4801, + "step": 15610 + }, + { + "epoch": 0.5217797969000535, + "grad_norm": 0.6755557656288147, + "learning_rate": 0.00019172157662787894, + "loss": 0.427, + "step": 15620 + }, + { + "epoch": 0.5221138428647782, + "grad_norm": 0.7098275423049927, + "learning_rate": 0.00019170690062965103, + "loss": 0.4503, + "step": 15630 + }, + { + "epoch": 0.522447888829503, + "grad_norm": 0.6328698396682739, + "learning_rate": 0.00019169221219685786, + "loss": 0.4692, + "step": 15640 + }, + { + "epoch": 0.5227819347942276, + "grad_norm": 0.6760245561599731, + "learning_rate": 0.0001916775113314911, + "loss": 0.486, + "step": 15650 + }, + { + "epoch": 0.5231159807589524, + "grad_norm": 0.7246215343475342, + "learning_rate": 0.000191662798035544, + "loss": 0.4325, + "step": 15660 + }, + { + "epoch": 0.5234500267236771, + "grad_norm": 0.7984539270401001, + "learning_rate": 0.0001916480723110116, + "loss": 0.4764, + "step": 15670 + }, + { + "epoch": 0.5237840726884019, + "grad_norm": 0.5351455807685852, + "learning_rate": 0.0001916333341598905, + "loss": 0.4368, + "step": 15680 + }, + { + "epoch": 0.5241181186531266, + "grad_norm": 0.791249692440033, + "learning_rate": 0.0001916185835841791, + "loss": 0.4763, + "step": 15690 + }, + { + "epoch": 0.5244521646178514, + "grad_norm": 0.7245512008666992, + "learning_rate": 0.0001916038205858774, + "loss": 0.446, + "step": 15700 + }, + { + "epoch": 0.5247862105825761, + "grad_norm": 0.7188728451728821, + "learning_rate": 0.00019158904516698713, + "loss": 0.455, + "step": 15710 + }, + { + "epoch": 0.5251202565473009, + "grad_norm": 0.8127872347831726, + "learning_rate": 0.0001915742573295117, + "loss": 0.4159, + "step": 15720 + }, + { + "epoch": 0.5254543025120256, + "grad_norm": 0.6377723813056946, + "learning_rate": 0.00019155945707545618, + "loss": 0.4201, + "step": 15730 + }, + { + "epoch": 0.5257883484767504, + "grad_norm": 0.8241913914680481, + "learning_rate": 0.00019154464440682736, + "loss": 0.4587, + "step": 15740 + }, + { + "epoch": 0.5261223944414751, + "grad_norm": 0.7634080052375793, + "learning_rate": 0.00019152981932563367, + "loss": 0.4232, + "step": 15750 + }, + { + "epoch": 0.5264564404061999, + "grad_norm": 0.6852890253067017, + "learning_rate": 0.00019151498183388527, + "loss": 0.4749, + "step": 15760 + }, + { + "epoch": 0.5267904863709246, + "grad_norm": 0.7494749426841736, + "learning_rate": 0.000191500131933594, + "loss": 0.4356, + "step": 15770 + }, + { + "epoch": 0.5271245323356494, + "grad_norm": 0.7664982080459595, + "learning_rate": 0.0001914852696267733, + "loss": 0.4844, + "step": 15780 + }, + { + "epoch": 0.5274585783003741, + "grad_norm": 0.6282423734664917, + "learning_rate": 0.0001914703949154384, + "loss": 0.4569, + "step": 15790 + }, + { + "epoch": 0.5277926242650989, + "grad_norm": 0.8209015130996704, + "learning_rate": 0.00019145550780160617, + "loss": 0.4576, + "step": 15800 + }, + { + "epoch": 0.5281266702298236, + "grad_norm": 0.7040371298789978, + "learning_rate": 0.00019144060828729512, + "loss": 0.4511, + "step": 15810 + }, + { + "epoch": 0.5284607161945484, + "grad_norm": 0.788470983505249, + "learning_rate": 0.00019142569637452557, + "loss": 0.4292, + "step": 15820 + }, + { + "epoch": 0.5287947621592731, + "grad_norm": 0.8145607113838196, + "learning_rate": 0.0001914107720653193, + "loss": 0.4623, + "step": 15830 + }, + { + "epoch": 0.5291288081239979, + "grad_norm": 0.667375922203064, + "learning_rate": 0.00019139583536169997, + "loss": 0.4374, + "step": 15840 + }, + { + "epoch": 0.5294628540887226, + "grad_norm": 0.6685918569564819, + "learning_rate": 0.00019138088626569286, + "loss": 0.4343, + "step": 15850 + }, + { + "epoch": 0.5297969000534474, + "grad_norm": 0.811694860458374, + "learning_rate": 0.0001913659247793249, + "loss": 0.4023, + "step": 15860 + }, + { + "epoch": 0.5301309460181721, + "grad_norm": 0.7222856879234314, + "learning_rate": 0.00019135095090462475, + "loss": 0.4536, + "step": 15870 + }, + { + "epoch": 0.5304649919828969, + "grad_norm": 0.7558906078338623, + "learning_rate": 0.0001913359646436227, + "loss": 0.4116, + "step": 15880 + }, + { + "epoch": 0.5307990379476216, + "grad_norm": 0.5928136706352234, + "learning_rate": 0.00019132096599835076, + "loss": 0.4377, + "step": 15890 + }, + { + "epoch": 0.5311330839123464, + "grad_norm": 0.8291724920272827, + "learning_rate": 0.00019130595497084257, + "loss": 0.4745, + "step": 15900 + }, + { + "epoch": 0.5314671298770711, + "grad_norm": 0.720106303691864, + "learning_rate": 0.00019129093156313348, + "loss": 0.4548, + "step": 15910 + }, + { + "epoch": 0.5318011758417959, + "grad_norm": 0.7265968918800354, + "learning_rate": 0.00019127589577726058, + "loss": 0.4445, + "step": 15920 + }, + { + "epoch": 0.5321352218065206, + "grad_norm": 0.7178078889846802, + "learning_rate": 0.00019126084761526252, + "loss": 0.4876, + "step": 15930 + }, + { + "epoch": 0.5324692677712454, + "grad_norm": 0.7136808633804321, + "learning_rate": 0.00019124578707917965, + "loss": 0.4401, + "step": 15940 + }, + { + "epoch": 0.5328033137359701, + "grad_norm": 0.6776118874549866, + "learning_rate": 0.0001912307141710541, + "loss": 0.4214, + "step": 15950 + }, + { + "epoch": 0.5331373597006949, + "grad_norm": 0.7082982659339905, + "learning_rate": 0.0001912156288929296, + "loss": 0.4455, + "step": 15960 + }, + { + "epoch": 0.5334714056654195, + "grad_norm": 0.7196520566940308, + "learning_rate": 0.00019120053124685156, + "loss": 0.4217, + "step": 15970 + }, + { + "epoch": 0.5338054516301443, + "grad_norm": 0.6733249425888062, + "learning_rate": 0.00019118542123486708, + "loss": 0.4554, + "step": 15980 + }, + { + "epoch": 0.534139497594869, + "grad_norm": 0.7596478462219238, + "learning_rate": 0.00019117029885902488, + "loss": 0.4426, + "step": 15990 + }, + { + "epoch": 0.5344735435595938, + "grad_norm": 0.7029207348823547, + "learning_rate": 0.00019115516412137546, + "loss": 0.4301, + "step": 16000 + }, + { + "epoch": 0.5344735435595938, + "eval_chrf": 86.08410419840978, + "eval_loss": 0.7814594507217407, + "eval_runtime": 94.5792, + "eval_samples_per_second": 1.057, + "eval_steps_per_second": 0.042, + "step": 16000 + }, + { + "epoch": 0.5348075895243185, + "grad_norm": 0.7804129123687744, + "learning_rate": 0.00019114001702397095, + "loss": 0.4505, + "step": 16010 + }, + { + "epoch": 0.5351416354890433, + "grad_norm": 0.7973668575286865, + "learning_rate": 0.0001911248575688651, + "loss": 0.4549, + "step": 16020 + }, + { + "epoch": 0.535475681453768, + "grad_norm": 0.7568674683570862, + "learning_rate": 0.00019110968575811345, + "loss": 0.4313, + "step": 16030 + }, + { + "epoch": 0.5358097274184928, + "grad_norm": 0.808009147644043, + "learning_rate": 0.0001910945015937731, + "loss": 0.4385, + "step": 16040 + }, + { + "epoch": 0.5361437733832175, + "grad_norm": 0.6917476058006287, + "learning_rate": 0.0001910793050779029, + "loss": 0.4571, + "step": 16050 + }, + { + "epoch": 0.5364778193479423, + "grad_norm": 0.6728397011756897, + "learning_rate": 0.00019106409621256335, + "loss": 0.4329, + "step": 16060 + }, + { + "epoch": 0.536811865312667, + "grad_norm": 0.7723320722579956, + "learning_rate": 0.00019104887499981658, + "loss": 0.4659, + "step": 16070 + }, + { + "epoch": 0.5371459112773918, + "grad_norm": 0.681516706943512, + "learning_rate": 0.00019103364144172652, + "loss": 0.4766, + "step": 16080 + }, + { + "epoch": 0.5374799572421165, + "grad_norm": 0.7144908905029297, + "learning_rate": 0.0001910183955403586, + "loss": 0.4077, + "step": 16090 + }, + { + "epoch": 0.5378140032068413, + "grad_norm": 0.780248761177063, + "learning_rate": 0.00019100313729778008, + "loss": 0.4713, + "step": 16100 + }, + { + "epoch": 0.538148049171566, + "grad_norm": 0.7179064154624939, + "learning_rate": 0.0001909878667160598, + "loss": 0.4408, + "step": 16110 + }, + { + "epoch": 0.5384820951362908, + "grad_norm": 0.6880764961242676, + "learning_rate": 0.00019097258379726834, + "loss": 0.4256, + "step": 16120 + }, + { + "epoch": 0.5388161411010155, + "grad_norm": 0.6189603805541992, + "learning_rate": 0.00019095728854347788, + "loss": 0.452, + "step": 16130 + }, + { + "epoch": 0.5391501870657402, + "grad_norm": 0.7177815437316895, + "learning_rate": 0.00019094198095676233, + "loss": 0.4578, + "step": 16140 + }, + { + "epoch": 0.539484233030465, + "grad_norm": 0.734308123588562, + "learning_rate": 0.00019092666103919724, + "loss": 0.4492, + "step": 16150 + }, + { + "epoch": 0.5398182789951897, + "grad_norm": 0.7574283480644226, + "learning_rate": 0.0001909113287928598, + "loss": 0.4692, + "step": 16160 + }, + { + "epoch": 0.5401523249599145, + "grad_norm": 0.8085964322090149, + "learning_rate": 0.00019089598421982896, + "loss": 0.4733, + "step": 16170 + }, + { + "epoch": 0.5404863709246392, + "grad_norm": 0.6251456141471863, + "learning_rate": 0.00019088062732218534, + "loss": 0.4544, + "step": 16180 + }, + { + "epoch": 0.540820416889364, + "grad_norm": 0.7504349946975708, + "learning_rate": 0.00019086525810201111, + "loss": 0.4612, + "step": 16190 + }, + { + "epoch": 0.5411544628540887, + "grad_norm": 0.7828432321548462, + "learning_rate": 0.0001908498765613902, + "loss": 0.4573, + "step": 16200 + }, + { + "epoch": 0.5414885088188135, + "grad_norm": 0.7420288920402527, + "learning_rate": 0.00019083448270240823, + "loss": 0.4531, + "step": 16210 + }, + { + "epoch": 0.5418225547835382, + "grad_norm": 0.7242956757545471, + "learning_rate": 0.0001908190765271524, + "loss": 0.4332, + "step": 16220 + }, + { + "epoch": 0.542156600748263, + "grad_norm": 0.9198775291442871, + "learning_rate": 0.00019080365803771172, + "loss": 0.4424, + "step": 16230 + }, + { + "epoch": 0.5424906467129877, + "grad_norm": 0.6290988326072693, + "learning_rate": 0.00019078822723617674, + "loss": 0.4559, + "step": 16240 + }, + { + "epoch": 0.5428246926777125, + "grad_norm": 0.7094000577926636, + "learning_rate": 0.0001907727841246397, + "loss": 0.4331, + "step": 16250 + }, + { + "epoch": 0.5431587386424372, + "grad_norm": 0.7957440614700317, + "learning_rate": 0.00019075732870519455, + "loss": 0.4411, + "step": 16260 + }, + { + "epoch": 0.543492784607162, + "grad_norm": 0.5723440051078796, + "learning_rate": 0.00019074186097993697, + "loss": 0.4307, + "step": 16270 + }, + { + "epoch": 0.5438268305718867, + "grad_norm": 0.6568721532821655, + "learning_rate": 0.00019072638095096414, + "loss": 0.4394, + "step": 16280 + }, + { + "epoch": 0.5441608765366115, + "grad_norm": 0.7496821880340576, + "learning_rate": 0.00019071088862037503, + "loss": 0.428, + "step": 16290 + }, + { + "epoch": 0.5444949225013361, + "grad_norm": 0.8060212135314941, + "learning_rate": 0.00019069538399027027, + "loss": 0.4297, + "step": 16300 + }, + { + "epoch": 0.5448289684660609, + "grad_norm": 0.7004479169845581, + "learning_rate": 0.0001906798670627521, + "loss": 0.4531, + "step": 16310 + }, + { + "epoch": 0.5451630144307856, + "grad_norm": 0.7743666172027588, + "learning_rate": 0.00019066433783992452, + "loss": 0.4356, + "step": 16320 + }, + { + "epoch": 0.5454970603955104, + "grad_norm": 0.81378573179245, + "learning_rate": 0.0001906487963238931, + "loss": 0.4609, + "step": 16330 + }, + { + "epoch": 0.5458311063602351, + "grad_norm": 0.7900034189224243, + "learning_rate": 0.00019063324251676514, + "loss": 0.4354, + "step": 16340 + }, + { + "epoch": 0.5461651523249599, + "grad_norm": 0.8216637372970581, + "learning_rate": 0.00019061767642064955, + "loss": 0.4328, + "step": 16350 + }, + { + "epoch": 0.5464991982896846, + "grad_norm": 0.7408298254013062, + "learning_rate": 0.00019060209803765696, + "loss": 0.4492, + "step": 16360 + }, + { + "epoch": 0.5468332442544094, + "grad_norm": 0.712120771408081, + "learning_rate": 0.00019058650736989964, + "loss": 0.4147, + "step": 16370 + }, + { + "epoch": 0.5471672902191341, + "grad_norm": 0.837378978729248, + "learning_rate": 0.00019057090441949157, + "loss": 0.4335, + "step": 16380 + }, + { + "epoch": 0.5475013361838589, + "grad_norm": 0.7741566300392151, + "learning_rate": 0.00019055528918854832, + "loss": 0.4468, + "step": 16390 + }, + { + "epoch": 0.5478353821485836, + "grad_norm": 0.6116801500320435, + "learning_rate": 0.0001905396616791872, + "loss": 0.4161, + "step": 16400 + }, + { + "epoch": 0.5481694281133084, + "grad_norm": 1.0875941514968872, + "learning_rate": 0.0001905240218935271, + "loss": 0.4787, + "step": 16410 + }, + { + "epoch": 0.5485034740780331, + "grad_norm": 0.6486870646476746, + "learning_rate": 0.00019050836983368867, + "loss": 0.4502, + "step": 16420 + }, + { + "epoch": 0.5488375200427579, + "grad_norm": 0.7513815760612488, + "learning_rate": 0.00019049270550179414, + "loss": 0.4505, + "step": 16430 + }, + { + "epoch": 0.5491715660074826, + "grad_norm": 0.7911511063575745, + "learning_rate": 0.00019047702889996745, + "loss": 0.4162, + "step": 16440 + }, + { + "epoch": 0.5495056119722074, + "grad_norm": 0.7822365760803223, + "learning_rate": 0.00019046134003033424, + "loss": 0.4438, + "step": 16450 + }, + { + "epoch": 0.5498396579369321, + "grad_norm": 0.7761499881744385, + "learning_rate": 0.00019044563889502168, + "loss": 0.4386, + "step": 16460 + }, + { + "epoch": 0.5501737039016569, + "grad_norm": 0.7255483269691467, + "learning_rate": 0.00019042992549615878, + "loss": 0.4976, + "step": 16470 + }, + { + "epoch": 0.5505077498663816, + "grad_norm": 0.7498269081115723, + "learning_rate": 0.00019041419983587606, + "loss": 0.4667, + "step": 16480 + }, + { + "epoch": 0.5508417958311064, + "grad_norm": 0.7354015707969666, + "learning_rate": 0.0001903984619163058, + "loss": 0.4198, + "step": 16490 + }, + { + "epoch": 0.5511758417958311, + "grad_norm": 0.7770567536354065, + "learning_rate": 0.00019038271173958197, + "loss": 0.4453, + "step": 16500 + }, + { + "epoch": 0.5515098877605559, + "grad_norm": 0.7277063727378845, + "learning_rate": 0.00019036694930784, + "loss": 0.4509, + "step": 16510 + }, + { + "epoch": 0.5518439337252806, + "grad_norm": 0.5887619853019714, + "learning_rate": 0.0001903511746232172, + "loss": 0.4349, + "step": 16520 + }, + { + "epoch": 0.5521779796900054, + "grad_norm": 0.6408060789108276, + "learning_rate": 0.00019033538768785247, + "loss": 0.4336, + "step": 16530 + }, + { + "epoch": 0.5525120256547301, + "grad_norm": 0.8185721635818481, + "learning_rate": 0.00019031958850388635, + "loss": 0.4307, + "step": 16540 + }, + { + "epoch": 0.5528460716194549, + "grad_norm": 0.874928891658783, + "learning_rate": 0.00019030377707346106, + "loss": 0.43, + "step": 16550 + }, + { + "epoch": 0.5531801175841796, + "grad_norm": 0.6574690937995911, + "learning_rate": 0.0001902879533987205, + "loss": 0.4012, + "step": 16560 + }, + { + "epoch": 0.5535141635489044, + "grad_norm": 0.7946933507919312, + "learning_rate": 0.00019027211748181015, + "loss": 0.4639, + "step": 16570 + }, + { + "epoch": 0.5538482095136291, + "grad_norm": 0.6958156228065491, + "learning_rate": 0.0001902562693248773, + "loss": 0.4142, + "step": 16580 + }, + { + "epoch": 0.5541822554783539, + "grad_norm": 0.6246572732925415, + "learning_rate": 0.00019024040893007068, + "loss": 0.4419, + "step": 16590 + }, + { + "epoch": 0.5545163014430786, + "grad_norm": 0.699303388595581, + "learning_rate": 0.00019022453629954092, + "loss": 0.4454, + "step": 16600 + }, + { + "epoch": 0.5548503474078034, + "grad_norm": 0.7020283341407776, + "learning_rate": 0.0001902086514354401, + "loss": 0.4575, + "step": 16610 + }, + { + "epoch": 0.555184393372528, + "grad_norm": 0.7838448286056519, + "learning_rate": 0.00019019275433992215, + "loss": 0.4846, + "step": 16620 + }, + { + "epoch": 0.5555184393372528, + "grad_norm": 0.6286579966545105, + "learning_rate": 0.00019017684501514248, + "loss": 0.4491, + "step": 16630 + }, + { + "epoch": 0.5558524853019775, + "grad_norm": 0.7373703122138977, + "learning_rate": 0.0001901609234632583, + "loss": 0.431, + "step": 16640 + }, + { + "epoch": 0.5561865312667023, + "grad_norm": 0.7619877457618713, + "learning_rate": 0.00019014498968642835, + "loss": 0.4724, + "step": 16650 + }, + { + "epoch": 0.556520577231427, + "grad_norm": 0.8069422245025635, + "learning_rate": 0.00019012904368681315, + "loss": 0.4479, + "step": 16660 + }, + { + "epoch": 0.5568546231961518, + "grad_norm": 0.8838154077529907, + "learning_rate": 0.00019011308546657482, + "loss": 0.4579, + "step": 16670 + }, + { + "epoch": 0.5571886691608765, + "grad_norm": 0.6279448866844177, + "learning_rate": 0.00019009711502787714, + "loss": 0.4424, + "step": 16680 + }, + { + "epoch": 0.5575227151256013, + "grad_norm": 0.8726599812507629, + "learning_rate": 0.0001900811323728855, + "loss": 0.3913, + "step": 16690 + }, + { + "epoch": 0.557856761090326, + "grad_norm": 0.7100908756256104, + "learning_rate": 0.00019006513750376706, + "loss": 0.4535, + "step": 16700 + }, + { + "epoch": 0.5581908070550508, + "grad_norm": 0.7265294194221497, + "learning_rate": 0.00019004913042269052, + "loss": 0.4359, + "step": 16710 + }, + { + "epoch": 0.5585248530197755, + "grad_norm": 0.7700689435005188, + "learning_rate": 0.0001900331111318263, + "loss": 0.4161, + "step": 16720 + }, + { + "epoch": 0.5588588989845003, + "grad_norm": 0.7208101153373718, + "learning_rate": 0.00019001707963334647, + "loss": 0.4509, + "step": 16730 + }, + { + "epoch": 0.559192944949225, + "grad_norm": 0.7604899406433105, + "learning_rate": 0.00019000103592942472, + "loss": 0.4758, + "step": 16740 + }, + { + "epoch": 0.5595269909139498, + "grad_norm": 0.8759505152702332, + "learning_rate": 0.00018998498002223646, + "loss": 0.4322, + "step": 16750 + }, + { + "epoch": 0.5598610368786745, + "grad_norm": 0.7261239886283875, + "learning_rate": 0.00018996891191395867, + "loss": 0.4519, + "step": 16760 + }, + { + "epoch": 0.5601950828433993, + "grad_norm": 0.7054134607315063, + "learning_rate": 0.00018995283160677006, + "loss": 0.4668, + "step": 16770 + }, + { + "epoch": 0.560529128808124, + "grad_norm": 0.6874467730522156, + "learning_rate": 0.00018993673910285098, + "loss": 0.4515, + "step": 16780 + }, + { + "epoch": 0.5608631747728487, + "grad_norm": 0.7071050405502319, + "learning_rate": 0.00018992063440438335, + "loss": 0.44, + "step": 16790 + }, + { + "epoch": 0.5611972207375735, + "grad_norm": 0.7485989332199097, + "learning_rate": 0.00018990451751355088, + "loss": 0.417, + "step": 16800 + }, + { + "epoch": 0.5615312667022982, + "grad_norm": 0.7377429604530334, + "learning_rate": 0.00018988838843253883, + "loss": 0.4292, + "step": 16810 + }, + { + "epoch": 0.561865312667023, + "grad_norm": 0.7726672291755676, + "learning_rate": 0.00018987224716353413, + "loss": 0.4561, + "step": 16820 + }, + { + "epoch": 0.5621993586317477, + "grad_norm": 0.7322279810905457, + "learning_rate": 0.00018985609370872542, + "loss": 0.432, + "step": 16830 + }, + { + "epoch": 0.5625334045964725, + "grad_norm": 0.7547426819801331, + "learning_rate": 0.00018983992807030298, + "loss": 0.4149, + "step": 16840 + }, + { + "epoch": 0.5628674505611972, + "grad_norm": 0.6458396911621094, + "learning_rate": 0.00018982375025045862, + "loss": 0.4424, + "step": 16850 + }, + { + "epoch": 0.563201496525922, + "grad_norm": 0.6524646282196045, + "learning_rate": 0.00018980756025138595, + "loss": 0.438, + "step": 16860 + }, + { + "epoch": 0.5635355424906467, + "grad_norm": 0.7206399440765381, + "learning_rate": 0.0001897913580752802, + "loss": 0.4249, + "step": 16870 + }, + { + "epoch": 0.5638695884553715, + "grad_norm": 0.5700181126594543, + "learning_rate": 0.00018977514372433817, + "loss": 0.4443, + "step": 16880 + }, + { + "epoch": 0.5642036344200962, + "grad_norm": 0.7268218398094177, + "learning_rate": 0.00018975891720075837, + "loss": 0.4885, + "step": 16890 + }, + { + "epoch": 0.564537680384821, + "grad_norm": 0.7459208369255066, + "learning_rate": 0.000189742678506741, + "loss": 0.473, + "step": 16900 + }, + { + "epoch": 0.5648717263495457, + "grad_norm": 0.7018886804580688, + "learning_rate": 0.0001897264276444879, + "loss": 0.4635, + "step": 16910 + }, + { + "epoch": 0.5652057723142705, + "grad_norm": 0.6950275301933289, + "learning_rate": 0.00018971016461620245, + "loss": 0.4491, + "step": 16920 + }, + { + "epoch": 0.5655398182789952, + "grad_norm": 0.6016286611557007, + "learning_rate": 0.0001896938894240898, + "loss": 0.442, + "step": 16930 + }, + { + "epoch": 0.5658738642437199, + "grad_norm": 0.7165111303329468, + "learning_rate": 0.0001896776020703567, + "loss": 0.4549, + "step": 16940 + }, + { + "epoch": 0.5662079102084446, + "grad_norm": 0.7364968061447144, + "learning_rate": 0.00018966130255721156, + "loss": 0.4585, + "step": 16950 + }, + { + "epoch": 0.5665419561731694, + "grad_norm": 0.611731231212616, + "learning_rate": 0.0001896449908868644, + "loss": 0.4467, + "step": 16960 + }, + { + "epoch": 0.5668760021378941, + "grad_norm": 0.6870841383934021, + "learning_rate": 0.000189628667061527, + "loss": 0.4597, + "step": 16970 + }, + { + "epoch": 0.5672100481026189, + "grad_norm": 0.7176504135131836, + "learning_rate": 0.00018961233108341267, + "loss": 0.455, + "step": 16980 + }, + { + "epoch": 0.5675440940673436, + "grad_norm": 0.8188748359680176, + "learning_rate": 0.00018959598295473638, + "loss": 0.4139, + "step": 16990 + }, + { + "epoch": 0.5678781400320684, + "grad_norm": 0.6779983639717102, + "learning_rate": 0.0001895796226777148, + "loss": 0.4627, + "step": 17000 + }, + { + "epoch": 0.5678781400320684, + "eval_chrf": 86.40975264875559, + "eval_loss": 0.7821881771087646, + "eval_runtime": 100.4788, + "eval_samples_per_second": 0.995, + "eval_steps_per_second": 0.04, + "step": 17000 + }, + { + "epoch": 0.5682121859967931, + "grad_norm": 0.670108437538147, + "learning_rate": 0.00018956325025456627, + "loss": 0.4439, + "step": 17010 + }, + { + "epoch": 0.5685462319615179, + "grad_norm": 0.6723058819770813, + "learning_rate": 0.0001895468656875107, + "loss": 0.4414, + "step": 17020 + }, + { + "epoch": 0.5688802779262426, + "grad_norm": 0.7577635645866394, + "learning_rate": 0.00018953046897876962, + "loss": 0.4269, + "step": 17030 + }, + { + "epoch": 0.5692143238909674, + "grad_norm": 0.7833504676818848, + "learning_rate": 0.00018951406013056635, + "loss": 0.4177, + "step": 17040 + }, + { + "epoch": 0.5695483698556921, + "grad_norm": 0.6904743909835815, + "learning_rate": 0.00018949763914512575, + "loss": 0.4619, + "step": 17050 + }, + { + "epoch": 0.5698824158204169, + "grad_norm": 0.7813600897789001, + "learning_rate": 0.00018948120602467431, + "loss": 0.4262, + "step": 17060 + }, + { + "epoch": 0.5702164617851416, + "grad_norm": 0.6988306045532227, + "learning_rate": 0.00018946476077144023, + "loss": 0.4501, + "step": 17070 + }, + { + "epoch": 0.5705505077498664, + "grad_norm": 0.7617890238761902, + "learning_rate": 0.00018944830338765334, + "loss": 0.4195, + "step": 17080 + }, + { + "epoch": 0.5708845537145911, + "grad_norm": 0.6880974173545837, + "learning_rate": 0.00018943183387554503, + "loss": 0.4457, + "step": 17090 + }, + { + "epoch": 0.5712185996793159, + "grad_norm": 0.6884957551956177, + "learning_rate": 0.00018941535223734852, + "loss": 0.4685, + "step": 17100 + }, + { + "epoch": 0.5715526456440406, + "grad_norm": 0.7300429344177246, + "learning_rate": 0.00018939885847529846, + "loss": 0.4505, + "step": 17110 + }, + { + "epoch": 0.5718866916087654, + "grad_norm": 0.7267704606056213, + "learning_rate": 0.0001893823525916313, + "loss": 0.4261, + "step": 17120 + }, + { + "epoch": 0.5722207375734901, + "grad_norm": 0.6211147904396057, + "learning_rate": 0.00018936583458858505, + "loss": 0.446, + "step": 17130 + }, + { + "epoch": 0.5725547835382149, + "grad_norm": 0.79896479845047, + "learning_rate": 0.0001893493044683994, + "loss": 0.4296, + "step": 17140 + }, + { + "epoch": 0.5728888295029396, + "grad_norm": 0.7260019183158875, + "learning_rate": 0.00018933276223331567, + "loss": 0.4277, + "step": 17150 + }, + { + "epoch": 0.5732228754676644, + "grad_norm": 0.6593769788742065, + "learning_rate": 0.00018931620788557684, + "loss": 0.409, + "step": 17160 + }, + { + "epoch": 0.5735569214323891, + "grad_norm": 0.8949527144432068, + "learning_rate": 0.0001892996414274275, + "loss": 0.44, + "step": 17170 + }, + { + "epoch": 0.5738909673971139, + "grad_norm": 0.694253146648407, + "learning_rate": 0.00018928306286111393, + "loss": 0.4659, + "step": 17180 + }, + { + "epoch": 0.5742250133618386, + "grad_norm": 0.7421480417251587, + "learning_rate": 0.00018926647218888402, + "loss": 0.4307, + "step": 17190 + }, + { + "epoch": 0.5745590593265634, + "grad_norm": 0.7246625423431396, + "learning_rate": 0.00018924986941298724, + "loss": 0.4602, + "step": 17200 + }, + { + "epoch": 0.5748931052912881, + "grad_norm": 0.7543987035751343, + "learning_rate": 0.00018923325453567484, + "loss": 0.4879, + "step": 17210 + }, + { + "epoch": 0.5752271512560129, + "grad_norm": 0.8306609988212585, + "learning_rate": 0.0001892166275591996, + "loss": 0.4976, + "step": 17220 + }, + { + "epoch": 0.5755611972207376, + "grad_norm": 0.6406270265579224, + "learning_rate": 0.000189199988485816, + "loss": 0.4563, + "step": 17230 + }, + { + "epoch": 0.5758952431854624, + "grad_norm": 0.739643931388855, + "learning_rate": 0.00018918333731778012, + "loss": 0.4286, + "step": 17240 + }, + { + "epoch": 0.5762292891501871, + "grad_norm": 0.6590543389320374, + "learning_rate": 0.00018916667405734968, + "loss": 0.4438, + "step": 17250 + }, + { + "epoch": 0.5765633351149118, + "grad_norm": 0.7190114855766296, + "learning_rate": 0.0001891499987067841, + "loss": 0.4685, + "step": 17260 + }, + { + "epoch": 0.5768973810796365, + "grad_norm": 0.6724058389663696, + "learning_rate": 0.00018913331126834434, + "loss": 0.4396, + "step": 17270 + }, + { + "epoch": 0.5772314270443613, + "grad_norm": 0.6686803102493286, + "learning_rate": 0.0001891166117442931, + "loss": 0.4202, + "step": 17280 + }, + { + "epoch": 0.577565473009086, + "grad_norm": 0.7551309466362, + "learning_rate": 0.00018909990013689468, + "loss": 0.4642, + "step": 17290 + }, + { + "epoch": 0.5778995189738108, + "grad_norm": 0.7056359648704529, + "learning_rate": 0.00018908317644841495, + "loss": 0.4883, + "step": 17300 + }, + { + "epoch": 0.5782335649385355, + "grad_norm": 0.6706560850143433, + "learning_rate": 0.00018906644068112155, + "loss": 0.4292, + "step": 17310 + }, + { + "epoch": 0.5785676109032603, + "grad_norm": 0.6797659397125244, + "learning_rate": 0.00018904969283728364, + "loss": 0.4474, + "step": 17320 + }, + { + "epoch": 0.578901656867985, + "grad_norm": 0.537106990814209, + "learning_rate": 0.0001890329329191721, + "loss": 0.4346, + "step": 17330 + }, + { + "epoch": 0.5792357028327098, + "grad_norm": 0.7051129937171936, + "learning_rate": 0.00018901616092905938, + "loss": 0.4656, + "step": 17340 + }, + { + "epoch": 0.5795697487974345, + "grad_norm": 0.7141388654708862, + "learning_rate": 0.0001889993768692196, + "loss": 0.4183, + "step": 17350 + }, + { + "epoch": 0.5799037947621593, + "grad_norm": 0.6953002214431763, + "learning_rate": 0.0001889825807419286, + "loss": 0.4231, + "step": 17360 + }, + { + "epoch": 0.580237840726884, + "grad_norm": 0.6353461146354675, + "learning_rate": 0.00018896577254946361, + "loss": 0.4503, + "step": 17370 + }, + { + "epoch": 0.5805718866916088, + "grad_norm": 0.6196063756942749, + "learning_rate": 0.0001889489522941038, + "loss": 0.4593, + "step": 17380 + }, + { + "epoch": 0.5809059326563335, + "grad_norm": 0.7505100965499878, + "learning_rate": 0.00018893211997812975, + "loss": 0.4169, + "step": 17390 + }, + { + "epoch": 0.5812399786210583, + "grad_norm": 0.6832234263420105, + "learning_rate": 0.00018891527560382384, + "loss": 0.4626, + "step": 17400 + }, + { + "epoch": 0.581574024585783, + "grad_norm": 0.8239922523498535, + "learning_rate": 0.00018889841917346992, + "loss": 0.3986, + "step": 17410 + }, + { + "epoch": 0.5819080705505077, + "grad_norm": 0.6903819441795349, + "learning_rate": 0.00018888155068935358, + "loss": 0.4535, + "step": 17420 + }, + { + "epoch": 0.5822421165152325, + "grad_norm": 0.7833369970321655, + "learning_rate": 0.00018886467015376206, + "loss": 0.4342, + "step": 17430 + }, + { + "epoch": 0.5825761624799572, + "grad_norm": 0.9061029553413391, + "learning_rate": 0.00018884777756898418, + "loss": 0.443, + "step": 17440 + }, + { + "epoch": 0.582910208444682, + "grad_norm": 0.6740909218788147, + "learning_rate": 0.00018883087293731037, + "loss": 0.4942, + "step": 17450 + }, + { + "epoch": 0.5832442544094067, + "grad_norm": 0.7982663512229919, + "learning_rate": 0.00018881395626103278, + "loss": 0.4473, + "step": 17460 + }, + { + "epoch": 0.5835783003741315, + "grad_norm": 1.0617347955703735, + "learning_rate": 0.00018879702754244515, + "loss": 0.442, + "step": 17470 + }, + { + "epoch": 0.5839123463388562, + "grad_norm": 0.7619776129722595, + "learning_rate": 0.0001887800867838428, + "loss": 0.4279, + "step": 17480 + }, + { + "epoch": 0.584246392303581, + "grad_norm": 0.7402876615524292, + "learning_rate": 0.00018876313398752279, + "loss": 0.4362, + "step": 17490 + }, + { + "epoch": 0.5845804382683057, + "grad_norm": 0.6315394639968872, + "learning_rate": 0.00018874616915578373, + "loss": 0.4376, + "step": 17500 + }, + { + "epoch": 0.5849144842330305, + "grad_norm": 0.7361652255058289, + "learning_rate": 0.00018872919229092586, + "loss": 0.4848, + "step": 17510 + }, + { + "epoch": 0.5852485301977552, + "grad_norm": 0.7487019300460815, + "learning_rate": 0.00018871220339525114, + "loss": 0.4408, + "step": 17520 + }, + { + "epoch": 0.58558257616248, + "grad_norm": 0.7024465799331665, + "learning_rate": 0.00018869520247106304, + "loss": 0.4625, + "step": 17530 + }, + { + "epoch": 0.5859166221272047, + "grad_norm": 0.7404446005821228, + "learning_rate": 0.00018867818952066675, + "loss": 0.4403, + "step": 17540 + }, + { + "epoch": 0.5862506680919295, + "grad_norm": 0.7573193907737732, + "learning_rate": 0.00018866116454636906, + "loss": 0.4013, + "step": 17550 + }, + { + "epoch": 0.5865847140566542, + "grad_norm": 0.707602858543396, + "learning_rate": 0.0001886441275504784, + "loss": 0.4423, + "step": 17560 + }, + { + "epoch": 0.586918760021379, + "grad_norm": 0.6677404046058655, + "learning_rate": 0.00018862707853530477, + "loss": 0.4374, + "step": 17570 + }, + { + "epoch": 0.5872528059861037, + "grad_norm": 0.5912588834762573, + "learning_rate": 0.00018861001750315992, + "loss": 0.4275, + "step": 17580 + }, + { + "epoch": 0.5875868519508284, + "grad_norm": 0.6619681715965271, + "learning_rate": 0.00018859294445635713, + "loss": 0.4396, + "step": 17590 + }, + { + "epoch": 0.5879208979155531, + "grad_norm": 0.5515499114990234, + "learning_rate": 0.00018857585939721132, + "loss": 0.4332, + "step": 17600 + }, + { + "epoch": 0.5882549438802779, + "grad_norm": 0.7548701763153076, + "learning_rate": 0.00018855876232803907, + "loss": 0.4235, + "step": 17610 + }, + { + "epoch": 0.5885889898450026, + "grad_norm": 0.5853291153907776, + "learning_rate": 0.0001885416532511586, + "loss": 0.4307, + "step": 17620 + }, + { + "epoch": 0.5889230358097274, + "grad_norm": 0.7123871445655823, + "learning_rate": 0.0001885245321688897, + "loss": 0.4491, + "step": 17630 + }, + { + "epoch": 0.5892570817744521, + "grad_norm": 0.6407101154327393, + "learning_rate": 0.00018850739908355389, + "loss": 0.4374, + "step": 17640 + }, + { + "epoch": 0.5895911277391769, + "grad_norm": 0.716325581073761, + "learning_rate": 0.00018849025399747413, + "loss": 0.4274, + "step": 17650 + }, + { + "epoch": 0.5899251737039016, + "grad_norm": 0.6358019113540649, + "learning_rate": 0.00018847309691297524, + "loss": 0.387, + "step": 17660 + }, + { + "epoch": 0.5902592196686264, + "grad_norm": 0.6611371636390686, + "learning_rate": 0.0001884559278323835, + "loss": 0.3902, + "step": 17670 + }, + { + "epoch": 0.5905932656333511, + "grad_norm": 0.842372715473175, + "learning_rate": 0.00018843874675802686, + "loss": 0.4221, + "step": 17680 + }, + { + "epoch": 0.5909273115980759, + "grad_norm": 0.7764332890510559, + "learning_rate": 0.00018842155369223494, + "loss": 0.4245, + "step": 17690 + }, + { + "epoch": 0.5912613575628006, + "grad_norm": 0.6307492852210999, + "learning_rate": 0.00018840434863733898, + "loss": 0.424, + "step": 17700 + }, + { + "epoch": 0.5915954035275254, + "grad_norm": 0.7730133533477783, + "learning_rate": 0.00018838713159567173, + "loss": 0.4719, + "step": 17710 + }, + { + "epoch": 0.5919294494922501, + "grad_norm": 0.8115317821502686, + "learning_rate": 0.00018836990256956772, + "loss": 0.4392, + "step": 17720 + }, + { + "epoch": 0.5922634954569749, + "grad_norm": 0.7611342072486877, + "learning_rate": 0.00018835266156136304, + "loss": 0.4564, + "step": 17730 + }, + { + "epoch": 0.5925975414216996, + "grad_norm": 0.7688199281692505, + "learning_rate": 0.0001883354085733954, + "loss": 0.4427, + "step": 17740 + }, + { + "epoch": 0.5929315873864244, + "grad_norm": 0.718677282333374, + "learning_rate": 0.00018831814360800405, + "loss": 0.4297, + "step": 17750 + }, + { + "epoch": 0.5932656333511491, + "grad_norm": 0.6693311333656311, + "learning_rate": 0.00018830086666753008, + "loss": 0.4404, + "step": 17760 + }, + { + "epoch": 0.5935996793158739, + "grad_norm": 0.7295831441879272, + "learning_rate": 0.00018828357775431601, + "loss": 0.445, + "step": 17770 + }, + { + "epoch": 0.5939337252805986, + "grad_norm": 0.6987696290016174, + "learning_rate": 0.00018826627687070605, + "loss": 0.4201, + "step": 17780 + }, + { + "epoch": 0.5942677712453234, + "grad_norm": 0.7449126839637756, + "learning_rate": 0.00018824896401904603, + "loss": 0.4386, + "step": 17790 + }, + { + "epoch": 0.5946018172100481, + "grad_norm": 0.6110575795173645, + "learning_rate": 0.00018823163920168344, + "loss": 0.4517, + "step": 17800 + }, + { + "epoch": 0.5949358631747729, + "grad_norm": 0.6926220655441284, + "learning_rate": 0.00018821430242096733, + "loss": 0.4359, + "step": 17810 + }, + { + "epoch": 0.5952699091394976, + "grad_norm": 0.7295883893966675, + "learning_rate": 0.0001881969536792484, + "loss": 0.4086, + "step": 17820 + }, + { + "epoch": 0.5956039551042224, + "grad_norm": 0.7686299085617065, + "learning_rate": 0.00018817959297887896, + "loss": 0.4427, + "step": 17830 + }, + { + "epoch": 0.5959380010689471, + "grad_norm": 0.7837757468223572, + "learning_rate": 0.00018816222032221297, + "loss": 0.4207, + "step": 17840 + }, + { + "epoch": 0.5962720470336719, + "grad_norm": 0.6555127501487732, + "learning_rate": 0.000188144835711606, + "loss": 0.3982, + "step": 17850 + }, + { + "epoch": 0.5966060929983966, + "grad_norm": 0.682339072227478, + "learning_rate": 0.00018812743914941522, + "loss": 0.4139, + "step": 17860 + }, + { + "epoch": 0.5969401389631214, + "grad_norm": 0.5906055569648743, + "learning_rate": 0.00018811003063799942, + "loss": 0.4322, + "step": 17870 + }, + { + "epoch": 0.5972741849278461, + "grad_norm": 0.8302304744720459, + "learning_rate": 0.00018809261017971904, + "loss": 0.4215, + "step": 17880 + }, + { + "epoch": 0.5976082308925709, + "grad_norm": 0.7394044399261475, + "learning_rate": 0.00018807517777693616, + "loss": 0.4694, + "step": 17890 + }, + { + "epoch": 0.5979422768572956, + "grad_norm": 1.125675916671753, + "learning_rate": 0.00018805773343201439, + "loss": 0.4358, + "step": 17900 + }, + { + "epoch": 0.5982763228220203, + "grad_norm": 0.7094932198524475, + "learning_rate": 0.00018804027714731903, + "loss": 0.4145, + "step": 17910 + }, + { + "epoch": 0.598610368786745, + "grad_norm": 0.7412818670272827, + "learning_rate": 0.000188022808925217, + "loss": 0.498, + "step": 17920 + }, + { + "epoch": 0.5989444147514698, + "grad_norm": 0.6839555501937866, + "learning_rate": 0.0001880053287680768, + "loss": 0.4291, + "step": 17930 + }, + { + "epoch": 0.5992784607161945, + "grad_norm": 0.8543412685394287, + "learning_rate": 0.00018798783667826858, + "loss": 0.4387, + "step": 17940 + }, + { + "epoch": 0.5996125066809193, + "grad_norm": 0.7134606242179871, + "learning_rate": 0.0001879703326581641, + "loss": 0.4464, + "step": 17950 + }, + { + "epoch": 0.599946552645644, + "grad_norm": 0.6235162019729614, + "learning_rate": 0.0001879528167101367, + "loss": 0.4317, + "step": 17960 + }, + { + "epoch": 0.6002805986103688, + "grad_norm": 0.7397158145904541, + "learning_rate": 0.00018793528883656146, + "loss": 0.4447, + "step": 17970 + }, + { + "epoch": 0.6006146445750935, + "grad_norm": 0.8539977669715881, + "learning_rate": 0.0001879177490398149, + "loss": 0.471, + "step": 17980 + }, + { + "epoch": 0.6009486905398183, + "grad_norm": 0.6828790307044983, + "learning_rate": 0.0001879001973222753, + "loss": 0.4257, + "step": 17990 + }, + { + "epoch": 0.601282736504543, + "grad_norm": 0.7318171858787537, + "learning_rate": 0.00018788263368632247, + "loss": 0.4593, + "step": 18000 + }, + { + "epoch": 0.601282736504543, + "eval_chrf": 87.30510951216766, + "eval_loss": 0.7841387391090393, + "eval_runtime": 87.2319, + "eval_samples_per_second": 1.146, + "eval_steps_per_second": 0.046, + "step": 18000 + }, + { + "epoch": 0.6016167824692678, + "grad_norm": 0.7016127109527588, + "learning_rate": 0.00018786505813433787, + "loss": 0.4482, + "step": 18010 + }, + { + "epoch": 0.6019508284339925, + "grad_norm": 0.7837814688682556, + "learning_rate": 0.0001878474706687046, + "loss": 0.4339, + "step": 18020 + }, + { + "epoch": 0.6022848743987173, + "grad_norm": 0.7619006037712097, + "learning_rate": 0.00018782987129180735, + "loss": 0.5233, + "step": 18030 + }, + { + "epoch": 0.602618920363442, + "grad_norm": 0.8393512964248657, + "learning_rate": 0.0001878122600060324, + "loss": 0.4816, + "step": 18040 + }, + { + "epoch": 0.6029529663281668, + "grad_norm": 0.8660761117935181, + "learning_rate": 0.00018779463681376765, + "loss": 0.4494, + "step": 18050 + }, + { + "epoch": 0.6032870122928915, + "grad_norm": 0.6567659378051758, + "learning_rate": 0.00018777700171740272, + "loss": 0.4779, + "step": 18060 + }, + { + "epoch": 0.6036210582576162, + "grad_norm": 0.8646301031112671, + "learning_rate": 0.00018775935471932863, + "loss": 0.4309, + "step": 18070 + }, + { + "epoch": 0.603955104222341, + "grad_norm": 0.667201817035675, + "learning_rate": 0.00018774169582193827, + "loss": 0.4505, + "step": 18080 + }, + { + "epoch": 0.6042891501870657, + "grad_norm": 0.6057419776916504, + "learning_rate": 0.00018772402502762592, + "loss": 0.4514, + "step": 18090 + }, + { + "epoch": 0.6046231961517905, + "grad_norm": 0.8254333734512329, + "learning_rate": 0.0001877063423387876, + "loss": 0.4401, + "step": 18100 + }, + { + "epoch": 0.6049572421165152, + "grad_norm": 0.6565999388694763, + "learning_rate": 0.00018768864775782094, + "loss": 0.4123, + "step": 18110 + }, + { + "epoch": 0.60529128808124, + "grad_norm": 0.5760644674301147, + "learning_rate": 0.00018767094128712512, + "loss": 0.4195, + "step": 18120 + }, + { + "epoch": 0.6056253340459647, + "grad_norm": 0.7200978398323059, + "learning_rate": 0.000187653222929101, + "loss": 0.4217, + "step": 18130 + }, + { + "epoch": 0.6059593800106895, + "grad_norm": 0.6663920283317566, + "learning_rate": 0.00018763549268615095, + "loss": 0.4077, + "step": 18140 + }, + { + "epoch": 0.6062934259754142, + "grad_norm": 0.7047891616821289, + "learning_rate": 0.0001876177505606791, + "loss": 0.4609, + "step": 18150 + }, + { + "epoch": 0.606627471940139, + "grad_norm": 0.6848933696746826, + "learning_rate": 0.00018759999655509102, + "loss": 0.4401, + "step": 18160 + }, + { + "epoch": 0.6069615179048637, + "grad_norm": 0.9723472595214844, + "learning_rate": 0.00018758223067179408, + "loss": 0.4461, + "step": 18170 + }, + { + "epoch": 0.6072955638695885, + "grad_norm": 0.7803860902786255, + "learning_rate": 0.00018756445291319708, + "loss": 0.4393, + "step": 18180 + }, + { + "epoch": 0.6076296098343132, + "grad_norm": 0.7010934352874756, + "learning_rate": 0.00018754666328171058, + "loss": 0.4763, + "step": 18190 + }, + { + "epoch": 0.607963655799038, + "grad_norm": 0.7318642139434814, + "learning_rate": 0.0001875288617797466, + "loss": 0.4078, + "step": 18200 + }, + { + "epoch": 0.6082977017637627, + "grad_norm": 0.7537170052528381, + "learning_rate": 0.00018751104840971893, + "loss": 0.4459, + "step": 18210 + }, + { + "epoch": 0.6086317477284875, + "grad_norm": 0.6956712603569031, + "learning_rate": 0.00018749322317404286, + "loss": 0.4234, + "step": 18220 + }, + { + "epoch": 0.6089657936932121, + "grad_norm": 0.7774499654769897, + "learning_rate": 0.00018747538607513528, + "loss": 0.4217, + "step": 18230 + }, + { + "epoch": 0.6092998396579369, + "grad_norm": 0.8052096366882324, + "learning_rate": 0.0001874575371154148, + "loss": 0.4582, + "step": 18240 + }, + { + "epoch": 0.6096338856226616, + "grad_norm": 0.6535292267799377, + "learning_rate": 0.0001874396762973015, + "loss": 0.4398, + "step": 18250 + }, + { + "epoch": 0.6099679315873864, + "grad_norm": 0.8517035245895386, + "learning_rate": 0.00018742180362321719, + "loss": 0.4085, + "step": 18260 + }, + { + "epoch": 0.6103019775521111, + "grad_norm": 0.6449573636054993, + "learning_rate": 0.0001874039190955852, + "loss": 0.4553, + "step": 18270 + }, + { + "epoch": 0.6106360235168359, + "grad_norm": 0.6858041882514954, + "learning_rate": 0.00018738602271683048, + "loss": 0.4358, + "step": 18280 + }, + { + "epoch": 0.6109700694815606, + "grad_norm": 0.7278577089309692, + "learning_rate": 0.0001873681144893796, + "loss": 0.4352, + "step": 18290 + }, + { + "epoch": 0.6113041154462854, + "grad_norm": 0.7655772566795349, + "learning_rate": 0.00018735019441566082, + "loss": 0.4392, + "step": 18300 + }, + { + "epoch": 0.6116381614110101, + "grad_norm": 0.6397401690483093, + "learning_rate": 0.00018733226249810386, + "loss": 0.4191, + "step": 18310 + }, + { + "epoch": 0.6119722073757349, + "grad_norm": 0.8372877240180969, + "learning_rate": 0.00018731431873914012, + "loss": 0.4196, + "step": 18320 + }, + { + "epoch": 0.6123062533404596, + "grad_norm": 0.7785798907279968, + "learning_rate": 0.00018729636314120262, + "loss": 0.4153, + "step": 18330 + }, + { + "epoch": 0.6126402993051844, + "grad_norm": 0.7349402904510498, + "learning_rate": 0.00018727839570672594, + "loss": 0.4197, + "step": 18340 + }, + { + "epoch": 0.6129743452699091, + "grad_norm": 0.7211949229240417, + "learning_rate": 0.0001872604164381463, + "loss": 0.4068, + "step": 18350 + }, + { + "epoch": 0.6133083912346339, + "grad_norm": 0.5916420221328735, + "learning_rate": 0.00018724242533790154, + "loss": 0.44, + "step": 18360 + }, + { + "epoch": 0.6136424371993586, + "grad_norm": 0.7984458804130554, + "learning_rate": 0.00018722442240843106, + "loss": 0.4593, + "step": 18370 + }, + { + "epoch": 0.6139764831640834, + "grad_norm": 0.5625454187393188, + "learning_rate": 0.0001872064076521759, + "loss": 0.4683, + "step": 18380 + }, + { + "epoch": 0.6143105291288081, + "grad_norm": 0.6885100603103638, + "learning_rate": 0.0001871883810715786, + "loss": 0.4308, + "step": 18390 + }, + { + "epoch": 0.6146445750935329, + "grad_norm": 0.7859699130058289, + "learning_rate": 0.0001871703426690835, + "loss": 0.4684, + "step": 18400 + }, + { + "epoch": 0.6149786210582576, + "grad_norm": 0.794296145439148, + "learning_rate": 0.00018715229244713636, + "loss": 0.4521, + "step": 18410 + }, + { + "epoch": 0.6153126670229824, + "grad_norm": 0.6370987296104431, + "learning_rate": 0.00018713423040818463, + "loss": 0.4517, + "step": 18420 + }, + { + "epoch": 0.6156467129877071, + "grad_norm": 0.6595648527145386, + "learning_rate": 0.00018711615655467737, + "loss": 0.4468, + "step": 18430 + }, + { + "epoch": 0.6159807589524319, + "grad_norm": 0.728413999080658, + "learning_rate": 0.00018709807088906522, + "loss": 0.4412, + "step": 18440 + }, + { + "epoch": 0.6163148049171566, + "grad_norm": 0.7566998600959778, + "learning_rate": 0.00018707997341380043, + "loss": 0.4381, + "step": 18450 + }, + { + "epoch": 0.6166488508818814, + "grad_norm": 0.6449579000473022, + "learning_rate": 0.0001870618641313368, + "loss": 0.4631, + "step": 18460 + }, + { + "epoch": 0.6169828968466061, + "grad_norm": 0.9874942898750305, + "learning_rate": 0.00018704374304412977, + "loss": 0.4767, + "step": 18470 + }, + { + "epoch": 0.6173169428113309, + "grad_norm": 0.7965143918991089, + "learning_rate": 0.00018702561015463643, + "loss": 0.4841, + "step": 18480 + }, + { + "epoch": 0.6176509887760556, + "grad_norm": 0.7577676773071289, + "learning_rate": 0.0001870074654653154, + "loss": 0.429, + "step": 18490 + }, + { + "epoch": 0.6179850347407804, + "grad_norm": 0.6348257660865784, + "learning_rate": 0.00018698930897862693, + "loss": 0.421, + "step": 18500 + }, + { + "epoch": 0.6183190807055051, + "grad_norm": 0.7289773225784302, + "learning_rate": 0.00018697114069703282, + "loss": 0.4142, + "step": 18510 + }, + { + "epoch": 0.6186531266702299, + "grad_norm": 0.7461106181144714, + "learning_rate": 0.0001869529606229966, + "loss": 0.4663, + "step": 18520 + }, + { + "epoch": 0.6189871726349546, + "grad_norm": 0.8172454833984375, + "learning_rate": 0.0001869347687589832, + "loss": 0.4356, + "step": 18530 + }, + { + "epoch": 0.6193212185996794, + "grad_norm": 0.6865146160125732, + "learning_rate": 0.00018691656510745936, + "loss": 0.4169, + "step": 18540 + }, + { + "epoch": 0.619655264564404, + "grad_norm": 0.7970819473266602, + "learning_rate": 0.00018689834967089327, + "loss": 0.4724, + "step": 18550 + }, + { + "epoch": 0.6199893105291288, + "grad_norm": 0.6780045032501221, + "learning_rate": 0.00018688012245175482, + "loss": 0.3852, + "step": 18560 + }, + { + "epoch": 0.6203233564938535, + "grad_norm": 0.5773563981056213, + "learning_rate": 0.00018686188345251535, + "loss": 0.4447, + "step": 18570 + }, + { + "epoch": 0.6206574024585783, + "grad_norm": 0.7487504482269287, + "learning_rate": 0.00018684363267564794, + "loss": 0.4675, + "step": 18580 + }, + { + "epoch": 0.620991448423303, + "grad_norm": 0.8370174169540405, + "learning_rate": 0.00018682537012362725, + "loss": 0.4365, + "step": 18590 + }, + { + "epoch": 0.6213254943880278, + "grad_norm": 0.777859091758728, + "learning_rate": 0.00018680709579892946, + "loss": 0.4305, + "step": 18600 + }, + { + "epoch": 0.6216595403527525, + "grad_norm": 0.8078742027282715, + "learning_rate": 0.00018678880970403238, + "loss": 0.4016, + "step": 18610 + }, + { + "epoch": 0.6219935863174773, + "grad_norm": 0.8819447755813599, + "learning_rate": 0.00018677051184141547, + "loss": 0.5259, + "step": 18620 + }, + { + "epoch": 0.622327632282202, + "grad_norm": 0.6113800406455994, + "learning_rate": 0.00018675220221355972, + "loss": 0.4816, + "step": 18630 + }, + { + "epoch": 0.6226616782469268, + "grad_norm": 0.8430353403091431, + "learning_rate": 0.00018673388082294777, + "loss": 0.4535, + "step": 18640 + }, + { + "epoch": 0.6229957242116515, + "grad_norm": 0.8496198058128357, + "learning_rate": 0.00018671554767206376, + "loss": 0.4674, + "step": 18650 + }, + { + "epoch": 0.6233297701763763, + "grad_norm": 0.6729563474655151, + "learning_rate": 0.00018669720276339352, + "loss": 0.4493, + "step": 18660 + }, + { + "epoch": 0.623663816141101, + "grad_norm": 0.6265200972557068, + "learning_rate": 0.00018667884609942448, + "loss": 0.4188, + "step": 18670 + }, + { + "epoch": 0.6239978621058258, + "grad_norm": 0.6975818276405334, + "learning_rate": 0.00018666047768264553, + "loss": 0.4271, + "step": 18680 + }, + { + "epoch": 0.6243319080705505, + "grad_norm": 0.8339488506317139, + "learning_rate": 0.00018664209751554736, + "loss": 0.4323, + "step": 18690 + }, + { + "epoch": 0.6246659540352753, + "grad_norm": 0.5827168822288513, + "learning_rate": 0.00018662370560062206, + "loss": 0.4068, + "step": 18700 + }, + { + "epoch": 0.625, + "grad_norm": 0.5894114375114441, + "learning_rate": 0.00018660530194036342, + "loss": 0.427, + "step": 18710 + }, + { + "epoch": 0.6253340459647247, + "grad_norm": 0.7514945864677429, + "learning_rate": 0.00018658688653726683, + "loss": 0.4514, + "step": 18720 + }, + { + "epoch": 0.6256680919294495, + "grad_norm": 0.701473593711853, + "learning_rate": 0.0001865684593938292, + "loss": 0.4062, + "step": 18730 + }, + { + "epoch": 0.6260021378941742, + "grad_norm": 0.6220928430557251, + "learning_rate": 0.00018655002051254908, + "loss": 0.4271, + "step": 18740 + }, + { + "epoch": 0.626336183858899, + "grad_norm": 0.568866491317749, + "learning_rate": 0.00018653156989592663, + "loss": 0.4158, + "step": 18750 + }, + { + "epoch": 0.6266702298236237, + "grad_norm": 0.630129873752594, + "learning_rate": 0.00018651310754646354, + "loss": 0.4254, + "step": 18760 + }, + { + "epoch": 0.6270042757883485, + "grad_norm": 0.6494055986404419, + "learning_rate": 0.00018649463346666312, + "loss": 0.4536, + "step": 18770 + }, + { + "epoch": 0.6273383217530732, + "grad_norm": 0.7819657921791077, + "learning_rate": 0.0001864761476590303, + "loss": 0.4737, + "step": 18780 + }, + { + "epoch": 0.627672367717798, + "grad_norm": 0.7134982347488403, + "learning_rate": 0.00018645765012607158, + "loss": 0.4325, + "step": 18790 + }, + { + "epoch": 0.6280064136825227, + "grad_norm": 0.6123294830322266, + "learning_rate": 0.00018643914087029502, + "loss": 0.4464, + "step": 18800 + }, + { + "epoch": 0.6283404596472475, + "grad_norm": 0.6820539832115173, + "learning_rate": 0.0001864206198942103, + "loss": 0.4491, + "step": 18810 + }, + { + "epoch": 0.6286745056119722, + "grad_norm": 0.7323684096336365, + "learning_rate": 0.00018640208720032875, + "loss": 0.475, + "step": 18820 + }, + { + "epoch": 0.629008551576697, + "grad_norm": 0.8368024230003357, + "learning_rate": 0.0001863835427911631, + "loss": 0.4345, + "step": 18830 + }, + { + "epoch": 0.6293425975414217, + "grad_norm": 0.7498637437820435, + "learning_rate": 0.0001863649866692279, + "loss": 0.4659, + "step": 18840 + }, + { + "epoch": 0.6296766435061465, + "grad_norm": 0.6319894790649414, + "learning_rate": 0.00018634641883703913, + "loss": 0.447, + "step": 18850 + }, + { + "epoch": 0.6300106894708712, + "grad_norm": 0.7433391213417053, + "learning_rate": 0.0001863278392971144, + "loss": 0.4149, + "step": 18860 + }, + { + "epoch": 0.630344735435596, + "grad_norm": 0.5948516130447388, + "learning_rate": 0.00018630924805197296, + "loss": 0.3906, + "step": 18870 + }, + { + "epoch": 0.6306787814003206, + "grad_norm": 0.6439496874809265, + "learning_rate": 0.00018629064510413556, + "loss": 0.4514, + "step": 18880 + }, + { + "epoch": 0.6310128273650454, + "grad_norm": 0.638244092464447, + "learning_rate": 0.0001862720304561246, + "loss": 0.4516, + "step": 18890 + }, + { + "epoch": 0.6313468733297701, + "grad_norm": 0.5734465718269348, + "learning_rate": 0.00018625340411046403, + "loss": 0.4329, + "step": 18900 + }, + { + "epoch": 0.6316809192944949, + "grad_norm": 0.633585512638092, + "learning_rate": 0.00018623476606967942, + "loss": 0.4208, + "step": 18910 + }, + { + "epoch": 0.6320149652592196, + "grad_norm": 0.6555805206298828, + "learning_rate": 0.0001862161163362979, + "loss": 0.422, + "step": 18920 + }, + { + "epoch": 0.6323490112239444, + "grad_norm": 0.7530593872070312, + "learning_rate": 0.0001861974549128482, + "loss": 0.4376, + "step": 18930 + }, + { + "epoch": 0.6326830571886691, + "grad_norm": 0.6330033540725708, + "learning_rate": 0.0001861787818018606, + "loss": 0.4187, + "step": 18940 + }, + { + "epoch": 0.6330171031533939, + "grad_norm": 0.6659457087516785, + "learning_rate": 0.000186160097005867, + "loss": 0.4422, + "step": 18950 + }, + { + "epoch": 0.6333511491181186, + "grad_norm": 0.6806774139404297, + "learning_rate": 0.00018614140052740092, + "loss": 0.4852, + "step": 18960 + }, + { + "epoch": 0.6336851950828434, + "grad_norm": 0.57012540102005, + "learning_rate": 0.00018612269236899739, + "loss": 0.4349, + "step": 18970 + }, + { + "epoch": 0.6340192410475681, + "grad_norm": 0.6695536971092224, + "learning_rate": 0.000186103972533193, + "loss": 0.4664, + "step": 18980 + }, + { + "epoch": 0.6343532870122929, + "grad_norm": 0.7564519643783569, + "learning_rate": 0.00018608524102252608, + "loss": 0.4691, + "step": 18990 + }, + { + "epoch": 0.6346873329770176, + "grad_norm": 0.6725431084632874, + "learning_rate": 0.0001860664978395364, + "loss": 0.4416, + "step": 19000 + }, + { + "epoch": 0.6346873329770176, + "eval_chrf": 83.59837146666051, + "eval_loss": 0.7853174805641174, + "eval_runtime": 250.26, + "eval_samples_per_second": 0.4, + "eval_steps_per_second": 0.016, + "step": 19000 + }, + { + "epoch": 0.6350213789417424, + "grad_norm": 0.676261842250824, + "learning_rate": 0.0001860477429867653, + "loss": 0.4181, + "step": 19010 + }, + { + "epoch": 0.6353554249064671, + "grad_norm": 0.8009313941001892, + "learning_rate": 0.00018602897646675584, + "loss": 0.4513, + "step": 19020 + }, + { + "epoch": 0.6356894708711919, + "grad_norm": 0.7437488436698914, + "learning_rate": 0.00018601019828205255, + "loss": 0.4214, + "step": 19030 + }, + { + "epoch": 0.6360235168359166, + "grad_norm": 0.6362650990486145, + "learning_rate": 0.00018599140843520156, + "loss": 0.4247, + "step": 19040 + }, + { + "epoch": 0.6363575628006414, + "grad_norm": 0.7466262578964233, + "learning_rate": 0.00018597260692875057, + "loss": 0.4205, + "step": 19050 + }, + { + "epoch": 0.6366916087653661, + "grad_norm": 0.5506861805915833, + "learning_rate": 0.00018595379376524888, + "loss": 0.4424, + "step": 19060 + }, + { + "epoch": 0.6370256547300909, + "grad_norm": 0.7859874367713928, + "learning_rate": 0.00018593496894724743, + "loss": 0.4362, + "step": 19070 + }, + { + "epoch": 0.6373597006948156, + "grad_norm": 0.6775894165039062, + "learning_rate": 0.0001859161324772986, + "loss": 0.441, + "step": 19080 + }, + { + "epoch": 0.6376937466595404, + "grad_norm": 0.7780091762542725, + "learning_rate": 0.00018589728435795656, + "loss": 0.4701, + "step": 19090 + }, + { + "epoch": 0.6380277926242651, + "grad_norm": 0.7130481004714966, + "learning_rate": 0.0001858784245917768, + "loss": 0.4439, + "step": 19100 + }, + { + "epoch": 0.6383618385889899, + "grad_norm": 0.6759771108627319, + "learning_rate": 0.0001858595531813166, + "loss": 0.4106, + "step": 19110 + }, + { + "epoch": 0.6386958845537146, + "grad_norm": 0.6579328775405884, + "learning_rate": 0.0001858406701291347, + "loss": 0.4643, + "step": 19120 + }, + { + "epoch": 0.6390299305184394, + "grad_norm": 0.7359927892684937, + "learning_rate": 0.00018582177543779145, + "loss": 0.4482, + "step": 19130 + }, + { + "epoch": 0.6393639764831641, + "grad_norm": 0.6813386678695679, + "learning_rate": 0.0001858028691098488, + "loss": 0.4328, + "step": 19140 + }, + { + "epoch": 0.6396980224478889, + "grad_norm": 0.7394254803657532, + "learning_rate": 0.0001857839511478703, + "loss": 0.4443, + "step": 19150 + }, + { + "epoch": 0.6400320684126136, + "grad_norm": 0.7334058880805969, + "learning_rate": 0.00018576502155442102, + "loss": 0.4445, + "step": 19160 + }, + { + "epoch": 0.6403661143773384, + "grad_norm": 0.6196887493133545, + "learning_rate": 0.00018574608033206758, + "loss": 0.4175, + "step": 19170 + }, + { + "epoch": 0.6407001603420631, + "grad_norm": 0.6910877227783203, + "learning_rate": 0.00018572712748337832, + "loss": 0.4379, + "step": 19180 + }, + { + "epoch": 0.6410342063067879, + "grad_norm": 0.7418641448020935, + "learning_rate": 0.00018570816301092302, + "loss": 0.4346, + "step": 19190 + }, + { + "epoch": 0.6413682522715125, + "grad_norm": 0.8302858471870422, + "learning_rate": 0.00018568918691727303, + "loss": 0.4388, + "step": 19200 + }, + { + "epoch": 0.6417022982362373, + "grad_norm": 0.7545500993728638, + "learning_rate": 0.00018567019920500137, + "loss": 0.4196, + "step": 19210 + }, + { + "epoch": 0.642036344200962, + "grad_norm": 0.6758487820625305, + "learning_rate": 0.0001856511998766826, + "loss": 0.4625, + "step": 19220 + }, + { + "epoch": 0.6423703901656868, + "grad_norm": 0.6352600455284119, + "learning_rate": 0.00018563218893489282, + "loss": 0.4252, + "step": 19230 + }, + { + "epoch": 0.6427044361304115, + "grad_norm": 0.6635923981666565, + "learning_rate": 0.00018561316638220974, + "loss": 0.4079, + "step": 19240 + }, + { + "epoch": 0.6430384820951363, + "grad_norm": 0.694794774055481, + "learning_rate": 0.0001855941322212126, + "loss": 0.4474, + "step": 19250 + }, + { + "epoch": 0.643372528059861, + "grad_norm": 0.6813491582870483, + "learning_rate": 0.0001855750864544823, + "loss": 0.406, + "step": 19260 + }, + { + "epoch": 0.6437065740245858, + "grad_norm": 0.8462486267089844, + "learning_rate": 0.00018555602908460123, + "loss": 0.4668, + "step": 19270 + }, + { + "epoch": 0.6440406199893105, + "grad_norm": 0.7035932540893555, + "learning_rate": 0.0001855369601141534, + "loss": 0.4697, + "step": 19280 + }, + { + "epoch": 0.6443746659540353, + "grad_norm": 0.7620035409927368, + "learning_rate": 0.0001855178795457244, + "loss": 0.4208, + "step": 19290 + }, + { + "epoch": 0.64470871191876, + "grad_norm": 0.6382061839103699, + "learning_rate": 0.00018549878738190133, + "loss": 0.4149, + "step": 19300 + }, + { + "epoch": 0.6450427578834848, + "grad_norm": 0.794662594795227, + "learning_rate": 0.00018547968362527288, + "loss": 0.4412, + "step": 19310 + }, + { + "epoch": 0.6453768038482095, + "grad_norm": 0.8303952217102051, + "learning_rate": 0.00018546056827842934, + "loss": 0.4422, + "step": 19320 + }, + { + "epoch": 0.6457108498129343, + "grad_norm": 0.6161072850227356, + "learning_rate": 0.00018544144134396265, + "loss": 0.415, + "step": 19330 + }, + { + "epoch": 0.646044895777659, + "grad_norm": 0.6293084621429443, + "learning_rate": 0.00018542230282446616, + "loss": 0.4177, + "step": 19340 + }, + { + "epoch": 0.6463789417423838, + "grad_norm": 0.6740375757217407, + "learning_rate": 0.00018540315272253487, + "loss": 0.4428, + "step": 19350 + }, + { + "epoch": 0.6467129877071085, + "grad_norm": 0.7211604714393616, + "learning_rate": 0.0001853839910407654, + "loss": 0.4918, + "step": 19360 + }, + { + "epoch": 0.6470470336718332, + "grad_norm": 0.5262354612350464, + "learning_rate": 0.00018536481778175582, + "loss": 0.4125, + "step": 19370 + }, + { + "epoch": 0.647381079636558, + "grad_norm": 0.7253087759017944, + "learning_rate": 0.0001853456329481059, + "loss": 0.4333, + "step": 19380 + }, + { + "epoch": 0.6477151256012827, + "grad_norm": 0.8032359480857849, + "learning_rate": 0.00018532643654241686, + "loss": 0.4471, + "step": 19390 + }, + { + "epoch": 0.6480491715660075, + "grad_norm": 0.7322123050689697, + "learning_rate": 0.00018530722856729156, + "loss": 0.4222, + "step": 19400 + }, + { + "epoch": 0.6483832175307322, + "grad_norm": 0.7530114054679871, + "learning_rate": 0.00018528800902533444, + "loss": 0.4228, + "step": 19410 + }, + { + "epoch": 0.648717263495457, + "grad_norm": 0.6367051601409912, + "learning_rate": 0.0001852687779191515, + "loss": 0.4349, + "step": 19420 + }, + { + "epoch": 0.6490513094601817, + "grad_norm": 0.7758284211158752, + "learning_rate": 0.00018524953525135023, + "loss": 0.417, + "step": 19430 + }, + { + "epoch": 0.6493853554249065, + "grad_norm": 0.6344946026802063, + "learning_rate": 0.00018523028102453975, + "loss": 0.391, + "step": 19440 + }, + { + "epoch": 0.6497194013896312, + "grad_norm": 0.7926116585731506, + "learning_rate": 0.00018521101524133083, + "loss": 0.4376, + "step": 19450 + }, + { + "epoch": 0.650053447354356, + "grad_norm": 0.6597754955291748, + "learning_rate": 0.00018519173790433567, + "loss": 0.4263, + "step": 19460 + }, + { + "epoch": 0.6503874933190807, + "grad_norm": 0.6659796237945557, + "learning_rate": 0.00018517244901616805, + "loss": 0.4197, + "step": 19470 + }, + { + "epoch": 0.6507215392838055, + "grad_norm": 0.8246304392814636, + "learning_rate": 0.00018515314857944342, + "loss": 0.4612, + "step": 19480 + }, + { + "epoch": 0.6510555852485302, + "grad_norm": 0.7640775442123413, + "learning_rate": 0.0001851338365967787, + "loss": 0.4342, + "step": 19490 + }, + { + "epoch": 0.651389631213255, + "grad_norm": 0.7740781903266907, + "learning_rate": 0.00018511451307079243, + "loss": 0.3955, + "step": 19500 + }, + { + "epoch": 0.6517236771779797, + "grad_norm": 0.8217988014221191, + "learning_rate": 0.00018509517800410465, + "loss": 0.4432, + "step": 19510 + }, + { + "epoch": 0.6520577231427044, + "grad_norm": 0.6191927790641785, + "learning_rate": 0.00018507583139933703, + "loss": 0.4443, + "step": 19520 + }, + { + "epoch": 0.6523917691074291, + "grad_norm": 0.6179030537605286, + "learning_rate": 0.00018505647325911282, + "loss": 0.4591, + "step": 19530 + }, + { + "epoch": 0.6527258150721539, + "grad_norm": 0.7095394730567932, + "learning_rate": 0.00018503710358605674, + "loss": 0.4318, + "step": 19540 + }, + { + "epoch": 0.6530598610368786, + "grad_norm": 0.6912176012992859, + "learning_rate": 0.00018501772238279518, + "loss": 0.4356, + "step": 19550 + }, + { + "epoch": 0.6533939070016034, + "grad_norm": 0.6795780658721924, + "learning_rate": 0.000184998329651956, + "loss": 0.4394, + "step": 19560 + }, + { + "epoch": 0.6537279529663281, + "grad_norm": 0.672929584980011, + "learning_rate": 0.00018497892539616866, + "loss": 0.433, + "step": 19570 + }, + { + "epoch": 0.6540619989310529, + "grad_norm": 0.6304863691329956, + "learning_rate": 0.0001849595096180642, + "loss": 0.4348, + "step": 19580 + }, + { + "epoch": 0.6543960448957776, + "grad_norm": 0.6005010604858398, + "learning_rate": 0.00018494008232027527, + "loss": 0.406, + "step": 19590 + }, + { + "epoch": 0.6547300908605024, + "grad_norm": 0.5419998168945312, + "learning_rate": 0.00018492064350543593, + "loss": 0.4332, + "step": 19600 + }, + { + "epoch": 0.6550641368252271, + "grad_norm": 0.62958824634552, + "learning_rate": 0.00018490119317618198, + "loss": 0.4316, + "step": 19610 + }, + { + "epoch": 0.6553981827899519, + "grad_norm": 0.6884702444076538, + "learning_rate": 0.0001848817313351506, + "loss": 0.4313, + "step": 19620 + }, + { + "epoch": 0.6557322287546766, + "grad_norm": 0.7512421011924744, + "learning_rate": 0.0001848622579849807, + "loss": 0.4261, + "step": 19630 + }, + { + "epoch": 0.6560662747194014, + "grad_norm": 0.6714699268341064, + "learning_rate": 0.00018484277312831268, + "loss": 0.4212, + "step": 19640 + }, + { + "epoch": 0.6564003206841261, + "grad_norm": 0.6765161156654358, + "learning_rate": 0.00018482327676778846, + "loss": 0.4414, + "step": 19650 + }, + { + "epoch": 0.6567343666488509, + "grad_norm": 0.6641454100608826, + "learning_rate": 0.00018480376890605155, + "loss": 0.4287, + "step": 19660 + }, + { + "epoch": 0.6570684126135756, + "grad_norm": 0.6002545952796936, + "learning_rate": 0.0001847842495457471, + "loss": 0.4494, + "step": 19670 + }, + { + "epoch": 0.6574024585783004, + "grad_norm": 0.7917832136154175, + "learning_rate": 0.00018476471868952165, + "loss": 0.4336, + "step": 19680 + }, + { + "epoch": 0.6577365045430251, + "grad_norm": 0.9126007556915283, + "learning_rate": 0.00018474517634002347, + "loss": 0.4128, + "step": 19690 + }, + { + "epoch": 0.6580705505077499, + "grad_norm": 0.66801518201828, + "learning_rate": 0.00018472562249990224, + "loss": 0.4292, + "step": 19700 + }, + { + "epoch": 0.6584045964724746, + "grad_norm": 0.713407039642334, + "learning_rate": 0.00018470605717180934, + "loss": 0.4305, + "step": 19710 + }, + { + "epoch": 0.6587386424371994, + "grad_norm": 0.697417140007019, + "learning_rate": 0.0001846864803583976, + "loss": 0.4335, + "step": 19720 + }, + { + "epoch": 0.6590726884019241, + "grad_norm": 0.8019979596138, + "learning_rate": 0.0001846668920623215, + "loss": 0.4189, + "step": 19730 + }, + { + "epoch": 0.6594067343666489, + "grad_norm": 0.639952540397644, + "learning_rate": 0.00018464729228623693, + "loss": 0.4274, + "step": 19740 + }, + { + "epoch": 0.6597407803313736, + "grad_norm": 0.6518083810806274, + "learning_rate": 0.00018462768103280145, + "loss": 0.447, + "step": 19750 + }, + { + "epoch": 0.6600748262960984, + "grad_norm": 0.6999320983886719, + "learning_rate": 0.00018460805830467422, + "loss": 0.41, + "step": 19760 + }, + { + "epoch": 0.6604088722608231, + "grad_norm": 0.603603184223175, + "learning_rate": 0.0001845884241045159, + "loss": 0.413, + "step": 19770 + }, + { + "epoch": 0.6607429182255479, + "grad_norm": 0.655368447303772, + "learning_rate": 0.0001845687784349886, + "loss": 0.4078, + "step": 19780 + }, + { + "epoch": 0.6610769641902726, + "grad_norm": 0.6248191595077515, + "learning_rate": 0.00018454912129875614, + "loss": 0.4726, + "step": 19790 + }, + { + "epoch": 0.6614110101549974, + "grad_norm": 0.7396923303604126, + "learning_rate": 0.00018452945269848385, + "loss": 0.408, + "step": 19800 + }, + { + "epoch": 0.6617450561197221, + "grad_norm": 0.8057122230529785, + "learning_rate": 0.00018450977263683856, + "loss": 0.4228, + "step": 19810 + }, + { + "epoch": 0.6620791020844469, + "grad_norm": 0.7895125150680542, + "learning_rate": 0.00018449008111648873, + "loss": 0.4415, + "step": 19820 + }, + { + "epoch": 0.6624131480491716, + "grad_norm": 0.7618771195411682, + "learning_rate": 0.00018447037814010433, + "loss": 0.4452, + "step": 19830 + }, + { + "epoch": 0.6627471940138963, + "grad_norm": 0.7373926639556885, + "learning_rate": 0.00018445066371035687, + "loss": 0.4817, + "step": 19840 + }, + { + "epoch": 0.663081239978621, + "grad_norm": 0.6795749068260193, + "learning_rate": 0.00018443093782991946, + "loss": 0.4508, + "step": 19850 + }, + { + "epoch": 0.6634152859433458, + "grad_norm": 0.6554961800575256, + "learning_rate": 0.00018441120050146674, + "loss": 0.4564, + "step": 19860 + }, + { + "epoch": 0.6637493319080705, + "grad_norm": 0.8785384893417358, + "learning_rate": 0.00018439145172767488, + "loss": 0.4356, + "step": 19870 + }, + { + "epoch": 0.6640833778727953, + "grad_norm": 0.6658512353897095, + "learning_rate": 0.00018437169151122167, + "loss": 0.4237, + "step": 19880 + }, + { + "epoch": 0.66441742383752, + "grad_norm": 0.6991045475006104, + "learning_rate": 0.00018435191985478633, + "loss": 0.4121, + "step": 19890 + }, + { + "epoch": 0.6647514698022448, + "grad_norm": 0.6903331875801086, + "learning_rate": 0.00018433213676104977, + "loss": 0.4076, + "step": 19900 + }, + { + "epoch": 0.6650855157669695, + "grad_norm": 0.8050356507301331, + "learning_rate": 0.00018431234223269434, + "loss": 0.4144, + "step": 19910 + }, + { + "epoch": 0.6654195617316943, + "grad_norm": 0.5802947282791138, + "learning_rate": 0.000184292536272404, + "loss": 0.4223, + "step": 19920 + }, + { + "epoch": 0.665753607696419, + "grad_norm": 0.6845982670783997, + "learning_rate": 0.00018427271888286426, + "loss": 0.4596, + "step": 19930 + }, + { + "epoch": 0.6660876536611438, + "grad_norm": 0.6138932108879089, + "learning_rate": 0.00018425289006676214, + "loss": 0.4338, + "step": 19940 + }, + { + "epoch": 0.6664216996258685, + "grad_norm": 0.6090888381004333, + "learning_rate": 0.00018423304982678626, + "loss": 0.411, + "step": 19950 + }, + { + "epoch": 0.6667557455905933, + "grad_norm": 0.7974926233291626, + "learning_rate": 0.00018421319816562678, + "loss": 0.4574, + "step": 19960 + }, + { + "epoch": 0.667089791555318, + "grad_norm": 0.8115525245666504, + "learning_rate": 0.00018419333508597533, + "loss": 0.4386, + "step": 19970 + }, + { + "epoch": 0.6674238375200428, + "grad_norm": 0.6253410577774048, + "learning_rate": 0.00018417346059052518, + "loss": 0.4127, + "step": 19980 + }, + { + "epoch": 0.6677578834847675, + "grad_norm": 0.8466482758522034, + "learning_rate": 0.00018415357468197114, + "loss": 0.4536, + "step": 19990 + }, + { + "epoch": 0.6680919294494923, + "grad_norm": 0.825547456741333, + "learning_rate": 0.00018413367736300952, + "loss": 0.469, + "step": 20000 + }, + { + "epoch": 0.6680919294494923, + "eval_chrf": 85.05281179212753, + "eval_loss": 0.7675221562385559, + "eval_runtime": 106.6476, + "eval_samples_per_second": 0.938, + "eval_steps_per_second": 0.038, + "step": 20000 + }, + { + "epoch": 0.668425975414217, + "grad_norm": 0.6638519763946533, + "learning_rate": 0.00018411376863633824, + "loss": 0.4565, + "step": 20010 + }, + { + "epoch": 0.6687600213789417, + "grad_norm": 0.6954872608184814, + "learning_rate": 0.0001840938485046567, + "loss": 0.4367, + "step": 20020 + }, + { + "epoch": 0.6690940673436665, + "grad_norm": 0.6239484548568726, + "learning_rate": 0.00018407391697066586, + "loss": 0.4519, + "step": 20030 + }, + { + "epoch": 0.6694281133083912, + "grad_norm": 0.6302216649055481, + "learning_rate": 0.00018405397403706828, + "loss": 0.4566, + "step": 20040 + }, + { + "epoch": 0.669762159273116, + "grad_norm": 0.7296533584594727, + "learning_rate": 0.00018403401970656802, + "loss": 0.4233, + "step": 20050 + }, + { + "epoch": 0.6700962052378407, + "grad_norm": 0.6588180065155029, + "learning_rate": 0.00018401405398187065, + "loss": 0.4434, + "step": 20060 + }, + { + "epoch": 0.6704302512025655, + "grad_norm": 0.6803295016288757, + "learning_rate": 0.00018399407686568343, + "loss": 0.4512, + "step": 20070 + }, + { + "epoch": 0.6707642971672902, + "grad_norm": 0.6416254043579102, + "learning_rate": 0.00018397408836071493, + "loss": 0.4385, + "step": 20080 + }, + { + "epoch": 0.671098343132015, + "grad_norm": 0.9077038168907166, + "learning_rate": 0.0001839540884696755, + "loss": 0.4075, + "step": 20090 + }, + { + "epoch": 0.6714323890967397, + "grad_norm": 0.6581910252571106, + "learning_rate": 0.00018393407719527688, + "loss": 0.4252, + "step": 20100 + }, + { + "epoch": 0.6717664350614645, + "grad_norm": 0.6541240811347961, + "learning_rate": 0.00018391405454023244, + "loss": 0.3809, + "step": 20110 + }, + { + "epoch": 0.6721004810261892, + "grad_norm": 0.7507724165916443, + "learning_rate": 0.000183894020507257, + "loss": 0.4537, + "step": 20120 + }, + { + "epoch": 0.672434526990914, + "grad_norm": 0.6332038640975952, + "learning_rate": 0.00018387397509906706, + "loss": 0.4144, + "step": 20130 + }, + { + "epoch": 0.6727685729556387, + "grad_norm": 0.7630555629730225, + "learning_rate": 0.0001838539183183805, + "loss": 0.4392, + "step": 20140 + }, + { + "epoch": 0.6731026189203635, + "grad_norm": 0.7452860474586487, + "learning_rate": 0.0001838338501679169, + "loss": 0.4199, + "step": 20150 + }, + { + "epoch": 0.6734366648850882, + "grad_norm": 0.6712950468063354, + "learning_rate": 0.00018381377065039724, + "loss": 0.4205, + "step": 20160 + }, + { + "epoch": 0.6737707108498129, + "grad_norm": 0.7241241931915283, + "learning_rate": 0.00018379367976854413, + "loss": 0.4338, + "step": 20170 + }, + { + "epoch": 0.6741047568145376, + "grad_norm": 0.7063023447990417, + "learning_rate": 0.00018377357752508174, + "loss": 0.4346, + "step": 20180 + }, + { + "epoch": 0.6744388027792624, + "grad_norm": 0.7964999675750732, + "learning_rate": 0.00018375346392273572, + "loss": 0.426, + "step": 20190 + }, + { + "epoch": 0.6747728487439871, + "grad_norm": 0.6838825941085815, + "learning_rate": 0.00018373333896423323, + "loss": 0.4532, + "step": 20200 + }, + { + "epoch": 0.6751068947087119, + "grad_norm": 0.7237727046012878, + "learning_rate": 0.00018371320265230304, + "loss": 0.4296, + "step": 20210 + }, + { + "epoch": 0.6754409406734366, + "grad_norm": 0.6413806080818176, + "learning_rate": 0.00018369305498967548, + "loss": 0.4531, + "step": 20220 + }, + { + "epoch": 0.6757749866381614, + "grad_norm": 0.6424244046211243, + "learning_rate": 0.00018367289597908236, + "loss": 0.4487, + "step": 20230 + }, + { + "epoch": 0.6761090326028861, + "grad_norm": 0.7096964120864868, + "learning_rate": 0.000183652725623257, + "loss": 0.4403, + "step": 20240 + }, + { + "epoch": 0.6764430785676109, + "grad_norm": 0.5736209750175476, + "learning_rate": 0.0001836325439249344, + "loss": 0.4206, + "step": 20250 + }, + { + "epoch": 0.6767771245323356, + "grad_norm": 0.8064764142036438, + "learning_rate": 0.0001836123508868509, + "loss": 0.4456, + "step": 20260 + }, + { + "epoch": 0.6771111704970604, + "grad_norm": 0.6465324759483337, + "learning_rate": 0.00018359214651174452, + "loss": 0.4134, + "step": 20270 + }, + { + "epoch": 0.6774452164617851, + "grad_norm": 0.8265402913093567, + "learning_rate": 0.00018357193080235478, + "loss": 0.4136, + "step": 20280 + }, + { + "epoch": 0.6777792624265099, + "grad_norm": 0.7235385179519653, + "learning_rate": 0.00018355170376142273, + "loss": 0.4118, + "step": 20290 + }, + { + "epoch": 0.6781133083912346, + "grad_norm": 0.8699031472206116, + "learning_rate": 0.000183531465391691, + "loss": 0.4144, + "step": 20300 + }, + { + "epoch": 0.6784473543559594, + "grad_norm": 0.6632704734802246, + "learning_rate": 0.00018351121569590366, + "loss": 0.454, + "step": 20310 + }, + { + "epoch": 0.6787814003206841, + "grad_norm": 0.8004485368728638, + "learning_rate": 0.0001834909546768064, + "loss": 0.45, + "step": 20320 + }, + { + "epoch": 0.6791154462854089, + "grad_norm": 0.6328513622283936, + "learning_rate": 0.00018347068233714637, + "loss": 0.4345, + "step": 20330 + }, + { + "epoch": 0.6794494922501336, + "grad_norm": 0.6846922636032104, + "learning_rate": 0.00018345039867967238, + "loss": 0.398, + "step": 20340 + }, + { + "epoch": 0.6797835382148584, + "grad_norm": 0.6389940977096558, + "learning_rate": 0.00018343010370713465, + "loss": 0.4401, + "step": 20350 + }, + { + "epoch": 0.6801175841795831, + "grad_norm": 0.8726378083229065, + "learning_rate": 0.000183409797422285, + "loss": 0.4346, + "step": 20360 + }, + { + "epoch": 0.6804516301443079, + "grad_norm": 0.6886200308799744, + "learning_rate": 0.00018338947982787676, + "loss": 0.4351, + "step": 20370 + }, + { + "epoch": 0.6807856761090326, + "grad_norm": 0.6489108800888062, + "learning_rate": 0.00018336915092666476, + "loss": 0.4268, + "step": 20380 + }, + { + "epoch": 0.6811197220737574, + "grad_norm": 0.6930118799209595, + "learning_rate": 0.00018334881072140546, + "loss": 0.4509, + "step": 20390 + }, + { + "epoch": 0.6814537680384821, + "grad_norm": 0.5939650535583496, + "learning_rate": 0.00018332845921485678, + "loss": 0.4117, + "step": 20400 + }, + { + "epoch": 0.6817878140032069, + "grad_norm": 0.6286852955818176, + "learning_rate": 0.00018330809640977816, + "loss": 0.4138, + "step": 20410 + }, + { + "epoch": 0.6821218599679316, + "grad_norm": 0.5997180938720703, + "learning_rate": 0.0001832877223089306, + "loss": 0.4417, + "step": 20420 + }, + { + "epoch": 0.6824559059326564, + "grad_norm": 0.6665716171264648, + "learning_rate": 0.00018326733691507668, + "loss": 0.4299, + "step": 20430 + }, + { + "epoch": 0.6827899518973811, + "grad_norm": 0.706278920173645, + "learning_rate": 0.00018324694023098045, + "loss": 0.4232, + "step": 20440 + }, + { + "epoch": 0.6831239978621059, + "grad_norm": 0.761397659778595, + "learning_rate": 0.0001832265322594074, + "loss": 0.4263, + "step": 20450 + }, + { + "epoch": 0.6834580438268306, + "grad_norm": 0.7926584482192993, + "learning_rate": 0.00018320611300312478, + "loss": 0.3956, + "step": 20460 + }, + { + "epoch": 0.6837920897915554, + "grad_norm": 0.6607089042663574, + "learning_rate": 0.0001831856824649012, + "loss": 0.4234, + "step": 20470 + }, + { + "epoch": 0.6841261357562801, + "grad_norm": 0.8517969250679016, + "learning_rate": 0.00018316524064750683, + "loss": 0.45, + "step": 20480 + }, + { + "epoch": 0.6844601817210048, + "grad_norm": 0.7654266357421875, + "learning_rate": 0.00018314478755371337, + "loss": 0.4492, + "step": 20490 + }, + { + "epoch": 0.6847942276857295, + "grad_norm": 0.6266884207725525, + "learning_rate": 0.0001831243231862941, + "loss": 0.4092, + "step": 20500 + }, + { + "epoch": 0.6851282736504543, + "grad_norm": 0.7813456058502197, + "learning_rate": 0.00018310384754802376, + "loss": 0.4447, + "step": 20510 + }, + { + "epoch": 0.685462319615179, + "grad_norm": 0.6864488124847412, + "learning_rate": 0.0001830833606416787, + "loss": 0.4271, + "step": 20520 + }, + { + "epoch": 0.6857963655799038, + "grad_norm": 0.6834902167320251, + "learning_rate": 0.00018306286247003666, + "loss": 0.4071, + "step": 20530 + }, + { + "epoch": 0.6861304115446285, + "grad_norm": 0.6276711225509644, + "learning_rate": 0.00018304235303587702, + "loss": 0.4157, + "step": 20540 + }, + { + "epoch": 0.6864644575093533, + "grad_norm": 0.6812220215797424, + "learning_rate": 0.0001830218323419807, + "loss": 0.4109, + "step": 20550 + }, + { + "epoch": 0.686798503474078, + "grad_norm": 0.7702106833457947, + "learning_rate": 0.0001830013003911301, + "loss": 0.4446, + "step": 20560 + }, + { + "epoch": 0.6871325494388028, + "grad_norm": 0.6621636152267456, + "learning_rate": 0.00018298075718610913, + "loss": 0.3923, + "step": 20570 + }, + { + "epoch": 0.6874665954035275, + "grad_norm": 0.7422793507575989, + "learning_rate": 0.00018296020272970322, + "loss": 0.4553, + "step": 20580 + }, + { + "epoch": 0.6878006413682523, + "grad_norm": 0.7377235293388367, + "learning_rate": 0.0001829396370246994, + "loss": 0.4417, + "step": 20590 + }, + { + "epoch": 0.688134687332977, + "grad_norm": 0.6151415109634399, + "learning_rate": 0.0001829190600738862, + "loss": 0.3933, + "step": 20600 + }, + { + "epoch": 0.6884687332977018, + "grad_norm": 0.7348946928977966, + "learning_rate": 0.00018289847188005364, + "loss": 0.4286, + "step": 20610 + }, + { + "epoch": 0.6888027792624265, + "grad_norm": 0.7590888738632202, + "learning_rate": 0.00018287787244599318, + "loss": 0.4499, + "step": 20620 + }, + { + "epoch": 0.6891368252271513, + "grad_norm": 0.6009424328804016, + "learning_rate": 0.00018285726177449808, + "loss": 0.4151, + "step": 20630 + }, + { + "epoch": 0.689470871191876, + "grad_norm": 0.7622150182723999, + "learning_rate": 0.00018283663986836283, + "loss": 0.3901, + "step": 20640 + }, + { + "epoch": 0.6898049171566007, + "grad_norm": 0.6918014287948608, + "learning_rate": 0.00018281600673038354, + "loss": 0.4347, + "step": 20650 + }, + { + "epoch": 0.6901389631213255, + "grad_norm": 0.6010653376579285, + "learning_rate": 0.00018279536236335793, + "loss": 0.3912, + "step": 20660 + }, + { + "epoch": 0.6904730090860502, + "grad_norm": 0.6666380763053894, + "learning_rate": 0.00018277470677008516, + "loss": 0.4038, + "step": 20670 + }, + { + "epoch": 0.690807055050775, + "grad_norm": 0.7231628894805908, + "learning_rate": 0.00018275403995336595, + "loss": 0.4262, + "step": 20680 + }, + { + "epoch": 0.6911411010154997, + "grad_norm": 0.7001346945762634, + "learning_rate": 0.00018273336191600246, + "loss": 0.4672, + "step": 20690 + }, + { + "epoch": 0.6914751469802245, + "grad_norm": 0.7947933673858643, + "learning_rate": 0.00018271267266079848, + "loss": 0.4378, + "step": 20700 + }, + { + "epoch": 0.6918091929449492, + "grad_norm": 0.6304612159729004, + "learning_rate": 0.00018269197219055922, + "loss": 0.4354, + "step": 20710 + }, + { + "epoch": 0.692143238909674, + "grad_norm": 0.5760933756828308, + "learning_rate": 0.00018267126050809153, + "loss": 0.4196, + "step": 20720 + }, + { + "epoch": 0.6924772848743987, + "grad_norm": 0.7482807040214539, + "learning_rate": 0.00018265053761620367, + "loss": 0.4135, + "step": 20730 + }, + { + "epoch": 0.6928113308391235, + "grad_norm": 0.7102651000022888, + "learning_rate": 0.00018262980351770546, + "loss": 0.4169, + "step": 20740 + }, + { + "epoch": 0.6931453768038482, + "grad_norm": 0.8187651634216309, + "learning_rate": 0.00018260905821540827, + "loss": 0.4258, + "step": 20750 + }, + { + "epoch": 0.693479422768573, + "grad_norm": 0.6520400643348694, + "learning_rate": 0.00018258830171212492, + "loss": 0.4438, + "step": 20760 + }, + { + "epoch": 0.6938134687332977, + "grad_norm": 0.6552689671516418, + "learning_rate": 0.00018256753401066985, + "loss": 0.4412, + "step": 20770 + }, + { + "epoch": 0.6941475146980225, + "grad_norm": 0.7374101281166077, + "learning_rate": 0.00018254675511385893, + "loss": 0.451, + "step": 20780 + }, + { + "epoch": 0.6944815606627472, + "grad_norm": 0.7282772660255432, + "learning_rate": 0.00018252596502450953, + "loss": 0.4128, + "step": 20790 + }, + { + "epoch": 0.694815606627472, + "grad_norm": 0.8241093754768372, + "learning_rate": 0.00018250516374544064, + "loss": 0.4408, + "step": 20800 + }, + { + "epoch": 0.6951496525921966, + "grad_norm": 0.6655164957046509, + "learning_rate": 0.00018248435127947272, + "loss": 0.4197, + "step": 20810 + }, + { + "epoch": 0.6954836985569214, + "grad_norm": 0.8061104416847229, + "learning_rate": 0.0001824635276294277, + "loss": 0.4292, + "step": 20820 + }, + { + "epoch": 0.6958177445216461, + "grad_norm": 0.6908974647521973, + "learning_rate": 0.00018244269279812909, + "loss": 0.4509, + "step": 20830 + }, + { + "epoch": 0.6961517904863709, + "grad_norm": 0.7327307462692261, + "learning_rate": 0.0001824218467884019, + "loss": 0.4135, + "step": 20840 + }, + { + "epoch": 0.6964858364510956, + "grad_norm": 0.7709669470787048, + "learning_rate": 0.0001824009896030726, + "loss": 0.4234, + "step": 20850 + }, + { + "epoch": 0.6968198824158204, + "grad_norm": 0.664616048336029, + "learning_rate": 0.00018238012124496925, + "loss": 0.4329, + "step": 20860 + }, + { + "epoch": 0.6971539283805451, + "grad_norm": 0.6893818378448486, + "learning_rate": 0.0001823592417169214, + "loss": 0.4648, + "step": 20870 + }, + { + "epoch": 0.6974879743452699, + "grad_norm": 0.6650697588920593, + "learning_rate": 0.00018233835102176012, + "loss": 0.4496, + "step": 20880 + }, + { + "epoch": 0.6978220203099946, + "grad_norm": 0.6942777037620544, + "learning_rate": 0.00018231744916231797, + "loss": 0.3831, + "step": 20890 + }, + { + "epoch": 0.6981560662747194, + "grad_norm": 0.6850726008415222, + "learning_rate": 0.00018229653614142903, + "loss": 0.4133, + "step": 20900 + }, + { + "epoch": 0.6984901122394441, + "grad_norm": 0.7291197776794434, + "learning_rate": 0.00018227561196192894, + "loss": 0.4518, + "step": 20910 + }, + { + "epoch": 0.6988241582041689, + "grad_norm": 0.6012083888053894, + "learning_rate": 0.00018225467662665476, + "loss": 0.4374, + "step": 20920 + }, + { + "epoch": 0.6991582041688936, + "grad_norm": 0.818712592124939, + "learning_rate": 0.00018223373013844516, + "loss": 0.4848, + "step": 20930 + }, + { + "epoch": 0.6994922501336184, + "grad_norm": 0.668904721736908, + "learning_rate": 0.00018221277250014025, + "loss": 0.4219, + "step": 20940 + }, + { + "epoch": 0.6998262960983431, + "grad_norm": 0.5899852514266968, + "learning_rate": 0.00018219180371458176, + "loss": 0.3879, + "step": 20950 + }, + { + "epoch": 0.7001603420630679, + "grad_norm": 0.7065714597702026, + "learning_rate": 0.00018217082378461273, + "loss": 0.4634, + "step": 20960 + }, + { + "epoch": 0.7004943880277926, + "grad_norm": 0.6949188709259033, + "learning_rate": 0.00018214983271307796, + "loss": 0.4644, + "step": 20970 + }, + { + "epoch": 0.7008284339925174, + "grad_norm": 0.7571803331375122, + "learning_rate": 0.0001821288305028235, + "loss": 0.432, + "step": 20980 + }, + { + "epoch": 0.7011624799572421, + "grad_norm": 0.6886268854141235, + "learning_rate": 0.0001821078171566972, + "loss": 0.4145, + "step": 20990 + }, + { + "epoch": 0.7014965259219669, + "grad_norm": 0.802520215511322, + "learning_rate": 0.00018208679267754814, + "loss": 0.4181, + "step": 21000 + }, + { + "epoch": 0.7014965259219669, + "eval_chrf": 85.24734320333044, + "eval_loss": 0.7563986778259277, + "eval_runtime": 135.6916, + "eval_samples_per_second": 0.737, + "eval_steps_per_second": 0.029, + "step": 21000 + }, + { + "epoch": 0.7018305718866916, + "grad_norm": 0.7039611339569092, + "learning_rate": 0.00018206575706822712, + "loss": 0.4151, + "step": 21010 + }, + { + "epoch": 0.7021646178514164, + "grad_norm": 0.6973361968994141, + "learning_rate": 0.00018204471033158627, + "loss": 0.4157, + "step": 21020 + }, + { + "epoch": 0.7024986638161411, + "grad_norm": 0.5726926326751709, + "learning_rate": 0.00018202365247047943, + "loss": 0.4414, + "step": 21030 + }, + { + "epoch": 0.7028327097808659, + "grad_norm": 0.7250692844390869, + "learning_rate": 0.00018200258348776178, + "loss": 0.4368, + "step": 21040 + }, + { + "epoch": 0.7031667557455906, + "grad_norm": 0.7709754705429077, + "learning_rate": 0.00018198150338629007, + "loss": 0.4113, + "step": 21050 + }, + { + "epoch": 0.7035008017103154, + "grad_norm": 0.7196980118751526, + "learning_rate": 0.00018196041216892255, + "loss": 0.4346, + "step": 21060 + }, + { + "epoch": 0.7038348476750401, + "grad_norm": 0.6847418546676636, + "learning_rate": 0.00018193930983851907, + "loss": 0.4042, + "step": 21070 + }, + { + "epoch": 0.7041688936397649, + "grad_norm": 0.6932473182678223, + "learning_rate": 0.00018191819639794073, + "loss": 0.3798, + "step": 21080 + }, + { + "epoch": 0.7045029396044896, + "grad_norm": 0.5912729501724243, + "learning_rate": 0.0001818970718500505, + "loss": 0.4087, + "step": 21090 + }, + { + "epoch": 0.7048369855692144, + "grad_norm": 0.6914377212524414, + "learning_rate": 0.00018187593619771252, + "loss": 0.4197, + "step": 21100 + }, + { + "epoch": 0.7051710315339391, + "grad_norm": 1.7148648500442505, + "learning_rate": 0.00018185478944379266, + "loss": 0.4244, + "step": 21110 + }, + { + "epoch": 0.7055050774986639, + "grad_norm": 0.6860547661781311, + "learning_rate": 0.00018183363159115818, + "loss": 0.4428, + "step": 21120 + }, + { + "epoch": 0.7058391234633885, + "grad_norm": 0.6547127962112427, + "learning_rate": 0.0001818124626426779, + "loss": 0.4199, + "step": 21130 + }, + { + "epoch": 0.7061731694281133, + "grad_norm": 0.6225613355636597, + "learning_rate": 0.00018179128260122212, + "loss": 0.4602, + "step": 21140 + }, + { + "epoch": 0.706507215392838, + "grad_norm": 0.5882841348648071, + "learning_rate": 0.0001817700914696626, + "loss": 0.4276, + "step": 21150 + }, + { + "epoch": 0.7068412613575628, + "grad_norm": 0.6387538909912109, + "learning_rate": 0.0001817488892508727, + "loss": 0.4222, + "step": 21160 + }, + { + "epoch": 0.7071753073222875, + "grad_norm": 0.6357945799827576, + "learning_rate": 0.00018172767594772725, + "loss": 0.4115, + "step": 21170 + }, + { + "epoch": 0.7075093532870123, + "grad_norm": 0.5662307143211365, + "learning_rate": 0.00018170645156310254, + "loss": 0.4699, + "step": 21180 + }, + { + "epoch": 0.707843399251737, + "grad_norm": 0.7562418580055237, + "learning_rate": 0.00018168521609987636, + "loss": 0.4372, + "step": 21190 + }, + { + "epoch": 0.7081774452164618, + "grad_norm": 0.7424671053886414, + "learning_rate": 0.0001816639695609281, + "loss": 0.3977, + "step": 21200 + }, + { + "epoch": 0.7085114911811865, + "grad_norm": 0.843397855758667, + "learning_rate": 0.00018164271194913854, + "loss": 0.4306, + "step": 21210 + }, + { + "epoch": 0.7088455371459113, + "grad_norm": 0.681532084941864, + "learning_rate": 0.00018162144326739003, + "loss": 0.4563, + "step": 21220 + }, + { + "epoch": 0.709179583110636, + "grad_norm": 0.6402402520179749, + "learning_rate": 0.00018160016351856636, + "loss": 0.4186, + "step": 21230 + }, + { + "epoch": 0.7095136290753608, + "grad_norm": 0.6735727190971375, + "learning_rate": 0.0001815788727055529, + "loss": 0.4273, + "step": 21240 + }, + { + "epoch": 0.7098476750400855, + "grad_norm": 0.7714232802391052, + "learning_rate": 0.00018155757083123644, + "loss": 0.4151, + "step": 21250 + }, + { + "epoch": 0.7101817210048103, + "grad_norm": 0.6130335330963135, + "learning_rate": 0.00018153625789850536, + "loss": 0.4189, + "step": 21260 + }, + { + "epoch": 0.710515766969535, + "grad_norm": 0.6833260655403137, + "learning_rate": 0.0001815149339102494, + "loss": 0.4325, + "step": 21270 + }, + { + "epoch": 0.7108498129342598, + "grad_norm": 0.609749436378479, + "learning_rate": 0.00018149359886935996, + "loss": 0.4315, + "step": 21280 + }, + { + "epoch": 0.7111838588989845, + "grad_norm": 0.741682767868042, + "learning_rate": 0.00018147225277872987, + "loss": 0.4015, + "step": 21290 + }, + { + "epoch": 0.7115179048637092, + "grad_norm": 0.7366710901260376, + "learning_rate": 0.00018145089564125343, + "loss": 0.4234, + "step": 21300 + }, + { + "epoch": 0.711851950828434, + "grad_norm": 0.6782798767089844, + "learning_rate": 0.00018142952745982643, + "loss": 0.4363, + "step": 21310 + }, + { + "epoch": 0.7121859967931587, + "grad_norm": 0.7518175840377808, + "learning_rate": 0.0001814081482373462, + "loss": 0.4544, + "step": 21320 + }, + { + "epoch": 0.7125200427578835, + "grad_norm": 0.6315569877624512, + "learning_rate": 0.00018138675797671158, + "loss": 0.4515, + "step": 21330 + }, + { + "epoch": 0.7128540887226082, + "grad_norm": 0.6881856322288513, + "learning_rate": 0.00018136535668082288, + "loss": 0.4261, + "step": 21340 + }, + { + "epoch": 0.713188134687333, + "grad_norm": 0.5225602388381958, + "learning_rate": 0.00018134394435258188, + "loss": 0.4142, + "step": 21350 + }, + { + "epoch": 0.7135221806520577, + "grad_norm": 0.8686723113059998, + "learning_rate": 0.00018132252099489193, + "loss": 0.4455, + "step": 21360 + }, + { + "epoch": 0.7138562266167825, + "grad_norm": 0.6150943636894226, + "learning_rate": 0.00018130108661065778, + "loss": 0.4166, + "step": 21370 + }, + { + "epoch": 0.7141902725815072, + "grad_norm": 0.7076891660690308, + "learning_rate": 0.00018127964120278573, + "loss": 0.4426, + "step": 21380 + }, + { + "epoch": 0.714524318546232, + "grad_norm": 0.7069090604782104, + "learning_rate": 0.00018125818477418362, + "loss": 0.4183, + "step": 21390 + }, + { + "epoch": 0.7148583645109567, + "grad_norm": 0.5749965906143188, + "learning_rate": 0.00018123671732776065, + "loss": 0.4179, + "step": 21400 + }, + { + "epoch": 0.7151924104756815, + "grad_norm": 0.5712727308273315, + "learning_rate": 0.0001812152388664277, + "loss": 0.4142, + "step": 21410 + }, + { + "epoch": 0.7155264564404062, + "grad_norm": 0.6813251972198486, + "learning_rate": 0.00018119374939309696, + "loss": 0.4393, + "step": 21420 + }, + { + "epoch": 0.715860502405131, + "grad_norm": 0.6911742091178894, + "learning_rate": 0.00018117224891068222, + "loss": 0.4385, + "step": 21430 + }, + { + "epoch": 0.7161945483698557, + "grad_norm": 0.6375542283058167, + "learning_rate": 0.00018115073742209874, + "loss": 0.4486, + "step": 21440 + }, + { + "epoch": 0.7165285943345805, + "grad_norm": 0.6285650134086609, + "learning_rate": 0.00018112921493026326, + "loss": 0.3795, + "step": 21450 + }, + { + "epoch": 0.7168626402993051, + "grad_norm": 0.78983473777771, + "learning_rate": 0.00018110768143809403, + "loss": 0.4264, + "step": 21460 + }, + { + "epoch": 0.7171966862640299, + "grad_norm": 0.7565487623214722, + "learning_rate": 0.0001810861369485108, + "loss": 0.4623, + "step": 21470 + }, + { + "epoch": 0.7175307322287546, + "grad_norm": 0.700519323348999, + "learning_rate": 0.00018106458146443475, + "loss": 0.4346, + "step": 21480 + }, + { + "epoch": 0.7178647781934794, + "grad_norm": 0.6774875521659851, + "learning_rate": 0.0001810430149887886, + "loss": 0.4426, + "step": 21490 + }, + { + "epoch": 0.7181988241582041, + "grad_norm": 0.6421693563461304, + "learning_rate": 0.0001810214375244966, + "loss": 0.4448, + "step": 21500 + }, + { + "epoch": 0.7185328701229289, + "grad_norm": 0.7149474024772644, + "learning_rate": 0.0001809998490744844, + "loss": 0.4319, + "step": 21510 + }, + { + "epoch": 0.7188669160876536, + "grad_norm": 0.8338332176208496, + "learning_rate": 0.0001809782496416792, + "loss": 0.4406, + "step": 21520 + }, + { + "epoch": 0.7192009620523784, + "grad_norm": 0.7076464891433716, + "learning_rate": 0.0001809566392290097, + "loss": 0.4288, + "step": 21530 + }, + { + "epoch": 0.7195350080171031, + "grad_norm": 0.7534632086753845, + "learning_rate": 0.00018093501783940602, + "loss": 0.431, + "step": 21540 + }, + { + "epoch": 0.7198690539818279, + "grad_norm": 0.6354673504829407, + "learning_rate": 0.00018091338547579982, + "loss": 0.4394, + "step": 21550 + }, + { + "epoch": 0.7202030999465526, + "grad_norm": 0.5369386672973633, + "learning_rate": 0.00018089174214112424, + "loss": 0.4154, + "step": 21560 + }, + { + "epoch": 0.7205371459112774, + "grad_norm": 0.6863415837287903, + "learning_rate": 0.00018087008783831393, + "loss": 0.4089, + "step": 21570 + }, + { + "epoch": 0.7208711918760021, + "grad_norm": 0.7050747275352478, + "learning_rate": 0.00018084842257030497, + "loss": 0.4232, + "step": 21580 + }, + { + "epoch": 0.7212052378407269, + "grad_norm": 0.6710808277130127, + "learning_rate": 0.00018082674634003495, + "loss": 0.3887, + "step": 21590 + }, + { + "epoch": 0.7215392838054516, + "grad_norm": 0.7392172813415527, + "learning_rate": 0.000180805059150443, + "loss": 0.4423, + "step": 21600 + }, + { + "epoch": 0.7218733297701764, + "grad_norm": 0.754584550857544, + "learning_rate": 0.0001807833610044697, + "loss": 0.4531, + "step": 21610 + }, + { + "epoch": 0.7222073757349011, + "grad_norm": 0.6032457947731018, + "learning_rate": 0.00018076165190505705, + "loss": 0.4336, + "step": 21620 + }, + { + "epoch": 0.7225414216996259, + "grad_norm": 0.7321736812591553, + "learning_rate": 0.00018073993185514865, + "loss": 0.4512, + "step": 21630 + }, + { + "epoch": 0.7228754676643506, + "grad_norm": 0.6671245694160461, + "learning_rate": 0.00018071820085768946, + "loss": 0.4181, + "step": 21640 + }, + { + "epoch": 0.7232095136290754, + "grad_norm": 0.6727116703987122, + "learning_rate": 0.00018069645891562606, + "loss": 0.4366, + "step": 21650 + }, + { + "epoch": 0.7235435595938001, + "grad_norm": 0.7012612223625183, + "learning_rate": 0.0001806747060319064, + "loss": 0.3919, + "step": 21660 + }, + { + "epoch": 0.7238776055585249, + "grad_norm": 0.6415188312530518, + "learning_rate": 0.00018065294220948, + "loss": 0.395, + "step": 21670 + }, + { + "epoch": 0.7242116515232496, + "grad_norm": 0.7192271947860718, + "learning_rate": 0.0001806311674512978, + "loss": 0.4073, + "step": 21680 + }, + { + "epoch": 0.7245456974879744, + "grad_norm": 0.6445436477661133, + "learning_rate": 0.00018060938176031225, + "loss": 0.4065, + "step": 21690 + }, + { + "epoch": 0.7248797434526991, + "grad_norm": 0.7137343883514404, + "learning_rate": 0.0001805875851394773, + "loss": 0.4625, + "step": 21700 + }, + { + "epoch": 0.7252137894174239, + "grad_norm": 0.6239537000656128, + "learning_rate": 0.00018056577759174828, + "loss": 0.4006, + "step": 21710 + }, + { + "epoch": 0.7255478353821486, + "grad_norm": 0.7516697645187378, + "learning_rate": 0.00018054395912008218, + "loss": 0.4347, + "step": 21720 + }, + { + "epoch": 0.7258818813468734, + "grad_norm": 0.5589054226875305, + "learning_rate": 0.0001805221297274373, + "loss": 0.4436, + "step": 21730 + }, + { + "epoch": 0.7262159273115981, + "grad_norm": 0.7435981035232544, + "learning_rate": 0.00018050028941677356, + "loss": 0.435, + "step": 21740 + }, + { + "epoch": 0.7265499732763229, + "grad_norm": 0.6540804505348206, + "learning_rate": 0.00018047843819105224, + "loss": 0.4292, + "step": 21750 + }, + { + "epoch": 0.7268840192410476, + "grad_norm": 0.8910009860992432, + "learning_rate": 0.00018045657605323618, + "loss": 0.4606, + "step": 21760 + }, + { + "epoch": 0.7272180652057724, + "grad_norm": 0.6529805064201355, + "learning_rate": 0.00018043470300628967, + "loss": 0.4328, + "step": 21770 + }, + { + "epoch": 0.727552111170497, + "grad_norm": 0.6681368947029114, + "learning_rate": 0.00018041281905317846, + "loss": 0.4258, + "step": 21780 + }, + { + "epoch": 0.7278861571352218, + "grad_norm": 0.7071722149848938, + "learning_rate": 0.00018039092419686983, + "loss": 0.4474, + "step": 21790 + }, + { + "epoch": 0.7282202030999465, + "grad_norm": 0.696273148059845, + "learning_rate": 0.0001803690184403325, + "loss": 0.4201, + "step": 21800 + }, + { + "epoch": 0.7285542490646713, + "grad_norm": 0.7174062728881836, + "learning_rate": 0.0001803471017865367, + "loss": 0.4376, + "step": 21810 + }, + { + "epoch": 0.728888295029396, + "grad_norm": 0.6263520121574402, + "learning_rate": 0.00018032517423845405, + "loss": 0.4, + "step": 21820 + }, + { + "epoch": 0.7292223409941208, + "grad_norm": 0.6765794157981873, + "learning_rate": 0.00018030323579905778, + "loss": 0.4124, + "step": 21830 + }, + { + "epoch": 0.7295563869588455, + "grad_norm": 0.6113195419311523, + "learning_rate": 0.0001802812864713225, + "loss": 0.4007, + "step": 21840 + }, + { + "epoch": 0.7298904329235703, + "grad_norm": 0.6088060736656189, + "learning_rate": 0.00018025932625822435, + "loss": 0.429, + "step": 21850 + }, + { + "epoch": 0.730224478888295, + "grad_norm": 0.7115740776062012, + "learning_rate": 0.0001802373551627409, + "loss": 0.4316, + "step": 21860 + }, + { + "epoch": 0.7305585248530198, + "grad_norm": 0.6083971261978149, + "learning_rate": 0.00018021537318785124, + "loss": 0.4421, + "step": 21870 + }, + { + "epoch": 0.7308925708177445, + "grad_norm": 0.6995817422866821, + "learning_rate": 0.00018019338033653583, + "loss": 0.4266, + "step": 21880 + }, + { + "epoch": 0.7312266167824693, + "grad_norm": 0.6991373300552368, + "learning_rate": 0.00018017137661177678, + "loss": 0.4492, + "step": 21890 + }, + { + "epoch": 0.731560662747194, + "grad_norm": 0.7550511360168457, + "learning_rate": 0.0001801493620165576, + "loss": 0.4601, + "step": 21900 + }, + { + "epoch": 0.7318947087119188, + "grad_norm": 0.6841117143630981, + "learning_rate": 0.00018012733655386315, + "loss": 0.4595, + "step": 21910 + }, + { + "epoch": 0.7322287546766435, + "grad_norm": 0.6272009015083313, + "learning_rate": 0.00018010530022667993, + "loss": 0.4461, + "step": 21920 + }, + { + "epoch": 0.7325628006413683, + "grad_norm": 0.6556243896484375, + "learning_rate": 0.00018008325303799587, + "loss": 0.454, + "step": 21930 + }, + { + "epoch": 0.732896846606093, + "grad_norm": 0.631788432598114, + "learning_rate": 0.00018006119499080033, + "loss": 0.4126, + "step": 21940 + }, + { + "epoch": 0.7332308925708177, + "grad_norm": 0.7121738791465759, + "learning_rate": 0.00018003912608808412, + "loss": 0.3955, + "step": 21950 + }, + { + "epoch": 0.7335649385355425, + "grad_norm": 0.7100335359573364, + "learning_rate": 0.0001800170463328397, + "loss": 0.436, + "step": 21960 + }, + { + "epoch": 0.7338989845002672, + "grad_norm": 0.621306300163269, + "learning_rate": 0.00017999495572806074, + "loss": 0.4602, + "step": 21970 + }, + { + "epoch": 0.734233030464992, + "grad_norm": 0.6288275122642517, + "learning_rate": 0.00017997285427674258, + "loss": 0.4254, + "step": 21980 + }, + { + "epoch": 0.7345670764297167, + "grad_norm": 0.7925300598144531, + "learning_rate": 0.0001799507419818819, + "loss": 0.4258, + "step": 21990 + }, + { + "epoch": 0.7349011223944415, + "grad_norm": 0.5857428908348083, + "learning_rate": 0.000179928618846477, + "loss": 0.4195, + "step": 22000 + }, + { + "epoch": 0.7349011223944415, + "eval_chrf": 86.19489823865186, + "eval_loss": 0.7771641612052917, + "eval_runtime": 91.6956, + "eval_samples_per_second": 1.091, + "eval_steps_per_second": 0.044, + "step": 22000 + }, + { + "epoch": 0.7352351683591662, + "grad_norm": 0.7693691253662109, + "learning_rate": 0.0001799064848735275, + "loss": 0.4158, + "step": 22010 + }, + { + "epoch": 0.735569214323891, + "grad_norm": 0.664253294467926, + "learning_rate": 0.00017988434006603455, + "loss": 0.4313, + "step": 22020 + }, + { + "epoch": 0.7359032602886157, + "grad_norm": 0.7145563960075378, + "learning_rate": 0.00017986218442700083, + "loss": 0.4639, + "step": 22030 + }, + { + "epoch": 0.7362373062533405, + "grad_norm": 0.785377025604248, + "learning_rate": 0.00017984001795943034, + "loss": 0.4458, + "step": 22040 + }, + { + "epoch": 0.7365713522180652, + "grad_norm": 0.6136037111282349, + "learning_rate": 0.00017981784066632873, + "loss": 0.4417, + "step": 22050 + }, + { + "epoch": 0.73690539818279, + "grad_norm": 0.7071608304977417, + "learning_rate": 0.00017979565255070296, + "loss": 0.4513, + "step": 22060 + }, + { + "epoch": 0.7372394441475147, + "grad_norm": 0.6212626695632935, + "learning_rate": 0.00017977345361556158, + "loss": 0.4317, + "step": 22070 + }, + { + "epoch": 0.7375734901122395, + "grad_norm": 0.7804982662200928, + "learning_rate": 0.0001797512438639145, + "loss": 0.4642, + "step": 22080 + }, + { + "epoch": 0.7379075360769642, + "grad_norm": 0.6918339729309082, + "learning_rate": 0.00017972902329877312, + "loss": 0.4804, + "step": 22090 + }, + { + "epoch": 0.7382415820416889, + "grad_norm": 0.8520131707191467, + "learning_rate": 0.00017970679192315042, + "loss": 0.4224, + "step": 22100 + }, + { + "epoch": 0.7385756280064136, + "grad_norm": 0.8075221180915833, + "learning_rate": 0.00017968454974006073, + "loss": 0.4417, + "step": 22110 + }, + { + "epoch": 0.7389096739711384, + "grad_norm": 0.6018359065055847, + "learning_rate": 0.0001796622967525198, + "loss": 0.4003, + "step": 22120 + }, + { + "epoch": 0.7392437199358631, + "grad_norm": 0.5789085030555725, + "learning_rate": 0.00017964003296354504, + "loss": 0.4368, + "step": 22130 + }, + { + "epoch": 0.7395777659005879, + "grad_norm": 0.733235776424408, + "learning_rate": 0.00017961775837615509, + "loss": 0.4576, + "step": 22140 + }, + { + "epoch": 0.7399118118653126, + "grad_norm": 0.7086751461029053, + "learning_rate": 0.00017959547299337025, + "loss": 0.4199, + "step": 22150 + }, + { + "epoch": 0.7402458578300374, + "grad_norm": 0.6634631156921387, + "learning_rate": 0.00017957317681821215, + "loss": 0.4247, + "step": 22160 + }, + { + "epoch": 0.7405799037947621, + "grad_norm": 0.7349246740341187, + "learning_rate": 0.00017955086985370396, + "loss": 0.4099, + "step": 22170 + }, + { + "epoch": 0.7409139497594869, + "grad_norm": 0.6540806889533997, + "learning_rate": 0.00017952855210287026, + "loss": 0.4056, + "step": 22180 + }, + { + "epoch": 0.7412479957242116, + "grad_norm": 0.7440052628517151, + "learning_rate": 0.00017950622356873718, + "loss": 0.4374, + "step": 22190 + }, + { + "epoch": 0.7415820416889364, + "grad_norm": 0.6447760462760925, + "learning_rate": 0.0001794838842543322, + "loss": 0.3961, + "step": 22200 + }, + { + "epoch": 0.7419160876536611, + "grad_norm": 0.6212721467018127, + "learning_rate": 0.00017946153416268434, + "loss": 0.4283, + "step": 22210 + }, + { + "epoch": 0.7422501336183859, + "grad_norm": 0.6497822999954224, + "learning_rate": 0.00017943917329682404, + "loss": 0.4166, + "step": 22220 + }, + { + "epoch": 0.7425841795831106, + "grad_norm": 0.6284915208816528, + "learning_rate": 0.0001794168016597832, + "loss": 0.4273, + "step": 22230 + }, + { + "epoch": 0.7429182255478354, + "grad_norm": 0.7663860321044922, + "learning_rate": 0.00017939441925459524, + "loss": 0.425, + "step": 22240 + }, + { + "epoch": 0.7432522715125601, + "grad_norm": 0.6566057205200195, + "learning_rate": 0.00017937202608429498, + "loss": 0.3698, + "step": 22250 + }, + { + "epoch": 0.7435863174772849, + "grad_norm": 0.738263726234436, + "learning_rate": 0.0001793496221519187, + "loss": 0.443, + "step": 22260 + }, + { + "epoch": 0.7439203634420096, + "grad_norm": 0.6417142748832703, + "learning_rate": 0.00017932720746050417, + "loss": 0.3839, + "step": 22270 + }, + { + "epoch": 0.7442544094067344, + "grad_norm": 0.7931541800498962, + "learning_rate": 0.0001793047820130906, + "loss": 0.4209, + "step": 22280 + }, + { + "epoch": 0.7445884553714591, + "grad_norm": 0.9608420133590698, + "learning_rate": 0.00017928234581271867, + "loss": 0.393, + "step": 22290 + }, + { + "epoch": 0.7449225013361839, + "grad_norm": 0.7376227974891663, + "learning_rate": 0.00017925989886243048, + "loss": 0.4658, + "step": 22300 + }, + { + "epoch": 0.7452565473009086, + "grad_norm": 0.7292225360870361, + "learning_rate": 0.00017923744116526966, + "loss": 0.4652, + "step": 22310 + }, + { + "epoch": 0.7455905932656334, + "grad_norm": 0.6645877361297607, + "learning_rate": 0.00017921497272428125, + "loss": 0.4327, + "step": 22320 + }, + { + "epoch": 0.7459246392303581, + "grad_norm": 0.7131154537200928, + "learning_rate": 0.00017919249354251174, + "loss": 0.4048, + "step": 22330 + }, + { + "epoch": 0.7462586851950829, + "grad_norm": 0.7216104865074158, + "learning_rate": 0.00017917000362300907, + "loss": 0.4196, + "step": 22340 + }, + { + "epoch": 0.7465927311598076, + "grad_norm": 0.825938880443573, + "learning_rate": 0.00017914750296882268, + "loss": 0.4226, + "step": 22350 + }, + { + "epoch": 0.7469267771245324, + "grad_norm": 0.8205549120903015, + "learning_rate": 0.00017912499158300344, + "loss": 0.424, + "step": 22360 + }, + { + "epoch": 0.7472608230892571, + "grad_norm": 0.7015880346298218, + "learning_rate": 0.00017910246946860366, + "loss": 0.4172, + "step": 22370 + }, + { + "epoch": 0.7475948690539819, + "grad_norm": 0.6571441888809204, + "learning_rate": 0.00017907993662867713, + "loss": 0.4493, + "step": 22380 + }, + { + "epoch": 0.7479289150187066, + "grad_norm": 0.6562917232513428, + "learning_rate": 0.00017905739306627911, + "loss": 0.4316, + "step": 22390 + }, + { + "epoch": 0.7482629609834314, + "grad_norm": 0.6513550877571106, + "learning_rate": 0.00017903483878446626, + "loss": 0.4112, + "step": 22400 + }, + { + "epoch": 0.7485970069481561, + "grad_norm": 0.5719326138496399, + "learning_rate": 0.00017901227378629673, + "loss": 0.4097, + "step": 22410 + }, + { + "epoch": 0.7489310529128808, + "grad_norm": 0.7021951079368591, + "learning_rate": 0.0001789896980748301, + "loss": 0.4325, + "step": 22420 + }, + { + "epoch": 0.7492650988776055, + "grad_norm": 0.7036457657814026, + "learning_rate": 0.0001789671116531274, + "loss": 0.395, + "step": 22430 + }, + { + "epoch": 0.7495991448423303, + "grad_norm": 0.6756548285484314, + "learning_rate": 0.0001789445145242512, + "loss": 0.4533, + "step": 22440 + }, + { + "epoch": 0.749933190807055, + "grad_norm": 0.7309603095054626, + "learning_rate": 0.00017892190669126538, + "loss": 0.4352, + "step": 22450 + }, + { + "epoch": 0.7502672367717798, + "grad_norm": 0.7893760204315186, + "learning_rate": 0.00017889928815723537, + "loss": 0.4059, + "step": 22460 + }, + { + "epoch": 0.7506012827365045, + "grad_norm": 0.6318427324295044, + "learning_rate": 0.00017887665892522803, + "loss": 0.4502, + "step": 22470 + }, + { + "epoch": 0.7509353287012293, + "grad_norm": 0.6534332633018494, + "learning_rate": 0.00017885401899831165, + "loss": 0.445, + "step": 22480 + }, + { + "epoch": 0.751269374665954, + "grad_norm": 0.7154189944267273, + "learning_rate": 0.00017883136837955598, + "loss": 0.4294, + "step": 22490 + }, + { + "epoch": 0.7516034206306788, + "grad_norm": 0.7945976257324219, + "learning_rate": 0.00017880870707203227, + "loss": 0.4289, + "step": 22500 + }, + { + "epoch": 0.7519374665954035, + "grad_norm": 0.8473165035247803, + "learning_rate": 0.00017878603507881312, + "loss": 0.4776, + "step": 22510 + }, + { + "epoch": 0.7522715125601283, + "grad_norm": 1.1677203178405762, + "learning_rate": 0.00017876335240297266, + "loss": 0.4183, + "step": 22520 + }, + { + "epoch": 0.752605558524853, + "grad_norm": 0.6810056567192078, + "learning_rate": 0.0001787406590475864, + "loss": 0.4558, + "step": 22530 + }, + { + "epoch": 0.7529396044895778, + "grad_norm": 0.6182177066802979, + "learning_rate": 0.0001787179550157314, + "loss": 0.4423, + "step": 22540 + }, + { + "epoch": 0.7532736504543025, + "grad_norm": 0.7174659967422485, + "learning_rate": 0.0001786952403104861, + "loss": 0.4233, + "step": 22550 + }, + { + "epoch": 0.7536076964190273, + "grad_norm": 0.6998006701469421, + "learning_rate": 0.00017867251493493033, + "loss": 0.415, + "step": 22560 + }, + { + "epoch": 0.753941742383752, + "grad_norm": 0.7289135456085205, + "learning_rate": 0.0001786497788921455, + "loss": 0.4784, + "step": 22570 + }, + { + "epoch": 0.7542757883484768, + "grad_norm": 0.772833526134491, + "learning_rate": 0.00017862703218521431, + "loss": 0.4135, + "step": 22580 + }, + { + "epoch": 0.7546098343132015, + "grad_norm": 0.7057951092720032, + "learning_rate": 0.00017860427481722113, + "loss": 0.4045, + "step": 22590 + }, + { + "epoch": 0.7549438802779262, + "grad_norm": 0.7288662791252136, + "learning_rate": 0.00017858150679125152, + "loss": 0.4716, + "step": 22600 + }, + { + "epoch": 0.755277926242651, + "grad_norm": 0.6740784049034119, + "learning_rate": 0.00017855872811039264, + "loss": 0.3977, + "step": 22610 + }, + { + "epoch": 0.7556119722073757, + "grad_norm": 0.6908742785453796, + "learning_rate": 0.00017853593877773305, + "loss": 0.3968, + "step": 22620 + }, + { + "epoch": 0.7559460181721005, + "grad_norm": 0.6451966166496277, + "learning_rate": 0.00017851313879636277, + "loss": 0.4628, + "step": 22630 + }, + { + "epoch": 0.7562800641368252, + "grad_norm": 0.7048203349113464, + "learning_rate": 0.00017849032816937327, + "loss": 0.4049, + "step": 22640 + }, + { + "epoch": 0.75661411010155, + "grad_norm": 1.0091681480407715, + "learning_rate": 0.00017846750689985743, + "loss": 0.4363, + "step": 22650 + }, + { + "epoch": 0.7569481560662747, + "grad_norm": 0.5633321404457092, + "learning_rate": 0.0001784446749909096, + "loss": 0.4212, + "step": 22660 + }, + { + "epoch": 0.7572822020309995, + "grad_norm": 0.6306982636451721, + "learning_rate": 0.0001784218324456256, + "loss": 0.4462, + "step": 22670 + }, + { + "epoch": 0.7576162479957242, + "grad_norm": 0.6862691640853882, + "learning_rate": 0.00017839897926710254, + "loss": 0.415, + "step": 22680 + }, + { + "epoch": 0.757950293960449, + "grad_norm": 0.5955480933189392, + "learning_rate": 0.00017837611545843922, + "loss": 0.4304, + "step": 22690 + }, + { + "epoch": 0.7582843399251737, + "grad_norm": 0.6343395709991455, + "learning_rate": 0.00017835324102273572, + "loss": 0.4242, + "step": 22700 + }, + { + "epoch": 0.7586183858898985, + "grad_norm": 0.7406463027000427, + "learning_rate": 0.00017833035596309356, + "loss": 0.4361, + "step": 22710 + }, + { + "epoch": 0.7589524318546232, + "grad_norm": 0.8348444700241089, + "learning_rate": 0.0001783074602826157, + "loss": 0.4345, + "step": 22720 + }, + { + "epoch": 0.759286477819348, + "grad_norm": 0.7484140992164612, + "learning_rate": 0.00017828455398440668, + "loss": 0.434, + "step": 22730 + }, + { + "epoch": 0.7596205237840727, + "grad_norm": 0.6391878128051758, + "learning_rate": 0.00017826163707157226, + "loss": 0.457, + "step": 22740 + }, + { + "epoch": 0.7599545697487974, + "grad_norm": 0.8161389231681824, + "learning_rate": 0.00017823870954721983, + "loss": 0.4643, + "step": 22750 + }, + { + "epoch": 0.7602886157135221, + "grad_norm": 0.7142333388328552, + "learning_rate": 0.00017821577141445812, + "loss": 0.4466, + "step": 22760 + }, + { + "epoch": 0.7606226616782469, + "grad_norm": 0.6500018239021301, + "learning_rate": 0.00017819282267639728, + "loss": 0.4796, + "step": 22770 + }, + { + "epoch": 0.7609567076429716, + "grad_norm": 0.6956322193145752, + "learning_rate": 0.00017816986333614896, + "loss": 0.4533, + "step": 22780 + }, + { + "epoch": 0.7612907536076964, + "grad_norm": 0.7166767120361328, + "learning_rate": 0.00017814689339682623, + "loss": 0.4151, + "step": 22790 + }, + { + "epoch": 0.7616247995724211, + "grad_norm": 0.789677619934082, + "learning_rate": 0.0001781239128615436, + "loss": 0.4433, + "step": 22800 + }, + { + "epoch": 0.7619588455371459, + "grad_norm": 0.7657991647720337, + "learning_rate": 0.000178100921733417, + "loss": 0.4256, + "step": 22810 + }, + { + "epoch": 0.7622928915018706, + "grad_norm": 0.861562967300415, + "learning_rate": 0.00017807792001556377, + "loss": 0.4555, + "step": 22820 + }, + { + "epoch": 0.7626269374665954, + "grad_norm": 0.7053157091140747, + "learning_rate": 0.00017805490771110276, + "loss": 0.4473, + "step": 22830 + }, + { + "epoch": 0.7629609834313201, + "grad_norm": 0.7487970590591431, + "learning_rate": 0.00017803188482315422, + "loss": 0.4223, + "step": 22840 + }, + { + "epoch": 0.7632950293960449, + "grad_norm": 0.6151548624038696, + "learning_rate": 0.00017800885135483977, + "loss": 0.4252, + "step": 22850 + }, + { + "epoch": 0.7636290753607696, + "grad_norm": 0.5677239298820496, + "learning_rate": 0.0001779858073092826, + "loss": 0.4634, + "step": 22860 + }, + { + "epoch": 0.7639631213254944, + "grad_norm": 0.6971555948257446, + "learning_rate": 0.00017796275268960722, + "loss": 0.4222, + "step": 22870 + }, + { + "epoch": 0.7642971672902191, + "grad_norm": 0.6108532547950745, + "learning_rate": 0.00017793968749893965, + "loss": 0.4359, + "step": 22880 + }, + { + "epoch": 0.7646312132549439, + "grad_norm": 0.7630313038825989, + "learning_rate": 0.00017791661174040724, + "loss": 0.4498, + "step": 22890 + }, + { + "epoch": 0.7649652592196686, + "grad_norm": 0.6541460752487183, + "learning_rate": 0.00017789352541713887, + "loss": 0.3573, + "step": 22900 + }, + { + "epoch": 0.7652993051843934, + "grad_norm": 0.6682366132736206, + "learning_rate": 0.0001778704285322648, + "loss": 0.4152, + "step": 22910 + }, + { + "epoch": 0.7656333511491181, + "grad_norm": 0.5921030640602112, + "learning_rate": 0.00017784732108891684, + "loss": 0.4109, + "step": 22920 + }, + { + "epoch": 0.7659673971138429, + "grad_norm": 0.6949250102043152, + "learning_rate": 0.000177824203090228, + "loss": 0.4481, + "step": 22930 + }, + { + "epoch": 0.7663014430785676, + "grad_norm": 0.7187597155570984, + "learning_rate": 0.00017780107453933296, + "loss": 0.4313, + "step": 22940 + }, + { + "epoch": 0.7666354890432924, + "grad_norm": 0.5539267063140869, + "learning_rate": 0.00017777793543936765, + "loss": 0.4376, + "step": 22950 + }, + { + "epoch": 0.7669695350080171, + "grad_norm": 0.7610986232757568, + "learning_rate": 0.00017775478579346957, + "loss": 0.4133, + "step": 22960 + }, + { + "epoch": 0.7673035809727419, + "grad_norm": 0.6486761569976807, + "learning_rate": 0.00017773162560477757, + "loss": 0.4095, + "step": 22970 + }, + { + "epoch": 0.7676376269374666, + "grad_norm": 0.580025851726532, + "learning_rate": 0.0001777084548764319, + "loss": 0.4461, + "step": 22980 + }, + { + "epoch": 0.7679716729021914, + "grad_norm": 0.8659842014312744, + "learning_rate": 0.00017768527361157433, + "loss": 0.4038, + "step": 22990 + }, + { + "epoch": 0.7683057188669161, + "grad_norm": 0.6151331663131714, + "learning_rate": 0.00017766208181334804, + "loss": 0.4288, + "step": 23000 + }, + { + "epoch": 0.7683057188669161, + "eval_chrf": 84.10140096332283, + "eval_loss": 0.7622534036636353, + "eval_runtime": 147.1047, + "eval_samples_per_second": 0.68, + "eval_steps_per_second": 0.027, + "step": 23000 + }, + { + "epoch": 0.7686397648316409, + "grad_norm": 0.587854266166687, + "learning_rate": 0.00017763887948489754, + "loss": 0.4183, + "step": 23010 + }, + { + "epoch": 0.7689738107963656, + "grad_norm": 0.720871090888977, + "learning_rate": 0.00017761566662936892, + "loss": 0.4176, + "step": 23020 + }, + { + "epoch": 0.7693078567610904, + "grad_norm": 0.6442272663116455, + "learning_rate": 0.00017759244324990956, + "loss": 0.4171, + "step": 23030 + }, + { + "epoch": 0.7696419027258151, + "grad_norm": 0.6268578171730042, + "learning_rate": 0.00017756920934966833, + "loss": 0.4142, + "step": 23040 + }, + { + "epoch": 0.7699759486905399, + "grad_norm": 0.686083972454071, + "learning_rate": 0.00017754596493179555, + "loss": 0.4197, + "step": 23050 + }, + { + "epoch": 0.7703099946552646, + "grad_norm": 0.7276718616485596, + "learning_rate": 0.00017752270999944294, + "loss": 0.4249, + "step": 23060 + }, + { + "epoch": 0.7706440406199893, + "grad_norm": 0.6035161018371582, + "learning_rate": 0.00017749944455576358, + "loss": 0.476, + "step": 23070 + }, + { + "epoch": 0.770978086584714, + "grad_norm": 0.8865569829940796, + "learning_rate": 0.00017747616860391212, + "loss": 0.4177, + "step": 23080 + }, + { + "epoch": 0.7713121325494388, + "grad_norm": 0.7130288481712341, + "learning_rate": 0.0001774528821470445, + "loss": 0.4071, + "step": 23090 + }, + { + "epoch": 0.7716461785141635, + "grad_norm": 0.866570770740509, + "learning_rate": 0.00017742958518831816, + "loss": 0.4417, + "step": 23100 + }, + { + "epoch": 0.7719802244788883, + "grad_norm": 0.8309292197227478, + "learning_rate": 0.00017740627773089193, + "loss": 0.4372, + "step": 23110 + }, + { + "epoch": 0.772314270443613, + "grad_norm": 0.676877498626709, + "learning_rate": 0.0001773829597779261, + "loss": 0.3939, + "step": 23120 + }, + { + "epoch": 0.7726483164083378, + "grad_norm": 0.7387454509735107, + "learning_rate": 0.00017735963133258228, + "loss": 0.4131, + "step": 23130 + }, + { + "epoch": 0.7729823623730625, + "grad_norm": 0.6792025566101074, + "learning_rate": 0.0001773362923980237, + "loss": 0.416, + "step": 23140 + }, + { + "epoch": 0.7733164083377873, + "grad_norm": 0.8122945427894592, + "learning_rate": 0.0001773129429774148, + "loss": 0.4537, + "step": 23150 + }, + { + "epoch": 0.773650454302512, + "grad_norm": 0.7437384724617004, + "learning_rate": 0.00017728958307392157, + "loss": 0.4303, + "step": 23160 + }, + { + "epoch": 0.7739845002672368, + "grad_norm": 0.6848219037055969, + "learning_rate": 0.00017726621269071138, + "loss": 0.4631, + "step": 23170 + }, + { + "epoch": 0.7743185462319615, + "grad_norm": 0.7761818170547485, + "learning_rate": 0.00017724283183095306, + "loss": 0.412, + "step": 23180 + }, + { + "epoch": 0.7746525921966863, + "grad_norm": 0.6607082486152649, + "learning_rate": 0.00017721944049781674, + "loss": 0.3895, + "step": 23190 + }, + { + "epoch": 0.774986638161411, + "grad_norm": 0.7052908539772034, + "learning_rate": 0.00017719603869447415, + "loss": 0.4255, + "step": 23200 + }, + { + "epoch": 0.7753206841261358, + "grad_norm": 0.6467841267585754, + "learning_rate": 0.00017717262642409832, + "loss": 0.3989, + "step": 23210 + }, + { + "epoch": 0.7756547300908605, + "grad_norm": 0.6168584227561951, + "learning_rate": 0.0001771492036898637, + "loss": 0.4115, + "step": 23220 + }, + { + "epoch": 0.7759887760555852, + "grad_norm": 0.6493638753890991, + "learning_rate": 0.00017712577049494617, + "loss": 0.4429, + "step": 23230 + }, + { + "epoch": 0.77632282202031, + "grad_norm": 0.8112576007843018, + "learning_rate": 0.00017710232684252315, + "loss": 0.4048, + "step": 23240 + }, + { + "epoch": 0.7766568679850347, + "grad_norm": 0.6016640663146973, + "learning_rate": 0.00017707887273577327, + "loss": 0.4125, + "step": 23250 + }, + { + "epoch": 0.7769909139497595, + "grad_norm": 0.7932833433151245, + "learning_rate": 0.0001770554081778767, + "loss": 0.4376, + "step": 23260 + }, + { + "epoch": 0.7773249599144842, + "grad_norm": 0.6744533777236938, + "learning_rate": 0.000177031933172015, + "loss": 0.4391, + "step": 23270 + }, + { + "epoch": 0.777659005879209, + "grad_norm": 0.6871066689491272, + "learning_rate": 0.0001770084477213712, + "loss": 0.4333, + "step": 23280 + }, + { + "epoch": 0.7779930518439337, + "grad_norm": 0.6260595917701721, + "learning_rate": 0.00017698495182912963, + "loss": 0.4255, + "step": 23290 + }, + { + "epoch": 0.7783270978086585, + "grad_norm": 0.6004045009613037, + "learning_rate": 0.00017696144549847616, + "loss": 0.4546, + "step": 23300 + }, + { + "epoch": 0.7786611437733832, + "grad_norm": 0.6693439483642578, + "learning_rate": 0.000176937928732598, + "loss": 0.4227, + "step": 23310 + }, + { + "epoch": 0.778995189738108, + "grad_norm": 0.7820325493812561, + "learning_rate": 0.0001769144015346838, + "loss": 0.4445, + "step": 23320 + }, + { + "epoch": 0.7793292357028327, + "grad_norm": 0.7354835271835327, + "learning_rate": 0.0001768908639079236, + "loss": 0.4446, + "step": 23330 + }, + { + "epoch": 0.7796632816675575, + "grad_norm": 0.6951891183853149, + "learning_rate": 0.00017686731585550892, + "loss": 0.4329, + "step": 23340 + }, + { + "epoch": 0.7799973276322822, + "grad_norm": 0.6497251987457275, + "learning_rate": 0.0001768437573806326, + "loss": 0.3971, + "step": 23350 + }, + { + "epoch": 0.780331373597007, + "grad_norm": 0.648512601852417, + "learning_rate": 0.00017682018848648893, + "loss": 0.3935, + "step": 23360 + }, + { + "epoch": 0.7806654195617317, + "grad_norm": 0.562835156917572, + "learning_rate": 0.0001767966091762737, + "loss": 0.4539, + "step": 23370 + }, + { + "epoch": 0.7809994655264565, + "grad_norm": 0.6753464341163635, + "learning_rate": 0.00017677301945318393, + "loss": 0.436, + "step": 23380 + }, + { + "epoch": 0.7813335114911811, + "grad_norm": 0.8825802206993103, + "learning_rate": 0.00017674941932041826, + "loss": 0.4648, + "step": 23390 + }, + { + "epoch": 0.7816675574559059, + "grad_norm": 0.8675494194030762, + "learning_rate": 0.00017672580878117658, + "loss": 0.467, + "step": 23400 + }, + { + "epoch": 0.7820016034206306, + "grad_norm": 0.7188384532928467, + "learning_rate": 0.00017670218783866022, + "loss": 0.4433, + "step": 23410 + }, + { + "epoch": 0.7823356493853554, + "grad_norm": 0.6739723086357117, + "learning_rate": 0.00017667855649607206, + "loss": 0.4196, + "step": 23420 + }, + { + "epoch": 0.7826696953500801, + "grad_norm": 0.662468433380127, + "learning_rate": 0.0001766549147566162, + "loss": 0.4195, + "step": 23430 + }, + { + "epoch": 0.7830037413148049, + "grad_norm": 0.6583346724510193, + "learning_rate": 0.0001766312626234982, + "loss": 0.4266, + "step": 23440 + }, + { + "epoch": 0.7833377872795296, + "grad_norm": 0.6672881245613098, + "learning_rate": 0.00017660760009992514, + "loss": 0.3992, + "step": 23450 + }, + { + "epoch": 0.7836718332442544, + "grad_norm": 0.5640183091163635, + "learning_rate": 0.00017658392718910537, + "loss": 0.3968, + "step": 23460 + }, + { + "epoch": 0.7840058792089791, + "grad_norm": 0.636432945728302, + "learning_rate": 0.00017656024389424874, + "loss": 0.411, + "step": 23470 + }, + { + "epoch": 0.7843399251737039, + "grad_norm": 0.6627435684204102, + "learning_rate": 0.00017653655021856646, + "loss": 0.4069, + "step": 23480 + }, + { + "epoch": 0.7846739711384286, + "grad_norm": 0.7041613459587097, + "learning_rate": 0.00017651284616527116, + "loss": 0.4083, + "step": 23490 + }, + { + "epoch": 0.7850080171031534, + "grad_norm": 0.6423289775848389, + "learning_rate": 0.00017648913173757687, + "loss": 0.424, + "step": 23500 + }, + { + "epoch": 0.7853420630678781, + "grad_norm": 0.6657639145851135, + "learning_rate": 0.00017646540693869907, + "loss": 0.4365, + "step": 23510 + }, + { + "epoch": 0.7856761090326029, + "grad_norm": 0.7117176055908203, + "learning_rate": 0.00017644167177185461, + "loss": 0.4103, + "step": 23520 + }, + { + "epoch": 0.7860101549973276, + "grad_norm": 0.7509021162986755, + "learning_rate": 0.0001764179262402617, + "loss": 0.4159, + "step": 23530 + }, + { + "epoch": 0.7863442009620524, + "grad_norm": 0.7692280411720276, + "learning_rate": 0.00017639417034714006, + "loss": 0.442, + "step": 23540 + }, + { + "epoch": 0.7866782469267771, + "grad_norm": 0.6991557478904724, + "learning_rate": 0.00017637040409571072, + "loss": 0.4196, + "step": 23550 + }, + { + "epoch": 0.7870122928915019, + "grad_norm": 0.5447357892990112, + "learning_rate": 0.00017634662748919615, + "loss": 0.4152, + "step": 23560 + }, + { + "epoch": 0.7873463388562266, + "grad_norm": 0.5971201658248901, + "learning_rate": 0.00017632284053082026, + "loss": 0.4029, + "step": 23570 + }, + { + "epoch": 0.7876803848209514, + "grad_norm": 0.6207103133201599, + "learning_rate": 0.00017629904322380834, + "loss": 0.3925, + "step": 23580 + }, + { + "epoch": 0.7880144307856761, + "grad_norm": 0.814227283000946, + "learning_rate": 0.00017627523557138698, + "loss": 0.4562, + "step": 23590 + }, + { + "epoch": 0.7883484767504009, + "grad_norm": 0.7055184841156006, + "learning_rate": 0.0001762514175767844, + "loss": 0.4349, + "step": 23600 + }, + { + "epoch": 0.7886825227151256, + "grad_norm": 0.6715635657310486, + "learning_rate": 0.00017622758924323, + "loss": 0.4237, + "step": 23610 + }, + { + "epoch": 0.7890165686798504, + "grad_norm": 0.6182885766029358, + "learning_rate": 0.00017620375057395465, + "loss": 0.4224, + "step": 23620 + }, + { + "epoch": 0.7893506146445751, + "grad_norm": 0.6695681810379028, + "learning_rate": 0.00017617990157219076, + "loss": 0.4346, + "step": 23630 + }, + { + "epoch": 0.7896846606092999, + "grad_norm": 0.7520004510879517, + "learning_rate": 0.00017615604224117192, + "loss": 0.4243, + "step": 23640 + }, + { + "epoch": 0.7900187065740246, + "grad_norm": 0.7665175199508667, + "learning_rate": 0.00017613217258413322, + "loss": 0.4474, + "step": 23650 + }, + { + "epoch": 0.7903527525387494, + "grad_norm": 0.6217501759529114, + "learning_rate": 0.0001761082926043112, + "loss": 0.4599, + "step": 23660 + }, + { + "epoch": 0.7906867985034741, + "grad_norm": 0.5703338980674744, + "learning_rate": 0.00017608440230494377, + "loss": 0.4021, + "step": 23670 + }, + { + "epoch": 0.7910208444681989, + "grad_norm": 0.6274169683456421, + "learning_rate": 0.00017606050168927014, + "loss": 0.4364, + "step": 23680 + }, + { + "epoch": 0.7913548904329236, + "grad_norm": 0.7083061933517456, + "learning_rate": 0.00017603659076053112, + "loss": 0.44, + "step": 23690 + }, + { + "epoch": 0.7916889363976484, + "grad_norm": 0.6725417971611023, + "learning_rate": 0.00017601266952196868, + "loss": 0.4444, + "step": 23700 + }, + { + "epoch": 0.7920229823623731, + "grad_norm": 0.813775897026062, + "learning_rate": 0.00017598873797682637, + "loss": 0.4324, + "step": 23710 + }, + { + "epoch": 0.7923570283270978, + "grad_norm": 0.5310735702514648, + "learning_rate": 0.0001759647961283491, + "loss": 0.3981, + "step": 23720 + }, + { + "epoch": 0.7926910742918225, + "grad_norm": 0.7437506318092346, + "learning_rate": 0.0001759408439797831, + "loss": 0.4291, + "step": 23730 + }, + { + "epoch": 0.7930251202565473, + "grad_norm": 0.6039184331893921, + "learning_rate": 0.00017591688153437604, + "loss": 0.4032, + "step": 23740 + }, + { + "epoch": 0.793359166221272, + "grad_norm": 0.6879783868789673, + "learning_rate": 0.00017589290879537704, + "loss": 0.4331, + "step": 23750 + }, + { + "epoch": 0.7936932121859968, + "grad_norm": 0.6444907784461975, + "learning_rate": 0.00017586892576603656, + "loss": 0.4054, + "step": 23760 + }, + { + "epoch": 0.7940272581507215, + "grad_norm": 0.6556650996208191, + "learning_rate": 0.00017584493244960644, + "loss": 0.4295, + "step": 23770 + }, + { + "epoch": 0.7943613041154463, + "grad_norm": 0.6469134092330933, + "learning_rate": 0.00017582092884933993, + "loss": 0.4165, + "step": 23780 + }, + { + "epoch": 0.794695350080171, + "grad_norm": 0.6427074074745178, + "learning_rate": 0.00017579691496849177, + "loss": 0.45, + "step": 23790 + }, + { + "epoch": 0.7950293960448958, + "grad_norm": 0.7316064238548279, + "learning_rate": 0.00017577289081031794, + "loss": 0.4348, + "step": 23800 + }, + { + "epoch": 0.7953634420096205, + "grad_norm": 0.6674372553825378, + "learning_rate": 0.0001757488563780759, + "loss": 0.4264, + "step": 23810 + }, + { + "epoch": 0.7956974879743453, + "grad_norm": 0.712563157081604, + "learning_rate": 0.00017572481167502443, + "loss": 0.4158, + "step": 23820 + }, + { + "epoch": 0.79603153393907, + "grad_norm": 0.606708824634552, + "learning_rate": 0.00017570075670442388, + "loss": 0.4268, + "step": 23830 + }, + { + "epoch": 0.7963655799037948, + "grad_norm": 0.8621882796287537, + "learning_rate": 0.00017567669146953575, + "loss": 0.4122, + "step": 23840 + }, + { + "epoch": 0.7966996258685195, + "grad_norm": 0.7085481882095337, + "learning_rate": 0.00017565261597362315, + "loss": 0.4366, + "step": 23850 + }, + { + "epoch": 0.7970336718332443, + "grad_norm": 0.7786434888839722, + "learning_rate": 0.00017562853021995038, + "loss": 0.4336, + "step": 23860 + }, + { + "epoch": 0.797367717797969, + "grad_norm": 0.6570208072662354, + "learning_rate": 0.00017560443421178333, + "loss": 0.4282, + "step": 23870 + }, + { + "epoch": 0.7977017637626937, + "grad_norm": 0.6492790579795837, + "learning_rate": 0.00017558032795238912, + "loss": 0.4189, + "step": 23880 + }, + { + "epoch": 0.7980358097274185, + "grad_norm": 0.6207686066627502, + "learning_rate": 0.00017555621144503636, + "loss": 0.4284, + "step": 23890 + }, + { + "epoch": 0.7983698556921432, + "grad_norm": 0.8141191005706787, + "learning_rate": 0.000175532084692995, + "loss": 0.3992, + "step": 23900 + }, + { + "epoch": 0.798703901656868, + "grad_norm": 0.6882479786872864, + "learning_rate": 0.0001755079476995364, + "loss": 0.4255, + "step": 23910 + }, + { + "epoch": 0.7990379476215927, + "grad_norm": 0.7048838138580322, + "learning_rate": 0.0001754838004679333, + "loss": 0.4424, + "step": 23920 + }, + { + "epoch": 0.7993719935863175, + "grad_norm": 0.6536533236503601, + "learning_rate": 0.00017545964300145982, + "loss": 0.4227, + "step": 23930 + }, + { + "epoch": 0.7997060395510422, + "grad_norm": 0.702521800994873, + "learning_rate": 0.0001754354753033915, + "loss": 0.4047, + "step": 23940 + }, + { + "epoch": 0.800040085515767, + "grad_norm": 0.6642126441001892, + "learning_rate": 0.00017541129737700523, + "loss": 0.3733, + "step": 23950 + }, + { + "epoch": 0.8003741314804917, + "grad_norm": 0.7971897125244141, + "learning_rate": 0.0001753871092255793, + "loss": 0.4465, + "step": 23960 + }, + { + "epoch": 0.8007081774452165, + "grad_norm": 0.6622767448425293, + "learning_rate": 0.00017536291085239338, + "loss": 0.42, + "step": 23970 + }, + { + "epoch": 0.8010422234099412, + "grad_norm": 0.6437558531761169, + "learning_rate": 0.00017533870226072855, + "loss": 0.3864, + "step": 23980 + }, + { + "epoch": 0.801376269374666, + "grad_norm": 0.7265076041221619, + "learning_rate": 0.00017531448345386728, + "loss": 0.4333, + "step": 23990 + }, + { + "epoch": 0.8017103153393907, + "grad_norm": 0.6434394121170044, + "learning_rate": 0.00017529025443509334, + "loss": 0.4406, + "step": 24000 + }, + { + "epoch": 0.8017103153393907, + "eval_chrf": 83.95554782755596, + "eval_loss": 0.7625317573547363, + "eval_runtime": 191.697, + "eval_samples_per_second": 0.522, + "eval_steps_per_second": 0.021, + "step": 24000 + }, + { + "epoch": 0.8020443613041155, + "grad_norm": 0.7294747233390808, + "learning_rate": 0.00017526601520769202, + "loss": 0.4141, + "step": 24010 + }, + { + "epoch": 0.8023784072688402, + "grad_norm": 0.7360848188400269, + "learning_rate": 0.00017524176577494988, + "loss": 0.4056, + "step": 24020 + }, + { + "epoch": 0.802712453233565, + "grad_norm": 0.5751156210899353, + "learning_rate": 0.00017521750614015496, + "loss": 0.4654, + "step": 24030 + }, + { + "epoch": 0.8030464991982896, + "grad_norm": 0.6141709089279175, + "learning_rate": 0.00017519323630659655, + "loss": 0.3892, + "step": 24040 + }, + { + "epoch": 0.8033805451630144, + "grad_norm": 0.6972567439079285, + "learning_rate": 0.00017516895627756548, + "loss": 0.4364, + "step": 24050 + }, + { + "epoch": 0.8037145911277391, + "grad_norm": 0.7536477446556091, + "learning_rate": 0.00017514466605635386, + "loss": 0.4429, + "step": 24060 + }, + { + "epoch": 0.8040486370924639, + "grad_norm": 0.6884116530418396, + "learning_rate": 0.00017512036564625515, + "loss": 0.4494, + "step": 24070 + }, + { + "epoch": 0.8043826830571886, + "grad_norm": 0.6547605991363525, + "learning_rate": 0.00017509605505056433, + "loss": 0.4699, + "step": 24080 + }, + { + "epoch": 0.8047167290219134, + "grad_norm": 0.8067330718040466, + "learning_rate": 0.00017507173427257767, + "loss": 0.3878, + "step": 24090 + }, + { + "epoch": 0.8050507749866381, + "grad_norm": 0.6059708595275879, + "learning_rate": 0.0001750474033155928, + "loss": 0.4076, + "step": 24100 + }, + { + "epoch": 0.8053848209513629, + "grad_norm": 0.7619866132736206, + "learning_rate": 0.00017502306218290876, + "loss": 0.4173, + "step": 24110 + }, + { + "epoch": 0.8057188669160876, + "grad_norm": 0.6193556189537048, + "learning_rate": 0.00017499871087782603, + "loss": 0.4181, + "step": 24120 + }, + { + "epoch": 0.8060529128808124, + "grad_norm": 0.6309000253677368, + "learning_rate": 0.0001749743494036463, + "loss": 0.4093, + "step": 24130 + }, + { + "epoch": 0.8063869588455371, + "grad_norm": 0.6974467635154724, + "learning_rate": 0.00017494997776367285, + "loss": 0.4482, + "step": 24140 + }, + { + "epoch": 0.8067210048102619, + "grad_norm": 0.5978614687919617, + "learning_rate": 0.00017492559596121021, + "loss": 0.4275, + "step": 24150 + }, + { + "epoch": 0.8070550507749866, + "grad_norm": 0.6016488671302795, + "learning_rate": 0.00017490120399956432, + "loss": 0.4044, + "step": 24160 + }, + { + "epoch": 0.8073890967397114, + "grad_norm": 0.6971710920333862, + "learning_rate": 0.00017487680188204245, + "loss": 0.4184, + "step": 24170 + }, + { + "epoch": 0.8077231427044361, + "grad_norm": 0.5432742238044739, + "learning_rate": 0.00017485238961195335, + "loss": 0.3997, + "step": 24180 + }, + { + "epoch": 0.8080571886691609, + "grad_norm": 0.5341528058052063, + "learning_rate": 0.00017482796719260705, + "loss": 0.4128, + "step": 24190 + }, + { + "epoch": 0.8083912346338856, + "grad_norm": 0.5989844799041748, + "learning_rate": 0.000174803534627315, + "loss": 0.4439, + "step": 24200 + }, + { + "epoch": 0.8087252805986104, + "grad_norm": 0.6933886408805847, + "learning_rate": 0.00017477909191939003, + "loss": 0.4337, + "step": 24210 + }, + { + "epoch": 0.8090593265633351, + "grad_norm": 0.7579688429832458, + "learning_rate": 0.0001747546390721463, + "loss": 0.4385, + "step": 24220 + }, + { + "epoch": 0.8093933725280599, + "grad_norm": 0.7709610462188721, + "learning_rate": 0.00017473017608889943, + "loss": 0.4459, + "step": 24230 + }, + { + "epoch": 0.8097274184927846, + "grad_norm": 0.6555464863777161, + "learning_rate": 0.00017470570297296634, + "loss": 0.4169, + "step": 24240 + }, + { + "epoch": 0.8100614644575094, + "grad_norm": 0.7334009408950806, + "learning_rate": 0.0001746812197276653, + "loss": 0.442, + "step": 24250 + }, + { + "epoch": 0.8103955104222341, + "grad_norm": 0.6087445616722107, + "learning_rate": 0.00017465672635631608, + "loss": 0.391, + "step": 24260 + }, + { + "epoch": 0.8107295563869589, + "grad_norm": 0.6212217211723328, + "learning_rate": 0.00017463222286223973, + "loss": 0.4062, + "step": 24270 + }, + { + "epoch": 0.8110636023516836, + "grad_norm": 0.7618390321731567, + "learning_rate": 0.00017460770924875862, + "loss": 0.4159, + "step": 24280 + }, + { + "epoch": 0.8113976483164084, + "grad_norm": 0.710512638092041, + "learning_rate": 0.00017458318551919662, + "loss": 0.3942, + "step": 24290 + }, + { + "epoch": 0.8117316942811331, + "grad_norm": 0.749927818775177, + "learning_rate": 0.00017455865167687891, + "loss": 0.4538, + "step": 24300 + }, + { + "epoch": 0.8120657402458579, + "grad_norm": 0.75137859582901, + "learning_rate": 0.000174534107725132, + "loss": 0.4334, + "step": 24310 + }, + { + "epoch": 0.8123997862105826, + "grad_norm": 0.6255035400390625, + "learning_rate": 0.0001745095536672838, + "loss": 0.4361, + "step": 24320 + }, + { + "epoch": 0.8127338321753074, + "grad_norm": 0.5565643906593323, + "learning_rate": 0.00017448498950666368, + "loss": 0.3933, + "step": 24330 + }, + { + "epoch": 0.8130678781400321, + "grad_norm": 0.5897197723388672, + "learning_rate": 0.00017446041524660225, + "loss": 0.4096, + "step": 24340 + }, + { + "epoch": 0.8134019241047569, + "grad_norm": 0.7459964156150818, + "learning_rate": 0.0001744358308904315, + "loss": 0.4216, + "step": 24350 + }, + { + "epoch": 0.8137359700694815, + "grad_norm": 0.7526453733444214, + "learning_rate": 0.00017441123644148493, + "loss": 0.4389, + "step": 24360 + }, + { + "epoch": 0.8140700160342063, + "grad_norm": 0.5306355953216553, + "learning_rate": 0.00017438663190309727, + "loss": 0.4137, + "step": 24370 + }, + { + "epoch": 0.814404061998931, + "grad_norm": 0.7461660504341125, + "learning_rate": 0.00017436201727860458, + "loss": 0.4246, + "step": 24380 + }, + { + "epoch": 0.8147381079636558, + "grad_norm": 0.7933438420295715, + "learning_rate": 0.00017433739257134446, + "loss": 0.4234, + "step": 24390 + }, + { + "epoch": 0.8150721539283805, + "grad_norm": 0.5554569363594055, + "learning_rate": 0.00017431275778465578, + "loss": 0.4633, + "step": 24400 + }, + { + "epoch": 0.8154061998931053, + "grad_norm": 0.781414270401001, + "learning_rate": 0.0001742881129218787, + "loss": 0.4303, + "step": 24410 + }, + { + "epoch": 0.81574024585783, + "grad_norm": 0.6716150641441345, + "learning_rate": 0.0001742634579863549, + "loss": 0.4512, + "step": 24420 + }, + { + "epoch": 0.8160742918225548, + "grad_norm": 0.8257132768630981, + "learning_rate": 0.00017423879298142727, + "loss": 0.4454, + "step": 24430 + }, + { + "epoch": 0.8164083377872795, + "grad_norm": 0.660959005355835, + "learning_rate": 0.0001742141179104402, + "loss": 0.407, + "step": 24440 + }, + { + "epoch": 0.8167423837520043, + "grad_norm": 0.7980122566223145, + "learning_rate": 0.00017418943277673944, + "loss": 0.3873, + "step": 24450 + }, + { + "epoch": 0.817076429716729, + "grad_norm": 0.5739338397979736, + "learning_rate": 0.000174164737583672, + "loss": 0.4372, + "step": 24460 + }, + { + "epoch": 0.8174104756814538, + "grad_norm": 0.6544968485832214, + "learning_rate": 0.00017414003233458627, + "loss": 0.4107, + "step": 24470 + }, + { + "epoch": 0.8177445216461785, + "grad_norm": 0.8732417821884155, + "learning_rate": 0.00017411531703283206, + "loss": 0.4357, + "step": 24480 + }, + { + "epoch": 0.8180785676109033, + "grad_norm": 0.7663009762763977, + "learning_rate": 0.00017409059168176056, + "loss": 0.4414, + "step": 24490 + }, + { + "epoch": 0.818412613575628, + "grad_norm": 0.6907883286476135, + "learning_rate": 0.0001740658562847243, + "loss": 0.3835, + "step": 24500 + }, + { + "epoch": 0.8187466595403528, + "grad_norm": 0.6912271976470947, + "learning_rate": 0.0001740411108450771, + "loss": 0.4406, + "step": 24510 + }, + { + "epoch": 0.8190807055050775, + "grad_norm": 0.629127025604248, + "learning_rate": 0.00017401635536617424, + "loss": 0.4021, + "step": 24520 + }, + { + "epoch": 0.8194147514698022, + "grad_norm": 0.6233227849006653, + "learning_rate": 0.00017399158985137233, + "loss": 0.4088, + "step": 24530 + }, + { + "epoch": 0.819748797434527, + "grad_norm": 0.546593189239502, + "learning_rate": 0.0001739668143040293, + "loss": 0.4106, + "step": 24540 + }, + { + "epoch": 0.8200828433992517, + "grad_norm": 0.7162787318229675, + "learning_rate": 0.00017394202872750452, + "loss": 0.4387, + "step": 24550 + }, + { + "epoch": 0.8204168893639765, + "grad_norm": 0.5872108340263367, + "learning_rate": 0.00017391723312515863, + "loss": 0.4333, + "step": 24560 + }, + { + "epoch": 0.8207509353287012, + "grad_norm": 0.5151611566543579, + "learning_rate": 0.00017389242750035367, + "loss": 0.4212, + "step": 24570 + }, + { + "epoch": 0.821084981293426, + "grad_norm": 0.6718699336051941, + "learning_rate": 0.00017386761185645306, + "loss": 0.4447, + "step": 24580 + }, + { + "epoch": 0.8214190272581507, + "grad_norm": 0.6012167930603027, + "learning_rate": 0.0001738427861968216, + "loss": 0.4149, + "step": 24590 + }, + { + "epoch": 0.8217530732228755, + "grad_norm": 0.6226468086242676, + "learning_rate": 0.00017381795052482532, + "loss": 0.3999, + "step": 24600 + }, + { + "epoch": 0.8220871191876002, + "grad_norm": 0.6110774874687195, + "learning_rate": 0.00017379310484383177, + "loss": 0.41, + "step": 24610 + }, + { + "epoch": 0.822421165152325, + "grad_norm": 0.6201490759849548, + "learning_rate": 0.00017376824915720976, + "loss": 0.443, + "step": 24620 + }, + { + "epoch": 0.8227552111170497, + "grad_norm": 0.7229218482971191, + "learning_rate": 0.00017374338346832948, + "loss": 0.4216, + "step": 24630 + }, + { + "epoch": 0.8230892570817745, + "grad_norm": 0.5639005303382874, + "learning_rate": 0.00017371850778056248, + "loss": 0.4206, + "step": 24640 + }, + { + "epoch": 0.8234233030464992, + "grad_norm": 0.6005734205245972, + "learning_rate": 0.00017369362209728164, + "loss": 0.4161, + "step": 24650 + }, + { + "epoch": 0.823757349011224, + "grad_norm": 0.7343204021453857, + "learning_rate": 0.00017366872642186123, + "loss": 0.4594, + "step": 24660 + }, + { + "epoch": 0.8240913949759487, + "grad_norm": 0.6772376298904419, + "learning_rate": 0.0001736438207576769, + "loss": 0.4303, + "step": 24670 + }, + { + "epoch": 0.8244254409406734, + "grad_norm": 0.5763529539108276, + "learning_rate": 0.00017361890510810555, + "loss": 0.4325, + "step": 24680 + }, + { + "epoch": 0.8247594869053981, + "grad_norm": 0.6908073425292969, + "learning_rate": 0.0001735939794765255, + "loss": 0.4428, + "step": 24690 + }, + { + "epoch": 0.8250935328701229, + "grad_norm": 0.6545168161392212, + "learning_rate": 0.0001735690438663165, + "loss": 0.441, + "step": 24700 + }, + { + "epoch": 0.8254275788348476, + "grad_norm": 0.6933400630950928, + "learning_rate": 0.0001735440982808595, + "loss": 0.4512, + "step": 24710 + }, + { + "epoch": 0.8257616247995724, + "grad_norm": 0.672709047794342, + "learning_rate": 0.00017351914272353694, + "loss": 0.4573, + "step": 24720 + }, + { + "epoch": 0.8260956707642971, + "grad_norm": 0.8517891764640808, + "learning_rate": 0.00017349417719773254, + "loss": 0.4299, + "step": 24730 + }, + { + "epoch": 0.8264297167290219, + "grad_norm": 0.6193088889122009, + "learning_rate": 0.0001734692017068313, + "loss": 0.3791, + "step": 24740 + }, + { + "epoch": 0.8267637626937466, + "grad_norm": 0.6512857675552368, + "learning_rate": 0.00017344421625421975, + "loss": 0.4066, + "step": 24750 + }, + { + "epoch": 0.8270978086584714, + "grad_norm": 0.5988582372665405, + "learning_rate": 0.00017341922084328565, + "loss": 0.4037, + "step": 24760 + }, + { + "epoch": 0.8274318546231961, + "grad_norm": 0.6200369000434875, + "learning_rate": 0.00017339421547741814, + "loss": 0.407, + "step": 24770 + }, + { + "epoch": 0.8277659005879209, + "grad_norm": 0.6998265385627747, + "learning_rate": 0.00017336920016000767, + "loss": 0.4118, + "step": 24780 + }, + { + "epoch": 0.8280999465526456, + "grad_norm": 0.7839971780776978, + "learning_rate": 0.0001733441748944461, + "loss": 0.416, + "step": 24790 + }, + { + "epoch": 0.8284339925173704, + "grad_norm": 0.581222653388977, + "learning_rate": 0.0001733191396841266, + "loss": 0.3982, + "step": 24800 + }, + { + "epoch": 0.8287680384820951, + "grad_norm": 0.6835952997207642, + "learning_rate": 0.00017329409453244372, + "loss": 0.4067, + "step": 24810 + }, + { + "epoch": 0.8291020844468199, + "grad_norm": 0.6868036985397339, + "learning_rate": 0.00017326903944279333, + "loss": 0.4324, + "step": 24820 + }, + { + "epoch": 0.8294361304115446, + "grad_norm": 0.8105065226554871, + "learning_rate": 0.00017324397441857267, + "loss": 0.4218, + "step": 24830 + }, + { + "epoch": 0.8297701763762694, + "grad_norm": 0.7486416697502136, + "learning_rate": 0.0001732188994631803, + "loss": 0.4313, + "step": 24840 + }, + { + "epoch": 0.8301042223409941, + "grad_norm": 0.6400812864303589, + "learning_rate": 0.00017319381458001613, + "loss": 0.4204, + "step": 24850 + }, + { + "epoch": 0.8304382683057189, + "grad_norm": 0.5999836921691895, + "learning_rate": 0.00017316871977248144, + "loss": 0.4352, + "step": 24860 + }, + { + "epoch": 0.8307723142704436, + "grad_norm": 0.7619590759277344, + "learning_rate": 0.00017314361504397886, + "loss": 0.4391, + "step": 24870 + }, + { + "epoch": 0.8311063602351684, + "grad_norm": 0.5928987860679626, + "learning_rate": 0.0001731185003979123, + "loss": 0.4307, + "step": 24880 + }, + { + "epoch": 0.8314404061998931, + "grad_norm": 0.608038067817688, + "learning_rate": 0.0001730933758376871, + "loss": 0.3959, + "step": 24890 + }, + { + "epoch": 0.8317744521646179, + "grad_norm": 0.8409287929534912, + "learning_rate": 0.00017306824136670992, + "loss": 0.469, + "step": 24900 + }, + { + "epoch": 0.8321084981293426, + "grad_norm": 0.6493387818336487, + "learning_rate": 0.00017304309698838873, + "loss": 0.4513, + "step": 24910 + }, + { + "epoch": 0.8324425440940674, + "grad_norm": 0.6364259123802185, + "learning_rate": 0.00017301794270613283, + "loss": 0.4048, + "step": 24920 + }, + { + "epoch": 0.8327765900587921, + "grad_norm": 0.64515620470047, + "learning_rate": 0.00017299277852335292, + "loss": 0.438, + "step": 24930 + }, + { + "epoch": 0.8331106360235169, + "grad_norm": 0.6530366539955139, + "learning_rate": 0.00017296760444346102, + "loss": 0.4233, + "step": 24940 + }, + { + "epoch": 0.8334446819882416, + "grad_norm": 0.5853860378265381, + "learning_rate": 0.00017294242046987055, + "loss": 0.4222, + "step": 24950 + }, + { + "epoch": 0.8337787279529664, + "grad_norm": 0.5941686034202576, + "learning_rate": 0.0001729172266059961, + "loss": 0.4158, + "step": 24960 + }, + { + "epoch": 0.8341127739176911, + "grad_norm": 0.646035373210907, + "learning_rate": 0.0001728920228552538, + "loss": 0.4489, + "step": 24970 + }, + { + "epoch": 0.8344468198824159, + "grad_norm": 0.5416696071624756, + "learning_rate": 0.00017286680922106102, + "loss": 0.4307, + "step": 24980 + }, + { + "epoch": 0.8347808658471406, + "grad_norm": 0.7034637928009033, + "learning_rate": 0.00017284158570683643, + "loss": 0.3823, + "step": 24990 + }, + { + "epoch": 0.8351149118118654, + "grad_norm": 0.5743913650512695, + "learning_rate": 0.00017281635231600017, + "loss": 0.4203, + "step": 25000 + }, + { + "epoch": 0.8351149118118654, + "eval_chrf": 86.22079012662057, + "eval_loss": 0.7676324844360352, + "eval_runtime": 169.6949, + "eval_samples_per_second": 0.589, + "eval_steps_per_second": 0.024, + "step": 25000 + }, + { + "epoch": 0.83544895777659, + "grad_norm": 0.5959119200706482, + "learning_rate": 0.0001727911090519736, + "loss": 0.4139, + "step": 25010 + }, + { + "epoch": 0.8357830037413148, + "grad_norm": 0.5718786120414734, + "learning_rate": 0.00017276585591817948, + "loss": 0.4083, + "step": 25020 + }, + { + "epoch": 0.8361170497060395, + "grad_norm": 0.7640865445137024, + "learning_rate": 0.00017274059291804184, + "loss": 0.4748, + "step": 25030 + }, + { + "epoch": 0.8364510956707643, + "grad_norm": 0.47738611698150635, + "learning_rate": 0.00017271532005498616, + "loss": 0.4192, + "step": 25040 + }, + { + "epoch": 0.836785141635489, + "grad_norm": 0.8254352807998657, + "learning_rate": 0.00017269003733243917, + "loss": 0.4239, + "step": 25050 + }, + { + "epoch": 0.8371191876002138, + "grad_norm": 0.607133686542511, + "learning_rate": 0.00017266474475382895, + "loss": 0.398, + "step": 25060 + }, + { + "epoch": 0.8374532335649385, + "grad_norm": 0.7793869972229004, + "learning_rate": 0.00017263944232258493, + "loss": 0.393, + "step": 25070 + }, + { + "epoch": 0.8377872795296633, + "grad_norm": 0.693407416343689, + "learning_rate": 0.00017261413004213792, + "loss": 0.4157, + "step": 25080 + }, + { + "epoch": 0.838121325494388, + "grad_norm": 0.7283529043197632, + "learning_rate": 0.00017258880791591992, + "loss": 0.4674, + "step": 25090 + }, + { + "epoch": 0.8384553714591128, + "grad_norm": 0.6255287528038025, + "learning_rate": 0.00017256347594736448, + "loss": 0.4197, + "step": 25100 + }, + { + "epoch": 0.8387894174238375, + "grad_norm": 0.6156442165374756, + "learning_rate": 0.00017253813413990632, + "loss": 0.4031, + "step": 25110 + }, + { + "epoch": 0.8391234633885623, + "grad_norm": 0.6078478097915649, + "learning_rate": 0.00017251278249698154, + "loss": 0.4173, + "step": 25120 + }, + { + "epoch": 0.839457509353287, + "grad_norm": 0.6068661212921143, + "learning_rate": 0.00017248742102202752, + "loss": 0.4259, + "step": 25130 + }, + { + "epoch": 0.8397915553180118, + "grad_norm": 0.6246240139007568, + "learning_rate": 0.00017246204971848315, + "loss": 0.429, + "step": 25140 + }, + { + "epoch": 0.8401256012827365, + "grad_norm": 0.6518892049789429, + "learning_rate": 0.00017243666858978845, + "loss": 0.4369, + "step": 25150 + }, + { + "epoch": 0.8404596472474613, + "grad_norm": 0.5389935374259949, + "learning_rate": 0.00017241127763938484, + "loss": 0.4243, + "step": 25160 + }, + { + "epoch": 0.840793693212186, + "grad_norm": 0.7755690813064575, + "learning_rate": 0.00017238587687071517, + "loss": 0.4277, + "step": 25170 + }, + { + "epoch": 0.8411277391769107, + "grad_norm": 0.6192887425422668, + "learning_rate": 0.00017236046628722344, + "loss": 0.4107, + "step": 25180 + }, + { + "epoch": 0.8414617851416355, + "grad_norm": 0.6739871501922607, + "learning_rate": 0.00017233504589235515, + "loss": 0.4419, + "step": 25190 + }, + { + "epoch": 0.8417958311063602, + "grad_norm": 0.6197271347045898, + "learning_rate": 0.00017230961568955703, + "loss": 0.4113, + "step": 25200 + }, + { + "epoch": 0.842129877071085, + "grad_norm": 0.6555098295211792, + "learning_rate": 0.0001722841756822772, + "loss": 0.4077, + "step": 25210 + }, + { + "epoch": 0.8424639230358097, + "grad_norm": 0.7171416878700256, + "learning_rate": 0.00017225872587396502, + "loss": 0.4095, + "step": 25220 + }, + { + "epoch": 0.8427979690005345, + "grad_norm": 0.8059936761856079, + "learning_rate": 0.00017223326626807125, + "loss": 0.4191, + "step": 25230 + }, + { + "epoch": 0.8431320149652592, + "grad_norm": 0.7512304782867432, + "learning_rate": 0.000172207796868048, + "loss": 0.4331, + "step": 25240 + }, + { + "epoch": 0.843466060929984, + "grad_norm": 0.5571887493133545, + "learning_rate": 0.00017218231767734867, + "loss": 0.3975, + "step": 25250 + }, + { + "epoch": 0.8438001068947087, + "grad_norm": 0.7273104190826416, + "learning_rate": 0.00017215682869942795, + "loss": 0.3947, + "step": 25260 + }, + { + "epoch": 0.8441341528594335, + "grad_norm": 0.8506426811218262, + "learning_rate": 0.00017213132993774196, + "loss": 0.4344, + "step": 25270 + }, + { + "epoch": 0.8444681988241582, + "grad_norm": 0.6240358352661133, + "learning_rate": 0.00017210582139574803, + "loss": 0.4531, + "step": 25280 + }, + { + "epoch": 0.844802244788883, + "grad_norm": 0.6470492482185364, + "learning_rate": 0.00017208030307690487, + "loss": 0.4313, + "step": 25290 + }, + { + "epoch": 0.8451362907536077, + "grad_norm": 0.6726449728012085, + "learning_rate": 0.00017205477498467256, + "loss": 0.4569, + "step": 25300 + }, + { + "epoch": 0.8454703367183325, + "grad_norm": 0.6776848435401917, + "learning_rate": 0.0001720292371225124, + "loss": 0.4215, + "step": 25310 + }, + { + "epoch": 0.8458043826830572, + "grad_norm": 0.7808078527450562, + "learning_rate": 0.0001720036894938872, + "loss": 0.413, + "step": 25320 + }, + { + "epoch": 0.8461384286477819, + "grad_norm": 0.74277263879776, + "learning_rate": 0.0001719781321022608, + "loss": 0.4305, + "step": 25330 + }, + { + "epoch": 0.8464724746125066, + "grad_norm": 0.7718458771705627, + "learning_rate": 0.00017195256495109866, + "loss": 0.4118, + "step": 25340 + }, + { + "epoch": 0.8468065205772314, + "grad_norm": 0.6748285293579102, + "learning_rate": 0.00017192698804386738, + "loss": 0.4043, + "step": 25350 + }, + { + "epoch": 0.8471405665419561, + "grad_norm": 0.7605293393135071, + "learning_rate": 0.00017190140138403497, + "loss": 0.4427, + "step": 25360 + }, + { + "epoch": 0.8474746125066809, + "grad_norm": 0.64583420753479, + "learning_rate": 0.0001718758049750707, + "loss": 0.4258, + "step": 25370 + }, + { + "epoch": 0.8478086584714056, + "grad_norm": 0.7081349492073059, + "learning_rate": 0.0001718501988204452, + "loss": 0.3889, + "step": 25380 + }, + { + "epoch": 0.8481427044361304, + "grad_norm": 0.6732751131057739, + "learning_rate": 0.00017182458292363045, + "loss": 0.4392, + "step": 25390 + }, + { + "epoch": 0.8484767504008551, + "grad_norm": 0.5908846259117126, + "learning_rate": 0.00017179895728809973, + "loss": 0.4262, + "step": 25400 + }, + { + "epoch": 0.8488107963655799, + "grad_norm": 0.5734415054321289, + "learning_rate": 0.00017177332191732752, + "loss": 0.4444, + "step": 25410 + }, + { + "epoch": 0.8491448423303046, + "grad_norm": 0.5604541301727295, + "learning_rate": 0.00017174767681478982, + "loss": 0.4155, + "step": 25420 + }, + { + "epoch": 0.8494788882950294, + "grad_norm": 0.4579305350780487, + "learning_rate": 0.00017172202198396388, + "loss": 0.4104, + "step": 25430 + }, + { + "epoch": 0.8498129342597541, + "grad_norm": 0.7299268841743469, + "learning_rate": 0.00017169635742832816, + "loss": 0.4138, + "step": 25440 + }, + { + "epoch": 0.8501469802244789, + "grad_norm": 0.7706639170646667, + "learning_rate": 0.0001716706831513626, + "loss": 0.4302, + "step": 25450 + }, + { + "epoch": 0.8504810261892036, + "grad_norm": 0.5888841152191162, + "learning_rate": 0.00017164499915654834, + "loss": 0.4481, + "step": 25460 + }, + { + "epoch": 0.8508150721539284, + "grad_norm": 0.6631225347518921, + "learning_rate": 0.00017161930544736786, + "loss": 0.4317, + "step": 25470 + }, + { + "epoch": 0.8511491181186531, + "grad_norm": 0.7705806493759155, + "learning_rate": 0.00017159360202730504, + "loss": 0.4254, + "step": 25480 + }, + { + "epoch": 0.8514831640833779, + "grad_norm": 0.5975449681282043, + "learning_rate": 0.00017156788889984497, + "loss": 0.4121, + "step": 25490 + }, + { + "epoch": 0.8518172100481026, + "grad_norm": 0.6318559646606445, + "learning_rate": 0.0001715421660684741, + "loss": 0.4197, + "step": 25500 + }, + { + "epoch": 0.8521512560128274, + "grad_norm": 0.71295565366745, + "learning_rate": 0.00017151643353668024, + "loss": 0.4376, + "step": 25510 + }, + { + "epoch": 0.8524853019775521, + "grad_norm": 0.7191693186759949, + "learning_rate": 0.00017149069130795242, + "loss": 0.4203, + "step": 25520 + }, + { + "epoch": 0.8528193479422769, + "grad_norm": 0.6285431981086731, + "learning_rate": 0.00017146493938578105, + "loss": 0.4164, + "step": 25530 + }, + { + "epoch": 0.8531533939070016, + "grad_norm": 0.6781705617904663, + "learning_rate": 0.00017143917777365786, + "loss": 0.4097, + "step": 25540 + }, + { + "epoch": 0.8534874398717264, + "grad_norm": 0.7081285119056702, + "learning_rate": 0.00017141340647507585, + "loss": 0.3737, + "step": 25550 + }, + { + "epoch": 0.8538214858364511, + "grad_norm": 0.5970644950866699, + "learning_rate": 0.0001713876254935294, + "loss": 0.4059, + "step": 25560 + }, + { + "epoch": 0.8541555318011759, + "grad_norm": 0.6193240880966187, + "learning_rate": 0.0001713618348325141, + "loss": 0.4325, + "step": 25570 + }, + { + "epoch": 0.8544895777659006, + "grad_norm": 0.6513376832008362, + "learning_rate": 0.00017133603449552695, + "loss": 0.4525, + "step": 25580 + }, + { + "epoch": 0.8548236237306254, + "grad_norm": 0.6037896871566772, + "learning_rate": 0.00017131022448606625, + "loss": 0.4272, + "step": 25590 + }, + { + "epoch": 0.8551576696953501, + "grad_norm": 0.6566991806030273, + "learning_rate": 0.00017128440480763152, + "loss": 0.4482, + "step": 25600 + }, + { + "epoch": 0.8554917156600749, + "grad_norm": 0.6744807362556458, + "learning_rate": 0.0001712585754637237, + "loss": 0.4187, + "step": 25610 + }, + { + "epoch": 0.8558257616247996, + "grad_norm": 0.6652624011039734, + "learning_rate": 0.000171232736457845, + "loss": 0.4247, + "step": 25620 + }, + { + "epoch": 0.8561598075895244, + "grad_norm": 0.7069579362869263, + "learning_rate": 0.00017120688779349893, + "loss": 0.4424, + "step": 25630 + }, + { + "epoch": 0.8564938535542491, + "grad_norm": 0.7585152387619019, + "learning_rate": 0.00017118102947419027, + "loss": 0.4114, + "step": 25640 + }, + { + "epoch": 0.8568278995189738, + "grad_norm": 0.6831428408622742, + "learning_rate": 0.00017115516150342525, + "loss": 0.4354, + "step": 25650 + }, + { + "epoch": 0.8571619454836985, + "grad_norm": 0.6729920506477356, + "learning_rate": 0.00017112928388471126, + "loss": 0.4408, + "step": 25660 + }, + { + "epoch": 0.8574959914484233, + "grad_norm": 0.5891386866569519, + "learning_rate": 0.00017110339662155705, + "loss": 0.3839, + "step": 25670 + }, + { + "epoch": 0.857830037413148, + "grad_norm": 0.7617397308349609, + "learning_rate": 0.0001710774997174727, + "loss": 0.3699, + "step": 25680 + }, + { + "epoch": 0.8581640833778728, + "grad_norm": 0.778652012348175, + "learning_rate": 0.0001710515931759696, + "loss": 0.432, + "step": 25690 + }, + { + "epoch": 0.8584981293425975, + "grad_norm": 0.8509665727615356, + "learning_rate": 0.00017102567700056034, + "loss": 0.4082, + "step": 25700 + }, + { + "epoch": 0.8588321753073223, + "grad_norm": 0.696434736251831, + "learning_rate": 0.000170999751194759, + "loss": 0.4027, + "step": 25710 + }, + { + "epoch": 0.859166221272047, + "grad_norm": 0.6787528991699219, + "learning_rate": 0.00017097381576208087, + "loss": 0.4367, + "step": 25720 + }, + { + "epoch": 0.8595002672367718, + "grad_norm": 0.7218984365463257, + "learning_rate": 0.00017094787070604246, + "loss": 0.4275, + "step": 25730 + }, + { + "epoch": 0.8598343132014965, + "grad_norm": 0.6965874433517456, + "learning_rate": 0.0001709219160301617, + "loss": 0.4216, + "step": 25740 + }, + { + "epoch": 0.8601683591662213, + "grad_norm": 0.5960062146186829, + "learning_rate": 0.0001708959517379578, + "loss": 0.4277, + "step": 25750 + }, + { + "epoch": 0.860502405130946, + "grad_norm": 0.7255739569664001, + "learning_rate": 0.00017086997783295128, + "loss": 0.4474, + "step": 25760 + }, + { + "epoch": 0.8608364510956708, + "grad_norm": 0.6227213740348816, + "learning_rate": 0.00017084399431866395, + "loss": 0.4708, + "step": 25770 + }, + { + "epoch": 0.8611704970603955, + "grad_norm": 0.6719828844070435, + "learning_rate": 0.0001708180011986189, + "loss": 0.4337, + "step": 25780 + }, + { + "epoch": 0.8615045430251203, + "grad_norm": 0.6787228584289551, + "learning_rate": 0.00017079199847634056, + "loss": 0.4269, + "step": 25790 + }, + { + "epoch": 0.861838588989845, + "grad_norm": 0.6970129609107971, + "learning_rate": 0.00017076598615535458, + "loss": 0.4069, + "step": 25800 + }, + { + "epoch": 0.8621726349545697, + "grad_norm": 0.6224098205566406, + "learning_rate": 0.00017073996423918813, + "loss": 0.4104, + "step": 25810 + }, + { + "epoch": 0.8625066809192945, + "grad_norm": 0.6233200430870056, + "learning_rate": 0.00017071393273136937, + "loss": 0.4396, + "step": 25820 + }, + { + "epoch": 0.8628407268840192, + "grad_norm": 0.6186385154724121, + "learning_rate": 0.00017068789163542802, + "loss": 0.4311, + "step": 25830 + }, + { + "epoch": 0.863174772848744, + "grad_norm": 0.6398409008979797, + "learning_rate": 0.00017066184095489495, + "loss": 0.4281, + "step": 25840 + }, + { + "epoch": 0.8635088188134687, + "grad_norm": 0.6050474047660828, + "learning_rate": 0.0001706357806933024, + "loss": 0.4253, + "step": 25850 + }, + { + "epoch": 0.8638428647781935, + "grad_norm": 0.7247027158737183, + "learning_rate": 0.0001706097108541839, + "loss": 0.4044, + "step": 25860 + }, + { + "epoch": 0.8641769107429182, + "grad_norm": 0.5925003886222839, + "learning_rate": 0.0001705836314410742, + "loss": 0.4051, + "step": 25870 + }, + { + "epoch": 0.864510956707643, + "grad_norm": 0.6955405473709106, + "learning_rate": 0.00017055754245750952, + "loss": 0.4219, + "step": 25880 + }, + { + "epoch": 0.8648450026723677, + "grad_norm": 0.716328501701355, + "learning_rate": 0.0001705314439070272, + "loss": 0.449, + "step": 25890 + }, + { + "epoch": 0.8651790486370925, + "grad_norm": 0.7200480699539185, + "learning_rate": 0.000170505335793166, + "loss": 0.4648, + "step": 25900 + }, + { + "epoch": 0.8655130946018172, + "grad_norm": 0.6760954260826111, + "learning_rate": 0.00017047921811946584, + "loss": 0.4245, + "step": 25910 + }, + { + "epoch": 0.865847140566542, + "grad_norm": 0.6218106150627136, + "learning_rate": 0.00017045309088946811, + "loss": 0.395, + "step": 25920 + }, + { + "epoch": 0.8661811865312667, + "grad_norm": 0.7584658861160278, + "learning_rate": 0.0001704269541067154, + "loss": 0.4435, + "step": 25930 + }, + { + "epoch": 0.8665152324959915, + "grad_norm": 0.7919891476631165, + "learning_rate": 0.00017040080777475154, + "loss": 0.4236, + "step": 25940 + }, + { + "epoch": 0.8668492784607162, + "grad_norm": 0.6208048462867737, + "learning_rate": 0.00017037465189712178, + "loss": 0.4247, + "step": 25950 + }, + { + "epoch": 0.867183324425441, + "grad_norm": 0.8129919767379761, + "learning_rate": 0.00017034848647737256, + "loss": 0.4116, + "step": 25960 + }, + { + "epoch": 0.8675173703901656, + "grad_norm": 0.7519887685775757, + "learning_rate": 0.00017032231151905172, + "loss": 0.4162, + "step": 25970 + }, + { + "epoch": 0.8678514163548904, + "grad_norm": 0.7016174793243408, + "learning_rate": 0.00017029612702570824, + "loss": 0.4139, + "step": 25980 + }, + { + "epoch": 0.8681854623196151, + "grad_norm": 0.7585100531578064, + "learning_rate": 0.00017026993300089257, + "loss": 0.4218, + "step": 25990 + }, + { + "epoch": 0.8685195082843399, + "grad_norm": 0.6525117754936218, + "learning_rate": 0.00017024372944815635, + "loss": 0.4132, + "step": 26000 + }, + { + "epoch": 0.8685195082843399, + "eval_chrf": 85.16362824891128, + "eval_loss": 0.7558543682098389, + "eval_runtime": 122.3015, + "eval_samples_per_second": 0.818, + "eval_steps_per_second": 0.033, + "step": 26000 + }, + { + "epoch": 0.8688535542490646, + "grad_norm": 0.6141493916511536, + "learning_rate": 0.00017021751637105248, + "loss": 0.4324, + "step": 26010 + }, + { + "epoch": 0.8691876002137894, + "grad_norm": 0.8038309216499329, + "learning_rate": 0.0001701912937731352, + "loss": 0.4481, + "step": 26020 + }, + { + "epoch": 0.8695216461785141, + "grad_norm": 0.759630560874939, + "learning_rate": 0.00017016506165796008, + "loss": 0.4079, + "step": 26030 + }, + { + "epoch": 0.8698556921432389, + "grad_norm": 0.6893547773361206, + "learning_rate": 0.00017013882002908393, + "loss": 0.4145, + "step": 26040 + }, + { + "epoch": 0.8701897381079636, + "grad_norm": 0.705274224281311, + "learning_rate": 0.00017011256889006487, + "loss": 0.4045, + "step": 26050 + }, + { + "epoch": 0.8705237840726884, + "grad_norm": 0.592519998550415, + "learning_rate": 0.00017008630824446227, + "loss": 0.4098, + "step": 26060 + }, + { + "epoch": 0.8708578300374131, + "grad_norm": 0.6438660621643066, + "learning_rate": 0.00017006003809583685, + "loss": 0.3814, + "step": 26070 + }, + { + "epoch": 0.8711918760021379, + "grad_norm": 0.6389734148979187, + "learning_rate": 0.00017003375844775053, + "loss": 0.4136, + "step": 26080 + }, + { + "epoch": 0.8715259219668626, + "grad_norm": 0.7206357717514038, + "learning_rate": 0.00017000746930376665, + "loss": 0.3972, + "step": 26090 + }, + { + "epoch": 0.8718599679315874, + "grad_norm": 0.6913564205169678, + "learning_rate": 0.00016998117066744972, + "loss": 0.401, + "step": 26100 + }, + { + "epoch": 0.8721940138963121, + "grad_norm": 0.733603835105896, + "learning_rate": 0.00016995486254236557, + "loss": 0.4359, + "step": 26110 + }, + { + "epoch": 0.8725280598610369, + "grad_norm": 0.6569717526435852, + "learning_rate": 0.00016992854493208136, + "loss": 0.408, + "step": 26120 + }, + { + "epoch": 0.8728621058257616, + "grad_norm": 0.6673974394798279, + "learning_rate": 0.0001699022178401655, + "loss": 0.4157, + "step": 26130 + }, + { + "epoch": 0.8731961517904864, + "grad_norm": 0.6553773283958435, + "learning_rate": 0.00016987588127018763, + "loss": 0.4084, + "step": 26140 + }, + { + "epoch": 0.8735301977552111, + "grad_norm": 0.6821873784065247, + "learning_rate": 0.00016984953522571883, + "loss": 0.4275, + "step": 26150 + }, + { + "epoch": 0.8738642437199359, + "grad_norm": 0.604436993598938, + "learning_rate": 0.00016982317971033127, + "loss": 0.3901, + "step": 26160 + }, + { + "epoch": 0.8741982896846606, + "grad_norm": 0.7005555629730225, + "learning_rate": 0.0001697968147275986, + "loss": 0.3841, + "step": 26170 + }, + { + "epoch": 0.8745323356493854, + "grad_norm": 0.5899726748466492, + "learning_rate": 0.00016977044028109557, + "loss": 0.4219, + "step": 26180 + }, + { + "epoch": 0.8748663816141101, + "grad_norm": 0.5477669835090637, + "learning_rate": 0.0001697440563743983, + "loss": 0.4247, + "step": 26190 + }, + { + "epoch": 0.8752004275788349, + "grad_norm": 0.6415575742721558, + "learning_rate": 0.00016971766301108427, + "loss": 0.3933, + "step": 26200 + }, + { + "epoch": 0.8755344735435596, + "grad_norm": 0.6314520835876465, + "learning_rate": 0.0001696912601947321, + "loss": 0.483, + "step": 26210 + }, + { + "epoch": 0.8758685195082844, + "grad_norm": 0.6909754276275635, + "learning_rate": 0.00016966484792892177, + "loss": 0.4198, + "step": 26220 + }, + { + "epoch": 0.8762025654730091, + "grad_norm": 0.6944664716720581, + "learning_rate": 0.00016963842621723452, + "loss": 0.4305, + "step": 26230 + }, + { + "epoch": 0.8765366114377339, + "grad_norm": 0.7080066204071045, + "learning_rate": 0.00016961199506325292, + "loss": 0.4316, + "step": 26240 + }, + { + "epoch": 0.8768706574024586, + "grad_norm": 0.6488223671913147, + "learning_rate": 0.00016958555447056073, + "loss": 0.4367, + "step": 26250 + }, + { + "epoch": 0.8772047033671834, + "grad_norm": 0.6469858884811401, + "learning_rate": 0.00016955910444274304, + "loss": 0.4193, + "step": 26260 + }, + { + "epoch": 0.8775387493319081, + "grad_norm": 0.691259503364563, + "learning_rate": 0.00016953264498338624, + "loss": 0.4421, + "step": 26270 + }, + { + "epoch": 0.8778727952966329, + "grad_norm": 0.5835904479026794, + "learning_rate": 0.00016950617609607792, + "loss": 0.4126, + "step": 26280 + }, + { + "epoch": 0.8782068412613576, + "grad_norm": 0.7912886738777161, + "learning_rate": 0.0001694796977844071, + "loss": 0.4304, + "step": 26290 + }, + { + "epoch": 0.8785408872260823, + "grad_norm": 0.7572183609008789, + "learning_rate": 0.0001694532100519639, + "loss": 0.4416, + "step": 26300 + }, + { + "epoch": 0.878874933190807, + "grad_norm": 0.71031254529953, + "learning_rate": 0.00016942671290233986, + "loss": 0.4043, + "step": 26310 + }, + { + "epoch": 0.8792089791555318, + "grad_norm": 0.6484765410423279, + "learning_rate": 0.0001694002063391277, + "loss": 0.4361, + "step": 26320 + }, + { + "epoch": 0.8795430251202565, + "grad_norm": 0.8074885606765747, + "learning_rate": 0.00016937369036592145, + "loss": 0.4479, + "step": 26330 + }, + { + "epoch": 0.8798770710849813, + "grad_norm": 0.5961145758628845, + "learning_rate": 0.00016934716498631644, + "loss": 0.4169, + "step": 26340 + }, + { + "epoch": 0.880211117049706, + "grad_norm": 0.6509643793106079, + "learning_rate": 0.00016932063020390925, + "loss": 0.392, + "step": 26350 + }, + { + "epoch": 0.8805451630144308, + "grad_norm": 0.6896036267280579, + "learning_rate": 0.0001692940860222977, + "loss": 0.4128, + "step": 26360 + }, + { + "epoch": 0.8808792089791555, + "grad_norm": 0.6780446767807007, + "learning_rate": 0.00016926753244508096, + "loss": 0.3766, + "step": 26370 + }, + { + "epoch": 0.8812132549438803, + "grad_norm": 0.7046335935592651, + "learning_rate": 0.00016924096947585943, + "loss": 0.3955, + "step": 26380 + }, + { + "epoch": 0.881547300908605, + "grad_norm": 0.6733250617980957, + "learning_rate": 0.0001692143971182348, + "loss": 0.4486, + "step": 26390 + }, + { + "epoch": 0.8818813468733298, + "grad_norm": 0.6557486057281494, + "learning_rate": 0.00016918781537581, + "loss": 0.4022, + "step": 26400 + }, + { + "epoch": 0.8822153928380545, + "grad_norm": 0.8741340041160583, + "learning_rate": 0.0001691612242521893, + "loss": 0.4174, + "step": 26410 + }, + { + "epoch": 0.8825494388027793, + "grad_norm": 0.5768049359321594, + "learning_rate": 0.00016913462375097815, + "loss": 0.4054, + "step": 26420 + }, + { + "epoch": 0.882883484767504, + "grad_norm": 0.6256388425827026, + "learning_rate": 0.00016910801387578332, + "loss": 0.4374, + "step": 26430 + }, + { + "epoch": 0.8832175307322288, + "grad_norm": 0.5612363815307617, + "learning_rate": 0.0001690813946302129, + "loss": 0.4157, + "step": 26440 + }, + { + "epoch": 0.8835515766969535, + "grad_norm": 0.6318510174751282, + "learning_rate": 0.00016905476601787616, + "loss": 0.3955, + "step": 26450 + }, + { + "epoch": 0.8838856226616782, + "grad_norm": 0.6278559565544128, + "learning_rate": 0.00016902812804238368, + "loss": 0.4068, + "step": 26460 + }, + { + "epoch": 0.884219668626403, + "grad_norm": 0.867922306060791, + "learning_rate": 0.00016900148070734736, + "loss": 0.4204, + "step": 26470 + }, + { + "epoch": 0.8845537145911277, + "grad_norm": 0.7175678014755249, + "learning_rate": 0.00016897482401638026, + "loss": 0.4022, + "step": 26480 + }, + { + "epoch": 0.8848877605558525, + "grad_norm": 0.9110668301582336, + "learning_rate": 0.0001689481579730968, + "loss": 0.4479, + "step": 26490 + }, + { + "epoch": 0.8852218065205772, + "grad_norm": 0.6468082070350647, + "learning_rate": 0.0001689214825811126, + "loss": 0.4009, + "step": 26500 + }, + { + "epoch": 0.885555852485302, + "grad_norm": 0.70963054895401, + "learning_rate": 0.00016889479784404463, + "loss": 0.4178, + "step": 26510 + }, + { + "epoch": 0.8858898984500267, + "grad_norm": 0.7580585479736328, + "learning_rate": 0.00016886810376551105, + "loss": 0.4215, + "step": 26520 + }, + { + "epoch": 0.8862239444147515, + "grad_norm": 0.5951772332191467, + "learning_rate": 0.00016884140034913137, + "loss": 0.4127, + "step": 26530 + }, + { + "epoch": 0.8865579903794762, + "grad_norm": 0.7633501291275024, + "learning_rate": 0.00016881468759852623, + "loss": 0.4214, + "step": 26540 + }, + { + "epoch": 0.886892036344201, + "grad_norm": 0.6946791410446167, + "learning_rate": 0.00016878796551731768, + "loss": 0.4171, + "step": 26550 + }, + { + "epoch": 0.8872260823089257, + "grad_norm": 0.7655012607574463, + "learning_rate": 0.000168761234109129, + "loss": 0.4263, + "step": 26560 + }, + { + "epoch": 0.8875601282736505, + "grad_norm": 0.7380564212799072, + "learning_rate": 0.0001687344933775846, + "loss": 0.4265, + "step": 26570 + }, + { + "epoch": 0.8878941742383752, + "grad_norm": 0.7288685441017151, + "learning_rate": 0.0001687077433263104, + "loss": 0.4432, + "step": 26580 + }, + { + "epoch": 0.8882282202031, + "grad_norm": 0.7227764129638672, + "learning_rate": 0.0001686809839589333, + "loss": 0.4371, + "step": 26590 + }, + { + "epoch": 0.8885622661678247, + "grad_norm": 0.7877309322357178, + "learning_rate": 0.00016865421527908177, + "loss": 0.4292, + "step": 26600 + }, + { + "epoch": 0.8888963121325495, + "grad_norm": 0.7037298679351807, + "learning_rate": 0.00016862743729038524, + "loss": 0.4034, + "step": 26610 + }, + { + "epoch": 0.8892303580972741, + "grad_norm": 0.7540507316589355, + "learning_rate": 0.00016860064999647465, + "loss": 0.4219, + "step": 26620 + }, + { + "epoch": 0.8895644040619989, + "grad_norm": 0.6594278812408447, + "learning_rate": 0.00016857385340098203, + "loss": 0.3986, + "step": 26630 + }, + { + "epoch": 0.8898984500267236, + "grad_norm": 0.5593328475952148, + "learning_rate": 0.00016854704750754075, + "loss": 0.3935, + "step": 26640 + }, + { + "epoch": 0.8902324959914484, + "grad_norm": 0.5925552248954773, + "learning_rate": 0.0001685202323197855, + "loss": 0.418, + "step": 26650 + }, + { + "epoch": 0.8905665419561731, + "grad_norm": 0.637718915939331, + "learning_rate": 0.00016849340784135203, + "loss": 0.4281, + "step": 26660 + }, + { + "epoch": 0.8909005879208979, + "grad_norm": 0.6239839196205139, + "learning_rate": 0.0001684665740758776, + "loss": 0.4287, + "step": 26670 + }, + { + "epoch": 0.8912346338856226, + "grad_norm": 0.7875577807426453, + "learning_rate": 0.00016843973102700053, + "loss": 0.3864, + "step": 26680 + }, + { + "epoch": 0.8915686798503474, + "grad_norm": 0.7386941909790039, + "learning_rate": 0.00016841287869836052, + "loss": 0.3875, + "step": 26690 + }, + { + "epoch": 0.8919027258150721, + "grad_norm": 0.6543652415275574, + "learning_rate": 0.00016838601709359848, + "loss": 0.3937, + "step": 26700 + }, + { + "epoch": 0.8922367717797969, + "grad_norm": 0.5849388241767883, + "learning_rate": 0.00016835914621635655, + "loss": 0.4142, + "step": 26710 + }, + { + "epoch": 0.8925708177445216, + "grad_norm": 0.6043843626976013, + "learning_rate": 0.00016833226607027818, + "loss": 0.4149, + "step": 26720 + }, + { + "epoch": 0.8929048637092464, + "grad_norm": 0.6658894419670105, + "learning_rate": 0.0001683053766590081, + "loss": 0.4321, + "step": 26730 + }, + { + "epoch": 0.8932389096739711, + "grad_norm": 0.5924740433692932, + "learning_rate": 0.00016827847798619216, + "loss": 0.3902, + "step": 26740 + }, + { + "epoch": 0.8935729556386959, + "grad_norm": 0.5868433117866516, + "learning_rate": 0.00016825157005547763, + "loss": 0.404, + "step": 26750 + }, + { + "epoch": 0.8939070016034206, + "grad_norm": 0.6760953068733215, + "learning_rate": 0.00016822465287051296, + "loss": 0.4278, + "step": 26760 + }, + { + "epoch": 0.8942410475681454, + "grad_norm": 0.7294096350669861, + "learning_rate": 0.00016819772643494784, + "loss": 0.3964, + "step": 26770 + }, + { + "epoch": 0.8945750935328701, + "grad_norm": 0.549727737903595, + "learning_rate": 0.00016817079075243324, + "loss": 0.4157, + "step": 26780 + }, + { + "epoch": 0.8949091394975949, + "grad_norm": 0.6039133071899414, + "learning_rate": 0.00016814384582662135, + "loss": 0.3976, + "step": 26790 + }, + { + "epoch": 0.8952431854623196, + "grad_norm": 0.6710125207901001, + "learning_rate": 0.0001681168916611657, + "loss": 0.3988, + "step": 26800 + }, + { + "epoch": 0.8955772314270444, + "grad_norm": 0.7007187604904175, + "learning_rate": 0.00016808992825972094, + "loss": 0.451, + "step": 26810 + }, + { + "epoch": 0.8959112773917691, + "grad_norm": 0.6793410181999207, + "learning_rate": 0.00016806295562594311, + "loss": 0.4169, + "step": 26820 + }, + { + "epoch": 0.8962453233564939, + "grad_norm": 0.8321549892425537, + "learning_rate": 0.0001680359737634894, + "loss": 0.4436, + "step": 26830 + }, + { + "epoch": 0.8965793693212186, + "grad_norm": 0.5851354598999023, + "learning_rate": 0.0001680089826760183, + "loss": 0.3909, + "step": 26840 + }, + { + "epoch": 0.8969134152859434, + "grad_norm": 0.7811641693115234, + "learning_rate": 0.00016798198236718954, + "loss": 0.4368, + "step": 26850 + }, + { + "epoch": 0.8972474612506681, + "grad_norm": 0.8087803721427917, + "learning_rate": 0.00016795497284066412, + "loss": 0.3916, + "step": 26860 + }, + { + "epoch": 0.8975815072153929, + "grad_norm": 0.716031551361084, + "learning_rate": 0.0001679279541001042, + "loss": 0.4351, + "step": 26870 + }, + { + "epoch": 0.8979155531801176, + "grad_norm": 0.5434061288833618, + "learning_rate": 0.00016790092614917335, + "loss": 0.4285, + "step": 26880 + }, + { + "epoch": 0.8982495991448424, + "grad_norm": 0.700194776058197, + "learning_rate": 0.00016787388899153623, + "loss": 0.4027, + "step": 26890 + }, + { + "epoch": 0.8985836451095671, + "grad_norm": 0.5610730648040771, + "learning_rate": 0.00016784684263085887, + "loss": 0.4298, + "step": 26900 + }, + { + "epoch": 0.8989176910742919, + "grad_norm": 0.692162036895752, + "learning_rate": 0.00016781978707080842, + "loss": 0.4243, + "step": 26910 + }, + { + "epoch": 0.8992517370390166, + "grad_norm": 0.6753206849098206, + "learning_rate": 0.00016779272231505341, + "loss": 0.4323, + "step": 26920 + }, + { + "epoch": 0.8995857830037414, + "grad_norm": 0.6116331219673157, + "learning_rate": 0.00016776564836726353, + "loss": 0.408, + "step": 26930 + }, + { + "epoch": 0.899919828968466, + "grad_norm": 0.7375907897949219, + "learning_rate": 0.0001677385652311098, + "loss": 0.439, + "step": 26940 + }, + { + "epoch": 0.9002538749331908, + "grad_norm": 0.6878701448440552, + "learning_rate": 0.00016771147291026436, + "loss": 0.4136, + "step": 26950 + }, + { + "epoch": 0.9005879208979155, + "grad_norm": 0.6688471436500549, + "learning_rate": 0.0001676843714084007, + "loss": 0.4152, + "step": 26960 + }, + { + "epoch": 0.9009219668626403, + "grad_norm": 0.6221848726272583, + "learning_rate": 0.0001676572607291935, + "loss": 0.4088, + "step": 26970 + }, + { + "epoch": 0.901256012827365, + "grad_norm": 0.7638468146324158, + "learning_rate": 0.00016763014087631874, + "loss": 0.3916, + "step": 26980 + }, + { + "epoch": 0.9015900587920898, + "grad_norm": 0.7306135892868042, + "learning_rate": 0.00016760301185345358, + "loss": 0.4287, + "step": 26990 + }, + { + "epoch": 0.9019241047568145, + "grad_norm": 0.6549275517463684, + "learning_rate": 0.00016757587366427642, + "loss": 0.415, + "step": 27000 + }, + { + "epoch": 0.9019241047568145, + "eval_chrf": 85.91293841926212, + "eval_loss": 0.7626347541809082, + "eval_runtime": 108.8468, + "eval_samples_per_second": 0.919, + "eval_steps_per_second": 0.037, + "step": 27000 + }, + { + "epoch": 0.9022581507215393, + "grad_norm": 0.7197611927986145, + "learning_rate": 0.00016754872631246704, + "loss": 0.4283, + "step": 27010 + }, + { + "epoch": 0.902592196686264, + "grad_norm": 0.6814311146736145, + "learning_rate": 0.00016752156980170626, + "loss": 0.4111, + "step": 27020 + }, + { + "epoch": 0.9029262426509888, + "grad_norm": 0.6594102382659912, + "learning_rate": 0.0001674944041356763, + "loss": 0.4123, + "step": 27030 + }, + { + "epoch": 0.9032602886157135, + "grad_norm": 0.6399022340774536, + "learning_rate": 0.0001674672293180605, + "loss": 0.4243, + "step": 27040 + }, + { + "epoch": 0.9035943345804383, + "grad_norm": 0.7295684218406677, + "learning_rate": 0.00016744004535254353, + "loss": 0.4398, + "step": 27050 + }, + { + "epoch": 0.903928380545163, + "grad_norm": 0.7717476487159729, + "learning_rate": 0.0001674128522428113, + "loss": 0.419, + "step": 27060 + }, + { + "epoch": 0.9042624265098878, + "grad_norm": 0.5514503121376038, + "learning_rate": 0.0001673856499925509, + "loss": 0.4098, + "step": 27070 + }, + { + "epoch": 0.9045964724746125, + "grad_norm": 0.7964188456535339, + "learning_rate": 0.00016735843860545067, + "loss": 0.4103, + "step": 27080 + }, + { + "epoch": 0.9049305184393373, + "grad_norm": 0.7884505987167358, + "learning_rate": 0.00016733121808520024, + "loss": 0.4053, + "step": 27090 + }, + { + "epoch": 0.905264564404062, + "grad_norm": 0.6205947995185852, + "learning_rate": 0.00016730398843549046, + "loss": 0.4312, + "step": 27100 + }, + { + "epoch": 0.9055986103687867, + "grad_norm": 0.7147979736328125, + "learning_rate": 0.0001672767496600134, + "loss": 0.4532, + "step": 27110 + }, + { + "epoch": 0.9059326563335115, + "grad_norm": 0.6571245789527893, + "learning_rate": 0.00016724950176246237, + "loss": 0.4328, + "step": 27120 + }, + { + "epoch": 0.9062667022982362, + "grad_norm": 0.5694935917854309, + "learning_rate": 0.00016722224474653193, + "loss": 0.3928, + "step": 27130 + }, + { + "epoch": 0.906600748262961, + "grad_norm": 0.6373029947280884, + "learning_rate": 0.0001671949786159178, + "loss": 0.4239, + "step": 27140 + }, + { + "epoch": 0.9069347942276857, + "grad_norm": 0.6879444122314453, + "learning_rate": 0.0001671677033743171, + "loss": 0.401, + "step": 27150 + }, + { + "epoch": 0.9072688401924105, + "grad_norm": 0.6211090087890625, + "learning_rate": 0.000167140419025428, + "loss": 0.3928, + "step": 27160 + }, + { + "epoch": 0.9076028861571352, + "grad_norm": 0.8818375468254089, + "learning_rate": 0.00016711312557295008, + "loss": 0.4003, + "step": 27170 + }, + { + "epoch": 0.90793693212186, + "grad_norm": 0.6383905410766602, + "learning_rate": 0.00016708582302058398, + "loss": 0.4191, + "step": 27180 + }, + { + "epoch": 0.9082709780865847, + "grad_norm": 0.6431902050971985, + "learning_rate": 0.00016705851137203175, + "loss": 0.4456, + "step": 27190 + }, + { + "epoch": 0.9086050240513095, + "grad_norm": 0.6286666393280029, + "learning_rate": 0.00016703119063099647, + "loss": 0.4062, + "step": 27200 + }, + { + "epoch": 0.9089390700160342, + "grad_norm": 0.7727645635604858, + "learning_rate": 0.0001670038608011827, + "loss": 0.4022, + "step": 27210 + }, + { + "epoch": 0.909273115980759, + "grad_norm": 0.8297967910766602, + "learning_rate": 0.00016697652188629603, + "loss": 0.4236, + "step": 27220 + }, + { + "epoch": 0.9096071619454837, + "grad_norm": 0.606185793876648, + "learning_rate": 0.00016694917389004332, + "loss": 0.4297, + "step": 27230 + }, + { + "epoch": 0.9099412079102085, + "grad_norm": 0.5873643159866333, + "learning_rate": 0.00016692181681613274, + "loss": 0.3988, + "step": 27240 + }, + { + "epoch": 0.9102752538749332, + "grad_norm": 0.630871593952179, + "learning_rate": 0.00016689445066827365, + "loss": 0.4229, + "step": 27250 + }, + { + "epoch": 0.9106092998396579, + "grad_norm": 0.6856635212898254, + "learning_rate": 0.0001668670754501766, + "loss": 0.4203, + "step": 27260 + }, + { + "epoch": 0.9109433458043826, + "grad_norm": 0.5968522429466248, + "learning_rate": 0.00016683969116555343, + "loss": 0.4048, + "step": 27270 + }, + { + "epoch": 0.9112773917691074, + "grad_norm": 0.6798935532569885, + "learning_rate": 0.00016681229781811716, + "loss": 0.4098, + "step": 27280 + }, + { + "epoch": 0.9116114377338321, + "grad_norm": 0.6084237694740295, + "learning_rate": 0.0001667848954115821, + "loss": 0.3927, + "step": 27290 + }, + { + "epoch": 0.9119454836985569, + "grad_norm": 0.670846700668335, + "learning_rate": 0.0001667574839496637, + "loss": 0.4077, + "step": 27300 + }, + { + "epoch": 0.9122795296632816, + "grad_norm": 0.5852785110473633, + "learning_rate": 0.0001667300634360787, + "loss": 0.4195, + "step": 27310 + }, + { + "epoch": 0.9126135756280064, + "grad_norm": 0.7006387710571289, + "learning_rate": 0.00016670263387454513, + "loss": 0.3978, + "step": 27320 + }, + { + "epoch": 0.9129476215927311, + "grad_norm": 0.6011231541633606, + "learning_rate": 0.00016667519526878205, + "loss": 0.4445, + "step": 27330 + }, + { + "epoch": 0.9132816675574559, + "grad_norm": 0.5204564332962036, + "learning_rate": 0.00016664774762250995, + "loss": 0.3996, + "step": 27340 + }, + { + "epoch": 0.9136157135221806, + "grad_norm": 0.6421999335289001, + "learning_rate": 0.00016662029093945043, + "loss": 0.4077, + "step": 27350 + }, + { + "epoch": 0.9139497594869054, + "grad_norm": 0.7379025816917419, + "learning_rate": 0.00016659282522332638, + "loss": 0.4101, + "step": 27360 + }, + { + "epoch": 0.9142838054516301, + "grad_norm": 0.6712580919265747, + "learning_rate": 0.00016656535047786188, + "loss": 0.4003, + "step": 27370 + }, + { + "epoch": 0.9146178514163549, + "grad_norm": 0.7897153496742249, + "learning_rate": 0.00016653786670678218, + "loss": 0.3886, + "step": 27380 + }, + { + "epoch": 0.9149518973810796, + "grad_norm": 0.5768500566482544, + "learning_rate": 0.00016651037391381392, + "loss": 0.4146, + "step": 27390 + }, + { + "epoch": 0.9152859433458044, + "grad_norm": 0.6981680393218994, + "learning_rate": 0.00016648287210268477, + "loss": 0.4375, + "step": 27400 + }, + { + "epoch": 0.9156199893105291, + "grad_norm": 0.7493621110916138, + "learning_rate": 0.0001664553612771237, + "loss": 0.379, + "step": 27410 + }, + { + "epoch": 0.9159540352752539, + "grad_norm": 0.6706789135932922, + "learning_rate": 0.000166427841440861, + "loss": 0.4552, + "step": 27420 + }, + { + "epoch": 0.9162880812399786, + "grad_norm": 0.7568528056144714, + "learning_rate": 0.000166400312597628, + "loss": 0.4271, + "step": 27430 + }, + { + "epoch": 0.9166221272047034, + "grad_norm": 0.6074115633964539, + "learning_rate": 0.00016637277475115738, + "loss": 0.3873, + "step": 27440 + }, + { + "epoch": 0.9169561731694281, + "grad_norm": 0.7313281297683716, + "learning_rate": 0.00016634522790518301, + "loss": 0.4465, + "step": 27450 + }, + { + "epoch": 0.9172902191341529, + "grad_norm": 0.7206487059593201, + "learning_rate": 0.00016631767206343997, + "loss": 0.4468, + "step": 27460 + }, + { + "epoch": 0.9176242650988776, + "grad_norm": 0.5663774013519287, + "learning_rate": 0.00016629010722966458, + "loss": 0.3969, + "step": 27470 + }, + { + "epoch": 0.9179583110636024, + "grad_norm": 0.6636072993278503, + "learning_rate": 0.00016626253340759433, + "loss": 0.4449, + "step": 27480 + }, + { + "epoch": 0.9182923570283271, + "grad_norm": 0.5673471689224243, + "learning_rate": 0.000166234950600968, + "loss": 0.4465, + "step": 27490 + }, + { + "epoch": 0.9186264029930519, + "grad_norm": 0.6125243902206421, + "learning_rate": 0.0001662073588135255, + "loss": 0.4247, + "step": 27500 + }, + { + "epoch": 0.9189604489577766, + "grad_norm": 0.6069029569625854, + "learning_rate": 0.00016617975804900806, + "loss": 0.4165, + "step": 27510 + }, + { + "epoch": 0.9192944949225014, + "grad_norm": 0.681857168674469, + "learning_rate": 0.00016615214831115804, + "loss": 0.4035, + "step": 27520 + }, + { + "epoch": 0.9196285408872261, + "grad_norm": 0.5720523595809937, + "learning_rate": 0.00016612452960371909, + "loss": 0.4385, + "step": 27530 + }, + { + "epoch": 0.9199625868519509, + "grad_norm": 0.6386457681655884, + "learning_rate": 0.000166096901930436, + "loss": 0.4365, + "step": 27540 + }, + { + "epoch": 0.9202966328166756, + "grad_norm": 0.5956602692604065, + "learning_rate": 0.00016606926529505483, + "loss": 0.3767, + "step": 27550 + }, + { + "epoch": 0.9206306787814004, + "grad_norm": 0.6366074085235596, + "learning_rate": 0.00016604161970132285, + "loss": 0.4139, + "step": 27560 + }, + { + "epoch": 0.9209647247461251, + "grad_norm": 0.6273916959762573, + "learning_rate": 0.00016601396515298852, + "loss": 0.4088, + "step": 27570 + }, + { + "epoch": 0.9212987707108499, + "grad_norm": 0.771324098110199, + "learning_rate": 0.00016598630165380154, + "loss": 0.4452, + "step": 27580 + }, + { + "epoch": 0.9216328166755745, + "grad_norm": 0.6155034303665161, + "learning_rate": 0.0001659586292075128, + "loss": 0.4192, + "step": 27590 + }, + { + "epoch": 0.9219668626402993, + "grad_norm": 0.5987372398376465, + "learning_rate": 0.0001659309478178744, + "loss": 0.3978, + "step": 27600 + }, + { + "epoch": 0.922300908605024, + "grad_norm": 0.7155749797821045, + "learning_rate": 0.00016590325748863972, + "loss": 0.422, + "step": 27610 + }, + { + "epoch": 0.9226349545697488, + "grad_norm": 0.8199020028114319, + "learning_rate": 0.00016587555822356325, + "loss": 0.4208, + "step": 27620 + }, + { + "epoch": 0.9229690005344735, + "grad_norm": 0.6078616380691528, + "learning_rate": 0.00016584785002640072, + "loss": 0.43, + "step": 27630 + }, + { + "epoch": 0.9233030464991983, + "grad_norm": 0.6100404262542725, + "learning_rate": 0.0001658201329009092, + "loss": 0.4146, + "step": 27640 + }, + { + "epoch": 0.923637092463923, + "grad_norm": 0.7635073065757751, + "learning_rate": 0.00016579240685084674, + "loss": 0.4038, + "step": 27650 + }, + { + "epoch": 0.9239711384286478, + "grad_norm": 0.6082972288131714, + "learning_rate": 0.0001657646718799728, + "loss": 0.4581, + "step": 27660 + }, + { + "epoch": 0.9243051843933725, + "grad_norm": 0.8136555552482605, + "learning_rate": 0.00016573692799204794, + "loss": 0.4395, + "step": 27670 + }, + { + "epoch": 0.9246392303580973, + "grad_norm": 0.5423725247383118, + "learning_rate": 0.00016570917519083398, + "loss": 0.4348, + "step": 27680 + }, + { + "epoch": 0.924973276322822, + "grad_norm": 0.6070282459259033, + "learning_rate": 0.00016568141348009393, + "loss": 0.4443, + "step": 27690 + }, + { + "epoch": 0.9253073222875468, + "grad_norm": 0.6088677048683167, + "learning_rate": 0.00016565364286359198, + "loss": 0.3928, + "step": 27700 + }, + { + "epoch": 0.9256413682522715, + "grad_norm": 0.6794064044952393, + "learning_rate": 0.0001656258633450936, + "loss": 0.4326, + "step": 27710 + }, + { + "epoch": 0.9259754142169963, + "grad_norm": 0.7175974249839783, + "learning_rate": 0.00016559807492836536, + "loss": 0.3953, + "step": 27720 + }, + { + "epoch": 0.926309460181721, + "grad_norm": 0.624619722366333, + "learning_rate": 0.00016557027761717518, + "loss": 0.3772, + "step": 27730 + }, + { + "epoch": 0.9266435061464458, + "grad_norm": 0.6877516508102417, + "learning_rate": 0.00016554247141529205, + "loss": 0.4138, + "step": 27740 + }, + { + "epoch": 0.9269775521111705, + "grad_norm": 0.7816430330276489, + "learning_rate": 0.00016551465632648623, + "loss": 0.4393, + "step": 27750 + }, + { + "epoch": 0.9273115980758952, + "grad_norm": 0.6912556886672974, + "learning_rate": 0.00016548683235452914, + "loss": 0.3972, + "step": 27760 + }, + { + "epoch": 0.92764564404062, + "grad_norm": 0.6324269771575928, + "learning_rate": 0.00016545899950319354, + "loss": 0.4413, + "step": 27770 + }, + { + "epoch": 0.9279796900053447, + "grad_norm": 0.7190887928009033, + "learning_rate": 0.00016543115777625325, + "loss": 0.4083, + "step": 27780 + }, + { + "epoch": 0.9283137359700695, + "grad_norm": 0.6980831623077393, + "learning_rate": 0.00016540330717748324, + "loss": 0.3855, + "step": 27790 + }, + { + "epoch": 0.9286477819347942, + "grad_norm": 0.6998801231384277, + "learning_rate": 0.00016537544771065994, + "loss": 0.4095, + "step": 27800 + }, + { + "epoch": 0.928981827899519, + "grad_norm": 0.6189838647842407, + "learning_rate": 0.00016534757937956073, + "loss": 0.4099, + "step": 27810 + }, + { + "epoch": 0.9293158738642437, + "grad_norm": 0.7172427773475647, + "learning_rate": 0.00016531970218796428, + "loss": 0.4271, + "step": 27820 + }, + { + "epoch": 0.9296499198289685, + "grad_norm": 0.735614001750946, + "learning_rate": 0.0001652918161396505, + "loss": 0.4164, + "step": 27830 + }, + { + "epoch": 0.9299839657936932, + "grad_norm": 0.7031096816062927, + "learning_rate": 0.00016526392123840047, + "loss": 0.4248, + "step": 27840 + }, + { + "epoch": 0.930318011758418, + "grad_norm": 0.7136692404747009, + "learning_rate": 0.00016523601748799643, + "loss": 0.4148, + "step": 27850 + }, + { + "epoch": 0.9306520577231427, + "grad_norm": 0.7088313698768616, + "learning_rate": 0.00016520810489222188, + "loss": 0.4374, + "step": 27860 + }, + { + "epoch": 0.9309861036878675, + "grad_norm": 0.6800187230110168, + "learning_rate": 0.00016518018345486153, + "loss": 0.4079, + "step": 27870 + }, + { + "epoch": 0.9313201496525922, + "grad_norm": 0.6563946604728699, + "learning_rate": 0.0001651522531797012, + "loss": 0.392, + "step": 27880 + }, + { + "epoch": 0.931654195617317, + "grad_norm": 0.6043025255203247, + "learning_rate": 0.000165124314070528, + "loss": 0.4135, + "step": 27890 + }, + { + "epoch": 0.9319882415820417, + "grad_norm": 0.652675449848175, + "learning_rate": 0.00016509636613113018, + "loss": 0.3919, + "step": 27900 + }, + { + "epoch": 0.9323222875467664, + "grad_norm": 0.6832460165023804, + "learning_rate": 0.0001650684093652972, + "loss": 0.4348, + "step": 27910 + }, + { + "epoch": 0.9326563335114911, + "grad_norm": 0.6642490029335022, + "learning_rate": 0.0001650404437768198, + "loss": 0.4012, + "step": 27920 + }, + { + "epoch": 0.9329903794762159, + "grad_norm": 0.656682014465332, + "learning_rate": 0.00016501246936948975, + "loss": 0.4094, + "step": 27930 + }, + { + "epoch": 0.9333244254409406, + "grad_norm": 0.613155722618103, + "learning_rate": 0.00016498448614710016, + "loss": 0.4104, + "step": 27940 + }, + { + "epoch": 0.9336584714056654, + "grad_norm": 0.7327589988708496, + "learning_rate": 0.00016495649411344524, + "loss": 0.3784, + "step": 27950 + }, + { + "epoch": 0.9339925173703901, + "grad_norm": 0.8519203662872314, + "learning_rate": 0.0001649284932723205, + "loss": 0.3938, + "step": 27960 + }, + { + "epoch": 0.9343265633351149, + "grad_norm": 0.8082032203674316, + "learning_rate": 0.00016490048362752252, + "loss": 0.4015, + "step": 27970 + }, + { + "epoch": 0.9346606092998396, + "grad_norm": 0.6892248392105103, + "learning_rate": 0.00016487246518284923, + "loss": 0.4621, + "step": 27980 + }, + { + "epoch": 0.9349946552645644, + "grad_norm": 0.7234610319137573, + "learning_rate": 0.0001648444379420995, + "loss": 0.4728, + "step": 27990 + }, + { + "epoch": 0.9353287012292891, + "grad_norm": 0.709416925907135, + "learning_rate": 0.0001648164019090737, + "loss": 0.4283, + "step": 28000 + }, + { + "epoch": 0.9353287012292891, + "eval_chrf": 84.92471354752757, + "eval_loss": 0.7585941553115845, + "eval_runtime": 181.6818, + "eval_samples_per_second": 0.55, + "eval_steps_per_second": 0.022, + "step": 28000 + }, + { + "epoch": 0.9356627471940139, + "grad_norm": 0.7181259989738464, + "learning_rate": 0.00016478835708757318, + "loss": 0.437, + "step": 28010 + }, + { + "epoch": 0.9359967931587386, + "grad_norm": 0.572722852230072, + "learning_rate": 0.00016476030348140056, + "loss": 0.4316, + "step": 28020 + }, + { + "epoch": 0.9363308391234634, + "grad_norm": 0.6246945858001709, + "learning_rate": 0.00016473224109435963, + "loss": 0.387, + "step": 28030 + }, + { + "epoch": 0.9366648850881881, + "grad_norm": 0.6916311383247375, + "learning_rate": 0.00016470416993025536, + "loss": 0.3923, + "step": 28040 + }, + { + "epoch": 0.9369989310529129, + "grad_norm": 0.6355416774749756, + "learning_rate": 0.000164676089992894, + "loss": 0.4026, + "step": 28050 + }, + { + "epoch": 0.9373329770176376, + "grad_norm": 0.663777232170105, + "learning_rate": 0.0001646480012860828, + "loss": 0.418, + "step": 28060 + }, + { + "epoch": 0.9376670229823624, + "grad_norm": 0.5860746502876282, + "learning_rate": 0.00016461990381363043, + "loss": 0.4476, + "step": 28070 + }, + { + "epoch": 0.9380010689470871, + "grad_norm": 0.6725481152534485, + "learning_rate": 0.00016459179757934652, + "loss": 0.3987, + "step": 28080 + }, + { + "epoch": 0.9383351149118119, + "grad_norm": 0.6832795143127441, + "learning_rate": 0.00016456368258704212, + "loss": 0.3893, + "step": 28090 + }, + { + "epoch": 0.9386691608765366, + "grad_norm": 0.5894426703453064, + "learning_rate": 0.00016453555884052927, + "loss": 0.3816, + "step": 28100 + }, + { + "epoch": 0.9390032068412614, + "grad_norm": 0.7449106574058533, + "learning_rate": 0.0001645074263436213, + "loss": 0.4236, + "step": 28110 + }, + { + "epoch": 0.9393372528059861, + "grad_norm": 0.7142997980117798, + "learning_rate": 0.00016447928510013272, + "loss": 0.4371, + "step": 28120 + }, + { + "epoch": 0.9396712987707109, + "grad_norm": 0.7365140914916992, + "learning_rate": 0.0001644511351138792, + "loss": 0.4171, + "step": 28130 + }, + { + "epoch": 0.9400053447354356, + "grad_norm": 0.5461568832397461, + "learning_rate": 0.0001644229763886776, + "loss": 0.4124, + "step": 28140 + }, + { + "epoch": 0.9403393907001604, + "grad_norm": 0.6810713410377502, + "learning_rate": 0.00016439480892834594, + "loss": 0.4218, + "step": 28150 + }, + { + "epoch": 0.9406734366648851, + "grad_norm": 0.6710931062698364, + "learning_rate": 0.0001643666327367035, + "loss": 0.4235, + "step": 28160 + }, + { + "epoch": 0.9410074826296099, + "grad_norm": 0.6333529949188232, + "learning_rate": 0.0001643384478175707, + "loss": 0.3967, + "step": 28170 + }, + { + "epoch": 0.9413415285943346, + "grad_norm": 0.8548906445503235, + "learning_rate": 0.0001643102541747691, + "loss": 0.4332, + "step": 28180 + }, + { + "epoch": 0.9416755745590594, + "grad_norm": 0.6811680197715759, + "learning_rate": 0.0001642820518121215, + "loss": 0.408, + "step": 28190 + }, + { + "epoch": 0.9420096205237841, + "grad_norm": 0.577258288860321, + "learning_rate": 0.00016425384073345188, + "loss": 0.442, + "step": 28200 + }, + { + "epoch": 0.9423436664885089, + "grad_norm": 0.5885841846466064, + "learning_rate": 0.0001642256209425854, + "loss": 0.397, + "step": 28210 + }, + { + "epoch": 0.9426777124532336, + "grad_norm": 0.6427764296531677, + "learning_rate": 0.00016419739244334837, + "loss": 0.4146, + "step": 28220 + }, + { + "epoch": 0.9430117584179583, + "grad_norm": 0.5737310647964478, + "learning_rate": 0.00016416915523956828, + "loss": 0.3992, + "step": 28230 + }, + { + "epoch": 0.943345804382683, + "grad_norm": 0.7216953635215759, + "learning_rate": 0.00016414090933507386, + "loss": 0.4165, + "step": 28240 + }, + { + "epoch": 0.9436798503474078, + "grad_norm": 0.616515576839447, + "learning_rate": 0.00016411265473369497, + "loss": 0.3898, + "step": 28250 + }, + { + "epoch": 0.9440138963121325, + "grad_norm": 0.6664136648178101, + "learning_rate": 0.00016408439143926265, + "loss": 0.4199, + "step": 28260 + }, + { + "epoch": 0.9443479422768573, + "grad_norm": 0.6024528741836548, + "learning_rate": 0.00016405611945560913, + "loss": 0.4138, + "step": 28270 + }, + { + "epoch": 0.944681988241582, + "grad_norm": 0.6650018692016602, + "learning_rate": 0.0001640278387865678, + "loss": 0.3898, + "step": 28280 + }, + { + "epoch": 0.9450160342063068, + "grad_norm": 0.6703158020973206, + "learning_rate": 0.00016399954943597327, + "loss": 0.4093, + "step": 28290 + }, + { + "epoch": 0.9453500801710315, + "grad_norm": 0.6186183094978333, + "learning_rate": 0.0001639712514076613, + "loss": 0.4234, + "step": 28300 + }, + { + "epoch": 0.9456841261357563, + "grad_norm": 0.720687747001648, + "learning_rate": 0.00016394294470546884, + "loss": 0.4289, + "step": 28310 + }, + { + "epoch": 0.946018172100481, + "grad_norm": 0.7207848429679871, + "learning_rate": 0.00016391462933323394, + "loss": 0.409, + "step": 28320 + }, + { + "epoch": 0.9463522180652058, + "grad_norm": 0.7303851246833801, + "learning_rate": 0.00016388630529479595, + "loss": 0.4244, + "step": 28330 + }, + { + "epoch": 0.9466862640299305, + "grad_norm": 0.7535518407821655, + "learning_rate": 0.00016385797259399533, + "loss": 0.4605, + "step": 28340 + }, + { + "epoch": 0.9470203099946553, + "grad_norm": 0.6633395552635193, + "learning_rate": 0.0001638296312346737, + "loss": 0.4186, + "step": 28350 + }, + { + "epoch": 0.94735435595938, + "grad_norm": 0.688200831413269, + "learning_rate": 0.0001638012812206739, + "loss": 0.4468, + "step": 28360 + }, + { + "epoch": 0.9476884019241048, + "grad_norm": 0.7186072468757629, + "learning_rate": 0.00016377292255583987, + "loss": 0.4346, + "step": 28370 + }, + { + "epoch": 0.9480224478888295, + "grad_norm": 0.6399295330047607, + "learning_rate": 0.0001637445552440168, + "loss": 0.4008, + "step": 28380 + }, + { + "epoch": 0.9483564938535542, + "grad_norm": 0.760608971118927, + "learning_rate": 0.00016371617928905103, + "loss": 0.4479, + "step": 28390 + }, + { + "epoch": 0.948690539818279, + "grad_norm": 0.8375923037528992, + "learning_rate": 0.00016368779469479004, + "loss": 0.4359, + "step": 28400 + }, + { + "epoch": 0.9490245857830037, + "grad_norm": 0.5399025678634644, + "learning_rate": 0.00016365940146508253, + "loss": 0.387, + "step": 28410 + }, + { + "epoch": 0.9493586317477285, + "grad_norm": 0.6226215958595276, + "learning_rate": 0.00016363099960377834, + "loss": 0.3943, + "step": 28420 + }, + { + "epoch": 0.9496926777124532, + "grad_norm": 0.6706262826919556, + "learning_rate": 0.00016360258911472849, + "loss": 0.429, + "step": 28430 + }, + { + "epoch": 0.950026723677178, + "grad_norm": 0.622541606426239, + "learning_rate": 0.00016357417000178513, + "loss": 0.3959, + "step": 28440 + }, + { + "epoch": 0.9503607696419027, + "grad_norm": 0.7003149390220642, + "learning_rate": 0.00016354574226880165, + "loss": 0.4484, + "step": 28450 + }, + { + "epoch": 0.9506948156066275, + "grad_norm": 0.6396574974060059, + "learning_rate": 0.00016351730591963258, + "loss": 0.4041, + "step": 28460 + }, + { + "epoch": 0.9510288615713522, + "grad_norm": 0.6003515124320984, + "learning_rate": 0.00016348886095813358, + "loss": 0.408, + "step": 28470 + }, + { + "epoch": 0.951362907536077, + "grad_norm": 0.6684345006942749, + "learning_rate": 0.00016346040738816155, + "loss": 0.4024, + "step": 28480 + }, + { + "epoch": 0.9516969535008017, + "grad_norm": 0.6464314460754395, + "learning_rate": 0.0001634319452135745, + "loss": 0.3962, + "step": 28490 + }, + { + "epoch": 0.9520309994655265, + "grad_norm": 0.7616468071937561, + "learning_rate": 0.0001634034744382316, + "loss": 0.4187, + "step": 28500 + }, + { + "epoch": 0.9523650454302512, + "grad_norm": 0.636340320110321, + "learning_rate": 0.00016337499506599327, + "loss": 0.4185, + "step": 28510 + }, + { + "epoch": 0.952699091394976, + "grad_norm": 0.780094563961029, + "learning_rate": 0.00016334650710072098, + "loss": 0.4151, + "step": 28520 + }, + { + "epoch": 0.9530331373597007, + "grad_norm": 0.714312732219696, + "learning_rate": 0.00016331801054627744, + "loss": 0.4066, + "step": 28530 + }, + { + "epoch": 0.9533671833244255, + "grad_norm": 0.5828033089637756, + "learning_rate": 0.00016328950540652653, + "loss": 0.4335, + "step": 28540 + }, + { + "epoch": 0.9537012292891501, + "grad_norm": 0.6333234906196594, + "learning_rate": 0.00016326099168533325, + "loss": 0.3998, + "step": 28550 + }, + { + "epoch": 0.9540352752538749, + "grad_norm": 0.7502822279930115, + "learning_rate": 0.00016323246938656374, + "loss": 0.4011, + "step": 28560 + }, + { + "epoch": 0.9543693212185996, + "grad_norm": 0.8767051696777344, + "learning_rate": 0.00016320393851408543, + "loss": 0.4278, + "step": 28570 + }, + { + "epoch": 0.9547033671833244, + "grad_norm": 0.5986884832382202, + "learning_rate": 0.00016317539907176676, + "loss": 0.4007, + "step": 28580 + }, + { + "epoch": 0.9550374131480491, + "grad_norm": 0.6911078095436096, + "learning_rate": 0.00016314685106347745, + "loss": 0.436, + "step": 28590 + }, + { + "epoch": 0.9553714591127739, + "grad_norm": 0.5874336361885071, + "learning_rate": 0.00016311829449308827, + "loss": 0.4311, + "step": 28600 + }, + { + "epoch": 0.9557055050774986, + "grad_norm": 0.6109907031059265, + "learning_rate": 0.00016308972936447132, + "loss": 0.3741, + "step": 28610 + }, + { + "epoch": 0.9560395510422234, + "grad_norm": 0.697694718837738, + "learning_rate": 0.00016306115568149964, + "loss": 0.4174, + "step": 28620 + }, + { + "epoch": 0.9563735970069481, + "grad_norm": 0.5782793760299683, + "learning_rate": 0.0001630325734480476, + "loss": 0.3993, + "step": 28630 + }, + { + "epoch": 0.9567076429716729, + "grad_norm": 0.703821063041687, + "learning_rate": 0.0001630039826679907, + "loss": 0.4203, + "step": 28640 + }, + { + "epoch": 0.9570416889363976, + "grad_norm": 0.74274742603302, + "learning_rate": 0.00016297538334520553, + "loss": 0.3864, + "step": 28650 + }, + { + "epoch": 0.9573757349011224, + "grad_norm": 0.6516667604446411, + "learning_rate": 0.00016294677548356988, + "loss": 0.4062, + "step": 28660 + }, + { + "epoch": 0.9577097808658471, + "grad_norm": 0.5964801907539368, + "learning_rate": 0.00016291815908696272, + "loss": 0.389, + "step": 28670 + }, + { + "epoch": 0.9580438268305719, + "grad_norm": 0.7191340923309326, + "learning_rate": 0.00016288953415926412, + "loss": 0.3778, + "step": 28680 + }, + { + "epoch": 0.9583778727952966, + "grad_norm": 0.6114505529403687, + "learning_rate": 0.0001628609007043554, + "loss": 0.4239, + "step": 28690 + }, + { + "epoch": 0.9587119187600214, + "grad_norm": 0.5765867233276367, + "learning_rate": 0.0001628322587261189, + "loss": 0.4032, + "step": 28700 + }, + { + "epoch": 0.9590459647247461, + "grad_norm": 0.7096400260925293, + "learning_rate": 0.00016280360822843826, + "loss": 0.402, + "step": 28710 + }, + { + "epoch": 0.9593800106894709, + "grad_norm": 0.6875273585319519, + "learning_rate": 0.00016277494921519817, + "loss": 0.4209, + "step": 28720 + }, + { + "epoch": 0.9597140566541956, + "grad_norm": 0.6604338884353638, + "learning_rate": 0.00016274628169028456, + "loss": 0.412, + "step": 28730 + }, + { + "epoch": 0.9600481026189204, + "grad_norm": 0.5683020353317261, + "learning_rate": 0.00016271760565758444, + "loss": 0.4224, + "step": 28740 + }, + { + "epoch": 0.9603821485836451, + "grad_norm": 0.6181301474571228, + "learning_rate": 0.00016268892112098596, + "loss": 0.4406, + "step": 28750 + }, + { + "epoch": 0.9607161945483699, + "grad_norm": 0.6688970327377319, + "learning_rate": 0.00016266022808437855, + "loss": 0.4188, + "step": 28760 + }, + { + "epoch": 0.9610502405130946, + "grad_norm": 0.5803858637809753, + "learning_rate": 0.00016263152655165267, + "loss": 0.3879, + "step": 28770 + }, + { + "epoch": 0.9613842864778194, + "grad_norm": 0.776665449142456, + "learning_rate": 0.0001626028165266999, + "loss": 0.4122, + "step": 28780 + }, + { + "epoch": 0.9617183324425441, + "grad_norm": 0.8339954614639282, + "learning_rate": 0.00016257409801341316, + "loss": 0.4166, + "step": 28790 + }, + { + "epoch": 0.9620523784072689, + "grad_norm": 0.828284740447998, + "learning_rate": 0.00016254537101568632, + "loss": 0.3952, + "step": 28800 + }, + { + "epoch": 0.9623864243719936, + "grad_norm": 0.6806625723838806, + "learning_rate": 0.0001625166355374145, + "loss": 0.4078, + "step": 28810 + }, + { + "epoch": 0.9627204703367184, + "grad_norm": 0.517581582069397, + "learning_rate": 0.00016248789158249398, + "loss": 0.4412, + "step": 28820 + }, + { + "epoch": 0.9630545163014431, + "grad_norm": 0.6610422730445862, + "learning_rate": 0.0001624591391548221, + "loss": 0.4305, + "step": 28830 + }, + { + "epoch": 0.9633885622661679, + "grad_norm": 0.604584813117981, + "learning_rate": 0.00016243037825829752, + "loss": 0.4054, + "step": 28840 + }, + { + "epoch": 0.9637226082308926, + "grad_norm": 0.5499275326728821, + "learning_rate": 0.00016240160889681983, + "loss": 0.4343, + "step": 28850 + }, + { + "epoch": 0.9640566541956174, + "grad_norm": 0.7709673047065735, + "learning_rate": 0.00016237283107428988, + "loss": 0.4229, + "step": 28860 + }, + { + "epoch": 0.9643907001603421, + "grad_norm": 0.5701597332954407, + "learning_rate": 0.00016234404479460973, + "loss": 0.4131, + "step": 28870 + }, + { + "epoch": 0.9647247461250668, + "grad_norm": 0.6884097456932068, + "learning_rate": 0.00016231525006168252, + "loss": 0.3956, + "step": 28880 + }, + { + "epoch": 0.9650587920897915, + "grad_norm": 0.6181395649909973, + "learning_rate": 0.00016228644687941245, + "loss": 0.401, + "step": 28890 + }, + { + "epoch": 0.9653928380545163, + "grad_norm": 0.5700411796569824, + "learning_rate": 0.000162257635251705, + "loss": 0.4286, + "step": 28900 + }, + { + "epoch": 0.965726884019241, + "grad_norm": 0.6960602402687073, + "learning_rate": 0.00016222881518246677, + "loss": 0.4255, + "step": 28910 + }, + { + "epoch": 0.9660609299839658, + "grad_norm": 0.7677624225616455, + "learning_rate": 0.00016219998667560547, + "loss": 0.465, + "step": 28920 + }, + { + "epoch": 0.9663949759486905, + "grad_norm": 0.5178518891334534, + "learning_rate": 0.00016217114973502995, + "loss": 0.4182, + "step": 28930 + }, + { + "epoch": 0.9667290219134153, + "grad_norm": 0.6612915992736816, + "learning_rate": 0.00016214230436465023, + "loss": 0.3955, + "step": 28940 + }, + { + "epoch": 0.96706306787814, + "grad_norm": 0.6839411854743958, + "learning_rate": 0.00016211345056837745, + "loss": 0.4142, + "step": 28950 + }, + { + "epoch": 0.9673971138428648, + "grad_norm": 0.6987684965133667, + "learning_rate": 0.00016208458835012394, + "loss": 0.4521, + "step": 28960 + }, + { + "epoch": 0.9677311598075895, + "grad_norm": 0.7458717823028564, + "learning_rate": 0.00016205571771380308, + "loss": 0.3971, + "step": 28970 + }, + { + "epoch": 0.9680652057723143, + "grad_norm": 0.6579371094703674, + "learning_rate": 0.0001620268386633295, + "loss": 0.4286, + "step": 28980 + }, + { + "epoch": 0.968399251737039, + "grad_norm": 0.8103367686271667, + "learning_rate": 0.00016199795120261888, + "loss": 0.4059, + "step": 28990 + }, + { + "epoch": 0.9687332977017638, + "grad_norm": 0.6735685467720032, + "learning_rate": 0.00016196905533558814, + "loss": 0.4495, + "step": 29000 + }, + { + "epoch": 0.9687332977017638, + "eval_chrf": 85.50374337419993, + "eval_loss": 0.7431288361549377, + "eval_runtime": 118.3405, + "eval_samples_per_second": 0.845, + "eval_steps_per_second": 0.034, + "step": 29000 + }, + { + "epoch": 0.9690673436664885, + "grad_norm": 0.7111045122146606, + "learning_rate": 0.0001619401510661552, + "loss": 0.3814, + "step": 29010 + }, + { + "epoch": 0.9694013896312133, + "grad_norm": 0.6644542217254639, + "learning_rate": 0.00016191123839823926, + "loss": 0.419, + "step": 29020 + }, + { + "epoch": 0.969735435595938, + "grad_norm": 0.6274954080581665, + "learning_rate": 0.00016188231733576053, + "loss": 0.4022, + "step": 29030 + }, + { + "epoch": 0.9700694815606627, + "grad_norm": 0.6033514142036438, + "learning_rate": 0.00016185338788264047, + "loss": 0.4044, + "step": 29040 + }, + { + "epoch": 0.9704035275253875, + "grad_norm": 0.6803905963897705, + "learning_rate": 0.00016182445004280166, + "loss": 0.3872, + "step": 29050 + }, + { + "epoch": 0.9707375734901122, + "grad_norm": 0.7263877987861633, + "learning_rate": 0.00016179550382016777, + "loss": 0.4235, + "step": 29060 + }, + { + "epoch": 0.971071619454837, + "grad_norm": 0.6771949529647827, + "learning_rate": 0.00016176654921866359, + "loss": 0.4139, + "step": 29070 + }, + { + "epoch": 0.9714056654195617, + "grad_norm": 0.6738631129264832, + "learning_rate": 0.0001617375862422151, + "loss": 0.3848, + "step": 29080 + }, + { + "epoch": 0.9717397113842865, + "grad_norm": 0.7112792134284973, + "learning_rate": 0.00016170861489474945, + "loss": 0.4233, + "step": 29090 + }, + { + "epoch": 0.9720737573490112, + "grad_norm": 0.6868206262588501, + "learning_rate": 0.0001616796351801948, + "loss": 0.4252, + "step": 29100 + }, + { + "epoch": 0.972407803313736, + "grad_norm": 0.7426026463508606, + "learning_rate": 0.0001616506471024805, + "loss": 0.435, + "step": 29110 + }, + { + "epoch": 0.9727418492784607, + "grad_norm": 0.7386029958724976, + "learning_rate": 0.00016162165066553719, + "loss": 0.4047, + "step": 29120 + }, + { + "epoch": 0.9730758952431855, + "grad_norm": 0.5846496820449829, + "learning_rate": 0.00016159264587329637, + "loss": 0.4241, + "step": 29130 + }, + { + "epoch": 0.9734099412079102, + "grad_norm": 0.694428563117981, + "learning_rate": 0.00016156363272969086, + "loss": 0.4229, + "step": 29140 + }, + { + "epoch": 0.973743987172635, + "grad_norm": 0.8565306067466736, + "learning_rate": 0.00016153461123865454, + "loss": 0.4212, + "step": 29150 + }, + { + "epoch": 0.9740780331373597, + "grad_norm": 0.6731276512145996, + "learning_rate": 0.0001615055814041225, + "loss": 0.3918, + "step": 29160 + }, + { + "epoch": 0.9744120791020845, + "grad_norm": 0.7223070859909058, + "learning_rate": 0.00016147654323003083, + "loss": 0.3857, + "step": 29170 + }, + { + "epoch": 0.9747461250668092, + "grad_norm": 0.6648160815238953, + "learning_rate": 0.00016144749672031687, + "loss": 0.4174, + "step": 29180 + }, + { + "epoch": 0.975080171031534, + "grad_norm": 0.7709528803825378, + "learning_rate": 0.00016141844187891899, + "loss": 0.4322, + "step": 29190 + }, + { + "epoch": 0.9754142169962586, + "grad_norm": 0.6571624279022217, + "learning_rate": 0.00016138937870977685, + "loss": 0.4132, + "step": 29200 + }, + { + "epoch": 0.9757482629609834, + "grad_norm": 0.6298735737800598, + "learning_rate": 0.000161360307216831, + "loss": 0.4166, + "step": 29210 + }, + { + "epoch": 0.9760823089257081, + "grad_norm": 0.5757023096084595, + "learning_rate": 0.00016133122740402336, + "loss": 0.4536, + "step": 29220 + }, + { + "epoch": 0.9764163548904329, + "grad_norm": 0.6133437156677246, + "learning_rate": 0.00016130213927529683, + "loss": 0.3966, + "step": 29230 + }, + { + "epoch": 0.9767504008551576, + "grad_norm": 0.5822750329971313, + "learning_rate": 0.00016127304283459543, + "loss": 0.4159, + "step": 29240 + }, + { + "epoch": 0.9770844468198824, + "grad_norm": 0.6789774894714355, + "learning_rate": 0.00016124393808586446, + "loss": 0.4062, + "step": 29250 + }, + { + "epoch": 0.9774184927846071, + "grad_norm": 0.5346429347991943, + "learning_rate": 0.00016121482503305017, + "loss": 0.3831, + "step": 29260 + }, + { + "epoch": 0.9777525387493319, + "grad_norm": 0.6092677116394043, + "learning_rate": 0.00016118570368010002, + "loss": 0.4008, + "step": 29270 + }, + { + "epoch": 0.9780865847140566, + "grad_norm": 0.8356127142906189, + "learning_rate": 0.00016115657403096257, + "loss": 0.4238, + "step": 29280 + }, + { + "epoch": 0.9784206306787814, + "grad_norm": 0.642802357673645, + "learning_rate": 0.00016112743608958757, + "loss": 0.4237, + "step": 29290 + }, + { + "epoch": 0.9787546766435061, + "grad_norm": 0.6428471207618713, + "learning_rate": 0.00016109828985992576, + "loss": 0.4109, + "step": 29300 + }, + { + "epoch": 0.9790887226082309, + "grad_norm": 0.669037938117981, + "learning_rate": 0.00016106913534592914, + "loss": 0.4334, + "step": 29310 + }, + { + "epoch": 0.9794227685729556, + "grad_norm": 0.5804521441459656, + "learning_rate": 0.0001610399725515508, + "loss": 0.4092, + "step": 29320 + }, + { + "epoch": 0.9797568145376804, + "grad_norm": 0.5560088753700256, + "learning_rate": 0.00016101080148074484, + "loss": 0.4285, + "step": 29330 + }, + { + "epoch": 0.9800908605024051, + "grad_norm": 0.5982033014297485, + "learning_rate": 0.00016098162213746666, + "loss": 0.4108, + "step": 29340 + }, + { + "epoch": 0.9804249064671299, + "grad_norm": 0.8037095069885254, + "learning_rate": 0.00016095243452567267, + "loss": 0.4331, + "step": 29350 + }, + { + "epoch": 0.9807589524318546, + "grad_norm": 0.6545886397361755, + "learning_rate": 0.00016092323864932043, + "loss": 0.42, + "step": 29360 + }, + { + "epoch": 0.9810929983965794, + "grad_norm": 0.7851911783218384, + "learning_rate": 0.00016089403451236862, + "loss": 0.4072, + "step": 29370 + }, + { + "epoch": 0.9814270443613041, + "grad_norm": 0.6015502214431763, + "learning_rate": 0.00016086482211877704, + "loss": 0.4001, + "step": 29380 + }, + { + "epoch": 0.9817610903260289, + "grad_norm": 0.8733850121498108, + "learning_rate": 0.00016083560147250658, + "loss": 0.3846, + "step": 29390 + }, + { + "epoch": 0.9820951362907536, + "grad_norm": 0.661385178565979, + "learning_rate": 0.00016080637257751928, + "loss": 0.4032, + "step": 29400 + }, + { + "epoch": 0.9824291822554784, + "grad_norm": 0.7604096531867981, + "learning_rate": 0.00016077713543777834, + "loss": 0.4189, + "step": 29410 + }, + { + "epoch": 0.9827632282202031, + "grad_norm": 0.6617636680603027, + "learning_rate": 0.00016074789005724797, + "loss": 0.4273, + "step": 29420 + }, + { + "epoch": 0.9830972741849279, + "grad_norm": 0.6794228553771973, + "learning_rate": 0.0001607186364398936, + "loss": 0.4023, + "step": 29430 + }, + { + "epoch": 0.9834313201496526, + "grad_norm": 0.6825741529464722, + "learning_rate": 0.00016068937458968178, + "loss": 0.4115, + "step": 29440 + }, + { + "epoch": 0.9837653661143774, + "grad_norm": 0.6325975060462952, + "learning_rate": 0.00016066010451058, + "loss": 0.4181, + "step": 29450 + }, + { + "epoch": 0.9840994120791021, + "grad_norm": 0.6342548727989197, + "learning_rate": 0.0001606308262065571, + "loss": 0.4009, + "step": 29460 + }, + { + "epoch": 0.9844334580438269, + "grad_norm": 0.7334816455841064, + "learning_rate": 0.00016060153968158294, + "loss": 0.4487, + "step": 29470 + }, + { + "epoch": 0.9847675040085516, + "grad_norm": 0.8415297865867615, + "learning_rate": 0.00016057224493962844, + "loss": 0.4161, + "step": 29480 + }, + { + "epoch": 0.9851015499732764, + "grad_norm": 0.5709089040756226, + "learning_rate": 0.00016054294198466568, + "loss": 0.4008, + "step": 29490 + }, + { + "epoch": 0.9854355959380011, + "grad_norm": 0.7032826542854309, + "learning_rate": 0.00016051363082066793, + "loss": 0.4301, + "step": 29500 + }, + { + "epoch": 0.9857696419027259, + "grad_norm": 0.6296345591545105, + "learning_rate": 0.0001604843114516094, + "loss": 0.4426, + "step": 29510 + }, + { + "epoch": 0.9861036878674505, + "grad_norm": 0.6102468967437744, + "learning_rate": 0.0001604549838814656, + "loss": 0.4205, + "step": 29520 + }, + { + "epoch": 0.9864377338321753, + "grad_norm": 0.6600838303565979, + "learning_rate": 0.00016042564811421297, + "loss": 0.3756, + "step": 29530 + }, + { + "epoch": 0.9867717797969, + "grad_norm": 0.7445717453956604, + "learning_rate": 0.00016039630415382928, + "loss": 0.4183, + "step": 29540 + }, + { + "epoch": 0.9871058257616248, + "grad_norm": 0.6004316210746765, + "learning_rate": 0.00016036695200429314, + "loss": 0.3692, + "step": 29550 + }, + { + "epoch": 0.9874398717263495, + "grad_norm": 0.5994698405265808, + "learning_rate": 0.00016033759166958454, + "loss": 0.3962, + "step": 29560 + }, + { + "epoch": 0.9877739176910743, + "grad_norm": 0.7441630959510803, + "learning_rate": 0.00016030822315368437, + "loss": 0.3961, + "step": 29570 + }, + { + "epoch": 0.988107963655799, + "grad_norm": 0.7721065282821655, + "learning_rate": 0.00016027884646057475, + "loss": 0.3825, + "step": 29580 + }, + { + "epoch": 0.9884420096205238, + "grad_norm": 0.695807933807373, + "learning_rate": 0.00016024946159423894, + "loss": 0.3805, + "step": 29590 + }, + { + "epoch": 0.9887760555852485, + "grad_norm": 0.6513135433197021, + "learning_rate": 0.00016022006855866114, + "loss": 0.4003, + "step": 29600 + }, + { + "epoch": 0.9891101015499733, + "grad_norm": 0.6612939238548279, + "learning_rate": 0.0001601906673578268, + "loss": 0.3825, + "step": 29610 + }, + { + "epoch": 0.989444147514698, + "grad_norm": 0.7109509110450745, + "learning_rate": 0.00016016125799572244, + "loss": 0.419, + "step": 29620 + }, + { + "epoch": 0.9897781934794228, + "grad_norm": 0.6815495491027832, + "learning_rate": 0.00016013184047633568, + "loss": 0.3977, + "step": 29630 + }, + { + "epoch": 0.9901122394441475, + "grad_norm": 0.6020383834838867, + "learning_rate": 0.00016010241480365523, + "loss": 0.4341, + "step": 29640 + }, + { + "epoch": 0.9904462854088723, + "grad_norm": 0.7630085349082947, + "learning_rate": 0.00016007298098167098, + "loss": 0.4122, + "step": 29650 + }, + { + "epoch": 0.990780331373597, + "grad_norm": 0.6222795844078064, + "learning_rate": 0.00016004353901437384, + "loss": 0.4208, + "step": 29660 + }, + { + "epoch": 0.9911143773383218, + "grad_norm": 0.6344598531723022, + "learning_rate": 0.0001600140889057558, + "loss": 0.3849, + "step": 29670 + }, + { + "epoch": 0.9914484233030465, + "grad_norm": 0.7456019520759583, + "learning_rate": 0.0001599846306598101, + "loss": 0.4144, + "step": 29680 + }, + { + "epoch": 0.9917824692677712, + "grad_norm": 0.6799163818359375, + "learning_rate": 0.00015995516428053095, + "loss": 0.4229, + "step": 29690 + }, + { + "epoch": 0.992116515232496, + "grad_norm": 0.7401474714279175, + "learning_rate": 0.0001599256897719137, + "loss": 0.4189, + "step": 29700 + }, + { + "epoch": 0.9924505611972207, + "grad_norm": 0.6695952415466309, + "learning_rate": 0.00015989620713795482, + "loss": 0.4377, + "step": 29710 + }, + { + "epoch": 0.9927846071619455, + "grad_norm": 0.630071222782135, + "learning_rate": 0.0001598667163826519, + "loss": 0.386, + "step": 29720 + }, + { + "epoch": 0.9931186531266702, + "grad_norm": 0.6460761427879333, + "learning_rate": 0.0001598372175100035, + "loss": 0.4055, + "step": 29730 + }, + { + "epoch": 0.993452699091395, + "grad_norm": 0.6744544506072998, + "learning_rate": 0.00015980771052400948, + "loss": 0.4004, + "step": 29740 + }, + { + "epoch": 0.9937867450561197, + "grad_norm": 0.7448058724403381, + "learning_rate": 0.00015977819542867066, + "loss": 0.4077, + "step": 29750 + }, + { + "epoch": 0.9941207910208445, + "grad_norm": 0.7847725749015808, + "learning_rate": 0.00015974867222798896, + "loss": 0.4328, + "step": 29760 + }, + { + "epoch": 0.9944548369855692, + "grad_norm": 0.6265324354171753, + "learning_rate": 0.00015971914092596757, + "loss": 0.4199, + "step": 29770 + }, + { + "epoch": 0.994788882950294, + "grad_norm": 0.6902084350585938, + "learning_rate": 0.0001596896015266105, + "loss": 0.3898, + "step": 29780 + }, + { + "epoch": 0.9951229289150187, + "grad_norm": 0.7731800079345703, + "learning_rate": 0.00015966005403392311, + "loss": 0.4272, + "step": 29790 + }, + { + "epoch": 0.9954569748797435, + "grad_norm": 0.6717022061347961, + "learning_rate": 0.0001596304984519117, + "loss": 0.425, + "step": 29800 + }, + { + "epoch": 0.9957910208444682, + "grad_norm": 0.8673071265220642, + "learning_rate": 0.0001596009347845837, + "loss": 0.4075, + "step": 29810 + }, + { + "epoch": 0.996125066809193, + "grad_norm": 0.6932689547538757, + "learning_rate": 0.00015957136303594772, + "loss": 0.4293, + "step": 29820 + }, + { + "epoch": 0.9964591127739177, + "grad_norm": 0.5524224042892456, + "learning_rate": 0.00015954178321001336, + "loss": 0.4023, + "step": 29830 + }, + { + "epoch": 0.9967931587386424, + "grad_norm": 0.6250215768814087, + "learning_rate": 0.00015951219531079136, + "loss": 0.3972, + "step": 29840 + }, + { + "epoch": 0.9971272047033671, + "grad_norm": 0.7794172763824463, + "learning_rate": 0.00015948259934229356, + "loss": 0.4361, + "step": 29850 + }, + { + "epoch": 0.9974612506680919, + "grad_norm": 0.7573460936546326, + "learning_rate": 0.00015945299530853292, + "loss": 0.4478, + "step": 29860 + }, + { + "epoch": 0.9977952966328166, + "grad_norm": 0.8546281456947327, + "learning_rate": 0.00015942338321352337, + "loss": 0.4257, + "step": 29870 + }, + { + "epoch": 0.9981293425975414, + "grad_norm": 0.6912742257118225, + "learning_rate": 0.0001593937630612801, + "loss": 0.4025, + "step": 29880 + }, + { + "epoch": 0.9984633885622661, + "grad_norm": 0.680280864238739, + "learning_rate": 0.0001593641348558193, + "loss": 0.4224, + "step": 29890 + }, + { + "epoch": 0.9987974345269909, + "grad_norm": 0.7769901156425476, + "learning_rate": 0.00015933449860115823, + "loss": 0.426, + "step": 29900 + }, + { + "epoch": 0.9991314804917156, + "grad_norm": 0.6050432920455933, + "learning_rate": 0.00015930485430131532, + "loss": 0.3951, + "step": 29910 + }, + { + "epoch": 0.9994655264564404, + "grad_norm": 0.571648120880127, + "learning_rate": 0.00015927520196031006, + "loss": 0.3852, + "step": 29920 + }, + { + "epoch": 0.9997995724211651, + "grad_norm": 0.6357899308204651, + "learning_rate": 0.00015924554158216296, + "loss": 0.3919, + "step": 29930 + }, + { + "epoch": 1.00013361838589, + "grad_norm": 0.6554333567619324, + "learning_rate": 0.00015921587317089574, + "loss": 0.4539, + "step": 29940 + }, + { + "epoch": 1.0004676643506147, + "grad_norm": 0.6938609480857849, + "learning_rate": 0.00015918619673053113, + "loss": 0.3855, + "step": 29950 + }, + { + "epoch": 1.0008017103153395, + "grad_norm": 0.6008100509643555, + "learning_rate": 0.00015915651226509293, + "loss": 0.411, + "step": 29960 + }, + { + "epoch": 1.0011357562800642, + "grad_norm": 0.5286538004875183, + "learning_rate": 0.0001591268197786061, + "loss": 0.4089, + "step": 29970 + }, + { + "epoch": 1.001469802244789, + "grad_norm": 0.5883219242095947, + "learning_rate": 0.00015909711927509666, + "loss": 0.3666, + "step": 29980 + }, + { + "epoch": 1.0018038482095137, + "grad_norm": 0.6281086206436157, + "learning_rate": 0.0001590674107585917, + "loss": 0.4102, + "step": 29990 + }, + { + "epoch": 1.0021378941742385, + "grad_norm": 0.582422137260437, + "learning_rate": 0.0001590376942331194, + "loss": 0.4028, + "step": 30000 + }, + { + "epoch": 1.0021378941742385, + "eval_chrf": 84.68921517076356, + "eval_loss": 0.7548351883888245, + "eval_runtime": 116.2949, + "eval_samples_per_second": 0.86, + "eval_steps_per_second": 0.034, + "step": 30000 + }, + { + "epoch": 1.002471940138963, + "grad_norm": 0.5582163333892822, + "learning_rate": 0.00015900796970270904, + "loss": 0.381, + "step": 30010 + }, + { + "epoch": 1.0028059861036878, + "grad_norm": 0.7098758220672607, + "learning_rate": 0.00015897823717139097, + "loss": 0.4013, + "step": 30020 + }, + { + "epoch": 1.0031400320684125, + "grad_norm": 0.6150277853012085, + "learning_rate": 0.00015894849664319666, + "loss": 0.4188, + "step": 30030 + }, + { + "epoch": 1.0034740780331373, + "grad_norm": 0.6747395992279053, + "learning_rate": 0.0001589187481221586, + "loss": 0.3561, + "step": 30040 + }, + { + "epoch": 1.003808123997862, + "grad_norm": 0.60276859998703, + "learning_rate": 0.0001588889916123104, + "loss": 0.3745, + "step": 30050 + }, + { + "epoch": 1.0041421699625868, + "grad_norm": 0.7079046964645386, + "learning_rate": 0.0001588592271176868, + "loss": 0.4131, + "step": 30060 + }, + { + "epoch": 1.0044762159273115, + "grad_norm": 0.6754781007766724, + "learning_rate": 0.0001588294546423235, + "loss": 0.3751, + "step": 30070 + }, + { + "epoch": 1.0048102618920363, + "grad_norm": 0.5670630931854248, + "learning_rate": 0.00015879967419025746, + "loss": 0.3653, + "step": 30080 + }, + { + "epoch": 1.005144307856761, + "grad_norm": 0.5448384881019592, + "learning_rate": 0.00015876988576552653, + "loss": 0.4263, + "step": 30090 + }, + { + "epoch": 1.0054783538214858, + "grad_norm": 0.6444534063339233, + "learning_rate": 0.00015874008937216973, + "loss": 0.3905, + "step": 30100 + }, + { + "epoch": 1.0058123997862105, + "grad_norm": 0.7758830785751343, + "learning_rate": 0.00015871028501422726, + "loss": 0.3952, + "step": 30110 + }, + { + "epoch": 1.0061464457509353, + "grad_norm": 0.6929954886436462, + "learning_rate": 0.00015868047269574018, + "loss": 0.421, + "step": 30120 + }, + { + "epoch": 1.00648049171566, + "grad_norm": 0.6823943853378296, + "learning_rate": 0.00015865065242075087, + "loss": 0.3634, + "step": 30130 + }, + { + "epoch": 1.0068145376803848, + "grad_norm": 0.6033768057823181, + "learning_rate": 0.00015862082419330255, + "loss": 0.3777, + "step": 30140 + }, + { + "epoch": 1.0071485836451095, + "grad_norm": 0.6588881015777588, + "learning_rate": 0.00015859098801743974, + "loss": 0.389, + "step": 30150 + }, + { + "epoch": 1.0074826296098343, + "grad_norm": 0.7433032989501953, + "learning_rate": 0.00015856114389720789, + "loss": 0.3735, + "step": 30160 + }, + { + "epoch": 1.007816675574559, + "grad_norm": 0.6512438654899597, + "learning_rate": 0.00015853129183665353, + "loss": 0.4277, + "step": 30170 + }, + { + "epoch": 1.0081507215392838, + "grad_norm": 0.5337277054786682, + "learning_rate": 0.0001585014318398244, + "loss": 0.3979, + "step": 30180 + }, + { + "epoch": 1.0084847675040085, + "grad_norm": 0.6053078174591064, + "learning_rate": 0.00015847156391076916, + "loss": 0.3864, + "step": 30190 + }, + { + "epoch": 1.0088188134687333, + "grad_norm": 0.7046955823898315, + "learning_rate": 0.00015844168805353764, + "loss": 0.4057, + "step": 30200 + }, + { + "epoch": 1.009152859433458, + "grad_norm": 0.6762715578079224, + "learning_rate": 0.0001584118042721807, + "loss": 0.3957, + "step": 30210 + }, + { + "epoch": 1.0094869053981828, + "grad_norm": 0.5681241154670715, + "learning_rate": 0.00015838191257075033, + "loss": 0.4224, + "step": 30220 + }, + { + "epoch": 1.0098209513629075, + "grad_norm": 0.6513677835464478, + "learning_rate": 0.0001583520129532995, + "loss": 0.3707, + "step": 30230 + }, + { + "epoch": 1.0101549973276323, + "grad_norm": 0.6630640625953674, + "learning_rate": 0.00015832210542388232, + "loss": 0.3742, + "step": 30240 + }, + { + "epoch": 1.010489043292357, + "grad_norm": 0.5989456176757812, + "learning_rate": 0.00015829218998655398, + "loss": 0.3776, + "step": 30250 + }, + { + "epoch": 1.0108230892570818, + "grad_norm": 0.6947439908981323, + "learning_rate": 0.0001582622666453707, + "loss": 0.3907, + "step": 30260 + }, + { + "epoch": 1.0111571352218065, + "grad_norm": 0.7531583905220032, + "learning_rate": 0.00015823233540438985, + "loss": 0.3981, + "step": 30270 + }, + { + "epoch": 1.0114911811865313, + "grad_norm": 0.5948493480682373, + "learning_rate": 0.0001582023962676697, + "loss": 0.3861, + "step": 30280 + }, + { + "epoch": 1.011825227151256, + "grad_norm": 0.587716817855835, + "learning_rate": 0.00015817244923926985, + "loss": 0.396, + "step": 30290 + }, + { + "epoch": 1.0121592731159808, + "grad_norm": 0.6150562763214111, + "learning_rate": 0.0001581424943232507, + "loss": 0.3742, + "step": 30300 + }, + { + "epoch": 1.0124933190807055, + "grad_norm": 0.6901524066925049, + "learning_rate": 0.00015811253152367394, + "loss": 0.373, + "step": 30310 + }, + { + "epoch": 1.0128273650454303, + "grad_norm": 0.7180845141410828, + "learning_rate": 0.00015808256084460221, + "loss": 0.3814, + "step": 30320 + }, + { + "epoch": 1.013161411010155, + "grad_norm": 0.6733746528625488, + "learning_rate": 0.0001580525822900992, + "loss": 0.3999, + "step": 30330 + }, + { + "epoch": 1.0134954569748797, + "grad_norm": 0.5723176598548889, + "learning_rate": 0.00015802259586422977, + "loss": 0.408, + "step": 30340 + }, + { + "epoch": 1.0138295029396045, + "grad_norm": 0.6507017016410828, + "learning_rate": 0.00015799260157105973, + "loss": 0.3914, + "step": 30350 + }, + { + "epoch": 1.0141635489043292, + "grad_norm": 0.6550756692886353, + "learning_rate": 0.00015796259941465606, + "loss": 0.399, + "step": 30360 + }, + { + "epoch": 1.014497594869054, + "grad_norm": 0.5804959535598755, + "learning_rate": 0.00015793258939908672, + "loss": 0.4107, + "step": 30370 + }, + { + "epoch": 1.0148316408337787, + "grad_norm": 0.5797379612922668, + "learning_rate": 0.00015790257152842086, + "loss": 0.3637, + "step": 30380 + }, + { + "epoch": 1.0151656867985035, + "grad_norm": 0.6536020040512085, + "learning_rate": 0.0001578725458067285, + "loss": 0.4288, + "step": 30390 + }, + { + "epoch": 1.0154997327632282, + "grad_norm": 0.5438137054443359, + "learning_rate": 0.0001578425122380809, + "loss": 0.3811, + "step": 30400 + }, + { + "epoch": 1.015833778727953, + "grad_norm": 0.783945620059967, + "learning_rate": 0.0001578124708265503, + "loss": 0.4104, + "step": 30410 + }, + { + "epoch": 1.0161678246926777, + "grad_norm": 0.7400382161140442, + "learning_rate": 0.00015778242157621006, + "loss": 0.3983, + "step": 30420 + }, + { + "epoch": 1.0165018706574025, + "grad_norm": 0.5094575881958008, + "learning_rate": 0.0001577523644911345, + "loss": 0.4076, + "step": 30430 + }, + { + "epoch": 1.0168359166221272, + "grad_norm": 0.6488326191902161, + "learning_rate": 0.00015772229957539916, + "loss": 0.4096, + "step": 30440 + }, + { + "epoch": 1.017169962586852, + "grad_norm": 0.7249358892440796, + "learning_rate": 0.0001576922268330805, + "loss": 0.3624, + "step": 30450 + }, + { + "epoch": 1.0175040085515767, + "grad_norm": 0.7502314448356628, + "learning_rate": 0.00015766214626825603, + "loss": 0.4379, + "step": 30460 + }, + { + "epoch": 1.0178380545163015, + "grad_norm": 0.7085846662521362, + "learning_rate": 0.0001576320578850045, + "loss": 0.4381, + "step": 30470 + }, + { + "epoch": 1.0181721004810262, + "grad_norm": 0.6152790784835815, + "learning_rate": 0.00015760196168740552, + "loss": 0.3956, + "step": 30480 + }, + { + "epoch": 1.018506146445751, + "grad_norm": 0.6149989366531372, + "learning_rate": 0.00015757185767953987, + "loss": 0.4355, + "step": 30490 + }, + { + "epoch": 1.0188401924104757, + "grad_norm": 0.48922303318977356, + "learning_rate": 0.0001575417458654894, + "loss": 0.38, + "step": 30500 + }, + { + "epoch": 1.0191742383752005, + "grad_norm": 0.6487305760383606, + "learning_rate": 0.00015751162624933688, + "loss": 0.3988, + "step": 30510 + }, + { + "epoch": 1.0195082843399252, + "grad_norm": 0.6418102979660034, + "learning_rate": 0.00015748149883516635, + "loss": 0.4257, + "step": 30520 + }, + { + "epoch": 1.01984233030465, + "grad_norm": 0.6111548542976379, + "learning_rate": 0.0001574513636270627, + "loss": 0.358, + "step": 30530 + }, + { + "epoch": 1.0201763762693747, + "grad_norm": 0.6989127993583679, + "learning_rate": 0.00015742122062911205, + "loss": 0.403, + "step": 30540 + }, + { + "epoch": 1.0205104222340995, + "grad_norm": 0.6354879140853882, + "learning_rate": 0.0001573910698454014, + "loss": 0.3947, + "step": 30550 + }, + { + "epoch": 1.0208444681988242, + "grad_norm": 0.6360312700271606, + "learning_rate": 0.00015736091128001905, + "loss": 0.408, + "step": 30560 + }, + { + "epoch": 1.021178514163549, + "grad_norm": 0.6720179915428162, + "learning_rate": 0.00015733074493705406, + "loss": 0.397, + "step": 30570 + }, + { + "epoch": 1.0215125601282737, + "grad_norm": 0.6369712948799133, + "learning_rate": 0.0001573005708205968, + "loss": 0.3842, + "step": 30580 + }, + { + "epoch": 1.0218466060929985, + "grad_norm": 0.6743009090423584, + "learning_rate": 0.0001572703889347385, + "loss": 0.3829, + "step": 30590 + }, + { + "epoch": 1.0221806520577232, + "grad_norm": 0.6357889771461487, + "learning_rate": 0.0001572401992835716, + "loss": 0.3824, + "step": 30600 + }, + { + "epoch": 1.022514698022448, + "grad_norm": 0.7758944034576416, + "learning_rate": 0.00015721000187118948, + "loss": 0.4132, + "step": 30610 + }, + { + "epoch": 1.0228487439871727, + "grad_norm": 0.7012585997581482, + "learning_rate": 0.00015717979670168663, + "loss": 0.4005, + "step": 30620 + }, + { + "epoch": 1.0231827899518975, + "grad_norm": 0.7627592086791992, + "learning_rate": 0.0001571495837791586, + "loss": 0.4238, + "step": 30630 + }, + { + "epoch": 1.0235168359166222, + "grad_norm": 0.6189246773719788, + "learning_rate": 0.00015711936310770197, + "loss": 0.3668, + "step": 30640 + }, + { + "epoch": 1.023850881881347, + "grad_norm": 0.6988254189491272, + "learning_rate": 0.00015708913469141434, + "loss": 0.3694, + "step": 30650 + }, + { + "epoch": 1.0241849278460715, + "grad_norm": 0.6240927577018738, + "learning_rate": 0.00015705889853439436, + "loss": 0.382, + "step": 30660 + }, + { + "epoch": 1.0245189738107963, + "grad_norm": 0.7112955451011658, + "learning_rate": 0.00015702865464074184, + "loss": 0.4331, + "step": 30670 + }, + { + "epoch": 1.024853019775521, + "grad_norm": 0.6801533102989197, + "learning_rate": 0.0001569984030145575, + "loss": 0.3983, + "step": 30680 + }, + { + "epoch": 1.0251870657402458, + "grad_norm": 0.6725050210952759, + "learning_rate": 0.00015696814365994319, + "loss": 0.3827, + "step": 30690 + }, + { + "epoch": 1.0255211117049705, + "grad_norm": 0.5707390308380127, + "learning_rate": 0.00015693787658100177, + "loss": 0.4004, + "step": 30700 + }, + { + "epoch": 1.0258551576696953, + "grad_norm": 0.6323923468589783, + "learning_rate": 0.0001569076017818372, + "loss": 0.384, + "step": 30710 + }, + { + "epoch": 1.02618920363442, + "grad_norm": 0.6622703075408936, + "learning_rate": 0.0001568773192665544, + "loss": 0.4431, + "step": 30720 + }, + { + "epoch": 1.0265232495991448, + "grad_norm": 0.6918878555297852, + "learning_rate": 0.00015684702903925938, + "loss": 0.4004, + "step": 30730 + }, + { + "epoch": 1.0268572955638695, + "grad_norm": 0.672504723072052, + "learning_rate": 0.0001568167311040593, + "loss": 0.3712, + "step": 30740 + }, + { + "epoch": 1.0271913415285943, + "grad_norm": 0.7457714676856995, + "learning_rate": 0.0001567864254650621, + "loss": 0.4047, + "step": 30750 + }, + { + "epoch": 1.027525387493319, + "grad_norm": 0.7395593523979187, + "learning_rate": 0.00015675611212637706, + "loss": 0.3789, + "step": 30760 + }, + { + "epoch": 1.0278594334580438, + "grad_norm": 0.6980950236320496, + "learning_rate": 0.00015672579109211434, + "loss": 0.4169, + "step": 30770 + }, + { + "epoch": 1.0281934794227685, + "grad_norm": 0.5751655697822571, + "learning_rate": 0.00015669546236638516, + "loss": 0.4012, + "step": 30780 + }, + { + "epoch": 1.0285275253874933, + "grad_norm": 0.6254413723945618, + "learning_rate": 0.00015666512595330186, + "loss": 0.367, + "step": 30790 + }, + { + "epoch": 1.028861571352218, + "grad_norm": 0.7288251519203186, + "learning_rate": 0.00015663478185697763, + "loss": 0.4196, + "step": 30800 + }, + { + "epoch": 1.0291956173169428, + "grad_norm": 0.6478354334831238, + "learning_rate": 0.000156604430081527, + "loss": 0.3683, + "step": 30810 + }, + { + "epoch": 1.0295296632816675, + "grad_norm": 0.7130792140960693, + "learning_rate": 0.00015657407063106527, + "loss": 0.4052, + "step": 30820 + }, + { + "epoch": 1.0298637092463923, + "grad_norm": 0.6907187104225159, + "learning_rate": 0.00015654370350970894, + "loss": 0.3836, + "step": 30830 + }, + { + "epoch": 1.030197755211117, + "grad_norm": 0.7127656936645508, + "learning_rate": 0.00015651332872157543, + "loss": 0.4218, + "step": 30840 + }, + { + "epoch": 1.0305318011758418, + "grad_norm": 0.6611000895500183, + "learning_rate": 0.00015648294627078335, + "loss": 0.3855, + "step": 30850 + }, + { + "epoch": 1.0308658471405665, + "grad_norm": 0.7442795038223267, + "learning_rate": 0.0001564525561614522, + "loss": 0.4063, + "step": 30860 + }, + { + "epoch": 1.0311998931052913, + "grad_norm": 0.6155968308448792, + "learning_rate": 0.0001564221583977026, + "loss": 0.425, + "step": 30870 + }, + { + "epoch": 1.031533939070016, + "grad_norm": 0.7347699403762817, + "learning_rate": 0.00015639175298365617, + "loss": 0.4114, + "step": 30880 + }, + { + "epoch": 1.0318679850347408, + "grad_norm": 0.7505794763565063, + "learning_rate": 0.00015636133992343566, + "loss": 0.4111, + "step": 30890 + }, + { + "epoch": 1.0322020309994655, + "grad_norm": 0.6944608092308044, + "learning_rate": 0.00015633091922116475, + "loss": 0.3684, + "step": 30900 + }, + { + "epoch": 1.0325360769641903, + "grad_norm": 0.5945519208908081, + "learning_rate": 0.00015630049088096818, + "loss": 0.3859, + "step": 30910 + }, + { + "epoch": 1.032870122928915, + "grad_norm": 0.6334394216537476, + "learning_rate": 0.00015627005490697173, + "loss": 0.4042, + "step": 30920 + }, + { + "epoch": 1.0332041688936398, + "grad_norm": 0.7385438680648804, + "learning_rate": 0.00015623961130330224, + "loss": 0.3852, + "step": 30930 + }, + { + "epoch": 1.0335382148583645, + "grad_norm": 0.6991183757781982, + "learning_rate": 0.00015620916007408757, + "loss": 0.3724, + "step": 30940 + }, + { + "epoch": 1.0338722608230893, + "grad_norm": 0.8334141373634338, + "learning_rate": 0.00015617870122345656, + "loss": 0.4137, + "step": 30950 + }, + { + "epoch": 1.034206306787814, + "grad_norm": 0.6598323583602905, + "learning_rate": 0.00015614823475553922, + "loss": 0.3998, + "step": 30960 + }, + { + "epoch": 1.0345403527525387, + "grad_norm": 0.6702103614807129, + "learning_rate": 0.00015611776067446647, + "loss": 0.3703, + "step": 30970 + }, + { + "epoch": 1.0348743987172635, + "grad_norm": 0.5996959805488586, + "learning_rate": 0.00015608727898437029, + "loss": 0.3907, + "step": 30980 + }, + { + "epoch": 1.0352084446819882, + "grad_norm": 0.6318657994270325, + "learning_rate": 0.0001560567896893837, + "loss": 0.4156, + "step": 30990 + }, + { + "epoch": 1.035542490646713, + "grad_norm": 0.6210951805114746, + "learning_rate": 0.00015602629279364073, + "loss": 0.3804, + "step": 31000 + }, + { + "epoch": 1.035542490646713, + "eval_chrf": 82.08934899962253, + "eval_loss": 0.7352509498596191, + "eval_runtime": 212.3956, + "eval_samples_per_second": 0.471, + "eval_steps_per_second": 0.019, + "step": 31000 + }, + { + "epoch": 1.0358765366114377, + "grad_norm": 0.6518322825431824, + "learning_rate": 0.00015599578830127655, + "loss": 0.3825, + "step": 31010 + }, + { + "epoch": 1.0362105825761625, + "grad_norm": 0.7735515236854553, + "learning_rate": 0.00015596527621642716, + "loss": 0.3971, + "step": 31020 + }, + { + "epoch": 1.0365446285408872, + "grad_norm": 0.5995593070983887, + "learning_rate": 0.00015593475654322977, + "loss": 0.3785, + "step": 31030 + }, + { + "epoch": 1.036878674505612, + "grad_norm": 0.7210389971733093, + "learning_rate": 0.00015590422928582256, + "loss": 0.4059, + "step": 31040 + }, + { + "epoch": 1.0372127204703367, + "grad_norm": 0.6442881226539612, + "learning_rate": 0.00015587369444834467, + "loss": 0.42, + "step": 31050 + }, + { + "epoch": 1.0375467664350615, + "grad_norm": 0.6314358711242676, + "learning_rate": 0.0001558431520349364, + "loss": 0.4026, + "step": 31060 + }, + { + "epoch": 1.0378808123997862, + "grad_norm": 0.6613219380378723, + "learning_rate": 0.00015581260204973894, + "loss": 0.392, + "step": 31070 + }, + { + "epoch": 1.038214858364511, + "grad_norm": 0.8239825963973999, + "learning_rate": 0.00015578204449689462, + "loss": 0.3795, + "step": 31080 + }, + { + "epoch": 1.0385489043292357, + "grad_norm": 0.5313824415206909, + "learning_rate": 0.0001557514793805467, + "loss": 0.4263, + "step": 31090 + }, + { + "epoch": 1.0388829502939605, + "grad_norm": 0.804043173789978, + "learning_rate": 0.00015572090670483958, + "loss": 0.388, + "step": 31100 + }, + { + "epoch": 1.0392169962586852, + "grad_norm": 0.5817767381668091, + "learning_rate": 0.00015569032647391856, + "loss": 0.4055, + "step": 31110 + }, + { + "epoch": 1.03955104222341, + "grad_norm": 0.7274056673049927, + "learning_rate": 0.00015565973869193005, + "loss": 0.4263, + "step": 31120 + }, + { + "epoch": 1.0398850881881347, + "grad_norm": 0.6297281980514526, + "learning_rate": 0.00015562914336302144, + "loss": 0.392, + "step": 31130 + }, + { + "epoch": 1.0402191341528595, + "grad_norm": 0.7284983396530151, + "learning_rate": 0.00015559854049134115, + "loss": 0.3962, + "step": 31140 + }, + { + "epoch": 1.0405531801175842, + "grad_norm": 0.7238017916679382, + "learning_rate": 0.00015556793008103867, + "loss": 0.3934, + "step": 31150 + }, + { + "epoch": 1.040887226082309, + "grad_norm": 0.6182476282119751, + "learning_rate": 0.00015553731213626442, + "loss": 0.3911, + "step": 31160 + }, + { + "epoch": 1.0412212720470337, + "grad_norm": 0.7227134704589844, + "learning_rate": 0.00015550668666116998, + "loss": 0.3741, + "step": 31170 + }, + { + "epoch": 1.0415553180117585, + "grad_norm": 0.6205272674560547, + "learning_rate": 0.0001554760536599078, + "loss": 0.3718, + "step": 31180 + }, + { + "epoch": 1.0418893639764832, + "grad_norm": 0.7476280927658081, + "learning_rate": 0.00015544541313663145, + "loss": 0.388, + "step": 31190 + }, + { + "epoch": 1.042223409941208, + "grad_norm": 0.6032068133354187, + "learning_rate": 0.00015541476509549544, + "loss": 0.4267, + "step": 31200 + }, + { + "epoch": 1.0425574559059327, + "grad_norm": 0.6466596722602844, + "learning_rate": 0.00015538410954065538, + "loss": 0.4195, + "step": 31210 + }, + { + "epoch": 1.0428915018706575, + "grad_norm": 0.6463454365730286, + "learning_rate": 0.00015535344647626785, + "loss": 0.3788, + "step": 31220 + }, + { + "epoch": 1.0432255478353822, + "grad_norm": 0.6143805980682373, + "learning_rate": 0.00015532277590649053, + "loss": 0.3771, + "step": 31230 + }, + { + "epoch": 1.043559593800107, + "grad_norm": 0.6342659592628479, + "learning_rate": 0.00015529209783548197, + "loss": 0.4019, + "step": 31240 + }, + { + "epoch": 1.0438936397648317, + "grad_norm": 0.6215687990188599, + "learning_rate": 0.00015526141226740182, + "loss": 0.41, + "step": 31250 + }, + { + "epoch": 1.0442276857295565, + "grad_norm": 0.8896481394767761, + "learning_rate": 0.0001552307192064108, + "loss": 0.4319, + "step": 31260 + }, + { + "epoch": 1.0445617316942812, + "grad_norm": 0.6243941783905029, + "learning_rate": 0.0001552000186566706, + "loss": 0.3989, + "step": 31270 + }, + { + "epoch": 1.044895777659006, + "grad_norm": 0.587960422039032, + "learning_rate": 0.00015516931062234382, + "loss": 0.3898, + "step": 31280 + }, + { + "epoch": 1.0452298236237305, + "grad_norm": 0.733153760433197, + "learning_rate": 0.00015513859510759428, + "loss": 0.3958, + "step": 31290 + }, + { + "epoch": 1.0455638695884555, + "grad_norm": 0.7125657200813293, + "learning_rate": 0.0001551078721165866, + "loss": 0.3944, + "step": 31300 + }, + { + "epoch": 1.04589791555318, + "grad_norm": 0.6918461322784424, + "learning_rate": 0.00015507714165348662, + "loss": 0.3609, + "step": 31310 + }, + { + "epoch": 1.0462319615179048, + "grad_norm": 0.6127220392227173, + "learning_rate": 0.00015504640372246105, + "loss": 0.3843, + "step": 31320 + }, + { + "epoch": 1.0465660074826295, + "grad_norm": 0.7536218166351318, + "learning_rate": 0.00015501565832767763, + "loss": 0.4181, + "step": 31330 + }, + { + "epoch": 1.0469000534473543, + "grad_norm": 0.6571843028068542, + "learning_rate": 0.00015498490547330516, + "loss": 0.3873, + "step": 31340 + }, + { + "epoch": 1.047234099412079, + "grad_norm": 0.6509032845497131, + "learning_rate": 0.0001549541451635135, + "loss": 0.4208, + "step": 31350 + }, + { + "epoch": 1.0475681453768038, + "grad_norm": 0.764640748500824, + "learning_rate": 0.00015492337740247327, + "loss": 0.3954, + "step": 31360 + }, + { + "epoch": 1.0479021913415285, + "grad_norm": 0.6499558091163635, + "learning_rate": 0.00015489260219435645, + "loss": 0.4008, + "step": 31370 + }, + { + "epoch": 1.0482362373062533, + "grad_norm": 0.6920099854469299, + "learning_rate": 0.0001548618195433358, + "loss": 0.3933, + "step": 31380 + }, + { + "epoch": 1.048570283270978, + "grad_norm": 0.6322662234306335, + "learning_rate": 0.00015483102945358513, + "loss": 0.4085, + "step": 31390 + }, + { + "epoch": 1.0489043292357028, + "grad_norm": 0.6097990274429321, + "learning_rate": 0.00015480023192927927, + "loss": 0.4004, + "step": 31400 + }, + { + "epoch": 1.0492383752004275, + "grad_norm": 0.6452500224113464, + "learning_rate": 0.0001547694269745941, + "loss": 0.4173, + "step": 31410 + }, + { + "epoch": 1.0495724211651523, + "grad_norm": 0.7463740110397339, + "learning_rate": 0.0001547386145937065, + "loss": 0.3915, + "step": 31420 + }, + { + "epoch": 1.049906467129877, + "grad_norm": 0.545926034450531, + "learning_rate": 0.00015470779479079424, + "loss": 0.3816, + "step": 31430 + }, + { + "epoch": 1.0502405130946018, + "grad_norm": 0.5322889089584351, + "learning_rate": 0.00015467696757003627, + "loss": 0.4015, + "step": 31440 + }, + { + "epoch": 1.0505745590593265, + "grad_norm": 0.7112037539482117, + "learning_rate": 0.0001546461329356124, + "loss": 0.377, + "step": 31450 + }, + { + "epoch": 1.0509086050240513, + "grad_norm": 0.6907027959823608, + "learning_rate": 0.00015461529089170356, + "loss": 0.3875, + "step": 31460 + }, + { + "epoch": 1.051242650988776, + "grad_norm": 0.6510934829711914, + "learning_rate": 0.0001545844414424916, + "loss": 0.3823, + "step": 31470 + }, + { + "epoch": 1.0515766969535008, + "grad_norm": 0.7073367834091187, + "learning_rate": 0.00015455358459215944, + "loss": 0.3713, + "step": 31480 + }, + { + "epoch": 1.0519107429182255, + "grad_norm": 0.6821051239967346, + "learning_rate": 0.0001545227203448909, + "loss": 0.3985, + "step": 31490 + }, + { + "epoch": 1.0522447888829503, + "grad_norm": 0.5994327068328857, + "learning_rate": 0.00015449184870487096, + "loss": 0.4249, + "step": 31500 + }, + { + "epoch": 1.052578834847675, + "grad_norm": 0.7246626615524292, + "learning_rate": 0.0001544609696762854, + "loss": 0.4083, + "step": 31510 + }, + { + "epoch": 1.0529128808123998, + "grad_norm": 0.633391261100769, + "learning_rate": 0.00015443008326332126, + "loss": 0.4229, + "step": 31520 + }, + { + "epoch": 1.0532469267771245, + "grad_norm": 0.8004332184791565, + "learning_rate": 0.0001543991894701663, + "loss": 0.4126, + "step": 31530 + }, + { + "epoch": 1.0535809727418493, + "grad_norm": 0.5672710537910461, + "learning_rate": 0.00015436828830100953, + "loss": 0.3897, + "step": 31540 + }, + { + "epoch": 1.053915018706574, + "grad_norm": 0.6330865025520325, + "learning_rate": 0.0001543373797600408, + "loss": 0.3979, + "step": 31550 + }, + { + "epoch": 1.0542490646712988, + "grad_norm": 0.614921510219574, + "learning_rate": 0.00015430646385145095, + "loss": 0.3872, + "step": 31560 + }, + { + "epoch": 1.0545831106360235, + "grad_norm": 0.5669344663619995, + "learning_rate": 0.00015427554057943196, + "loss": 0.3853, + "step": 31570 + }, + { + "epoch": 1.0549171566007483, + "grad_norm": 0.745164155960083, + "learning_rate": 0.00015424460994817673, + "loss": 0.3857, + "step": 31580 + }, + { + "epoch": 1.055251202565473, + "grad_norm": 0.6202241778373718, + "learning_rate": 0.0001542136719618791, + "loss": 0.3953, + "step": 31590 + }, + { + "epoch": 1.0555852485301978, + "grad_norm": 0.6544796824455261, + "learning_rate": 0.000154182726624734, + "loss": 0.3881, + "step": 31600 + }, + { + "epoch": 1.0559192944949225, + "grad_norm": 0.6929325461387634, + "learning_rate": 0.0001541517739409373, + "loss": 0.364, + "step": 31610 + }, + { + "epoch": 1.0562533404596472, + "grad_norm": 0.7343741059303284, + "learning_rate": 0.0001541208139146859, + "loss": 0.4066, + "step": 31620 + }, + { + "epoch": 1.056587386424372, + "grad_norm": 0.6061020493507385, + "learning_rate": 0.0001540898465501776, + "loss": 0.4239, + "step": 31630 + }, + { + "epoch": 1.0569214323890967, + "grad_norm": 0.6738826036453247, + "learning_rate": 0.0001540588718516114, + "loss": 0.3778, + "step": 31640 + }, + { + "epoch": 1.0572554783538215, + "grad_norm": 0.6384398341178894, + "learning_rate": 0.0001540278898231871, + "loss": 0.3862, + "step": 31650 + }, + { + "epoch": 1.0575895243185462, + "grad_norm": 0.5680800080299377, + "learning_rate": 0.00015399690046910552, + "loss": 0.3944, + "step": 31660 + }, + { + "epoch": 1.057923570283271, + "grad_norm": 0.610848605632782, + "learning_rate": 0.0001539659037935686, + "loss": 0.4133, + "step": 31670 + }, + { + "epoch": 1.0582576162479957, + "grad_norm": 0.6607316136360168, + "learning_rate": 0.00015393489980077915, + "loss": 0.3929, + "step": 31680 + }, + { + "epoch": 1.0585916622127205, + "grad_norm": 0.5842889547348022, + "learning_rate": 0.00015390388849494105, + "loss": 0.4335, + "step": 31690 + }, + { + "epoch": 1.0589257081774452, + "grad_norm": 0.685884416103363, + "learning_rate": 0.00015387286988025904, + "loss": 0.391, + "step": 31700 + }, + { + "epoch": 1.05925975414217, + "grad_norm": 0.6031178832054138, + "learning_rate": 0.00015384184396093902, + "loss": 0.4262, + "step": 31710 + }, + { + "epoch": 1.0595938001068947, + "grad_norm": 0.8190531134605408, + "learning_rate": 0.00015381081074118777, + "loss": 0.3755, + "step": 31720 + }, + { + "epoch": 1.0599278460716195, + "grad_norm": 0.6529978513717651, + "learning_rate": 0.0001537797702252131, + "loss": 0.3997, + "step": 31730 + }, + { + "epoch": 1.0602618920363442, + "grad_norm": 0.6262980699539185, + "learning_rate": 0.00015374872241722383, + "loss": 0.3596, + "step": 31740 + }, + { + "epoch": 1.060595938001069, + "grad_norm": 0.7549333572387695, + "learning_rate": 0.0001537176673214297, + "loss": 0.4031, + "step": 31750 + }, + { + "epoch": 1.0609299839657937, + "grad_norm": 0.8256839513778687, + "learning_rate": 0.0001536866049420415, + "loss": 0.3923, + "step": 31760 + }, + { + "epoch": 1.0612640299305185, + "grad_norm": 0.6927961111068726, + "learning_rate": 0.00015365553528327096, + "loss": 0.3942, + "step": 31770 + }, + { + "epoch": 1.0615980758952432, + "grad_norm": 0.6020379066467285, + "learning_rate": 0.0001536244583493309, + "loss": 0.3688, + "step": 31780 + }, + { + "epoch": 1.061932121859968, + "grad_norm": 0.6936122179031372, + "learning_rate": 0.00015359337414443496, + "loss": 0.3955, + "step": 31790 + }, + { + "epoch": 1.0622661678246927, + "grad_norm": 0.6246137022972107, + "learning_rate": 0.00015356228267279792, + "loss": 0.3722, + "step": 31800 + }, + { + "epoch": 1.0626002137894175, + "grad_norm": 0.6276421546936035, + "learning_rate": 0.00015353118393863545, + "loss": 0.3731, + "step": 31810 + }, + { + "epoch": 1.0629342597541422, + "grad_norm": 0.6005711555480957, + "learning_rate": 0.00015350007794616424, + "loss": 0.3995, + "step": 31820 + }, + { + "epoch": 1.063268305718867, + "grad_norm": 0.6880163550376892, + "learning_rate": 0.00015346896469960191, + "loss": 0.4289, + "step": 31830 + }, + { + "epoch": 1.0636023516835917, + "grad_norm": 0.6248577237129211, + "learning_rate": 0.00015343784420316725, + "loss": 0.3881, + "step": 31840 + }, + { + "epoch": 1.0639363976483165, + "grad_norm": 0.7652528882026672, + "learning_rate": 0.00015340671646107978, + "loss": 0.3886, + "step": 31850 + }, + { + "epoch": 1.0642704436130412, + "grad_norm": 0.8710148930549622, + "learning_rate": 0.00015337558147756014, + "loss": 0.3867, + "step": 31860 + }, + { + "epoch": 1.064604489577766, + "grad_norm": 0.790136456489563, + "learning_rate": 0.00015334443925682997, + "loss": 0.3636, + "step": 31870 + }, + { + "epoch": 1.0649385355424907, + "grad_norm": 0.5890563130378723, + "learning_rate": 0.00015331328980311182, + "loss": 0.4112, + "step": 31880 + }, + { + "epoch": 1.0652725815072155, + "grad_norm": 0.7097724676132202, + "learning_rate": 0.00015328213312062927, + "loss": 0.404, + "step": 31890 + }, + { + "epoch": 1.0656066274719402, + "grad_norm": 0.6660045981407166, + "learning_rate": 0.00015325096921360684, + "loss": 0.374, + "step": 31900 + }, + { + "epoch": 1.065940673436665, + "grad_norm": 0.6714134812355042, + "learning_rate": 0.0001532197980862701, + "loss": 0.374, + "step": 31910 + }, + { + "epoch": 1.0662747194013895, + "grad_norm": 0.6448271870613098, + "learning_rate": 0.0001531886197428455, + "loss": 0.3995, + "step": 31920 + }, + { + "epoch": 1.0666087653661145, + "grad_norm": 0.5707643032073975, + "learning_rate": 0.0001531574341875605, + "loss": 0.3946, + "step": 31930 + }, + { + "epoch": 1.066942811330839, + "grad_norm": 0.6564704775810242, + "learning_rate": 0.00015312624142464366, + "loss": 0.3977, + "step": 31940 + }, + { + "epoch": 1.067276857295564, + "grad_norm": 0.49765506386756897, + "learning_rate": 0.0001530950414583243, + "loss": 0.378, + "step": 31950 + }, + { + "epoch": 1.0676109032602885, + "grad_norm": 0.8377857804298401, + "learning_rate": 0.00015306383429283295, + "loss": 0.4, + "step": 31960 + }, + { + "epoch": 1.0679449492250133, + "grad_norm": 0.6857339143753052, + "learning_rate": 0.0001530326199324009, + "loss": 0.3924, + "step": 31970 + }, + { + "epoch": 1.068278995189738, + "grad_norm": 0.5566911101341248, + "learning_rate": 0.00015300139838126056, + "loss": 0.3575, + "step": 31980 + }, + { + "epoch": 1.0686130411544628, + "grad_norm": 0.7227137088775635, + "learning_rate": 0.0001529701696436453, + "loss": 0.4077, + "step": 31990 + }, + { + "epoch": 1.0689470871191875, + "grad_norm": 0.7257761359214783, + "learning_rate": 0.00015293893372378935, + "loss": 0.3885, + "step": 32000 + }, + { + "epoch": 1.0689470871191875, + "eval_chrf": 84.69995094622682, + "eval_loss": 0.7359504699707031, + "eval_runtime": 186.0659, + "eval_samples_per_second": 0.537, + "eval_steps_per_second": 0.021, + "step": 32000 + }, + { + "epoch": 1.0692811330839123, + "grad_norm": 0.6162306666374207, + "learning_rate": 0.00015290769062592803, + "loss": 0.3999, + "step": 32010 + }, + { + "epoch": 1.069615179048637, + "grad_norm": 0.664222002029419, + "learning_rate": 0.00015287644035429763, + "loss": 0.4124, + "step": 32020 + }, + { + "epoch": 1.0699492250133618, + "grad_norm": 0.7247756719589233, + "learning_rate": 0.00015284518291313535, + "loss": 0.3758, + "step": 32030 + }, + { + "epoch": 1.0702832709780865, + "grad_norm": 0.6978458762168884, + "learning_rate": 0.0001528139183066794, + "loss": 0.4007, + "step": 32040 + }, + { + "epoch": 1.0706173169428113, + "grad_norm": 0.755987823009491, + "learning_rate": 0.00015278264653916903, + "loss": 0.3954, + "step": 32050 + }, + { + "epoch": 1.070951362907536, + "grad_norm": 0.6950144171714783, + "learning_rate": 0.00015275136761484426, + "loss": 0.3762, + "step": 32060 + }, + { + "epoch": 1.0712854088722608, + "grad_norm": 0.6735520362854004, + "learning_rate": 0.00015272008153794633, + "loss": 0.3902, + "step": 32070 + }, + { + "epoch": 1.0716194548369855, + "grad_norm": 0.6599205136299133, + "learning_rate": 0.0001526887883127172, + "loss": 0.3829, + "step": 32080 + }, + { + "epoch": 1.0719535008017103, + "grad_norm": 0.6319999098777771, + "learning_rate": 0.00015265748794340007, + "loss": 0.3847, + "step": 32090 + }, + { + "epoch": 1.072287546766435, + "grad_norm": 0.6575327515602112, + "learning_rate": 0.00015262618043423886, + "loss": 0.3843, + "step": 32100 + }, + { + "epoch": 1.0726215927311598, + "grad_norm": 0.5816564559936523, + "learning_rate": 0.00015259486578947864, + "loss": 0.4126, + "step": 32110 + }, + { + "epoch": 1.0729556386958845, + "grad_norm": 0.5660355687141418, + "learning_rate": 0.0001525635440133653, + "loss": 0.3824, + "step": 32120 + }, + { + "epoch": 1.0732896846606093, + "grad_norm": 0.6475275158882141, + "learning_rate": 0.0001525322151101458, + "loss": 0.3764, + "step": 32130 + }, + { + "epoch": 1.073623730625334, + "grad_norm": 0.6561474800109863, + "learning_rate": 0.00015250087908406804, + "loss": 0.3816, + "step": 32140 + }, + { + "epoch": 1.0739577765900588, + "grad_norm": 0.5747151970863342, + "learning_rate": 0.0001524695359393809, + "loss": 0.3822, + "step": 32150 + }, + { + "epoch": 1.0742918225547835, + "grad_norm": 0.819019079208374, + "learning_rate": 0.0001524381856803341, + "loss": 0.4189, + "step": 32160 + }, + { + "epoch": 1.0746258685195083, + "grad_norm": 0.7285705208778381, + "learning_rate": 0.00015240682831117863, + "loss": 0.3937, + "step": 32170 + }, + { + "epoch": 1.074959914484233, + "grad_norm": 0.5573272109031677, + "learning_rate": 0.00015237546383616604, + "loss": 0.386, + "step": 32180 + }, + { + "epoch": 1.0752939604489578, + "grad_norm": 0.7629221677780151, + "learning_rate": 0.0001523440922595492, + "loss": 0.392, + "step": 32190 + }, + { + "epoch": 1.0756280064136825, + "grad_norm": 0.6611957550048828, + "learning_rate": 0.00015231271358558164, + "loss": 0.3939, + "step": 32200 + }, + { + "epoch": 1.0759620523784073, + "grad_norm": 0.7367185950279236, + "learning_rate": 0.0001522813278185182, + "loss": 0.4062, + "step": 32210 + }, + { + "epoch": 1.076296098343132, + "grad_norm": 0.6648398637771606, + "learning_rate": 0.0001522499349626143, + "loss": 0.3819, + "step": 32220 + }, + { + "epoch": 1.0766301443078568, + "grad_norm": 0.6883804798126221, + "learning_rate": 0.00015221853502212662, + "loss": 0.3898, + "step": 32230 + }, + { + "epoch": 1.0769641902725815, + "grad_norm": 0.6462768316268921, + "learning_rate": 0.00015218712800131262, + "loss": 0.3922, + "step": 32240 + }, + { + "epoch": 1.0772982362373063, + "grad_norm": 0.693915843963623, + "learning_rate": 0.0001521557139044308, + "loss": 0.3902, + "step": 32250 + }, + { + "epoch": 1.077632282202031, + "grad_norm": 0.6383981108665466, + "learning_rate": 0.00015212429273574069, + "loss": 0.3974, + "step": 32260 + }, + { + "epoch": 1.0779663281667557, + "grad_norm": 0.7130057215690613, + "learning_rate": 0.0001520928644995026, + "loss": 0.3886, + "step": 32270 + }, + { + "epoch": 1.0783003741314805, + "grad_norm": 0.523904025554657, + "learning_rate": 0.00015206142919997793, + "loss": 0.4164, + "step": 32280 + }, + { + "epoch": 1.0786344200962052, + "grad_norm": 0.8585821390151978, + "learning_rate": 0.00015202998684142894, + "loss": 0.3932, + "step": 32290 + }, + { + "epoch": 1.07896846606093, + "grad_norm": 0.7782782912254333, + "learning_rate": 0.00015199853742811902, + "loss": 0.4066, + "step": 32300 + }, + { + "epoch": 1.0793025120256547, + "grad_norm": 0.6855494976043701, + "learning_rate": 0.00015196708096431233, + "loss": 0.402, + "step": 32310 + }, + { + "epoch": 1.0796365579903795, + "grad_norm": 0.6536297798156738, + "learning_rate": 0.00015193561745427409, + "loss": 0.3988, + "step": 32320 + }, + { + "epoch": 1.0799706039551042, + "grad_norm": 0.5749618411064148, + "learning_rate": 0.0001519041469022704, + "loss": 0.3646, + "step": 32330 + }, + { + "epoch": 1.080304649919829, + "grad_norm": 0.6644771695137024, + "learning_rate": 0.00015187266931256845, + "loss": 0.3859, + "step": 32340 + }, + { + "epoch": 1.0806386958845537, + "grad_norm": 0.6259563565254211, + "learning_rate": 0.0001518411846894362, + "loss": 0.4061, + "step": 32350 + }, + { + "epoch": 1.0809727418492785, + "grad_norm": 0.6828506588935852, + "learning_rate": 0.00015180969303714268, + "loss": 0.3989, + "step": 32360 + }, + { + "epoch": 1.0813067878140032, + "grad_norm": 0.7077738046646118, + "learning_rate": 0.00015177819435995793, + "loss": 0.3943, + "step": 32370 + }, + { + "epoch": 1.081640833778728, + "grad_norm": 0.5492916107177734, + "learning_rate": 0.0001517466886621528, + "loss": 0.4033, + "step": 32380 + }, + { + "epoch": 1.0819748797434527, + "grad_norm": 0.5495889186859131, + "learning_rate": 0.00015171517594799914, + "loss": 0.3849, + "step": 32390 + }, + { + "epoch": 1.0823089257081775, + "grad_norm": 0.8280493021011353, + "learning_rate": 0.0001516836562217698, + "loss": 0.3904, + "step": 32400 + }, + { + "epoch": 1.0826429716729022, + "grad_norm": 0.5687371492385864, + "learning_rate": 0.00015165212948773852, + "loss": 0.4195, + "step": 32410 + }, + { + "epoch": 1.082977017637627, + "grad_norm": 0.6446851491928101, + "learning_rate": 0.00015162059575018008, + "loss": 0.3672, + "step": 32420 + }, + { + "epoch": 1.0833110636023517, + "grad_norm": 0.6955679059028625, + "learning_rate": 0.0001515890550133701, + "loss": 0.3956, + "step": 32430 + }, + { + "epoch": 1.0836451095670765, + "grad_norm": 0.6464055180549622, + "learning_rate": 0.0001515575072815852, + "loss": 0.3799, + "step": 32440 + }, + { + "epoch": 1.0839791555318012, + "grad_norm": 0.721744179725647, + "learning_rate": 0.00015152595255910292, + "loss": 0.3888, + "step": 32450 + }, + { + "epoch": 1.084313201496526, + "grad_norm": 0.6234685182571411, + "learning_rate": 0.00015149439085020188, + "loss": 0.4158, + "step": 32460 + }, + { + "epoch": 1.0846472474612507, + "grad_norm": 0.6967000365257263, + "learning_rate": 0.00015146282215916138, + "loss": 0.4147, + "step": 32470 + }, + { + "epoch": 1.0849812934259755, + "grad_norm": 0.6546948552131653, + "learning_rate": 0.00015143124649026198, + "loss": 0.3921, + "step": 32480 + }, + { + "epoch": 1.0853153393907002, + "grad_norm": 0.657715916633606, + "learning_rate": 0.00015139966384778493, + "loss": 0.3901, + "step": 32490 + }, + { + "epoch": 1.085649385355425, + "grad_norm": 0.6768990755081177, + "learning_rate": 0.00015136807423601263, + "loss": 0.4034, + "step": 32500 + }, + { + "epoch": 1.0859834313201497, + "grad_norm": 0.6087000370025635, + "learning_rate": 0.0001513364776592282, + "loss": 0.3999, + "step": 32510 + }, + { + "epoch": 1.0863174772848745, + "grad_norm": 0.6712257266044617, + "learning_rate": 0.00015130487412171593, + "loss": 0.4235, + "step": 32520 + }, + { + "epoch": 1.0866515232495992, + "grad_norm": 0.7128852605819702, + "learning_rate": 0.0001512732636277609, + "loss": 0.3965, + "step": 32530 + }, + { + "epoch": 1.086985569214324, + "grad_norm": 0.7706629037857056, + "learning_rate": 0.0001512416461816492, + "loss": 0.4057, + "step": 32540 + }, + { + "epoch": 1.0873196151790487, + "grad_norm": 0.7071778178215027, + "learning_rate": 0.00015121002178766785, + "loss": 0.4103, + "step": 32550 + }, + { + "epoch": 1.0876536611437735, + "grad_norm": 0.559522807598114, + "learning_rate": 0.00015117839045010483, + "loss": 0.3635, + "step": 32560 + }, + { + "epoch": 1.087987707108498, + "grad_norm": 0.6901444792747498, + "learning_rate": 0.000151146752173249, + "loss": 0.3812, + "step": 32570 + }, + { + "epoch": 1.088321753073223, + "grad_norm": 0.719542920589447, + "learning_rate": 0.00015111510696139027, + "loss": 0.4033, + "step": 32580 + }, + { + "epoch": 1.0886557990379475, + "grad_norm": 0.6495571136474609, + "learning_rate": 0.00015108345481881932, + "loss": 0.3917, + "step": 32590 + }, + { + "epoch": 1.0889898450026725, + "grad_norm": 0.7068628668785095, + "learning_rate": 0.00015105179574982796, + "loss": 0.3927, + "step": 32600 + }, + { + "epoch": 1.089323890967397, + "grad_norm": 0.6231522560119629, + "learning_rate": 0.00015102012975870882, + "loss": 0.4149, + "step": 32610 + }, + { + "epoch": 1.0896579369321218, + "grad_norm": 0.5850902795791626, + "learning_rate": 0.00015098845684975552, + "loss": 0.3912, + "step": 32620 + }, + { + "epoch": 1.0899919828968465, + "grad_norm": 0.6033743023872375, + "learning_rate": 0.0001509567770272626, + "loss": 0.4063, + "step": 32630 + }, + { + "epoch": 1.0903260288615713, + "grad_norm": 0.7030525207519531, + "learning_rate": 0.00015092509029552547, + "loss": 0.4002, + "step": 32640 + }, + { + "epoch": 1.090660074826296, + "grad_norm": 0.7140531539916992, + "learning_rate": 0.00015089339665884063, + "loss": 0.365, + "step": 32650 + }, + { + "epoch": 1.0909941207910208, + "grad_norm": 0.6535202860832214, + "learning_rate": 0.0001508616961215054, + "loss": 0.3611, + "step": 32660 + }, + { + "epoch": 1.0913281667557455, + "grad_norm": 0.6521779894828796, + "learning_rate": 0.00015082998868781806, + "loss": 0.3758, + "step": 32670 + }, + { + "epoch": 1.0916622127204703, + "grad_norm": 0.6749773025512695, + "learning_rate": 0.00015079827436207782, + "loss": 0.3892, + "step": 32680 + }, + { + "epoch": 1.091996258685195, + "grad_norm": 0.6513385772705078, + "learning_rate": 0.00015076655314858487, + "loss": 0.372, + "step": 32690 + }, + { + "epoch": 1.0923303046499198, + "grad_norm": 0.5982686281204224, + "learning_rate": 0.0001507348250516402, + "loss": 0.422, + "step": 32700 + }, + { + "epoch": 1.0926643506146445, + "grad_norm": 0.5826268792152405, + "learning_rate": 0.00015070309007554598, + "loss": 0.3733, + "step": 32710 + }, + { + "epoch": 1.0929983965793693, + "grad_norm": 0.5771560668945312, + "learning_rate": 0.00015067134822460503, + "loss": 0.4049, + "step": 32720 + }, + { + "epoch": 1.093332442544094, + "grad_norm": 0.6471542119979858, + "learning_rate": 0.00015063959950312135, + "loss": 0.3972, + "step": 32730 + }, + { + "epoch": 1.0936664885088188, + "grad_norm": 0.5773517489433289, + "learning_rate": 0.00015060784391539967, + "loss": 0.3787, + "step": 32740 + }, + { + "epoch": 1.0940005344735435, + "grad_norm": 0.7075666785240173, + "learning_rate": 0.0001505760814657458, + "loss": 0.351, + "step": 32750 + }, + { + "epoch": 1.0943345804382683, + "grad_norm": 0.625497043132782, + "learning_rate": 0.00015054431215846635, + "loss": 0.3894, + "step": 32760 + }, + { + "epoch": 1.094668626402993, + "grad_norm": 0.7433095574378967, + "learning_rate": 0.00015051253599786903, + "loss": 0.3975, + "step": 32770 + }, + { + "epoch": 1.0950026723677178, + "grad_norm": 0.5025230050086975, + "learning_rate": 0.00015048075298826227, + "loss": 0.3793, + "step": 32780 + }, + { + "epoch": 1.0953367183324425, + "grad_norm": 0.6576723456382751, + "learning_rate": 0.0001504489631339556, + "loss": 0.3781, + "step": 32790 + }, + { + "epoch": 1.0956707642971673, + "grad_norm": 0.7078614234924316, + "learning_rate": 0.00015041716643925942, + "loss": 0.379, + "step": 32800 + }, + { + "epoch": 1.096004810261892, + "grad_norm": 0.658562421798706, + "learning_rate": 0.00015038536290848503, + "loss": 0.4095, + "step": 32810 + }, + { + "epoch": 1.0963388562266168, + "grad_norm": 0.5888537764549255, + "learning_rate": 0.00015035355254594472, + "loss": 0.4045, + "step": 32820 + }, + { + "epoch": 1.0966729021913415, + "grad_norm": 0.5991591215133667, + "learning_rate": 0.00015032173535595159, + "loss": 0.4094, + "step": 32830 + }, + { + "epoch": 1.0970069481560663, + "grad_norm": 0.5492830276489258, + "learning_rate": 0.00015028991134281982, + "loss": 0.3787, + "step": 32840 + }, + { + "epoch": 1.097340994120791, + "grad_norm": 0.6638930439949036, + "learning_rate": 0.0001502580805108644, + "loss": 0.3748, + "step": 32850 + }, + { + "epoch": 1.0976750400855158, + "grad_norm": 0.817823052406311, + "learning_rate": 0.00015022624286440125, + "loss": 0.4073, + "step": 32860 + }, + { + "epoch": 1.0980090860502405, + "grad_norm": 0.7436062693595886, + "learning_rate": 0.0001501943984077473, + "loss": 0.4073, + "step": 32870 + }, + { + "epoch": 1.0983431320149653, + "grad_norm": 0.5771978497505188, + "learning_rate": 0.00015016254714522034, + "loss": 0.4106, + "step": 32880 + }, + { + "epoch": 1.09867717797969, + "grad_norm": 0.6777241230010986, + "learning_rate": 0.00015013068908113905, + "loss": 0.3944, + "step": 32890 + }, + { + "epoch": 1.0990112239444148, + "grad_norm": 0.6946690678596497, + "learning_rate": 0.00015009882421982313, + "loss": 0.3967, + "step": 32900 + }, + { + "epoch": 1.0993452699091395, + "grad_norm": 0.6860179901123047, + "learning_rate": 0.0001500669525655931, + "loss": 0.4147, + "step": 32910 + }, + { + "epoch": 1.0996793158738642, + "grad_norm": 0.7347412705421448, + "learning_rate": 0.00015003507412277053, + "loss": 0.3899, + "step": 32920 + }, + { + "epoch": 1.100013361838589, + "grad_norm": 0.6461763978004456, + "learning_rate": 0.0001500031888956777, + "loss": 0.4065, + "step": 32930 + }, + { + "epoch": 1.1003474078033137, + "grad_norm": 0.6525400876998901, + "learning_rate": 0.00014997129688863804, + "loss": 0.3833, + "step": 32940 + }, + { + "epoch": 1.1006814537680385, + "grad_norm": 0.7523420453071594, + "learning_rate": 0.00014993939810597576, + "loss": 0.3902, + "step": 32950 + }, + { + "epoch": 1.1010154997327632, + "grad_norm": 0.6981995701789856, + "learning_rate": 0.000149907492552016, + "loss": 0.4049, + "step": 32960 + }, + { + "epoch": 1.101349545697488, + "grad_norm": 0.6987504363059998, + "learning_rate": 0.00014987558023108486, + "loss": 0.4111, + "step": 32970 + }, + { + "epoch": 1.1016835916622127, + "grad_norm": 0.5821407437324524, + "learning_rate": 0.0001498436611475094, + "loss": 0.3997, + "step": 32980 + }, + { + "epoch": 1.1020176376269375, + "grad_norm": 0.7568138837814331, + "learning_rate": 0.00014981173530561744, + "loss": 0.3948, + "step": 32990 + }, + { + "epoch": 1.1023516835916622, + "grad_norm": 0.7167226076126099, + "learning_rate": 0.00014977980270973788, + "loss": 0.3813, + "step": 33000 + }, + { + "epoch": 1.1023516835916622, + "eval_chrf": 85.27433800938137, + "eval_loss": 0.7324214577674866, + "eval_runtime": 104.0107, + "eval_samples_per_second": 0.961, + "eval_steps_per_second": 0.038, + "step": 33000 + }, + { + "epoch": 1.102685729556387, + "grad_norm": 0.7366155385971069, + "learning_rate": 0.00014974786336420043, + "loss": 0.3944, + "step": 33010 + }, + { + "epoch": 1.1030197755211117, + "grad_norm": 0.6513412594795227, + "learning_rate": 0.0001497159172733358, + "loss": 0.3832, + "step": 33020 + }, + { + "epoch": 1.1033538214858365, + "grad_norm": 0.7345359325408936, + "learning_rate": 0.00014968396444147553, + "loss": 0.3885, + "step": 33030 + }, + { + "epoch": 1.1036878674505612, + "grad_norm": 0.5979567766189575, + "learning_rate": 0.00014965200487295212, + "loss": 0.3776, + "step": 33040 + }, + { + "epoch": 1.104021913415286, + "grad_norm": 0.7487165331840515, + "learning_rate": 0.000149620038572099, + "loss": 0.4055, + "step": 33050 + }, + { + "epoch": 1.1043559593800107, + "grad_norm": 0.6932562589645386, + "learning_rate": 0.0001495880655432505, + "loss": 0.4103, + "step": 33060 + }, + { + "epoch": 1.1046900053447355, + "grad_norm": 0.5934482216835022, + "learning_rate": 0.0001495560857907418, + "loss": 0.401, + "step": 33070 + }, + { + "epoch": 1.1050240513094602, + "grad_norm": 0.791140079498291, + "learning_rate": 0.00014952409931890908, + "loss": 0.3621, + "step": 33080 + }, + { + "epoch": 1.105358097274185, + "grad_norm": 0.6272256374359131, + "learning_rate": 0.0001494921061320894, + "loss": 0.3805, + "step": 33090 + }, + { + "epoch": 1.1056921432389097, + "grad_norm": 0.6510805487632751, + "learning_rate": 0.00014946010623462068, + "loss": 0.3924, + "step": 33100 + }, + { + "epoch": 1.1060261892036345, + "grad_norm": 0.7538283467292786, + "learning_rate": 0.00014942809963084186, + "loss": 0.3787, + "step": 33110 + }, + { + "epoch": 1.1063602351683592, + "grad_norm": 0.6314569711685181, + "learning_rate": 0.00014939608632509268, + "loss": 0.3885, + "step": 33120 + }, + { + "epoch": 1.106694281133084, + "grad_norm": 0.7154045104980469, + "learning_rate": 0.00014936406632171383, + "loss": 0.3811, + "step": 33130 + }, + { + "epoch": 1.1070283270978087, + "grad_norm": 0.7610613107681274, + "learning_rate": 0.00014933203962504696, + "loss": 0.43, + "step": 33140 + }, + { + "epoch": 1.1073623730625335, + "grad_norm": 0.6231352090835571, + "learning_rate": 0.00014930000623943453, + "loss": 0.3974, + "step": 33150 + }, + { + "epoch": 1.1076964190272582, + "grad_norm": 0.5586511492729187, + "learning_rate": 0.00014926796616921995, + "loss": 0.3972, + "step": 33160 + }, + { + "epoch": 1.108030464991983, + "grad_norm": 0.6358824968338013, + "learning_rate": 0.0001492359194187476, + "loss": 0.3782, + "step": 33170 + }, + { + "epoch": 1.1083645109567077, + "grad_norm": 0.8141195774078369, + "learning_rate": 0.00014920386599236263, + "loss": 0.4207, + "step": 33180 + }, + { + "epoch": 1.1086985569214325, + "grad_norm": 0.8019670844078064, + "learning_rate": 0.0001491718058944113, + "loss": 0.4099, + "step": 33190 + }, + { + "epoch": 1.1090326028861572, + "grad_norm": 0.6686485409736633, + "learning_rate": 0.0001491397391292405, + "loss": 0.3641, + "step": 33200 + }, + { + "epoch": 1.109366648850882, + "grad_norm": 0.6223136782646179, + "learning_rate": 0.00014910766570119826, + "loss": 0.4171, + "step": 33210 + }, + { + "epoch": 1.1097006948156065, + "grad_norm": 0.6323383450508118, + "learning_rate": 0.00014907558561463337, + "loss": 0.4082, + "step": 33220 + }, + { + "epoch": 1.1100347407803315, + "grad_norm": 0.6057208776473999, + "learning_rate": 0.00014904349887389566, + "loss": 0.4042, + "step": 33230 + }, + { + "epoch": 1.110368786745056, + "grad_norm": 0.7694222331047058, + "learning_rate": 0.0001490114054833357, + "loss": 0.4146, + "step": 33240 + }, + { + "epoch": 1.1107028327097808, + "grad_norm": 0.6725189089775085, + "learning_rate": 0.00014897930544730507, + "loss": 0.4284, + "step": 33250 + }, + { + "epoch": 1.1110368786745055, + "grad_norm": 0.7597146034240723, + "learning_rate": 0.00014894719877015623, + "loss": 0.402, + "step": 33260 + }, + { + "epoch": 1.1113709246392303, + "grad_norm": 0.6628609299659729, + "learning_rate": 0.00014891508545624254, + "loss": 0.3965, + "step": 33270 + }, + { + "epoch": 1.111704970603955, + "grad_norm": 0.6652149558067322, + "learning_rate": 0.00014888296550991822, + "loss": 0.4136, + "step": 33280 + }, + { + "epoch": 1.1120390165686798, + "grad_norm": 0.6821569800376892, + "learning_rate": 0.00014885083893553848, + "loss": 0.3905, + "step": 33290 + }, + { + "epoch": 1.1123730625334045, + "grad_norm": 0.7311185598373413, + "learning_rate": 0.0001488187057374593, + "loss": 0.4582, + "step": 33300 + }, + { + "epoch": 1.1127071084981293, + "grad_norm": 0.8168458342552185, + "learning_rate": 0.0001487865659200377, + "loss": 0.4064, + "step": 33310 + }, + { + "epoch": 1.113041154462854, + "grad_norm": 0.6395614147186279, + "learning_rate": 0.0001487544194876315, + "loss": 0.4102, + "step": 33320 + }, + { + "epoch": 1.1133752004275788, + "grad_norm": 0.7044562101364136, + "learning_rate": 0.0001487222664445994, + "loss": 0.4019, + "step": 33330 + }, + { + "epoch": 1.1137092463923035, + "grad_norm": 0.6341235637664795, + "learning_rate": 0.0001486901067953011, + "loss": 0.3718, + "step": 33340 + }, + { + "epoch": 1.1140432923570283, + "grad_norm": 0.6855657696723938, + "learning_rate": 0.00014865794054409712, + "loss": 0.4111, + "step": 33350 + }, + { + "epoch": 1.114377338321753, + "grad_norm": 0.6411564350128174, + "learning_rate": 0.00014862576769534888, + "loss": 0.4018, + "step": 33360 + }, + { + "epoch": 1.1147113842864778, + "grad_norm": 0.7115310430526733, + "learning_rate": 0.00014859358825341872, + "loss": 0.402, + "step": 33370 + }, + { + "epoch": 1.1150454302512025, + "grad_norm": 0.6870629787445068, + "learning_rate": 0.00014856140222266985, + "loss": 0.4014, + "step": 33380 + }, + { + "epoch": 1.1153794762159273, + "grad_norm": 0.6879727840423584, + "learning_rate": 0.0001485292096074664, + "loss": 0.3677, + "step": 33390 + }, + { + "epoch": 1.115713522180652, + "grad_norm": 0.5879802107810974, + "learning_rate": 0.00014849701041217333, + "loss": 0.3726, + "step": 33400 + }, + { + "epoch": 1.1160475681453768, + "grad_norm": 0.7522575259208679, + "learning_rate": 0.0001484648046411566, + "loss": 0.3913, + "step": 33410 + }, + { + "epoch": 1.1163816141101015, + "grad_norm": 0.6429845094680786, + "learning_rate": 0.00014843259229878293, + "loss": 0.3877, + "step": 33420 + }, + { + "epoch": 1.1167156600748263, + "grad_norm": 0.6640375852584839, + "learning_rate": 0.00014840037338942008, + "loss": 0.3639, + "step": 33430 + }, + { + "epoch": 1.117049706039551, + "grad_norm": 0.6661775708198547, + "learning_rate": 0.00014836814791743653, + "loss": 0.4081, + "step": 33440 + }, + { + "epoch": 1.1173837520042758, + "grad_norm": 0.7435650825500488, + "learning_rate": 0.0001483359158872019, + "loss": 0.4118, + "step": 33450 + }, + { + "epoch": 1.1177177979690005, + "grad_norm": 0.6244760751724243, + "learning_rate": 0.00014830367730308633, + "loss": 0.3947, + "step": 33460 + }, + { + "epoch": 1.1180518439337253, + "grad_norm": 0.8154481649398804, + "learning_rate": 0.00014827143216946116, + "loss": 0.3833, + "step": 33470 + }, + { + "epoch": 1.11838588989845, + "grad_norm": 0.6931558847427368, + "learning_rate": 0.00014823918049069857, + "loss": 0.3904, + "step": 33480 + }, + { + "epoch": 1.1187199358631748, + "grad_norm": 0.6131518483161926, + "learning_rate": 0.0001482069222711715, + "loss": 0.3915, + "step": 33490 + }, + { + "epoch": 1.1190539818278995, + "grad_norm": 0.6391249299049377, + "learning_rate": 0.0001481746575152539, + "loss": 0.4099, + "step": 33500 + }, + { + "epoch": 1.1193880277926243, + "grad_norm": 0.5890313982963562, + "learning_rate": 0.0001481423862273205, + "loss": 0.3934, + "step": 33510 + }, + { + "epoch": 1.119722073757349, + "grad_norm": 0.6168562173843384, + "learning_rate": 0.00014811010841174702, + "loss": 0.3691, + "step": 33520 + }, + { + "epoch": 1.1200561197220738, + "grad_norm": 0.5263534784317017, + "learning_rate": 0.00014807782407291002, + "loss": 0.3736, + "step": 33530 + }, + { + "epoch": 1.1203901656867985, + "grad_norm": 0.6851488351821899, + "learning_rate": 0.0001480455332151869, + "loss": 0.4431, + "step": 33540 + }, + { + "epoch": 1.1207242116515232, + "grad_norm": 0.5882483124732971, + "learning_rate": 0.000148013235842956, + "loss": 0.3651, + "step": 33550 + }, + { + "epoch": 1.121058257616248, + "grad_norm": 0.5404395461082458, + "learning_rate": 0.00014798093196059656, + "loss": 0.3904, + "step": 33560 + }, + { + "epoch": 1.1213923035809727, + "grad_norm": 0.7165801525115967, + "learning_rate": 0.0001479486215724887, + "loss": 0.4011, + "step": 33570 + }, + { + "epoch": 1.1217263495456975, + "grad_norm": 0.7061018943786621, + "learning_rate": 0.0001479163046830133, + "loss": 0.3964, + "step": 33580 + }, + { + "epoch": 1.1220603955104222, + "grad_norm": 0.6029189825057983, + "learning_rate": 0.00014788398129655228, + "loss": 0.3923, + "step": 33590 + }, + { + "epoch": 1.122394441475147, + "grad_norm": 0.8030551075935364, + "learning_rate": 0.00014785165141748837, + "loss": 0.3915, + "step": 33600 + }, + { + "epoch": 1.1227284874398717, + "grad_norm": 0.6968623995780945, + "learning_rate": 0.00014781931505020518, + "loss": 0.3535, + "step": 33610 + }, + { + "epoch": 1.1230625334045965, + "grad_norm": 0.7178488969802856, + "learning_rate": 0.00014778697219908722, + "loss": 0.3722, + "step": 33620 + }, + { + "epoch": 1.1233965793693212, + "grad_norm": 0.6682683229446411, + "learning_rate": 0.00014775462286851986, + "loss": 0.3958, + "step": 33630 + }, + { + "epoch": 1.123730625334046, + "grad_norm": 0.593487560749054, + "learning_rate": 0.00014772226706288933, + "loss": 0.4178, + "step": 33640 + }, + { + "epoch": 1.1240646712987707, + "grad_norm": 0.7352398037910461, + "learning_rate": 0.0001476899047865828, + "loss": 0.3843, + "step": 33650 + }, + { + "epoch": 1.1243987172634955, + "grad_norm": 0.6165003180503845, + "learning_rate": 0.00014765753604398826, + "loss": 0.403, + "step": 33660 + }, + { + "epoch": 1.1247327632282202, + "grad_norm": 0.5676947236061096, + "learning_rate": 0.00014762516083949458, + "loss": 0.3529, + "step": 33670 + }, + { + "epoch": 1.125066809192945, + "grad_norm": 0.6254270672798157, + "learning_rate": 0.0001475927791774916, + "loss": 0.3709, + "step": 33680 + }, + { + "epoch": 1.1254008551576697, + "grad_norm": 0.7642256617546082, + "learning_rate": 0.00014756039106236985, + "loss": 0.3717, + "step": 33690 + }, + { + "epoch": 1.1257349011223945, + "grad_norm": 0.5167163610458374, + "learning_rate": 0.00014752799649852096, + "loss": 0.3925, + "step": 33700 + }, + { + "epoch": 1.1260689470871192, + "grad_norm": 0.8166671395301819, + "learning_rate": 0.0001474955954903372, + "loss": 0.3798, + "step": 33710 + }, + { + "epoch": 1.126402993051844, + "grad_norm": 0.6013413667678833, + "learning_rate": 0.00014746318804221192, + "loss": 0.3901, + "step": 33720 + }, + { + "epoch": 1.1267370390165687, + "grad_norm": 0.5557494163513184, + "learning_rate": 0.00014743077415853918, + "loss": 0.4043, + "step": 33730 + }, + { + "epoch": 1.1270710849812935, + "grad_norm": 0.671737790107727, + "learning_rate": 0.00014739835384371407, + "loss": 0.3675, + "step": 33740 + }, + { + "epoch": 1.1274051309460182, + "grad_norm": 0.6521509885787964, + "learning_rate": 0.00014736592710213245, + "loss": 0.4122, + "step": 33750 + }, + { + "epoch": 1.127739176910743, + "grad_norm": 0.6441331505775452, + "learning_rate": 0.00014733349393819103, + "loss": 0.3551, + "step": 33760 + }, + { + "epoch": 1.1280732228754677, + "grad_norm": 0.6966383457183838, + "learning_rate": 0.00014730105435628745, + "loss": 0.4, + "step": 33770 + }, + { + "epoch": 1.1284072688401925, + "grad_norm": 0.6316562294960022, + "learning_rate": 0.0001472686083608202, + "loss": 0.3947, + "step": 33780 + }, + { + "epoch": 1.1287413148049172, + "grad_norm": 0.6967915296554565, + "learning_rate": 0.00014723615595618866, + "loss": 0.4309, + "step": 33790 + }, + { + "epoch": 1.129075360769642, + "grad_norm": 0.5798444747924805, + "learning_rate": 0.0001472036971467931, + "loss": 0.3844, + "step": 33800 + }, + { + "epoch": 1.1294094067343667, + "grad_norm": 0.7475165128707886, + "learning_rate": 0.0001471712319370345, + "loss": 0.4392, + "step": 33810 + }, + { + "epoch": 1.1297434526990915, + "grad_norm": 0.7218406796455383, + "learning_rate": 0.00014713876033131492, + "loss": 0.3886, + "step": 33820 + }, + { + "epoch": 1.1300774986638162, + "grad_norm": 0.6008209586143494, + "learning_rate": 0.00014710628233403716, + "loss": 0.4111, + "step": 33830 + }, + { + "epoch": 1.130411544628541, + "grad_norm": 0.6526086926460266, + "learning_rate": 0.00014707379794960494, + "loss": 0.3733, + "step": 33840 + }, + { + "epoch": 1.1307455905932655, + "grad_norm": 0.7749031186103821, + "learning_rate": 0.00014704130718242283, + "loss": 0.4141, + "step": 33850 + }, + { + "epoch": 1.1310796365579905, + "grad_norm": 0.6762859225273132, + "learning_rate": 0.00014700881003689627, + "loss": 0.4139, + "step": 33860 + }, + { + "epoch": 1.131413682522715, + "grad_norm": 0.6790890097618103, + "learning_rate": 0.0001469763065174315, + "loss": 0.398, + "step": 33870 + }, + { + "epoch": 1.13174772848744, + "grad_norm": 0.6031625270843506, + "learning_rate": 0.00014694379662843574, + "loss": 0.3926, + "step": 33880 + }, + { + "epoch": 1.1320817744521645, + "grad_norm": 0.6534963250160217, + "learning_rate": 0.000146911280374317, + "loss": 0.4093, + "step": 33890 + }, + { + "epoch": 1.1324158204168895, + "grad_norm": 0.6669240593910217, + "learning_rate": 0.00014687875775948417, + "loss": 0.3797, + "step": 33900 + }, + { + "epoch": 1.132749866381614, + "grad_norm": 0.7323751449584961, + "learning_rate": 0.00014684622878834697, + "loss": 0.3784, + "step": 33910 + }, + { + "epoch": 1.1330839123463388, + "grad_norm": 0.6974325180053711, + "learning_rate": 0.00014681369346531607, + "loss": 0.4077, + "step": 33920 + }, + { + "epoch": 1.1334179583110635, + "grad_norm": 0.5742664337158203, + "learning_rate": 0.0001467811517948029, + "loss": 0.3878, + "step": 33930 + }, + { + "epoch": 1.1337520042757883, + "grad_norm": 0.7360290288925171, + "learning_rate": 0.0001467486037812198, + "loss": 0.417, + "step": 33940 + }, + { + "epoch": 1.134086050240513, + "grad_norm": 0.6318567395210266, + "learning_rate": 0.00014671604942897998, + "loss": 0.3879, + "step": 33950 + }, + { + "epoch": 1.1344200962052378, + "grad_norm": 0.6107730269432068, + "learning_rate": 0.00014668348874249747, + "loss": 0.3895, + "step": 33960 + }, + { + "epoch": 1.1347541421699625, + "grad_norm": 0.6970890760421753, + "learning_rate": 0.00014665092172618722, + "loss": 0.4076, + "step": 33970 + }, + { + "epoch": 1.1350881881346873, + "grad_norm": 0.585902988910675, + "learning_rate": 0.00014661834838446496, + "loss": 0.3868, + "step": 33980 + }, + { + "epoch": 1.135422234099412, + "grad_norm": 0.6418012380599976, + "learning_rate": 0.00014658576872174733, + "loss": 0.3795, + "step": 33990 + }, + { + "epoch": 1.1357562800641368, + "grad_norm": 0.5973289012908936, + "learning_rate": 0.00014655318274245184, + "loss": 0.4051, + "step": 34000 + }, + { + "epoch": 1.1357562800641368, + "eval_chrf": 85.6261819758287, + "eval_loss": 0.7260558605194092, + "eval_runtime": 122.1238, + "eval_samples_per_second": 0.819, + "eval_steps_per_second": 0.033, + "step": 34000 + }, + { + "epoch": 1.1360903260288615, + "grad_norm": 0.7079175710678101, + "learning_rate": 0.00014652059045099685, + "loss": 0.4086, + "step": 34010 + }, + { + "epoch": 1.1364243719935863, + "grad_norm": 0.6007031798362732, + "learning_rate": 0.00014648799185180148, + "loss": 0.4024, + "step": 34020 + }, + { + "epoch": 1.136758417958311, + "grad_norm": 0.6341286897659302, + "learning_rate": 0.00014645538694928585, + "loss": 0.3975, + "step": 34030 + }, + { + "epoch": 1.1370924639230358, + "grad_norm": 0.6621591448783875, + "learning_rate": 0.0001464227757478708, + "loss": 0.4078, + "step": 34040 + }, + { + "epoch": 1.1374265098877605, + "grad_norm": 0.7006877064704895, + "learning_rate": 0.0001463901582519782, + "loss": 0.4204, + "step": 34050 + }, + { + "epoch": 1.1377605558524853, + "grad_norm": 0.535763144493103, + "learning_rate": 0.00014635753446603058, + "loss": 0.4001, + "step": 34060 + }, + { + "epoch": 1.13809460181721, + "grad_norm": 0.6579592823982239, + "learning_rate": 0.00014632490439445142, + "loss": 0.4119, + "step": 34070 + }, + { + "epoch": 1.1384286477819348, + "grad_norm": 0.7648515701293945, + "learning_rate": 0.00014629226804166508, + "loss": 0.3872, + "step": 34080 + }, + { + "epoch": 1.1387626937466595, + "grad_norm": 0.5461599826812744, + "learning_rate": 0.00014625962541209671, + "loss": 0.3852, + "step": 34090 + }, + { + "epoch": 1.1390967397113843, + "grad_norm": 0.766132116317749, + "learning_rate": 0.0001462269765101723, + "loss": 0.3991, + "step": 34100 + }, + { + "epoch": 1.139430785676109, + "grad_norm": 0.601845920085907, + "learning_rate": 0.00014619432134031876, + "loss": 0.4139, + "step": 34110 + }, + { + "epoch": 1.1397648316408338, + "grad_norm": 0.8147502541542053, + "learning_rate": 0.00014616165990696383, + "loss": 0.4055, + "step": 34120 + }, + { + "epoch": 1.1400988776055585, + "grad_norm": 0.583740770816803, + "learning_rate": 0.00014612899221453605, + "loss": 0.3923, + "step": 34130 + }, + { + "epoch": 1.1404329235702833, + "grad_norm": 0.5663930773735046, + "learning_rate": 0.00014609631826746484, + "loss": 0.4002, + "step": 34140 + }, + { + "epoch": 1.140766969535008, + "grad_norm": 0.7407366037368774, + "learning_rate": 0.00014606363807018047, + "loss": 0.4107, + "step": 34150 + }, + { + "epoch": 1.1411010154997328, + "grad_norm": 0.6485843062400818, + "learning_rate": 0.00014603095162711411, + "loss": 0.4112, + "step": 34160 + }, + { + "epoch": 1.1414350614644575, + "grad_norm": 0.6844989657402039, + "learning_rate": 0.00014599825894269768, + "loss": 0.3845, + "step": 34170 + }, + { + "epoch": 1.1417691074291823, + "grad_norm": 0.7122927904129028, + "learning_rate": 0.000145965560021364, + "loss": 0.4065, + "step": 34180 + }, + { + "epoch": 1.142103153393907, + "grad_norm": 0.7450219392776489, + "learning_rate": 0.00014593285486754666, + "loss": 0.4194, + "step": 34190 + }, + { + "epoch": 1.1424371993586317, + "grad_norm": 0.6252874135971069, + "learning_rate": 0.0001459001434856803, + "loss": 0.4167, + "step": 34200 + }, + { + "epoch": 1.1427712453233565, + "grad_norm": 0.6802263855934143, + "learning_rate": 0.00014586742588020016, + "loss": 0.3908, + "step": 34210 + }, + { + "epoch": 1.1431052912880812, + "grad_norm": 0.7939043641090393, + "learning_rate": 0.00014583470205554246, + "loss": 0.4068, + "step": 34220 + }, + { + "epoch": 1.143439337252806, + "grad_norm": 0.6337961554527283, + "learning_rate": 0.0001458019720161442, + "loss": 0.3818, + "step": 34230 + }, + { + "epoch": 1.1437733832175307, + "grad_norm": 0.8145526647567749, + "learning_rate": 0.00014576923576644334, + "loss": 0.3754, + "step": 34240 + }, + { + "epoch": 1.1441074291822555, + "grad_norm": 0.626798689365387, + "learning_rate": 0.00014573649331087852, + "loss": 0.3818, + "step": 34250 + }, + { + "epoch": 1.1444414751469802, + "grad_norm": 0.6277692914009094, + "learning_rate": 0.0001457037446538893, + "loss": 0.3823, + "step": 34260 + }, + { + "epoch": 1.144775521111705, + "grad_norm": 0.6918072700500488, + "learning_rate": 0.00014567098979991613, + "loss": 0.4021, + "step": 34270 + }, + { + "epoch": 1.1451095670764297, + "grad_norm": 0.5897974371910095, + "learning_rate": 0.00014563822875340022, + "loss": 0.3731, + "step": 34280 + }, + { + "epoch": 1.1454436130411545, + "grad_norm": 0.6564681529998779, + "learning_rate": 0.00014560546151878365, + "loss": 0.4173, + "step": 34290 + }, + { + "epoch": 1.1457776590058792, + "grad_norm": 0.747519850730896, + "learning_rate": 0.00014557268810050933, + "loss": 0.3974, + "step": 34300 + }, + { + "epoch": 1.146111704970604, + "grad_norm": 0.7929636240005493, + "learning_rate": 0.00014553990850302103, + "loss": 0.3939, + "step": 34310 + }, + { + "epoch": 1.1464457509353287, + "grad_norm": 0.8233813047409058, + "learning_rate": 0.00014550712273076337, + "loss": 0.3867, + "step": 34320 + }, + { + "epoch": 1.1467797969000535, + "grad_norm": 0.649601936340332, + "learning_rate": 0.00014547433078818177, + "loss": 0.3835, + "step": 34330 + }, + { + "epoch": 1.1471138428647782, + "grad_norm": 0.6235836148262024, + "learning_rate": 0.00014544153267972244, + "loss": 0.419, + "step": 34340 + }, + { + "epoch": 1.147447888829503, + "grad_norm": 0.6260568499565125, + "learning_rate": 0.00014540872840983253, + "loss": 0.3649, + "step": 34350 + }, + { + "epoch": 1.1477819347942277, + "grad_norm": 0.6489930152893066, + "learning_rate": 0.00014537591798296002, + "loss": 0.3732, + "step": 34360 + }, + { + "epoch": 1.1481159807589525, + "grad_norm": 0.5193274617195129, + "learning_rate": 0.0001453431014035536, + "loss": 0.4001, + "step": 34370 + }, + { + "epoch": 1.1484500267236772, + "grad_norm": 0.6012530326843262, + "learning_rate": 0.00014531027867606297, + "loss": 0.3909, + "step": 34380 + }, + { + "epoch": 1.148784072688402, + "grad_norm": 0.5265390872955322, + "learning_rate": 0.0001452774498049385, + "loss": 0.3827, + "step": 34390 + }, + { + "epoch": 1.1491181186531267, + "grad_norm": 0.7247859239578247, + "learning_rate": 0.00014524461479463148, + "loss": 0.4139, + "step": 34400 + }, + { + "epoch": 1.1494521646178515, + "grad_norm": 0.6239689588546753, + "learning_rate": 0.00014521177364959408, + "loss": 0.4109, + "step": 34410 + }, + { + "epoch": 1.1497862105825762, + "grad_norm": 0.6319902539253235, + "learning_rate": 0.00014517892637427916, + "loss": 0.3975, + "step": 34420 + }, + { + "epoch": 1.150120256547301, + "grad_norm": 0.6480329036712646, + "learning_rate": 0.00014514607297314055, + "loss": 0.4078, + "step": 34430 + }, + { + "epoch": 1.1504543025120257, + "grad_norm": 0.6118185520172119, + "learning_rate": 0.0001451132134506328, + "loss": 0.378, + "step": 34440 + }, + { + "epoch": 1.1507883484767505, + "grad_norm": 0.5503482222557068, + "learning_rate": 0.0001450803478112114, + "loss": 0.3665, + "step": 34450 + }, + { + "epoch": 1.1511223944414752, + "grad_norm": 0.7252528071403503, + "learning_rate": 0.0001450474760593326, + "loss": 0.3941, + "step": 34460 + }, + { + "epoch": 1.1514564404062, + "grad_norm": 0.6137741804122925, + "learning_rate": 0.0001450145981994534, + "loss": 0.3965, + "step": 34470 + }, + { + "epoch": 1.1517904863709245, + "grad_norm": 0.7802608609199524, + "learning_rate": 0.0001449817142360318, + "loss": 0.3864, + "step": 34480 + }, + { + "epoch": 1.1521245323356495, + "grad_norm": 0.5683929920196533, + "learning_rate": 0.00014494882417352662, + "loss": 0.4092, + "step": 34490 + }, + { + "epoch": 1.152458578300374, + "grad_norm": 0.5983339548110962, + "learning_rate": 0.00014491592801639728, + "loss": 0.3852, + "step": 34500 + }, + { + "epoch": 1.152792624265099, + "grad_norm": 0.7088465094566345, + "learning_rate": 0.00014488302576910427, + "loss": 0.3764, + "step": 34510 + }, + { + "epoch": 1.1531266702298235, + "grad_norm": 0.6307457089424133, + "learning_rate": 0.0001448501174361088, + "loss": 0.3804, + "step": 34520 + }, + { + "epoch": 1.1534607161945485, + "grad_norm": 0.5835682153701782, + "learning_rate": 0.0001448172030218729, + "loss": 0.3893, + "step": 34530 + }, + { + "epoch": 1.153794762159273, + "grad_norm": 0.6112266182899475, + "learning_rate": 0.00014478428253085945, + "loss": 0.379, + "step": 34540 + }, + { + "epoch": 1.154128808123998, + "grad_norm": 0.5913633704185486, + "learning_rate": 0.00014475135596753218, + "loss": 0.3659, + "step": 34550 + }, + { + "epoch": 1.1544628540887225, + "grad_norm": 0.6673291325569153, + "learning_rate": 0.0001447184233363556, + "loss": 0.368, + "step": 34560 + }, + { + "epoch": 1.1547969000534473, + "grad_norm": 0.6723556518554688, + "learning_rate": 0.000144685484641795, + "loss": 0.3712, + "step": 34570 + }, + { + "epoch": 1.155130946018172, + "grad_norm": 0.7449964880943298, + "learning_rate": 0.00014465253988831664, + "loss": 0.3944, + "step": 34580 + }, + { + "epoch": 1.1554649919828968, + "grad_norm": 0.6911473870277405, + "learning_rate": 0.00014461958908038743, + "loss": 0.3924, + "step": 34590 + }, + { + "epoch": 1.1557990379476215, + "grad_norm": 0.731319785118103, + "learning_rate": 0.00014458663222247523, + "loss": 0.3931, + "step": 34600 + }, + { + "epoch": 1.1561330839123463, + "grad_norm": 0.6461872458457947, + "learning_rate": 0.00014455366931904865, + "loss": 0.3924, + "step": 34610 + }, + { + "epoch": 1.156467129877071, + "grad_norm": 0.659043550491333, + "learning_rate": 0.00014452070037457713, + "loss": 0.3886, + "step": 34620 + }, + { + "epoch": 1.1568011758417958, + "grad_norm": 0.6538869738578796, + "learning_rate": 0.00014448772539353097, + "loss": 0.3768, + "step": 34630 + }, + { + "epoch": 1.1571352218065205, + "grad_norm": 0.6356179714202881, + "learning_rate": 0.00014445474438038122, + "loss": 0.3682, + "step": 34640 + }, + { + "epoch": 1.1574692677712453, + "grad_norm": 0.6394898295402527, + "learning_rate": 0.00014442175733959985, + "loss": 0.3979, + "step": 34650 + }, + { + "epoch": 1.15780331373597, + "grad_norm": 0.6987412571907043, + "learning_rate": 0.00014438876427565943, + "loss": 0.3832, + "step": 34660 + }, + { + "epoch": 1.1581373597006948, + "grad_norm": 0.6630870699882507, + "learning_rate": 0.0001443557651930337, + "loss": 0.3574, + "step": 34670 + }, + { + "epoch": 1.1584714056654195, + "grad_norm": 0.907490074634552, + "learning_rate": 0.00014432276009619687, + "loss": 0.4015, + "step": 34680 + }, + { + "epoch": 1.1588054516301443, + "grad_norm": 0.6965689063072205, + "learning_rate": 0.00014428974898962417, + "loss": 0.3885, + "step": 34690 + }, + { + "epoch": 1.159139497594869, + "grad_norm": 0.6783816814422607, + "learning_rate": 0.00014425673187779158, + "loss": 0.3891, + "step": 34700 + }, + { + "epoch": 1.1594735435595938, + "grad_norm": 0.6534827351570129, + "learning_rate": 0.0001442237087651759, + "loss": 0.4209, + "step": 34710 + }, + { + "epoch": 1.1598075895243185, + "grad_norm": 0.7539647817611694, + "learning_rate": 0.0001441906796562547, + "loss": 0.4032, + "step": 34720 + }, + { + "epoch": 1.1601416354890433, + "grad_norm": 0.7617059946060181, + "learning_rate": 0.0001441576445555065, + "loss": 0.377, + "step": 34730 + }, + { + "epoch": 1.160475681453768, + "grad_norm": 0.7518334984779358, + "learning_rate": 0.00014412460346741046, + "loss": 0.3841, + "step": 34740 + }, + { + "epoch": 1.1608097274184928, + "grad_norm": 0.7835471630096436, + "learning_rate": 0.00014409155639644666, + "loss": 0.422, + "step": 34750 + }, + { + "epoch": 1.1611437733832175, + "grad_norm": 0.6011009812355042, + "learning_rate": 0.00014405850334709593, + "loss": 0.3851, + "step": 34760 + }, + { + "epoch": 1.1614778193479423, + "grad_norm": 0.5605944991111755, + "learning_rate": 0.00014402544432383997, + "loss": 0.4015, + "step": 34770 + }, + { + "epoch": 1.161811865312667, + "grad_norm": 0.6632351875305176, + "learning_rate": 0.00014399237933116128, + "loss": 0.4029, + "step": 34780 + }, + { + "epoch": 1.1621459112773918, + "grad_norm": 0.7774651050567627, + "learning_rate": 0.00014395930837354315, + "loss": 0.3701, + "step": 34790 + }, + { + "epoch": 1.1624799572421165, + "grad_norm": 0.7308304309844971, + "learning_rate": 0.00014392623145546965, + "loss": 0.3549, + "step": 34800 + }, + { + "epoch": 1.1628140032068413, + "grad_norm": 0.6493458151817322, + "learning_rate": 0.00014389314858142567, + "loss": 0.3594, + "step": 34810 + }, + { + "epoch": 1.163148049171566, + "grad_norm": 0.6181018948554993, + "learning_rate": 0.000143860059755897, + "loss": 0.3721, + "step": 34820 + }, + { + "epoch": 1.1634820951362908, + "grad_norm": 0.5286789536476135, + "learning_rate": 0.00014382696498337012, + "loss": 0.4216, + "step": 34830 + }, + { + "epoch": 1.1638161411010155, + "grad_norm": 0.6694506406784058, + "learning_rate": 0.00014379386426833235, + "loss": 0.4165, + "step": 34840 + }, + { + "epoch": 1.1641501870657402, + "grad_norm": 0.7325911521911621, + "learning_rate": 0.00014376075761527183, + "loss": 0.3702, + "step": 34850 + }, + { + "epoch": 1.164484233030465, + "grad_norm": 0.7271037101745605, + "learning_rate": 0.00014372764502867754, + "loss": 0.4045, + "step": 34860 + }, + { + "epoch": 1.1648182789951897, + "grad_norm": 0.685337245464325, + "learning_rate": 0.00014369452651303912, + "loss": 0.394, + "step": 34870 + }, + { + "epoch": 1.1651523249599145, + "grad_norm": 0.7185946702957153, + "learning_rate": 0.00014366140207284727, + "loss": 0.3935, + "step": 34880 + }, + { + "epoch": 1.1654863709246392, + "grad_norm": 0.612058162689209, + "learning_rate": 0.0001436282717125932, + "loss": 0.3716, + "step": 34890 + }, + { + "epoch": 1.165820416889364, + "grad_norm": 0.5456045269966125, + "learning_rate": 0.00014359513543676922, + "loss": 0.4082, + "step": 34900 + }, + { + "epoch": 1.1661544628540887, + "grad_norm": 0.6410994529724121, + "learning_rate": 0.00014356199324986813, + "loss": 0.3757, + "step": 34910 + }, + { + "epoch": 1.1664885088188135, + "grad_norm": 0.6261378526687622, + "learning_rate": 0.0001435288451563838, + "loss": 0.4026, + "step": 34920 + }, + { + "epoch": 1.1668225547835382, + "grad_norm": 0.702870786190033, + "learning_rate": 0.00014349569116081068, + "loss": 0.398, + "step": 34930 + }, + { + "epoch": 1.167156600748263, + "grad_norm": 0.6722800135612488, + "learning_rate": 0.00014346253126764426, + "loss": 0.3802, + "step": 34940 + }, + { + "epoch": 1.1674906467129877, + "grad_norm": 0.5611696839332581, + "learning_rate": 0.00014342936548138058, + "loss": 0.3727, + "step": 34950 + }, + { + "epoch": 1.1678246926777125, + "grad_norm": 0.7008408308029175, + "learning_rate": 0.0001433961938065167, + "loss": 0.3699, + "step": 34960 + }, + { + "epoch": 1.1681587386424372, + "grad_norm": 0.6457080841064453, + "learning_rate": 0.00014336301624755032, + "loss": 0.3959, + "step": 34970 + }, + { + "epoch": 1.168492784607162, + "grad_norm": 0.62956702709198, + "learning_rate": 0.00014332983280898, + "loss": 0.3797, + "step": 34980 + }, + { + "epoch": 1.1688268305718867, + "grad_norm": 0.5156487226486206, + "learning_rate": 0.0001432966434953051, + "loss": 0.4008, + "step": 34990 + }, + { + "epoch": 1.1691608765366115, + "grad_norm": 0.7267006635665894, + "learning_rate": 0.00014326344831102573, + "loss": 0.3891, + "step": 35000 + }, + { + "epoch": 1.1691608765366115, + "eval_chrf": 82.80436321787339, + "eval_loss": 0.7284471988677979, + "eval_runtime": 185.6656, + "eval_samples_per_second": 0.539, + "eval_steps_per_second": 0.022, + "step": 35000 + } + ], + "logging_steps": 10, + "max_steps": 89808, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.686818885590516e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}