diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100755--- "a/trainer_state.json" +++ /dev/null @@ -1,12447 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.99974356782631, - "eval_steps": 1000, - "global_step": 17547, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0017095478246003932, - "grad_norm": 1.1821174621582031, - "learning_rate": 2.3098631214675524e-05, - "loss": 4.1548, - "step": 10 - }, - { - "epoch": 0.0034190956492007865, - "grad_norm": 1.193515419960022, - "learning_rate": 2.9600467075577225e-05, - "loss": 4.1713, - "step": 20 - }, - { - "epoch": 0.005128643473801179, - "grad_norm": 1.1349470615386963, - "learning_rate": 3.340379724004877e-05, - "loss": 4.1866, - "step": 30 - }, - { - "epoch": 0.006838191298401573, - "grad_norm": 1.111504077911377, - "learning_rate": 3.610230293647892e-05, - "loss": 4.2217, - "step": 40 - }, - { - "epoch": 0.008547739123001967, - "grad_norm": 1.159877896308899, - "learning_rate": 3.819542656844934e-05, - "loss": 4.2092, - "step": 50 - }, - { - "epoch": 0.010257286947602359, - "grad_norm": 1.16355562210083, - "learning_rate": 3.9905633100950474e-05, - "loss": 4.1877, - "step": 60 - }, - { - "epoch": 0.011966834772202752, - "grad_norm": 1.1880412101745605, - "learning_rate": 4.135159212118856e-05, - "loss": 4.1561, - "step": 70 - }, - { - "epoch": 0.013676382596803146, - "grad_norm": 1.1700432300567627, - "learning_rate": 4.260413879738062e-05, - "loss": 4.1462, - "step": 80 - }, - { - "epoch": 0.01538593042140354, - "grad_norm": 1.111504077911377, - "learning_rate": 4.3708963265422026e-05, - "loss": 4.2, - "step": 90 - }, - { - "epoch": 0.017095478246003933, - "grad_norm": 1.1691927909851074, - "learning_rate": 4.469726242935105e-05, - "loss": 4.1791, - "step": 100 - }, - { - "epoch": 0.018805026070604323, - "grad_norm": 1.1136229038238525, - "learning_rate": 4.5591287771068725e-05, - "loss": 4.1666, - "step": 110 - }, - { - "epoch": 0.020514573895204717, - "grad_norm": 1.1708928346633911, - "learning_rate": 4.640746896185217e-05, - "loss": 4.1895, - "step": 120 - }, - { - "epoch": 0.02222412171980511, - "grad_norm": 1.155065655708313, - "learning_rate": 4.7158282875190257e-05, - "loss": 4.2258, - "step": 130 - }, - { - "epoch": 0.023933669544405504, - "grad_norm": 1.1818805932998657, - "learning_rate": 4.785342798209025e-05, - "loss": 4.198, - "step": 140 - }, - { - "epoch": 0.025643217369005898, - "grad_norm": 1.1986870765686035, - "learning_rate": 4.850059259382259e-05, - "loss": 4.2076, - "step": 150 - }, - { - "epoch": 0.027352765193606292, - "grad_norm": 1.194960117340088, - "learning_rate": 4.9105974658282323e-05, - "loss": 4.2284, - "step": 160 - }, - { - "epoch": 0.029062313018206685, - "grad_norm": 1.1937111616134644, - "learning_rate": 4.967464369602014e-05, - "loss": 4.1888, - "step": 170 - }, - { - "epoch": 0.03077186084280708, - "grad_norm": 1.1659150123596191, - "learning_rate": 4.9999994112612914e-05, - "loss": 4.2267, - "step": 180 - }, - { - "epoch": 0.03248140866740747, - "grad_norm": 1.2013612985610962, - "learning_rate": 4.9999927879543565e-05, - "loss": 4.2247, - "step": 190 - }, - { - "epoch": 0.034190956492007867, - "grad_norm": 1.1687592267990112, - "learning_rate": 4.999978805438837e-05, - "loss": 4.2525, - "step": 200 - }, - { - "epoch": 0.03590050431660826, - "grad_norm": 1.1863995790481567, - "learning_rate": 4.999957463760467e-05, - "loss": 4.2098, - "step": 210 - }, - { - "epoch": 0.03761005214120865, - "grad_norm": 1.1269944906234741, - "learning_rate": 4.999928762989048e-05, - "loss": 4.2399, - "step": 220 - }, - { - "epoch": 0.03931959996580904, - "grad_norm": 1.1635268926620483, - "learning_rate": 4.9998927032184556e-05, - "loss": 4.2367, - "step": 230 - }, - { - "epoch": 0.041029147790409434, - "grad_norm": 1.1812241077423096, - "learning_rate": 4.9998492845666314e-05, - "loss": 4.2138, - "step": 240 - }, - { - "epoch": 0.04273869561500983, - "grad_norm": 1.1617891788482666, - "learning_rate": 4.99979850717559e-05, - "loss": 4.201, - "step": 250 - }, - { - "epoch": 0.04444824343961022, - "grad_norm": 1.2027676105499268, - "learning_rate": 4.999740371211411e-05, - "loss": 4.1907, - "step": 260 - }, - { - "epoch": 0.046157791264210615, - "grad_norm": 1.1802945137023926, - "learning_rate": 4.999674876864244e-05, - "loss": 4.2068, - "step": 270 - }, - { - "epoch": 0.04786733908881101, - "grad_norm": 1.1236801147460938, - "learning_rate": 4.9996020243483065e-05, - "loss": 4.2343, - "step": 280 - }, - { - "epoch": 0.0495768869134114, - "grad_norm": 1.1623650789260864, - "learning_rate": 4.999521813901881e-05, - "loss": 4.2541, - "step": 290 - }, - { - "epoch": 0.051286434738011796, - "grad_norm": 1.1530183553695679, - "learning_rate": 4.999434245787319e-05, - "loss": 4.2239, - "step": 300 - }, - { - "epoch": 0.05299598256261219, - "grad_norm": 1.1978169679641724, - "learning_rate": 4.999339320291035e-05, - "loss": 4.2451, - "step": 310 - }, - { - "epoch": 0.054705530387212584, - "grad_norm": 1.1139030456542969, - "learning_rate": 4.999237037723508e-05, - "loss": 4.2139, - "step": 320 - }, - { - "epoch": 0.05641507821181298, - "grad_norm": 1.1536692380905151, - "learning_rate": 4.999127398419281e-05, - "loss": 4.2152, - "step": 330 - }, - { - "epoch": 0.05812462603641337, - "grad_norm": 1.1244193315505981, - "learning_rate": 4.999010402736959e-05, - "loss": 4.2172, - "step": 340 - }, - { - "epoch": 0.059834173861013765, - "grad_norm": 1.1217882633209229, - "learning_rate": 4.9988860510592086e-05, - "loss": 4.2495, - "step": 350 - }, - { - "epoch": 0.06154372168561416, - "grad_norm": 1.1943985223770142, - "learning_rate": 4.9987543437927546e-05, - "loss": 4.2158, - "step": 360 - }, - { - "epoch": 0.06325326951021455, - "grad_norm": 1.12190580368042, - "learning_rate": 4.998615281368382e-05, - "loss": 4.2651, - "step": 370 - }, - { - "epoch": 0.06496281733481495, - "grad_norm": 1.151970386505127, - "learning_rate": 4.9984688642409306e-05, - "loss": 4.1822, - "step": 380 - }, - { - "epoch": 0.06667236515941534, - "grad_norm": 1.1749805212020874, - "learning_rate": 4.9983150928892985e-05, - "loss": 4.2183, - "step": 390 - }, - { - "epoch": 0.06838191298401573, - "grad_norm": 1.1214799880981445, - "learning_rate": 4.9981539678164355e-05, - "loss": 4.2691, - "step": 400 - }, - { - "epoch": 0.07009146080861613, - "grad_norm": 1.158668875694275, - "learning_rate": 4.997985489549346e-05, - "loss": 4.231, - "step": 410 - }, - { - "epoch": 0.07180100863321652, - "grad_norm": 1.193569302558899, - "learning_rate": 4.997809658639082e-05, - "loss": 4.2476, - "step": 420 - }, - { - "epoch": 0.07351055645781691, - "grad_norm": 1.1600579023361206, - "learning_rate": 4.997626475660746e-05, - "loss": 4.2166, - "step": 430 - }, - { - "epoch": 0.0752201042824173, - "grad_norm": 1.1265846490859985, - "learning_rate": 4.997435941213489e-05, - "loss": 4.2515, - "step": 440 - }, - { - "epoch": 0.07692965210701769, - "grad_norm": 1.1360176801681519, - "learning_rate": 4.9972380559205034e-05, - "loss": 4.2447, - "step": 450 - }, - { - "epoch": 0.07863919993161808, - "grad_norm": 1.1372891664505005, - "learning_rate": 4.9970328204290274e-05, - "loss": 4.2377, - "step": 460 - }, - { - "epoch": 0.08034874775621847, - "grad_norm": 1.1625288724899292, - "learning_rate": 4.99682023541034e-05, - "loss": 4.2279, - "step": 470 - }, - { - "epoch": 0.08205829558081887, - "grad_norm": 1.1566433906555176, - "learning_rate": 4.996600301559756e-05, - "loss": 4.1933, - "step": 480 - }, - { - "epoch": 0.08376784340541926, - "grad_norm": 1.1480728387832642, - "learning_rate": 4.996373019596629e-05, - "loss": 4.2615, - "step": 490 - }, - { - "epoch": 0.08547739123001966, - "grad_norm": 1.1991804838180542, - "learning_rate": 4.996138390264347e-05, - "loss": 4.2741, - "step": 500 - }, - { - "epoch": 0.08718693905462005, - "grad_norm": 1.107237696647644, - "learning_rate": 4.995896414330327e-05, - "loss": 4.2693, - "step": 510 - }, - { - "epoch": 0.08889648687922044, - "grad_norm": 1.1205779314041138, - "learning_rate": 4.9956470925860184e-05, - "loss": 4.2686, - "step": 520 - }, - { - "epoch": 0.09060603470382084, - "grad_norm": 1.165174961090088, - "learning_rate": 4.995390425846894e-05, - "loss": 4.2429, - "step": 530 - }, - { - "epoch": 0.09231558252842123, - "grad_norm": 1.154123306274414, - "learning_rate": 4.9951264149524516e-05, - "loss": 4.2725, - "step": 540 - }, - { - "epoch": 0.09402513035302162, - "grad_norm": 1.155328392982483, - "learning_rate": 4.99485506076621e-05, - "loss": 4.2227, - "step": 550 - }, - { - "epoch": 0.09573467817762202, - "grad_norm": 1.1417720317840576, - "learning_rate": 4.9945763641757085e-05, - "loss": 4.2685, - "step": 560 - }, - { - "epoch": 0.09744422600222241, - "grad_norm": 1.1608834266662598, - "learning_rate": 4.9942903260924956e-05, - "loss": 4.301, - "step": 570 - }, - { - "epoch": 0.0991537738268228, - "grad_norm": 1.1794325113296509, - "learning_rate": 4.9939969474521384e-05, - "loss": 4.236, - "step": 580 - }, - { - "epoch": 0.1008633216514232, - "grad_norm": 1.2055026292800903, - "learning_rate": 4.993696229214211e-05, - "loss": 4.2268, - "step": 590 - }, - { - "epoch": 0.10257286947602359, - "grad_norm": 1.15099036693573, - "learning_rate": 4.993388172362294e-05, - "loss": 4.2463, - "step": 600 - }, - { - "epoch": 0.10428241730062399, - "grad_norm": 1.156812071800232, - "learning_rate": 4.993072777903969e-05, - "loss": 4.2566, - "step": 610 - }, - { - "epoch": 0.10599196512522438, - "grad_norm": 1.1477752923965454, - "learning_rate": 4.992750046870819e-05, - "loss": 4.2726, - "step": 620 - }, - { - "epoch": 0.10770151294982477, - "grad_norm": 1.1635210514068604, - "learning_rate": 4.992419980318423e-05, - "loss": 4.1988, - "step": 630 - }, - { - "epoch": 0.10941106077442517, - "grad_norm": 1.1531188488006592, - "learning_rate": 4.992082579326354e-05, - "loss": 4.2282, - "step": 640 - }, - { - "epoch": 0.11112060859902556, - "grad_norm": 1.1423894166946411, - "learning_rate": 4.991737844998171e-05, - "loss": 4.2511, - "step": 650 - }, - { - "epoch": 0.11283015642362595, - "grad_norm": 1.1461811065673828, - "learning_rate": 4.991385778461422e-05, - "loss": 4.2269, - "step": 660 - }, - { - "epoch": 0.11453970424822635, - "grad_norm": 1.141455888748169, - "learning_rate": 4.991026380867635e-05, - "loss": 4.2239, - "step": 670 - }, - { - "epoch": 0.11624925207282674, - "grad_norm": 1.1366734504699707, - "learning_rate": 4.990659653392317e-05, - "loss": 4.2275, - "step": 680 - }, - { - "epoch": 0.11795879989742714, - "grad_norm": 1.1327872276306152, - "learning_rate": 4.9902855972349496e-05, - "loss": 4.2478, - "step": 690 - }, - { - "epoch": 0.11966834772202753, - "grad_norm": 1.1548588275909424, - "learning_rate": 4.9899042136189846e-05, - "loss": 4.2505, - "step": 700 - }, - { - "epoch": 0.12137789554662792, - "grad_norm": 1.136805772781372, - "learning_rate": 4.989515503791839e-05, - "loss": 4.2448, - "step": 710 - }, - { - "epoch": 0.12308744337122832, - "grad_norm": 1.1104696989059448, - "learning_rate": 4.9891194690248936e-05, - "loss": 4.2441, - "step": 720 - }, - { - "epoch": 0.1247969911958287, - "grad_norm": 1.1387354135513306, - "learning_rate": 4.988716110613487e-05, - "loss": 4.2148, - "step": 730 - }, - { - "epoch": 0.1265065390204291, - "grad_norm": 1.1516064405441284, - "learning_rate": 4.9883054298769104e-05, - "loss": 4.2598, - "step": 740 - }, - { - "epoch": 0.1282160868450295, - "grad_norm": 1.158873200416565, - "learning_rate": 4.987887428158409e-05, - "loss": 4.244, - "step": 750 - }, - { - "epoch": 0.1299256346696299, - "grad_norm": 1.1433042287826538, - "learning_rate": 4.987462106825168e-05, - "loss": 4.2656, - "step": 760 - }, - { - "epoch": 0.13163518249423029, - "grad_norm": 1.5770233869552612, - "learning_rate": 4.987029467268317e-05, - "loss": 4.2769, - "step": 770 - }, - { - "epoch": 0.13334473031883068, - "grad_norm": 1.1281023025512695, - "learning_rate": 4.9865895109029186e-05, - "loss": 4.251, - "step": 780 - }, - { - "epoch": 0.13505427814343107, - "grad_norm": 1.1215118169784546, - "learning_rate": 4.9861422391679694e-05, - "loss": 4.2407, - "step": 790 - }, - { - "epoch": 0.13676382596803147, - "grad_norm": 1.1515504121780396, - "learning_rate": 4.985687653526394e-05, - "loss": 4.257, - "step": 800 - }, - { - "epoch": 0.13847337379263186, - "grad_norm": 1.0918059349060059, - "learning_rate": 4.985225755465035e-05, - "loss": 4.2381, - "step": 810 - }, - { - "epoch": 0.14018292161723225, - "grad_norm": 1.1224377155303955, - "learning_rate": 4.984756546494657e-05, - "loss": 4.2302, - "step": 820 - }, - { - "epoch": 0.14189246944183265, - "grad_norm": 1.1818785667419434, - "learning_rate": 4.984280028149932e-05, - "loss": 4.2459, - "step": 830 - }, - { - "epoch": 0.14360201726643304, - "grad_norm": 1.11785089969635, - "learning_rate": 4.983796201989444e-05, - "loss": 4.2125, - "step": 840 - }, - { - "epoch": 0.14531156509103343, - "grad_norm": 1.1406887769699097, - "learning_rate": 4.983305069595676e-05, - "loss": 4.2459, - "step": 850 - }, - { - "epoch": 0.14702111291563383, - "grad_norm": 1.1392037868499756, - "learning_rate": 4.9828066325750086e-05, - "loss": 4.2021, - "step": 860 - }, - { - "epoch": 0.14873066074023422, - "grad_norm": 1.139798641204834, - "learning_rate": 4.9823008925577156e-05, - "loss": 4.286, - "step": 870 - }, - { - "epoch": 0.1504402085648346, - "grad_norm": 1.2841224670410156, - "learning_rate": 4.9817878511979554e-05, - "loss": 4.2587, - "step": 880 - }, - { - "epoch": 0.15214975638943498, - "grad_norm": 1.1851228475570679, - "learning_rate": 4.981267510173769e-05, - "loss": 4.2458, - "step": 890 - }, - { - "epoch": 0.15385930421403538, - "grad_norm": 1.122159481048584, - "learning_rate": 4.980739871187073e-05, - "loss": 4.2803, - "step": 900 - }, - { - "epoch": 0.15556885203863577, - "grad_norm": 1.1415982246398926, - "learning_rate": 4.9802049359636504e-05, - "loss": 4.245, - "step": 910 - }, - { - "epoch": 0.15727839986323616, - "grad_norm": 1.1452499628067017, - "learning_rate": 4.979662706253153e-05, - "loss": 4.2073, - "step": 920 - }, - { - "epoch": 0.15898794768783656, - "grad_norm": 1.115782380104065, - "learning_rate": 4.9791131838290886e-05, - "loss": 4.2927, - "step": 930 - }, - { - "epoch": 0.16069749551243695, - "grad_norm": 1.1482303142547607, - "learning_rate": 4.9785563704888186e-05, - "loss": 4.2274, - "step": 940 - }, - { - "epoch": 0.16240704333703734, - "grad_norm": 1.123155951499939, - "learning_rate": 4.977992268053553e-05, - "loss": 4.2694, - "step": 950 - }, - { - "epoch": 0.16411659116163774, - "grad_norm": 1.1416784524917603, - "learning_rate": 4.9774208783683384e-05, - "loss": 4.2265, - "step": 960 - }, - { - "epoch": 0.16582613898623813, - "grad_norm": 1.121285319328308, - "learning_rate": 4.9768422033020613e-05, - "loss": 4.2615, - "step": 970 - }, - { - "epoch": 0.16753568681083852, - "grad_norm": 1.1549098491668701, - "learning_rate": 4.976256244747434e-05, - "loss": 4.3153, - "step": 980 - }, - { - "epoch": 0.16924523463543892, - "grad_norm": 1.1774247884750366, - "learning_rate": 4.975663004620991e-05, - "loss": 4.2361, - "step": 990 - }, - { - "epoch": 0.1709547824600393, - "grad_norm": 1.1121091842651367, - "learning_rate": 4.975062484863085e-05, - "loss": 4.2677, - "step": 1000 - }, - { - "epoch": 0.1709547824600393, - "eval_loss": 4.255488872528076, - "eval_runtime": 14.9726, - "eval_samples_per_second": 65.052, - "eval_steps_per_second": 0.868, - "step": 1000 - }, - { - "epoch": 0.1726643302846397, - "grad_norm": 1.1005922555923462, - "learning_rate": 4.974454687437878e-05, - "loss": 4.2919, - "step": 1010 - }, - { - "epoch": 0.1743738781092401, - "grad_norm": 1.0923320055007935, - "learning_rate": 4.973839614333335e-05, - "loss": 4.2096, - "step": 1020 - }, - { - "epoch": 0.1760834259338405, - "grad_norm": 1.1861742734909058, - "learning_rate": 4.9732172675612196e-05, - "loss": 4.2309, - "step": 1030 - }, - { - "epoch": 0.1777929737584409, - "grad_norm": 1.1588759422302246, - "learning_rate": 4.972587649157083e-05, - "loss": 4.2765, - "step": 1040 - }, - { - "epoch": 0.17950252158304128, - "grad_norm": 1.1154515743255615, - "learning_rate": 4.971950761180264e-05, - "loss": 4.2137, - "step": 1050 - }, - { - "epoch": 0.18121206940764167, - "grad_norm": 1.128987193107605, - "learning_rate": 4.9713066057138766e-05, - "loss": 4.2499, - "step": 1060 - }, - { - "epoch": 0.18292161723224207, - "grad_norm": 1.1510378122329712, - "learning_rate": 4.970655184864804e-05, - "loss": 4.2498, - "step": 1070 - }, - { - "epoch": 0.18463116505684246, - "grad_norm": 1.1662580966949463, - "learning_rate": 4.9699965007636937e-05, - "loss": 4.2622, - "step": 1080 - }, - { - "epoch": 0.18634071288144285, - "grad_norm": 1.1380529403686523, - "learning_rate": 4.969330555564951e-05, - "loss": 4.2203, - "step": 1090 - }, - { - "epoch": 0.18805026070604325, - "grad_norm": 1.10121488571167, - "learning_rate": 4.9686573514467286e-05, - "loss": 4.3229, - "step": 1100 - }, - { - "epoch": 0.18975980853064364, - "grad_norm": 1.1269512176513672, - "learning_rate": 4.967976890610922e-05, - "loss": 4.2354, - "step": 1110 - }, - { - "epoch": 0.19146935635524404, - "grad_norm": 1.1446894407272339, - "learning_rate": 4.967289175283163e-05, - "loss": 4.2699, - "step": 1120 - }, - { - "epoch": 0.19317890417984443, - "grad_norm": 1.182586908340454, - "learning_rate": 4.9665942077128086e-05, - "loss": 4.2577, - "step": 1130 - }, - { - "epoch": 0.19488845200444482, - "grad_norm": 1.126490831375122, - "learning_rate": 4.965891990172939e-05, - "loss": 4.2883, - "step": 1140 - }, - { - "epoch": 0.19659799982904522, - "grad_norm": 1.1510419845581055, - "learning_rate": 4.9651825249603465e-05, - "loss": 4.2294, - "step": 1150 - }, - { - "epoch": 0.1983075476536456, - "grad_norm": 1.1695685386657715, - "learning_rate": 4.9644658143955277e-05, - "loss": 4.261, - "step": 1160 - }, - { - "epoch": 0.200017095478246, - "grad_norm": 1.1542822122573853, - "learning_rate": 4.9637418608226776e-05, - "loss": 4.2271, - "step": 1170 - }, - { - "epoch": 0.2017266433028464, - "grad_norm": 1.1387609243392944, - "learning_rate": 4.963010666609682e-05, - "loss": 4.2707, - "step": 1180 - }, - { - "epoch": 0.2034361911274468, - "grad_norm": 1.126368761062622, - "learning_rate": 4.9622722341481104e-05, - "loss": 4.2245, - "step": 1190 - }, - { - "epoch": 0.20514573895204719, - "grad_norm": 1.1217005252838135, - "learning_rate": 4.961526565853203e-05, - "loss": 4.2707, - "step": 1200 - }, - { - "epoch": 0.20685528677664758, - "grad_norm": 1.1429729461669922, - "learning_rate": 4.960773664163871e-05, - "loss": 4.2612, - "step": 1210 - }, - { - "epoch": 0.20856483460124797, - "grad_norm": 1.1268584728240967, - "learning_rate": 4.960013531542682e-05, - "loss": 4.292, - "step": 1220 - }, - { - "epoch": 0.21027438242584837, - "grad_norm": 1.1454453468322754, - "learning_rate": 4.9592461704758534e-05, - "loss": 4.2558, - "step": 1230 - }, - { - "epoch": 0.21198393025044876, - "grad_norm": 1.1507179737091064, - "learning_rate": 4.958471583473249e-05, - "loss": 4.2748, - "step": 1240 - }, - { - "epoch": 0.21369347807504915, - "grad_norm": 1.153035283088684, - "learning_rate": 4.9576897730683636e-05, - "loss": 4.2617, - "step": 1250 - }, - { - "epoch": 0.21540302589964955, - "grad_norm": 1.1353284120559692, - "learning_rate": 4.9569007418183193e-05, - "loss": 4.3057, - "step": 1260 - }, - { - "epoch": 0.21711257372424994, - "grad_norm": 1.0776561498641968, - "learning_rate": 4.956104492303855e-05, - "loss": 4.2484, - "step": 1270 - }, - { - "epoch": 0.21882212154885033, - "grad_norm": 1.1253888607025146, - "learning_rate": 4.955301027129321e-05, - "loss": 4.242, - "step": 1280 - }, - { - "epoch": 0.22053166937345073, - "grad_norm": 1.1258540153503418, - "learning_rate": 4.9544903489226656e-05, - "loss": 4.2533, - "step": 1290 - }, - { - "epoch": 0.22224121719805112, - "grad_norm": 1.1274919509887695, - "learning_rate": 4.953672460335431e-05, - "loss": 4.2394, - "step": 1300 - }, - { - "epoch": 0.22395076502265152, - "grad_norm": 1.1325613260269165, - "learning_rate": 4.952847364042743e-05, - "loss": 4.2695, - "step": 1310 - }, - { - "epoch": 0.2256603128472519, - "grad_norm": 1.1356984376907349, - "learning_rate": 4.952015062743301e-05, - "loss": 4.2455, - "step": 1320 - }, - { - "epoch": 0.2273698606718523, - "grad_norm": 1.142335057258606, - "learning_rate": 4.9511755591593715e-05, - "loss": 4.2751, - "step": 1330 - }, - { - "epoch": 0.2290794084964527, - "grad_norm": 1.1270580291748047, - "learning_rate": 4.950328856036777e-05, - "loss": 4.2677, - "step": 1340 - }, - { - "epoch": 0.2307889563210531, - "grad_norm": 1.1133089065551758, - "learning_rate": 4.9494749561448904e-05, - "loss": 4.2357, - "step": 1350 - }, - { - "epoch": 0.23249850414565348, - "grad_norm": 1.1664237976074219, - "learning_rate": 4.9486138622766186e-05, - "loss": 4.2545, - "step": 1360 - }, - { - "epoch": 0.23420805197025388, - "grad_norm": 1.0999196767807007, - "learning_rate": 4.9477455772484045e-05, - "loss": 4.2491, - "step": 1370 - }, - { - "epoch": 0.23591759979485427, - "grad_norm": 1.117451548576355, - "learning_rate": 4.9468701039002065e-05, - "loss": 4.2063, - "step": 1380 - }, - { - "epoch": 0.23762714761945467, - "grad_norm": 1.1639446020126343, - "learning_rate": 4.9459874450954974e-05, - "loss": 4.2524, - "step": 1390 - }, - { - "epoch": 0.23933669544405506, - "grad_norm": 1.1011021137237549, - "learning_rate": 4.945097603721251e-05, - "loss": 4.2665, - "step": 1400 - }, - { - "epoch": 0.24104624326865545, - "grad_norm": 1.1098192930221558, - "learning_rate": 4.944200582687932e-05, - "loss": 4.2449, - "step": 1410 - }, - { - "epoch": 0.24275579109325585, - "grad_norm": 1.178839087486267, - "learning_rate": 4.943296384929493e-05, - "loss": 4.2775, - "step": 1420 - }, - { - "epoch": 0.24446533891785624, - "grad_norm": 1.1217193603515625, - "learning_rate": 4.942385013403354e-05, - "loss": 4.2729, - "step": 1430 - }, - { - "epoch": 0.24617488674245663, - "grad_norm": 1.1466504335403442, - "learning_rate": 4.941466471090403e-05, - "loss": 4.2642, - "step": 1440 - }, - { - "epoch": 0.24788443456705703, - "grad_norm": 1.1177374124526978, - "learning_rate": 4.940540760994979e-05, - "loss": 4.286, - "step": 1450 - }, - { - "epoch": 0.2495939823916574, - "grad_norm": 1.170060157775879, - "learning_rate": 4.939607886144869e-05, - "loss": 4.2379, - "step": 1460 - }, - { - "epoch": 0.2513035302162578, - "grad_norm": 1.1181217432022095, - "learning_rate": 4.93866784959129e-05, - "loss": 4.3102, - "step": 1470 - }, - { - "epoch": 0.2530130780408582, - "grad_norm": 1.1651679277420044, - "learning_rate": 4.937720654408886e-05, - "loss": 4.2364, - "step": 1480 - }, - { - "epoch": 0.2547226258654586, - "grad_norm": 1.1205878257751465, - "learning_rate": 4.936766303695713e-05, - "loss": 4.2286, - "step": 1490 - }, - { - "epoch": 0.256432173690059, - "grad_norm": 1.1291285753250122, - "learning_rate": 4.9358048005732355e-05, - "loss": 4.2475, - "step": 1500 - }, - { - "epoch": 0.25814172151465936, - "grad_norm": 1.126664400100708, - "learning_rate": 4.934836148186306e-05, - "loss": 4.2405, - "step": 1510 - }, - { - "epoch": 0.2598512693392598, - "grad_norm": 1.190224051475525, - "learning_rate": 4.933860349703165e-05, - "loss": 4.2893, - "step": 1520 - }, - { - "epoch": 0.26156081716386015, - "grad_norm": 1.1472687721252441, - "learning_rate": 4.932877408315425e-05, - "loss": 4.2516, - "step": 1530 - }, - { - "epoch": 0.26327036498846057, - "grad_norm": 1.1440709829330444, - "learning_rate": 4.9318873272380614e-05, - "loss": 4.2627, - "step": 1540 - }, - { - "epoch": 0.26497991281306094, - "grad_norm": 1.1374613046646118, - "learning_rate": 4.9308901097093985e-05, - "loss": 4.2749, - "step": 1550 - }, - { - "epoch": 0.26668946063766136, - "grad_norm": 1.1236183643341064, - "learning_rate": 4.9298857589911094e-05, - "loss": 4.2734, - "step": 1560 - }, - { - "epoch": 0.2683990084622617, - "grad_norm": 1.1393808126449585, - "learning_rate": 4.928874278368192e-05, - "loss": 4.2395, - "step": 1570 - }, - { - "epoch": 0.27010855628686214, - "grad_norm": 1.1651018857955933, - "learning_rate": 4.927855671148966e-05, - "loss": 4.2512, - "step": 1580 - }, - { - "epoch": 0.2718181041114625, - "grad_norm": 1.1326340436935425, - "learning_rate": 4.9268299406650626e-05, - "loss": 4.2929, - "step": 1590 - }, - { - "epoch": 0.27352765193606293, - "grad_norm": 1.1119053363800049, - "learning_rate": 4.9257970902714094e-05, - "loss": 4.2569, - "step": 1600 - }, - { - "epoch": 0.2752371997606633, - "grad_norm": 1.1707072257995605, - "learning_rate": 4.9247571233462236e-05, - "loss": 4.2519, - "step": 1610 - }, - { - "epoch": 0.2769467475852637, - "grad_norm": 1.1098936796188354, - "learning_rate": 4.9237100432909964e-05, - "loss": 4.2092, - "step": 1620 - }, - { - "epoch": 0.2786562954098641, - "grad_norm": 1.2068954706192017, - "learning_rate": 4.922655853530486e-05, - "loss": 4.227, - "step": 1630 - }, - { - "epoch": 0.2803658432344645, - "grad_norm": 1.0763604640960693, - "learning_rate": 4.921594557512703e-05, - "loss": 4.3321, - "step": 1640 - }, - { - "epoch": 0.2820753910590649, - "grad_norm": 1.1225224733352661, - "learning_rate": 4.9205261587089054e-05, - "loss": 4.2266, - "step": 1650 - }, - { - "epoch": 0.2837849388836653, - "grad_norm": 1.0870682001113892, - "learning_rate": 4.919450660613578e-05, - "loss": 4.2611, - "step": 1660 - }, - { - "epoch": 0.28549448670826566, - "grad_norm": 1.1199606657028198, - "learning_rate": 4.9183680667444276e-05, - "loss": 4.2617, - "step": 1670 - }, - { - "epoch": 0.2872040345328661, - "grad_norm": 1.1667757034301758, - "learning_rate": 4.917278380642369e-05, - "loss": 4.2242, - "step": 1680 - }, - { - "epoch": 0.28891358235746645, - "grad_norm": 1.1540544033050537, - "learning_rate": 4.916181605871516e-05, - "loss": 4.2537, - "step": 1690 - }, - { - "epoch": 0.29062313018206687, - "grad_norm": 1.139373540878296, - "learning_rate": 4.915077746019167e-05, - "loss": 4.2058, - "step": 1700 - }, - { - "epoch": 0.29233267800666723, - "grad_norm": 1.0952222347259521, - "learning_rate": 4.913966804695792e-05, - "loss": 4.2757, - "step": 1710 - }, - { - "epoch": 0.29404222583126766, - "grad_norm": 1.131005048751831, - "learning_rate": 4.912848785535024e-05, - "loss": 4.2835, - "step": 1720 - }, - { - "epoch": 0.295751773655868, - "grad_norm": 1.193410873413086, - "learning_rate": 4.911723692193648e-05, - "loss": 4.2301, - "step": 1730 - }, - { - "epoch": 0.29746132148046844, - "grad_norm": 1.119791030883789, - "learning_rate": 4.910591528351584e-05, - "loss": 4.2404, - "step": 1740 - }, - { - "epoch": 0.2991708693050688, - "grad_norm": 1.1116772890090942, - "learning_rate": 4.90945229771188e-05, - "loss": 4.3126, - "step": 1750 - }, - { - "epoch": 0.3008804171296692, - "grad_norm": 1.156311273574829, - "learning_rate": 4.908306004000698e-05, - "loss": 4.2201, - "step": 1760 - }, - { - "epoch": 0.3025899649542696, - "grad_norm": 1.1149965524673462, - "learning_rate": 4.9071526509672996e-05, - "loss": 4.2334, - "step": 1770 - }, - { - "epoch": 0.30429951277886996, - "grad_norm": 1.0730156898498535, - "learning_rate": 4.9059922423840373e-05, - "loss": 4.2942, - "step": 1780 - }, - { - "epoch": 0.3060090606034704, - "grad_norm": 1.0852235555648804, - "learning_rate": 4.904824782046341e-05, - "loss": 4.2513, - "step": 1790 - }, - { - "epoch": 0.30771860842807075, - "grad_norm": 1.0940625667572021, - "learning_rate": 4.9036502737727055e-05, - "loss": 4.2903, - "step": 1800 - }, - { - "epoch": 0.30942815625267117, - "grad_norm": 1.1397515535354614, - "learning_rate": 4.9024687214046747e-05, - "loss": 4.2695, - "step": 1810 - }, - { - "epoch": 0.31113770407727154, - "grad_norm": 1.1789498329162598, - "learning_rate": 4.901280128806837e-05, - "loss": 4.2674, - "step": 1820 - }, - { - "epoch": 0.31284725190187196, - "grad_norm": 1.1676262617111206, - "learning_rate": 4.900084499866805e-05, - "loss": 4.2586, - "step": 1830 - }, - { - "epoch": 0.3145567997264723, - "grad_norm": 1.1302201747894287, - "learning_rate": 4.8988818384952036e-05, - "loss": 4.2559, - "step": 1840 - }, - { - "epoch": 0.31626634755107275, - "grad_norm": 1.1213160753250122, - "learning_rate": 4.897672148625663e-05, - "loss": 4.2505, - "step": 1850 - }, - { - "epoch": 0.3179758953756731, - "grad_norm": 1.1657159328460693, - "learning_rate": 4.8964554342148005e-05, - "loss": 4.1869, - "step": 1860 - }, - { - "epoch": 0.31968544320027353, - "grad_norm": 1.1311578750610352, - "learning_rate": 4.895231699242208e-05, - "loss": 4.2903, - "step": 1870 - }, - { - "epoch": 0.3213949910248739, - "grad_norm": 1.0744332075119019, - "learning_rate": 4.894000947710441e-05, - "loss": 4.2492, - "step": 1880 - }, - { - "epoch": 0.3231045388494743, - "grad_norm": 1.091806411743164, - "learning_rate": 4.8927631836450064e-05, - "loss": 4.2289, - "step": 1890 - }, - { - "epoch": 0.3248140866740747, - "grad_norm": 1.1294193267822266, - "learning_rate": 4.891518411094344e-05, - "loss": 4.2649, - "step": 1900 - }, - { - "epoch": 0.3265236344986751, - "grad_norm": 1.1349658966064453, - "learning_rate": 4.890266634129817e-05, - "loss": 4.2583, - "step": 1910 - }, - { - "epoch": 0.3282331823232755, - "grad_norm": 1.135793685913086, - "learning_rate": 4.889007856845703e-05, - "loss": 4.2113, - "step": 1920 - }, - { - "epoch": 0.3299427301478759, - "grad_norm": 1.133028507232666, - "learning_rate": 4.88774208335917e-05, - "loss": 4.2083, - "step": 1930 - }, - { - "epoch": 0.33165227797247626, - "grad_norm": 1.1581465005874634, - "learning_rate": 4.8864693178102745e-05, - "loss": 4.2258, - "step": 1940 - }, - { - "epoch": 0.3333618257970767, - "grad_norm": 1.1020305156707764, - "learning_rate": 4.885189564361936e-05, - "loss": 4.2132, - "step": 1950 - }, - { - "epoch": 0.33507137362167705, - "grad_norm": 1.1180683374404907, - "learning_rate": 4.8839028271999363e-05, - "loss": 4.2155, - "step": 1960 - }, - { - "epoch": 0.33678092144627747, - "grad_norm": 1.1715418100357056, - "learning_rate": 4.882609110532896e-05, - "loss": 4.2111, - "step": 1970 - }, - { - "epoch": 0.33849046927087784, - "grad_norm": 1.1252162456512451, - "learning_rate": 4.881308418592264e-05, - "loss": 4.2842, - "step": 1980 - }, - { - "epoch": 0.34020001709547826, - "grad_norm": 1.154093861579895, - "learning_rate": 4.8800007556323053e-05, - "loss": 4.2188, - "step": 1990 - }, - { - "epoch": 0.3419095649200786, - "grad_norm": 1.1171457767486572, - "learning_rate": 4.878686125930083e-05, - "loss": 4.2418, - "step": 2000 - }, - { - "epoch": 0.3419095649200786, - "eval_loss": 4.253417491912842, - "eval_runtime": 14.4478, - "eval_samples_per_second": 67.415, - "eval_steps_per_second": 0.9, - "step": 2000 - }, - { - "epoch": 0.34361911274467904, - "grad_norm": 1.075865626335144, - "learning_rate": 4.877364533785449e-05, - "loss": 4.2649, - "step": 2010 - }, - { - "epoch": 0.3453286605692794, - "grad_norm": 1.1201914548873901, - "learning_rate": 4.876035983521028e-05, - "loss": 4.2924, - "step": 2020 - }, - { - "epoch": 0.34703820839387983, - "grad_norm": 1.155360460281372, - "learning_rate": 4.874700479482201e-05, - "loss": 4.1964, - "step": 2030 - }, - { - "epoch": 0.3487477562184802, - "grad_norm": 1.1003767251968384, - "learning_rate": 4.873358026037095e-05, - "loss": 4.194, - "step": 2040 - }, - { - "epoch": 0.3504573040430806, - "grad_norm": 1.0945653915405273, - "learning_rate": 4.872008627576565e-05, - "loss": 4.2609, - "step": 2050 - }, - { - "epoch": 0.352166851867681, - "grad_norm": 1.149955153465271, - "learning_rate": 4.870652288514184e-05, - "loss": 4.2659, - "step": 2060 - }, - { - "epoch": 0.3538763996922814, - "grad_norm": 1.1289923191070557, - "learning_rate": 4.8692890132862256e-05, - "loss": 4.2813, - "step": 2070 - }, - { - "epoch": 0.3555859475168818, - "grad_norm": 1.1177889108657837, - "learning_rate": 4.8679188063516473e-05, - "loss": 4.2623, - "step": 2080 - }, - { - "epoch": 0.3572954953414822, - "grad_norm": 1.0939865112304688, - "learning_rate": 4.866541672192082e-05, - "loss": 4.2508, - "step": 2090 - }, - { - "epoch": 0.35900504316608256, - "grad_norm": 1.1325429677963257, - "learning_rate": 4.865157615311819e-05, - "loss": 4.2521, - "step": 2100 - }, - { - "epoch": 0.360714590990683, - "grad_norm": 1.1375011205673218, - "learning_rate": 4.863766640237788e-05, - "loss": 4.2694, - "step": 2110 - }, - { - "epoch": 0.36242413881528335, - "grad_norm": 1.054932713508606, - "learning_rate": 4.86236875151955e-05, - "loss": 4.2457, - "step": 2120 - }, - { - "epoch": 0.36413368663988377, - "grad_norm": 1.0878148078918457, - "learning_rate": 4.8609639537292775e-05, - "loss": 4.2813, - "step": 2130 - }, - { - "epoch": 0.36584323446448413, - "grad_norm": 1.11296546459198, - "learning_rate": 4.859552251461739e-05, - "loss": 4.2282, - "step": 2140 - }, - { - "epoch": 0.36755278228908456, - "grad_norm": 1.112873911857605, - "learning_rate": 4.858133649334289e-05, - "loss": 4.2381, - "step": 2150 - }, - { - "epoch": 0.3692623301136849, - "grad_norm": 1.1013659238815308, - "learning_rate": 4.856708151986848e-05, - "loss": 4.2399, - "step": 2160 - }, - { - "epoch": 0.37097187793828534, - "grad_norm": 1.0834161043167114, - "learning_rate": 4.855275764081891e-05, - "loss": 4.3049, - "step": 2170 - }, - { - "epoch": 0.3726814257628857, - "grad_norm": 1.1093565225601196, - "learning_rate": 4.853836490304427e-05, - "loss": 4.2574, - "step": 2180 - }, - { - "epoch": 0.37439097358748613, - "grad_norm": 1.101039171218872, - "learning_rate": 4.852390335361992e-05, - "loss": 4.24, - "step": 2190 - }, - { - "epoch": 0.3761005214120865, - "grad_norm": 1.122449517250061, - "learning_rate": 4.850937303984623e-05, - "loss": 4.2325, - "step": 2200 - }, - { - "epoch": 0.3778100692366869, - "grad_norm": 1.0810867547988892, - "learning_rate": 4.849477400924853e-05, - "loss": 4.277, - "step": 2210 - }, - { - "epoch": 0.3795196170612873, - "grad_norm": 1.0708153247833252, - "learning_rate": 4.84801063095769e-05, - "loss": 4.2745, - "step": 2220 - }, - { - "epoch": 0.3812291648858877, - "grad_norm": 1.151571273803711, - "learning_rate": 4.846536998880598e-05, - "loss": 4.2241, - "step": 2230 - }, - { - "epoch": 0.38293871271048807, - "grad_norm": 1.0987738370895386, - "learning_rate": 4.845056509513491e-05, - "loss": 4.2201, - "step": 2240 - }, - { - "epoch": 0.3846482605350885, - "grad_norm": 1.0751088857650757, - "learning_rate": 4.8435691676987075e-05, - "loss": 4.2451, - "step": 2250 - }, - { - "epoch": 0.38635780835968886, - "grad_norm": 1.107405185699463, - "learning_rate": 4.842074978301001e-05, - "loss": 4.2228, - "step": 2260 - }, - { - "epoch": 0.3880673561842893, - "grad_norm": 1.0876752138137817, - "learning_rate": 4.840573946207522e-05, - "loss": 4.2751, - "step": 2270 - }, - { - "epoch": 0.38977690400888965, - "grad_norm": 1.1553744077682495, - "learning_rate": 4.8390660763277985e-05, - "loss": 4.1931, - "step": 2280 - }, - { - "epoch": 0.39148645183349007, - "grad_norm": 1.119031548500061, - "learning_rate": 4.837551373593728e-05, - "loss": 4.2611, - "step": 2290 - }, - { - "epoch": 0.39319599965809043, - "grad_norm": 1.130132794380188, - "learning_rate": 4.8360298429595534e-05, - "loss": 4.2435, - "step": 2300 - }, - { - "epoch": 0.39490554748269086, - "grad_norm": 1.1434454917907715, - "learning_rate": 4.834501489401852e-05, - "loss": 4.2402, - "step": 2310 - }, - { - "epoch": 0.3966150953072912, - "grad_norm": 1.1280814409255981, - "learning_rate": 4.8329663179195164e-05, - "loss": 4.219, - "step": 2320 - }, - { - "epoch": 0.3983246431318916, - "grad_norm": 1.095200777053833, - "learning_rate": 4.8314243335337386e-05, - "loss": 4.2092, - "step": 2330 - }, - { - "epoch": 0.400034190956492, - "grad_norm": 1.1288820505142212, - "learning_rate": 4.829875541287995e-05, - "loss": 4.2488, - "step": 2340 - }, - { - "epoch": 0.4017437387810924, - "grad_norm": 1.103519320487976, - "learning_rate": 4.82831994624803e-05, - "loss": 4.2968, - "step": 2350 - }, - { - "epoch": 0.4034532866056928, - "grad_norm": 1.1223841905593872, - "learning_rate": 4.826757553501834e-05, - "loss": 4.2457, - "step": 2360 - }, - { - "epoch": 0.40516283443029316, - "grad_norm": 1.0765419006347656, - "learning_rate": 4.825188368159636e-05, - "loss": 4.2587, - "step": 2370 - }, - { - "epoch": 0.4068723822548936, - "grad_norm": 1.1339383125305176, - "learning_rate": 4.823612395353881e-05, - "loss": 4.2248, - "step": 2380 - }, - { - "epoch": 0.40858193007949395, - "grad_norm": 1.081447958946228, - "learning_rate": 4.8220296402392114e-05, - "loss": 4.2888, - "step": 2390 - }, - { - "epoch": 0.41029147790409437, - "grad_norm": 1.1328470706939697, - "learning_rate": 4.820440107992455e-05, - "loss": 4.2774, - "step": 2400 - }, - { - "epoch": 0.41200102572869474, - "grad_norm": 1.0910515785217285, - "learning_rate": 4.818843803812607e-05, - "loss": 4.3028, - "step": 2410 - }, - { - "epoch": 0.41371057355329516, - "grad_norm": 1.1610660552978516, - "learning_rate": 4.8172407329208116e-05, - "loss": 4.2652, - "step": 2420 - }, - { - "epoch": 0.4154201213778955, - "grad_norm": 1.1534645557403564, - "learning_rate": 4.815630900560343e-05, - "loss": 4.2415, - "step": 2430 - }, - { - "epoch": 0.41712966920249595, - "grad_norm": 1.1002289056777954, - "learning_rate": 4.8140143119965935e-05, - "loss": 4.2027, - "step": 2440 - }, - { - "epoch": 0.4188392170270963, - "grad_norm": 1.0853290557861328, - "learning_rate": 4.812390972517052e-05, - "loss": 4.2699, - "step": 2450 - }, - { - "epoch": 0.42054876485169673, - "grad_norm": 1.0976474285125732, - "learning_rate": 4.810760887431288e-05, - "loss": 4.2558, - "step": 2460 - }, - { - "epoch": 0.4222583126762971, - "grad_norm": 1.0604748725891113, - "learning_rate": 4.809124062070935e-05, - "loss": 4.2621, - "step": 2470 - }, - { - "epoch": 0.4239678605008975, - "grad_norm": 1.1342027187347412, - "learning_rate": 4.807480501789672e-05, - "loss": 4.2491, - "step": 2480 - }, - { - "epoch": 0.4256774083254979, - "grad_norm": 1.1212213039398193, - "learning_rate": 4.805830211963206e-05, - "loss": 4.2644, - "step": 2490 - }, - { - "epoch": 0.4273869561500983, - "grad_norm": 1.0877577066421509, - "learning_rate": 4.8041731979892555e-05, - "loss": 4.2851, - "step": 2500 - }, - { - "epoch": 0.4290965039746987, - "grad_norm": 1.0952221155166626, - "learning_rate": 4.8025094652875315e-05, - "loss": 4.2164, - "step": 2510 - }, - { - "epoch": 0.4308060517992991, - "grad_norm": 1.1732182502746582, - "learning_rate": 4.800839019299723e-05, - "loss": 4.2226, - "step": 2520 - }, - { - "epoch": 0.43251559962389946, - "grad_norm": 1.1240397691726685, - "learning_rate": 4.7991618654894717e-05, - "loss": 4.2252, - "step": 2530 - }, - { - "epoch": 0.4342251474484999, - "grad_norm": 1.0977295637130737, - "learning_rate": 4.797478009342361e-05, - "loss": 4.248, - "step": 2540 - }, - { - "epoch": 0.43593469527310025, - "grad_norm": 1.1156551837921143, - "learning_rate": 4.795787456365901e-05, - "loss": 4.1919, - "step": 2550 - }, - { - "epoch": 0.43764424309770067, - "grad_norm": 1.0808706283569336, - "learning_rate": 4.794090212089498e-05, - "loss": 4.2221, - "step": 2560 - }, - { - "epoch": 0.43935379092230104, - "grad_norm": 1.107500672340393, - "learning_rate": 4.7923862820644476e-05, - "loss": 4.2635, - "step": 2570 - }, - { - "epoch": 0.44106333874690146, - "grad_norm": 1.1214717626571655, - "learning_rate": 4.790675671863914e-05, - "loss": 4.2384, - "step": 2580 - }, - { - "epoch": 0.4427728865715018, - "grad_norm": 1.1030008792877197, - "learning_rate": 4.788958387082909e-05, - "loss": 4.2645, - "step": 2590 - }, - { - "epoch": 0.44448243439610224, - "grad_norm": 1.099306344985962, - "learning_rate": 4.787234433338275e-05, - "loss": 4.2891, - "step": 2600 - }, - { - "epoch": 0.4461919822207026, - "grad_norm": 1.1382032632827759, - "learning_rate": 4.785503816268669e-05, - "loss": 4.2656, - "step": 2610 - }, - { - "epoch": 0.44790153004530303, - "grad_norm": 1.1182737350463867, - "learning_rate": 4.783766541534542e-05, - "loss": 4.2391, - "step": 2620 - }, - { - "epoch": 0.4496110778699034, - "grad_norm": 1.1378432512283325, - "learning_rate": 4.782022614818117e-05, - "loss": 4.2459, - "step": 2630 - }, - { - "epoch": 0.4513206256945038, - "grad_norm": 1.1752971410751343, - "learning_rate": 4.78027204182338e-05, - "loss": 4.2285, - "step": 2640 - }, - { - "epoch": 0.4530301735191042, - "grad_norm": 1.091764211654663, - "learning_rate": 4.77851482827605e-05, - "loss": 4.2629, - "step": 2650 - }, - { - "epoch": 0.4547397213437046, - "grad_norm": 1.1209800243377686, - "learning_rate": 4.77675097992357e-05, - "loss": 4.2338, - "step": 2660 - }, - { - "epoch": 0.45644926916830497, - "grad_norm": 1.1220327615737915, - "learning_rate": 4.7749805025350803e-05, - "loss": 4.2687, - "step": 2670 - }, - { - "epoch": 0.4581588169929054, - "grad_norm": 1.0927966833114624, - "learning_rate": 4.773203401901406e-05, - "loss": 4.1862, - "step": 2680 - }, - { - "epoch": 0.45986836481750576, - "grad_norm": 1.0681684017181396, - "learning_rate": 4.771419683835036e-05, - "loss": 4.2443, - "step": 2690 - }, - { - "epoch": 0.4615779126421062, - "grad_norm": 1.0760319232940674, - "learning_rate": 4.769629354170097e-05, - "loss": 4.2658, - "step": 2700 - }, - { - "epoch": 0.46328746046670655, - "grad_norm": 1.090036392211914, - "learning_rate": 4.7678324187623496e-05, - "loss": 4.2675, - "step": 2710 - }, - { - "epoch": 0.46499700829130697, - "grad_norm": 1.1084853410720825, - "learning_rate": 4.766028883489154e-05, - "loss": 4.2593, - "step": 2720 - }, - { - "epoch": 0.46670655611590733, - "grad_norm": 1.0939213037490845, - "learning_rate": 4.7642187542494584e-05, - "loss": 4.2722, - "step": 2730 - }, - { - "epoch": 0.46841610394050776, - "grad_norm": 1.1262520551681519, - "learning_rate": 4.762402036963781e-05, - "loss": 4.2115, - "step": 2740 - }, - { - "epoch": 0.4701256517651081, - "grad_norm": 1.0964442491531372, - "learning_rate": 4.760578737574184e-05, - "loss": 4.2595, - "step": 2750 - }, - { - "epoch": 0.47183519958970854, - "grad_norm": 1.1190603971481323, - "learning_rate": 4.758748862044261e-05, - "loss": 4.2832, - "step": 2760 - }, - { - "epoch": 0.4735447474143089, - "grad_norm": 1.1074726581573486, - "learning_rate": 4.756912416359114e-05, - "loss": 4.2675, - "step": 2770 - }, - { - "epoch": 0.47525429523890933, - "grad_norm": 1.1360183954238892, - "learning_rate": 4.755069406525334e-05, - "loss": 4.23, - "step": 2780 - }, - { - "epoch": 0.4769638430635097, - "grad_norm": 1.1268521547317505, - "learning_rate": 4.753219838570982e-05, - "loss": 4.25, - "step": 2790 - }, - { - "epoch": 0.4786733908881101, - "grad_norm": 1.142379879951477, - "learning_rate": 4.751363718545568e-05, - "loss": 4.234, - "step": 2800 - }, - { - "epoch": 0.4803829387127105, - "grad_norm": 1.061362624168396, - "learning_rate": 4.749501052520036e-05, - "loss": 4.2431, - "step": 2810 - }, - { - "epoch": 0.4820924865373109, - "grad_norm": 1.112080693244934, - "learning_rate": 4.7476318465867356e-05, - "loss": 4.2332, - "step": 2820 - }, - { - "epoch": 0.48380203436191127, - "grad_norm": 1.0578895807266235, - "learning_rate": 4.745756106859409e-05, - "loss": 4.2682, - "step": 2830 - }, - { - "epoch": 0.4855115821865117, - "grad_norm": 1.1330028772354126, - "learning_rate": 4.74387383947317e-05, - "loss": 4.2712, - "step": 2840 - }, - { - "epoch": 0.48722113001111206, - "grad_norm": 1.1072999238967896, - "learning_rate": 4.741985050584482e-05, - "loss": 4.2445, - "step": 2850 - }, - { - "epoch": 0.4889306778357125, - "grad_norm": 1.1306287050247192, - "learning_rate": 4.74008974637114e-05, - "loss": 4.1949, - "step": 2860 - }, - { - "epoch": 0.49064022566031285, - "grad_norm": 1.0843833684921265, - "learning_rate": 4.738187933032246e-05, - "loss": 4.2508, - "step": 2870 - }, - { - "epoch": 0.49234977348491327, - "grad_norm": 1.0874364376068115, - "learning_rate": 4.736279616788195e-05, - "loss": 4.2516, - "step": 2880 - }, - { - "epoch": 0.49405932130951363, - "grad_norm": 1.135063886642456, - "learning_rate": 4.7343648038806486e-05, - "loss": 4.2367, - "step": 2890 - }, - { - "epoch": 0.49576886913411405, - "grad_norm": 1.1294223070144653, - "learning_rate": 4.732443500572521e-05, - "loss": 4.3008, - "step": 2900 - }, - { - "epoch": 0.4974784169587144, - "grad_norm": 1.0720285177230835, - "learning_rate": 4.730515713147952e-05, - "loss": 4.2614, - "step": 2910 - }, - { - "epoch": 0.4991879647833148, - "grad_norm": 1.1082816123962402, - "learning_rate": 4.728581447912291e-05, - "loss": 4.2576, - "step": 2920 - }, - { - "epoch": 0.5008975126079152, - "grad_norm": 1.0759425163269043, - "learning_rate": 4.7266407111920745e-05, - "loss": 4.2564, - "step": 2930 - }, - { - "epoch": 0.5026070604325156, - "grad_norm": 1.1233313083648682, - "learning_rate": 4.724693509335006e-05, - "loss": 4.2568, - "step": 2940 - }, - { - "epoch": 0.504316608257116, - "grad_norm": 1.122125506401062, - "learning_rate": 4.7227398487099336e-05, - "loss": 4.2618, - "step": 2950 - }, - { - "epoch": 0.5060261560817164, - "grad_norm": 1.1373542547225952, - "learning_rate": 4.720779735706832e-05, - "loss": 4.1918, - "step": 2960 - }, - { - "epoch": 0.5077357039063167, - "grad_norm": 1.0733320713043213, - "learning_rate": 4.7188131767367806e-05, - "loss": 4.2425, - "step": 2970 - }, - { - "epoch": 0.5094452517309171, - "grad_norm": 1.146315097808838, - "learning_rate": 4.7168401782319396e-05, - "loss": 4.2388, - "step": 2980 - }, - { - "epoch": 0.5111547995555176, - "grad_norm": 1.0983136892318726, - "learning_rate": 4.714860746645534e-05, - "loss": 4.2482, - "step": 2990 - }, - { - "epoch": 0.512864347380118, - "grad_norm": 1.1013798713684082, - "learning_rate": 4.7128748884518295e-05, - "loss": 4.2651, - "step": 3000 - }, - { - "epoch": 0.512864347380118, - "eval_loss": 4.2494635581970215, - "eval_runtime": 14.9148, - "eval_samples_per_second": 65.304, - "eval_steps_per_second": 0.872, - "step": 3000 - }, - { - "epoch": 0.5145738952047183, - "grad_norm": 1.0576733350753784, - "learning_rate": 4.71088261014611e-05, - "loss": 4.273, - "step": 3010 - }, - { - "epoch": 0.5162834430293187, - "grad_norm": 1.1068296432495117, - "learning_rate": 4.70888391824466e-05, - "loss": 4.236, - "step": 3020 - }, - { - "epoch": 0.5179929908539191, - "grad_norm": 1.124989628791809, - "learning_rate": 4.706878819284741e-05, - "loss": 4.2661, - "step": 3030 - }, - { - "epoch": 0.5197025386785196, - "grad_norm": 1.1100960969924927, - "learning_rate": 4.7048673198245705e-05, - "loss": 4.2466, - "step": 3040 - }, - { - "epoch": 0.5214120865031199, - "grad_norm": 1.1102838516235352, - "learning_rate": 4.702849426443298e-05, - "loss": 4.2562, - "step": 3050 - }, - { - "epoch": 0.5231216343277203, - "grad_norm": 1.1083430051803589, - "learning_rate": 4.7008251457409915e-05, - "loss": 4.2682, - "step": 3060 - }, - { - "epoch": 0.5248311821523207, - "grad_norm": 1.108059048652649, - "learning_rate": 4.698794484338605e-05, - "loss": 4.2612, - "step": 3070 - }, - { - "epoch": 0.5265407299769211, - "grad_norm": 1.0801516771316528, - "learning_rate": 4.6967574488779654e-05, - "loss": 4.2318, - "step": 3080 - }, - { - "epoch": 0.5282502778015215, - "grad_norm": 1.076148271560669, - "learning_rate": 4.694714046021747e-05, - "loss": 4.2367, - "step": 3090 - }, - { - "epoch": 0.5299598256261219, - "grad_norm": 1.1273393630981445, - "learning_rate": 4.6926642824534506e-05, - "loss": 4.2691, - "step": 3100 - }, - { - "epoch": 0.5316693734507223, - "grad_norm": 1.09604012966156, - "learning_rate": 4.69060816487738e-05, - "loss": 4.2346, - "step": 3110 - }, - { - "epoch": 0.5333789212753227, - "grad_norm": 1.0888229608535767, - "learning_rate": 4.688545700018624e-05, - "loss": 4.2224, - "step": 3120 - }, - { - "epoch": 0.535088469099923, - "grad_norm": 1.149349570274353, - "learning_rate": 4.6864768946230295e-05, - "loss": 4.1663, - "step": 3130 - }, - { - "epoch": 0.5367980169245234, - "grad_norm": 1.1183565855026245, - "learning_rate": 4.684401755457183e-05, - "loss": 4.2262, - "step": 3140 - }, - { - "epoch": 0.5385075647491239, - "grad_norm": 1.1260793209075928, - "learning_rate": 4.682320289308387e-05, - "loss": 4.2502, - "step": 3150 - }, - { - "epoch": 0.5402171125737243, - "grad_norm": 1.0971378087997437, - "learning_rate": 4.680232502984638e-05, - "loss": 4.2679, - "step": 3160 - }, - { - "epoch": 0.5419266603983246, - "grad_norm": 1.0903050899505615, - "learning_rate": 4.6781384033146055e-05, - "loss": 4.2639, - "step": 3170 - }, - { - "epoch": 0.543636208222925, - "grad_norm": 1.1086989641189575, - "learning_rate": 4.676037997147606e-05, - "loss": 4.2398, - "step": 3180 - }, - { - "epoch": 0.5453457560475254, - "grad_norm": 1.090357780456543, - "learning_rate": 4.673931291353587e-05, - "loss": 4.2149, - "step": 3190 - }, - { - "epoch": 0.5470553038721259, - "grad_norm": 1.11151921749115, - "learning_rate": 4.671818292823097e-05, - "loss": 4.2859, - "step": 3200 - }, - { - "epoch": 0.5487648516967262, - "grad_norm": 1.132080316543579, - "learning_rate": 4.669699008467267e-05, - "loss": 4.2502, - "step": 3210 - }, - { - "epoch": 0.5504743995213266, - "grad_norm": 1.0804109573364258, - "learning_rate": 4.66757344521779e-05, - "loss": 4.279, - "step": 3220 - }, - { - "epoch": 0.552183947345927, - "grad_norm": 1.1069560050964355, - "learning_rate": 4.665441610026893e-05, - "loss": 4.2503, - "step": 3230 - }, - { - "epoch": 0.5538934951705274, - "grad_norm": 1.137535572052002, - "learning_rate": 4.6633035098673194e-05, - "loss": 4.2781, - "step": 3240 - }, - { - "epoch": 0.5556030429951277, - "grad_norm": 1.115520715713501, - "learning_rate": 4.6611591517323016e-05, - "loss": 4.2151, - "step": 3250 - }, - { - "epoch": 0.5573125908197282, - "grad_norm": 1.0897964239120483, - "learning_rate": 4.659008542635542e-05, - "loss": 4.2764, - "step": 3260 - }, - { - "epoch": 0.5590221386443286, - "grad_norm": 1.1034948825836182, - "learning_rate": 4.656851689611189e-05, - "loss": 4.2565, - "step": 3270 - }, - { - "epoch": 0.560731686468929, - "grad_norm": 1.1032607555389404, - "learning_rate": 4.6546885997138114e-05, - "loss": 4.2185, - "step": 3280 - }, - { - "epoch": 0.5624412342935293, - "grad_norm": 1.1427913904190063, - "learning_rate": 4.6525192800183776e-05, - "loss": 4.2378, - "step": 3290 - }, - { - "epoch": 0.5641507821181297, - "grad_norm": 1.1001359224319458, - "learning_rate": 4.650343737620235e-05, - "loss": 4.2643, - "step": 3300 - }, - { - "epoch": 0.5658603299427302, - "grad_norm": 1.1006237268447876, - "learning_rate": 4.64816197963508e-05, - "loss": 4.2608, - "step": 3310 - }, - { - "epoch": 0.5675698777673306, - "grad_norm": 1.1036549806594849, - "learning_rate": 4.645974013198943e-05, - "loss": 4.253, - "step": 3320 - }, - { - "epoch": 0.5692794255919309, - "grad_norm": 1.1601523160934448, - "learning_rate": 4.643779845468158e-05, - "loss": 4.213, - "step": 3330 - }, - { - "epoch": 0.5709889734165313, - "grad_norm": 1.1104052066802979, - "learning_rate": 4.641579483619341e-05, - "loss": 4.2864, - "step": 3340 - }, - { - "epoch": 0.5726985212411317, - "grad_norm": 1.1196630001068115, - "learning_rate": 4.639372934849372e-05, - "loss": 4.2396, - "step": 3350 - }, - { - "epoch": 0.5744080690657322, - "grad_norm": 1.090217113494873, - "learning_rate": 4.6371602063753616e-05, - "loss": 4.2596, - "step": 3360 - }, - { - "epoch": 0.5761176168903325, - "grad_norm": 1.1001068353652954, - "learning_rate": 4.634941305434637e-05, - "loss": 4.2211, - "step": 3370 - }, - { - "epoch": 0.5778271647149329, - "grad_norm": 1.0934302806854248, - "learning_rate": 4.632716239284712e-05, - "loss": 4.1944, - "step": 3380 - }, - { - "epoch": 0.5795367125395333, - "grad_norm": 1.1170393228530884, - "learning_rate": 4.630485015203265e-05, - "loss": 4.2588, - "step": 3390 - }, - { - "epoch": 0.5812462603641337, - "grad_norm": 1.1072651147842407, - "learning_rate": 4.628247640488119e-05, - "loss": 4.1731, - "step": 3400 - }, - { - "epoch": 0.582955808188734, - "grad_norm": 1.1301066875457764, - "learning_rate": 4.626004122457209e-05, - "loss": 4.2372, - "step": 3410 - }, - { - "epoch": 0.5846653560133345, - "grad_norm": 1.109696865081787, - "learning_rate": 4.623754468448567e-05, - "loss": 4.2602, - "step": 3420 - }, - { - "epoch": 0.5863749038379349, - "grad_norm": 1.1419066190719604, - "learning_rate": 4.6214986858202946e-05, - "loss": 4.237, - "step": 3430 - }, - { - "epoch": 0.5880844516625353, - "grad_norm": 1.1035685539245605, - "learning_rate": 4.6192367819505364e-05, - "loss": 4.2046, - "step": 3440 - }, - { - "epoch": 0.5897939994871356, - "grad_norm": 1.122310996055603, - "learning_rate": 4.6169687642374606e-05, - "loss": 4.2252, - "step": 3450 - }, - { - "epoch": 0.591503547311736, - "grad_norm": 1.1042860746383667, - "learning_rate": 4.61469464009923e-05, - "loss": 4.2522, - "step": 3460 - }, - { - "epoch": 0.5932130951363365, - "grad_norm": 1.120194435119629, - "learning_rate": 4.612414416973981e-05, - "loss": 4.2771, - "step": 3470 - }, - { - "epoch": 0.5949226429609369, - "grad_norm": 1.1289923191070557, - "learning_rate": 4.6101281023198e-05, - "loss": 4.2541, - "step": 3480 - }, - { - "epoch": 0.5966321907855372, - "grad_norm": 1.15150785446167, - "learning_rate": 4.607835703614696e-05, - "loss": 4.1996, - "step": 3490 - }, - { - "epoch": 0.5983417386101376, - "grad_norm": 1.0959882736206055, - "learning_rate": 4.605537228356575e-05, - "loss": 4.2606, - "step": 3500 - }, - { - "epoch": 0.600051286434738, - "grad_norm": 1.0947035551071167, - "learning_rate": 4.603232684063224e-05, - "loss": 4.2762, - "step": 3510 - }, - { - "epoch": 0.6017608342593384, - "grad_norm": 1.0830228328704834, - "learning_rate": 4.600922078272275e-05, - "loss": 4.2868, - "step": 3520 - }, - { - "epoch": 0.6034703820839388, - "grad_norm": 1.1198160648345947, - "learning_rate": 4.5986054185411884e-05, - "loss": 4.2171, - "step": 3530 - }, - { - "epoch": 0.6051799299085392, - "grad_norm": 1.0933665037155151, - "learning_rate": 4.596282712447225e-05, - "loss": 4.2171, - "step": 3540 - }, - { - "epoch": 0.6068894777331396, - "grad_norm": 1.0863643884658813, - "learning_rate": 4.593953967587423e-05, - "loss": 4.2509, - "step": 3550 - }, - { - "epoch": 0.6085990255577399, - "grad_norm": 1.1644642353057861, - "learning_rate": 4.591619191578568e-05, - "loss": 4.2064, - "step": 3560 - }, - { - "epoch": 0.6103085733823403, - "grad_norm": 1.1349512338638306, - "learning_rate": 4.589278392057177e-05, - "loss": 4.2735, - "step": 3570 - }, - { - "epoch": 0.6120181212069408, - "grad_norm": 1.1134692430496216, - "learning_rate": 4.586931576679466e-05, - "loss": 4.2695, - "step": 3580 - }, - { - "epoch": 0.6137276690315412, - "grad_norm": 1.0873674154281616, - "learning_rate": 4.584578753121329e-05, - "loss": 4.2156, - "step": 3590 - }, - { - "epoch": 0.6154372168561415, - "grad_norm": 1.1341534852981567, - "learning_rate": 4.58221992907831e-05, - "loss": 4.2078, - "step": 3600 - }, - { - "epoch": 0.6171467646807419, - "grad_norm": 1.0820064544677734, - "learning_rate": 4.5798551122655776e-05, - "loss": 4.2307, - "step": 3610 - }, - { - "epoch": 0.6188563125053423, - "grad_norm": 1.0933773517608643, - "learning_rate": 4.5774843104179054e-05, - "loss": 4.2015, - "step": 3620 - }, - { - "epoch": 0.6205658603299428, - "grad_norm": 1.1277698278427124, - "learning_rate": 4.5751075312896405e-05, - "loss": 4.2684, - "step": 3630 - }, - { - "epoch": 0.6222754081545431, - "grad_norm": 1.0553826093673706, - "learning_rate": 4.572724782654679e-05, - "loss": 4.2298, - "step": 3640 - }, - { - "epoch": 0.6239849559791435, - "grad_norm": 1.073564052581787, - "learning_rate": 4.570336072306446e-05, - "loss": 4.2617, - "step": 3650 - }, - { - "epoch": 0.6256945038037439, - "grad_norm": 1.1252152919769287, - "learning_rate": 4.5679414080578603e-05, - "loss": 4.2351, - "step": 3660 - }, - { - "epoch": 0.6274040516283443, - "grad_norm": 1.1542309522628784, - "learning_rate": 4.565540797741319e-05, - "loss": 4.2243, - "step": 3670 - }, - { - "epoch": 0.6291135994529446, - "grad_norm": 1.055681586265564, - "learning_rate": 4.563134249208666e-05, - "loss": 4.2507, - "step": 3680 - }, - { - "epoch": 0.6308231472775451, - "grad_norm": 1.1172776222229004, - "learning_rate": 4.5607217703311676e-05, - "loss": 4.2606, - "step": 3690 - }, - { - "epoch": 0.6325326951021455, - "grad_norm": 1.15364670753479, - "learning_rate": 4.558303368999487e-05, - "loss": 4.224, - "step": 3700 - }, - { - "epoch": 0.6342422429267459, - "grad_norm": 1.1916589736938477, - "learning_rate": 4.5558790531236575e-05, - "loss": 4.2253, - "step": 3710 - }, - { - "epoch": 0.6359517907513462, - "grad_norm": 1.0901060104370117, - "learning_rate": 4.55344883063306e-05, - "loss": 4.2788, - "step": 3720 - }, - { - "epoch": 0.6376613385759466, - "grad_norm": 1.096258282661438, - "learning_rate": 4.551012709476393e-05, - "loss": 4.2547, - "step": 3730 - }, - { - "epoch": 0.6393708864005471, - "grad_norm": 1.077452301979065, - "learning_rate": 4.548570697621645e-05, - "loss": 4.2748, - "step": 3740 - }, - { - "epoch": 0.6410804342251475, - "grad_norm": 1.08258855342865, - "learning_rate": 4.546122803056079e-05, - "loss": 4.2624, - "step": 3750 - }, - { - "epoch": 0.6427899820497478, - "grad_norm": 1.096158742904663, - "learning_rate": 4.543669033786192e-05, - "loss": 4.265, - "step": 3760 - }, - { - "epoch": 0.6444995298743482, - "grad_norm": 1.082450032234192, - "learning_rate": 4.5412093978376986e-05, - "loss": 4.2444, - "step": 3770 - }, - { - "epoch": 0.6462090776989486, - "grad_norm": 1.0835603475570679, - "learning_rate": 4.538743903255504e-05, - "loss": 4.2631, - "step": 3780 - }, - { - "epoch": 0.6479186255235491, - "grad_norm": 1.0919485092163086, - "learning_rate": 4.536272558103671e-05, - "loss": 4.2378, - "step": 3790 - }, - { - "epoch": 0.6496281733481494, - "grad_norm": 1.1219440698623657, - "learning_rate": 4.5337953704654024e-05, - "loss": 4.1823, - "step": 3800 - }, - { - "epoch": 0.6513377211727498, - "grad_norm": 1.0954729318618774, - "learning_rate": 4.5313123484430083e-05, - "loss": 4.241, - "step": 3810 - }, - { - "epoch": 0.6530472689973502, - "grad_norm": 1.0964537858963013, - "learning_rate": 4.528823500157883e-05, - "loss": 4.228, - "step": 3820 - }, - { - "epoch": 0.6547568168219506, - "grad_norm": 1.0883979797363281, - "learning_rate": 4.5263288337504755e-05, - "loss": 4.2544, - "step": 3830 - }, - { - "epoch": 0.656466364646551, - "grad_norm": 1.1328805685043335, - "learning_rate": 4.523828357380266e-05, - "loss": 4.2553, - "step": 3840 - }, - { - "epoch": 0.6581759124711514, - "grad_norm": 1.1367276906967163, - "learning_rate": 4.521322079225737e-05, - "loss": 4.2154, - "step": 3850 - }, - { - "epoch": 0.6598854602957518, - "grad_norm": 1.0790966749191284, - "learning_rate": 4.518810007484349e-05, - "loss": 4.2676, - "step": 3860 - }, - { - "epoch": 0.6615950081203522, - "grad_norm": 1.0894708633422852, - "learning_rate": 4.516292150372507e-05, - "loss": 4.2828, - "step": 3870 - }, - { - "epoch": 0.6633045559449525, - "grad_norm": 1.1014150381088257, - "learning_rate": 4.5137685161255455e-05, - "loss": 4.1967, - "step": 3880 - }, - { - "epoch": 0.6650141037695529, - "grad_norm": 1.0934723615646362, - "learning_rate": 4.51123911299769e-05, - "loss": 4.2731, - "step": 3890 - }, - { - "epoch": 0.6667236515941534, - "grad_norm": 1.1232093572616577, - "learning_rate": 4.5087039492620366e-05, - "loss": 4.256, - "step": 3900 - }, - { - "epoch": 0.6684331994187538, - "grad_norm": 1.1155422925949097, - "learning_rate": 4.506163033210521e-05, - "loss": 4.1889, - "step": 3910 - }, - { - "epoch": 0.6701427472433541, - "grad_norm": 1.0867037773132324, - "learning_rate": 4.503616373153896e-05, - "loss": 4.2583, - "step": 3920 - }, - { - "epoch": 0.6718522950679545, - "grad_norm": 1.1338471174240112, - "learning_rate": 4.5010639774217014e-05, - "loss": 4.2064, - "step": 3930 - }, - { - "epoch": 0.6735618428925549, - "grad_norm": 1.099379301071167, - "learning_rate": 4.4985058543622346e-05, - "loss": 4.2003, - "step": 3940 - }, - { - "epoch": 0.6752713907171554, - "grad_norm": 1.1022380590438843, - "learning_rate": 4.4959420123425274e-05, - "loss": 4.2364, - "step": 3950 - }, - { - "epoch": 0.6769809385417557, - "grad_norm": 1.0789977312088013, - "learning_rate": 4.4933724597483175e-05, - "loss": 4.2373, - "step": 3960 - }, - { - "epoch": 0.6786904863663561, - "grad_norm": 1.1275978088378906, - "learning_rate": 4.4907972049840206e-05, - "loss": 4.2081, - "step": 3970 - }, - { - "epoch": 0.6804000341909565, - "grad_norm": 1.0824768543243408, - "learning_rate": 4.4882162564727015e-05, - "loss": 4.2623, - "step": 3980 - }, - { - "epoch": 0.6821095820155569, - "grad_norm": 1.1104145050048828, - "learning_rate": 4.48562962265605e-05, - "loss": 4.2303, - "step": 3990 - }, - { - "epoch": 0.6838191298401572, - "grad_norm": 1.104638695716858, - "learning_rate": 4.483037311994349e-05, - "loss": 4.1918, - "step": 4000 - }, - { - "epoch": 0.6838191298401572, - "eval_loss": 4.2362799644470215, - "eval_runtime": 14.4635, - "eval_samples_per_second": 67.342, - "eval_steps_per_second": 0.899, - "step": 4000 - }, - { - "epoch": 0.6855286776647577, - "grad_norm": 1.104688048362732, - "learning_rate": 4.480439332966449e-05, - "loss": 4.2264, - "step": 4010 - }, - { - "epoch": 0.6872382254893581, - "grad_norm": 1.118034839630127, - "learning_rate": 4.4778356940697435e-05, - "loss": 4.2533, - "step": 4020 - }, - { - "epoch": 0.6889477733139585, - "grad_norm": 1.1144695281982422, - "learning_rate": 4.475226403820136e-05, - "loss": 4.2246, - "step": 4030 - }, - { - "epoch": 0.6906573211385588, - "grad_norm": 1.1089283227920532, - "learning_rate": 4.4726114707520126e-05, - "loss": 4.2198, - "step": 4040 - }, - { - "epoch": 0.6923668689631592, - "grad_norm": 1.1141961812973022, - "learning_rate": 4.46999090341822e-05, - "loss": 4.2679, - "step": 4050 - }, - { - "epoch": 0.6940764167877597, - "grad_norm": 1.0797265768051147, - "learning_rate": 4.467364710390028e-05, - "loss": 4.2525, - "step": 4060 - }, - { - "epoch": 0.6957859646123601, - "grad_norm": 1.09474778175354, - "learning_rate": 4.464732900257111e-05, - "loss": 4.2482, - "step": 4070 - }, - { - "epoch": 0.6974955124369604, - "grad_norm": 1.103073239326477, - "learning_rate": 4.4620954816275154e-05, - "loss": 4.248, - "step": 4080 - }, - { - "epoch": 0.6992050602615608, - "grad_norm": 1.1403964757919312, - "learning_rate": 4.45945246312763e-05, - "loss": 4.221, - "step": 4090 - }, - { - "epoch": 0.7009146080861612, - "grad_norm": 1.0685380697250366, - "learning_rate": 4.4568038534021586e-05, - "loss": 4.2431, - "step": 4100 - }, - { - "epoch": 0.7026241559107615, - "grad_norm": 1.1259526014328003, - "learning_rate": 4.454149661114095e-05, - "loss": 4.2249, - "step": 4110 - }, - { - "epoch": 0.704333703735362, - "grad_norm": 1.1042156219482422, - "learning_rate": 4.451489894944691e-05, - "loss": 4.1924, - "step": 4120 - }, - { - "epoch": 0.7060432515599624, - "grad_norm": 1.0901907682418823, - "learning_rate": 4.448824563593431e-05, - "loss": 4.2358, - "step": 4130 - }, - { - "epoch": 0.7077527993845628, - "grad_norm": 1.1413730382919312, - "learning_rate": 4.446153675777998e-05, - "loss": 4.2339, - "step": 4140 - }, - { - "epoch": 0.7094623472091631, - "grad_norm": 1.101927399635315, - "learning_rate": 4.4434772402342534e-05, - "loss": 4.216, - "step": 4150 - }, - { - "epoch": 0.7111718950337635, - "grad_norm": 1.1446946859359741, - "learning_rate": 4.4407952657162016e-05, - "loss": 4.2325, - "step": 4160 - }, - { - "epoch": 0.712881442858364, - "grad_norm": 1.1129071712493896, - "learning_rate": 4.438107760995963e-05, - "loss": 4.2449, - "step": 4170 - }, - { - "epoch": 0.7145909906829644, - "grad_norm": 1.1356310844421387, - "learning_rate": 4.435414734863748e-05, - "loss": 4.2189, - "step": 4180 - }, - { - "epoch": 0.7163005385075647, - "grad_norm": 1.0867831707000732, - "learning_rate": 4.432716196127825e-05, - "loss": 4.2176, - "step": 4190 - }, - { - "epoch": 0.7180100863321651, - "grad_norm": 1.112840175628662, - "learning_rate": 4.430012153614493e-05, - "loss": 4.2103, - "step": 4200 - }, - { - "epoch": 0.7197196341567655, - "grad_norm": 1.0951179265975952, - "learning_rate": 4.427302616168052e-05, - "loss": 4.1883, - "step": 4210 - }, - { - "epoch": 0.721429181981366, - "grad_norm": 1.1026438474655151, - "learning_rate": 4.424587592650777e-05, - "loss": 4.2978, - "step": 4220 - }, - { - "epoch": 0.7231387298059663, - "grad_norm": 1.0973010063171387, - "learning_rate": 4.421867091942885e-05, - "loss": 4.2315, - "step": 4230 - }, - { - "epoch": 0.7248482776305667, - "grad_norm": 1.0951300859451294, - "learning_rate": 4.4191411229425056e-05, - "loss": 4.2349, - "step": 4240 - }, - { - "epoch": 0.7265578254551671, - "grad_norm": 1.1219373941421509, - "learning_rate": 4.416409694565659e-05, - "loss": 4.211, - "step": 4250 - }, - { - "epoch": 0.7282673732797675, - "grad_norm": 1.1204344034194946, - "learning_rate": 4.413672815746216e-05, - "loss": 4.219, - "step": 4260 - }, - { - "epoch": 0.7299769211043678, - "grad_norm": 1.1001601219177246, - "learning_rate": 4.410930495435879e-05, - "loss": 4.2447, - "step": 4270 - }, - { - "epoch": 0.7316864689289683, - "grad_norm": 1.1077347993850708, - "learning_rate": 4.4081827426041454e-05, - "loss": 4.2218, - "step": 4280 - }, - { - "epoch": 0.7333960167535687, - "grad_norm": 1.1098490953445435, - "learning_rate": 4.4054295662382844e-05, - "loss": 4.1847, - "step": 4290 - }, - { - "epoch": 0.7351055645781691, - "grad_norm": 1.1118247509002686, - "learning_rate": 4.4026709753432996e-05, - "loss": 4.2398, - "step": 4300 - }, - { - "epoch": 0.7368151124027694, - "grad_norm": 1.1175380945205688, - "learning_rate": 4.3999069789419086e-05, - "loss": 4.23, - "step": 4310 - }, - { - "epoch": 0.7385246602273698, - "grad_norm": 1.0682730674743652, - "learning_rate": 4.3971375860745085e-05, - "loss": 4.2315, - "step": 4320 - }, - { - "epoch": 0.7402342080519703, - "grad_norm": 1.1111900806427002, - "learning_rate": 4.394362805799144e-05, - "loss": 4.2391, - "step": 4330 - }, - { - "epoch": 0.7419437558765707, - "grad_norm": 1.1240830421447754, - "learning_rate": 4.3915826471914833e-05, - "loss": 4.2111, - "step": 4340 - }, - { - "epoch": 0.743653303701171, - "grad_norm": 1.1100127696990967, - "learning_rate": 4.3887971193447856e-05, - "loss": 4.2284, - "step": 4350 - }, - { - "epoch": 0.7453628515257714, - "grad_norm": 1.0930662155151367, - "learning_rate": 4.386006231369872e-05, - "loss": 4.1917, - "step": 4360 - }, - { - "epoch": 0.7470723993503718, - "grad_norm": 1.1069419384002686, - "learning_rate": 4.3832099923950926e-05, - "loss": 4.2334, - "step": 4370 - }, - { - "epoch": 0.7487819471749723, - "grad_norm": 1.081602931022644, - "learning_rate": 4.380408411566304e-05, - "loss": 4.2334, - "step": 4380 - }, - { - "epoch": 0.7504914949995726, - "grad_norm": 1.0922625064849854, - "learning_rate": 4.377601498046832e-05, - "loss": 4.2167, - "step": 4390 - }, - { - "epoch": 0.752201042824173, - "grad_norm": 1.094083547592163, - "learning_rate": 4.374789261017443e-05, - "loss": 4.2241, - "step": 4400 - }, - { - "epoch": 0.7539105906487734, - "grad_norm": 1.1250125169754028, - "learning_rate": 4.371971709676319e-05, - "loss": 4.1653, - "step": 4410 - }, - { - "epoch": 0.7556201384733738, - "grad_norm": 1.0784069299697876, - "learning_rate": 4.3691488532390206e-05, - "loss": 4.2166, - "step": 4420 - }, - { - "epoch": 0.7573296862979741, - "grad_norm": 1.10162353515625, - "learning_rate": 4.366320700938463e-05, - "loss": 4.2462, - "step": 4430 - }, - { - "epoch": 0.7590392341225746, - "grad_norm": 1.089783787727356, - "learning_rate": 4.36348726202488e-05, - "loss": 4.2017, - "step": 4440 - }, - { - "epoch": 0.760748781947175, - "grad_norm": 1.1015251874923706, - "learning_rate": 4.360648545765799e-05, - "loss": 4.2205, - "step": 4450 - }, - { - "epoch": 0.7624583297717754, - "grad_norm": 1.1049211025238037, - "learning_rate": 4.357804561446008e-05, - "loss": 4.2088, - "step": 4460 - }, - { - "epoch": 0.7641678775963757, - "grad_norm": 1.0902608633041382, - "learning_rate": 4.354955318367524e-05, - "loss": 4.1983, - "step": 4470 - }, - { - "epoch": 0.7658774254209761, - "grad_norm": 1.0911825895309448, - "learning_rate": 4.352100825849566e-05, - "loss": 4.2342, - "step": 4480 - }, - { - "epoch": 0.7675869732455766, - "grad_norm": 1.0726137161254883, - "learning_rate": 4.349241093228522e-05, - "loss": 4.216, - "step": 4490 - }, - { - "epoch": 0.769296521070177, - "grad_norm": 1.0986886024475098, - "learning_rate": 4.346376129857919e-05, - "loss": 4.2202, - "step": 4500 - }, - { - "epoch": 0.7710060688947773, - "grad_norm": 1.1380634307861328, - "learning_rate": 4.3435059451083935e-05, - "loss": 4.2443, - "step": 4510 - }, - { - "epoch": 0.7727156167193777, - "grad_norm": 1.0970183610916138, - "learning_rate": 4.3406305483676594e-05, - "loss": 4.177, - "step": 4520 - }, - { - "epoch": 0.7744251645439781, - "grad_norm": 1.093316674232483, - "learning_rate": 4.337749949040477e-05, - "loss": 4.2163, - "step": 4530 - }, - { - "epoch": 0.7761347123685786, - "grad_norm": 1.0828700065612793, - "learning_rate": 4.334864156548623e-05, - "loss": 4.2401, - "step": 4540 - }, - { - "epoch": 0.7778442601931789, - "grad_norm": 1.0797021389007568, - "learning_rate": 4.331973180330862e-05, - "loss": 4.2224, - "step": 4550 - }, - { - "epoch": 0.7795538080177793, - "grad_norm": 1.0937657356262207, - "learning_rate": 4.3290770298429115e-05, - "loss": 4.2106, - "step": 4560 - }, - { - "epoch": 0.7812633558423797, - "grad_norm": 1.1316373348236084, - "learning_rate": 4.326175714557412e-05, - "loss": 4.179, - "step": 4570 - }, - { - "epoch": 0.7829729036669801, - "grad_norm": 1.0798097848892212, - "learning_rate": 4.3232692439638995e-05, - "loss": 4.2328, - "step": 4580 - }, - { - "epoch": 0.7846824514915804, - "grad_norm": 1.0894954204559326, - "learning_rate": 4.3203576275687705e-05, - "loss": 4.2626, - "step": 4590 - }, - { - "epoch": 0.7863919993161809, - "grad_norm": 1.0629162788391113, - "learning_rate": 4.317440874895251e-05, - "loss": 4.2163, - "step": 4600 - }, - { - "epoch": 0.7881015471407813, - "grad_norm": 1.097376823425293, - "learning_rate": 4.314518995483369e-05, - "loss": 4.2429, - "step": 4610 - }, - { - "epoch": 0.7898110949653817, - "grad_norm": 1.0930514335632324, - "learning_rate": 4.3115919988899186e-05, - "loss": 4.2357, - "step": 4620 - }, - { - "epoch": 0.791520642789982, - "grad_norm": 1.0703063011169434, - "learning_rate": 4.308659894688432e-05, - "loss": 4.1958, - "step": 4630 - }, - { - "epoch": 0.7932301906145824, - "grad_norm": 1.1003260612487793, - "learning_rate": 4.3057226924691465e-05, - "loss": 4.1927, - "step": 4640 - }, - { - "epoch": 0.7949397384391829, - "grad_norm": 1.0571287870407104, - "learning_rate": 4.3027804018389776e-05, - "loss": 4.2646, - "step": 4650 - }, - { - "epoch": 0.7966492862637832, - "grad_norm": 1.0914182662963867, - "learning_rate": 4.2998330324214776e-05, - "loss": 4.2019, - "step": 4660 - }, - { - "epoch": 0.7983588340883836, - "grad_norm": 1.097407579421997, - "learning_rate": 4.2968805938568154e-05, - "loss": 4.27, - "step": 4670 - }, - { - "epoch": 0.800068381912984, - "grad_norm": 1.097981572151184, - "learning_rate": 4.293923095801737e-05, - "loss": 4.2701, - "step": 4680 - }, - { - "epoch": 0.8017779297375844, - "grad_norm": 1.0830339193344116, - "learning_rate": 4.290960547929538e-05, - "loss": 4.2053, - "step": 4690 - }, - { - "epoch": 0.8034874775621847, - "grad_norm": 1.083641767501831, - "learning_rate": 4.287992959930032e-05, - "loss": 4.2239, - "step": 4700 - }, - { - "epoch": 0.8051970253867852, - "grad_norm": 1.1531246900558472, - "learning_rate": 4.285020341509515e-05, - "loss": 4.2022, - "step": 4710 - }, - { - "epoch": 0.8069065732113856, - "grad_norm": 1.10173761844635, - "learning_rate": 4.2820427023907384e-05, - "loss": 4.2122, - "step": 4720 - }, - { - "epoch": 0.808616121035986, - "grad_norm": 1.108216643333435, - "learning_rate": 4.279060052312873e-05, - "loss": 4.2393, - "step": 4730 - }, - { - "epoch": 0.8103256688605863, - "grad_norm": 1.0830931663513184, - "learning_rate": 4.2760724010314826e-05, - "loss": 4.2304, - "step": 4740 - }, - { - "epoch": 0.8120352166851867, - "grad_norm": 1.073140025138855, - "learning_rate": 4.273079758318486e-05, - "loss": 4.2177, - "step": 4750 - }, - { - "epoch": 0.8137447645097872, - "grad_norm": 1.080021619796753, - "learning_rate": 4.27008213396213e-05, - "loss": 4.213, - "step": 4760 - }, - { - "epoch": 0.8154543123343876, - "grad_norm": 1.0792510509490967, - "learning_rate": 4.267079537766952e-05, - "loss": 4.2055, - "step": 4770 - }, - { - "epoch": 0.8171638601589879, - "grad_norm": 1.11334228515625, - "learning_rate": 4.264071979553755e-05, - "loss": 4.2278, - "step": 4780 - }, - { - "epoch": 0.8188734079835883, - "grad_norm": 1.114492416381836, - "learning_rate": 4.2610594691595685e-05, - "loss": 4.2176, - "step": 4790 - }, - { - "epoch": 0.8205829558081887, - "grad_norm": 1.1293338537216187, - "learning_rate": 4.258042016437621e-05, - "loss": 4.1826, - "step": 4800 - }, - { - "epoch": 0.8222925036327892, - "grad_norm": 1.158013105392456, - "learning_rate": 4.2550196312573054e-05, - "loss": 4.2373, - "step": 4810 - }, - { - "epoch": 0.8240020514573895, - "grad_norm": 1.0849990844726562, - "learning_rate": 4.2519923235041494e-05, - "loss": 4.2518, - "step": 4820 - }, - { - "epoch": 0.8257115992819899, - "grad_norm": 1.1048305034637451, - "learning_rate": 4.248960103079779e-05, - "loss": 4.1755, - "step": 4830 - }, - { - "epoch": 0.8274211471065903, - "grad_norm": 1.0961942672729492, - "learning_rate": 4.24592297990189e-05, - "loss": 4.2039, - "step": 4840 - }, - { - "epoch": 0.8291306949311907, - "grad_norm": 1.1146312952041626, - "learning_rate": 4.242880963904211e-05, - "loss": 4.1711, - "step": 4850 - }, - { - "epoch": 0.830840242755791, - "grad_norm": 1.0729835033416748, - "learning_rate": 4.23983406503648e-05, - "loss": 4.1687, - "step": 4860 - }, - { - "epoch": 0.8325497905803915, - "grad_norm": 1.1004365682601929, - "learning_rate": 4.236782293264399e-05, - "loss": 4.195, - "step": 4870 - }, - { - "epoch": 0.8342593384049919, - "grad_norm": 1.1307947635650635, - "learning_rate": 4.233725658569612e-05, - "loss": 4.1697, - "step": 4880 - }, - { - "epoch": 0.8359688862295923, - "grad_norm": 1.093675971031189, - "learning_rate": 4.230664170949669e-05, - "loss": 4.2011, - "step": 4890 - }, - { - "epoch": 0.8376784340541926, - "grad_norm": 1.0708363056182861, - "learning_rate": 4.227597840417989e-05, - "loss": 4.2191, - "step": 4900 - }, - { - "epoch": 0.839387981878793, - "grad_norm": 1.0706011056900024, - "learning_rate": 4.2245266770038344e-05, - "loss": 4.2323, - "step": 4910 - }, - { - "epoch": 0.8410975297033935, - "grad_norm": 1.0779540538787842, - "learning_rate": 4.221450690752275e-05, - "loss": 4.2261, - "step": 4920 - }, - { - "epoch": 0.8428070775279939, - "grad_norm": 1.1013174057006836, - "learning_rate": 4.218369891724153e-05, - "loss": 4.2521, - "step": 4930 - }, - { - "epoch": 0.8445166253525942, - "grad_norm": 1.1218292713165283, - "learning_rate": 4.215284289996053e-05, - "loss": 4.2262, - "step": 4940 - }, - { - "epoch": 0.8462261731771946, - "grad_norm": 1.074468970298767, - "learning_rate": 4.212193895660269e-05, - "loss": 4.1897, - "step": 4950 - }, - { - "epoch": 0.847935721001795, - "grad_norm": 1.1277469396591187, - "learning_rate": 4.2090987188247664e-05, - "loss": 4.1936, - "step": 4960 - }, - { - "epoch": 0.8496452688263955, - "grad_norm": 1.1249181032180786, - "learning_rate": 4.205998769613161e-05, - "loss": 4.2345, - "step": 4970 - }, - { - "epoch": 0.8513548166509958, - "grad_norm": 1.1029151678085327, - "learning_rate": 4.2028940581646706e-05, - "loss": 4.2012, - "step": 4980 - }, - { - "epoch": 0.8530643644755962, - "grad_norm": 1.1064203977584839, - "learning_rate": 4.199784594634091e-05, - "loss": 4.2091, - "step": 4990 - }, - { - "epoch": 0.8547739123001966, - "grad_norm": 1.0808665752410889, - "learning_rate": 4.1966703891917644e-05, - "loss": 4.2125, - "step": 5000 - }, - { - "epoch": 0.8547739123001966, - "eval_loss": 4.221324443817139, - "eval_runtime": 14.9467, - "eval_samples_per_second": 65.165, - "eval_steps_per_second": 0.87, - "step": 5000 - }, - { - "epoch": 0.856483460124797, - "grad_norm": 1.1059165000915527, - "learning_rate": 4.193551452023537e-05, - "loss": 4.2717, - "step": 5010 - }, - { - "epoch": 0.8581930079493973, - "grad_norm": 1.109848141670227, - "learning_rate": 4.190427793330737e-05, - "loss": 4.2135, - "step": 5020 - }, - { - "epoch": 0.8599025557739978, - "grad_norm": 1.1132358312606812, - "learning_rate": 4.187299423330132e-05, - "loss": 4.2306, - "step": 5030 - }, - { - "epoch": 0.8616121035985982, - "grad_norm": 1.1210432052612305, - "learning_rate": 4.184166352253901e-05, - "loss": 4.1562, - "step": 5040 - }, - { - "epoch": 0.8633216514231986, - "grad_norm": 1.1282910108566284, - "learning_rate": 4.1810285903495975e-05, - "loss": 4.1731, - "step": 5050 - }, - { - "epoch": 0.8650311992477989, - "grad_norm": 1.1050913333892822, - "learning_rate": 4.177886147880119e-05, - "loss": 4.1918, - "step": 5060 - }, - { - "epoch": 0.8667407470723993, - "grad_norm": 1.1351230144500732, - "learning_rate": 4.174739035123671e-05, - "loss": 4.2229, - "step": 5070 - }, - { - "epoch": 0.8684502948969998, - "grad_norm": 1.0826314687728882, - "learning_rate": 4.171587262373737e-05, - "loss": 4.1589, - "step": 5080 - }, - { - "epoch": 0.8701598427216002, - "grad_norm": 1.0703433752059937, - "learning_rate": 4.168430839939038e-05, - "loss": 4.2402, - "step": 5090 - }, - { - "epoch": 0.8718693905462005, - "grad_norm": 1.0734028816223145, - "learning_rate": 4.1652697781435064e-05, - "loss": 4.235, - "step": 5100 - }, - { - "epoch": 0.8735789383708009, - "grad_norm": 1.1095879077911377, - "learning_rate": 4.162104087326247e-05, - "loss": 4.1781, - "step": 5110 - }, - { - "epoch": 0.8752884861954013, - "grad_norm": 1.0920771360397339, - "learning_rate": 4.158933777841507e-05, - "loss": 4.219, - "step": 5120 - }, - { - "epoch": 0.8769980340200018, - "grad_norm": 1.1102924346923828, - "learning_rate": 4.155758860058639e-05, - "loss": 4.1812, - "step": 5130 - }, - { - "epoch": 0.8787075818446021, - "grad_norm": 1.0601921081542969, - "learning_rate": 4.1525793443620674e-05, - "loss": 4.1896, - "step": 5140 - }, - { - "epoch": 0.8804171296692025, - "grad_norm": 1.1162163019180298, - "learning_rate": 4.149395241151257e-05, - "loss": 4.1976, - "step": 5150 - }, - { - "epoch": 0.8821266774938029, - "grad_norm": 1.0771912336349487, - "learning_rate": 4.1462065608406766e-05, - "loss": 4.2136, - "step": 5160 - }, - { - "epoch": 0.8838362253184033, - "grad_norm": 1.0650315284729004, - "learning_rate": 4.143013313859767e-05, - "loss": 4.2158, - "step": 5170 - }, - { - "epoch": 0.8855457731430036, - "grad_norm": 1.075141429901123, - "learning_rate": 4.1398155106529006e-05, - "loss": 4.2686, - "step": 5180 - }, - { - "epoch": 0.8872553209676041, - "grad_norm": 1.1017076969146729, - "learning_rate": 4.1366131616793585e-05, - "loss": 4.1909, - "step": 5190 - }, - { - "epoch": 0.8889648687922045, - "grad_norm": 1.108544111251831, - "learning_rate": 4.133406277413288e-05, - "loss": 4.2278, - "step": 5200 - }, - { - "epoch": 0.8906744166168049, - "grad_norm": 1.0897852182388306, - "learning_rate": 4.130194868343666e-05, - "loss": 4.182, - "step": 5210 - }, - { - "epoch": 0.8923839644414052, - "grad_norm": 1.1739139556884766, - "learning_rate": 4.1269789449742754e-05, - "loss": 4.1697, - "step": 5220 - }, - { - "epoch": 0.8940935122660056, - "grad_norm": 1.1182951927185059, - "learning_rate": 4.12375851782366e-05, - "loss": 4.1842, - "step": 5230 - }, - { - "epoch": 0.8958030600906061, - "grad_norm": 1.0998613834381104, - "learning_rate": 4.120533597425097e-05, - "loss": 4.1925, - "step": 5240 - }, - { - "epoch": 0.8975126079152064, - "grad_norm": 1.084879994392395, - "learning_rate": 4.117304194326558e-05, - "loss": 4.2585, - "step": 5250 - }, - { - "epoch": 0.8992221557398068, - "grad_norm": 1.1102412939071655, - "learning_rate": 4.114070319090678e-05, - "loss": 4.2135, - "step": 5260 - }, - { - "epoch": 0.9009317035644072, - "grad_norm": 1.0872443914413452, - "learning_rate": 4.1108319822947165e-05, - "loss": 4.2241, - "step": 5270 - }, - { - "epoch": 0.9026412513890076, - "grad_norm": 1.1075351238250732, - "learning_rate": 4.107589194530531e-05, - "loss": 4.1706, - "step": 5280 - }, - { - "epoch": 0.904350799213608, - "grad_norm": 1.0992456674575806, - "learning_rate": 4.1043419664045316e-05, - "loss": 4.2158, - "step": 5290 - }, - { - "epoch": 0.9060603470382084, - "grad_norm": 1.1312294006347656, - "learning_rate": 4.101090308537656e-05, - "loss": 4.2728, - "step": 5300 - }, - { - "epoch": 0.9077698948628088, - "grad_norm": 1.1104251146316528, - "learning_rate": 4.097834231565328e-05, - "loss": 4.2013, - "step": 5310 - }, - { - "epoch": 0.9094794426874092, - "grad_norm": 1.0897973775863647, - "learning_rate": 4.094573746137424e-05, - "loss": 4.2098, - "step": 5320 - }, - { - "epoch": 0.9111889905120095, - "grad_norm": 1.135635256767273, - "learning_rate": 4.091308862918244e-05, - "loss": 4.185, - "step": 5330 - }, - { - "epoch": 0.9128985383366099, - "grad_norm": 1.0775920152664185, - "learning_rate": 4.08803959258647e-05, - "loss": 4.1961, - "step": 5340 - }, - { - "epoch": 0.9146080861612104, - "grad_norm": 1.1442935466766357, - "learning_rate": 4.0847659458351306e-05, - "loss": 4.2054, - "step": 5350 - }, - { - "epoch": 0.9163176339858108, - "grad_norm": 1.072646141052246, - "learning_rate": 4.0814879333715725e-05, - "loss": 4.2467, - "step": 5360 - }, - { - "epoch": 0.9180271818104111, - "grad_norm": 1.1238670349121094, - "learning_rate": 4.078205565917419e-05, - "loss": 4.1525, - "step": 5370 - }, - { - "epoch": 0.9197367296350115, - "grad_norm": 1.100277066230774, - "learning_rate": 4.07491885420854e-05, - "loss": 4.2445, - "step": 5380 - }, - { - "epoch": 0.9214462774596119, - "grad_norm": 1.072641134262085, - "learning_rate": 4.0716278089950117e-05, - "loss": 4.2497, - "step": 5390 - }, - { - "epoch": 0.9231558252842124, - "grad_norm": 1.0912519693374634, - "learning_rate": 4.068332441041086e-05, - "loss": 4.1493, - "step": 5400 - }, - { - "epoch": 0.9248653731088127, - "grad_norm": 1.1013576984405518, - "learning_rate": 4.065032761125153e-05, - "loss": 4.2458, - "step": 5410 - }, - { - "epoch": 0.9265749209334131, - "grad_norm": 1.0836546421051025, - "learning_rate": 4.061728780039707e-05, - "loss": 4.2171, - "step": 5420 - }, - { - "epoch": 0.9282844687580135, - "grad_norm": 1.0915653705596924, - "learning_rate": 4.058420508591309e-05, - "loss": 4.215, - "step": 5430 - }, - { - "epoch": 0.9299940165826139, - "grad_norm": 1.1450116634368896, - "learning_rate": 4.055107957600553e-05, - "loss": 4.2096, - "step": 5440 - }, - { - "epoch": 0.9317035644072142, - "grad_norm": 1.1178923845291138, - "learning_rate": 4.0517911379020326e-05, - "loss": 4.2071, - "step": 5450 - }, - { - "epoch": 0.9334131122318147, - "grad_norm": 1.1007795333862305, - "learning_rate": 4.048470060344301e-05, - "loss": 4.2124, - "step": 5460 - }, - { - "epoch": 0.9351226600564151, - "grad_norm": 1.0651965141296387, - "learning_rate": 4.045144735789838e-05, - "loss": 4.2442, - "step": 5470 - }, - { - "epoch": 0.9368322078810155, - "grad_norm": 1.098145604133606, - "learning_rate": 4.041815175115015e-05, - "loss": 4.1669, - "step": 5480 - }, - { - "epoch": 0.9385417557056158, - "grad_norm": 1.1463416814804077, - "learning_rate": 4.038481389210059e-05, - "loss": 4.2448, - "step": 5490 - }, - { - "epoch": 0.9402513035302162, - "grad_norm": 1.1300941705703735, - "learning_rate": 4.0351433889790166e-05, - "loss": 4.1918, - "step": 5500 - }, - { - "epoch": 0.9419608513548167, - "grad_norm": 1.0922478437423706, - "learning_rate": 4.031801185339717e-05, - "loss": 4.1968, - "step": 5510 - }, - { - "epoch": 0.9436703991794171, - "grad_norm": 1.145541787147522, - "learning_rate": 4.028454789223742e-05, - "loss": 4.1791, - "step": 5520 - }, - { - "epoch": 0.9453799470040174, - "grad_norm": 1.1304386854171753, - "learning_rate": 4.02510421157638e-05, - "loss": 4.1873, - "step": 5530 - }, - { - "epoch": 0.9470894948286178, - "grad_norm": 1.1011337041854858, - "learning_rate": 4.021749463356602e-05, - "loss": 4.2359, - "step": 5540 - }, - { - "epoch": 0.9487990426532182, - "grad_norm": 1.1082210540771484, - "learning_rate": 4.018390555537015e-05, - "loss": 4.2105, - "step": 5550 - }, - { - "epoch": 0.9505085904778187, - "grad_norm": 1.0832359790802002, - "learning_rate": 4.015027499103834e-05, - "loss": 4.1937, - "step": 5560 - }, - { - "epoch": 0.952218138302419, - "grad_norm": 1.0481384992599487, - "learning_rate": 4.0116603050568446e-05, - "loss": 4.2306, - "step": 5570 - }, - { - "epoch": 0.9539276861270194, - "grad_norm": 1.10129714012146, - "learning_rate": 4.008288984409361e-05, - "loss": 4.1873, - "step": 5580 - }, - { - "epoch": 0.9556372339516198, - "grad_norm": 1.1311404705047607, - "learning_rate": 4.0049135481881995e-05, - "loss": 4.236, - "step": 5590 - }, - { - "epoch": 0.9573467817762202, - "grad_norm": 1.1674425601959229, - "learning_rate": 4.001534007433633e-05, - "loss": 4.1932, - "step": 5600 - }, - { - "epoch": 0.9590563296008205, - "grad_norm": 1.0756914615631104, - "learning_rate": 3.998150373199362e-05, - "loss": 4.2012, - "step": 5610 - }, - { - "epoch": 0.960765877425421, - "grad_norm": 1.0900822877883911, - "learning_rate": 3.9947626565524754e-05, - "loss": 4.1879, - "step": 5620 - }, - { - "epoch": 0.9624754252500214, - "grad_norm": 1.1168532371520996, - "learning_rate": 3.9913708685734156e-05, - "loss": 4.22, - "step": 5630 - }, - { - "epoch": 0.9641849730746218, - "grad_norm": 1.1325898170471191, - "learning_rate": 3.98797502035594e-05, - "loss": 4.1834, - "step": 5640 - }, - { - "epoch": 0.9658945208992221, - "grad_norm": 1.0924124717712402, - "learning_rate": 3.984575123007087e-05, - "loss": 4.2006, - "step": 5650 - }, - { - "epoch": 0.9676040687238225, - "grad_norm": 1.0883935689926147, - "learning_rate": 3.9811711876471374e-05, - "loss": 4.2489, - "step": 5660 - }, - { - "epoch": 0.969313616548423, - "grad_norm": 1.1189782619476318, - "learning_rate": 3.977763225409581e-05, - "loss": 4.2134, - "step": 5670 - }, - { - "epoch": 0.9710231643730234, - "grad_norm": 1.1052134037017822, - "learning_rate": 3.9743512474410776e-05, - "loss": 4.2618, - "step": 5680 - }, - { - "epoch": 0.9727327121976237, - "grad_norm": 1.106770396232605, - "learning_rate": 3.970935264901422e-05, - "loss": 4.1571, - "step": 5690 - }, - { - "epoch": 0.9744422600222241, - "grad_norm": 1.1165562868118286, - "learning_rate": 3.967515288963507e-05, - "loss": 4.1984, - "step": 5700 - }, - { - "epoch": 0.9761518078468245, - "grad_norm": 1.0882115364074707, - "learning_rate": 3.964091330813287e-05, - "loss": 4.2074, - "step": 5710 - }, - { - "epoch": 0.977861355671425, - "grad_norm": 1.0754927396774292, - "learning_rate": 3.9606634016497404e-05, - "loss": 4.2458, - "step": 5720 - }, - { - "epoch": 0.9795709034960253, - "grad_norm": 1.1058735847473145, - "learning_rate": 3.9572315126848356e-05, - "loss": 4.1825, - "step": 5730 - }, - { - "epoch": 0.9812804513206257, - "grad_norm": 1.1208829879760742, - "learning_rate": 3.9537956751434904e-05, - "loss": 4.2134, - "step": 5740 - }, - { - "epoch": 0.9829899991452261, - "grad_norm": 1.1145589351654053, - "learning_rate": 3.95035590026354e-05, - "loss": 4.1889, - "step": 5750 - }, - { - "epoch": 0.9846995469698265, - "grad_norm": 1.0948501825332642, - "learning_rate": 3.946912199295695e-05, - "loss": 4.216, - "step": 5760 - }, - { - "epoch": 0.9864090947944268, - "grad_norm": 1.0784988403320312, - "learning_rate": 3.94346458350351e-05, - "loss": 4.2454, - "step": 5770 - }, - { - "epoch": 0.9881186426190273, - "grad_norm": 1.0936200618743896, - "learning_rate": 3.940013064163342e-05, - "loss": 4.208, - "step": 5780 - }, - { - "epoch": 0.9898281904436277, - "grad_norm": 1.0742484331130981, - "learning_rate": 3.9365576525643164e-05, - "loss": 4.2146, - "step": 5790 - }, - { - "epoch": 0.9915377382682281, - "grad_norm": 1.1136304140090942, - "learning_rate": 3.93309836000829e-05, - "loss": 4.2021, - "step": 5800 - }, - { - "epoch": 0.9932472860928284, - "grad_norm": 1.1223887205123901, - "learning_rate": 3.9296351978098114e-05, - "loss": 4.2077, - "step": 5810 - }, - { - "epoch": 0.9949568339174288, - "grad_norm": 1.149657130241394, - "learning_rate": 3.9261681772960884e-05, - "loss": 4.2155, - "step": 5820 - }, - { - "epoch": 0.9966663817420293, - "grad_norm": 1.0799331665039062, - "learning_rate": 3.9226973098069456e-05, - "loss": 4.1809, - "step": 5830 - }, - { - "epoch": 0.9983759295666296, - "grad_norm": 1.1035246849060059, - "learning_rate": 3.919222606694794e-05, - "loss": 4.204, - "step": 5840 - }, - { - "epoch": 1.00008547739123, - "grad_norm": 1.2470793724060059, - "learning_rate": 3.9157440793245855e-05, - "loss": 4.1949, - "step": 5850 - }, - { - "epoch": 1.0017950252158303, - "grad_norm": 1.3840519189834595, - "learning_rate": 3.912261739073785e-05, - "loss": 3.9217, - "step": 5860 - }, - { - "epoch": 1.0035045730404308, - "grad_norm": 1.1869735717773438, - "learning_rate": 3.9087755973323234e-05, - "loss": 3.9142, - "step": 5870 - }, - { - "epoch": 1.0052141208650311, - "grad_norm": 1.2040549516677856, - "learning_rate": 3.90528566550257e-05, - "loss": 3.8906, - "step": 5880 - }, - { - "epoch": 1.0069236686896317, - "grad_norm": 1.1796395778656006, - "learning_rate": 3.901791954999287e-05, - "loss": 3.861, - "step": 5890 - }, - { - "epoch": 1.008633216514232, - "grad_norm": 1.2027372121810913, - "learning_rate": 3.8982944772496e-05, - "loss": 3.8843, - "step": 5900 - }, - { - "epoch": 1.0103427643388323, - "grad_norm": 1.2046513557434082, - "learning_rate": 3.8947932436929506e-05, - "loss": 3.9101, - "step": 5910 - }, - { - "epoch": 1.0120523121634328, - "grad_norm": 1.2311046123504639, - "learning_rate": 3.89128826578107e-05, - "loss": 3.8354, - "step": 5920 - }, - { - "epoch": 1.0137618599880331, - "grad_norm": 1.202433466911316, - "learning_rate": 3.887779554977934e-05, - "loss": 3.8879, - "step": 5930 - }, - { - "epoch": 1.0154714078126335, - "grad_norm": 1.1902302503585815, - "learning_rate": 3.884267122759727e-05, - "loss": 3.9615, - "step": 5940 - }, - { - "epoch": 1.017180955637234, - "grad_norm": 1.2200571298599243, - "learning_rate": 3.880750980614807e-05, - "loss": 3.8552, - "step": 5950 - }, - { - "epoch": 1.0188905034618343, - "grad_norm": 1.1822001934051514, - "learning_rate": 3.8772311400436674e-05, - "loss": 3.8755, - "step": 5960 - }, - { - "epoch": 1.0206000512864348, - "grad_norm": 1.1966800689697266, - "learning_rate": 3.8737076125588945e-05, - "loss": 3.94, - "step": 5970 - }, - { - "epoch": 1.0223095991110351, - "grad_norm": 1.2249361276626587, - "learning_rate": 3.870180409685136e-05, - "loss": 3.8646, - "step": 5980 - }, - { - "epoch": 1.0240191469356354, - "grad_norm": 1.225176453590393, - "learning_rate": 3.866649542959062e-05, - "loss": 3.8819, - "step": 5990 - }, - { - "epoch": 1.025728694760236, - "grad_norm": 1.2029138803482056, - "learning_rate": 3.863115023929324e-05, - "loss": 3.8938, - "step": 6000 - }, - { - "epoch": 1.025728694760236, - "eval_loss": 4.225888729095459, - "eval_runtime": 14.726, - "eval_samples_per_second": 66.142, - "eval_steps_per_second": 0.883, - "step": 6000 - }, - { - "epoch": 1.0274382425848363, - "grad_norm": 1.2310200929641724, - "learning_rate": 3.859576864156521e-05, - "loss": 3.8488, - "step": 6010 - }, - { - "epoch": 1.0291477904094366, - "grad_norm": 1.2333732843399048, - "learning_rate": 3.856035075213159e-05, - "loss": 3.9137, - "step": 6020 - }, - { - "epoch": 1.0308573382340371, - "grad_norm": 1.2623525857925415, - "learning_rate": 3.852489668683614e-05, - "loss": 3.876, - "step": 6030 - }, - { - "epoch": 1.0325668860586374, - "grad_norm": 1.1837838888168335, - "learning_rate": 3.8489406561640945e-05, - "loss": 3.8762, - "step": 6040 - }, - { - "epoch": 1.034276433883238, - "grad_norm": 1.2944416999816895, - "learning_rate": 3.845388049262604e-05, - "loss": 3.8984, - "step": 6050 - }, - { - "epoch": 1.0359859817078383, - "grad_norm": 1.215786337852478, - "learning_rate": 3.841831859598903e-05, - "loss": 3.8963, - "step": 6060 - }, - { - "epoch": 1.0376955295324386, - "grad_norm": 1.2454984188079834, - "learning_rate": 3.8382720988044704e-05, - "loss": 3.9005, - "step": 6070 - }, - { - "epoch": 1.0394050773570391, - "grad_norm": 1.2577461004257202, - "learning_rate": 3.834708778522461e-05, - "loss": 3.8671, - "step": 6080 - }, - { - "epoch": 1.0411146251816394, - "grad_norm": 1.2162314653396606, - "learning_rate": 3.831141910407678e-05, - "loss": 3.8755, - "step": 6090 - }, - { - "epoch": 1.0428241730062398, - "grad_norm": 1.2268115282058716, - "learning_rate": 3.827571506126525e-05, - "loss": 3.9084, - "step": 6100 - }, - { - "epoch": 1.0445337208308403, - "grad_norm": 1.2096734046936035, - "learning_rate": 3.823997577356974e-05, - "loss": 3.8858, - "step": 6110 - }, - { - "epoch": 1.0462432686554406, - "grad_norm": 1.2104732990264893, - "learning_rate": 3.820420135788521e-05, - "loss": 3.9149, - "step": 6120 - }, - { - "epoch": 1.0479528164800411, - "grad_norm": 1.201377272605896, - "learning_rate": 3.816839193122157e-05, - "loss": 3.8929, - "step": 6130 - }, - { - "epoch": 1.0496623643046414, - "grad_norm": 1.2180641889572144, - "learning_rate": 3.8132547610703196e-05, - "loss": 3.8968, - "step": 6140 - }, - { - "epoch": 1.0513719121292417, - "grad_norm": 1.2416244745254517, - "learning_rate": 3.8096668513568615e-05, - "loss": 3.8875, - "step": 6150 - }, - { - "epoch": 1.0530814599538423, - "grad_norm": 1.2351043224334717, - "learning_rate": 3.806075475717008e-05, - "loss": 3.9181, - "step": 6160 - }, - { - "epoch": 1.0547910077784426, - "grad_norm": 1.2557121515274048, - "learning_rate": 3.802480645897326e-05, - "loss": 3.8905, - "step": 6170 - }, - { - "epoch": 1.056500555603043, - "grad_norm": 1.2172404527664185, - "learning_rate": 3.798882373655673e-05, - "loss": 3.8292, - "step": 6180 - }, - { - "epoch": 1.0582101034276434, - "grad_norm": 1.2274837493896484, - "learning_rate": 3.7952806707611714e-05, - "loss": 3.836, - "step": 6190 - }, - { - "epoch": 1.0599196512522437, - "grad_norm": 1.3620704412460327, - "learning_rate": 3.791675548994162e-05, - "loss": 3.8912, - "step": 6200 - }, - { - "epoch": 1.0616291990768443, - "grad_norm": 1.2791434526443481, - "learning_rate": 3.788067020146168e-05, - "loss": 3.8829, - "step": 6210 - }, - { - "epoch": 1.0633387469014446, - "grad_norm": 1.2293933629989624, - "learning_rate": 3.7844550960198565e-05, - "loss": 3.8702, - "step": 6220 - }, - { - "epoch": 1.065048294726045, - "grad_norm": 1.3083046674728394, - "learning_rate": 3.780839788429001e-05, - "loss": 3.919, - "step": 6230 - }, - { - "epoch": 1.0667578425506454, - "grad_norm": 1.2462128400802612, - "learning_rate": 3.77722110919844e-05, - "loss": 3.9038, - "step": 6240 - }, - { - "epoch": 1.0684673903752457, - "grad_norm": 1.244581699371338, - "learning_rate": 3.77359907016404e-05, - "loss": 3.8946, - "step": 6250 - }, - { - "epoch": 1.070176938199846, - "grad_norm": 1.2279540300369263, - "learning_rate": 3.769973683172659e-05, - "loss": 3.9165, - "step": 6260 - }, - { - "epoch": 1.0718864860244466, - "grad_norm": 1.216293215751648, - "learning_rate": 3.766344960082101e-05, - "loss": 3.9155, - "step": 6270 - }, - { - "epoch": 1.073596033849047, - "grad_norm": 1.2287122011184692, - "learning_rate": 3.762712912761085e-05, - "loss": 3.9076, - "step": 6280 - }, - { - "epoch": 1.0753055816736472, - "grad_norm": 1.2020460367202759, - "learning_rate": 3.759077553089202e-05, - "loss": 3.8562, - "step": 6290 - }, - { - "epoch": 1.0770151294982477, - "grad_norm": 1.1856670379638672, - "learning_rate": 3.755438892956878e-05, - "loss": 3.8494, - "step": 6300 - }, - { - "epoch": 1.078724677322848, - "grad_norm": 1.221611499786377, - "learning_rate": 3.751796944265331e-05, - "loss": 3.9129, - "step": 6310 - }, - { - "epoch": 1.0804342251474486, - "grad_norm": 1.2119340896606445, - "learning_rate": 3.748151718926539e-05, - "loss": 3.9301, - "step": 6320 - }, - { - "epoch": 1.082143772972049, - "grad_norm": 1.2519149780273438, - "learning_rate": 3.7445032288631924e-05, - "loss": 3.8967, - "step": 6330 - }, - { - "epoch": 1.0838533207966492, - "grad_norm": 1.211293339729309, - "learning_rate": 3.740851486008665e-05, - "loss": 3.9127, - "step": 6340 - }, - { - "epoch": 1.0855628686212497, - "grad_norm": 1.1981501579284668, - "learning_rate": 3.737196502306966e-05, - "loss": 3.8868, - "step": 6350 - }, - { - "epoch": 1.08727241644585, - "grad_norm": 1.2011394500732422, - "learning_rate": 3.7335382897127064e-05, - "loss": 3.9544, - "step": 6360 - }, - { - "epoch": 1.0889819642704506, - "grad_norm": 1.3066656589508057, - "learning_rate": 3.729876860191057e-05, - "loss": 3.8781, - "step": 6370 - }, - { - "epoch": 1.0906915120950509, - "grad_norm": 1.243750810623169, - "learning_rate": 3.726212225717712e-05, - "loss": 3.8878, - "step": 6380 - }, - { - "epoch": 1.0924010599196512, - "grad_norm": 1.2609118223190308, - "learning_rate": 3.722544398278847e-05, - "loss": 3.8988, - "step": 6390 - }, - { - "epoch": 1.0941106077442517, - "grad_norm": 1.2118616104125977, - "learning_rate": 3.718873389871082e-05, - "loss": 3.9101, - "step": 6400 - }, - { - "epoch": 1.095820155568852, - "grad_norm": 1.200276255607605, - "learning_rate": 3.715199212501439e-05, - "loss": 3.8612, - "step": 6410 - }, - { - "epoch": 1.0975297033934523, - "grad_norm": 1.2141379117965698, - "learning_rate": 3.711521878187308e-05, - "loss": 3.9304, - "step": 6420 - }, - { - "epoch": 1.0992392512180529, - "grad_norm": 1.2518267631530762, - "learning_rate": 3.707841398956403e-05, - "loss": 3.8914, - "step": 6430 - }, - { - "epoch": 1.1009487990426532, - "grad_norm": 1.2810555696487427, - "learning_rate": 3.704157786846724e-05, - "loss": 3.8915, - "step": 6440 - }, - { - "epoch": 1.1026583468672535, - "grad_norm": 1.214570164680481, - "learning_rate": 3.7004710539065194e-05, - "loss": 3.8604, - "step": 6450 - }, - { - "epoch": 1.104367894691854, - "grad_norm": 1.2016327381134033, - "learning_rate": 3.6967812121942436e-05, - "loss": 3.8653, - "step": 6460 - }, - { - "epoch": 1.1060774425164543, - "grad_norm": 1.252418875694275, - "learning_rate": 3.6930882737785185e-05, - "loss": 3.9079, - "step": 6470 - }, - { - "epoch": 1.1077869903410549, - "grad_norm": 1.2139511108398438, - "learning_rate": 3.689392250738098e-05, - "loss": 3.8985, - "step": 6480 - }, - { - "epoch": 1.1094965381656552, - "grad_norm": 1.2347521781921387, - "learning_rate": 3.685693155161821e-05, - "loss": 3.8889, - "step": 6490 - }, - { - "epoch": 1.1112060859902555, - "grad_norm": 1.2597804069519043, - "learning_rate": 3.681990999148579e-05, - "loss": 3.9317, - "step": 6500 - }, - { - "epoch": 1.112915633814856, - "grad_norm": 1.2117327451705933, - "learning_rate": 3.6782857948072715e-05, - "loss": 3.8808, - "step": 6510 - }, - { - "epoch": 1.1146251816394563, - "grad_norm": 1.2120871543884277, - "learning_rate": 3.6745775542567694e-05, - "loss": 3.893, - "step": 6520 - }, - { - "epoch": 1.1163347294640569, - "grad_norm": 1.2200185060501099, - "learning_rate": 3.670866289625875e-05, - "loss": 3.8855, - "step": 6530 - }, - { - "epoch": 1.1180442772886572, - "grad_norm": 1.2236151695251465, - "learning_rate": 3.667152013053279e-05, - "loss": 3.859, - "step": 6540 - }, - { - "epoch": 1.1197538251132575, - "grad_norm": 1.2398489713668823, - "learning_rate": 3.663434736687526e-05, - "loss": 3.8726, - "step": 6550 - }, - { - "epoch": 1.121463372937858, - "grad_norm": 1.2758897542953491, - "learning_rate": 3.6597144726869725e-05, - "loss": 3.8955, - "step": 6560 - }, - { - "epoch": 1.1231729207624583, - "grad_norm": 1.1870349645614624, - "learning_rate": 3.6559912332197447e-05, - "loss": 3.903, - "step": 6570 - }, - { - "epoch": 1.1248824685870586, - "grad_norm": 1.2525904178619385, - "learning_rate": 3.6522650304637024e-05, - "loss": 3.9007, - "step": 6580 - }, - { - "epoch": 1.1265920164116592, - "grad_norm": 1.2074586153030396, - "learning_rate": 3.648535876606397e-05, - "loss": 3.8676, - "step": 6590 - }, - { - "epoch": 1.1283015642362595, - "grad_norm": 1.2804462909698486, - "learning_rate": 3.6448037838450323e-05, - "loss": 3.9066, - "step": 6600 - }, - { - "epoch": 1.1300111120608598, - "grad_norm": 1.207937479019165, - "learning_rate": 3.641068764386426e-05, - "loss": 3.9523, - "step": 6610 - }, - { - "epoch": 1.1317206598854603, - "grad_norm": 1.2353366613388062, - "learning_rate": 3.637330830446966e-05, - "loss": 3.8874, - "step": 6620 - }, - { - "epoch": 1.1334302077100606, - "grad_norm": 1.2423683404922485, - "learning_rate": 3.6335899942525744e-05, - "loss": 3.896, - "step": 6630 - }, - { - "epoch": 1.1351397555346612, - "grad_norm": 1.2394087314605713, - "learning_rate": 3.629846268038665e-05, - "loss": 3.8373, - "step": 6640 - }, - { - "epoch": 1.1368493033592615, - "grad_norm": 1.2041878700256348, - "learning_rate": 3.626099664050106e-05, - "loss": 3.8531, - "step": 6650 - }, - { - "epoch": 1.1385588511838618, - "grad_norm": 1.22921621799469, - "learning_rate": 3.622350194541175e-05, - "loss": 3.8988, - "step": 6660 - }, - { - "epoch": 1.1402683990084623, - "grad_norm": 1.2405158281326294, - "learning_rate": 3.6185978717755244e-05, - "loss": 3.8591, - "step": 6670 - }, - { - "epoch": 1.1419779468330626, - "grad_norm": 1.2411692142486572, - "learning_rate": 3.6148427080261395e-05, - "loss": 3.8639, - "step": 6680 - }, - { - "epoch": 1.1436874946576632, - "grad_norm": 1.2315113544464111, - "learning_rate": 3.6110847155752956e-05, - "loss": 3.925, - "step": 6690 - }, - { - "epoch": 1.1453970424822635, - "grad_norm": 1.2906644344329834, - "learning_rate": 3.60732390671452e-05, - "loss": 3.8885, - "step": 6700 - }, - { - "epoch": 1.1471065903068638, - "grad_norm": 1.2568696737289429, - "learning_rate": 3.603560293744556e-05, - "loss": 3.876, - "step": 6710 - }, - { - "epoch": 1.148816138131464, - "grad_norm": 1.2818893194198608, - "learning_rate": 3.599793888975312e-05, - "loss": 3.8812, - "step": 6720 - }, - { - "epoch": 1.1505256859560646, - "grad_norm": 1.2619463205337524, - "learning_rate": 3.596024704725835e-05, - "loss": 3.8784, - "step": 6730 - }, - { - "epoch": 1.152235233780665, - "grad_norm": 1.2190254926681519, - "learning_rate": 3.592252753324256e-05, - "loss": 3.8766, - "step": 6740 - }, - { - "epoch": 1.1539447816052655, - "grad_norm": 1.2215932607650757, - "learning_rate": 3.588478047107761e-05, - "loss": 3.9069, - "step": 6750 - }, - { - "epoch": 1.1556543294298658, - "grad_norm": 1.248286485671997, - "learning_rate": 3.584700598422545e-05, - "loss": 3.8796, - "step": 6760 - }, - { - "epoch": 1.157363877254466, - "grad_norm": 1.2150369882583618, - "learning_rate": 3.580920419623775e-05, - "loss": 3.9021, - "step": 6770 - }, - { - "epoch": 1.1590734250790666, - "grad_norm": 1.2244809865951538, - "learning_rate": 3.577137523075544e-05, - "loss": 3.899, - "step": 6780 - }, - { - "epoch": 1.160782972903667, - "grad_norm": 1.2323485612869263, - "learning_rate": 3.573351921150837e-05, - "loss": 3.9305, - "step": 6790 - }, - { - "epoch": 1.1624925207282675, - "grad_norm": 1.2519583702087402, - "learning_rate": 3.5695636262314844e-05, - "loss": 3.9092, - "step": 6800 - }, - { - "epoch": 1.1642020685528678, - "grad_norm": 1.228255271911621, - "learning_rate": 3.565772650708131e-05, - "loss": 3.8897, - "step": 6810 - }, - { - "epoch": 1.165911616377468, - "grad_norm": 1.2322989702224731, - "learning_rate": 3.561979006980181e-05, - "loss": 3.8942, - "step": 6820 - }, - { - "epoch": 1.1676211642020686, - "grad_norm": 1.2342458963394165, - "learning_rate": 3.558182707455773e-05, - "loss": 3.9165, - "step": 6830 - }, - { - "epoch": 1.169330712026669, - "grad_norm": 1.249831199645996, - "learning_rate": 3.554383764551729e-05, - "loss": 3.8498, - "step": 6840 - }, - { - "epoch": 1.1710402598512692, - "grad_norm": 1.2363636493682861, - "learning_rate": 3.550582190693514e-05, - "loss": 3.9315, - "step": 6850 - }, - { - "epoch": 1.1727498076758698, - "grad_norm": 1.2619434595108032, - "learning_rate": 3.546777998315205e-05, - "loss": 3.8729, - "step": 6860 - }, - { - "epoch": 1.17445935550047, - "grad_norm": 1.2603269815444946, - "learning_rate": 3.542971199859437e-05, - "loss": 3.9219, - "step": 6870 - }, - { - "epoch": 1.1761689033250704, - "grad_norm": 1.2825045585632324, - "learning_rate": 3.539161807777373e-05, - "loss": 3.8721, - "step": 6880 - }, - { - "epoch": 1.177878451149671, - "grad_norm": 1.2009698152542114, - "learning_rate": 3.5353498345286565e-05, - "loss": 3.9017, - "step": 6890 - }, - { - "epoch": 1.1795879989742712, - "grad_norm": 1.2658491134643555, - "learning_rate": 3.531535292581377e-05, - "loss": 3.914, - "step": 6900 - }, - { - "epoch": 1.1812975467988718, - "grad_norm": 1.244885802268982, - "learning_rate": 3.5277181944120206e-05, - "loss": 3.8641, - "step": 6910 - }, - { - "epoch": 1.183007094623472, - "grad_norm": 1.2868492603302002, - "learning_rate": 3.5238985525054384e-05, - "loss": 3.9079, - "step": 6920 - }, - { - "epoch": 1.1847166424480724, - "grad_norm": 1.2604506015777588, - "learning_rate": 3.5200763793547996e-05, - "loss": 3.882, - "step": 6930 - }, - { - "epoch": 1.186426190272673, - "grad_norm": 1.2101595401763916, - "learning_rate": 3.516251687461555e-05, - "loss": 3.8874, - "step": 6940 - }, - { - "epoch": 1.1881357380972732, - "grad_norm": 1.2412956953048706, - "learning_rate": 3.512424489335388e-05, - "loss": 3.895, - "step": 6950 - }, - { - "epoch": 1.1898452859218738, - "grad_norm": 1.2661387920379639, - "learning_rate": 3.508594797494184e-05, - "loss": 3.8637, - "step": 6960 - }, - { - "epoch": 1.191554833746474, - "grad_norm": 1.2516443729400635, - "learning_rate": 3.504762624463985e-05, - "loss": 3.9112, - "step": 6970 - }, - { - "epoch": 1.1932643815710744, - "grad_norm": 1.1882774829864502, - "learning_rate": 3.500927982778946e-05, - "loss": 3.9356, - "step": 6980 - }, - { - "epoch": 1.194973929395675, - "grad_norm": 1.2080714702606201, - "learning_rate": 3.497090884981298e-05, - "loss": 3.9211, - "step": 6990 - }, - { - "epoch": 1.1966834772202752, - "grad_norm": 1.1993480920791626, - "learning_rate": 3.493251343621304e-05, - "loss": 3.9367, - "step": 7000 - }, - { - "epoch": 1.1966834772202752, - "eval_loss": 4.2265472412109375, - "eval_runtime": 15.0294, - "eval_samples_per_second": 64.806, - "eval_steps_per_second": 0.865, - "step": 7000 - }, - { - "epoch": 1.1983930250448755, - "grad_norm": 1.2580934762954712, - "learning_rate": 3.48940937125722e-05, - "loss": 3.9175, - "step": 7010 - }, - { - "epoch": 1.200102572869476, - "grad_norm": 1.2459419965744019, - "learning_rate": 3.485564980455255e-05, - "loss": 3.844, - "step": 7020 - }, - { - "epoch": 1.2018121206940764, - "grad_norm": 1.1815834045410156, - "learning_rate": 3.481718183789525e-05, - "loss": 3.9026, - "step": 7030 - }, - { - "epoch": 1.2035216685186767, - "grad_norm": 1.2724698781967163, - "learning_rate": 3.477868993842017e-05, - "loss": 3.8767, - "step": 7040 - }, - { - "epoch": 1.2052312163432772, - "grad_norm": 1.2331515550613403, - "learning_rate": 3.474017423202545e-05, - "loss": 3.9254, - "step": 7050 - }, - { - "epoch": 1.2069407641678775, - "grad_norm": 1.2293578386306763, - "learning_rate": 3.470163484468712e-05, - "loss": 3.897, - "step": 7060 - }, - { - "epoch": 1.208650311992478, - "grad_norm": 1.2193176746368408, - "learning_rate": 3.466307190245862e-05, - "loss": 3.9067, - "step": 7070 - }, - { - "epoch": 1.2103598598170784, - "grad_norm": 1.2721304893493652, - "learning_rate": 3.462448553147048e-05, - "loss": 3.8766, - "step": 7080 - }, - { - "epoch": 1.2120694076416787, - "grad_norm": 1.1889150142669678, - "learning_rate": 3.4585875857929825e-05, - "loss": 3.9371, - "step": 7090 - }, - { - "epoch": 1.2137789554662792, - "grad_norm": 1.2700316905975342, - "learning_rate": 3.454724300812003e-05, - "loss": 3.8933, - "step": 7100 - }, - { - "epoch": 1.2154885032908795, - "grad_norm": 1.253462314605713, - "learning_rate": 3.450858710840023e-05, - "loss": 3.8761, - "step": 7110 - }, - { - "epoch": 1.21719805111548, - "grad_norm": 1.233017921447754, - "learning_rate": 3.446990828520499e-05, - "loss": 3.8948, - "step": 7120 - }, - { - "epoch": 1.2189075989400804, - "grad_norm": 1.2558211088180542, - "learning_rate": 3.443120666504384e-05, - "loss": 3.9235, - "step": 7130 - }, - { - "epoch": 1.2206171467646807, - "grad_norm": 1.2310528755187988, - "learning_rate": 3.439248237450086e-05, - "loss": 3.909, - "step": 7140 - }, - { - "epoch": 1.2223266945892812, - "grad_norm": 1.206688642501831, - "learning_rate": 3.435373554023432e-05, - "loss": 3.9416, - "step": 7150 - }, - { - "epoch": 1.2240362424138815, - "grad_norm": 1.2658628225326538, - "learning_rate": 3.431496628897617e-05, - "loss": 3.9142, - "step": 7160 - }, - { - "epoch": 1.2257457902384818, - "grad_norm": 1.2464946508407593, - "learning_rate": 3.427617474753173e-05, - "loss": 3.8713, - "step": 7170 - }, - { - "epoch": 1.2274553380630824, - "grad_norm": 1.2582857608795166, - "learning_rate": 3.423736104277919e-05, - "loss": 3.9234, - "step": 7180 - }, - { - "epoch": 1.2291648858876827, - "grad_norm": 1.1837236881256104, - "learning_rate": 3.4198525301669235e-05, - "loss": 3.9338, - "step": 7190 - }, - { - "epoch": 1.230874433712283, - "grad_norm": 1.2386939525604248, - "learning_rate": 3.415966765122467e-05, - "loss": 3.8593, - "step": 7200 - }, - { - "epoch": 1.2325839815368835, - "grad_norm": 1.1985162496566772, - "learning_rate": 3.4120788218539895e-05, - "loss": 3.8577, - "step": 7210 - }, - { - "epoch": 1.2342935293614838, - "grad_norm": 1.240938425064087, - "learning_rate": 3.4081887130780594e-05, - "loss": 3.8484, - "step": 7220 - }, - { - "epoch": 1.2360030771860844, - "grad_norm": 1.203911304473877, - "learning_rate": 3.404296451518328e-05, - "loss": 3.9054, - "step": 7230 - }, - { - "epoch": 1.2377126250106847, - "grad_norm": 1.2592321634292603, - "learning_rate": 3.4004020499054875e-05, - "loss": 3.8955, - "step": 7240 - }, - { - "epoch": 1.239422172835285, - "grad_norm": 1.2568918466567993, - "learning_rate": 3.396505520977227e-05, - "loss": 3.9412, - "step": 7250 - }, - { - "epoch": 1.2411317206598855, - "grad_norm": 1.2267158031463623, - "learning_rate": 3.3926068774781985e-05, - "loss": 3.9222, - "step": 7260 - }, - { - "epoch": 1.2428412684844858, - "grad_norm": 1.2438641786575317, - "learning_rate": 3.388706132159967e-05, - "loss": 3.9293, - "step": 7270 - }, - { - "epoch": 1.2445508163090861, - "grad_norm": 1.2480589151382446, - "learning_rate": 3.3848032977809704e-05, - "loss": 3.8873, - "step": 7280 - }, - { - "epoch": 1.2462603641336867, - "grad_norm": 1.256628394126892, - "learning_rate": 3.3808983871064845e-05, - "loss": 3.9241, - "step": 7290 - }, - { - "epoch": 1.247969911958287, - "grad_norm": 1.2636611461639404, - "learning_rate": 3.376991412908573e-05, - "loss": 3.917, - "step": 7300 - }, - { - "epoch": 1.2496794597828873, - "grad_norm": 1.1977349519729614, - "learning_rate": 3.373082387966048e-05, - "loss": 3.8716, - "step": 7310 - }, - { - "epoch": 1.2513890076074878, - "grad_norm": 1.2927277088165283, - "learning_rate": 3.369171325064431e-05, - "loss": 3.857, - "step": 7320 - }, - { - "epoch": 1.2530985554320881, - "grad_norm": 1.2924025058746338, - "learning_rate": 3.3652582369959096e-05, - "loss": 3.8391, - "step": 7330 - }, - { - "epoch": 1.2548081032566887, - "grad_norm": 1.2852236032485962, - "learning_rate": 3.3613431365592936e-05, - "loss": 3.8368, - "step": 7340 - }, - { - "epoch": 1.256517651081289, - "grad_norm": 1.2513238191604614, - "learning_rate": 3.3574260365599754e-05, - "loss": 3.8681, - "step": 7350 - }, - { - "epoch": 1.2582271989058893, - "grad_norm": 1.2437723875045776, - "learning_rate": 3.353506949809886e-05, - "loss": 3.927, - "step": 7360 - }, - { - "epoch": 1.2599367467304898, - "grad_norm": 1.2713721990585327, - "learning_rate": 3.34958588912746e-05, - "loss": 3.9136, - "step": 7370 - }, - { - "epoch": 1.2616462945550901, - "grad_norm": 1.273529291152954, - "learning_rate": 3.345662867337581e-05, - "loss": 3.9218, - "step": 7380 - }, - { - "epoch": 1.2633558423796907, - "grad_norm": 1.2572320699691772, - "learning_rate": 3.341737897271552e-05, - "loss": 3.8671, - "step": 7390 - }, - { - "epoch": 1.265065390204291, - "grad_norm": 1.2237032651901245, - "learning_rate": 3.337810991767047e-05, - "loss": 3.8708, - "step": 7400 - }, - { - "epoch": 1.2667749380288913, - "grad_norm": 1.2337538003921509, - "learning_rate": 3.3338821636680694e-05, - "loss": 3.9117, - "step": 7410 - }, - { - "epoch": 1.2684844858534918, - "grad_norm": 1.2676743268966675, - "learning_rate": 3.329951425824912e-05, - "loss": 3.9141, - "step": 7420 - }, - { - "epoch": 1.2701940336780921, - "grad_norm": 1.2536479234695435, - "learning_rate": 3.326018791094113e-05, - "loss": 3.9077, - "step": 7430 - }, - { - "epoch": 1.2719035815026927, - "grad_norm": 1.3165647983551025, - "learning_rate": 3.3220842723384176e-05, - "loss": 3.9068, - "step": 7440 - }, - { - "epoch": 1.273613129327293, - "grad_norm": 1.2718582153320312, - "learning_rate": 3.3181478824267284e-05, - "loss": 3.8407, - "step": 7450 - }, - { - "epoch": 1.2753226771518933, - "grad_norm": 1.2179574966430664, - "learning_rate": 3.314209634234073e-05, - "loss": 3.9224, - "step": 7460 - }, - { - "epoch": 1.2770322249764936, - "grad_norm": 1.2652764320373535, - "learning_rate": 3.3102695406415526e-05, - "loss": 3.9085, - "step": 7470 - }, - { - "epoch": 1.2787417728010941, - "grad_norm": 1.2868931293487549, - "learning_rate": 3.30632761453631e-05, - "loss": 3.9061, - "step": 7480 - }, - { - "epoch": 1.2804513206256944, - "grad_norm": 1.2277872562408447, - "learning_rate": 3.3023838688114744e-05, - "loss": 3.9338, - "step": 7490 - }, - { - "epoch": 1.282160868450295, - "grad_norm": 1.2414730787277222, - "learning_rate": 3.298438316366133e-05, - "loss": 3.9297, - "step": 7500 - }, - { - "epoch": 1.2838704162748953, - "grad_norm": 1.29763662815094, - "learning_rate": 3.294490970105279e-05, - "loss": 3.8913, - "step": 7510 - }, - { - "epoch": 1.2855799640994956, - "grad_norm": 1.263052225112915, - "learning_rate": 3.2905418429397736e-05, - "loss": 3.8636, - "step": 7520 - }, - { - "epoch": 1.2872895119240961, - "grad_norm": 1.2544949054718018, - "learning_rate": 3.2865909477863034e-05, - "loss": 3.8825, - "step": 7530 - }, - { - "epoch": 1.2889990597486964, - "grad_norm": 1.2706531286239624, - "learning_rate": 3.2826382975673364e-05, - "loss": 3.9284, - "step": 7540 - }, - { - "epoch": 1.290708607573297, - "grad_norm": 1.2620859146118164, - "learning_rate": 3.278683905211082e-05, - "loss": 3.9086, - "step": 7550 - }, - { - "epoch": 1.2924181553978973, - "grad_norm": 1.2306818962097168, - "learning_rate": 3.274727783651448e-05, - "loss": 3.8986, - "step": 7560 - }, - { - "epoch": 1.2941277032224976, - "grad_norm": 1.231574296951294, - "learning_rate": 3.270769945827996e-05, - "loss": 3.9494, - "step": 7570 - }, - { - "epoch": 1.295837251047098, - "grad_norm": 1.2590519189834595, - "learning_rate": 3.2668104046859046e-05, - "loss": 3.8872, - "step": 7580 - }, - { - "epoch": 1.2975467988716984, - "grad_norm": 1.288218379020691, - "learning_rate": 3.26284917317592e-05, - "loss": 3.9112, - "step": 7590 - }, - { - "epoch": 1.2992563466962987, - "grad_norm": 1.2678600549697876, - "learning_rate": 3.258886264254321e-05, - "loss": 3.9286, - "step": 7600 - }, - { - "epoch": 1.3009658945208993, - "grad_norm": 1.2126678228378296, - "learning_rate": 3.254921690882867e-05, - "loss": 3.9309, - "step": 7610 - }, - { - "epoch": 1.3026754423454996, - "grad_norm": 1.3168165683746338, - "learning_rate": 3.2509554660287694e-05, - "loss": 3.912, - "step": 7620 - }, - { - "epoch": 1.3043849901701, - "grad_norm": 1.2809703350067139, - "learning_rate": 3.246987602664634e-05, - "loss": 3.9634, - "step": 7630 - }, - { - "epoch": 1.3060945379947004, - "grad_norm": 1.2900375127792358, - "learning_rate": 3.24301811376843e-05, - "loss": 3.8878, - "step": 7640 - }, - { - "epoch": 1.3078040858193007, - "grad_norm": 1.2212899923324585, - "learning_rate": 3.239047012323441e-05, - "loss": 3.9122, - "step": 7650 - }, - { - "epoch": 1.3095136336439013, - "grad_norm": 1.2610219717025757, - "learning_rate": 3.235074311318226e-05, - "loss": 3.9077, - "step": 7660 - }, - { - "epoch": 1.3112231814685016, - "grad_norm": 1.2156187295913696, - "learning_rate": 3.2311000237465764e-05, - "loss": 3.9057, - "step": 7670 - }, - { - "epoch": 1.312932729293102, - "grad_norm": 1.2554469108581543, - "learning_rate": 3.227124162607472e-05, - "loss": 3.8691, - "step": 7680 - }, - { - "epoch": 1.3146422771177024, - "grad_norm": 1.215631365776062, - "learning_rate": 3.2231467409050385e-05, - "loss": 3.91, - "step": 7690 - }, - { - "epoch": 1.3163518249423027, - "grad_norm": 1.303505301475525, - "learning_rate": 3.2191677716485095e-05, - "loss": 3.8993, - "step": 7700 - }, - { - "epoch": 1.3180613727669033, - "grad_norm": 1.3015791177749634, - "learning_rate": 3.215187267852175e-05, - "loss": 3.8578, - "step": 7710 - }, - { - "epoch": 1.3197709205915036, - "grad_norm": 1.2639282941818237, - "learning_rate": 3.211205242535348e-05, - "loss": 3.9168, - "step": 7720 - }, - { - "epoch": 1.321480468416104, - "grad_norm": 1.2508628368377686, - "learning_rate": 3.207221708722317e-05, - "loss": 3.9376, - "step": 7730 - }, - { - "epoch": 1.3231900162407042, - "grad_norm": 1.2599705457687378, - "learning_rate": 3.2032366794423035e-05, - "loss": 3.8869, - "step": 7740 - }, - { - "epoch": 1.3248995640653047, - "grad_norm": 1.2388734817504883, - "learning_rate": 3.199250167729422e-05, - "loss": 3.8983, - "step": 7750 - }, - { - "epoch": 1.326609111889905, - "grad_norm": 1.2401822805404663, - "learning_rate": 3.195262186622635e-05, - "loss": 3.9057, - "step": 7760 - }, - { - "epoch": 1.3283186597145056, - "grad_norm": 1.2875392436981201, - "learning_rate": 3.1912727491657094e-05, - "loss": 3.9184, - "step": 7770 - }, - { - "epoch": 1.3300282075391059, - "grad_norm": 1.242212176322937, - "learning_rate": 3.1872818684071784e-05, - "loss": 3.8775, - "step": 7780 - }, - { - "epoch": 1.3317377553637062, - "grad_norm": 1.208766222000122, - "learning_rate": 3.183289557400294e-05, - "loss": 3.8907, - "step": 7790 - }, - { - "epoch": 1.3334473031883067, - "grad_norm": 1.3063687086105347, - "learning_rate": 3.1792958292029866e-05, - "loss": 3.8771, - "step": 7800 - }, - { - "epoch": 1.335156851012907, - "grad_norm": 1.2906602621078491, - "learning_rate": 3.175300696877823e-05, - "loss": 3.9579, - "step": 7810 - }, - { - "epoch": 1.3368663988375076, - "grad_norm": 1.1862751245498657, - "learning_rate": 3.17130417349196e-05, - "loss": 3.9033, - "step": 7820 - }, - { - "epoch": 1.3385759466621079, - "grad_norm": 1.2207679748535156, - "learning_rate": 3.1673062721171064e-05, - "loss": 3.9076, - "step": 7830 - }, - { - "epoch": 1.3402854944867082, - "grad_norm": 1.2842761278152466, - "learning_rate": 3.163307005829477e-05, - "loss": 3.8936, - "step": 7840 - }, - { - "epoch": 1.3419950423113087, - "grad_norm": 1.1974037885665894, - "learning_rate": 3.159306387709754e-05, - "loss": 3.8949, - "step": 7850 - }, - { - "epoch": 1.343704590135909, - "grad_norm": 1.223143458366394, - "learning_rate": 3.155304430843035e-05, - "loss": 3.8845, - "step": 7860 - }, - { - "epoch": 1.3454141379605096, - "grad_norm": 1.2655061483383179, - "learning_rate": 3.151301148318802e-05, - "loss": 3.8893, - "step": 7870 - }, - { - "epoch": 1.3471236857851099, - "grad_norm": 1.280807375907898, - "learning_rate": 3.14729655323087e-05, - "loss": 3.9172, - "step": 7880 - }, - { - "epoch": 1.3488332336097102, - "grad_norm": 1.272719383239746, - "learning_rate": 3.143290658677349e-05, - "loss": 3.9677, - "step": 7890 - }, - { - "epoch": 1.3505427814343105, - "grad_norm": 1.2324796915054321, - "learning_rate": 3.139283477760596e-05, - "loss": 3.8952, - "step": 7900 - }, - { - "epoch": 1.352252329258911, - "grad_norm": 1.2370543479919434, - "learning_rate": 3.13527502358718e-05, - "loss": 3.9136, - "step": 7910 - }, - { - "epoch": 1.3539618770835113, - "grad_norm": 1.2388861179351807, - "learning_rate": 3.13126530926783e-05, - "loss": 3.9036, - "step": 7920 - }, - { - "epoch": 1.3556714249081119, - "grad_norm": 1.229732871055603, - "learning_rate": 3.1272543479174006e-05, - "loss": 3.9366, - "step": 7930 - }, - { - "epoch": 1.3573809727327122, - "grad_norm": 1.2483370304107666, - "learning_rate": 3.123242152654822e-05, - "loss": 3.8964, - "step": 7940 - }, - { - "epoch": 1.3590905205573125, - "grad_norm": 1.275407314300537, - "learning_rate": 3.119228736603062e-05, - "loss": 3.8757, - "step": 7950 - }, - { - "epoch": 1.360800068381913, - "grad_norm": 1.2666151523590088, - "learning_rate": 3.1152141128890796e-05, - "loss": 3.8904, - "step": 7960 - }, - { - "epoch": 1.3625096162065133, - "grad_norm": 1.235478401184082, - "learning_rate": 3.1111982946437875e-05, - "loss": 3.9065, - "step": 7970 - }, - { - "epoch": 1.3642191640311139, - "grad_norm": 1.195444941520691, - "learning_rate": 3.107181295002001e-05, - "loss": 3.9304, - "step": 7980 - }, - { - "epoch": 1.3659287118557142, - "grad_norm": 1.2654317617416382, - "learning_rate": 3.103163127102402e-05, - "loss": 3.8815, - "step": 7990 - }, - { - "epoch": 1.3676382596803145, - "grad_norm": 1.326063871383667, - "learning_rate": 3.099143804087493e-05, - "loss": 3.9163, - "step": 8000 - }, - { - "epoch": 1.3676382596803145, - "eval_loss": 4.215038776397705, - "eval_runtime": 15.4991, - "eval_samples_per_second": 62.842, - "eval_steps_per_second": 0.839, - "step": 8000 - }, - { - "epoch": 1.369347807504915, - "grad_norm": 1.3022469282150269, - "learning_rate": 3.095123339103554e-05, - "loss": 3.9068, - "step": 8010 - }, - { - "epoch": 1.3710573553295153, - "grad_norm": 1.2434028387069702, - "learning_rate": 3.0911017453006016e-05, - "loss": 3.8834, - "step": 8020 - }, - { - "epoch": 1.3727669031541159, - "grad_norm": 1.2971705198287964, - "learning_rate": 3.087079035832344e-05, - "loss": 3.8905, - "step": 8030 - }, - { - "epoch": 1.3744764509787162, - "grad_norm": 1.2566988468170166, - "learning_rate": 3.0830552238561366e-05, - "loss": 3.9276, - "step": 8040 - }, - { - "epoch": 1.3761859988033165, - "grad_norm": 1.2567497491836548, - "learning_rate": 3.079030322532942e-05, - "loss": 3.8935, - "step": 8050 - }, - { - "epoch": 1.3778955466279168, - "grad_norm": 1.2925610542297363, - "learning_rate": 3.075004345027289e-05, - "loss": 3.9073, - "step": 8060 - }, - { - "epoch": 1.3796050944525173, - "grad_norm": 1.308101773262024, - "learning_rate": 3.07097730450722e-05, - "loss": 3.8717, - "step": 8070 - }, - { - "epoch": 1.3813146422771176, - "grad_norm": 1.2724037170410156, - "learning_rate": 3.06694921414426e-05, - "loss": 3.9138, - "step": 8080 - }, - { - "epoch": 1.3830241901017182, - "grad_norm": 1.2126636505126953, - "learning_rate": 3.0629200871133645e-05, - "loss": 3.929, - "step": 8090 - }, - { - "epoch": 1.3847337379263185, - "grad_norm": 1.2173937559127808, - "learning_rate": 3.0588899365928816e-05, - "loss": 3.884, - "step": 8100 - }, - { - "epoch": 1.3864432857509188, - "grad_norm": 1.3094903230667114, - "learning_rate": 3.0548587757645044e-05, - "loss": 3.8804, - "step": 8110 - }, - { - "epoch": 1.3881528335755193, - "grad_norm": 1.2229657173156738, - "learning_rate": 3.0508266178132338e-05, - "loss": 3.8911, - "step": 8120 - }, - { - "epoch": 1.3898623814001196, - "grad_norm": 1.2964317798614502, - "learning_rate": 3.0467934759273297e-05, - "loss": 3.9178, - "step": 8130 - }, - { - "epoch": 1.3915719292247202, - "grad_norm": 1.2695974111557007, - "learning_rate": 3.042759363298272e-05, - "loss": 3.8651, - "step": 8140 - }, - { - "epoch": 1.3932814770493205, - "grad_norm": 1.2603505849838257, - "learning_rate": 3.038724293120714e-05, - "loss": 3.869, - "step": 8150 - }, - { - "epoch": 1.3949910248739208, - "grad_norm": 1.2657015323638916, - "learning_rate": 3.034688278592442e-05, - "loss": 3.8617, - "step": 8160 - }, - { - "epoch": 1.396700572698521, - "grad_norm": 1.2484221458435059, - "learning_rate": 3.0306513329143292e-05, - "loss": 3.8778, - "step": 8170 - }, - { - "epoch": 1.3984101205231216, - "grad_norm": 1.3020051717758179, - "learning_rate": 3.026613469290298e-05, - "loss": 3.8287, - "step": 8180 - }, - { - "epoch": 1.400119668347722, - "grad_norm": 1.2208669185638428, - "learning_rate": 3.0225747009272693e-05, - "loss": 3.8563, - "step": 8190 - }, - { - "epoch": 1.4018292161723225, - "grad_norm": 1.2694666385650635, - "learning_rate": 3.0185350410351258e-05, - "loss": 3.8498, - "step": 8200 - }, - { - "epoch": 1.4035387639969228, - "grad_norm": 1.270048975944519, - "learning_rate": 3.014494502826665e-05, - "loss": 3.903, - "step": 8210 - }, - { - "epoch": 1.405248311821523, - "grad_norm": 1.2498942613601685, - "learning_rate": 3.010453099517558e-05, - "loss": 3.9048, - "step": 8220 - }, - { - "epoch": 1.4069578596461236, - "grad_norm": 1.2931632995605469, - "learning_rate": 3.0064108443263035e-05, - "loss": 3.8802, - "step": 8230 - }, - { - "epoch": 1.408667407470724, - "grad_norm": 1.2145774364471436, - "learning_rate": 3.0023677504741894e-05, - "loss": 3.9361, - "step": 8240 - }, - { - "epoch": 1.4103769552953245, - "grad_norm": 1.262025237083435, - "learning_rate": 2.9983238311852447e-05, - "loss": 3.9099, - "step": 8250 - }, - { - "epoch": 1.4120865031199248, - "grad_norm": 1.2575366497039795, - "learning_rate": 2.9942790996861986e-05, - "loss": 3.8723, - "step": 8260 - }, - { - "epoch": 1.413796050944525, - "grad_norm": 1.326175570487976, - "learning_rate": 2.9902335692064365e-05, - "loss": 3.8927, - "step": 8270 - }, - { - "epoch": 1.4155055987691256, - "grad_norm": 1.2309988737106323, - "learning_rate": 2.9861872529779582e-05, - "loss": 3.8933, - "step": 8280 - }, - { - "epoch": 1.417215146593726, - "grad_norm": 1.2834300994873047, - "learning_rate": 2.9821401642353336e-05, - "loss": 3.8416, - "step": 8290 - }, - { - "epoch": 1.4189246944183265, - "grad_norm": 1.2373608350753784, - "learning_rate": 2.978092316215657e-05, - "loss": 3.9053, - "step": 8300 - }, - { - "epoch": 1.4206342422429268, - "grad_norm": 1.2766106128692627, - "learning_rate": 2.9740437221585088e-05, - "loss": 3.9102, - "step": 8310 - }, - { - "epoch": 1.422343790067527, - "grad_norm": 1.2524621486663818, - "learning_rate": 2.9699943953059074e-05, - "loss": 3.9036, - "step": 8320 - }, - { - "epoch": 1.4240533378921274, - "grad_norm": 1.235935926437378, - "learning_rate": 2.96594434890227e-05, - "loss": 3.8423, - "step": 8330 - }, - { - "epoch": 1.425762885716728, - "grad_norm": 1.2629297971725464, - "learning_rate": 2.961893596194365e-05, - "loss": 3.9217, - "step": 8340 - }, - { - "epoch": 1.4274724335413282, - "grad_norm": 1.2264230251312256, - "learning_rate": 2.9578421504312743e-05, - "loss": 3.9892, - "step": 8350 - }, - { - "epoch": 1.4291819813659288, - "grad_norm": 1.223467469215393, - "learning_rate": 2.953790024864342e-05, - "loss": 3.9633, - "step": 8360 - }, - { - "epoch": 1.430891529190529, - "grad_norm": 1.226707100868225, - "learning_rate": 2.949737232747141e-05, - "loss": 3.9216, - "step": 8370 - }, - { - "epoch": 1.4326010770151294, - "grad_norm": 1.2447867393493652, - "learning_rate": 2.9456837873354188e-05, - "loss": 3.9147, - "step": 8380 - }, - { - "epoch": 1.43431062483973, - "grad_norm": 1.2209540605545044, - "learning_rate": 2.9416297018870654e-05, - "loss": 3.8992, - "step": 8390 - }, - { - "epoch": 1.4360201726643302, - "grad_norm": 1.227596402168274, - "learning_rate": 2.9375749896620585e-05, - "loss": 3.9009, - "step": 8400 - }, - { - "epoch": 1.4377297204889308, - "grad_norm": 1.3147202730178833, - "learning_rate": 2.9335196639224305e-05, - "loss": 3.8807, - "step": 8410 - }, - { - "epoch": 1.439439268313531, - "grad_norm": 1.2258697748184204, - "learning_rate": 2.9294637379322177e-05, - "loss": 3.9245, - "step": 8420 - }, - { - "epoch": 1.4411488161381314, - "grad_norm": 1.2285287380218506, - "learning_rate": 2.925407224957421e-05, - "loss": 3.8962, - "step": 8430 - }, - { - "epoch": 1.442858363962732, - "grad_norm": 1.342581868171692, - "learning_rate": 2.9213501382659602e-05, - "loss": 3.9292, - "step": 8440 - }, - { - "epoch": 1.4445679117873322, - "grad_norm": 1.2347153425216675, - "learning_rate": 2.9172924911276323e-05, - "loss": 3.9033, - "step": 8450 - }, - { - "epoch": 1.4462774596119328, - "grad_norm": 1.308929204940796, - "learning_rate": 2.9132342968140682e-05, - "loss": 3.8606, - "step": 8460 - }, - { - "epoch": 1.447987007436533, - "grad_norm": 1.2376822233200073, - "learning_rate": 2.9091755685986866e-05, - "loss": 3.869, - "step": 8470 - }, - { - "epoch": 1.4496965552611334, - "grad_norm": 1.2993402481079102, - "learning_rate": 2.9051163197566528e-05, - "loss": 3.8553, - "step": 8480 - }, - { - "epoch": 1.4514061030857337, - "grad_norm": 1.2593785524368286, - "learning_rate": 2.9010565635648375e-05, - "loss": 3.8998, - "step": 8490 - }, - { - "epoch": 1.4531156509103342, - "grad_norm": 1.3027465343475342, - "learning_rate": 2.8969963133017662e-05, - "loss": 3.8821, - "step": 8500 - }, - { - "epoch": 1.4548251987349345, - "grad_norm": 1.2336262464523315, - "learning_rate": 2.892935582247586e-05, - "loss": 3.8659, - "step": 8510 - }, - { - "epoch": 1.456534746559535, - "grad_norm": 1.246215581893921, - "learning_rate": 2.8888743836840116e-05, - "loss": 3.9437, - "step": 8520 - }, - { - "epoch": 1.4582442943841354, - "grad_norm": 1.2747962474822998, - "learning_rate": 2.884812730894289e-05, - "loss": 3.8875, - "step": 8530 - }, - { - "epoch": 1.4599538422087357, - "grad_norm": 1.227078914642334, - "learning_rate": 2.880750637163151e-05, - "loss": 3.9209, - "step": 8540 - }, - { - "epoch": 1.4616633900333362, - "grad_norm": 1.2943015098571777, - "learning_rate": 2.8766881157767695e-05, - "loss": 3.9089, - "step": 8550 - }, - { - "epoch": 1.4633729378579365, - "grad_norm": 1.3103044033050537, - "learning_rate": 2.8726251800227176e-05, - "loss": 3.8835, - "step": 8560 - }, - { - "epoch": 1.465082485682537, - "grad_norm": 1.3410990238189697, - "learning_rate": 2.868561843189924e-05, - "loss": 3.8822, - "step": 8570 - }, - { - "epoch": 1.4667920335071374, - "grad_norm": 1.2901086807250977, - "learning_rate": 2.864498118568628e-05, - "loss": 3.8641, - "step": 8580 - }, - { - "epoch": 1.4685015813317377, - "grad_norm": 1.289597988128662, - "learning_rate": 2.8604340194503372e-05, - "loss": 3.8905, - "step": 8590 - }, - { - "epoch": 1.4702111291563382, - "grad_norm": 1.2292598485946655, - "learning_rate": 2.856369559127785e-05, - "loss": 3.8897, - "step": 8600 - }, - { - "epoch": 1.4719206769809385, - "grad_norm": 1.2346113920211792, - "learning_rate": 2.8523047508948847e-05, - "loss": 3.9077, - "step": 8610 - }, - { - "epoch": 1.473630224805539, - "grad_norm": 1.2346540689468384, - "learning_rate": 2.84823960804669e-05, - "loss": 3.8784, - "step": 8620 - }, - { - "epoch": 1.4753397726301394, - "grad_norm": 1.2645827531814575, - "learning_rate": 2.8441741438793456e-05, - "loss": 3.8738, - "step": 8630 - }, - { - "epoch": 1.4770493204547397, - "grad_norm": 1.268717885017395, - "learning_rate": 2.8401083716900513e-05, - "loss": 3.8957, - "step": 8640 - }, - { - "epoch": 1.47875886827934, - "grad_norm": 1.3104504346847534, - "learning_rate": 2.83604230477701e-05, - "loss": 3.8341, - "step": 8650 - }, - { - "epoch": 1.4804684161039405, - "grad_norm": 1.218449592590332, - "learning_rate": 2.8319759564393934e-05, - "loss": 3.9161, - "step": 8660 - }, - { - "epoch": 1.4821779639285408, - "grad_norm": 1.272817611694336, - "learning_rate": 2.8279093399772882e-05, - "loss": 3.88, - "step": 8670 - }, - { - "epoch": 1.4838875117531414, - "grad_norm": 1.2418755292892456, - "learning_rate": 2.823842468691663e-05, - "loss": 3.9404, - "step": 8680 - }, - { - "epoch": 1.4855970595777417, - "grad_norm": 1.25066077709198, - "learning_rate": 2.819775355884317e-05, - "loss": 3.903, - "step": 8690 - }, - { - "epoch": 1.487306607402342, - "grad_norm": 1.2251503467559814, - "learning_rate": 2.815708014857841e-05, - "loss": 3.8809, - "step": 8700 - }, - { - "epoch": 1.4890161552269425, - "grad_norm": 1.2772635221481323, - "learning_rate": 2.81164045891557e-05, - "loss": 3.9073, - "step": 8710 - }, - { - "epoch": 1.4907257030515428, - "grad_norm": 1.257362723350525, - "learning_rate": 2.8075727013615445e-05, - "loss": 3.8749, - "step": 8720 - }, - { - "epoch": 1.4924352508761434, - "grad_norm": 1.2010557651519775, - "learning_rate": 2.8035047555004627e-05, - "loss": 3.9597, - "step": 8730 - }, - { - "epoch": 1.4941447987007437, - "grad_norm": 1.2486116886138916, - "learning_rate": 2.7994366346376398e-05, - "loss": 3.9023, - "step": 8740 - }, - { - "epoch": 1.495854346525344, - "grad_norm": 1.2991310358047485, - "learning_rate": 2.7953683520789625e-05, - "loss": 3.9064, - "step": 8750 - }, - { - "epoch": 1.4975638943499443, - "grad_norm": 1.2580043077468872, - "learning_rate": 2.791299921130847e-05, - "loss": 3.9031, - "step": 8760 - }, - { - "epoch": 1.4992734421745448, - "grad_norm": 1.2715848684310913, - "learning_rate": 2.7872313551001945e-05, - "loss": 3.9135, - "step": 8770 - }, - { - "epoch": 1.5009829899991454, - "grad_norm": 1.2051122188568115, - "learning_rate": 2.7831626672943473e-05, - "loss": 3.8901, - "step": 8780 - }, - { - "epoch": 1.5026925378237457, - "grad_norm": 1.2270910739898682, - "learning_rate": 2.7790938710210478e-05, - "loss": 3.9485, - "step": 8790 - }, - { - "epoch": 1.504402085648346, - "grad_norm": 1.2596473693847656, - "learning_rate": 2.7750249795883932e-05, - "loss": 3.8285, - "step": 8800 - }, - { - "epoch": 1.5061116334729463, - "grad_norm": 1.2697139978408813, - "learning_rate": 2.7709560063047885e-05, - "loss": 3.9058, - "step": 8810 - }, - { - "epoch": 1.5078211812975468, - "grad_norm": 1.2353649139404297, - "learning_rate": 2.7668869644789103e-05, - "loss": 3.9171, - "step": 8820 - }, - { - "epoch": 1.5095307291221471, - "grad_norm": 1.2431164979934692, - "learning_rate": 2.762817867419658e-05, - "loss": 3.9034, - "step": 8830 - }, - { - "epoch": 1.5112402769467477, - "grad_norm": 1.2307651042938232, - "learning_rate": 2.7587487284361108e-05, - "loss": 3.9135, - "step": 8840 - }, - { - "epoch": 1.512949824771348, - "grad_norm": 1.2992744445800781, - "learning_rate": 2.7546795608374877e-05, - "loss": 3.8752, - "step": 8850 - }, - { - "epoch": 1.5146593725959483, - "grad_norm": 1.2936491966247559, - "learning_rate": 2.7506103779330965e-05, - "loss": 3.8706, - "step": 8860 - }, - { - "epoch": 1.5163689204205486, - "grad_norm": 1.2603998184204102, - "learning_rate": 2.7465411930323008e-05, - "loss": 3.9489, - "step": 8870 - }, - { - "epoch": 1.5180784682451491, - "grad_norm": 1.304750919342041, - "learning_rate": 2.7424720194444653e-05, - "loss": 3.8845, - "step": 8880 - }, - { - "epoch": 1.5197880160697497, - "grad_norm": 1.2697685956954956, - "learning_rate": 2.7384028704789228e-05, - "loss": 3.872, - "step": 8890 - }, - { - "epoch": 1.52149756389435, - "grad_norm": 1.3253589868545532, - "learning_rate": 2.7343337594449214e-05, - "loss": 3.8902, - "step": 8900 - }, - { - "epoch": 1.5232071117189503, - "grad_norm": 1.3067569732666016, - "learning_rate": 2.730264699651588e-05, - "loss": 3.8514, - "step": 8910 - }, - { - "epoch": 1.5249166595435506, - "grad_norm": 1.226184368133545, - "learning_rate": 2.7261957044078788e-05, - "loss": 3.8748, - "step": 8920 - }, - { - "epoch": 1.5266262073681511, - "grad_norm": 1.2410087585449219, - "learning_rate": 2.7221267870225437e-05, - "loss": 3.8701, - "step": 8930 - }, - { - "epoch": 1.5283357551927517, - "grad_norm": 1.2869501113891602, - "learning_rate": 2.7180579608040725e-05, - "loss": 3.9553, - "step": 8940 - }, - { - "epoch": 1.530045303017352, - "grad_norm": 1.2438207864761353, - "learning_rate": 2.7139892390606603e-05, - "loss": 3.9031, - "step": 8950 - }, - { - "epoch": 1.5317548508419523, - "grad_norm": 1.3107331991195679, - "learning_rate": 2.7099206351001593e-05, - "loss": 3.9141, - "step": 8960 - }, - { - "epoch": 1.5334643986665526, - "grad_norm": 1.2537789344787598, - "learning_rate": 2.705852162230037e-05, - "loss": 3.9294, - "step": 8970 - }, - { - "epoch": 1.5351739464911531, - "grad_norm": 1.2856779098510742, - "learning_rate": 2.7017838337573314e-05, - "loss": 3.8686, - "step": 8980 - }, - { - "epoch": 1.5368834943157534, - "grad_norm": 1.2688082456588745, - "learning_rate": 2.697715662988609e-05, - "loss": 3.9259, - "step": 8990 - }, - { - "epoch": 1.538593042140354, - "grad_norm": 1.2736730575561523, - "learning_rate": 2.6936476632299195e-05, - "loss": 3.887, - "step": 9000 - }, - { - "epoch": 1.538593042140354, - "eval_loss": 4.19655704498291, - "eval_runtime": 15.1603, - "eval_samples_per_second": 64.247, - "eval_steps_per_second": 0.858, - "step": 9000 - }, - { - "epoch": 1.5403025899649543, - "grad_norm": 1.3021864891052246, - "learning_rate": 2.689579847786755e-05, - "loss": 3.908, - "step": 9010 - }, - { - "epoch": 1.5420121377895546, - "grad_norm": 1.2925386428833008, - "learning_rate": 2.685512229964003e-05, - "loss": 3.8826, - "step": 9020 - }, - { - "epoch": 1.543721685614155, - "grad_norm": 1.2435417175292969, - "learning_rate": 2.6814448230659055e-05, - "loss": 3.9017, - "step": 9030 - }, - { - "epoch": 1.5454312334387554, - "grad_norm": 1.3260468244552612, - "learning_rate": 2.677377640396014e-05, - "loss": 3.8952, - "step": 9040 - }, - { - "epoch": 1.547140781263356, - "grad_norm": 1.277796745300293, - "learning_rate": 2.673310695257147e-05, - "loss": 3.9061, - "step": 9050 - }, - { - "epoch": 1.5488503290879563, - "grad_norm": 1.2566475868225098, - "learning_rate": 2.6692440009513452e-05, - "loss": 3.8685, - "step": 9060 - }, - { - "epoch": 1.5505598769125566, - "grad_norm": 1.216264009475708, - "learning_rate": 2.665177570779832e-05, - "loss": 3.8776, - "step": 9070 - }, - { - "epoch": 1.552269424737157, - "grad_norm": 1.2410032749176025, - "learning_rate": 2.6611114180429624e-05, - "loss": 3.8665, - "step": 9080 - }, - { - "epoch": 1.5539789725617574, - "grad_norm": 1.2790521383285522, - "learning_rate": 2.657045556040188e-05, - "loss": 3.8846, - "step": 9090 - }, - { - "epoch": 1.555688520386358, - "grad_norm": 1.2979615926742554, - "learning_rate": 2.652979998070006e-05, - "loss": 3.8915, - "step": 9100 - }, - { - "epoch": 1.5573980682109583, - "grad_norm": 1.2675455808639526, - "learning_rate": 2.6489147574299223e-05, - "loss": 3.8868, - "step": 9110 - }, - { - "epoch": 1.5591076160355586, - "grad_norm": 1.266910195350647, - "learning_rate": 2.6448498474164023e-05, - "loss": 3.9171, - "step": 9120 - }, - { - "epoch": 1.560817163860159, - "grad_norm": 1.3084989786148071, - "learning_rate": 2.6407852813248317e-05, - "loss": 3.8646, - "step": 9130 - }, - { - "epoch": 1.5625267116847594, - "grad_norm": 1.2483104467391968, - "learning_rate": 2.636721072449471e-05, - "loss": 3.8949, - "step": 9140 - }, - { - "epoch": 1.5642362595093597, - "grad_norm": 1.2625364065170288, - "learning_rate": 2.6326572340834115e-05, - "loss": 3.9131, - "step": 9150 - }, - { - "epoch": 1.5659458073339603, - "grad_norm": 1.218084454536438, - "learning_rate": 2.6285937795185335e-05, - "loss": 3.8998, - "step": 9160 - }, - { - "epoch": 1.5676553551585606, - "grad_norm": 1.2928606271743774, - "learning_rate": 2.624530722045462e-05, - "loss": 3.8989, - "step": 9170 - }, - { - "epoch": 1.569364902983161, - "grad_norm": 1.2976471185684204, - "learning_rate": 2.6204680749535222e-05, - "loss": 3.9138, - "step": 9180 - }, - { - "epoch": 1.5710744508077612, - "grad_norm": 1.2483446598052979, - "learning_rate": 2.616405851530698e-05, - "loss": 3.9146, - "step": 9190 - }, - { - "epoch": 1.5727839986323617, - "grad_norm": 1.3242456912994385, - "learning_rate": 2.6123440650635877e-05, - "loss": 3.8992, - "step": 9200 - }, - { - "epoch": 1.5744935464569623, - "grad_norm": 1.257159948348999, - "learning_rate": 2.608282728837359e-05, - "loss": 3.8603, - "step": 9210 - }, - { - "epoch": 1.5762030942815626, - "grad_norm": 1.321650743484497, - "learning_rate": 2.6042218561357095e-05, - "loss": 3.9135, - "step": 9220 - }, - { - "epoch": 1.5779126421061629, - "grad_norm": 1.2606945037841797, - "learning_rate": 2.6001614602408186e-05, - "loss": 3.9143, - "step": 9230 - }, - { - "epoch": 1.5796221899307632, - "grad_norm": 1.2827436923980713, - "learning_rate": 2.5961015544333068e-05, - "loss": 3.8882, - "step": 9240 - }, - { - "epoch": 1.5813317377553637, - "grad_norm": 1.317748785018921, - "learning_rate": 2.5920421519921912e-05, - "loss": 3.8709, - "step": 9250 - }, - { - "epoch": 1.5830412855799643, - "grad_norm": 1.296419382095337, - "learning_rate": 2.5879832661948432e-05, - "loss": 3.8888, - "step": 9260 - }, - { - "epoch": 1.5847508334045646, - "grad_norm": 1.240578055381775, - "learning_rate": 2.583924910316944e-05, - "loss": 3.8979, - "step": 9270 - }, - { - "epoch": 1.5864603812291649, - "grad_norm": 1.3039534091949463, - "learning_rate": 2.5798670976324424e-05, - "loss": 3.9132, - "step": 9280 - }, - { - "epoch": 1.5881699290537652, - "grad_norm": 1.2953767776489258, - "learning_rate": 2.5758098414135083e-05, - "loss": 3.8538, - "step": 9290 - }, - { - "epoch": 1.5898794768783657, - "grad_norm": 1.2949522733688354, - "learning_rate": 2.5717531549304946e-05, - "loss": 3.9266, - "step": 9300 - }, - { - "epoch": 1.591589024702966, - "grad_norm": 1.2457222938537598, - "learning_rate": 2.5676970514518884e-05, - "loss": 3.8674, - "step": 9310 - }, - { - "epoch": 1.5932985725275666, - "grad_norm": 1.2903707027435303, - "learning_rate": 2.563641544244271e-05, - "loss": 3.8684, - "step": 9320 - }, - { - "epoch": 1.5950081203521669, - "grad_norm": 1.2799752950668335, - "learning_rate": 2.5595866465722716e-05, - "loss": 3.8627, - "step": 9330 - }, - { - "epoch": 1.5967176681767672, - "grad_norm": 1.2234859466552734, - "learning_rate": 2.55553237169853e-05, - "loss": 3.9217, - "step": 9340 - }, - { - "epoch": 1.5984272160013675, - "grad_norm": 1.3255417346954346, - "learning_rate": 2.5514787328836448e-05, - "loss": 3.8783, - "step": 9350 - }, - { - "epoch": 1.600136763825968, - "grad_norm": 1.297765851020813, - "learning_rate": 2.5474257433861355e-05, - "loss": 3.8484, - "step": 9360 - }, - { - "epoch": 1.6018463116505686, - "grad_norm": 1.261024832725525, - "learning_rate": 2.543373416462398e-05, - "loss": 3.9035, - "step": 9370 - }, - { - "epoch": 1.6035558594751689, - "grad_norm": 1.3538291454315186, - "learning_rate": 2.5393217653666614e-05, - "loss": 3.8698, - "step": 9380 - }, - { - "epoch": 1.6052654072997692, - "grad_norm": 1.2962650060653687, - "learning_rate": 2.535270803350943e-05, - "loss": 3.96, - "step": 9390 - }, - { - "epoch": 1.6069749551243695, - "grad_norm": 1.269261360168457, - "learning_rate": 2.5312205436650087e-05, - "loss": 3.9214, - "step": 9400 - }, - { - "epoch": 1.60868450294897, - "grad_norm": 1.2482683658599854, - "learning_rate": 2.5271709995563253e-05, - "loss": 3.887, - "step": 9410 - }, - { - "epoch": 1.6103940507735703, - "grad_norm": 1.2640718221664429, - "learning_rate": 2.5231221842700188e-05, - "loss": 3.8752, - "step": 9420 - }, - { - "epoch": 1.6121035985981709, - "grad_norm": 1.235435128211975, - "learning_rate": 2.5190741110488324e-05, - "loss": 3.8999, - "step": 9430 - }, - { - "epoch": 1.6138131464227712, - "grad_norm": 1.2240153551101685, - "learning_rate": 2.515026793133083e-05, - "loss": 3.8663, - "step": 9440 - }, - { - "epoch": 1.6155226942473715, - "grad_norm": 1.2666761875152588, - "learning_rate": 2.5109802437606155e-05, - "loss": 3.8505, - "step": 9450 - }, - { - "epoch": 1.6172322420719718, - "grad_norm": 1.3136582374572754, - "learning_rate": 2.5069344761667613e-05, - "loss": 3.9094, - "step": 9460 - }, - { - "epoch": 1.6189417898965723, - "grad_norm": 1.2728362083435059, - "learning_rate": 2.5028895035842955e-05, - "loss": 3.8798, - "step": 9470 - }, - { - "epoch": 1.6206513377211729, - "grad_norm": 1.2531934976577759, - "learning_rate": 2.498845339243392e-05, - "loss": 3.9392, - "step": 9480 - }, - { - "epoch": 1.6223608855457732, - "grad_norm": 1.3685166835784912, - "learning_rate": 2.494801996371582e-05, - "loss": 3.8561, - "step": 9490 - }, - { - "epoch": 1.6240704333703735, - "grad_norm": 1.2812875509262085, - "learning_rate": 2.490759488193709e-05, - "loss": 3.8528, - "step": 9500 - }, - { - "epoch": 1.6257799811949738, - "grad_norm": 1.2563095092773438, - "learning_rate": 2.4867178279318877e-05, - "loss": 3.9159, - "step": 9510 - }, - { - "epoch": 1.6274895290195743, - "grad_norm": 1.2650867700576782, - "learning_rate": 2.4826770288054572e-05, - "loss": 3.9071, - "step": 9520 - }, - { - "epoch": 1.6291990768441749, - "grad_norm": 1.2760249376296997, - "learning_rate": 2.4786371040309442e-05, - "loss": 3.865, - "step": 9530 - }, - { - "epoch": 1.6309086246687752, - "grad_norm": 1.2874990701675415, - "learning_rate": 2.4745980668220093e-05, - "loss": 3.8586, - "step": 9540 - }, - { - "epoch": 1.6326181724933755, - "grad_norm": 1.2674753665924072, - "learning_rate": 2.470559930389416e-05, - "loss": 3.9343, - "step": 9550 - }, - { - "epoch": 1.6343277203179758, - "grad_norm": 1.2468199729919434, - "learning_rate": 2.4665227079409792e-05, - "loss": 3.8876, - "step": 9560 - }, - { - "epoch": 1.6360372681425763, - "grad_norm": 1.3047510385513306, - "learning_rate": 2.4624864126815232e-05, - "loss": 3.8495, - "step": 9570 - }, - { - "epoch": 1.6377468159671766, - "grad_norm": 1.2966421842575073, - "learning_rate": 2.4584510578128422e-05, - "loss": 3.8867, - "step": 9580 - }, - { - "epoch": 1.6394563637917772, - "grad_norm": 1.286103367805481, - "learning_rate": 2.4544166565336523e-05, - "loss": 3.848, - "step": 9590 - }, - { - "epoch": 1.6411659116163775, - "grad_norm": 1.259310007095337, - "learning_rate": 2.4503832220395518e-05, - "loss": 3.8632, - "step": 9600 - }, - { - "epoch": 1.6428754594409778, - "grad_norm": 1.2936660051345825, - "learning_rate": 2.4463507675229776e-05, - "loss": 3.8886, - "step": 9610 - }, - { - "epoch": 1.644585007265578, - "grad_norm": 1.3069586753845215, - "learning_rate": 2.442319306173158e-05, - "loss": 3.8702, - "step": 9620 - }, - { - "epoch": 1.6462945550901786, - "grad_norm": 1.232775092124939, - "learning_rate": 2.4382888511760776e-05, - "loss": 3.8753, - "step": 9630 - }, - { - "epoch": 1.6480041029147792, - "grad_norm": 1.3313182592391968, - "learning_rate": 2.434259415714426e-05, - "loss": 3.8978, - "step": 9640 - }, - { - "epoch": 1.6497136507393795, - "grad_norm": 1.2919247150421143, - "learning_rate": 2.4302310129675594e-05, - "loss": 3.8855, - "step": 9650 - }, - { - "epoch": 1.6514231985639798, - "grad_norm": 1.2249016761779785, - "learning_rate": 2.4262036561114565e-05, - "loss": 3.8922, - "step": 9660 - }, - { - "epoch": 1.65313274638858, - "grad_norm": 1.321050763130188, - "learning_rate": 2.422177358318674e-05, - "loss": 3.8587, - "step": 9670 - }, - { - "epoch": 1.6548422942131806, - "grad_norm": 1.380196213722229, - "learning_rate": 2.4181521327583056e-05, - "loss": 3.8634, - "step": 9680 - }, - { - "epoch": 1.6565518420377812, - "grad_norm": 1.2210720777511597, - "learning_rate": 2.4141279925959387e-05, - "loss": 3.8573, - "step": 9690 - }, - { - "epoch": 1.6582613898623815, - "grad_norm": 1.2415847778320312, - "learning_rate": 2.410104950993608e-05, - "loss": 3.9209, - "step": 9700 - }, - { - "epoch": 1.6599709376869818, - "grad_norm": 1.2057108879089355, - "learning_rate": 2.406083021109758e-05, - "loss": 3.9557, - "step": 9710 - }, - { - "epoch": 1.661680485511582, - "grad_norm": 1.352909803390503, - "learning_rate": 2.4020622160991946e-05, - "loss": 3.8646, - "step": 9720 - }, - { - "epoch": 1.6633900333361826, - "grad_norm": 1.250783920288086, - "learning_rate": 2.3980425491130465e-05, - "loss": 3.8374, - "step": 9730 - }, - { - "epoch": 1.665099581160783, - "grad_norm": 1.2474188804626465, - "learning_rate": 2.3940240332987196e-05, - "loss": 3.8921, - "step": 9740 - }, - { - "epoch": 1.6668091289853835, - "grad_norm": 1.3152629137039185, - "learning_rate": 2.390006681799853e-05, - "loss": 3.8301, - "step": 9750 - }, - { - "epoch": 1.6685186768099838, - "grad_norm": 1.247890591621399, - "learning_rate": 2.3859905077562796e-05, - "loss": 3.9219, - "step": 9760 - }, - { - "epoch": 1.670228224634584, - "grad_norm": 1.3047056198120117, - "learning_rate": 2.3819755243039803e-05, - "loss": 3.8708, - "step": 9770 - }, - { - "epoch": 1.6719377724591844, - "grad_norm": 1.3482645750045776, - "learning_rate": 2.3779617445750423e-05, - "loss": 3.8805, - "step": 9780 - }, - { - "epoch": 1.673647320283785, - "grad_norm": 1.2976834774017334, - "learning_rate": 2.373949181697615e-05, - "loss": 3.8615, - "step": 9790 - }, - { - "epoch": 1.6753568681083855, - "grad_norm": 1.2727550268173218, - "learning_rate": 2.369937848795869e-05, - "loss": 3.8997, - "step": 9800 - }, - { - "epoch": 1.6770664159329858, - "grad_norm": 1.2745579481124878, - "learning_rate": 2.3659277589899494e-05, - "loss": 3.8806, - "step": 9810 - }, - { - "epoch": 1.678775963757586, - "grad_norm": 1.2276194095611572, - "learning_rate": 2.361918925395938e-05, - "loss": 3.8935, - "step": 9820 - }, - { - "epoch": 1.6804855115821864, - "grad_norm": 1.255108118057251, - "learning_rate": 2.3579113611258065e-05, - "loss": 3.9087, - "step": 9830 - }, - { - "epoch": 1.682195059406787, - "grad_norm": 1.2766635417938232, - "learning_rate": 2.353905079287376e-05, - "loss": 3.922, - "step": 9840 - }, - { - "epoch": 1.6839046072313875, - "grad_norm": 1.3522673845291138, - "learning_rate": 2.3499000929842713e-05, - "loss": 3.8864, - "step": 9850 - }, - { - "epoch": 1.6856141550559878, - "grad_norm": 1.2986549139022827, - "learning_rate": 2.3458964153158826e-05, - "loss": 3.8961, - "step": 9860 - }, - { - "epoch": 1.687323702880588, - "grad_norm": 1.3090636730194092, - "learning_rate": 2.341894059377315e-05, - "loss": 3.9284, - "step": 9870 - }, - { - "epoch": 1.6890332507051884, - "grad_norm": 1.3390604257583618, - "learning_rate": 2.3378930382593574e-05, - "loss": 3.8812, - "step": 9880 - }, - { - "epoch": 1.690742798529789, - "grad_norm": 1.2201941013336182, - "learning_rate": 2.3338933650484262e-05, - "loss": 3.9063, - "step": 9890 - }, - { - "epoch": 1.6924523463543892, - "grad_norm": 1.229230523109436, - "learning_rate": 2.329895052826534e-05, - "loss": 3.9013, - "step": 9900 - }, - { - "epoch": 1.6941618941789898, - "grad_norm": 1.2750295400619507, - "learning_rate": 2.325898114671238e-05, - "loss": 3.8168, - "step": 9910 - }, - { - "epoch": 1.69587144200359, - "grad_norm": 1.2591314315795898, - "learning_rate": 2.321902563655606e-05, - "loss": 3.9355, - "step": 9920 - }, - { - "epoch": 1.6975809898281904, - "grad_norm": 1.248218059539795, - "learning_rate": 2.3179084128481616e-05, - "loss": 3.8983, - "step": 9930 - }, - { - "epoch": 1.6992905376527907, - "grad_norm": 1.2849711179733276, - "learning_rate": 2.3139156753128567e-05, - "loss": 3.9136, - "step": 9940 - }, - { - "epoch": 1.7010000854773912, - "grad_norm": 1.2716853618621826, - "learning_rate": 2.309924364109014e-05, - "loss": 3.8699, - "step": 9950 - }, - { - "epoch": 1.7027096333019918, - "grad_norm": 1.2410062551498413, - "learning_rate": 2.3059344922912962e-05, - "loss": 3.8983, - "step": 9960 - }, - { - "epoch": 1.704419181126592, - "grad_norm": 1.2836254835128784, - "learning_rate": 2.301946072909653e-05, - "loss": 3.9238, - "step": 9970 - }, - { - "epoch": 1.7061287289511924, - "grad_norm": 1.279064416885376, - "learning_rate": 2.2979591190092897e-05, - "loss": 3.9029, - "step": 9980 - }, - { - "epoch": 1.7078382767757927, - "grad_norm": 1.2862904071807861, - "learning_rate": 2.2939736436306124e-05, - "loss": 3.9036, - "step": 9990 - }, - { - "epoch": 1.7095478246003932, - "grad_norm": 1.3002405166625977, - "learning_rate": 2.2899896598091946e-05, - "loss": 3.9252, - "step": 10000 - }, - { - "epoch": 1.7095478246003932, - "eval_loss": 4.180829048156738, - "eval_runtime": 15.5279, - "eval_samples_per_second": 62.726, - "eval_steps_per_second": 0.837, - "step": 10000 - }, - { - "epoch": 1.7112573724249935, - "grad_norm": 1.2758474349975586, - "learning_rate": 2.2860071805757312e-05, - "loss": 3.8464, - "step": 10010 - }, - { - "epoch": 1.712966920249594, - "grad_norm": 1.296708345413208, - "learning_rate": 2.2820262189559938e-05, - "loss": 3.864, - "step": 10020 - }, - { - "epoch": 1.7146764680741944, - "grad_norm": 1.250022292137146, - "learning_rate": 2.2780467879707927e-05, - "loss": 3.855, - "step": 10030 - }, - { - "epoch": 1.7163860158987947, - "grad_norm": 1.2888106107711792, - "learning_rate": 2.2740689006359307e-05, - "loss": 3.8715, - "step": 10040 - }, - { - "epoch": 1.718095563723395, - "grad_norm": 1.2414958477020264, - "learning_rate": 2.2700925699621618e-05, - "loss": 3.9214, - "step": 10050 - }, - { - "epoch": 1.7198051115479955, - "grad_norm": 1.292668342590332, - "learning_rate": 2.2661178089551476e-05, - "loss": 3.8977, - "step": 10060 - }, - { - "epoch": 1.721514659372596, - "grad_norm": 1.2437152862548828, - "learning_rate": 2.2621446306154176e-05, - "loss": 3.8945, - "step": 10070 - }, - { - "epoch": 1.7232242071971964, - "grad_norm": 1.2428617477416992, - "learning_rate": 2.258173047938323e-05, - "loss": 3.8361, - "step": 10080 - }, - { - "epoch": 1.7249337550217967, - "grad_norm": 1.2099937200546265, - "learning_rate": 2.2542030739139975e-05, - "loss": 3.8734, - "step": 10090 - }, - { - "epoch": 1.726643302846397, - "grad_norm": 1.332078218460083, - "learning_rate": 2.2502347215273117e-05, - "loss": 3.8501, - "step": 10100 - }, - { - "epoch": 1.7283528506709975, - "grad_norm": 1.256426215171814, - "learning_rate": 2.2462680037578342e-05, - "loss": 3.9005, - "step": 10110 - }, - { - "epoch": 1.730062398495598, - "grad_norm": 1.2563941478729248, - "learning_rate": 2.242302933579784e-05, - "loss": 3.8573, - "step": 10120 - }, - { - "epoch": 1.7317719463201984, - "grad_norm": 1.2846729755401611, - "learning_rate": 2.2383395239619952e-05, - "loss": 3.9124, - "step": 10130 - }, - { - "epoch": 1.7334814941447987, - "grad_norm": 1.362068772315979, - "learning_rate": 2.234377787867867e-05, - "loss": 3.8878, - "step": 10140 - }, - { - "epoch": 1.735191041969399, - "grad_norm": 1.2930997610092163, - "learning_rate": 2.2304177382553265e-05, - "loss": 3.8797, - "step": 10150 - }, - { - "epoch": 1.7369005897939995, - "grad_norm": 1.3178917169570923, - "learning_rate": 2.2264593880767855e-05, - "loss": 3.8846, - "step": 10160 - }, - { - "epoch": 1.7386101376185998, - "grad_norm": 1.3368654251098633, - "learning_rate": 2.2225027502790957e-05, - "loss": 3.8633, - "step": 10170 - }, - { - "epoch": 1.7403196854432004, - "grad_norm": 1.281380534172058, - "learning_rate": 2.2185478378035086e-05, - "loss": 3.8815, - "step": 10180 - }, - { - "epoch": 1.7420292332678007, - "grad_norm": 1.3193190097808838, - "learning_rate": 2.214594663585633e-05, - "loss": 3.8478, - "step": 10190 - }, - { - "epoch": 1.743738781092401, - "grad_norm": 1.2199701070785522, - "learning_rate": 2.2106432405553923e-05, - "loss": 3.8848, - "step": 10200 - }, - { - "epoch": 1.7454483289170013, - "grad_norm": 1.2947921752929688, - "learning_rate": 2.2066935816369815e-05, - "loss": 3.878, - "step": 10210 - }, - { - "epoch": 1.7471578767416018, - "grad_norm": 1.273738980293274, - "learning_rate": 2.2027456997488254e-05, - "loss": 3.9202, - "step": 10220 - }, - { - "epoch": 1.7488674245662024, - "grad_norm": 1.2859188318252563, - "learning_rate": 2.198799607803539e-05, - "loss": 3.8924, - "step": 10230 - }, - { - "epoch": 1.7505769723908027, - "grad_norm": 1.3119840621948242, - "learning_rate": 2.194855318707878e-05, - "loss": 3.8581, - "step": 10240 - }, - { - "epoch": 1.752286520215403, - "grad_norm": 1.2660763263702393, - "learning_rate": 2.1909128453627065e-05, - "loss": 3.9139, - "step": 10250 - }, - { - "epoch": 1.7539960680400033, - "grad_norm": 1.224841594696045, - "learning_rate": 2.1869722006629468e-05, - "loss": 3.8511, - "step": 10260 - }, - { - "epoch": 1.7557056158646038, - "grad_norm": 1.3232380151748657, - "learning_rate": 2.1830333974975404e-05, - "loss": 3.8567, - "step": 10270 - }, - { - "epoch": 1.7574151636892044, - "grad_norm": 1.334989070892334, - "learning_rate": 2.1790964487494065e-05, - "loss": 3.8354, - "step": 10280 - }, - { - "epoch": 1.7591247115138047, - "grad_norm": 1.3073331117630005, - "learning_rate": 2.1751613672953974e-05, - "loss": 3.9122, - "step": 10290 - }, - { - "epoch": 1.760834259338405, - "grad_norm": 1.2953872680664062, - "learning_rate": 2.1712281660062605e-05, - "loss": 3.8532, - "step": 10300 - }, - { - "epoch": 1.7625438071630053, - "grad_norm": 1.2862553596496582, - "learning_rate": 2.1672968577465908e-05, - "loss": 3.8651, - "step": 10310 - }, - { - "epoch": 1.7642533549876058, - "grad_norm": 1.2047370672225952, - "learning_rate": 2.163367455374794e-05, - "loss": 3.9254, - "step": 10320 - }, - { - "epoch": 1.7659629028122061, - "grad_norm": 1.2860280275344849, - "learning_rate": 2.1594399717430396e-05, - "loss": 3.9022, - "step": 10330 - }, - { - "epoch": 1.7676724506368067, - "grad_norm": 1.3194483518600464, - "learning_rate": 2.1555144196972233e-05, - "loss": 3.8812, - "step": 10340 - }, - { - "epoch": 1.769381998461407, - "grad_norm": 1.2725915908813477, - "learning_rate": 2.1515908120769223e-05, - "loss": 3.8803, - "step": 10350 - }, - { - "epoch": 1.7710915462860073, - "grad_norm": 1.2913448810577393, - "learning_rate": 2.1476691617153544e-05, - "loss": 3.8367, - "step": 10360 - }, - { - "epoch": 1.7728010941106076, - "grad_norm": 1.3318886756896973, - "learning_rate": 2.1437494814393345e-05, - "loss": 3.8907, - "step": 10370 - }, - { - "epoch": 1.7745106419352081, - "grad_norm": 1.2967519760131836, - "learning_rate": 2.139831784069236e-05, - "loss": 3.8418, - "step": 10380 - }, - { - "epoch": 1.7762201897598087, - "grad_norm": 1.2569862604141235, - "learning_rate": 2.1359160824189445e-05, - "loss": 3.8672, - "step": 10390 - }, - { - "epoch": 1.777929737584409, - "grad_norm": 1.312217354774475, - "learning_rate": 2.132002389295819e-05, - "loss": 3.9359, - "step": 10400 - }, - { - "epoch": 1.7796392854090093, - "grad_norm": 1.2990175485610962, - "learning_rate": 2.1280907175006488e-05, - "loss": 3.8755, - "step": 10410 - }, - { - "epoch": 1.7813488332336096, - "grad_norm": 1.2998316287994385, - "learning_rate": 2.124181079827613e-05, - "loss": 3.8824, - "step": 10420 - }, - { - "epoch": 1.7830583810582101, - "grad_norm": 1.2888137102127075, - "learning_rate": 2.1202734890642356e-05, - "loss": 3.8838, - "step": 10430 - }, - { - "epoch": 1.7847679288828107, - "grad_norm": 1.3377214670181274, - "learning_rate": 2.1163679579913472e-05, - "loss": 3.8637, - "step": 10440 - }, - { - "epoch": 1.786477476707411, - "grad_norm": 1.348799467086792, - "learning_rate": 2.1124644993830406e-05, - "loss": 3.8661, - "step": 10450 - }, - { - "epoch": 1.7881870245320113, - "grad_norm": 1.2619905471801758, - "learning_rate": 2.108563126006632e-05, - "loss": 3.875, - "step": 10460 - }, - { - "epoch": 1.7898965723566116, - "grad_norm": 1.2741824388504028, - "learning_rate": 2.104663850622614e-05, - "loss": 3.8791, - "step": 10470 - }, - { - "epoch": 1.791606120181212, - "grad_norm": 1.2534178495407104, - "learning_rate": 2.1007666859846203e-05, - "loss": 3.8851, - "step": 10480 - }, - { - "epoch": 1.7933156680058124, - "grad_norm": 1.271044135093689, - "learning_rate": 2.0968716448393782e-05, - "loss": 3.8965, - "step": 10490 - }, - { - "epoch": 1.795025215830413, - "grad_norm": 1.2308266162872314, - "learning_rate": 2.0929787399266722e-05, - "loss": 3.854, - "step": 10500 - }, - { - "epoch": 1.7967347636550133, - "grad_norm": 1.3654961585998535, - "learning_rate": 2.0890879839792977e-05, - "loss": 3.8555, - "step": 10510 - }, - { - "epoch": 1.7984443114796136, - "grad_norm": 1.2335867881774902, - "learning_rate": 2.0851993897230214e-05, - "loss": 3.9145, - "step": 10520 - }, - { - "epoch": 1.800153859304214, - "grad_norm": 1.354956865310669, - "learning_rate": 2.0813129698765415e-05, - "loss": 3.8776, - "step": 10530 - }, - { - "epoch": 1.8018634071288144, - "grad_norm": 1.2766063213348389, - "learning_rate": 2.0774287371514424e-05, - "loss": 3.8877, - "step": 10540 - }, - { - "epoch": 1.803572954953415, - "grad_norm": 1.2609217166900635, - "learning_rate": 2.073546704252155e-05, - "loss": 3.8939, - "step": 10550 - }, - { - "epoch": 1.8052825027780153, - "grad_norm": 1.2677260637283325, - "learning_rate": 2.0696668838759154e-05, - "loss": 3.9045, - "step": 10560 - }, - { - "epoch": 1.8069920506026156, - "grad_norm": 1.3016165494918823, - "learning_rate": 2.0657892887127234e-05, - "loss": 3.857, - "step": 10570 - }, - { - "epoch": 1.808701598427216, - "grad_norm": 1.303471565246582, - "learning_rate": 2.0619139314453007e-05, - "loss": 3.808, - "step": 10580 - }, - { - "epoch": 1.8104111462518164, - "grad_norm": 1.2504576444625854, - "learning_rate": 2.058040824749049e-05, - "loss": 3.9096, - "step": 10590 - }, - { - "epoch": 1.8121206940764167, - "grad_norm": 1.2932789325714111, - "learning_rate": 2.054169981292008e-05, - "loss": 3.878, - "step": 10600 - }, - { - "epoch": 1.8138302419010173, - "grad_norm": 1.30540132522583, - "learning_rate": 2.0503014137348163e-05, - "loss": 3.9004, - "step": 10610 - }, - { - "epoch": 1.8155397897256176, - "grad_norm": 1.280108094215393, - "learning_rate": 2.0464351347306686e-05, - "loss": 3.8997, - "step": 10620 - }, - { - "epoch": 1.817249337550218, - "grad_norm": 1.262285590171814, - "learning_rate": 2.0425711569252738e-05, - "loss": 3.8759, - "step": 10630 - }, - { - "epoch": 1.8189588853748182, - "grad_norm": 1.2662343978881836, - "learning_rate": 2.038709492956813e-05, - "loss": 3.8246, - "step": 10640 - }, - { - "epoch": 1.8206684331994187, - "grad_norm": 1.2879695892333984, - "learning_rate": 2.0348501554559006e-05, - "loss": 3.8564, - "step": 10650 - }, - { - "epoch": 1.8223779810240193, - "grad_norm": 1.2456399202346802, - "learning_rate": 2.030993157045542e-05, - "loss": 3.9006, - "step": 10660 - }, - { - "epoch": 1.8240875288486196, - "grad_norm": 1.30082368850708, - "learning_rate": 2.027138510341091e-05, - "loss": 3.9003, - "step": 10670 - }, - { - "epoch": 1.8257970766732199, - "grad_norm": 1.2501704692840576, - "learning_rate": 2.0232862279502096e-05, - "loss": 3.8732, - "step": 10680 - }, - { - "epoch": 1.8275066244978202, - "grad_norm": 1.3736177682876587, - "learning_rate": 2.019436322472827e-05, - "loss": 3.8394, - "step": 10690 - }, - { - "epoch": 1.8292161723224207, - "grad_norm": 1.2765625715255737, - "learning_rate": 2.015588806501098e-05, - "loss": 3.8967, - "step": 10700 - }, - { - "epoch": 1.8309257201470213, - "grad_norm": 1.248347282409668, - "learning_rate": 2.011743692619362e-05, - "loss": 3.8399, - "step": 10710 - }, - { - "epoch": 1.8326352679716216, - "grad_norm": 1.263159155845642, - "learning_rate": 2.0079009934041015e-05, - "loss": 3.8874, - "step": 10720 - }, - { - "epoch": 1.8343448157962219, - "grad_norm": 1.3195605278015137, - "learning_rate": 2.0040607214239015e-05, - "loss": 3.846, - "step": 10730 - }, - { - "epoch": 1.8360543636208222, - "grad_norm": 1.4088664054870605, - "learning_rate": 2.0002228892394072e-05, - "loss": 3.866, - "step": 10740 - }, - { - "epoch": 1.8377639114454227, - "grad_norm": 1.2861645221710205, - "learning_rate": 1.9963875094032853e-05, - "loss": 3.8798, - "step": 10750 - }, - { - "epoch": 1.839473459270023, - "grad_norm": 1.2638813257217407, - "learning_rate": 1.9925545944601792e-05, - "loss": 3.9059, - "step": 10760 - }, - { - "epoch": 1.8411830070946236, - "grad_norm": 1.285369873046875, - "learning_rate": 1.988724156946673e-05, - "loss": 3.8901, - "step": 10770 - }, - { - "epoch": 1.8428925549192239, - "grad_norm": 1.3214296102523804, - "learning_rate": 1.9848962093912456e-05, - "loss": 3.9192, - "step": 10780 - }, - { - "epoch": 1.8446021027438242, - "grad_norm": 1.2964173555374146, - "learning_rate": 1.9810707643142317e-05, - "loss": 3.8479, - "step": 10790 - }, - { - "epoch": 1.8463116505684245, - "grad_norm": 1.373185634613037, - "learning_rate": 1.9772478342277832e-05, - "loss": 3.8499, - "step": 10800 - }, - { - "epoch": 1.848021198393025, - "grad_norm": 1.2716069221496582, - "learning_rate": 1.973427431635824e-05, - "loss": 3.7954, - "step": 10810 - }, - { - "epoch": 1.8497307462176256, - "grad_norm": 1.281115174293518, - "learning_rate": 1.9696095690340115e-05, - "loss": 3.8848, - "step": 10820 - }, - { - "epoch": 1.8514402940422259, - "grad_norm": 1.307884693145752, - "learning_rate": 1.9657942589096966e-05, - "loss": 3.8685, - "step": 10830 - }, - { - "epoch": 1.8531498418668262, - "grad_norm": 1.340362310409546, - "learning_rate": 1.961981513741879e-05, - "loss": 3.8944, - "step": 10840 - }, - { - "epoch": 1.8548593896914265, - "grad_norm": 1.2787150144577026, - "learning_rate": 1.958171346001172e-05, - "loss": 3.8569, - "step": 10850 - }, - { - "epoch": 1.856568937516027, - "grad_norm": 1.27812659740448, - "learning_rate": 1.954363768149756e-05, - "loss": 3.8822, - "step": 10860 - }, - { - "epoch": 1.8582784853406276, - "grad_norm": 1.2958768606185913, - "learning_rate": 1.9505587926413432e-05, - "loss": 3.8029, - "step": 10870 - }, - { - "epoch": 1.8599880331652279, - "grad_norm": 1.2305864095687866, - "learning_rate": 1.946756431921133e-05, - "loss": 3.8387, - "step": 10880 - }, - { - "epoch": 1.8616975809898282, - "grad_norm": 1.4085084199905396, - "learning_rate": 1.9429566984257707e-05, - "loss": 3.885, - "step": 10890 - }, - { - "epoch": 1.8634071288144285, - "grad_norm": 1.3260706663131714, - "learning_rate": 1.9391596045833117e-05, - "loss": 3.8257, - "step": 10900 - }, - { - "epoch": 1.865116676639029, - "grad_norm": 1.2515817880630493, - "learning_rate": 1.9353651628131746e-05, - "loss": 3.9052, - "step": 10910 - }, - { - "epoch": 1.8668262244636293, - "grad_norm": 1.2600115537643433, - "learning_rate": 1.9315733855261058e-05, - "loss": 3.8528, - "step": 10920 - }, - { - "epoch": 1.8685357722882299, - "grad_norm": 1.3052595853805542, - "learning_rate": 1.9277842851241368e-05, - "loss": 3.8565, - "step": 10930 - }, - { - "epoch": 1.8702453201128302, - "grad_norm": 1.312430739402771, - "learning_rate": 1.923997874000542e-05, - "loss": 3.8808, - "step": 10940 - }, - { - "epoch": 1.8719548679374305, - "grad_norm": 1.2781634330749512, - "learning_rate": 1.9202141645398014e-05, - "loss": 3.8727, - "step": 10950 - }, - { - "epoch": 1.8736644157620308, - "grad_norm": 1.2667443752288818, - "learning_rate": 1.9164331691175575e-05, - "loss": 3.8919, - "step": 10960 - }, - { - "epoch": 1.8753739635866313, - "grad_norm": 1.2835055589675903, - "learning_rate": 1.9126549001005755e-05, - "loss": 3.8877, - "step": 10970 - }, - { - "epoch": 1.8770835114112319, - "grad_norm": 1.288201928138733, - "learning_rate": 1.9088793698467055e-05, - "loss": 3.8597, - "step": 10980 - }, - { - "epoch": 1.8787930592358322, - "grad_norm": 1.2961546182632446, - "learning_rate": 1.9051065907048364e-05, - "loss": 3.8504, - "step": 10990 - }, - { - "epoch": 1.8805026070604325, - "grad_norm": 1.2987557649612427, - "learning_rate": 1.901336575014862e-05, - "loss": 3.8725, - "step": 11000 - }, - { - "epoch": 1.8805026070604325, - "eval_loss": 4.164763927459717, - "eval_runtime": 15.0743, - "eval_samples_per_second": 64.613, - "eval_steps_per_second": 0.862, - "step": 11000 - }, - { - "epoch": 1.8822121548850328, - "grad_norm": 1.2628397941589355, - "learning_rate": 1.8975693351076335e-05, - "loss": 3.9048, - "step": 11010 - }, - { - "epoch": 1.8839217027096333, - "grad_norm": 1.2374736070632935, - "learning_rate": 1.8938048833049282e-05, - "loss": 3.8894, - "step": 11020 - }, - { - "epoch": 1.8856312505342339, - "grad_norm": 1.2675261497497559, - "learning_rate": 1.8900432319194012e-05, - "loss": 3.847, - "step": 11030 - }, - { - "epoch": 1.8873407983588342, - "grad_norm": 1.330662727355957, - "learning_rate": 1.8862843932545476e-05, - "loss": 3.8529, - "step": 11040 - }, - { - "epoch": 1.8890503461834345, - "grad_norm": 1.2348227500915527, - "learning_rate": 1.8825283796046633e-05, - "loss": 3.9012, - "step": 11050 - }, - { - "epoch": 1.8907598940080348, - "grad_norm": 1.217609167098999, - "learning_rate": 1.8787752032548063e-05, - "loss": 3.8768, - "step": 11060 - }, - { - "epoch": 1.892469441832635, - "grad_norm": 1.2971266508102417, - "learning_rate": 1.87502487648075e-05, - "loss": 3.8547, - "step": 11070 - }, - { - "epoch": 1.8941789896572356, - "grad_norm": 1.336425542831421, - "learning_rate": 1.8712774115489527e-05, - "loss": 3.8555, - "step": 11080 - }, - { - "epoch": 1.8958885374818362, - "grad_norm": 1.2660021781921387, - "learning_rate": 1.8675328207165083e-05, - "loss": 3.8174, - "step": 11090 - }, - { - "epoch": 1.8975980853064365, - "grad_norm": 1.3119652271270752, - "learning_rate": 1.8637911162311123e-05, - "loss": 3.8352, - "step": 11100 - }, - { - "epoch": 1.8993076331310368, - "grad_norm": 1.3453423976898193, - "learning_rate": 1.860052310331018e-05, - "loss": 3.8597, - "step": 11110 - }, - { - "epoch": 1.901017180955637, - "grad_norm": 1.3236535787582397, - "learning_rate": 1.8563164152450003e-05, - "loss": 3.8974, - "step": 11120 - }, - { - "epoch": 1.9027267287802376, - "grad_norm": 1.341765284538269, - "learning_rate": 1.8525834431923117e-05, - "loss": 3.8699, - "step": 11130 - }, - { - "epoch": 1.9044362766048382, - "grad_norm": 1.3991672992706299, - "learning_rate": 1.848853406382644e-05, - "loss": 3.842, - "step": 11140 - }, - { - "epoch": 1.9061458244294385, - "grad_norm": 1.3339606523513794, - "learning_rate": 1.84512631701609e-05, - "loss": 3.8271, - "step": 11150 - }, - { - "epoch": 1.9078553722540388, - "grad_norm": 1.2688626050949097, - "learning_rate": 1.8414021872831007e-05, - "loss": 3.8753, - "step": 11160 - }, - { - "epoch": 1.909564920078639, - "grad_norm": 1.2893714904785156, - "learning_rate": 1.8376810293644477e-05, - "loss": 3.8862, - "step": 11170 - }, - { - "epoch": 1.9112744679032396, - "grad_norm": 1.3027559518814087, - "learning_rate": 1.833962855431182e-05, - "loss": 3.8584, - "step": 11180 - }, - { - "epoch": 1.91298401572784, - "grad_norm": 1.2188791036605835, - "learning_rate": 1.8302476776445955e-05, - "loss": 3.8807, - "step": 11190 - }, - { - "epoch": 1.9146935635524405, - "grad_norm": 1.2724025249481201, - "learning_rate": 1.826535508156178e-05, - "loss": 3.8357, - "step": 11200 - }, - { - "epoch": 1.9164031113770408, - "grad_norm": 1.2755258083343506, - "learning_rate": 1.8228263591075835e-05, - "loss": 3.9157, - "step": 11210 - }, - { - "epoch": 1.918112659201641, - "grad_norm": 1.287720799446106, - "learning_rate": 1.8191202426305836e-05, - "loss": 3.8569, - "step": 11220 - }, - { - "epoch": 1.9198222070262414, - "grad_norm": 1.2490489482879639, - "learning_rate": 1.8154171708470324e-05, - "loss": 3.8382, - "step": 11230 - }, - { - "epoch": 1.921531754850842, - "grad_norm": 1.3437286615371704, - "learning_rate": 1.8117171558688255e-05, - "loss": 3.8578, - "step": 11240 - }, - { - "epoch": 1.9232413026754425, - "grad_norm": 1.3718810081481934, - "learning_rate": 1.8080202097978617e-05, - "loss": 3.8571, - "step": 11250 - }, - { - "epoch": 1.9249508505000428, - "grad_norm": 1.2912505865097046, - "learning_rate": 1.8043263447260006e-05, - "loss": 3.844, - "step": 11260 - }, - { - "epoch": 1.926660398324643, - "grad_norm": 1.3142791986465454, - "learning_rate": 1.8006355727350225e-05, - "loss": 3.8293, - "step": 11270 - }, - { - "epoch": 1.9283699461492434, - "grad_norm": 1.2706501483917236, - "learning_rate": 1.796947905896595e-05, - "loss": 3.8331, - "step": 11280 - }, - { - "epoch": 1.930079493973844, - "grad_norm": 1.3424609899520874, - "learning_rate": 1.7932633562722268e-05, - "loss": 3.8287, - "step": 11290 - }, - { - "epoch": 1.9317890417984445, - "grad_norm": 1.247891902923584, - "learning_rate": 1.789581935913233e-05, - "loss": 3.9148, - "step": 11300 - }, - { - "epoch": 1.9334985896230448, - "grad_norm": 1.336173176765442, - "learning_rate": 1.7859036568606914e-05, - "loss": 3.8282, - "step": 11310 - }, - { - "epoch": 1.935208137447645, - "grad_norm": 1.3850946426391602, - "learning_rate": 1.7822285311454058e-05, - "loss": 3.8443, - "step": 11320 - }, - { - "epoch": 1.9369176852722454, - "grad_norm": 1.2851150035858154, - "learning_rate": 1.7785565707878667e-05, - "loss": 3.8615, - "step": 11330 - }, - { - "epoch": 1.938627233096846, - "grad_norm": 1.2853121757507324, - "learning_rate": 1.7748877877982134e-05, - "loss": 3.8024, - "step": 11340 - }, - { - "epoch": 1.9403367809214462, - "grad_norm": 1.2064129114151, - "learning_rate": 1.7712221941761877e-05, - "loss": 3.8728, - "step": 11350 - }, - { - "epoch": 1.9420463287460468, - "grad_norm": 1.3317078351974487, - "learning_rate": 1.767559801911104e-05, - "loss": 3.8378, - "step": 11360 - }, - { - "epoch": 1.943755876570647, - "grad_norm": 1.2767136096954346, - "learning_rate": 1.763900622981805e-05, - "loss": 3.863, - "step": 11370 - }, - { - "epoch": 1.9454654243952474, - "grad_norm": 1.3056049346923828, - "learning_rate": 1.7602446693566232e-05, - "loss": 3.8775, - "step": 11380 - }, - { - "epoch": 1.9471749722198477, - "grad_norm": 1.3146320581436157, - "learning_rate": 1.7565919529933393e-05, - "loss": 3.8565, - "step": 11390 - }, - { - "epoch": 1.9488845200444482, - "grad_norm": 1.2762694358825684, - "learning_rate": 1.75294248583915e-05, - "loss": 3.8394, - "step": 11400 - }, - { - "epoch": 1.9505940678690488, - "grad_norm": 1.2540512084960938, - "learning_rate": 1.749296279830622e-05, - "loss": 3.8395, - "step": 11410 - }, - { - "epoch": 1.952303615693649, - "grad_norm": 1.2483302354812622, - "learning_rate": 1.7456533468936554e-05, - "loss": 3.8899, - "step": 11420 - }, - { - "epoch": 1.9540131635182494, - "grad_norm": 1.280833125114441, - "learning_rate": 1.7420136989434464e-05, - "loss": 3.8462, - "step": 11430 - }, - { - "epoch": 1.9557227113428497, - "grad_norm": 1.3020068407058716, - "learning_rate": 1.738377347884445e-05, - "loss": 3.8374, - "step": 11440 - }, - { - "epoch": 1.9574322591674502, - "grad_norm": 1.3307894468307495, - "learning_rate": 1.7347443056103173e-05, - "loss": 3.8373, - "step": 11450 - }, - { - "epoch": 1.9591418069920508, - "grad_norm": 1.298108696937561, - "learning_rate": 1.7311145840039112e-05, - "loss": 3.8897, - "step": 11460 - }, - { - "epoch": 1.960851354816651, - "grad_norm": 1.321644902229309, - "learning_rate": 1.7274881949372106e-05, - "loss": 3.8059, - "step": 11470 - }, - { - "epoch": 1.9625609026412514, - "grad_norm": 1.2560114860534668, - "learning_rate": 1.723865150271297e-05, - "loss": 3.8741, - "step": 11480 - }, - { - "epoch": 1.9642704504658517, - "grad_norm": 1.3190380334854126, - "learning_rate": 1.7202454618563178e-05, - "loss": 3.8528, - "step": 11490 - }, - { - "epoch": 1.9659799982904522, - "grad_norm": 1.3108000755310059, - "learning_rate": 1.7166291415314393e-05, - "loss": 3.8576, - "step": 11500 - }, - { - "epoch": 1.9676895461150525, - "grad_norm": 1.3373942375183105, - "learning_rate": 1.713016201124815e-05, - "loss": 3.8121, - "step": 11510 - }, - { - "epoch": 1.969399093939653, - "grad_norm": 1.3309253454208374, - "learning_rate": 1.7094066524535395e-05, - "loss": 3.8461, - "step": 11520 - }, - { - "epoch": 1.9711086417642534, - "grad_norm": 1.2973037958145142, - "learning_rate": 1.705800507323617e-05, - "loss": 3.8425, - "step": 11530 - }, - { - "epoch": 1.9728181895888537, - "grad_norm": 1.2716772556304932, - "learning_rate": 1.7021977775299175e-05, - "loss": 3.8713, - "step": 11540 - }, - { - "epoch": 1.974527737413454, - "grad_norm": 1.3161543607711792, - "learning_rate": 1.6985984748561416e-05, - "loss": 3.8099, - "step": 11550 - }, - { - "epoch": 1.9762372852380545, - "grad_norm": 1.3454633951187134, - "learning_rate": 1.6950026110747796e-05, - "loss": 3.8588, - "step": 11560 - }, - { - "epoch": 1.977946833062655, - "grad_norm": 1.3398163318634033, - "learning_rate": 1.6914101979470737e-05, - "loss": 3.8269, - "step": 11570 - }, - { - "epoch": 1.9796563808872554, - "grad_norm": 1.2668365240097046, - "learning_rate": 1.687821247222982e-05, - "loss": 3.9077, - "step": 11580 - }, - { - "epoch": 1.9813659287118557, - "grad_norm": 1.3431310653686523, - "learning_rate": 1.684235770641136e-05, - "loss": 3.8471, - "step": 11590 - }, - { - "epoch": 1.983075476536456, - "grad_norm": 1.306074619293213, - "learning_rate": 1.680653779928803e-05, - "loss": 3.91, - "step": 11600 - }, - { - "epoch": 1.9847850243610565, - "grad_norm": 1.3266032934188843, - "learning_rate": 1.6770752868018515e-05, - "loss": 3.8874, - "step": 11610 - }, - { - "epoch": 1.986494572185657, - "grad_norm": 1.2727481126785278, - "learning_rate": 1.6735003029647088e-05, - "loss": 3.8786, - "step": 11620 - }, - { - "epoch": 1.9882041200102574, - "grad_norm": 1.3078691959381104, - "learning_rate": 1.669928840110325e-05, - "loss": 3.8292, - "step": 11630 - }, - { - "epoch": 1.9899136678348577, - "grad_norm": 1.3428819179534912, - "learning_rate": 1.666360909920131e-05, - "loss": 3.8807, - "step": 11640 - }, - { - "epoch": 1.991623215659458, - "grad_norm": 1.2383522987365723, - "learning_rate": 1.662796524064007e-05, - "loss": 3.871, - "step": 11650 - }, - { - "epoch": 1.9933327634840583, - "grad_norm": 1.3393865823745728, - "learning_rate": 1.6592356942002373e-05, - "loss": 3.8017, - "step": 11660 - }, - { - "epoch": 1.9950423113086588, - "grad_norm": 1.2964119911193848, - "learning_rate": 1.65567843197548e-05, - "loss": 3.8205, - "step": 11670 - }, - { - "epoch": 1.9967518591332594, - "grad_norm": 1.3231319189071655, - "learning_rate": 1.6521247490247178e-05, - "loss": 3.8564, - "step": 11680 - }, - { - "epoch": 1.9984614069578597, - "grad_norm": 1.3185927867889404, - "learning_rate": 1.6485746569712317e-05, - "loss": 3.8472, - "step": 11690 - }, - { - "epoch": 2.00017095478246, - "grad_norm": 2.1887097358703613, - "learning_rate": 1.6450281674265547e-05, - "loss": 3.8308, - "step": 11700 - }, - { - "epoch": 2.0018805026070603, - "grad_norm": 1.8212313652038574, - "learning_rate": 1.6414852919904394e-05, - "loss": 3.4911, - "step": 11710 - }, - { - "epoch": 2.0035900504316606, - "grad_norm": 1.5174710750579834, - "learning_rate": 1.637946042250814e-05, - "loss": 3.4782, - "step": 11720 - }, - { - "epoch": 2.0052995982562614, - "grad_norm": 1.6044405698776245, - "learning_rate": 1.6344104297837512e-05, - "loss": 3.483, - "step": 11730 - }, - { - "epoch": 2.0070091460808617, - "grad_norm": 1.554380178451538, - "learning_rate": 1.6308784661534248e-05, - "loss": 3.4857, - "step": 11740 - }, - { - "epoch": 2.008718693905462, - "grad_norm": 1.4844028949737549, - "learning_rate": 1.6273501629120757e-05, - "loss": 3.4458, - "step": 11750 - }, - { - "epoch": 2.0104282417300623, - "grad_norm": 1.5765936374664307, - "learning_rate": 1.6238255315999722e-05, - "loss": 3.4616, - "step": 11760 - }, - { - "epoch": 2.0121377895546626, - "grad_norm": 1.5239100456237793, - "learning_rate": 1.6203045837453713e-05, - "loss": 3.4481, - "step": 11770 - }, - { - "epoch": 2.0138473373792634, - "grad_norm": 1.4969544410705566, - "learning_rate": 1.6167873308644828e-05, - "loss": 3.4544, - "step": 11780 - }, - { - "epoch": 2.0155568852038637, - "grad_norm": 1.6062504053115845, - "learning_rate": 1.6132737844614316e-05, - "loss": 3.5313, - "step": 11790 - }, - { - "epoch": 2.017266433028464, - "grad_norm": 1.5167871713638306, - "learning_rate": 1.609763956028221e-05, - "loss": 3.4999, - "step": 11800 - }, - { - "epoch": 2.0189759808530643, - "grad_norm": 1.5412012338638306, - "learning_rate": 1.6062578570446896e-05, - "loss": 3.4868, - "step": 11810 - }, - { - "epoch": 2.0206855286776646, - "grad_norm": 1.7249890565872192, - "learning_rate": 1.6027554989784812e-05, - "loss": 3.4463, - "step": 11820 - }, - { - "epoch": 2.0223950765022654, - "grad_norm": 1.55708646774292, - "learning_rate": 1.5992568932850032e-05, - "loss": 3.4092, - "step": 11830 - }, - { - "epoch": 2.0241046243268657, - "grad_norm": 1.582253336906433, - "learning_rate": 1.595762051407391e-05, - "loss": 3.4705, - "step": 11840 - }, - { - "epoch": 2.025814172151466, - "grad_norm": 1.5189117193222046, - "learning_rate": 1.592270984776465e-05, - "loss": 3.4712, - "step": 11850 - }, - { - "epoch": 2.0275237199760663, - "grad_norm": 1.5142782926559448, - "learning_rate": 1.5887837048107028e-05, - "loss": 3.4391, - "step": 11860 - }, - { - "epoch": 2.0292332678006666, - "grad_norm": 1.574446439743042, - "learning_rate": 1.585300222916194e-05, - "loss": 3.4758, - "step": 11870 - }, - { - "epoch": 2.030942815625267, - "grad_norm": 1.4984582662582397, - "learning_rate": 1.5818205504866064e-05, - "loss": 3.4646, - "step": 11880 - }, - { - "epoch": 2.0326523634498677, - "grad_norm": 1.5172866582870483, - "learning_rate": 1.578344698903147e-05, - "loss": 3.4861, - "step": 11890 - }, - { - "epoch": 2.034361911274468, - "grad_norm": 1.5115021467208862, - "learning_rate": 1.5748726795345274e-05, - "loss": 3.4605, - "step": 11900 - }, - { - "epoch": 2.0360714590990683, - "grad_norm": 1.576919674873352, - "learning_rate": 1.5714045037369236e-05, - "loss": 3.4594, - "step": 11910 - }, - { - "epoch": 2.0377810069236686, - "grad_norm": 1.5137405395507812, - "learning_rate": 1.5679401828539406e-05, - "loss": 3.4785, - "step": 11920 - }, - { - "epoch": 2.039490554748269, - "grad_norm": 1.5592314004898071, - "learning_rate": 1.5644797282165738e-05, - "loss": 3.4429, - "step": 11930 - }, - { - "epoch": 2.0412001025728697, - "grad_norm": 1.6012767553329468, - "learning_rate": 1.5610231511431744e-05, - "loss": 3.4965, - "step": 11940 - }, - { - "epoch": 2.04290965039747, - "grad_norm": 1.5601153373718262, - "learning_rate": 1.557570462939411e-05, - "loss": 3.47, - "step": 11950 - }, - { - "epoch": 2.0446191982220703, - "grad_norm": 1.53611159324646, - "learning_rate": 1.5541216748982324e-05, - "loss": 3.4944, - "step": 11960 - }, - { - "epoch": 2.0463287460466706, - "grad_norm": 1.5188260078430176, - "learning_rate": 1.5506767982998288e-05, - "loss": 3.5231, - "step": 11970 - }, - { - "epoch": 2.048038293871271, - "grad_norm": 1.512756586074829, - "learning_rate": 1.5472358444116003e-05, - "loss": 3.4887, - "step": 11980 - }, - { - "epoch": 2.0497478416958717, - "grad_norm": 1.5189011096954346, - "learning_rate": 1.5437988244881142e-05, - "loss": 3.4845, - "step": 11990 - }, - { - "epoch": 2.051457389520472, - "grad_norm": 1.6560404300689697, - "learning_rate": 1.540365749771072e-05, - "loss": 3.4353, - "step": 12000 - }, - { - "epoch": 2.051457389520472, - "eval_loss": 4.245818614959717, - "eval_runtime": 15.5499, - "eval_samples_per_second": 62.637, - "eval_steps_per_second": 0.836, - "step": 12000 - }, - { - "epoch": 2.0531669373450723, - "grad_norm": 1.4618886709213257, - "learning_rate": 1.5369366314892724e-05, - "loss": 3.4895, - "step": 12010 - }, - { - "epoch": 2.0548764851696726, - "grad_norm": 1.5343492031097412, - "learning_rate": 1.5335114808585698e-05, - "loss": 3.4641, - "step": 12020 - }, - { - "epoch": 2.056586032994273, - "grad_norm": 1.6436240673065186, - "learning_rate": 1.5300903090818445e-05, - "loss": 3.5109, - "step": 12030 - }, - { - "epoch": 2.058295580818873, - "grad_norm": 1.5576632022857666, - "learning_rate": 1.526673127348962e-05, - "loss": 3.4763, - "step": 12040 - }, - { - "epoch": 2.060005128643474, - "grad_norm": 1.5234733819961548, - "learning_rate": 1.5232599468367386e-05, - "loss": 3.4975, - "step": 12050 - }, - { - "epoch": 2.0617146764680743, - "grad_norm": 1.5966142416000366, - "learning_rate": 1.5198507787089003e-05, - "loss": 3.4507, - "step": 12060 - }, - { - "epoch": 2.0634242242926746, - "grad_norm": 1.5968408584594727, - "learning_rate": 1.516445634116052e-05, - "loss": 3.4977, - "step": 12070 - }, - { - "epoch": 2.065133772117275, - "grad_norm": 1.6041375398635864, - "learning_rate": 1.5130445241956384e-05, - "loss": 3.4808, - "step": 12080 - }, - { - "epoch": 2.066843319941875, - "grad_norm": 1.5407582521438599, - "learning_rate": 1.5096474600719079e-05, - "loss": 3.4691, - "step": 12090 - }, - { - "epoch": 2.068552867766476, - "grad_norm": 1.550850749015808, - "learning_rate": 1.5062544528558727e-05, - "loss": 3.4867, - "step": 12100 - }, - { - "epoch": 2.0702624155910763, - "grad_norm": 1.523517370223999, - "learning_rate": 1.5028655136452818e-05, - "loss": 3.4812, - "step": 12110 - }, - { - "epoch": 2.0719719634156766, - "grad_norm": 1.4962027072906494, - "learning_rate": 1.4994806535245737e-05, - "loss": 3.4739, - "step": 12120 - }, - { - "epoch": 2.073681511240277, - "grad_norm": 1.514771819114685, - "learning_rate": 1.4960998835648481e-05, - "loss": 3.4806, - "step": 12130 - }, - { - "epoch": 2.075391059064877, - "grad_norm": 1.540757179260254, - "learning_rate": 1.4927232148238241e-05, - "loss": 3.5083, - "step": 12140 - }, - { - "epoch": 2.077100606889478, - "grad_norm": 1.595438003540039, - "learning_rate": 1.4893506583458084e-05, - "loss": 3.462, - "step": 12150 - }, - { - "epoch": 2.0788101547140783, - "grad_norm": 1.5514867305755615, - "learning_rate": 1.4859822251616568e-05, - "loss": 3.4926, - "step": 12160 - }, - { - "epoch": 2.0805197025386786, - "grad_norm": 1.625446081161499, - "learning_rate": 1.4826179262887407e-05, - "loss": 3.4784, - "step": 12170 - }, - { - "epoch": 2.082229250363279, - "grad_norm": 1.5098861455917358, - "learning_rate": 1.4792577727309053e-05, - "loss": 3.5126, - "step": 12180 - }, - { - "epoch": 2.083938798187879, - "grad_norm": 1.5547230243682861, - "learning_rate": 1.4759017754784399e-05, - "loss": 3.4583, - "step": 12190 - }, - { - "epoch": 2.0856483460124795, - "grad_norm": 1.49271559715271, - "learning_rate": 1.4725499455080401e-05, - "loss": 3.4836, - "step": 12200 - }, - { - "epoch": 2.0873578938370803, - "grad_norm": 1.6288962364196777, - "learning_rate": 1.4692022937827704e-05, - "loss": 3.5, - "step": 12210 - }, - { - "epoch": 2.0890674416616806, - "grad_norm": 1.6325870752334595, - "learning_rate": 1.4658588312520283e-05, - "loss": 3.486, - "step": 12220 - }, - { - "epoch": 2.090776989486281, - "grad_norm": 1.5841093063354492, - "learning_rate": 1.4625195688515103e-05, - "loss": 3.4536, - "step": 12230 - }, - { - "epoch": 2.092486537310881, - "grad_norm": 1.6167705059051514, - "learning_rate": 1.4591845175031753e-05, - "loss": 3.5184, - "step": 12240 - }, - { - "epoch": 2.0941960851354815, - "grad_norm": 1.5663753747940063, - "learning_rate": 1.4558536881152098e-05, - "loss": 3.4372, - "step": 12250 - }, - { - "epoch": 2.0959056329600823, - "grad_norm": 1.491481065750122, - "learning_rate": 1.4525270915819889e-05, - "loss": 3.5134, - "step": 12260 - }, - { - "epoch": 2.0976151807846826, - "grad_norm": 1.5205885171890259, - "learning_rate": 1.449204738784044e-05, - "loss": 3.4766, - "step": 12270 - }, - { - "epoch": 2.099324728609283, - "grad_norm": 1.5294101238250732, - "learning_rate": 1.4458866405880272e-05, - "loss": 3.4682, - "step": 12280 - }, - { - "epoch": 2.101034276433883, - "grad_norm": 1.482836127281189, - "learning_rate": 1.4425728078466743e-05, - "loss": 3.5174, - "step": 12290 - }, - { - "epoch": 2.1027438242584835, - "grad_norm": 1.5116338729858398, - "learning_rate": 1.4392632513987698e-05, - "loss": 3.4719, - "step": 12300 - }, - { - "epoch": 2.104453372083084, - "grad_norm": 1.5900734663009644, - "learning_rate": 1.4359579820691093e-05, - "loss": 3.4893, - "step": 12310 - }, - { - "epoch": 2.1061629199076846, - "grad_norm": 1.504593849182129, - "learning_rate": 1.4326570106684691e-05, - "loss": 3.5037, - "step": 12320 - }, - { - "epoch": 2.107872467732285, - "grad_norm": 1.5572094917297363, - "learning_rate": 1.429360347993568e-05, - "loss": 3.4697, - "step": 12330 - }, - { - "epoch": 2.109582015556885, - "grad_norm": 1.5680009126663208, - "learning_rate": 1.4260680048270308e-05, - "loss": 3.4547, - "step": 12340 - }, - { - "epoch": 2.1112915633814855, - "grad_norm": 1.654805302619934, - "learning_rate": 1.4227799919373527e-05, - "loss": 3.4625, - "step": 12350 - }, - { - "epoch": 2.113001111206086, - "grad_norm": 1.5475122928619385, - "learning_rate": 1.419496320078868e-05, - "loss": 3.5467, - "step": 12360 - }, - { - "epoch": 2.1147106590306866, - "grad_norm": 1.5265480279922485, - "learning_rate": 1.4162169999917119e-05, - "loss": 3.4881, - "step": 12370 - }, - { - "epoch": 2.116420206855287, - "grad_norm": 1.5225414037704468, - "learning_rate": 1.4129420424017858e-05, - "loss": 3.4584, - "step": 12380 - }, - { - "epoch": 2.118129754679887, - "grad_norm": 1.5427072048187256, - "learning_rate": 1.4096714580207213e-05, - "loss": 3.4961, - "step": 12390 - }, - { - "epoch": 2.1198393025044875, - "grad_norm": 1.552883505821228, - "learning_rate": 1.4064052575458475e-05, - "loss": 3.5251, - "step": 12400 - }, - { - "epoch": 2.121548850329088, - "grad_norm": 1.5687028169631958, - "learning_rate": 1.4031434516601551e-05, - "loss": 3.4735, - "step": 12410 - }, - { - "epoch": 2.1232583981536886, - "grad_norm": 1.5004371404647827, - "learning_rate": 1.3998860510322609e-05, - "loss": 3.5287, - "step": 12420 - }, - { - "epoch": 2.124967945978289, - "grad_norm": 1.545844554901123, - "learning_rate": 1.3966330663163706e-05, - "loss": 3.4865, - "step": 12430 - }, - { - "epoch": 2.126677493802889, - "grad_norm": 1.4647488594055176, - "learning_rate": 1.3933845081522507e-05, - "loss": 3.4538, - "step": 12440 - }, - { - "epoch": 2.1283870416274895, - "grad_norm": 1.5729986429214478, - "learning_rate": 1.3901403871651859e-05, - "loss": 3.4634, - "step": 12450 - }, - { - "epoch": 2.13009658945209, - "grad_norm": 1.5696868896484375, - "learning_rate": 1.3869007139659507e-05, - "loss": 3.4728, - "step": 12460 - }, - { - "epoch": 2.1318061372766905, - "grad_norm": 1.556570053100586, - "learning_rate": 1.3836654991507686e-05, - "loss": 3.4802, - "step": 12470 - }, - { - "epoch": 2.133515685101291, - "grad_norm": 1.5223067998886108, - "learning_rate": 1.3804347533012846e-05, - "loss": 3.5224, - "step": 12480 - }, - { - "epoch": 2.135225232925891, - "grad_norm": 1.5621014833450317, - "learning_rate": 1.3772084869845233e-05, - "loss": 3.4965, - "step": 12490 - }, - { - "epoch": 2.1369347807504915, - "grad_norm": 1.4942463636398315, - "learning_rate": 1.3739867107528612e-05, - "loss": 3.4841, - "step": 12500 - }, - { - "epoch": 2.138644328575092, - "grad_norm": 1.553877830505371, - "learning_rate": 1.3707694351439851e-05, - "loss": 3.4953, - "step": 12510 - }, - { - "epoch": 2.140353876399692, - "grad_norm": 1.7334165573120117, - "learning_rate": 1.3675566706808635e-05, - "loss": 3.4712, - "step": 12520 - }, - { - "epoch": 2.142063424224293, - "grad_norm": 1.6699564456939697, - "learning_rate": 1.364348427871709e-05, - "loss": 3.5031, - "step": 12530 - }, - { - "epoch": 2.143772972048893, - "grad_norm": 1.526386022567749, - "learning_rate": 1.3611447172099478e-05, - "loss": 3.4845, - "step": 12540 - }, - { - "epoch": 2.1454825198734935, - "grad_norm": 1.5450302362442017, - "learning_rate": 1.3579455491741777e-05, - "loss": 3.4724, - "step": 12550 - }, - { - "epoch": 2.147192067698094, - "grad_norm": 1.6168586015701294, - "learning_rate": 1.3547509342281423e-05, - "loss": 3.4452, - "step": 12560 - }, - { - "epoch": 2.148901615522694, - "grad_norm": 1.528833031654358, - "learning_rate": 1.351560882820691e-05, - "loss": 3.48, - "step": 12570 - }, - { - "epoch": 2.1506111633472944, - "grad_norm": 1.525650143623352, - "learning_rate": 1.3483754053857494e-05, - "loss": 3.4803, - "step": 12580 - }, - { - "epoch": 2.152320711171895, - "grad_norm": 1.5675545930862427, - "learning_rate": 1.3451945123422794e-05, - "loss": 3.5272, - "step": 12590 - }, - { - "epoch": 2.1540302589964955, - "grad_norm": 1.5302727222442627, - "learning_rate": 1.3420182140942506e-05, - "loss": 3.5426, - "step": 12600 - }, - { - "epoch": 2.155739806821096, - "grad_norm": 1.6017122268676758, - "learning_rate": 1.3388465210306034e-05, - "loss": 3.5037, - "step": 12610 - }, - { - "epoch": 2.157449354645696, - "grad_norm": 1.49859619140625, - "learning_rate": 1.3356794435252162e-05, - "loss": 3.4925, - "step": 12620 - }, - { - "epoch": 2.1591589024702964, - "grad_norm": 1.5130283832550049, - "learning_rate": 1.3325169919368713e-05, - "loss": 3.4943, - "step": 12630 - }, - { - "epoch": 2.160868450294897, - "grad_norm": 1.5903511047363281, - "learning_rate": 1.3293591766092184e-05, - "loss": 3.4693, - "step": 12640 - }, - { - "epoch": 2.1625779981194975, - "grad_norm": 1.561729907989502, - "learning_rate": 1.3262060078707455e-05, - "loss": 3.4693, - "step": 12650 - }, - { - "epoch": 2.164287545944098, - "grad_norm": 1.526179313659668, - "learning_rate": 1.323057496034742e-05, - "loss": 3.468, - "step": 12660 - }, - { - "epoch": 2.165997093768698, - "grad_norm": 1.5185110569000244, - "learning_rate": 1.3199136513992661e-05, - "loss": 3.4804, - "step": 12670 - }, - { - "epoch": 2.1677066415932984, - "grad_norm": 1.6287295818328857, - "learning_rate": 1.3167744842471087e-05, - "loss": 3.4589, - "step": 12680 - }, - { - "epoch": 2.169416189417899, - "grad_norm": 1.5340545177459717, - "learning_rate": 1.3136400048457637e-05, - "loss": 3.5121, - "step": 12690 - }, - { - "epoch": 2.1711257372424995, - "grad_norm": 1.5690017938613892, - "learning_rate": 1.3105102234473915e-05, - "loss": 3.4619, - "step": 12700 - }, - { - "epoch": 2.1728352850670998, - "grad_norm": 1.4637198448181152, - "learning_rate": 1.307385150288788e-05, - "loss": 3.5355, - "step": 12710 - }, - { - "epoch": 2.1745448328917, - "grad_norm": 1.674858570098877, - "learning_rate": 1.304264795591347e-05, - "loss": 3.5371, - "step": 12720 - }, - { - "epoch": 2.1762543807163004, - "grad_norm": 1.5729752779006958, - "learning_rate": 1.3011491695610307e-05, - "loss": 3.4339, - "step": 12730 - }, - { - "epoch": 2.177963928540901, - "grad_norm": 1.5545339584350586, - "learning_rate": 1.2980382823883353e-05, - "loss": 3.5363, - "step": 12740 - }, - { - "epoch": 2.1796734763655015, - "grad_norm": 1.6165239810943604, - "learning_rate": 1.2949321442482562e-05, - "loss": 3.4063, - "step": 12750 - }, - { - "epoch": 2.1813830241901018, - "grad_norm": 1.5497561693191528, - "learning_rate": 1.291830765300257e-05, - "loss": 3.4667, - "step": 12760 - }, - { - "epoch": 2.183092572014702, - "grad_norm": 1.5250275135040283, - "learning_rate": 1.2887341556882343e-05, - "loss": 3.4679, - "step": 12770 - }, - { - "epoch": 2.1848021198393024, - "grad_norm": 1.542285442352295, - "learning_rate": 1.2856423255404854e-05, - "loss": 3.4485, - "step": 12780 - }, - { - "epoch": 2.1865116676639027, - "grad_norm": 1.6169849634170532, - "learning_rate": 1.282555284969676e-05, - "loss": 3.4706, - "step": 12790 - }, - { - "epoch": 2.1882212154885035, - "grad_norm": 1.5907604694366455, - "learning_rate": 1.2794730440728036e-05, - "loss": 3.4433, - "step": 12800 - }, - { - "epoch": 2.1899307633131038, - "grad_norm": 1.5525481700897217, - "learning_rate": 1.2763956129311693e-05, - "loss": 3.4934, - "step": 12810 - }, - { - "epoch": 2.191640311137704, - "grad_norm": 1.5504132509231567, - "learning_rate": 1.2733230016103432e-05, - "loss": 3.4921, - "step": 12820 - }, - { - "epoch": 2.1933498589623044, - "grad_norm": 1.6375573873519897, - "learning_rate": 1.2702552201601292e-05, - "loss": 3.4836, - "step": 12830 - }, - { - "epoch": 2.1950594067869047, - "grad_norm": 1.5626866817474365, - "learning_rate": 1.267192278614534e-05, - "loss": 3.4785, - "step": 12840 - }, - { - "epoch": 2.1967689546115055, - "grad_norm": 1.5184632539749146, - "learning_rate": 1.2641341869917344e-05, - "loss": 3.474, - "step": 12850 - }, - { - "epoch": 2.1984785024361058, - "grad_norm": 1.5994203090667725, - "learning_rate": 1.2610809552940451e-05, - "loss": 3.5021, - "step": 12860 - }, - { - "epoch": 2.200188050260706, - "grad_norm": 1.6299086809158325, - "learning_rate": 1.2580325935078838e-05, - "loss": 3.4908, - "step": 12870 - }, - { - "epoch": 2.2018975980853064, - "grad_norm": 1.4888657331466675, - "learning_rate": 1.254989111603741e-05, - "loss": 3.5205, - "step": 12880 - }, - { - "epoch": 2.2036071459099067, - "grad_norm": 1.5885967016220093, - "learning_rate": 1.2519505195361442e-05, - "loss": 3.4663, - "step": 12890 - }, - { - "epoch": 2.205316693734507, - "grad_norm": 1.5278260707855225, - "learning_rate": 1.2489168272436297e-05, - "loss": 3.4651, - "step": 12900 - }, - { - "epoch": 2.2070262415591078, - "grad_norm": 1.6347227096557617, - "learning_rate": 1.2458880446487062e-05, - "loss": 3.4845, - "step": 12910 - }, - { - "epoch": 2.208735789383708, - "grad_norm": 1.5426234006881714, - "learning_rate": 1.242864181657826e-05, - "loss": 3.4901, - "step": 12920 - }, - { - "epoch": 2.2104453372083084, - "grad_norm": 1.533155083656311, - "learning_rate": 1.2398452481613471e-05, - "loss": 3.507, - "step": 12930 - }, - { - "epoch": 2.2121548850329087, - "grad_norm": 1.611399531364441, - "learning_rate": 1.2368312540335068e-05, - "loss": 3.4849, - "step": 12940 - }, - { - "epoch": 2.213864432857509, - "grad_norm": 1.583790898323059, - "learning_rate": 1.2338222091323865e-05, - "loss": 3.4567, - "step": 12950 - }, - { - "epoch": 2.2155739806821098, - "grad_norm": 1.5620285272598267, - "learning_rate": 1.2308181232998804e-05, - "loss": 3.4956, - "step": 12960 - }, - { - "epoch": 2.21728352850671, - "grad_norm": 1.6146880388259888, - "learning_rate": 1.2278190063616594e-05, - "loss": 3.5142, - "step": 12970 - }, - { - "epoch": 2.2189930763313104, - "grad_norm": 1.493303656578064, - "learning_rate": 1.224824868127147e-05, - "loss": 3.4714, - "step": 12980 - }, - { - "epoch": 2.2207026241559107, - "grad_norm": 1.5347646474838257, - "learning_rate": 1.2218357183894797e-05, - "loss": 3.4649, - "step": 12990 - }, - { - "epoch": 2.222412171980511, - "grad_norm": 1.693292260169983, - "learning_rate": 1.218851566925479e-05, - "loss": 3.4788, - "step": 13000 - }, - { - "epoch": 2.222412171980511, - "eval_loss": 4.241896152496338, - "eval_runtime": 14.9926, - "eval_samples_per_second": 64.965, - "eval_steps_per_second": 0.867, - "step": 13000 - }, - { - "epoch": 2.2241217198051118, - "grad_norm": 1.5442662239074707, - "learning_rate": 1.215872423495616e-05, - "loss": 3.4858, - "step": 13010 - }, - { - "epoch": 2.225831267629712, - "grad_norm": 1.5760022401809692, - "learning_rate": 1.2128982978439842e-05, - "loss": 3.4482, - "step": 13020 - }, - { - "epoch": 2.2275408154543124, - "grad_norm": 1.5510753393173218, - "learning_rate": 1.2099291996982641e-05, - "loss": 3.5252, - "step": 13030 - }, - { - "epoch": 2.2292503632789127, - "grad_norm": 1.4661802053451538, - "learning_rate": 1.2069651387696927e-05, - "loss": 3.5343, - "step": 13040 - }, - { - "epoch": 2.230959911103513, - "grad_norm": 1.567291259765625, - "learning_rate": 1.2040061247530303e-05, - "loss": 3.4637, - "step": 13050 - }, - { - "epoch": 2.2326694589281137, - "grad_norm": 1.4744683504104614, - "learning_rate": 1.2010521673265309e-05, - "loss": 3.5287, - "step": 13060 - }, - { - "epoch": 2.234379006752714, - "grad_norm": 1.6249645948410034, - "learning_rate": 1.19810327615191e-05, - "loss": 3.5045, - "step": 13070 - }, - { - "epoch": 2.2360885545773144, - "grad_norm": 1.5532242059707642, - "learning_rate": 1.195159460874312e-05, - "loss": 3.4616, - "step": 13080 - }, - { - "epoch": 2.2377981024019147, - "grad_norm": 1.5068901777267456, - "learning_rate": 1.1922207311222783e-05, - "loss": 3.5185, - "step": 13090 - }, - { - "epoch": 2.239507650226515, - "grad_norm": 1.4816795587539673, - "learning_rate": 1.189287096507718e-05, - "loss": 3.4597, - "step": 13100 - }, - { - "epoch": 2.2412171980511153, - "grad_norm": 1.52855384349823, - "learning_rate": 1.1863585666258747e-05, - "loss": 3.5459, - "step": 13110 - }, - { - "epoch": 2.242926745875716, - "grad_norm": 1.5710679292678833, - "learning_rate": 1.1834351510552969e-05, - "loss": 3.4601, - "step": 13120 - }, - { - "epoch": 2.2446362937003164, - "grad_norm": 1.5301172733306885, - "learning_rate": 1.1805168593578022e-05, - "loss": 3.4801, - "step": 13130 - }, - { - "epoch": 2.2463458415249167, - "grad_norm": 1.4949095249176025, - "learning_rate": 1.1776037010784517e-05, - "loss": 3.5267, - "step": 13140 - }, - { - "epoch": 2.248055389349517, - "grad_norm": 1.5218536853790283, - "learning_rate": 1.174695685745516e-05, - "loss": 3.4541, - "step": 13150 - }, - { - "epoch": 2.2497649371741173, - "grad_norm": 1.5098601579666138, - "learning_rate": 1.171792822870444e-05, - "loss": 3.4724, - "step": 13160 - }, - { - "epoch": 2.2514744849987176, - "grad_norm": 1.5409384965896606, - "learning_rate": 1.1688951219478328e-05, - "loss": 3.523, - "step": 13170 - }, - { - "epoch": 2.2531840328233184, - "grad_norm": 1.611556053161621, - "learning_rate": 1.1660025924553936e-05, - "loss": 3.4438, - "step": 13180 - }, - { - "epoch": 2.2548935806479187, - "grad_norm": 1.5700173377990723, - "learning_rate": 1.1631152438539248e-05, - "loss": 3.4929, - "step": 13190 - }, - { - "epoch": 2.256603128472519, - "grad_norm": 1.5640277862548828, - "learning_rate": 1.1602330855872806e-05, - "loss": 3.4418, - "step": 13200 - }, - { - "epoch": 2.2583126762971193, - "grad_norm": 1.5610255002975464, - "learning_rate": 1.1573561270823373e-05, - "loss": 3.4834, - "step": 13210 - }, - { - "epoch": 2.2600222241217196, - "grad_norm": 1.5984883308410645, - "learning_rate": 1.1544843777489627e-05, - "loss": 3.4878, - "step": 13220 - }, - { - "epoch": 2.2617317719463204, - "grad_norm": 1.5799033641815186, - "learning_rate": 1.1516178469799887e-05, - "loss": 3.4813, - "step": 13230 - }, - { - "epoch": 2.2634413197709207, - "grad_norm": 1.601928949356079, - "learning_rate": 1.1487565441511776e-05, - "loss": 3.5192, - "step": 13240 - }, - { - "epoch": 2.265150867595521, - "grad_norm": 1.5968588590621948, - "learning_rate": 1.1459004786211934e-05, - "loss": 3.3924, - "step": 13250 - }, - { - "epoch": 2.2668604154201213, - "grad_norm": 1.5955078601837158, - "learning_rate": 1.1430496597315673e-05, - "loss": 3.4611, - "step": 13260 - }, - { - "epoch": 2.2685699632447216, - "grad_norm": 1.576472520828247, - "learning_rate": 1.1402040968066727e-05, - "loss": 3.4669, - "step": 13270 - }, - { - "epoch": 2.2702795110693224, - "grad_norm": 1.5610746145248413, - "learning_rate": 1.1373637991536914e-05, - "loss": 3.4305, - "step": 13280 - }, - { - "epoch": 2.2719890588939227, - "grad_norm": 1.5872756242752075, - "learning_rate": 1.1345287760625835e-05, - "loss": 3.4809, - "step": 13290 - }, - { - "epoch": 2.273698606718523, - "grad_norm": 1.5393967628479004, - "learning_rate": 1.1316990368060557e-05, - "loss": 3.4839, - "step": 13300 - }, - { - "epoch": 2.2754081545431233, - "grad_norm": 1.5615211725234985, - "learning_rate": 1.128874590639535e-05, - "loss": 3.4977, - "step": 13310 - }, - { - "epoch": 2.2771177023677236, - "grad_norm": 1.525169849395752, - "learning_rate": 1.1260554468011344e-05, - "loss": 3.4798, - "step": 13320 - }, - { - "epoch": 2.2788272501923243, - "grad_norm": 1.584350824356079, - "learning_rate": 1.1232416145116254e-05, - "loss": 3.4964, - "step": 13330 - }, - { - "epoch": 2.2805367980169247, - "grad_norm": 1.5722397565841675, - "learning_rate": 1.1204331029744047e-05, - "loss": 3.5333, - "step": 13340 - }, - { - "epoch": 2.282246345841525, - "grad_norm": 1.5394337177276611, - "learning_rate": 1.117629921375467e-05, - "loss": 3.4887, - "step": 13350 - }, - { - "epoch": 2.2839558936661253, - "grad_norm": 1.6659812927246094, - "learning_rate": 1.1148320788833752e-05, - "loss": 3.455, - "step": 13360 - }, - { - "epoch": 2.2856654414907256, - "grad_norm": 1.529876708984375, - "learning_rate": 1.1120395846492285e-05, - "loss": 3.4973, - "step": 13370 - }, - { - "epoch": 2.2873749893153263, - "grad_norm": 1.5481332540512085, - "learning_rate": 1.1092524478066314e-05, - "loss": 3.4634, - "step": 13380 - }, - { - "epoch": 2.2890845371399267, - "grad_norm": 1.5900788307189941, - "learning_rate": 1.1064706774716677e-05, - "loss": 3.4847, - "step": 13390 - }, - { - "epoch": 2.290794084964527, - "grad_norm": 1.5862674713134766, - "learning_rate": 1.1036942827428679e-05, - "loss": 3.4886, - "step": 13400 - }, - { - "epoch": 2.2925036327891273, - "grad_norm": 1.5092564821243286, - "learning_rate": 1.1009232727011803e-05, - "loss": 3.4447, - "step": 13410 - }, - { - "epoch": 2.2942131806137276, - "grad_norm": 1.614074468612671, - "learning_rate": 1.0981576564099414e-05, - "loss": 3.4545, - "step": 13420 - }, - { - "epoch": 2.295922728438328, - "grad_norm": 1.514497995376587, - "learning_rate": 1.0953974429148451e-05, - "loss": 3.4975, - "step": 13430 - }, - { - "epoch": 2.297632276262928, - "grad_norm": 1.5800952911376953, - "learning_rate": 1.0926426412439141e-05, - "loss": 3.4843, - "step": 13440 - }, - { - "epoch": 2.299341824087529, - "grad_norm": 1.5350944995880127, - "learning_rate": 1.0898932604074719e-05, - "loss": 3.5153, - "step": 13450 - }, - { - "epoch": 2.3010513719121293, - "grad_norm": 1.5409454107284546, - "learning_rate": 1.0871493093981102e-05, - "loss": 3.5141, - "step": 13460 - }, - { - "epoch": 2.3027609197367296, - "grad_norm": 1.5740351676940918, - "learning_rate": 1.08441079719066e-05, - "loss": 3.4351, - "step": 13470 - }, - { - "epoch": 2.30447046756133, - "grad_norm": 1.6539771556854248, - "learning_rate": 1.0816777327421655e-05, - "loss": 3.4928, - "step": 13480 - }, - { - "epoch": 2.30618001538593, - "grad_norm": 1.5417935848236084, - "learning_rate": 1.078950124991851e-05, - "loss": 3.4778, - "step": 13490 - }, - { - "epoch": 2.307889563210531, - "grad_norm": 1.5595388412475586, - "learning_rate": 1.0762279828610949e-05, - "loss": 3.4637, - "step": 13500 - }, - { - "epoch": 2.3095991110351313, - "grad_norm": 1.5763639211654663, - "learning_rate": 1.073511315253396e-05, - "loss": 3.4517, - "step": 13510 - }, - { - "epoch": 2.3113086588597316, - "grad_norm": 1.6174907684326172, - "learning_rate": 1.0708001310543494e-05, - "loss": 3.4894, - "step": 13520 - }, - { - "epoch": 2.313018206684332, - "grad_norm": 1.566760540008545, - "learning_rate": 1.0680944391316147e-05, - "loss": 3.4704, - "step": 13530 - }, - { - "epoch": 2.314727754508932, - "grad_norm": 1.600784420967102, - "learning_rate": 1.0653942483348883e-05, - "loss": 3.4429, - "step": 13540 - }, - { - "epoch": 2.316437302333533, - "grad_norm": 1.6054022312164307, - "learning_rate": 1.0626995674958717e-05, - "loss": 3.4569, - "step": 13550 - }, - { - "epoch": 2.3181468501581333, - "grad_norm": 1.546038269996643, - "learning_rate": 1.0600104054282457e-05, - "loss": 3.5007, - "step": 13560 - }, - { - "epoch": 2.3198563979827336, - "grad_norm": 1.5943323373794556, - "learning_rate": 1.0573267709276415e-05, - "loss": 3.427, - "step": 13570 - }, - { - "epoch": 2.321565945807334, - "grad_norm": 1.5938525199890137, - "learning_rate": 1.0546486727716096e-05, - "loss": 3.4786, - "step": 13580 - }, - { - "epoch": 2.323275493631934, - "grad_norm": 1.5921151638031006, - "learning_rate": 1.0519761197195921e-05, - "loss": 3.5042, - "step": 13590 - }, - { - "epoch": 2.324985041456535, - "grad_norm": 1.5935509204864502, - "learning_rate": 1.0493091205128955e-05, - "loss": 3.4787, - "step": 13600 - }, - { - "epoch": 2.3266945892811353, - "grad_norm": 1.5425726175308228, - "learning_rate": 1.0466476838746602e-05, - "loss": 3.482, - "step": 13610 - }, - { - "epoch": 2.3284041371057356, - "grad_norm": 1.6595596075057983, - "learning_rate": 1.0439918185098333e-05, - "loss": 3.4985, - "step": 13620 - }, - { - "epoch": 2.330113684930336, - "grad_norm": 1.5957894325256348, - "learning_rate": 1.0413415331051384e-05, - "loss": 3.4082, - "step": 13630 - }, - { - "epoch": 2.331823232754936, - "grad_norm": 1.5388213396072388, - "learning_rate": 1.03869683632905e-05, - "loss": 3.5013, - "step": 13640 - }, - { - "epoch": 2.333532780579537, - "grad_norm": 1.5300538539886475, - "learning_rate": 1.0360577368317623e-05, - "loss": 3.4781, - "step": 13650 - }, - { - "epoch": 2.3352423284041373, - "grad_norm": 1.5240168571472168, - "learning_rate": 1.0334242432451628e-05, - "loss": 3.5098, - "step": 13660 - }, - { - "epoch": 2.3369518762287376, - "grad_norm": 1.6283427476882935, - "learning_rate": 1.0307963641828014e-05, - "loss": 3.4729, - "step": 13670 - }, - { - "epoch": 2.338661424053338, - "grad_norm": 1.5240871906280518, - "learning_rate": 1.0281741082398671e-05, - "loss": 3.464, - "step": 13680 - }, - { - "epoch": 2.340370971877938, - "grad_norm": 1.4739030599594116, - "learning_rate": 1.0255574839931553e-05, - "loss": 3.5338, - "step": 13690 - }, - { - "epoch": 2.3420805197025385, - "grad_norm": 1.5820657014846802, - "learning_rate": 1.0229465000010422e-05, - "loss": 3.4851, - "step": 13700 - }, - { - "epoch": 2.3437900675271393, - "grad_norm": 1.597467064857483, - "learning_rate": 1.0203411648034545e-05, - "loss": 3.5102, - "step": 13710 - }, - { - "epoch": 2.3454996153517396, - "grad_norm": 1.5561059713363647, - "learning_rate": 1.0177414869218441e-05, - "loss": 3.4718, - "step": 13720 - }, - { - "epoch": 2.34720916317634, - "grad_norm": 1.5923752784729004, - "learning_rate": 1.0151474748591596e-05, - "loss": 3.4864, - "step": 13730 - }, - { - "epoch": 2.34891871100094, - "grad_norm": 1.5788894891738892, - "learning_rate": 1.0125591370998177e-05, - "loss": 3.4637, - "step": 13740 - }, - { - "epoch": 2.3506282588255405, - "grad_norm": 1.5612471103668213, - "learning_rate": 1.0099764821096752e-05, - "loss": 3.4733, - "step": 13750 - }, - { - "epoch": 2.352337806650141, - "grad_norm": 1.561638355255127, - "learning_rate": 1.0073995183360019e-05, - "loss": 3.4624, - "step": 13760 - }, - { - "epoch": 2.3540473544747416, - "grad_norm": 1.629211664199829, - "learning_rate": 1.0048282542074533e-05, - "loss": 3.5, - "step": 13770 - }, - { - "epoch": 2.355756902299342, - "grad_norm": 1.5979266166687012, - "learning_rate": 1.0022626981340427e-05, - "loss": 3.4909, - "step": 13780 - }, - { - "epoch": 2.357466450123942, - "grad_norm": 1.628868818283081, - "learning_rate": 9.997028585071145e-06, - "loss": 3.4574, - "step": 13790 - }, - { - "epoch": 2.3591759979485425, - "grad_norm": 1.599206805229187, - "learning_rate": 9.971487436993132e-06, - "loss": 3.4662, - "step": 13800 - }, - { - "epoch": 2.360885545773143, - "grad_norm": 1.4822697639465332, - "learning_rate": 9.946003620645612e-06, - "loss": 3.5583, - "step": 13810 - }, - { - "epoch": 2.3625950935977436, - "grad_norm": 1.5987297296524048, - "learning_rate": 9.92057721938029e-06, - "loss": 3.482, - "step": 13820 - }, - { - "epoch": 2.364304641422344, - "grad_norm": 1.594603180885315, - "learning_rate": 9.895208316361073e-06, - "loss": 3.4961, - "step": 13830 - }, - { - "epoch": 2.366014189246944, - "grad_norm": 1.5749777555465698, - "learning_rate": 9.869896994563787e-06, - "loss": 3.5129, - "step": 13840 - }, - { - "epoch": 2.3677237370715445, - "grad_norm": 1.5456875562667847, - "learning_rate": 9.844643336775961e-06, - "loss": 3.5073, - "step": 13850 - }, - { - "epoch": 2.369433284896145, - "grad_norm": 1.5770814418792725, - "learning_rate": 9.819447425596485e-06, - "loss": 3.4744, - "step": 13860 - }, - { - "epoch": 2.3711428327207456, - "grad_norm": 1.5004186630249023, - "learning_rate": 9.7943093434354e-06, - "loss": 3.4857, - "step": 13870 - }, - { - "epoch": 2.372852380545346, - "grad_norm": 1.656900405883789, - "learning_rate": 9.76922917251357e-06, - "loss": 3.4393, - "step": 13880 - }, - { - "epoch": 2.374561928369946, - "grad_norm": 1.512762188911438, - "learning_rate": 9.74420699486247e-06, - "loss": 3.48, - "step": 13890 - }, - { - "epoch": 2.3762714761945465, - "grad_norm": 1.590187668800354, - "learning_rate": 9.719242892323882e-06, - "loss": 3.5098, - "step": 13900 - }, - { - "epoch": 2.377981024019147, - "grad_norm": 1.6135469675064087, - "learning_rate": 9.69433694654965e-06, - "loss": 3.5268, - "step": 13910 - }, - { - "epoch": 2.3796905718437475, - "grad_norm": 1.595780372619629, - "learning_rate": 9.669489239001377e-06, - "loss": 3.4044, - "step": 13920 - }, - { - "epoch": 2.381400119668348, - "grad_norm": 1.5471464395523071, - "learning_rate": 9.6446998509502e-06, - "loss": 3.4651, - "step": 13930 - }, - { - "epoch": 2.383109667492948, - "grad_norm": 1.581976056098938, - "learning_rate": 9.619968863476506e-06, - "loss": 3.5202, - "step": 13940 - }, - { - "epoch": 2.3848192153175485, - "grad_norm": 1.599338412284851, - "learning_rate": 9.595296357469666e-06, - "loss": 3.4619, - "step": 13950 - }, - { - "epoch": 2.386528763142149, - "grad_norm": 1.573503017425537, - "learning_rate": 9.570682413627757e-06, - "loss": 3.4839, - "step": 13960 - }, - { - "epoch": 2.388238310966749, - "grad_norm": 1.5063750743865967, - "learning_rate": 9.546127112457331e-06, - "loss": 3.5167, - "step": 13970 - }, - { - "epoch": 2.38994785879135, - "grad_norm": 1.5325621366500854, - "learning_rate": 9.521630534273129e-06, - "loss": 3.5269, - "step": 13980 - }, - { - "epoch": 2.39165740661595, - "grad_norm": 1.5386368036270142, - "learning_rate": 9.497192759197817e-06, - "loss": 3.4546, - "step": 13990 - }, - { - "epoch": 2.3933669544405505, - "grad_norm": 1.58113431930542, - "learning_rate": 9.472813867161742e-06, - "loss": 3.4951, - "step": 14000 - }, - { - "epoch": 2.3933669544405505, - "eval_loss": 4.236238956451416, - "eval_runtime": 15.4568, - "eval_samples_per_second": 63.014, - "eval_steps_per_second": 0.841, - "step": 14000 - }, - { - "epoch": 2.395076502265151, - "grad_norm": 1.5124735832214355, - "learning_rate": 9.448493937902635e-06, - "loss": 3.4898, - "step": 14010 - }, - { - "epoch": 2.396786050089751, - "grad_norm": 1.518292784690857, - "learning_rate": 9.424233050965395e-06, - "loss": 3.4343, - "step": 14020 - }, - { - "epoch": 2.3984955979143514, - "grad_norm": 1.5973814725875854, - "learning_rate": 9.400031285701799e-06, - "loss": 3.4543, - "step": 14030 - }, - { - "epoch": 2.400205145738952, - "grad_norm": 1.5919679403305054, - "learning_rate": 9.375888721270257e-06, - "loss": 3.4772, - "step": 14040 - }, - { - "epoch": 2.4019146935635525, - "grad_norm": 1.5431290864944458, - "learning_rate": 9.35180543663553e-06, - "loss": 3.4644, - "step": 14050 - }, - { - "epoch": 2.403624241388153, - "grad_norm": 1.5697131156921387, - "learning_rate": 9.327781510568498e-06, - "loss": 3.4665, - "step": 14060 - }, - { - "epoch": 2.405333789212753, - "grad_norm": 1.6250470876693726, - "learning_rate": 9.303817021645905e-06, - "loss": 3.4795, - "step": 14070 - }, - { - "epoch": 2.4070433370373534, - "grad_norm": 1.5369354486465454, - "learning_rate": 9.279912048250083e-06, - "loss": 3.4422, - "step": 14080 - }, - { - "epoch": 2.408752884861954, - "grad_norm": 1.566135048866272, - "learning_rate": 9.256066668568685e-06, - "loss": 3.4979, - "step": 14090 - }, - { - "epoch": 2.4104624326865545, - "grad_norm": 1.611475944519043, - "learning_rate": 9.232280960594465e-06, - "loss": 3.498, - "step": 14100 - }, - { - "epoch": 2.412171980511155, - "grad_norm": 1.5692287683486938, - "learning_rate": 9.208555002124998e-06, - "loss": 3.4976, - "step": 14110 - }, - { - "epoch": 2.413881528335755, - "grad_norm": 1.5273486375808716, - "learning_rate": 9.184888870762443e-06, - "loss": 3.4401, - "step": 14120 - }, - { - "epoch": 2.4155910761603554, - "grad_norm": 1.5379239320755005, - "learning_rate": 9.161282643913256e-06, - "loss": 3.4595, - "step": 14130 - }, - { - "epoch": 2.417300623984956, - "grad_norm": 1.5572803020477295, - "learning_rate": 9.137736398787978e-06, - "loss": 3.5201, - "step": 14140 - }, - { - "epoch": 2.4190101718095565, - "grad_norm": 1.5785205364227295, - "learning_rate": 9.114250212400954e-06, - "loss": 3.4871, - "step": 14150 - }, - { - "epoch": 2.4207197196341568, - "grad_norm": 1.59848952293396, - "learning_rate": 9.09082416157011e-06, - "loss": 3.4667, - "step": 14160 - }, - { - "epoch": 2.422429267458757, - "grad_norm": 1.529939889907837, - "learning_rate": 9.067458322916645e-06, - "loss": 3.5029, - "step": 14170 - }, - { - "epoch": 2.4241388152833574, - "grad_norm": 1.5984946489334106, - "learning_rate": 9.04415277286485e-06, - "loss": 3.4792, - "step": 14180 - }, - { - "epoch": 2.425848363107958, - "grad_norm": 1.5384325981140137, - "learning_rate": 9.020907587641817e-06, - "loss": 3.5502, - "step": 14190 - }, - { - "epoch": 2.4275579109325585, - "grad_norm": 1.5560745000839233, - "learning_rate": 8.997722843277199e-06, - "loss": 3.5176, - "step": 14200 - }, - { - "epoch": 2.4292674587571588, - "grad_norm": 1.5412474870681763, - "learning_rate": 8.974598615602948e-06, - "loss": 3.4563, - "step": 14210 - }, - { - "epoch": 2.430977006581759, - "grad_norm": 1.6349189281463623, - "learning_rate": 8.951534980253102e-06, - "loss": 3.5075, - "step": 14220 - }, - { - "epoch": 2.4326865544063594, - "grad_norm": 1.6454579830169678, - "learning_rate": 8.928532012663499e-06, - "loss": 3.4856, - "step": 14230 - }, - { - "epoch": 2.43439610223096, - "grad_norm": 1.6096782684326172, - "learning_rate": 8.905589788071558e-06, - "loss": 3.4366, - "step": 14240 - }, - { - "epoch": 2.4361056500555605, - "grad_norm": 1.5786097049713135, - "learning_rate": 8.882708381516003e-06, - "loss": 3.4547, - "step": 14250 - }, - { - "epoch": 2.4378151978801608, - "grad_norm": 1.571348786354065, - "learning_rate": 8.859887867836662e-06, - "loss": 3.5105, - "step": 14260 - }, - { - "epoch": 2.439524745704761, - "grad_norm": 1.5501421689987183, - "learning_rate": 8.837128321674174e-06, - "loss": 3.4942, - "step": 14270 - }, - { - "epoch": 2.4412342935293614, - "grad_norm": 1.6200729608535767, - "learning_rate": 8.81442981746978e-06, - "loss": 3.4902, - "step": 14280 - }, - { - "epoch": 2.4429438413539617, - "grad_norm": 1.6052160263061523, - "learning_rate": 8.791792429465065e-06, - "loss": 3.403, - "step": 14290 - }, - { - "epoch": 2.4446533891785625, - "grad_norm": 1.645658254623413, - "learning_rate": 8.769216231701715e-06, - "loss": 3.4643, - "step": 14300 - }, - { - "epoch": 2.4463629370031628, - "grad_norm": 1.5763663053512573, - "learning_rate": 8.746701298021277e-06, - "loss": 3.5119, - "step": 14310 - }, - { - "epoch": 2.448072484827763, - "grad_norm": 1.5959539413452148, - "learning_rate": 8.724247702064914e-06, - "loss": 3.4262, - "step": 14320 - }, - { - "epoch": 2.4497820326523634, - "grad_norm": 1.5789684057235718, - "learning_rate": 8.70185551727318e-06, - "loss": 3.4731, - "step": 14330 - }, - { - "epoch": 2.4514915804769637, - "grad_norm": 1.5542010068893433, - "learning_rate": 8.679524816885745e-06, - "loss": 3.515, - "step": 14340 - }, - { - "epoch": 2.453201128301564, - "grad_norm": 1.5897349119186401, - "learning_rate": 8.657255673941196e-06, - "loss": 3.4346, - "step": 14350 - }, - { - "epoch": 2.4549106761261648, - "grad_norm": 1.581602692604065, - "learning_rate": 8.635048161276778e-06, - "loss": 3.5032, - "step": 14360 - }, - { - "epoch": 2.456620223950765, - "grad_norm": 1.5077800750732422, - "learning_rate": 8.612902351528154e-06, - "loss": 3.4337, - "step": 14370 - }, - { - "epoch": 2.4583297717753654, - "grad_norm": 1.60201895236969, - "learning_rate": 8.590818317129164e-06, - "loss": 3.5071, - "step": 14380 - }, - { - "epoch": 2.4600393195999657, - "grad_norm": 1.6140286922454834, - "learning_rate": 8.568796130311607e-06, - "loss": 3.4641, - "step": 14390 - }, - { - "epoch": 2.461748867424566, - "grad_norm": 1.5946362018585205, - "learning_rate": 8.546835863104986e-06, - "loss": 3.4803, - "step": 14400 - }, - { - "epoch": 2.4634584152491668, - "grad_norm": 1.6644922494888306, - "learning_rate": 8.524937587336288e-06, - "loss": 3.5076, - "step": 14410 - }, - { - "epoch": 2.465167963073767, - "grad_norm": 1.6053881645202637, - "learning_rate": 8.50310137462972e-06, - "loss": 3.4394, - "step": 14420 - }, - { - "epoch": 2.4668775108983674, - "grad_norm": 1.6288022994995117, - "learning_rate": 8.481327296406519e-06, - "loss": 3.443, - "step": 14430 - }, - { - "epoch": 2.4685870587229677, - "grad_norm": 1.491320252418518, - "learning_rate": 8.45961542388468e-06, - "loss": 3.5193, - "step": 14440 - }, - { - "epoch": 2.470296606547568, - "grad_norm": 1.5719027519226074, - "learning_rate": 8.437965828078746e-06, - "loss": 3.4974, - "step": 14450 - }, - { - "epoch": 2.4720061543721688, - "grad_norm": 1.5726176500320435, - "learning_rate": 8.416378579799549e-06, - "loss": 3.5031, - "step": 14460 - }, - { - "epoch": 2.473715702196769, - "grad_norm": 1.5793628692626953, - "learning_rate": 8.394853749654023e-06, - "loss": 3.5316, - "step": 14470 - }, - { - "epoch": 2.4754252500213694, - "grad_norm": 1.5251953601837158, - "learning_rate": 8.373391408044926e-06, - "loss": 3.4799, - "step": 14480 - }, - { - "epoch": 2.4771347978459697, - "grad_norm": 1.5378332138061523, - "learning_rate": 8.351991625170649e-06, - "loss": 3.4708, - "step": 14490 - }, - { - "epoch": 2.47884434567057, - "grad_norm": 1.554356336593628, - "learning_rate": 8.330654471024936e-06, - "loss": 3.4818, - "step": 14500 - }, - { - "epoch": 2.4805538934951707, - "grad_norm": 1.552653193473816, - "learning_rate": 8.309380015396725e-06, - "loss": 3.4417, - "step": 14510 - }, - { - "epoch": 2.482263441319771, - "grad_norm": 1.573130488395691, - "learning_rate": 8.288168327869863e-06, - "loss": 3.4315, - "step": 14520 - }, - { - "epoch": 2.4839729891443714, - "grad_norm": 1.5173275470733643, - "learning_rate": 8.2670194778229e-06, - "loss": 3.4881, - "step": 14530 - }, - { - "epoch": 2.4856825369689717, - "grad_norm": 1.5558350086212158, - "learning_rate": 8.245933534428845e-06, - "loss": 3.5103, - "step": 14540 - }, - { - "epoch": 2.487392084793572, - "grad_norm": 1.5627878904342651, - "learning_rate": 8.224910566654973e-06, - "loss": 3.489, - "step": 14550 - }, - { - "epoch": 2.4891016326181723, - "grad_norm": 1.602602243423462, - "learning_rate": 8.203950643262576e-06, - "loss": 3.428, - "step": 14560 - }, - { - "epoch": 2.490811180442773, - "grad_norm": 1.6493867635726929, - "learning_rate": 8.183053832806737e-06, - "loss": 3.4609, - "step": 14570 - }, - { - "epoch": 2.4925207282673734, - "grad_norm": 1.598954439163208, - "learning_rate": 8.162220203636112e-06, - "loss": 3.4757, - "step": 14580 - }, - { - "epoch": 2.4942302760919737, - "grad_norm": 1.625220775604248, - "learning_rate": 8.141449823892707e-06, - "loss": 3.4526, - "step": 14590 - }, - { - "epoch": 2.495939823916574, - "grad_norm": 1.6900430917739868, - "learning_rate": 8.12074276151166e-06, - "loss": 3.4564, - "step": 14600 - }, - { - "epoch": 2.4976493717411743, - "grad_norm": 1.5862665176391602, - "learning_rate": 8.100099084220997e-06, - "loss": 3.4925, - "step": 14610 - }, - { - "epoch": 2.4993589195657746, - "grad_norm": 1.5359658002853394, - "learning_rate": 8.079518859541447e-06, - "loss": 3.5289, - "step": 14620 - }, - { - "epoch": 2.5010684673903754, - "grad_norm": 1.6525688171386719, - "learning_rate": 8.059002154786176e-06, - "loss": 3.5285, - "step": 14630 - }, - { - "epoch": 2.5027780152149757, - "grad_norm": 1.5398362874984741, - "learning_rate": 8.038549037060612e-06, - "loss": 3.4627, - "step": 14640 - }, - { - "epoch": 2.504487563039576, - "grad_norm": 1.5572253465652466, - "learning_rate": 8.018159573262192e-06, - "loss": 3.4282, - "step": 14650 - }, - { - "epoch": 2.5061971108641763, - "grad_norm": 1.6275733709335327, - "learning_rate": 7.997833830080167e-06, - "loss": 3.4884, - "step": 14660 - }, - { - "epoch": 2.5079066586887766, - "grad_norm": 1.531006097793579, - "learning_rate": 7.977571873995353e-06, - "loss": 3.5202, - "step": 14670 - }, - { - "epoch": 2.5096162065133774, - "grad_norm": 1.5890415906906128, - "learning_rate": 7.957373771279952e-06, - "loss": 3.5064, - "step": 14680 - }, - { - "epoch": 2.5113257543379777, - "grad_norm": 1.6101802587509155, - "learning_rate": 7.937239587997308e-06, - "loss": 3.4892, - "step": 14690 - }, - { - "epoch": 2.513035302162578, - "grad_norm": 1.5846967697143555, - "learning_rate": 7.917169390001707e-06, - "loss": 3.4587, - "step": 14700 - }, - { - "epoch": 2.5147448499871783, - "grad_norm": 1.5637435913085938, - "learning_rate": 7.897163242938133e-06, - "loss": 3.4755, - "step": 14710 - }, - { - "epoch": 2.5164543978117786, - "grad_norm": 1.5079200267791748, - "learning_rate": 7.877221212242098e-06, - "loss": 3.5306, - "step": 14720 - }, - { - "epoch": 2.5181639456363794, - "grad_norm": 1.5671281814575195, - "learning_rate": 7.857343363139399e-06, - "loss": 3.5099, - "step": 14730 - }, - { - "epoch": 2.5198734934609797, - "grad_norm": 1.5503950119018555, - "learning_rate": 7.837529760645905e-06, - "loss": 3.4423, - "step": 14740 - }, - { - "epoch": 2.52158304128558, - "grad_norm": 1.658644676208496, - "learning_rate": 7.817780469567341e-06, - "loss": 3.4438, - "step": 14750 - }, - { - "epoch": 2.5232925891101803, - "grad_norm": 1.642662525177002, - "learning_rate": 7.7980955544991e-06, - "loss": 3.4651, - "step": 14760 - }, - { - "epoch": 2.5250021369347806, - "grad_norm": 1.5603840351104736, - "learning_rate": 7.778475079826003e-06, - "loss": 3.5084, - "step": 14770 - }, - { - "epoch": 2.5267116847593813, - "grad_norm": 1.5619406700134277, - "learning_rate": 7.758919109722117e-06, - "loss": 3.4705, - "step": 14780 - }, - { - "epoch": 2.5284212325839817, - "grad_norm": 1.5967891216278076, - "learning_rate": 7.739427708150508e-06, - "loss": 3.5094, - "step": 14790 - }, - { - "epoch": 2.530130780408582, - "grad_norm": 1.5418777465820312, - "learning_rate": 7.720000938863071e-06, - "loss": 3.4986, - "step": 14800 - }, - { - "epoch": 2.5318403282331823, - "grad_norm": 1.5510917901992798, - "learning_rate": 7.700638865400297e-06, - "loss": 3.4866, - "step": 14810 - }, - { - "epoch": 2.5335498760577826, - "grad_norm": 1.583345890045166, - "learning_rate": 7.681341551091076e-06, - "loss": 3.5302, - "step": 14820 - }, - { - "epoch": 2.5352594238823833, - "grad_norm": 1.5534437894821167, - "learning_rate": 7.662109059052471e-06, - "loss": 3.4641, - "step": 14830 - }, - { - "epoch": 2.5369689717069837, - "grad_norm": 1.5720946788787842, - "learning_rate": 7.642941452189545e-06, - "loss": 3.4646, - "step": 14840 - }, - { - "epoch": 2.538678519531584, - "grad_norm": 1.6022883653640747, - "learning_rate": 7.623838793195128e-06, - "loss": 3.494, - "step": 14850 - }, - { - "epoch": 2.5403880673561843, - "grad_norm": 1.532145619392395, - "learning_rate": 7.60480114454962e-06, - "loss": 3.4694, - "step": 14860 - }, - { - "epoch": 2.5420976151807846, - "grad_norm": 1.5419657230377197, - "learning_rate": 7.585828568520794e-06, - "loss": 3.4939, - "step": 14870 - }, - { - "epoch": 2.5438071630053853, - "grad_norm": 1.570682406425476, - "learning_rate": 7.566921127163569e-06, - "loss": 3.4617, - "step": 14880 - }, - { - "epoch": 2.545516710829985, - "grad_norm": 1.6500587463378906, - "learning_rate": 7.54807888231984e-06, - "loss": 3.4756, - "step": 14890 - }, - { - "epoch": 2.547226258654586, - "grad_norm": 1.5383124351501465, - "learning_rate": 7.529301895618254e-06, - "loss": 3.5291, - "step": 14900 - }, - { - "epoch": 2.5489358064791863, - "grad_norm": 1.5403473377227783, - "learning_rate": 7.510590228474015e-06, - "loss": 3.4964, - "step": 14910 - }, - { - "epoch": 2.5506453543037866, - "grad_norm": 1.6596778631210327, - "learning_rate": 7.4919439420886765e-06, - "loss": 3.4864, - "step": 14920 - }, - { - "epoch": 2.552354902128387, - "grad_norm": 1.5545663833618164, - "learning_rate": 7.473363097449943e-06, - "loss": 3.4477, - "step": 14930 - }, - { - "epoch": 2.554064449952987, - "grad_norm": 1.5662345886230469, - "learning_rate": 7.4548477553314955e-06, - "loss": 3.4744, - "step": 14940 - }, - { - "epoch": 2.555773997777588, - "grad_norm": 1.624878168106079, - "learning_rate": 7.436397976292752e-06, - "loss": 3.4845, - "step": 14950 - }, - { - "epoch": 2.5574835456021883, - "grad_norm": 1.516316533088684, - "learning_rate": 7.418013820678687e-06, - "loss": 3.523, - "step": 14960 - }, - { - "epoch": 2.5591930934267886, - "grad_norm": 1.6222889423370361, - "learning_rate": 7.3996953486196505e-06, - "loss": 3.4805, - "step": 14970 - }, - { - "epoch": 2.560902641251389, - "grad_norm": 1.5746357440948486, - "learning_rate": 7.381442620031144e-06, - "loss": 3.4845, - "step": 14980 - }, - { - "epoch": 2.562612189075989, - "grad_norm": 1.5857216119766235, - "learning_rate": 7.3632556946136464e-06, - "loss": 3.4842, - "step": 14990 - }, - { - "epoch": 2.56432173690059, - "grad_norm": 1.5834261178970337, - "learning_rate": 7.345134631852397e-06, - "loss": 3.4974, - "step": 15000 - }, - { - "epoch": 2.56432173690059, - "eval_loss": 4.23234224319458, - "eval_runtime": 15.2985, - "eval_samples_per_second": 63.666, - "eval_steps_per_second": 0.85, - "step": 15000 - }, - { - "epoch": 2.5660312847251903, - "grad_norm": 1.603843331336975, - "learning_rate": 7.327079491017229e-06, - "loss": 3.5096, - "step": 15010 - }, - { - "epoch": 2.5677408325497906, - "grad_norm": 1.5631141662597656, - "learning_rate": 7.309090331162346e-06, - "loss": 3.4732, - "step": 15020 - }, - { - "epoch": 2.569450380374391, - "grad_norm": 1.54705011844635, - "learning_rate": 7.291167211126164e-06, - "loss": 3.4553, - "step": 15030 - }, - { - "epoch": 2.571159928198991, - "grad_norm": 1.5998766422271729, - "learning_rate": 7.273310189531067e-06, - "loss": 3.5009, - "step": 15040 - }, - { - "epoch": 2.572869476023592, - "grad_norm": 1.592857837677002, - "learning_rate": 7.25551932478327e-06, - "loss": 3.513, - "step": 15050 - }, - { - "epoch": 2.5745790238481923, - "grad_norm": 1.655718207359314, - "learning_rate": 7.237794675072596e-06, - "loss": 3.4567, - "step": 15060 - }, - { - "epoch": 2.5762885716727926, - "grad_norm": 1.613963007926941, - "learning_rate": 7.220136298372301e-06, - "loss": 3.4445, - "step": 15070 - }, - { - "epoch": 2.577998119497393, - "grad_norm": 1.5572508573532104, - "learning_rate": 7.202544252438861e-06, - "loss": 3.5091, - "step": 15080 - }, - { - "epoch": 2.579707667321993, - "grad_norm": 1.5239120721817017, - "learning_rate": 7.185018594811817e-06, - "loss": 3.4655, - "step": 15090 - }, - { - "epoch": 2.581417215146594, - "grad_norm": 1.6304435729980469, - "learning_rate": 7.167559382813559e-06, - "loss": 3.4783, - "step": 15100 - }, - { - "epoch": 2.5831267629711943, - "grad_norm": 1.5935367345809937, - "learning_rate": 7.1501666735491555e-06, - "loss": 3.4648, - "step": 15110 - }, - { - "epoch": 2.5848363107957946, - "grad_norm": 1.5939339399337769, - "learning_rate": 7.132840523906145e-06, - "loss": 3.5398, - "step": 15120 - }, - { - "epoch": 2.586545858620395, - "grad_norm": 1.5915974378585815, - "learning_rate": 7.115580990554384e-06, - "loss": 3.5105, - "step": 15130 - }, - { - "epoch": 2.588255406444995, - "grad_norm": 1.5749688148498535, - "learning_rate": 7.098388129945832e-06, - "loss": 3.4829, - "step": 15140 - }, - { - "epoch": 2.589964954269596, - "grad_norm": 1.5355912446975708, - "learning_rate": 7.081261998314379e-06, - "loss": 3.4813, - "step": 15150 - }, - { - "epoch": 2.591674502094196, - "grad_norm": 1.5493956804275513, - "learning_rate": 7.064202651675661e-06, - "loss": 3.4471, - "step": 15160 - }, - { - "epoch": 2.5933840499187966, - "grad_norm": 1.4904630184173584, - "learning_rate": 7.047210145826874e-06, - "loss": 3.5119, - "step": 15170 - }, - { - "epoch": 2.595093597743397, - "grad_norm": 1.5285837650299072, - "learning_rate": 7.0302845363465916e-06, - "loss": 3.5115, - "step": 15180 - }, - { - "epoch": 2.596803145567997, - "grad_norm": 1.5241292715072632, - "learning_rate": 7.013425878594594e-06, - "loss": 3.4683, - "step": 15190 - }, - { - "epoch": 2.5985126933925975, - "grad_norm": 1.5343016386032104, - "learning_rate": 6.996634227711667e-06, - "loss": 3.4511, - "step": 15200 - }, - { - "epoch": 2.600222241217198, - "grad_norm": 1.6006766557693481, - "learning_rate": 6.979909638619432e-06, - "loss": 3.4761, - "step": 15210 - }, - { - "epoch": 2.6019317890417986, - "grad_norm": 1.573333740234375, - "learning_rate": 6.963252166020173e-06, - "loss": 3.4627, - "step": 15220 - }, - { - "epoch": 2.603641336866399, - "grad_norm": 1.6183596849441528, - "learning_rate": 6.946661864396654e-06, - "loss": 3.4986, - "step": 15230 - }, - { - "epoch": 2.605350884690999, - "grad_norm": 1.571237325668335, - "learning_rate": 6.930138788011931e-06, - "loss": 3.518, - "step": 15240 - }, - { - "epoch": 2.6070604325155995, - "grad_norm": 1.5077455043792725, - "learning_rate": 6.913682990909179e-06, - "loss": 3.4766, - "step": 15250 - }, - { - "epoch": 2.6087699803402, - "grad_norm": 1.6395853757858276, - "learning_rate": 6.8972945269115284e-06, - "loss": 3.4913, - "step": 15260 - }, - { - "epoch": 2.6104795281648006, - "grad_norm": 1.6249921321868896, - "learning_rate": 6.880973449621869e-06, - "loss": 3.4453, - "step": 15270 - }, - { - "epoch": 2.612189075989401, - "grad_norm": 1.6368459463119507, - "learning_rate": 6.864719812422698e-06, - "loss": 3.4861, - "step": 15280 - }, - { - "epoch": 2.613898623814001, - "grad_norm": 1.5408482551574707, - "learning_rate": 6.8485336684759126e-06, - "loss": 3.4473, - "step": 15290 - }, - { - "epoch": 2.6156081716386015, - "grad_norm": 1.5803616046905518, - "learning_rate": 6.832415070722664e-06, - "loss": 3.5212, - "step": 15300 - }, - { - "epoch": 2.617317719463202, - "grad_norm": 1.5303839445114136, - "learning_rate": 6.81636407188318e-06, - "loss": 3.4756, - "step": 15310 - }, - { - "epoch": 2.6190272672878026, - "grad_norm": 1.592599630355835, - "learning_rate": 6.800380724456588e-06, - "loss": 3.4705, - "step": 15320 - }, - { - "epoch": 2.620736815112403, - "grad_norm": 1.5929478406906128, - "learning_rate": 6.78446508072073e-06, - "loss": 3.4614, - "step": 15330 - }, - { - "epoch": 2.622446362937003, - "grad_norm": 1.5237799882888794, - "learning_rate": 6.768617192732017e-06, - "loss": 3.5319, - "step": 15340 - }, - { - "epoch": 2.6241559107616035, - "grad_norm": 1.5368871688842773, - "learning_rate": 6.752837112325243e-06, - "loss": 3.4814, - "step": 15350 - }, - { - "epoch": 2.625865458586204, - "grad_norm": 1.6269627809524536, - "learning_rate": 6.7371248911134245e-06, - "loss": 3.4271, - "step": 15360 - }, - { - "epoch": 2.6275750064108045, - "grad_norm": 1.5989304780960083, - "learning_rate": 6.72148058048761e-06, - "loss": 3.4654, - "step": 15370 - }, - { - "epoch": 2.629284554235405, - "grad_norm": 1.5785729885101318, - "learning_rate": 6.705904231616747e-06, - "loss": 3.5077, - "step": 15380 - }, - { - "epoch": 2.630994102060005, - "grad_norm": 1.5278352499008179, - "learning_rate": 6.690395895447488e-06, - "loss": 3.4799, - "step": 15390 - }, - { - "epoch": 2.6327036498846055, - "grad_norm": 1.564408540725708, - "learning_rate": 6.674955622704035e-06, - "loss": 3.5017, - "step": 15400 - }, - { - "epoch": 2.634413197709206, - "grad_norm": 1.5855345726013184, - "learning_rate": 6.659583463887954e-06, - "loss": 3.4918, - "step": 15410 - }, - { - "epoch": 2.6361227455338065, - "grad_norm": 1.5776944160461426, - "learning_rate": 6.644279469278051e-06, - "loss": 3.5083, - "step": 15420 - }, - { - "epoch": 2.637832293358407, - "grad_norm": 1.519146203994751, - "learning_rate": 6.629043688930161e-06, - "loss": 3.4957, - "step": 15430 - }, - { - "epoch": 2.639541841183007, - "grad_norm": 1.6147756576538086, - "learning_rate": 6.613876172677027e-06, - "loss": 3.4664, - "step": 15440 - }, - { - "epoch": 2.6412513890076075, - "grad_norm": 1.5790156126022339, - "learning_rate": 6.598776970128091e-06, - "loss": 3.4945, - "step": 15450 - }, - { - "epoch": 2.642960936832208, - "grad_norm": 1.5960100889205933, - "learning_rate": 6.583746130669376e-06, - "loss": 3.4354, - "step": 15460 - }, - { - "epoch": 2.6446704846568085, - "grad_norm": 1.5889085531234741, - "learning_rate": 6.5687837034632995e-06, - "loss": 3.4968, - "step": 15470 - }, - { - "epoch": 2.6463800324814084, - "grad_norm": 1.559181809425354, - "learning_rate": 6.553889737448513e-06, - "loss": 3.5152, - "step": 15480 - }, - { - "epoch": 2.648089580306009, - "grad_norm": 1.5886166095733643, - "learning_rate": 6.539064281339758e-06, - "loss": 3.4652, - "step": 15490 - }, - { - "epoch": 2.6497991281306095, - "grad_norm": 1.5852969884872437, - "learning_rate": 6.524307383627682e-06, - "loss": 3.4883, - "step": 15500 - }, - { - "epoch": 2.65150867595521, - "grad_norm": 1.517879843711853, - "learning_rate": 6.509619092578703e-06, - "loss": 3.4761, - "step": 15510 - }, - { - "epoch": 2.65321822377981, - "grad_norm": 1.577673316001892, - "learning_rate": 6.494999456234844e-06, - "loss": 3.5317, - "step": 15520 - }, - { - "epoch": 2.6549277716044104, - "grad_norm": 1.5800429582595825, - "learning_rate": 6.4804485224135734e-06, - "loss": 3.5168, - "step": 15530 - }, - { - "epoch": 2.656637319429011, - "grad_norm": 1.451449990272522, - "learning_rate": 6.465966338707639e-06, - "loss": 3.5089, - "step": 15540 - }, - { - "epoch": 2.6583468672536115, - "grad_norm": 1.632088303565979, - "learning_rate": 6.451552952484932e-06, - "loss": 3.4716, - "step": 15550 - }, - { - "epoch": 2.6600564150782118, - "grad_norm": 1.5247554779052734, - "learning_rate": 6.437208410888326e-06, - "loss": 3.4503, - "step": 15560 - }, - { - "epoch": 2.661765962902812, - "grad_norm": 1.542463779449463, - "learning_rate": 6.422932760835516e-06, - "loss": 3.473, - "step": 15570 - }, - { - "epoch": 2.6634755107274124, - "grad_norm": 1.505157232284546, - "learning_rate": 6.4087260490188584e-06, - "loss": 3.4267, - "step": 15580 - }, - { - "epoch": 2.665185058552013, - "grad_norm": 1.4808629751205444, - "learning_rate": 6.394588321905248e-06, - "loss": 3.5262, - "step": 15590 - }, - { - "epoch": 2.6668946063766135, - "grad_norm": 1.574536919593811, - "learning_rate": 6.380519625735932e-06, - "loss": 3.4919, - "step": 15600 - }, - { - "epoch": 2.6686041542012138, - "grad_norm": 1.5207796096801758, - "learning_rate": 6.366520006526386e-06, - "loss": 3.5219, - "step": 15610 - }, - { - "epoch": 2.670313702025814, - "grad_norm": 1.6055320501327515, - "learning_rate": 6.352589510066127e-06, - "loss": 3.495, - "step": 15620 - }, - { - "epoch": 2.6720232498504144, - "grad_norm": 1.5443027019500732, - "learning_rate": 6.3387281819186145e-06, - "loss": 3.4722, - "step": 15630 - }, - { - "epoch": 2.673732797675015, - "grad_norm": 1.5386123657226562, - "learning_rate": 6.324936067421057e-06, - "loss": 3.5005, - "step": 15640 - }, - { - "epoch": 2.6754423454996155, - "grad_norm": 1.541304111480713, - "learning_rate": 6.311213211684295e-06, - "loss": 3.5391, - "step": 15650 - }, - { - "epoch": 2.6771518933242158, - "grad_norm": 1.5200693607330322, - "learning_rate": 6.297559659592621e-06, - "loss": 3.4552, - "step": 15660 - }, - { - "epoch": 2.678861441148816, - "grad_norm": 1.5660852193832397, - "learning_rate": 6.283975455803666e-06, - "loss": 3.4638, - "step": 15670 - }, - { - "epoch": 2.6805709889734164, - "grad_norm": 1.5799615383148193, - "learning_rate": 6.270460644748233e-06, - "loss": 3.4711, - "step": 15680 - }, - { - "epoch": 2.682280536798017, - "grad_norm": 1.611094355583191, - "learning_rate": 6.257015270630165e-06, - "loss": 3.4926, - "step": 15690 - }, - { - "epoch": 2.6839900846226175, - "grad_norm": 1.7782231569290161, - "learning_rate": 6.2436393774261785e-06, - "loss": 3.4581, - "step": 15700 - }, - { - "epoch": 2.6856996324472178, - "grad_norm": 1.6342053413391113, - "learning_rate": 6.230333008885744e-06, - "loss": 3.5187, - "step": 15710 - }, - { - "epoch": 2.687409180271818, - "grad_norm": 1.6415696144104004, - "learning_rate": 6.217096208530931e-06, - "loss": 3.4421, - "step": 15720 - }, - { - "epoch": 2.6891187280964184, - "grad_norm": 1.539741039276123, - "learning_rate": 6.20392901965627e-06, - "loss": 3.45, - "step": 15730 - }, - { - "epoch": 2.690828275921019, - "grad_norm": 1.586663007736206, - "learning_rate": 6.190831485328602e-06, - "loss": 3.4114, - "step": 15740 - }, - { - "epoch": 2.692537823745619, - "grad_norm": 1.5391227006912231, - "learning_rate": 6.177803648386948e-06, - "loss": 3.428, - "step": 15750 - }, - { - "epoch": 2.6942473715702198, - "grad_norm": 1.613932728767395, - "learning_rate": 6.1648455514423625e-06, - "loss": 3.4445, - "step": 15760 - }, - { - "epoch": 2.69595691939482, - "grad_norm": 1.4927505254745483, - "learning_rate": 6.151957236877802e-06, - "loss": 3.515, - "step": 15770 - }, - { - "epoch": 2.6976664672194204, - "grad_norm": 1.5189650058746338, - "learning_rate": 6.139138746847979e-06, - "loss": 3.4466, - "step": 15780 - }, - { - "epoch": 2.6993760150440207, - "grad_norm": 1.5461863279342651, - "learning_rate": 6.126390123279217e-06, - "loss": 3.4456, - "step": 15790 - }, - { - "epoch": 2.701085562868621, - "grad_norm": 1.5661964416503906, - "learning_rate": 6.113711407869329e-06, - "loss": 3.4183, - "step": 15800 - }, - { - "epoch": 2.7027951106932218, - "grad_norm": 1.54511559009552, - "learning_rate": 6.101102642087486e-06, - "loss": 3.5091, - "step": 15810 - }, - { - "epoch": 2.704504658517822, - "grad_norm": 1.5995324850082397, - "learning_rate": 6.088563867174054e-06, - "loss": 3.4356, - "step": 15820 - }, - { - "epoch": 2.7062142063424224, - "grad_norm": 1.533016324043274, - "learning_rate": 6.076095124140478e-06, - "loss": 3.47, - "step": 15830 - }, - { - "epoch": 2.7079237541670227, - "grad_norm": 1.613906979560852, - "learning_rate": 6.0636964537691455e-06, - "loss": 3.5113, - "step": 15840 - }, - { - "epoch": 2.709633301991623, - "grad_norm": 1.5783125162124634, - "learning_rate": 6.0513678966132576e-06, - "loss": 3.507, - "step": 15850 - }, - { - "epoch": 2.7113428498162238, - "grad_norm": 1.608413815498352, - "learning_rate": 6.039109492996687e-06, - "loss": 3.4921, - "step": 15860 - }, - { - "epoch": 2.713052397640824, - "grad_norm": 1.5947372913360596, - "learning_rate": 6.026921283013847e-06, - "loss": 3.4493, - "step": 15870 - }, - { - "epoch": 2.7147619454654244, - "grad_norm": 1.5950342416763306, - "learning_rate": 6.014803306529572e-06, - "loss": 3.4824, - "step": 15880 - }, - { - "epoch": 2.7164714932900247, - "grad_norm": 1.5306484699249268, - "learning_rate": 6.002755603178971e-06, - "loss": 3.5075, - "step": 15890 - }, - { - "epoch": 2.718181041114625, - "grad_norm": 1.5265611410140991, - "learning_rate": 5.990778212367311e-06, - "loss": 3.5023, - "step": 15900 - }, - { - "epoch": 2.7198905889392258, - "grad_norm": 1.573938012123108, - "learning_rate": 5.978871173269878e-06, - "loss": 3.4623, - "step": 15910 - }, - { - "epoch": 2.721600136763826, - "grad_norm": 1.5388987064361572, - "learning_rate": 5.967034524831859e-06, - "loss": 3.4502, - "step": 15920 - }, - { - "epoch": 2.7233096845884264, - "grad_norm": 1.6098390817642212, - "learning_rate": 5.955268305768207e-06, - "loss": 3.4681, - "step": 15930 - }, - { - "epoch": 2.7250192324130267, - "grad_norm": 1.580206274986267, - "learning_rate": 5.943572554563519e-06, - "loss": 3.4663, - "step": 15940 - }, - { - "epoch": 2.726728780237627, - "grad_norm": 1.6139272451400757, - "learning_rate": 5.9319473094718985e-06, - "loss": 3.5039, - "step": 15950 - }, - { - "epoch": 2.7284383280622277, - "grad_norm": 1.4743162393569946, - "learning_rate": 5.92039260851685e-06, - "loss": 3.496, - "step": 15960 - }, - { - "epoch": 2.730147875886828, - "grad_norm": 1.5346719026565552, - "learning_rate": 5.908908489491143e-06, - "loss": 3.4665, - "step": 15970 - }, - { - "epoch": 2.7318574237114284, - "grad_norm": 1.5181905031204224, - "learning_rate": 5.8974949899566875e-06, - "loss": 3.5383, - "step": 15980 - }, - { - "epoch": 2.7335669715360287, - "grad_norm": 1.5852978229522705, - "learning_rate": 5.886152147244413e-06, - "loss": 3.4517, - "step": 15990 - }, - { - "epoch": 2.735276519360629, - "grad_norm": 1.6163884401321411, - "learning_rate": 5.874879998454145e-06, - "loss": 3.4639, - "step": 16000 - }, - { - "epoch": 2.735276519360629, - "eval_loss": 4.224907875061035, - "eval_runtime": 15.5306, - "eval_samples_per_second": 62.715, - "eval_steps_per_second": 0.837, - "step": 16000 - }, - { - "epoch": 2.7369860671852297, - "grad_norm": 1.6069376468658447, - "learning_rate": 5.863678580454489e-06, - "loss": 3.4537, - "step": 16010 - }, - { - "epoch": 2.73869561500983, - "grad_norm": 1.6015331745147705, - "learning_rate": 5.852547929882707e-06, - "loss": 3.4298, - "step": 16020 - }, - { - "epoch": 2.7404051628344304, - "grad_norm": 1.5375254154205322, - "learning_rate": 5.841488083144598e-06, - "loss": 3.4906, - "step": 16030 - }, - { - "epoch": 2.7421147106590307, - "grad_norm": 1.5258957147598267, - "learning_rate": 5.83049907641437e-06, - "loss": 3.4652, - "step": 16040 - }, - { - "epoch": 2.743824258483631, - "grad_norm": 1.5778003931045532, - "learning_rate": 5.819580945634536e-06, - "loss": 3.4512, - "step": 16050 - }, - { - "epoch": 2.7455338063082317, - "grad_norm": 1.5654321908950806, - "learning_rate": 5.808733726515793e-06, - "loss": 3.4594, - "step": 16060 - }, - { - "epoch": 2.7472433541328316, - "grad_norm": 1.5390222072601318, - "learning_rate": 5.7979574545368955e-06, - "loss": 3.4596, - "step": 16070 - }, - { - "epoch": 2.7489529019574324, - "grad_norm": 1.561747670173645, - "learning_rate": 5.787252164944549e-06, - "loss": 3.4711, - "step": 16080 - }, - { - "epoch": 2.7506624497820327, - "grad_norm": 1.4792470932006836, - "learning_rate": 5.776617892753288e-06, - "loss": 3.4621, - "step": 16090 - }, - { - "epoch": 2.752371997606633, - "grad_norm": 1.629335880279541, - "learning_rate": 5.766054672745376e-06, - "loss": 3.4438, - "step": 16100 - }, - { - "epoch": 2.7540815454312333, - "grad_norm": 1.5419238805770874, - "learning_rate": 5.755562539470676e-06, - "loss": 3.4771, - "step": 16110 - }, - { - "epoch": 2.7557910932558336, - "grad_norm": 1.575222373008728, - "learning_rate": 5.745141527246532e-06, - "loss": 3.4251, - "step": 16120 - }, - { - "epoch": 2.7575006410804344, - "grad_norm": 1.554957628250122, - "learning_rate": 5.734791670157685e-06, - "loss": 3.5337, - "step": 16130 - }, - { - "epoch": 2.7592101889050347, - "grad_norm": 1.6310604810714722, - "learning_rate": 5.72451300205613e-06, - "loss": 3.4754, - "step": 16140 - }, - { - "epoch": 2.760919736729635, - "grad_norm": 1.6721609830856323, - "learning_rate": 5.7143055565610305e-06, - "loss": 3.4226, - "step": 16150 - }, - { - "epoch": 2.7626292845542353, - "grad_norm": 1.4773035049438477, - "learning_rate": 5.704169367058584e-06, - "loss": 3.492, - "step": 16160 - }, - { - "epoch": 2.7643388323788356, - "grad_norm": 1.5694059133529663, - "learning_rate": 5.69410446670194e-06, - "loss": 3.485, - "step": 16170 - }, - { - "epoch": 2.7660483802034364, - "grad_norm": 1.4766429662704468, - "learning_rate": 5.6841108884110685e-06, - "loss": 3.5044, - "step": 16180 - }, - { - "epoch": 2.7677579280280367, - "grad_norm": 1.5573360919952393, - "learning_rate": 5.674188664872666e-06, - "loss": 3.4463, - "step": 16190 - }, - { - "epoch": 2.769467475852637, - "grad_norm": 1.5700033903121948, - "learning_rate": 5.664337828540039e-06, - "loss": 3.4899, - "step": 16200 - }, - { - "epoch": 2.7711770236772373, - "grad_norm": 1.5571167469024658, - "learning_rate": 5.654558411633007e-06, - "loss": 3.4981, - "step": 16210 - }, - { - "epoch": 2.7728865715018376, - "grad_norm": 1.5823817253112793, - "learning_rate": 5.644850446137795e-06, - "loss": 3.5013, - "step": 16220 - }, - { - "epoch": 2.7745961193264383, - "grad_norm": 1.6469289064407349, - "learning_rate": 5.635213963806921e-06, - "loss": 3.499, - "step": 16230 - }, - { - "epoch": 2.7763056671510387, - "grad_norm": 1.6478776931762695, - "learning_rate": 5.625648996159099e-06, - "loss": 3.4472, - "step": 16240 - }, - { - "epoch": 2.778015214975639, - "grad_norm": 1.5723638534545898, - "learning_rate": 5.616155574479142e-06, - "loss": 3.4958, - "step": 16250 - }, - { - "epoch": 2.7797247628002393, - "grad_norm": 1.5589793920516968, - "learning_rate": 5.606733729817844e-06, - "loss": 3.4959, - "step": 16260 - }, - { - "epoch": 2.7814343106248396, - "grad_norm": 1.523707389831543, - "learning_rate": 5.597383492991894e-06, - "loss": 3.5127, - "step": 16270 - }, - { - "epoch": 2.7831438584494403, - "grad_norm": 1.588152289390564, - "learning_rate": 5.588104894583759e-06, - "loss": 3.4847, - "step": 16280 - }, - { - "epoch": 2.7848534062740407, - "grad_norm": 1.5875675678253174, - "learning_rate": 5.578897964941604e-06, - "loss": 3.4411, - "step": 16290 - }, - { - "epoch": 2.786562954098641, - "grad_norm": 1.563027024269104, - "learning_rate": 5.569762734179172e-06, - "loss": 3.4388, - "step": 16300 - }, - { - "epoch": 2.7882725019232413, - "grad_norm": 1.5270148515701294, - "learning_rate": 5.560699232175707e-06, - "loss": 3.4489, - "step": 16310 - }, - { - "epoch": 2.7899820497478416, - "grad_norm": 1.523610234260559, - "learning_rate": 5.551707488575835e-06, - "loss": 3.4868, - "step": 16320 - }, - { - "epoch": 2.7916915975724423, - "grad_norm": 1.5547958612442017, - "learning_rate": 5.542787532789478e-06, - "loss": 3.4173, - "step": 16330 - }, - { - "epoch": 2.793401145397042, - "grad_norm": 1.6203300952911377, - "learning_rate": 5.53393939399176e-06, - "loss": 3.458, - "step": 16340 - }, - { - "epoch": 2.795110693221643, - "grad_norm": 1.6532878875732422, - "learning_rate": 5.525163101122905e-06, - "loss": 3.4433, - "step": 16350 - }, - { - "epoch": 2.7968202410462433, - "grad_norm": 1.5966763496398926, - "learning_rate": 5.5164586828881525e-06, - "loss": 3.4187, - "step": 16360 - }, - { - "epoch": 2.7985297888708436, - "grad_norm": 1.5658292770385742, - "learning_rate": 5.507826167757643e-06, - "loss": 3.4841, - "step": 16370 - }, - { - "epoch": 2.800239336695444, - "grad_norm": 1.5719127655029297, - "learning_rate": 5.499265583966354e-06, - "loss": 3.4902, - "step": 16380 - }, - { - "epoch": 2.801948884520044, - "grad_norm": 1.601792573928833, - "learning_rate": 5.490776959513982e-06, - "loss": 3.5053, - "step": 16390 - }, - { - "epoch": 2.803658432344645, - "grad_norm": 1.5923479795455933, - "learning_rate": 5.482360322164866e-06, - "loss": 3.5287, - "step": 16400 - }, - { - "epoch": 2.8053679801692453, - "grad_norm": 1.507544755935669, - "learning_rate": 5.474015699447885e-06, - "loss": 3.5349, - "step": 16410 - }, - { - "epoch": 2.8070775279938456, - "grad_norm": 1.537821888923645, - "learning_rate": 5.465743118656384e-06, - "loss": 3.4865, - "step": 16420 - }, - { - "epoch": 2.808787075818446, - "grad_norm": 1.5500425100326538, - "learning_rate": 5.457542606848067e-06, - "loss": 3.4822, - "step": 16430 - }, - { - "epoch": 2.810496623643046, - "grad_norm": 1.5936212539672852, - "learning_rate": 5.449414190844926e-06, - "loss": 3.4534, - "step": 16440 - }, - { - "epoch": 2.812206171467647, - "grad_norm": 1.6193571090698242, - "learning_rate": 5.441357897233134e-06, - "loss": 3.4578, - "step": 16450 - }, - { - "epoch": 2.8139157192922473, - "grad_norm": 1.5809321403503418, - "learning_rate": 5.433373752362976e-06, - "loss": 3.521, - "step": 16460 - }, - { - "epoch": 2.8156252671168476, - "grad_norm": 1.5119826793670654, - "learning_rate": 5.425461782348752e-06, - "loss": 3.488, - "step": 16470 - }, - { - "epoch": 2.817334814941448, - "grad_norm": 1.655507206916809, - "learning_rate": 5.417622013068694e-06, - "loss": 3.4779, - "step": 16480 - }, - { - "epoch": 2.819044362766048, - "grad_norm": 1.5827959775924683, - "learning_rate": 5.409854470164886e-06, - "loss": 3.4724, - "step": 16490 - }, - { - "epoch": 2.820753910590649, - "grad_norm": 1.5659741163253784, - "learning_rate": 5.402159179043167e-06, - "loss": 3.4927, - "step": 16500 - }, - { - "epoch": 2.8224634584152493, - "grad_norm": 1.548431158065796, - "learning_rate": 5.394536164873071e-06, - "loss": 3.449, - "step": 16510 - }, - { - "epoch": 2.8241730062398496, - "grad_norm": 1.528773546218872, - "learning_rate": 5.386985452587718e-06, - "loss": 3.4498, - "step": 16520 - }, - { - "epoch": 2.82588255406445, - "grad_norm": 1.6188921928405762, - "learning_rate": 5.379507066883752e-06, - "loss": 3.4561, - "step": 16530 - }, - { - "epoch": 2.82759210188905, - "grad_norm": 1.5871434211730957, - "learning_rate": 5.3721010322212485e-06, - "loss": 3.4299, - "step": 16540 - }, - { - "epoch": 2.829301649713651, - "grad_norm": 1.613559603691101, - "learning_rate": 5.3647673728236435e-06, - "loss": 3.4325, - "step": 16550 - }, - { - "epoch": 2.8310111975382513, - "grad_norm": 1.5737433433532715, - "learning_rate": 5.357506112677656e-06, - "loss": 3.4705, - "step": 16560 - }, - { - "epoch": 2.8327207453628516, - "grad_norm": 1.6331027746200562, - "learning_rate": 5.350317275533186e-06, - "loss": 3.5203, - "step": 16570 - }, - { - "epoch": 2.834430293187452, - "grad_norm": 1.5439718961715698, - "learning_rate": 5.3432008849032736e-06, - "loss": 3.4428, - "step": 16580 - }, - { - "epoch": 2.836139841012052, - "grad_norm": 1.5566649436950684, - "learning_rate": 5.33615696406399e-06, - "loss": 3.469, - "step": 16590 - }, - { - "epoch": 2.837849388836653, - "grad_norm": 1.5220750570297241, - "learning_rate": 5.32918553605438e-06, - "loss": 3.4321, - "step": 16600 - }, - { - "epoch": 2.8395589366612533, - "grad_norm": 1.5076227188110352, - "learning_rate": 5.322286623676383e-06, - "loss": 3.4785, - "step": 16610 - }, - { - "epoch": 2.8412684844858536, - "grad_norm": 1.5874429941177368, - "learning_rate": 5.315460249494749e-06, - "loss": 3.4696, - "step": 16620 - }, - { - "epoch": 2.842978032310454, - "grad_norm": 1.5496973991394043, - "learning_rate": 5.308706435836981e-06, - "loss": 3.4259, - "step": 16630 - }, - { - "epoch": 2.844687580135054, - "grad_norm": 1.556490421295166, - "learning_rate": 5.302025204793249e-06, - "loss": 3.4688, - "step": 16640 - }, - { - "epoch": 2.846397127959655, - "grad_norm": 1.529276967048645, - "learning_rate": 5.2954165782163195e-06, - "loss": 3.5264, - "step": 16650 - }, - { - "epoch": 2.848106675784255, - "grad_norm": 1.5668138265609741, - "learning_rate": 5.288880577721489e-06, - "loss": 3.5482, - "step": 16660 - }, - { - "epoch": 2.8498162236088556, - "grad_norm": 1.6061265468597412, - "learning_rate": 5.282417224686509e-06, - "loss": 3.4419, - "step": 16670 - }, - { - "epoch": 2.851525771433456, - "grad_norm": 1.5832321643829346, - "learning_rate": 5.2760265402515265e-06, - "loss": 3.4719, - "step": 16680 - }, - { - "epoch": 2.853235319258056, - "grad_norm": 1.54146409034729, - "learning_rate": 5.269708545318998e-06, - "loss": 3.4806, - "step": 16690 - }, - { - "epoch": 2.8549448670826565, - "grad_norm": 1.5943055152893066, - "learning_rate": 5.263463260553627e-06, - "loss": 3.4906, - "step": 16700 - }, - { - "epoch": 2.856654414907257, - "grad_norm": 1.5406030416488647, - "learning_rate": 5.2572907063823045e-06, - "loss": 3.4835, - "step": 16710 - }, - { - "epoch": 2.8583639627318576, - "grad_norm": 1.5307209491729736, - "learning_rate": 5.251190902994034e-06, - "loss": 3.4513, - "step": 16720 - }, - { - "epoch": 2.860073510556458, - "grad_norm": 1.5451802015304565, - "learning_rate": 5.24516387033987e-06, - "loss": 3.482, - "step": 16730 - }, - { - "epoch": 2.861783058381058, - "grad_norm": 1.5372310876846313, - "learning_rate": 5.239209628132847e-06, - "loss": 3.4306, - "step": 16740 - }, - { - "epoch": 2.8634926062056585, - "grad_norm": 1.657265305519104, - "learning_rate": 5.233328195847922e-06, - "loss": 3.4189, - "step": 16750 - }, - { - "epoch": 2.865202154030259, - "grad_norm": 1.5818283557891846, - "learning_rate": 5.227519592721907e-06, - "loss": 3.4521, - "step": 16760 - }, - { - "epoch": 2.8669117018548596, - "grad_norm": 1.5124504566192627, - "learning_rate": 5.221783837753409e-06, - "loss": 3.4837, - "step": 16770 - }, - { - "epoch": 2.86862124967946, - "grad_norm": 1.5402297973632812, - "learning_rate": 5.216120949702755e-06, - "loss": 3.4523, - "step": 16780 - }, - { - "epoch": 2.87033079750406, - "grad_norm": 1.5758874416351318, - "learning_rate": 5.210530947091952e-06, - "loss": 3.4132, - "step": 16790 - }, - { - "epoch": 2.8720403453286605, - "grad_norm": 1.5502175092697144, - "learning_rate": 5.205013848204615e-06, - "loss": 3.4658, - "step": 16800 - }, - { - "epoch": 2.873749893153261, - "grad_norm": 1.5034229755401611, - "learning_rate": 5.199569671085902e-06, - "loss": 3.5025, - "step": 16810 - }, - { - "epoch": 2.8754594409778615, - "grad_norm": 1.5125559568405151, - "learning_rate": 5.194198433542463e-06, - "loss": 3.5012, - "step": 16820 - }, - { - "epoch": 2.877168988802462, - "grad_norm": 1.6217219829559326, - "learning_rate": 5.18890015314238e-06, - "loss": 3.4487, - "step": 16830 - }, - { - "epoch": 2.878878536627062, - "grad_norm": 1.5170769691467285, - "learning_rate": 5.1836748472151124e-06, - "loss": 3.4286, - "step": 16840 - }, - { - "epoch": 2.8805880844516625, - "grad_norm": 1.6245899200439453, - "learning_rate": 5.178522532851428e-06, - "loss": 3.4171, - "step": 16850 - }, - { - "epoch": 2.882297632276263, - "grad_norm": 1.6358392238616943, - "learning_rate": 5.173443226903364e-06, - "loss": 3.4882, - "step": 16860 - }, - { - "epoch": 2.8840071801008635, - "grad_norm": 1.6245265007019043, - "learning_rate": 5.168436945984162e-06, - "loss": 3.497, - "step": 16870 - }, - { - "epoch": 2.885716727925464, - "grad_norm": 1.5667799711227417, - "learning_rate": 5.16350370646821e-06, - "loss": 3.4126, - "step": 16880 - }, - { - "epoch": 2.887426275750064, - "grad_norm": 1.6029492616653442, - "learning_rate": 5.158643524491004e-06, - "loss": 3.4952, - "step": 16890 - }, - { - "epoch": 2.8891358235746645, - "grad_norm": 1.5323750972747803, - "learning_rate": 5.153856415949082e-06, - "loss": 3.4609, - "step": 16900 - }, - { - "epoch": 2.890845371399265, - "grad_norm": 1.6187094449996948, - "learning_rate": 5.149142396499968e-06, - "loss": 3.5243, - "step": 16910 - }, - { - "epoch": 2.8925549192238655, - "grad_norm": 1.5734087228775024, - "learning_rate": 5.144501481562141e-06, - "loss": 3.5135, - "step": 16920 - }, - { - "epoch": 2.8942644670484654, - "grad_norm": 1.5811856985092163, - "learning_rate": 5.1399336863149626e-06, - "loss": 3.4614, - "step": 16930 - }, - { - "epoch": 2.895974014873066, - "grad_norm": 1.5570889711380005, - "learning_rate": 5.135439025698641e-06, - "loss": 3.4527, - "step": 16940 - }, - { - "epoch": 2.8976835626976665, - "grad_norm": 1.5832979679107666, - "learning_rate": 5.131017514414173e-06, - "loss": 3.5227, - "step": 16950 - }, - { - "epoch": 2.899393110522267, - "grad_norm": 1.5674128532409668, - "learning_rate": 5.126669166923309e-06, - "loss": 3.4148, - "step": 16960 - }, - { - "epoch": 2.901102658346867, - "grad_norm": 1.5816925764083862, - "learning_rate": 5.1223939974484925e-06, - "loss": 3.4556, - "step": 16970 - }, - { - "epoch": 2.9028122061714674, - "grad_norm": 1.6206274032592773, - "learning_rate": 5.118192019972817e-06, - "loss": 3.4546, - "step": 16980 - }, - { - "epoch": 2.904521753996068, - "grad_norm": 1.5509247779846191, - "learning_rate": 5.1140632482399835e-06, - "loss": 3.4569, - "step": 16990 - }, - { - "epoch": 2.9062313018206685, - "grad_norm": 1.5082319974899292, - "learning_rate": 5.110007695754256e-06, - "loss": 3.5374, - "step": 17000 - }, - { - "epoch": 2.9062313018206685, - "eval_loss": 4.223108291625977, - "eval_runtime": 14.9424, - "eval_samples_per_second": 65.184, - "eval_steps_per_second": 0.87, - "step": 17000 - }, - { - "epoch": 2.9079408496452688, - "grad_norm": 1.516559362411499, - "learning_rate": 5.10602537578041e-06, - "loss": 3.5133, - "step": 17010 - }, - { - "epoch": 2.909650397469869, - "grad_norm": 1.538204550743103, - "learning_rate": 5.102116301343703e-06, - "loss": 3.5246, - "step": 17020 - }, - { - "epoch": 2.9113599452944694, - "grad_norm": 1.5654205083847046, - "learning_rate": 5.098280485229813e-06, - "loss": 3.4644, - "step": 17030 - }, - { - "epoch": 2.91306949311907, - "grad_norm": 1.488978624343872, - "learning_rate": 5.094517939984815e-06, - "loss": 3.4912, - "step": 17040 - }, - { - "epoch": 2.9147790409436705, - "grad_norm": 1.5564743280410767, - "learning_rate": 5.090828677915125e-06, - "loss": 3.4821, - "step": 17050 - }, - { - "epoch": 2.9164885887682708, - "grad_norm": 1.5342092514038086, - "learning_rate": 5.087212711087474e-06, - "loss": 3.464, - "step": 17060 - }, - { - "epoch": 2.918198136592871, - "grad_norm": 1.5899741649627686, - "learning_rate": 5.083670051328856e-06, - "loss": 3.4885, - "step": 17070 - }, - { - "epoch": 2.9199076844174714, - "grad_norm": 1.5108574628829956, - "learning_rate": 5.0802007102264935e-06, - "loss": 3.4989, - "step": 17080 - }, - { - "epoch": 2.921617232242072, - "grad_norm": 1.608015775680542, - "learning_rate": 5.076804699127806e-06, - "loss": 3.4584, - "step": 17090 - }, - { - "epoch": 2.9233267800666725, - "grad_norm": 1.5380176305770874, - "learning_rate": 5.0734820291403615e-06, - "loss": 3.5107, - "step": 17100 - }, - { - "epoch": 2.9250363278912728, - "grad_norm": 1.540371298789978, - "learning_rate": 5.070232711131849e-06, - "loss": 3.4877, - "step": 17110 - }, - { - "epoch": 2.926745875715873, - "grad_norm": 1.5647467374801636, - "learning_rate": 5.067056755730043e-06, - "loss": 3.5342, - "step": 17120 - }, - { - "epoch": 2.9284554235404734, - "grad_norm": 1.5876312255859375, - "learning_rate": 5.063954173322759e-06, - "loss": 3.5114, - "step": 17130 - }, - { - "epoch": 2.930164971365074, - "grad_norm": 1.5713095664978027, - "learning_rate": 5.060924974057834e-06, - "loss": 3.4586, - "step": 17140 - }, - { - "epoch": 2.9318745191896745, - "grad_norm": 1.6856889724731445, - "learning_rate": 5.057969167843078e-06, - "loss": 3.4709, - "step": 17150 - }, - { - "epoch": 2.9335840670142748, - "grad_norm": 1.5709922313690186, - "learning_rate": 5.055086764346256e-06, - "loss": 3.4227, - "step": 17160 - }, - { - "epoch": 2.935293614838875, - "grad_norm": 1.6004196405410767, - "learning_rate": 5.0522777729950445e-06, - "loss": 3.463, - "step": 17170 - }, - { - "epoch": 2.9370031626634754, - "grad_norm": 1.5680336952209473, - "learning_rate": 5.049542202977005e-06, - "loss": 3.4485, - "step": 17180 - }, - { - "epoch": 2.938712710488076, - "grad_norm": 1.5728260278701782, - "learning_rate": 5.046880063239563e-06, - "loss": 3.5265, - "step": 17190 - }, - { - "epoch": 2.9404222583126765, - "grad_norm": 1.5976927280426025, - "learning_rate": 5.044291362489961e-06, - "loss": 3.4885, - "step": 17200 - }, - { - "epoch": 2.9421318061372768, - "grad_norm": 1.5452165603637695, - "learning_rate": 5.0417761091952455e-06, - "loss": 3.4748, - "step": 17210 - }, - { - "epoch": 2.943841353961877, - "grad_norm": 1.536504864692688, - "learning_rate": 5.039334311582232e-06, - "loss": 3.4749, - "step": 17220 - }, - { - "epoch": 2.9455509017864774, - "grad_norm": 1.4919332265853882, - "learning_rate": 5.0369659776374736e-06, - "loss": 3.4617, - "step": 17230 - }, - { - "epoch": 2.947260449611078, - "grad_norm": 1.5618114471435547, - "learning_rate": 5.03467111510725e-06, - "loss": 3.4445, - "step": 17240 - }, - { - "epoch": 2.948969997435678, - "grad_norm": 1.596140742301941, - "learning_rate": 5.032449731497528e-06, - "loss": 3.4772, - "step": 17250 - }, - { - "epoch": 2.9506795452602788, - "grad_norm": 1.541021704673767, - "learning_rate": 5.03030183407394e-06, - "loss": 3.4162, - "step": 17260 - }, - { - "epoch": 2.952389093084879, - "grad_norm": 1.571519136428833, - "learning_rate": 5.028227429861765e-06, - "loss": 3.481, - "step": 17270 - }, - { - "epoch": 2.9540986409094794, - "grad_norm": 1.547511339187622, - "learning_rate": 5.026226525645905e-06, - "loss": 3.4703, - "step": 17280 - }, - { - "epoch": 2.9558081887340797, - "grad_norm": 1.5132800340652466, - "learning_rate": 5.024299127970851e-06, - "loss": 3.4469, - "step": 17290 - }, - { - "epoch": 2.95751773655868, - "grad_norm": 1.626301646232605, - "learning_rate": 5.022445243140684e-06, - "loss": 3.432, - "step": 17300 - }, - { - "epoch": 2.9592272843832808, - "grad_norm": 1.5868024826049805, - "learning_rate": 5.02066487721903e-06, - "loss": 3.4923, - "step": 17310 - }, - { - "epoch": 2.960936832207881, - "grad_norm": 1.562615990638733, - "learning_rate": 5.018958036029057e-06, - "loss": 3.488, - "step": 17320 - }, - { - "epoch": 2.9626463800324814, - "grad_norm": 1.5346853733062744, - "learning_rate": 5.0173247251534485e-06, - "loss": 3.4659, - "step": 17330 - }, - { - "epoch": 2.9643559278570817, - "grad_norm": 1.590325117111206, - "learning_rate": 5.015764949934394e-06, - "loss": 3.4393, - "step": 17340 - }, - { - "epoch": 2.966065475681682, - "grad_norm": 1.6443595886230469, - "learning_rate": 5.014278715473553e-06, - "loss": 3.4844, - "step": 17350 - }, - { - "epoch": 2.9677750235062827, - "grad_norm": 1.6490458250045776, - "learning_rate": 5.012866026632059e-06, - "loss": 3.5026, - "step": 17360 - }, - { - "epoch": 2.969484571330883, - "grad_norm": 1.582161545753479, - "learning_rate": 5.0115268880304935e-06, - "loss": 3.4495, - "step": 17370 - }, - { - "epoch": 2.9711941191554834, - "grad_norm": 1.5719423294067383, - "learning_rate": 5.010261304048867e-06, - "loss": 3.4701, - "step": 17380 - }, - { - "epoch": 2.9729036669800837, - "grad_norm": 1.5458106994628906, - "learning_rate": 5.009069278826615e-06, - "loss": 3.4915, - "step": 17390 - }, - { - "epoch": 2.974613214804684, - "grad_norm": 1.6326204538345337, - "learning_rate": 5.00795081626258e-06, - "loss": 3.4873, - "step": 17400 - }, - { - "epoch": 2.9763227626292847, - "grad_norm": 1.5448585748672485, - "learning_rate": 5.006905920014994e-06, - "loss": 3.4679, - "step": 17410 - }, - { - "epoch": 2.978032310453885, - "grad_norm": 1.5477879047393799, - "learning_rate": 5.00593459350147e-06, - "loss": 3.5038, - "step": 17420 - }, - { - "epoch": 2.9797418582784854, - "grad_norm": 1.6072031259536743, - "learning_rate": 5.005036839898997e-06, - "loss": 3.4691, - "step": 17430 - }, - { - "epoch": 2.9814514061030857, - "grad_norm": 1.628769040107727, - "learning_rate": 5.004212662143918e-06, - "loss": 3.4314, - "step": 17440 - }, - { - "epoch": 2.983160953927686, - "grad_norm": 1.53169846534729, - "learning_rate": 5.003462062931929e-06, - "loss": 3.4268, - "step": 17450 - }, - { - "epoch": 2.9848705017522867, - "grad_norm": 1.5772202014923096, - "learning_rate": 5.002785044718068e-06, - "loss": 3.4801, - "step": 17460 - }, - { - "epoch": 2.986580049576887, - "grad_norm": 1.5207418203353882, - "learning_rate": 5.002181609716708e-06, - "loss": 3.4901, - "step": 17470 - }, - { - "epoch": 2.9882895974014874, - "grad_norm": 1.5498665571212769, - "learning_rate": 5.001651759901544e-06, - "loss": 3.5038, - "step": 17480 - }, - { - "epoch": 2.9899991452260877, - "grad_norm": 1.6516672372817993, - "learning_rate": 5.0011954970055885e-06, - "loss": 3.4208, - "step": 17490 - }, - { - "epoch": 2.991708693050688, - "grad_norm": 1.5161845684051514, - "learning_rate": 5.000812822521178e-06, - "loss": 3.4738, - "step": 17500 - }, - { - "epoch": 2.9934182408752887, - "grad_norm": 1.560577392578125, - "learning_rate": 5.000503737699951e-06, - "loss": 3.4089, - "step": 17510 - }, - { - "epoch": 2.9951277886998886, - "grad_norm": 1.7022500038146973, - "learning_rate": 5.000268243552853e-06, - "loss": 3.4648, - "step": 17520 - }, - { - "epoch": 2.9968373365244894, - "grad_norm": 1.5325472354888916, - "learning_rate": 5.000106340850131e-06, - "loss": 3.4687, - "step": 17530 - }, - { - "epoch": 2.9985468843490897, - "grad_norm": 1.5595728158950806, - "learning_rate": 5.000018030121334e-06, - "loss": 3.4539, - "step": 17540 - } - ], - "logging_steps": 10, - "max_steps": 17547, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 9.724342205077783e+19, - "train_batch_size": 10, - "trial_name": null, - "trial_params": null -}