diff --git "a/checkpoint-5724/trainer_state.json" "b/checkpoint-5724/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5724/trainer_state.json" @@ -0,0 +1,4038 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 5724, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0069881201956673656, + "grad_norm": 14.528067588806152, + "learning_rate": 1.99748427672956e-05, + "loss": 4.6827, + "step": 10 + }, + { + "epoch": 0.013976240391334731, + "grad_norm": 13.050820350646973, + "learning_rate": 1.994689028651293e-05, + "loss": 4.529, + "step": 20 + }, + { + "epoch": 0.020964360587002098, + "grad_norm": 2.765507698059082, + "learning_rate": 1.991893780573026e-05, + "loss": 3.4294, + "step": 30 + }, + { + "epoch": 0.027952480782669462, + "grad_norm": 16.95716667175293, + "learning_rate": 1.989098532494759e-05, + "loss": 3.352, + "step": 40 + }, + { + "epoch": 0.03494060097833683, + "grad_norm": 5.339346408843994, + "learning_rate": 1.986303284416492e-05, + "loss": 2.2654, + "step": 50 + }, + { + "epoch": 0.041928721174004195, + "grad_norm": 7.168309688568115, + "learning_rate": 1.983508036338225e-05, + "loss": 2.0076, + "step": 60 + }, + { + "epoch": 0.04891684136967156, + "grad_norm": 5.793034076690674, + "learning_rate": 1.9807127882599582e-05, + "loss": 1.6779, + "step": 70 + }, + { + "epoch": 0.055904961565338925, + "grad_norm": 6.093694686889648, + "learning_rate": 1.9779175401816913e-05, + "loss": 1.2564, + "step": 80 + }, + { + "epoch": 0.06289308176100629, + "grad_norm": 3.321807622909546, + "learning_rate": 1.9751222921034243e-05, + "loss": 1.4143, + "step": 90 + }, + { + "epoch": 0.06988120195667366, + "grad_norm": 2.9472098350524902, + "learning_rate": 1.9723270440251574e-05, + "loss": 1.0533, + "step": 100 + }, + { + "epoch": 0.07686932215234102, + "grad_norm": 8.515447616577148, + "learning_rate": 1.9695317959468904e-05, + "loss": 1.2895, + "step": 110 + }, + { + "epoch": 0.08385744234800839, + "grad_norm": 3.560924530029297, + "learning_rate": 1.9667365478686235e-05, + "loss": 1.097, + "step": 120 + }, + { + "epoch": 0.09084556254367575, + "grad_norm": 4.986349582672119, + "learning_rate": 1.9639412997903566e-05, + "loss": 0.9556, + "step": 130 + }, + { + "epoch": 0.09783368273934312, + "grad_norm": 1.27532160282135, + "learning_rate": 1.9611460517120896e-05, + "loss": 0.8306, + "step": 140 + }, + { + "epoch": 0.10482180293501048, + "grad_norm": 2.6796844005584717, + "learning_rate": 1.9583508036338227e-05, + "loss": 0.7747, + "step": 150 + }, + { + "epoch": 0.11180992313067785, + "grad_norm": 3.4592413902282715, + "learning_rate": 1.9555555555555557e-05, + "loss": 0.6187, + "step": 160 + }, + { + "epoch": 0.1187980433263452, + "grad_norm": 4.833241939544678, + "learning_rate": 1.9527603074772888e-05, + "loss": 0.6831, + "step": 170 + }, + { + "epoch": 0.12578616352201258, + "grad_norm": 1.7203434705734253, + "learning_rate": 1.949965059399022e-05, + "loss": 0.586, + "step": 180 + }, + { + "epoch": 0.13277428371767994, + "grad_norm": 1.333950161933899, + "learning_rate": 1.947169811320755e-05, + "loss": 0.6214, + "step": 190 + }, + { + "epoch": 0.13976240391334732, + "grad_norm": 1.5078257322311401, + "learning_rate": 1.944374563242488e-05, + "loss": 0.5652, + "step": 200 + }, + { + "epoch": 0.14675052410901468, + "grad_norm": 3.7645132541656494, + "learning_rate": 1.941579315164221e-05, + "loss": 0.5229, + "step": 210 + }, + { + "epoch": 0.15373864430468204, + "grad_norm": 1.2441000938415527, + "learning_rate": 1.938784067085954e-05, + "loss": 0.3669, + "step": 220 + }, + { + "epoch": 0.1607267645003494, + "grad_norm": 2.2875142097473145, + "learning_rate": 1.935988819007687e-05, + "loss": 0.4162, + "step": 230 + }, + { + "epoch": 0.16771488469601678, + "grad_norm": 2.1816585063934326, + "learning_rate": 1.9331935709294202e-05, + "loss": 0.3836, + "step": 240 + }, + { + "epoch": 0.17470300489168414, + "grad_norm": 3.4903249740600586, + "learning_rate": 1.9303983228511532e-05, + "loss": 0.4131, + "step": 250 + }, + { + "epoch": 0.1816911250873515, + "grad_norm": 2.1905159950256348, + "learning_rate": 1.9276030747728863e-05, + "loss": 0.3725, + "step": 260 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 3.6003665924072266, + "learning_rate": 1.9248078266946194e-05, + "loss": 0.4131, + "step": 270 + }, + { + "epoch": 0.19566736547868624, + "grad_norm": 3.012568235397339, + "learning_rate": 1.9220125786163524e-05, + "loss": 0.3991, + "step": 280 + }, + { + "epoch": 0.2026554856743536, + "grad_norm": 1.2049503326416016, + "learning_rate": 1.9192173305380855e-05, + "loss": 0.2732, + "step": 290 + }, + { + "epoch": 0.20964360587002095, + "grad_norm": 2.0943682193756104, + "learning_rate": 1.9164220824598185e-05, + "loss": 0.3052, + "step": 300 + }, + { + "epoch": 0.21663172606568834, + "grad_norm": 4.1970086097717285, + "learning_rate": 1.9136268343815516e-05, + "loss": 0.2544, + "step": 310 + }, + { + "epoch": 0.2236198462613557, + "grad_norm": 2.6826064586639404, + "learning_rate": 1.9108315863032846e-05, + "loss": 0.1921, + "step": 320 + }, + { + "epoch": 0.23060796645702306, + "grad_norm": 6.013642311096191, + "learning_rate": 1.9080363382250177e-05, + "loss": 0.3014, + "step": 330 + }, + { + "epoch": 0.2375960866526904, + "grad_norm": 4.2075629234313965, + "learning_rate": 1.9052410901467508e-05, + "loss": 0.2569, + "step": 340 + }, + { + "epoch": 0.2445842068483578, + "grad_norm": 0.5800023078918457, + "learning_rate": 1.9024458420684838e-05, + "loss": 0.1486, + "step": 350 + }, + { + "epoch": 0.25157232704402516, + "grad_norm": 8.123499870300293, + "learning_rate": 1.899650593990217e-05, + "loss": 0.2215, + "step": 360 + }, + { + "epoch": 0.2585604472396925, + "grad_norm": 4.439255237579346, + "learning_rate": 1.89685534591195e-05, + "loss": 0.1692, + "step": 370 + }, + { + "epoch": 0.2655485674353599, + "grad_norm": 4.177631855010986, + "learning_rate": 1.894060097833683e-05, + "loss": 0.1176, + "step": 380 + }, + { + "epoch": 0.27253668763102723, + "grad_norm": 0.5754140019416809, + "learning_rate": 1.891264849755416e-05, + "loss": 0.1538, + "step": 390 + }, + { + "epoch": 0.27952480782669464, + "grad_norm": 0.5612320303916931, + "learning_rate": 1.888469601677149e-05, + "loss": 0.1054, + "step": 400 + }, + { + "epoch": 0.286512928022362, + "grad_norm": 1.949729561805725, + "learning_rate": 1.8856743535988818e-05, + "loss": 0.0824, + "step": 410 + }, + { + "epoch": 0.29350104821802936, + "grad_norm": 3.153822898864746, + "learning_rate": 1.8828791055206152e-05, + "loss": 0.1174, + "step": 420 + }, + { + "epoch": 0.3004891684136967, + "grad_norm": 0.6545250415802002, + "learning_rate": 1.8800838574423483e-05, + "loss": 0.1305, + "step": 430 + }, + { + "epoch": 0.3074772886093641, + "grad_norm": 1.2621614933013916, + "learning_rate": 1.8772886093640813e-05, + "loss": 0.0684, + "step": 440 + }, + { + "epoch": 0.31446540880503143, + "grad_norm": 0.3028416335582733, + "learning_rate": 1.8744933612858144e-05, + "loss": 0.0863, + "step": 450 + }, + { + "epoch": 0.3214535290006988, + "grad_norm": 0.4452091455459595, + "learning_rate": 1.8716981132075474e-05, + "loss": 0.1016, + "step": 460 + }, + { + "epoch": 0.3284416491963662, + "grad_norm": 0.4046468436717987, + "learning_rate": 1.8689028651292805e-05, + "loss": 0.2317, + "step": 470 + }, + { + "epoch": 0.33542976939203356, + "grad_norm": 0.5121694803237915, + "learning_rate": 1.8661076170510135e-05, + "loss": 0.1965, + "step": 480 + }, + { + "epoch": 0.3424178895877009, + "grad_norm": 2.7892794609069824, + "learning_rate": 1.8633123689727466e-05, + "loss": 0.2082, + "step": 490 + }, + { + "epoch": 0.3494060097833683, + "grad_norm": 0.29012227058410645, + "learning_rate": 1.8605171208944793e-05, + "loss": 0.1393, + "step": 500 + }, + { + "epoch": 0.35639412997903563, + "grad_norm": 0.14969919621944427, + "learning_rate": 1.8577218728162124e-05, + "loss": 0.1917, + "step": 510 + }, + { + "epoch": 0.363382250174703, + "grad_norm": 2.5301618576049805, + "learning_rate": 1.8549266247379458e-05, + "loss": 0.0939, + "step": 520 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.5171930193901062, + "learning_rate": 1.8521313766596788e-05, + "loss": 0.0643, + "step": 530 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.23746764659881592, + "learning_rate": 1.849336128581412e-05, + "loss": 0.2094, + "step": 540 + }, + { + "epoch": 0.3843466107617051, + "grad_norm": 1.2422730922698975, + "learning_rate": 1.846540880503145e-05, + "loss": 0.109, + "step": 550 + }, + { + "epoch": 0.3913347309573725, + "grad_norm": 0.8466539978981018, + "learning_rate": 1.843745632424878e-05, + "loss": 0.1514, + "step": 560 + }, + { + "epoch": 0.39832285115303984, + "grad_norm": 0.201154425740242, + "learning_rate": 1.840950384346611e-05, + "loss": 0.1597, + "step": 570 + }, + { + "epoch": 0.4053109713487072, + "grad_norm": 0.19054199755191803, + "learning_rate": 1.838155136268344e-05, + "loss": 0.0955, + "step": 580 + }, + { + "epoch": 0.41229909154437455, + "grad_norm": 1.8036576509475708, + "learning_rate": 1.8353598881900768e-05, + "loss": 0.1369, + "step": 590 + }, + { + "epoch": 0.4192872117400419, + "grad_norm": 0.3122437596321106, + "learning_rate": 1.83256464011181e-05, + "loss": 0.1439, + "step": 600 + }, + { + "epoch": 0.42627533193570927, + "grad_norm": 2.6353561878204346, + "learning_rate": 1.829769392033543e-05, + "loss": 0.1566, + "step": 610 + }, + { + "epoch": 0.4332634521313767, + "grad_norm": 3.645707368850708, + "learning_rate": 1.8269741439552763e-05, + "loss": 0.2505, + "step": 620 + }, + { + "epoch": 0.44025157232704404, + "grad_norm": 0.20553793013095856, + "learning_rate": 1.8241788958770094e-05, + "loss": 0.0509, + "step": 630 + }, + { + "epoch": 0.4472396925227114, + "grad_norm": 3.8460299968719482, + "learning_rate": 1.8213836477987425e-05, + "loss": 0.118, + "step": 640 + }, + { + "epoch": 0.45422781271837875, + "grad_norm": 0.4825759828090668, + "learning_rate": 1.8185883997204755e-05, + "loss": 0.0671, + "step": 650 + }, + { + "epoch": 0.4612159329140461, + "grad_norm": 2.5147883892059326, + "learning_rate": 1.8157931516422086e-05, + "loss": 0.1381, + "step": 660 + }, + { + "epoch": 0.46820405310971347, + "grad_norm": 4.72428035736084, + "learning_rate": 1.8129979035639413e-05, + "loss": 0.1093, + "step": 670 + }, + { + "epoch": 0.4751921733053808, + "grad_norm": 3.091625928878784, + "learning_rate": 1.8102026554856743e-05, + "loss": 0.2363, + "step": 680 + }, + { + "epoch": 0.48218029350104824, + "grad_norm": 0.10304142534732819, + "learning_rate": 1.8074074074074074e-05, + "loss": 0.1347, + "step": 690 + }, + { + "epoch": 0.4891684136967156, + "grad_norm": 0.4455307424068451, + "learning_rate": 1.8046121593291405e-05, + "loss": 0.1064, + "step": 700 + }, + { + "epoch": 0.49615653389238296, + "grad_norm": 0.11594414710998535, + "learning_rate": 1.8018169112508735e-05, + "loss": 0.123, + "step": 710 + }, + { + "epoch": 0.5031446540880503, + "grad_norm": 0.1224653497338295, + "learning_rate": 1.799021663172607e-05, + "loss": 0.1263, + "step": 720 + }, + { + "epoch": 0.5101327742837177, + "grad_norm": 4.951889991760254, + "learning_rate": 1.79622641509434e-05, + "loss": 0.2111, + "step": 730 + }, + { + "epoch": 0.517120894479385, + "grad_norm": 0.16316205263137817, + "learning_rate": 1.793431167016073e-05, + "loss": 0.0366, + "step": 740 + }, + { + "epoch": 0.5241090146750524, + "grad_norm": 0.1896398663520813, + "learning_rate": 1.790635918937806e-05, + "loss": 0.1656, + "step": 750 + }, + { + "epoch": 0.5310971348707197, + "grad_norm": 0.3920069932937622, + "learning_rate": 1.7878406708595388e-05, + "loss": 0.2804, + "step": 760 + }, + { + "epoch": 0.5380852550663872, + "grad_norm": 0.12546010315418243, + "learning_rate": 1.785045422781272e-05, + "loss": 0.1404, + "step": 770 + }, + { + "epoch": 0.5450733752620545, + "grad_norm": 5.382510185241699, + "learning_rate": 1.782250174703005e-05, + "loss": 0.1194, + "step": 780 + }, + { + "epoch": 0.5520614954577219, + "grad_norm": 0.3077394366264343, + "learning_rate": 1.779454926624738e-05, + "loss": 0.1744, + "step": 790 + }, + { + "epoch": 0.5590496156533893, + "grad_norm": 0.41757112741470337, + "learning_rate": 1.776659678546471e-05, + "loss": 0.0729, + "step": 800 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.17300930619239807, + "learning_rate": 1.773864430468204e-05, + "loss": 0.1443, + "step": 810 + }, + { + "epoch": 0.573025856044724, + "grad_norm": 0.09419082850217819, + "learning_rate": 1.7710691823899375e-05, + "loss": 0.1587, + "step": 820 + }, + { + "epoch": 0.5800139762403913, + "grad_norm": 0.14113612473011017, + "learning_rate": 1.7682739343116705e-05, + "loss": 0.1196, + "step": 830 + }, + { + "epoch": 0.5870020964360587, + "grad_norm": 0.6690120697021484, + "learning_rate": 1.7654786862334036e-05, + "loss": 0.0619, + "step": 840 + }, + { + "epoch": 0.593990216631726, + "grad_norm": 0.5076020956039429, + "learning_rate": 1.7626834381551363e-05, + "loss": 0.2922, + "step": 850 + }, + { + "epoch": 0.6009783368273934, + "grad_norm": 0.12461690604686737, + "learning_rate": 1.7598881900768694e-05, + "loss": 0.0351, + "step": 860 + }, + { + "epoch": 0.6079664570230608, + "grad_norm": 5.236762523651123, + "learning_rate": 1.7570929419986024e-05, + "loss": 0.1571, + "step": 870 + }, + { + "epoch": 0.6149545772187281, + "grad_norm": 0.12950463593006134, + "learning_rate": 1.7542976939203355e-05, + "loss": 0.0204, + "step": 880 + }, + { + "epoch": 0.6219426974143956, + "grad_norm": 0.08026222884654999, + "learning_rate": 1.7515024458420685e-05, + "loss": 0.1185, + "step": 890 + }, + { + "epoch": 0.6289308176100629, + "grad_norm": 4.237085342407227, + "learning_rate": 1.7487071977638016e-05, + "loss": 0.0876, + "step": 900 + }, + { + "epoch": 0.6359189378057303, + "grad_norm": 0.916748046875, + "learning_rate": 1.7459119496855346e-05, + "loss": 0.0619, + "step": 910 + }, + { + "epoch": 0.6429070580013976, + "grad_norm": 4.51674747467041, + "learning_rate": 1.743116701607268e-05, + "loss": 0.1116, + "step": 920 + }, + { + "epoch": 0.649895178197065, + "grad_norm": 1.1469999551773071, + "learning_rate": 1.740321453529001e-05, + "loss": 0.0147, + "step": 930 + }, + { + "epoch": 0.6568832983927324, + "grad_norm": 2.3627333641052246, + "learning_rate": 1.7375262054507338e-05, + "loss": 0.1485, + "step": 940 + }, + { + "epoch": 0.6638714185883997, + "grad_norm": 8.216973304748535, + "learning_rate": 1.734730957372467e-05, + "loss": 0.0472, + "step": 950 + }, + { + "epoch": 0.6708595387840671, + "grad_norm": 3.5901684761047363, + "learning_rate": 1.7319357092942e-05, + "loss": 0.0979, + "step": 960 + }, + { + "epoch": 0.6778476589797344, + "grad_norm": 7.015125274658203, + "learning_rate": 1.729140461215933e-05, + "loss": 0.0537, + "step": 970 + }, + { + "epoch": 0.6848357791754018, + "grad_norm": 4.372157096862793, + "learning_rate": 1.726345213137666e-05, + "loss": 0.1152, + "step": 980 + }, + { + "epoch": 0.6918238993710691, + "grad_norm": 4.862729072570801, + "learning_rate": 1.723549965059399e-05, + "loss": 0.1131, + "step": 990 + }, + { + "epoch": 0.6988120195667366, + "grad_norm": 0.07732150703668594, + "learning_rate": 1.720754716981132e-05, + "loss": 0.0707, + "step": 1000 + }, + { + "epoch": 0.705800139762404, + "grad_norm": 0.19970574975013733, + "learning_rate": 1.7179594689028652e-05, + "loss": 0.0738, + "step": 1010 + }, + { + "epoch": 0.7127882599580713, + "grad_norm": 0.21677608788013458, + "learning_rate": 1.7151642208245983e-05, + "loss": 0.0992, + "step": 1020 + }, + { + "epoch": 0.7197763801537387, + "grad_norm": 0.10738188028335571, + "learning_rate": 1.7123689727463313e-05, + "loss": 0.0103, + "step": 1030 + }, + { + "epoch": 0.726764500349406, + "grad_norm": 3.7014143466949463, + "learning_rate": 1.7095737246680644e-05, + "loss": 0.0803, + "step": 1040 + }, + { + "epoch": 0.7337526205450734, + "grad_norm": 0.04619055986404419, + "learning_rate": 1.7067784765897974e-05, + "loss": 0.049, + "step": 1050 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.2753339409828186, + "learning_rate": 1.7039832285115305e-05, + "loss": 0.144, + "step": 1060 + }, + { + "epoch": 0.7477288609364081, + "grad_norm": 2.8835108280181885, + "learning_rate": 1.7011879804332635e-05, + "loss": 0.0885, + "step": 1070 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.22654394805431366, + "learning_rate": 1.6983927323549966e-05, + "loss": 0.0512, + "step": 1080 + }, + { + "epoch": 0.7617051013277428, + "grad_norm": 2.421400547027588, + "learning_rate": 1.6955974842767297e-05, + "loss": 0.1544, + "step": 1090 + }, + { + "epoch": 0.7686932215234102, + "grad_norm": 0.03806561604142189, + "learning_rate": 1.6928022361984627e-05, + "loss": 0.0225, + "step": 1100 + }, + { + "epoch": 0.7756813417190775, + "grad_norm": 0.08347579091787338, + "learning_rate": 1.6900069881201958e-05, + "loss": 0.1386, + "step": 1110 + }, + { + "epoch": 0.782669461914745, + "grad_norm": 0.2059401273727417, + "learning_rate": 1.687211740041929e-05, + "loss": 0.2406, + "step": 1120 + }, + { + "epoch": 0.7896575821104123, + "grad_norm": 0.060870036482810974, + "learning_rate": 1.684416491963662e-05, + "loss": 0.1158, + "step": 1130 + }, + { + "epoch": 0.7966457023060797, + "grad_norm": 0.14614495635032654, + "learning_rate": 1.681621243885395e-05, + "loss": 0.1938, + "step": 1140 + }, + { + "epoch": 0.803633822501747, + "grad_norm": 0.09593985229730606, + "learning_rate": 1.678825995807128e-05, + "loss": 0.0627, + "step": 1150 + }, + { + "epoch": 0.8106219426974144, + "grad_norm": 9.543107032775879, + "learning_rate": 1.676030747728861e-05, + "loss": 0.1365, + "step": 1160 + }, + { + "epoch": 0.8176100628930818, + "grad_norm": 0.09678485244512558, + "learning_rate": 1.673235499650594e-05, + "loss": 0.1438, + "step": 1170 + }, + { + "epoch": 0.8245981830887491, + "grad_norm": 0.24556872248649597, + "learning_rate": 1.6704402515723272e-05, + "loss": 0.2233, + "step": 1180 + }, + { + "epoch": 0.8315863032844165, + "grad_norm": 0.06892254948616028, + "learning_rate": 1.6676450034940602e-05, + "loss": 0.0601, + "step": 1190 + }, + { + "epoch": 0.8385744234800838, + "grad_norm": 0.6429911255836487, + "learning_rate": 1.6648497554157933e-05, + "loss": 0.132, + "step": 1200 + }, + { + "epoch": 0.8455625436757512, + "grad_norm": 0.03480253368616104, + "learning_rate": 1.6620545073375263e-05, + "loss": 0.0472, + "step": 1210 + }, + { + "epoch": 0.8525506638714185, + "grad_norm": 0.25101321935653687, + "learning_rate": 1.6592592592592594e-05, + "loss": 0.0485, + "step": 1220 + }, + { + "epoch": 0.859538784067086, + "grad_norm": 0.0525231771171093, + "learning_rate": 1.6564640111809925e-05, + "loss": 0.1824, + "step": 1230 + }, + { + "epoch": 0.8665269042627534, + "grad_norm": 4.694910526275635, + "learning_rate": 1.6536687631027255e-05, + "loss": 0.1666, + "step": 1240 + }, + { + "epoch": 0.8735150244584207, + "grad_norm": 0.09604217112064362, + "learning_rate": 1.6508735150244586e-05, + "loss": 0.0638, + "step": 1250 + }, + { + "epoch": 0.8805031446540881, + "grad_norm": 9.125068664550781, + "learning_rate": 1.6480782669461916e-05, + "loss": 0.0744, + "step": 1260 + }, + { + "epoch": 0.8874912648497554, + "grad_norm": 0.5258679986000061, + "learning_rate": 1.6452830188679247e-05, + "loss": 0.019, + "step": 1270 + }, + { + "epoch": 0.8944793850454228, + "grad_norm": 3.6146976947784424, + "learning_rate": 1.6424877707896577e-05, + "loss": 0.0141, + "step": 1280 + }, + { + "epoch": 0.9014675052410901, + "grad_norm": 0.05512338504195213, + "learning_rate": 1.6396925227113908e-05, + "loss": 0.0138, + "step": 1290 + }, + { + "epoch": 0.9084556254367575, + "grad_norm": 0.028468603268265724, + "learning_rate": 1.636897274633124e-05, + "loss": 0.0815, + "step": 1300 + }, + { + "epoch": 0.9154437456324249, + "grad_norm": 0.7173348665237427, + "learning_rate": 1.634102026554857e-05, + "loss": 0.0919, + "step": 1310 + }, + { + "epoch": 0.9224318658280922, + "grad_norm": 0.06138516217470169, + "learning_rate": 1.63130677847659e-05, + "loss": 0.1383, + "step": 1320 + }, + { + "epoch": 0.9294199860237596, + "grad_norm": 10.367135047912598, + "learning_rate": 1.628511530398323e-05, + "loss": 0.1265, + "step": 1330 + }, + { + "epoch": 0.9364081062194269, + "grad_norm": 0.3321629762649536, + "learning_rate": 1.625716282320056e-05, + "loss": 0.1149, + "step": 1340 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.21935391426086426, + "learning_rate": 1.622921034241789e-05, + "loss": 0.1147, + "step": 1350 + }, + { + "epoch": 0.9503843466107617, + "grad_norm": 4.630390644073486, + "learning_rate": 1.6201257861635222e-05, + "loss": 0.1159, + "step": 1360 + }, + { + "epoch": 0.9573724668064291, + "grad_norm": 0.03748277202248573, + "learning_rate": 1.6173305380852552e-05, + "loss": 0.0029, + "step": 1370 + }, + { + "epoch": 0.9643605870020965, + "grad_norm": 0.09677435457706451, + "learning_rate": 1.6145352900069883e-05, + "loss": 0.179, + "step": 1380 + }, + { + "epoch": 0.9713487071977638, + "grad_norm": 0.022979794070124626, + "learning_rate": 1.6117400419287214e-05, + "loss": 0.0034, + "step": 1390 + }, + { + "epoch": 0.9783368273934312, + "grad_norm": 3.3972229957580566, + "learning_rate": 1.6089447938504544e-05, + "loss": 0.1633, + "step": 1400 + }, + { + "epoch": 0.9853249475890985, + "grad_norm": 0.0410487987101078, + "learning_rate": 1.6061495457721875e-05, + "loss": 0.0758, + "step": 1410 + }, + { + "epoch": 0.9923130677847659, + "grad_norm": 0.1519143134355545, + "learning_rate": 1.6033542976939205e-05, + "loss": 0.0967, + "step": 1420 + }, + { + "epoch": 0.9993011879804332, + "grad_norm": 0.11000066250562668, + "learning_rate": 1.6005590496156536e-05, + "loss": 0.1054, + "step": 1430 + }, + { + "epoch": 1.0062893081761006, + "grad_norm": 9.595348358154297, + "learning_rate": 1.5977638015373866e-05, + "loss": 0.0367, + "step": 1440 + }, + { + "epoch": 1.013277428371768, + "grad_norm": 0.044866789132356644, + "learning_rate": 1.5949685534591197e-05, + "loss": 0.145, + "step": 1450 + }, + { + "epoch": 1.0202655485674355, + "grad_norm": 6.3100361824035645, + "learning_rate": 1.5921733053808524e-05, + "loss": 0.2702, + "step": 1460 + }, + { + "epoch": 1.0272536687631026, + "grad_norm": 1.0666788816452026, + "learning_rate": 1.5893780573025858e-05, + "loss": 0.1257, + "step": 1470 + }, + { + "epoch": 1.03424178895877, + "grad_norm": 15.676182746887207, + "learning_rate": 1.586582809224319e-05, + "loss": 0.0373, + "step": 1480 + }, + { + "epoch": 1.0412299091544375, + "grad_norm": 13.464042663574219, + "learning_rate": 1.583787561146052e-05, + "loss": 0.1107, + "step": 1490 + }, + { + "epoch": 1.0482180293501049, + "grad_norm": 0.029935991391539574, + "learning_rate": 1.580992313067785e-05, + "loss": 0.0778, + "step": 1500 + }, + { + "epoch": 1.0552061495457723, + "grad_norm": 1.0842170715332031, + "learning_rate": 1.578197064989518e-05, + "loss": 0.2003, + "step": 1510 + }, + { + "epoch": 1.0621942697414395, + "grad_norm": 0.8753694891929626, + "learning_rate": 1.575401816911251e-05, + "loss": 0.0869, + "step": 1520 + }, + { + "epoch": 1.069182389937107, + "grad_norm": 0.03639127314090729, + "learning_rate": 1.572606568832984e-05, + "loss": 0.0035, + "step": 1530 + }, + { + "epoch": 1.0761705101327743, + "grad_norm": 0.10639658570289612, + "learning_rate": 1.5698113207547172e-05, + "loss": 0.0709, + "step": 1540 + }, + { + "epoch": 1.0831586303284417, + "grad_norm": 0.09013110399246216, + "learning_rate": 1.56701607267645e-05, + "loss": 0.0041, + "step": 1550 + }, + { + "epoch": 1.090146750524109, + "grad_norm": 0.1398763209581375, + "learning_rate": 1.564220824598183e-05, + "loss": 0.0039, + "step": 1560 + }, + { + "epoch": 1.0971348707197763, + "grad_norm": 0.13840065896511078, + "learning_rate": 1.5614255765199164e-05, + "loss": 0.0288, + "step": 1570 + }, + { + "epoch": 1.1041229909154437, + "grad_norm": 0.03229377046227455, + "learning_rate": 1.5586303284416494e-05, + "loss": 0.023, + "step": 1580 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.09332744032144547, + "learning_rate": 1.5558350803633825e-05, + "loss": 0.148, + "step": 1590 + }, + { + "epoch": 1.1180992313067786, + "grad_norm": 0.24537114799022675, + "learning_rate": 1.5530398322851156e-05, + "loss": 0.0143, + "step": 1600 + }, + { + "epoch": 1.1250873515024458, + "grad_norm": 0.02455071546137333, + "learning_rate": 1.5502445842068486e-05, + "loss": 0.0378, + "step": 1610 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 0.027023492380976677, + "learning_rate": 1.5474493361285817e-05, + "loss": 0.0558, + "step": 1620 + }, + { + "epoch": 1.1390635918937806, + "grad_norm": 0.048673246055841446, + "learning_rate": 1.5446540880503147e-05, + "loss": 0.0519, + "step": 1630 + }, + { + "epoch": 1.146051712089448, + "grad_norm": 8.062767028808594, + "learning_rate": 1.5418588399720474e-05, + "loss": 0.0404, + "step": 1640 + }, + { + "epoch": 1.1530398322851152, + "grad_norm": 8.358656883239746, + "learning_rate": 1.5390635918937805e-05, + "loss": 0.1054, + "step": 1650 + }, + { + "epoch": 1.1600279524807826, + "grad_norm": 1.3057444095611572, + "learning_rate": 1.5362683438155136e-05, + "loss": 0.1049, + "step": 1660 + }, + { + "epoch": 1.16701607267645, + "grad_norm": 0.018700115382671356, + "learning_rate": 1.533473095737247e-05, + "loss": 0.0106, + "step": 1670 + }, + { + "epoch": 1.1740041928721174, + "grad_norm": 8.731624603271484, + "learning_rate": 1.53067784765898e-05, + "loss": 0.0541, + "step": 1680 + }, + { + "epoch": 1.1809923130677848, + "grad_norm": 1.358834147453308, + "learning_rate": 1.527882599580713e-05, + "loss": 0.1515, + "step": 1690 + }, + { + "epoch": 1.187980433263452, + "grad_norm": 0.05055036395788193, + "learning_rate": 1.525087351502446e-05, + "loss": 0.1181, + "step": 1700 + }, + { + "epoch": 1.1949685534591195, + "grad_norm": 0.19085977971553802, + "learning_rate": 1.5222921034241792e-05, + "loss": 0.0624, + "step": 1710 + }, + { + "epoch": 1.2019566736547869, + "grad_norm": 1.282493233680725, + "learning_rate": 1.5194968553459122e-05, + "loss": 0.016, + "step": 1720 + }, + { + "epoch": 1.2089447938504543, + "grad_norm": 0.03596067801117897, + "learning_rate": 1.5167016072676451e-05, + "loss": 0.0211, + "step": 1730 + }, + { + "epoch": 1.2159329140461215, + "grad_norm": 0.033778220415115356, + "learning_rate": 1.5139063591893782e-05, + "loss": 0.0105, + "step": 1740 + }, + { + "epoch": 1.2229210342417889, + "grad_norm": 0.022970452904701233, + "learning_rate": 1.5111111111111112e-05, + "loss": 0.1634, + "step": 1750 + }, + { + "epoch": 1.2299091544374563, + "grad_norm": 0.08840584754943848, + "learning_rate": 1.5083158630328443e-05, + "loss": 0.0062, + "step": 1760 + }, + { + "epoch": 1.2368972746331237, + "grad_norm": 0.0703321248292923, + "learning_rate": 1.5055206149545773e-05, + "loss": 0.0554, + "step": 1770 + }, + { + "epoch": 1.2438853948287911, + "grad_norm": 2.026752471923828, + "learning_rate": 1.5027253668763104e-05, + "loss": 0.03, + "step": 1780 + }, + { + "epoch": 1.2508735150244585, + "grad_norm": 0.08599984645843506, + "learning_rate": 1.4999301187980435e-05, + "loss": 0.0303, + "step": 1790 + }, + { + "epoch": 1.2578616352201257, + "grad_norm": 0.0998782068490982, + "learning_rate": 1.4971348707197765e-05, + "loss": 0.1389, + "step": 1800 + }, + { + "epoch": 1.2648497554157931, + "grad_norm": 1.7642574310302734, + "learning_rate": 1.4943396226415094e-05, + "loss": 0.0556, + "step": 1810 + }, + { + "epoch": 1.2718378756114606, + "grad_norm": 0.012091516517102718, + "learning_rate": 1.4915443745632425e-05, + "loss": 0.1116, + "step": 1820 + }, + { + "epoch": 1.2788259958071277, + "grad_norm": 0.030048305168747902, + "learning_rate": 1.4887491264849757e-05, + "loss": 0.0433, + "step": 1830 + }, + { + "epoch": 1.2858141160027952, + "grad_norm": 0.025840098038315773, + "learning_rate": 1.4859538784067087e-05, + "loss": 0.038, + "step": 1840 + }, + { + "epoch": 1.2928022361984626, + "grad_norm": 0.4607846736907959, + "learning_rate": 1.4831586303284418e-05, + "loss": 0.0815, + "step": 1850 + }, + { + "epoch": 1.29979035639413, + "grad_norm": 3.925401210784912, + "learning_rate": 1.4803633822501749e-05, + "loss": 0.0708, + "step": 1860 + }, + { + "epoch": 1.3067784765897974, + "grad_norm": 5.653714656829834, + "learning_rate": 1.4775681341719079e-05, + "loss": 0.0166, + "step": 1870 + }, + { + "epoch": 1.3137665967854648, + "grad_norm": 0.02135496586561203, + "learning_rate": 1.474772886093641e-05, + "loss": 0.0782, + "step": 1880 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 0.020742664113640785, + "learning_rate": 1.471977638015374e-05, + "loss": 0.0048, + "step": 1890 + }, + { + "epoch": 1.3277428371767994, + "grad_norm": 0.02374901808798313, + "learning_rate": 1.4691823899371069e-05, + "loss": 0.081, + "step": 1900 + }, + { + "epoch": 1.3347309573724668, + "grad_norm": 0.03886565566062927, + "learning_rate": 1.46638714185884e-05, + "loss": 0.005, + "step": 1910 + }, + { + "epoch": 1.3417190775681342, + "grad_norm": 0.16429579257965088, + "learning_rate": 1.463591893780573e-05, + "loss": 0.0154, + "step": 1920 + }, + { + "epoch": 1.3487071977638014, + "grad_norm": 0.15556152164936066, + "learning_rate": 1.4607966457023063e-05, + "loss": 0.0659, + "step": 1930 + }, + { + "epoch": 1.3556953179594688, + "grad_norm": 0.022907955572009087, + "learning_rate": 1.4580013976240393e-05, + "loss": 0.068, + "step": 1940 + }, + { + "epoch": 1.3626834381551363, + "grad_norm": 0.028623567894101143, + "learning_rate": 1.4552061495457724e-05, + "loss": 0.0023, + "step": 1950 + }, + { + "epoch": 1.3696715583508037, + "grad_norm": 7.010847091674805, + "learning_rate": 1.4524109014675054e-05, + "loss": 0.147, + "step": 1960 + }, + { + "epoch": 1.376659678546471, + "grad_norm": 0.03581179678440094, + "learning_rate": 1.4496156533892385e-05, + "loss": 0.1209, + "step": 1970 + }, + { + "epoch": 1.3836477987421385, + "grad_norm": 0.019916867837309837, + "learning_rate": 1.4468204053109715e-05, + "loss": 0.0233, + "step": 1980 + }, + { + "epoch": 1.3906359189378057, + "grad_norm": 10.045747756958008, + "learning_rate": 1.4440251572327044e-05, + "loss": 0.0416, + "step": 1990 + }, + { + "epoch": 1.397624039133473, + "grad_norm": 0.04035484418272972, + "learning_rate": 1.4412299091544375e-05, + "loss": 0.0265, + "step": 2000 + }, + { + "epoch": 1.4046121593291405, + "grad_norm": 0.7727859616279602, + "learning_rate": 1.4384346610761705e-05, + "loss": 0.0331, + "step": 2010 + }, + { + "epoch": 1.4116002795248077, + "grad_norm": 0.018749171867966652, + "learning_rate": 1.4356394129979036e-05, + "loss": 0.0749, + "step": 2020 + }, + { + "epoch": 1.4185883997204751, + "grad_norm": 5.582619667053223, + "learning_rate": 1.4328441649196368e-05, + "loss": 0.0622, + "step": 2030 + }, + { + "epoch": 1.4255765199161425, + "grad_norm": 0.03347029909491539, + "learning_rate": 1.4300489168413699e-05, + "loss": 0.0021, + "step": 2040 + }, + { + "epoch": 1.43256464011181, + "grad_norm": 0.1696143001317978, + "learning_rate": 1.427253668763103e-05, + "loss": 0.0021, + "step": 2050 + }, + { + "epoch": 1.4395527603074774, + "grad_norm": 0.021968642249703407, + "learning_rate": 1.424458420684836e-05, + "loss": 0.0025, + "step": 2060 + }, + { + "epoch": 1.4465408805031448, + "grad_norm": 0.007907007820904255, + "learning_rate": 1.421663172606569e-05, + "loss": 0.0478, + "step": 2070 + }, + { + "epoch": 1.453529000698812, + "grad_norm": 0.035767070949077606, + "learning_rate": 1.418867924528302e-05, + "loss": 0.0012, + "step": 2080 + }, + { + "epoch": 1.4605171208944794, + "grad_norm": 0.011390830390155315, + "learning_rate": 1.416072676450035e-05, + "loss": 0.1533, + "step": 2090 + }, + { + "epoch": 1.4675052410901468, + "grad_norm": 7.848016738891602, + "learning_rate": 1.413277428371768e-05, + "loss": 0.0675, + "step": 2100 + }, + { + "epoch": 1.474493361285814, + "grad_norm": 0.011802544817328453, + "learning_rate": 1.4104821802935011e-05, + "loss": 0.0261, + "step": 2110 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 2.9148340225219727, + "learning_rate": 1.4076869322152342e-05, + "loss": 0.2831, + "step": 2120 + }, + { + "epoch": 1.4884696016771488, + "grad_norm": 0.08199399709701538, + "learning_rate": 1.4048916841369674e-05, + "loss": 0.0088, + "step": 2130 + }, + { + "epoch": 1.4954577218728162, + "grad_norm": 0.01199902594089508, + "learning_rate": 1.4020964360587004e-05, + "loss": 0.0693, + "step": 2140 + }, + { + "epoch": 1.5024458420684836, + "grad_norm": 0.07146008312702179, + "learning_rate": 1.3993011879804335e-05, + "loss": 0.0746, + "step": 2150 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 0.02118564024567604, + "learning_rate": 1.3965059399021666e-05, + "loss": 0.003, + "step": 2160 + }, + { + "epoch": 1.5164220824598185, + "grad_norm": 0.016972247511148453, + "learning_rate": 1.3937106918238994e-05, + "loss": 0.0106, + "step": 2170 + }, + { + "epoch": 1.5234102026554857, + "grad_norm": 0.023427631705999374, + "learning_rate": 1.3909154437456325e-05, + "loss": 0.0444, + "step": 2180 + }, + { + "epoch": 1.530398322851153, + "grad_norm": 0.03280281275510788, + "learning_rate": 1.3881201956673656e-05, + "loss": 0.0026, + "step": 2190 + }, + { + "epoch": 1.5373864430468203, + "grad_norm": 0.06073632091283798, + "learning_rate": 1.3853249475890986e-05, + "loss": 0.194, + "step": 2200 + }, + { + "epoch": 1.5443745632424877, + "grad_norm": 0.008114098571240902, + "learning_rate": 1.3825296995108317e-05, + "loss": 0.0016, + "step": 2210 + }, + { + "epoch": 1.551362683438155, + "grad_norm": 0.0467991828918457, + "learning_rate": 1.3797344514325647e-05, + "loss": 0.1305, + "step": 2220 + }, + { + "epoch": 1.5583508036338225, + "grad_norm": 0.06984475255012512, + "learning_rate": 1.376939203354298e-05, + "loss": 0.1386, + "step": 2230 + }, + { + "epoch": 1.56533892382949, + "grad_norm": 0.014983494766056538, + "learning_rate": 1.374143955276031e-05, + "loss": 0.1399, + "step": 2240 + }, + { + "epoch": 1.5723270440251573, + "grad_norm": 0.01568700559437275, + "learning_rate": 1.3713487071977637e-05, + "loss": 0.1084, + "step": 2250 + }, + { + "epoch": 1.5793151642208247, + "grad_norm": 10.549098014831543, + "learning_rate": 1.368553459119497e-05, + "loss": 0.0082, + "step": 2260 + }, + { + "epoch": 1.586303284416492, + "grad_norm": 0.02495586685836315, + "learning_rate": 1.36575821104123e-05, + "loss": 0.0019, + "step": 2270 + }, + { + "epoch": 1.5932914046121593, + "grad_norm": 0.01876898854970932, + "learning_rate": 1.362962962962963e-05, + "loss": 0.1002, + "step": 2280 + }, + { + "epoch": 1.6002795248078265, + "grad_norm": 0.044464971870183945, + "learning_rate": 1.3601677148846961e-05, + "loss": 0.0019, + "step": 2290 + }, + { + "epoch": 1.607267645003494, + "grad_norm": 0.04108593240380287, + "learning_rate": 1.3573724668064292e-05, + "loss": 0.0753, + "step": 2300 + }, + { + "epoch": 1.6142557651991614, + "grad_norm": 7.812976837158203, + "learning_rate": 1.3545772187281622e-05, + "loss": 0.2875, + "step": 2310 + }, + { + "epoch": 1.6212438853948288, + "grad_norm": 4.261682987213135, + "learning_rate": 1.3517819706498953e-05, + "loss": 0.1059, + "step": 2320 + }, + { + "epoch": 1.6282320055904962, + "grad_norm": 0.03309008479118347, + "learning_rate": 1.3489867225716285e-05, + "loss": 0.0051, + "step": 2330 + }, + { + "epoch": 1.6352201257861636, + "grad_norm": 9.983598709106445, + "learning_rate": 1.3461914744933612e-05, + "loss": 0.0249, + "step": 2340 + }, + { + "epoch": 1.642208245981831, + "grad_norm": 4.973055839538574, + "learning_rate": 1.3433962264150943e-05, + "loss": 0.1441, + "step": 2350 + }, + { + "epoch": 1.6491963661774982, + "grad_norm": 9.258291244506836, + "learning_rate": 1.3406009783368275e-05, + "loss": 0.1151, + "step": 2360 + }, + { + "epoch": 1.6561844863731656, + "grad_norm": 0.015135078690946102, + "learning_rate": 1.3378057302585606e-05, + "loss": 0.0458, + "step": 2370 + }, + { + "epoch": 1.663172606568833, + "grad_norm": 0.027922656387090683, + "learning_rate": 1.3350104821802936e-05, + "loss": 0.012, + "step": 2380 + }, + { + "epoch": 1.6701607267645002, + "grad_norm": 0.043312493711709976, + "learning_rate": 1.3322152341020267e-05, + "loss": 0.1028, + "step": 2390 + }, + { + "epoch": 1.6771488469601676, + "grad_norm": 0.012087292037904263, + "learning_rate": 1.3294199860237597e-05, + "loss": 0.0475, + "step": 2400 + }, + { + "epoch": 1.684136967155835, + "grad_norm": 0.5556979775428772, + "learning_rate": 1.3266247379454928e-05, + "loss": 0.0142, + "step": 2410 + }, + { + "epoch": 1.6911250873515025, + "grad_norm": 5.196042537689209, + "learning_rate": 1.3238294898672259e-05, + "loss": 0.1002, + "step": 2420 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 0.010861317627131939, + "learning_rate": 1.3210342417889587e-05, + "loss": 0.0953, + "step": 2430 + }, + { + "epoch": 1.7051013277428373, + "grad_norm": 0.017955880612134933, + "learning_rate": 1.3182389937106918e-05, + "loss": 0.0642, + "step": 2440 + }, + { + "epoch": 1.7120894479385047, + "grad_norm": 0.028966449201107025, + "learning_rate": 1.3154437456324249e-05, + "loss": 0.1619, + "step": 2450 + }, + { + "epoch": 1.719077568134172, + "grad_norm": 0.8712167739868164, + "learning_rate": 1.312648497554158e-05, + "loss": 0.0324, + "step": 2460 + }, + { + "epoch": 1.7260656883298393, + "grad_norm": 0.2874550521373749, + "learning_rate": 1.3098532494758911e-05, + "loss": 0.026, + "step": 2470 + }, + { + "epoch": 1.7330538085255065, + "grad_norm": 0.010579444468021393, + "learning_rate": 1.3070580013976242e-05, + "loss": 0.0048, + "step": 2480 + }, + { + "epoch": 1.740041928721174, + "grad_norm": 0.15773366391658783, + "learning_rate": 1.3042627533193573e-05, + "loss": 0.108, + "step": 2490 + }, + { + "epoch": 1.7470300489168413, + "grad_norm": 0.10269400477409363, + "learning_rate": 1.3014675052410903e-05, + "loss": 0.0884, + "step": 2500 + }, + { + "epoch": 1.7540181691125087, + "grad_norm": 0.012210669927299023, + "learning_rate": 1.2986722571628234e-05, + "loss": 0.0025, + "step": 2510 + }, + { + "epoch": 1.7610062893081762, + "grad_norm": 0.046242229640483856, + "learning_rate": 1.2958770090845563e-05, + "loss": 0.0124, + "step": 2520 + }, + { + "epoch": 1.7679944095038436, + "grad_norm": 12.532265663146973, + "learning_rate": 1.2930817610062893e-05, + "loss": 0.1502, + "step": 2530 + }, + { + "epoch": 1.774982529699511, + "grad_norm": 0.018337048590183258, + "learning_rate": 1.2902865129280224e-05, + "loss": 0.0402, + "step": 2540 + }, + { + "epoch": 1.7819706498951782, + "grad_norm": 0.9082455039024353, + "learning_rate": 1.2874912648497554e-05, + "loss": 0.0719, + "step": 2550 + }, + { + "epoch": 1.7889587700908456, + "grad_norm": 0.6127817034721375, + "learning_rate": 1.2846960167714887e-05, + "loss": 0.011, + "step": 2560 + }, + { + "epoch": 1.7959468902865128, + "grad_norm": 0.02741195261478424, + "learning_rate": 1.2819007686932217e-05, + "loss": 0.0436, + "step": 2570 + }, + { + "epoch": 1.8029350104821802, + "grad_norm": 0.012224216014146805, + "learning_rate": 1.2791055206149548e-05, + "loss": 0.0026, + "step": 2580 + }, + { + "epoch": 1.8099231306778476, + "grad_norm": 0.01689975894987583, + "learning_rate": 1.2763102725366878e-05, + "loss": 0.0067, + "step": 2590 + }, + { + "epoch": 1.816911250873515, + "grad_norm": 0.010701341554522514, + "learning_rate": 1.2735150244584207e-05, + "loss": 0.008, + "step": 2600 + }, + { + "epoch": 1.8238993710691824, + "grad_norm": 0.12163686007261276, + "learning_rate": 1.2707197763801538e-05, + "loss": 0.0767, + "step": 2610 + }, + { + "epoch": 1.8308874912648498, + "grad_norm": 0.07165365666151047, + "learning_rate": 1.2679245283018868e-05, + "loss": 0.0028, + "step": 2620 + }, + { + "epoch": 1.8378756114605173, + "grad_norm": 0.007849736139178276, + "learning_rate": 1.2651292802236199e-05, + "loss": 0.0488, + "step": 2630 + }, + { + "epoch": 1.8448637316561844, + "grad_norm": 0.01707429252564907, + "learning_rate": 1.262334032145353e-05, + "loss": 0.0022, + "step": 2640 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 6.567415237426758, + "learning_rate": 1.259538784067086e-05, + "loss": 0.0677, + "step": 2650 + }, + { + "epoch": 1.858839972047519, + "grad_norm": 0.005324068479239941, + "learning_rate": 1.2567435359888192e-05, + "loss": 0.0057, + "step": 2660 + }, + { + "epoch": 1.8658280922431865, + "grad_norm": 11.791135787963867, + "learning_rate": 1.2539482879105523e-05, + "loss": 0.0764, + "step": 2670 + }, + { + "epoch": 1.8728162124388539, + "grad_norm": 0.005892631132155657, + "learning_rate": 1.2511530398322853e-05, + "loss": 0.0736, + "step": 2680 + }, + { + "epoch": 1.8798043326345213, + "grad_norm": 0.04005246236920357, + "learning_rate": 1.2483577917540182e-05, + "loss": 0.0617, + "step": 2690 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.010624396614730358, + "learning_rate": 1.2455625436757513e-05, + "loss": 0.0011, + "step": 2700 + }, + { + "epoch": 1.8937805730258561, + "grad_norm": 0.01761520840227604, + "learning_rate": 1.2427672955974843e-05, + "loss": 0.1574, + "step": 2710 + }, + { + "epoch": 1.9007686932215235, + "grad_norm": 0.05212152749300003, + "learning_rate": 1.2399720475192174e-05, + "loss": 0.0618, + "step": 2720 + }, + { + "epoch": 1.9077568134171907, + "grad_norm": 0.012297810055315495, + "learning_rate": 1.2371767994409504e-05, + "loss": 0.0802, + "step": 2730 + }, + { + "epoch": 1.9147449336128581, + "grad_norm": 1.4671796560287476, + "learning_rate": 1.2343815513626835e-05, + "loss": 0.0351, + "step": 2740 + }, + { + "epoch": 1.9217330538085255, + "grad_norm": 1.28997802734375, + "learning_rate": 1.2315863032844166e-05, + "loss": 0.0069, + "step": 2750 + }, + { + "epoch": 1.9287211740041927, + "grad_norm": 0.15810908377170563, + "learning_rate": 1.2287910552061498e-05, + "loss": 0.0037, + "step": 2760 + }, + { + "epoch": 1.9357092941998602, + "grad_norm": 0.01939568482339382, + "learning_rate": 1.2259958071278828e-05, + "loss": 0.1375, + "step": 2770 + }, + { + "epoch": 1.9426974143955276, + "grad_norm": 0.023233506828546524, + "learning_rate": 1.2232005590496157e-05, + "loss": 0.0016, + "step": 2780 + }, + { + "epoch": 1.949685534591195, + "grad_norm": 13.808353424072266, + "learning_rate": 1.2204053109713488e-05, + "loss": 0.1251, + "step": 2790 + }, + { + "epoch": 1.9566736547868624, + "grad_norm": 0.007430393248796463, + "learning_rate": 1.2176100628930818e-05, + "loss": 0.0957, + "step": 2800 + }, + { + "epoch": 1.9636617749825298, + "grad_norm": 0.015328769572079182, + "learning_rate": 1.2148148148148149e-05, + "loss": 0.0019, + "step": 2810 + }, + { + "epoch": 1.9706498951781972, + "grad_norm": 0.05328751727938652, + "learning_rate": 1.212019566736548e-05, + "loss": 0.2193, + "step": 2820 + }, + { + "epoch": 1.9776380153738644, + "grad_norm": 0.05544736981391907, + "learning_rate": 1.209224318658281e-05, + "loss": 0.011, + "step": 2830 + }, + { + "epoch": 1.9846261355695318, + "grad_norm": 0.006596289575099945, + "learning_rate": 1.206429070580014e-05, + "loss": 0.0454, + "step": 2840 + }, + { + "epoch": 1.991614255765199, + "grad_norm": 7.7824015617370605, + "learning_rate": 1.2036338225017471e-05, + "loss": 0.024, + "step": 2850 + }, + { + "epoch": 1.9986023759608664, + "grad_norm": 0.024772530421614647, + "learning_rate": 1.2008385744234804e-05, + "loss": 0.0109, + "step": 2860 + }, + { + "epoch": 2.005590496156534, + "grad_norm": 0.026197150349617004, + "learning_rate": 1.198043326345213e-05, + "loss": 0.0362, + "step": 2870 + }, + { + "epoch": 2.0125786163522013, + "grad_norm": 18.572996139526367, + "learning_rate": 1.1952480782669463e-05, + "loss": 0.0935, + "step": 2880 + }, + { + "epoch": 2.0195667365478687, + "grad_norm": 0.17002998292446136, + "learning_rate": 1.1924528301886794e-05, + "loss": 0.0343, + "step": 2890 + }, + { + "epoch": 2.026554856743536, + "grad_norm": 0.029017208144068718, + "learning_rate": 1.1896575821104124e-05, + "loss": 0.0013, + "step": 2900 + }, + { + "epoch": 2.0335429769392035, + "grad_norm": 0.03208720684051514, + "learning_rate": 1.1868623340321455e-05, + "loss": 0.0595, + "step": 2910 + }, + { + "epoch": 2.040531097134871, + "grad_norm": 0.03342962637543678, + "learning_rate": 1.1840670859538785e-05, + "loss": 0.0025, + "step": 2920 + }, + { + "epoch": 2.047519217330538, + "grad_norm": 0.00888112373650074, + "learning_rate": 1.1812718378756116e-05, + "loss": 0.0007, + "step": 2930 + }, + { + "epoch": 2.0545073375262053, + "grad_norm": 0.14470069110393524, + "learning_rate": 1.1784765897973446e-05, + "loss": 0.1185, + "step": 2940 + }, + { + "epoch": 2.0614954577218727, + "grad_norm": 0.006698206998407841, + "learning_rate": 1.1756813417190777e-05, + "loss": 0.0027, + "step": 2950 + }, + { + "epoch": 2.06848357791754, + "grad_norm": 0.013084967620670795, + "learning_rate": 1.1728860936408106e-05, + "loss": 0.1188, + "step": 2960 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 4.599226951599121, + "learning_rate": 1.1700908455625436e-05, + "loss": 0.192, + "step": 2970 + }, + { + "epoch": 2.082459818308875, + "grad_norm": 0.04704582691192627, + "learning_rate": 1.1672955974842769e-05, + "loss": 0.0214, + "step": 2980 + }, + { + "epoch": 2.0894479385045424, + "grad_norm": 0.00998607650399208, + "learning_rate": 1.16450034940601e-05, + "loss": 0.0642, + "step": 2990 + }, + { + "epoch": 2.0964360587002098, + "grad_norm": 11.119775772094727, + "learning_rate": 1.161705101327743e-05, + "loss": 0.0075, + "step": 3000 + }, + { + "epoch": 2.103424178895877, + "grad_norm": 10.634467124938965, + "learning_rate": 1.158909853249476e-05, + "loss": 0.1313, + "step": 3010 + }, + { + "epoch": 2.1104122990915446, + "grad_norm": 0.05366509407758713, + "learning_rate": 1.1561146051712091e-05, + "loss": 0.0151, + "step": 3020 + }, + { + "epoch": 2.1174004192872116, + "grad_norm": 0.014267387799918652, + "learning_rate": 1.1533193570929421e-05, + "loss": 0.1542, + "step": 3030 + }, + { + "epoch": 2.124388539482879, + "grad_norm": 0.008031471632421017, + "learning_rate": 1.150524109014675e-05, + "loss": 0.0015, + "step": 3040 + }, + { + "epoch": 2.1313766596785464, + "grad_norm": 8.725931167602539, + "learning_rate": 1.1477288609364081e-05, + "loss": 0.0981, + "step": 3050 + }, + { + "epoch": 2.138364779874214, + "grad_norm": 0.010948359034955502, + "learning_rate": 1.1449336128581411e-05, + "loss": 0.0513, + "step": 3060 + }, + { + "epoch": 2.145352900069881, + "grad_norm": 0.22167250514030457, + "learning_rate": 1.1421383647798742e-05, + "loss": 0.0295, + "step": 3070 + }, + { + "epoch": 2.1523410202655486, + "grad_norm": 0.038010116666555405, + "learning_rate": 1.1393431167016074e-05, + "loss": 0.001, + "step": 3080 + }, + { + "epoch": 2.159329140461216, + "grad_norm": 0.014428159222006798, + "learning_rate": 1.1365478686233405e-05, + "loss": 0.0015, + "step": 3090 + }, + { + "epoch": 2.1663172606568835, + "grad_norm": 0.027289502322673798, + "learning_rate": 1.1337526205450735e-05, + "loss": 0.0036, + "step": 3100 + }, + { + "epoch": 2.1733053808525504, + "grad_norm": 1.7627710103988647, + "learning_rate": 1.1309573724668066e-05, + "loss": 0.0314, + "step": 3110 + }, + { + "epoch": 2.180293501048218, + "grad_norm": 0.006555517669767141, + "learning_rate": 1.1281621243885397e-05, + "loss": 0.0017, + "step": 3120 + }, + { + "epoch": 2.1872816212438853, + "grad_norm": 0.006436652038246393, + "learning_rate": 1.1253668763102725e-05, + "loss": 0.006, + "step": 3130 + }, + { + "epoch": 2.1942697414395527, + "grad_norm": 0.010117010213434696, + "learning_rate": 1.1225716282320056e-05, + "loss": 0.0127, + "step": 3140 + }, + { + "epoch": 2.20125786163522, + "grad_norm": 0.01584050804376602, + "learning_rate": 1.1197763801537387e-05, + "loss": 0.0011, + "step": 3150 + }, + { + "epoch": 2.2082459818308875, + "grad_norm": 3.1698734760284424, + "learning_rate": 1.1169811320754717e-05, + "loss": 0.0643, + "step": 3160 + }, + { + "epoch": 2.215234102026555, + "grad_norm": 20.464344024658203, + "learning_rate": 1.1141858839972048e-05, + "loss": 0.1347, + "step": 3170 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.04433240368962288, + "learning_rate": 1.111390635918938e-05, + "loss": 0.0079, + "step": 3180 + }, + { + "epoch": 2.2292103424178897, + "grad_norm": 0.04871518909931183, + "learning_rate": 1.108595387840671e-05, + "loss": 0.1057, + "step": 3190 + }, + { + "epoch": 2.236198462613557, + "grad_norm": 21.398836135864258, + "learning_rate": 1.1058001397624041e-05, + "loss": 0.0091, + "step": 3200 + }, + { + "epoch": 2.243186582809224, + "grad_norm": 0.023466596379876137, + "learning_rate": 1.1030048916841372e-05, + "loss": 0.0057, + "step": 3210 + }, + { + "epoch": 2.2501747030048915, + "grad_norm": 0.011096821166574955, + "learning_rate": 1.10020964360587e-05, + "loss": 0.0048, + "step": 3220 + }, + { + "epoch": 2.257162823200559, + "grad_norm": 7.414328098297119, + "learning_rate": 1.0974143955276031e-05, + "loss": 0.0135, + "step": 3230 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 0.03595978021621704, + "learning_rate": 1.0946191474493362e-05, + "loss": 0.001, + "step": 3240 + }, + { + "epoch": 2.2711390635918938, + "grad_norm": 17.89633560180664, + "learning_rate": 1.0918238993710692e-05, + "loss": 0.0867, + "step": 3250 + }, + { + "epoch": 2.278127183787561, + "grad_norm": 0.009794089011847973, + "learning_rate": 1.0890286512928023e-05, + "loss": 0.0324, + "step": 3260 + }, + { + "epoch": 2.2851153039832286, + "grad_norm": 0.04615940898656845, + "learning_rate": 1.0862334032145353e-05, + "loss": 0.0013, + "step": 3270 + }, + { + "epoch": 2.292103424178896, + "grad_norm": 0.14327336847782135, + "learning_rate": 1.0834381551362686e-05, + "loss": 0.0009, + "step": 3280 + }, + { + "epoch": 2.2990915443745634, + "grad_norm": 0.003527725813910365, + "learning_rate": 1.0806429070580016e-05, + "loss": 0.0751, + "step": 3290 + }, + { + "epoch": 2.3060796645702304, + "grad_norm": 0.013885865919291973, + "learning_rate": 1.0778476589797347e-05, + "loss": 0.0537, + "step": 3300 + }, + { + "epoch": 2.313067784765898, + "grad_norm": 0.01316741295158863, + "learning_rate": 1.0750524109014676e-05, + "loss": 0.0008, + "step": 3310 + }, + { + "epoch": 2.320055904961565, + "grad_norm": 0.0060685062780976295, + "learning_rate": 1.0722571628232006e-05, + "loss": 0.0673, + "step": 3320 + }, + { + "epoch": 2.3270440251572326, + "grad_norm": 0.021814968436956406, + "learning_rate": 1.0694619147449337e-05, + "loss": 0.0014, + "step": 3330 + }, + { + "epoch": 2.3340321453529, + "grad_norm": 0.013100036419928074, + "learning_rate": 1.0666666666666667e-05, + "loss": 0.0756, + "step": 3340 + }, + { + "epoch": 2.3410202655485675, + "grad_norm": 0.01045132428407669, + "learning_rate": 1.0638714185883998e-05, + "loss": 0.0459, + "step": 3350 + }, + { + "epoch": 2.348008385744235, + "grad_norm": 0.006095814984291792, + "learning_rate": 1.0610761705101328e-05, + "loss": 0.0551, + "step": 3360 + }, + { + "epoch": 2.3549965059399023, + "grad_norm": 0.011639703065156937, + "learning_rate": 1.0582809224318659e-05, + "loss": 0.0032, + "step": 3370 + }, + { + "epoch": 2.3619846261355697, + "grad_norm": 0.0661187469959259, + "learning_rate": 1.0554856743535991e-05, + "loss": 0.0009, + "step": 3380 + }, + { + "epoch": 2.368972746331237, + "grad_norm": 0.030025649815797806, + "learning_rate": 1.0526904262753318e-05, + "loss": 0.0096, + "step": 3390 + }, + { + "epoch": 2.375960866526904, + "grad_norm": 0.05209723860025406, + "learning_rate": 1.049895178197065e-05, + "loss": 0.1485, + "step": 3400 + }, + { + "epoch": 2.3829489867225715, + "grad_norm": 0.4318474233150482, + "learning_rate": 1.0470999301187981e-05, + "loss": 0.0011, + "step": 3410 + }, + { + "epoch": 2.389937106918239, + "grad_norm": 0.011814878322184086, + "learning_rate": 1.0443046820405312e-05, + "loss": 0.0019, + "step": 3420 + }, + { + "epoch": 2.3969252271139063, + "grad_norm": 27.78424644470215, + "learning_rate": 1.0415094339622642e-05, + "loss": 0.1376, + "step": 3430 + }, + { + "epoch": 2.4039133473095737, + "grad_norm": 0.27662497758865356, + "learning_rate": 1.0387141858839973e-05, + "loss": 0.0009, + "step": 3440 + }, + { + "epoch": 2.410901467505241, + "grad_norm": 0.0203730296343565, + "learning_rate": 1.0359189378057304e-05, + "loss": 0.001, + "step": 3450 + }, + { + "epoch": 2.4178895877009086, + "grad_norm": 0.005023602396249771, + "learning_rate": 1.0331236897274634e-05, + "loss": 0.0026, + "step": 3460 + }, + { + "epoch": 2.424877707896576, + "grad_norm": 0.005766034591943026, + "learning_rate": 1.0303284416491965e-05, + "loss": 0.0043, + "step": 3470 + }, + { + "epoch": 2.431865828092243, + "grad_norm": 0.0078241853043437, + "learning_rate": 1.0275331935709294e-05, + "loss": 0.0666, + "step": 3480 + }, + { + "epoch": 2.4388539482879104, + "grad_norm": 0.13950783014297485, + "learning_rate": 1.0247379454926624e-05, + "loss": 0.0299, + "step": 3490 + }, + { + "epoch": 2.4458420684835778, + "grad_norm": 0.03160367161035538, + "learning_rate": 1.0219426974143956e-05, + "loss": 0.003, + "step": 3500 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 0.01961313560605049, + "learning_rate": 1.0191474493361287e-05, + "loss": 0.0103, + "step": 3510 + }, + { + "epoch": 2.4598183088749126, + "grad_norm": 0.007672517094761133, + "learning_rate": 1.0163522012578618e-05, + "loss": 0.0037, + "step": 3520 + }, + { + "epoch": 2.46680642907058, + "grad_norm": 0.005141376983374357, + "learning_rate": 1.0135569531795948e-05, + "loss": 0.0005, + "step": 3530 + }, + { + "epoch": 2.4737945492662474, + "grad_norm": 0.027929285541176796, + "learning_rate": 1.0107617051013279e-05, + "loss": 0.0248, + "step": 3540 + }, + { + "epoch": 2.480782669461915, + "grad_norm": 1.3310028314590454, + "learning_rate": 1.007966457023061e-05, + "loss": 0.124, + "step": 3550 + }, + { + "epoch": 2.4877707896575822, + "grad_norm": 0.06481662392616272, + "learning_rate": 1.005171208944794e-05, + "loss": 0.0024, + "step": 3560 + }, + { + "epoch": 2.4947589098532497, + "grad_norm": 0.008162762969732285, + "learning_rate": 1.0023759608665269e-05, + "loss": 0.0017, + "step": 3570 + }, + { + "epoch": 2.501747030048917, + "grad_norm": 7.582877159118652, + "learning_rate": 9.995807127882601e-06, + "loss": 0.0838, + "step": 3580 + }, + { + "epoch": 2.508735150244584, + "grad_norm": 0.06521458923816681, + "learning_rate": 9.96785464709993e-06, + "loss": 0.0009, + "step": 3590 + }, + { + "epoch": 2.5157232704402515, + "grad_norm": 0.005363748874515295, + "learning_rate": 9.939902166317262e-06, + "loss": 0.0797, + "step": 3600 + }, + { + "epoch": 2.522711390635919, + "grad_norm": 0.007538496516644955, + "learning_rate": 9.911949685534593e-06, + "loss": 0.039, + "step": 3610 + }, + { + "epoch": 2.5296995108315863, + "grad_norm": 0.007935012690722942, + "learning_rate": 9.883997204751923e-06, + "loss": 0.0039, + "step": 3620 + }, + { + "epoch": 2.5366876310272537, + "grad_norm": 0.005138902924954891, + "learning_rate": 9.856044723969254e-06, + "loss": 0.0006, + "step": 3630 + }, + { + "epoch": 2.543675751222921, + "grad_norm": 0.012060786597430706, + "learning_rate": 9.828092243186583e-06, + "loss": 0.0013, + "step": 3640 + }, + { + "epoch": 2.5506638714185885, + "grad_norm": 0.09823895990848541, + "learning_rate": 9.800139762403915e-06, + "loss": 0.0319, + "step": 3650 + }, + { + "epoch": 2.5576519916142555, + "grad_norm": 0.009398439899086952, + "learning_rate": 9.772187281621245e-06, + "loss": 0.0012, + "step": 3660 + }, + { + "epoch": 2.564640111809923, + "grad_norm": 0.009316815994679928, + "learning_rate": 9.744234800838576e-06, + "loss": 0.0008, + "step": 3670 + }, + { + "epoch": 2.5716282320055903, + "grad_norm": 0.01164779718965292, + "learning_rate": 9.716282320055905e-06, + "loss": 0.0014, + "step": 3680 + }, + { + "epoch": 2.5786163522012577, + "grad_norm": 7.828653335571289, + "learning_rate": 9.688329839273235e-06, + "loss": 0.0452, + "step": 3690 + }, + { + "epoch": 2.585604472396925, + "grad_norm": 0.003899825969710946, + "learning_rate": 9.660377358490568e-06, + "loss": 0.0004, + "step": 3700 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 6.803162097930908, + "learning_rate": 9.632424877707898e-06, + "loss": 0.0892, + "step": 3710 + }, + { + "epoch": 2.59958071278826, + "grad_norm": 0.00998039823025465, + "learning_rate": 9.604472396925227e-06, + "loss": 0.123, + "step": 3720 + }, + { + "epoch": 2.6065688329839274, + "grad_norm": 0.0067862533032894135, + "learning_rate": 9.576519916142558e-06, + "loss": 0.001, + "step": 3730 + }, + { + "epoch": 2.613556953179595, + "grad_norm": 0.004215400665998459, + "learning_rate": 9.548567435359888e-06, + "loss": 0.177, + "step": 3740 + }, + { + "epoch": 2.620545073375262, + "grad_norm": 0.02502995729446411, + "learning_rate": 9.52061495457722e-06, + "loss": 0.008, + "step": 3750 + }, + { + "epoch": 2.6275331935709296, + "grad_norm": 5.30055046081543, + "learning_rate": 9.492662473794551e-06, + "loss": 0.1303, + "step": 3760 + }, + { + "epoch": 2.634521313766597, + "grad_norm": 0.01970675028860569, + "learning_rate": 9.46470999301188e-06, + "loss": 0.0012, + "step": 3770 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 0.004884845577180386, + "learning_rate": 9.43675751222921e-06, + "loss": 0.0307, + "step": 3780 + }, + { + "epoch": 2.6484975541579314, + "grad_norm": 0.08335088938474655, + "learning_rate": 9.408805031446541e-06, + "loss": 0.002, + "step": 3790 + }, + { + "epoch": 2.655485674353599, + "grad_norm": 0.012866518460214138, + "learning_rate": 9.380852550663873e-06, + "loss": 0.0006, + "step": 3800 + }, + { + "epoch": 2.6624737945492662, + "grad_norm": 0.024261610582470894, + "learning_rate": 9.352900069881202e-06, + "loss": 0.0012, + "step": 3810 + }, + { + "epoch": 2.6694619147449337, + "grad_norm": 0.00420184712857008, + "learning_rate": 9.324947589098533e-06, + "loss": 0.0076, + "step": 3820 + }, + { + "epoch": 2.676450034940601, + "grad_norm": 0.13258372247219086, + "learning_rate": 9.296995108315863e-06, + "loss": 0.0254, + "step": 3830 + }, + { + "epoch": 2.6834381551362685, + "grad_norm": 0.05237485095858574, + "learning_rate": 9.269042627533194e-06, + "loss": 0.2556, + "step": 3840 + }, + { + "epoch": 2.6904262753319355, + "grad_norm": 0.007042170502245426, + "learning_rate": 9.241090146750526e-06, + "loss": 0.0451, + "step": 3850 + }, + { + "epoch": 2.697414395527603, + "grad_norm": 0.0603499561548233, + "learning_rate": 9.213137665967855e-06, + "loss": 0.018, + "step": 3860 + }, + { + "epoch": 2.7044025157232703, + "grad_norm": 0.005806710571050644, + "learning_rate": 9.185185185185186e-06, + "loss": 0.0268, + "step": 3870 + }, + { + "epoch": 2.7113906359189377, + "grad_norm": 0.019446423277258873, + "learning_rate": 9.157232704402516e-06, + "loss": 0.0013, + "step": 3880 + }, + { + "epoch": 2.718378756114605, + "grad_norm": 0.046024855226278305, + "learning_rate": 9.129280223619847e-06, + "loss": 0.0094, + "step": 3890 + }, + { + "epoch": 2.7253668763102725, + "grad_norm": 0.2116568386554718, + "learning_rate": 9.101327742837177e-06, + "loss": 0.0079, + "step": 3900 + }, + { + "epoch": 2.73235499650594, + "grad_norm": 0.014955777674913406, + "learning_rate": 9.073375262054508e-06, + "loss": 0.0007, + "step": 3910 + }, + { + "epoch": 2.7393431167016074, + "grad_norm": 0.7381608486175537, + "learning_rate": 9.045422781271838e-06, + "loss": 0.0017, + "step": 3920 + }, + { + "epoch": 2.7463312368972748, + "grad_norm": 0.0042509473860263824, + "learning_rate": 9.017470300489169e-06, + "loss": 0.0328, + "step": 3930 + }, + { + "epoch": 2.753319357092942, + "grad_norm": 0.008610788732767105, + "learning_rate": 8.9895178197065e-06, + "loss": 0.0033, + "step": 3940 + }, + { + "epoch": 2.7603074772886096, + "grad_norm": 0.032417889684438705, + "learning_rate": 8.96156533892383e-06, + "loss": 0.0007, + "step": 3950 + }, + { + "epoch": 2.767295597484277, + "grad_norm": 0.003039875766262412, + "learning_rate": 8.93361285814116e-06, + "loss": 0.0006, + "step": 3960 + }, + { + "epoch": 2.774283717679944, + "grad_norm": 0.1048172265291214, + "learning_rate": 8.905660377358491e-06, + "loss": 0.0213, + "step": 3970 + }, + { + "epoch": 2.7812718378756114, + "grad_norm": 0.0031095799058675766, + "learning_rate": 8.877707896575822e-06, + "loss": 0.0358, + "step": 3980 + }, + { + "epoch": 2.788259958071279, + "grad_norm": 0.08362710475921631, + "learning_rate": 8.849755415793152e-06, + "loss": 0.0433, + "step": 3990 + }, + { + "epoch": 2.795248078266946, + "grad_norm": 0.006064527668058872, + "learning_rate": 8.821802935010483e-06, + "loss": 0.0009, + "step": 4000 + }, + { + "epoch": 2.8022361984626136, + "grad_norm": 0.009575744159519672, + "learning_rate": 8.793850454227814e-06, + "loss": 0.0003, + "step": 4010 + }, + { + "epoch": 2.809224318658281, + "grad_norm": 0.017887070775032043, + "learning_rate": 8.765897973445144e-06, + "loss": 0.0152, + "step": 4020 + }, + { + "epoch": 2.8162124388539485, + "grad_norm": 0.0015486754709854722, + "learning_rate": 8.737945492662475e-06, + "loss": 0.0692, + "step": 4030 + }, + { + "epoch": 2.8232005590496154, + "grad_norm": 0.00923781655728817, + "learning_rate": 8.709993011879805e-06, + "loss": 0.0006, + "step": 4040 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 14.54555892944336, + "learning_rate": 8.682040531097136e-06, + "loss": 0.0804, + "step": 4050 + }, + { + "epoch": 2.8371767994409502, + "grad_norm": 0.035101618617773056, + "learning_rate": 8.654088050314466e-06, + "loss": 0.0077, + "step": 4060 + }, + { + "epoch": 2.8441649196366177, + "grad_norm": 0.22155028581619263, + "learning_rate": 8.626135569531797e-06, + "loss": 0.0257, + "step": 4070 + }, + { + "epoch": 2.851153039832285, + "grad_norm": 0.013333957642316818, + "learning_rate": 8.598183088749128e-06, + "loss": 0.0033, + "step": 4080 + }, + { + "epoch": 2.8581411600279525, + "grad_norm": 0.029933998361229897, + "learning_rate": 8.570230607966458e-06, + "loss": 0.0018, + "step": 4090 + }, + { + "epoch": 2.86512928022362, + "grad_norm": 0.021358752623200417, + "learning_rate": 8.542278127183789e-06, + "loss": 0.0004, + "step": 4100 + }, + { + "epoch": 2.8721174004192873, + "grad_norm": 0.0044855596497654915, + "learning_rate": 8.51432564640112e-06, + "loss": 0.0007, + "step": 4110 + }, + { + "epoch": 2.8791055206149547, + "grad_norm": 4.667630672454834, + "learning_rate": 8.486373165618448e-06, + "loss": 0.0307, + "step": 4120 + }, + { + "epoch": 2.886093640810622, + "grad_norm": 0.0070539130829274654, + "learning_rate": 8.45842068483578e-06, + "loss": 0.0006, + "step": 4130 + }, + { + "epoch": 2.8930817610062896, + "grad_norm": 0.029348071664571762, + "learning_rate": 8.430468204053111e-06, + "loss": 0.0014, + "step": 4140 + }, + { + "epoch": 2.9000698812019565, + "grad_norm": 0.12070006877183914, + "learning_rate": 8.402515723270442e-06, + "loss": 0.0043, + "step": 4150 + }, + { + "epoch": 2.907058001397624, + "grad_norm": 0.00619628606364131, + "learning_rate": 8.37456324248777e-06, + "loss": 0.0004, + "step": 4160 + }, + { + "epoch": 2.9140461215932913, + "grad_norm": 15.53851318359375, + "learning_rate": 8.346610761705101e-06, + "loss": 0.0074, + "step": 4170 + }, + { + "epoch": 2.9210342417889588, + "grad_norm": 9.77151870727539, + "learning_rate": 8.318658280922433e-06, + "loss": 0.1214, + "step": 4180 + }, + { + "epoch": 2.928022361984626, + "grad_norm": 0.0042813620530068874, + "learning_rate": 8.290705800139764e-06, + "loss": 0.0853, + "step": 4190 + }, + { + "epoch": 2.9350104821802936, + "grad_norm": 0.0018544025951996446, + "learning_rate": 8.262753319357094e-06, + "loss": 0.0183, + "step": 4200 + }, + { + "epoch": 2.941998602375961, + "grad_norm": 0.0065402681939303875, + "learning_rate": 8.234800838574423e-06, + "loss": 0.0746, + "step": 4210 + }, + { + "epoch": 2.948986722571628, + "grad_norm": 0.0039556859992444515, + "learning_rate": 8.206848357791754e-06, + "loss": 0.0101, + "step": 4220 + }, + { + "epoch": 2.9559748427672954, + "grad_norm": 0.8577067255973816, + "learning_rate": 8.178895877009086e-06, + "loss": 0.0323, + "step": 4230 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.013186552561819553, + "learning_rate": 8.150943396226417e-06, + "loss": 0.0135, + "step": 4240 + }, + { + "epoch": 2.96995108315863, + "grad_norm": 0.008083418011665344, + "learning_rate": 8.122990915443745e-06, + "loss": 0.0133, + "step": 4250 + }, + { + "epoch": 2.9769392033542976, + "grad_norm": 13.401435852050781, + "learning_rate": 8.095038434661076e-06, + "loss": 0.0557, + "step": 4260 + }, + { + "epoch": 2.983927323549965, + "grad_norm": 0.00372733804397285, + "learning_rate": 8.067085953878407e-06, + "loss": 0.0125, + "step": 4270 + }, + { + "epoch": 2.9909154437456325, + "grad_norm": 0.21677538752555847, + "learning_rate": 8.039133473095739e-06, + "loss": 0.0026, + "step": 4280 + }, + { + "epoch": 2.9979035639413, + "grad_norm": 0.005610453896224499, + "learning_rate": 8.011180992313068e-06, + "loss": 0.0241, + "step": 4290 + }, + { + "epoch": 3.0048916841369673, + "grad_norm": 2.1690850257873535, + "learning_rate": 7.983228511530398e-06, + "loss": 0.0026, + "step": 4300 + }, + { + "epoch": 3.0118798043326347, + "grad_norm": 0.029843270778656006, + "learning_rate": 7.955276030747729e-06, + "loss": 0.0008, + "step": 4310 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 0.0014904079725965858, + "learning_rate": 7.92732354996506e-06, + "loss": 0.0224, + "step": 4320 + }, + { + "epoch": 3.025856044723969, + "grad_norm": 3.553492546081543, + "learning_rate": 7.899371069182392e-06, + "loss": 0.0045, + "step": 4330 + }, + { + "epoch": 3.0328441649196365, + "grad_norm": 12.98043155670166, + "learning_rate": 7.87141858839972e-06, + "loss": 0.0501, + "step": 4340 + }, + { + "epoch": 3.039832285115304, + "grad_norm": 0.0013916384195908904, + "learning_rate": 7.843466107617051e-06, + "loss": 0.0005, + "step": 4350 + }, + { + "epoch": 3.0468204053109713, + "grad_norm": 0.0015694483881816268, + "learning_rate": 7.815513626834382e-06, + "loss": 0.1447, + "step": 4360 + }, + { + "epoch": 3.0538085255066387, + "grad_norm": 0.0035848869010806084, + "learning_rate": 7.787561146051712e-06, + "loss": 0.0015, + "step": 4370 + }, + { + "epoch": 3.060796645702306, + "grad_norm": 0.005436725448817015, + "learning_rate": 7.759608665269043e-06, + "loss": 0.0619, + "step": 4380 + }, + { + "epoch": 3.0677847658979736, + "grad_norm": 0.01864534057676792, + "learning_rate": 7.731656184486373e-06, + "loss": 0.013, + "step": 4390 + }, + { + "epoch": 3.074772886093641, + "grad_norm": 0.004284723661839962, + "learning_rate": 7.703703703703704e-06, + "loss": 0.0883, + "step": 4400 + }, + { + "epoch": 3.0817610062893084, + "grad_norm": 0.06042485311627388, + "learning_rate": 7.675751222921035e-06, + "loss": 0.001, + "step": 4410 + }, + { + "epoch": 3.0887491264849753, + "grad_norm": 0.005046827718615532, + "learning_rate": 7.647798742138365e-06, + "loss": 0.0218, + "step": 4420 + }, + { + "epoch": 3.0957372466806428, + "grad_norm": 6.6962361335754395, + "learning_rate": 7.619846261355696e-06, + "loss": 0.0592, + "step": 4430 + }, + { + "epoch": 3.10272536687631, + "grad_norm": 0.021092260256409645, + "learning_rate": 7.591893780573026e-06, + "loss": 0.0005, + "step": 4440 + }, + { + "epoch": 3.1097134870719776, + "grad_norm": 0.009671575389802456, + "learning_rate": 7.563941299790357e-06, + "loss": 0.0008, + "step": 4450 + }, + { + "epoch": 3.116701607267645, + "grad_norm": 0.011312313377857208, + "learning_rate": 7.535988819007688e-06, + "loss": 0.1175, + "step": 4460 + }, + { + "epoch": 3.1236897274633124, + "grad_norm": 0.009435279294848442, + "learning_rate": 7.508036338225018e-06, + "loss": 0.2328, + "step": 4470 + }, + { + "epoch": 3.13067784765898, + "grad_norm": 0.014669501222670078, + "learning_rate": 7.4800838574423485e-06, + "loss": 0.0003, + "step": 4480 + }, + { + "epoch": 3.1376659678546472, + "grad_norm": 0.005058380775153637, + "learning_rate": 7.452131376659679e-06, + "loss": 0.0698, + "step": 4490 + }, + { + "epoch": 3.1446540880503147, + "grad_norm": 0.004547697491943836, + "learning_rate": 7.42417889587701e-06, + "loss": 0.0962, + "step": 4500 + }, + { + "epoch": 3.1516422082459816, + "grad_norm": 0.030391795560717583, + "learning_rate": 7.396226415094339e-06, + "loss": 0.0011, + "step": 4510 + }, + { + "epoch": 3.158630328441649, + "grad_norm": 0.00580151192843914, + "learning_rate": 7.368273934311671e-06, + "loss": 0.0006, + "step": 4520 + }, + { + "epoch": 3.1656184486373165, + "grad_norm": 0.08100881427526474, + "learning_rate": 7.340321453529001e-06, + "loss": 0.11, + "step": 4530 + }, + { + "epoch": 3.172606568832984, + "grad_norm": 0.17724710702896118, + "learning_rate": 7.312368972746332e-06, + "loss": 0.0141, + "step": 4540 + }, + { + "epoch": 3.1795946890286513, + "grad_norm": 0.015520211309194565, + "learning_rate": 7.2844164919636625e-06, + "loss": 0.0051, + "step": 4550 + }, + { + "epoch": 3.1865828092243187, + "grad_norm": 0.06920566409826279, + "learning_rate": 7.256464011180992e-06, + "loss": 0.002, + "step": 4560 + }, + { + "epoch": 3.193570929419986, + "grad_norm": 0.036474499851465225, + "learning_rate": 7.228511530398324e-06, + "loss": 0.0007, + "step": 4570 + }, + { + "epoch": 3.2005590496156535, + "grad_norm": 0.0034632752649486065, + "learning_rate": 7.200559049615654e-06, + "loss": 0.0201, + "step": 4580 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 0.01172909326851368, + "learning_rate": 7.172606568832985e-06, + "loss": 0.0072, + "step": 4590 + }, + { + "epoch": 3.214535290006988, + "grad_norm": 0.010598461143672466, + "learning_rate": 7.1446540880503145e-06, + "loss": 0.0035, + "step": 4600 + }, + { + "epoch": 3.2215234102026553, + "grad_norm": 0.0054387301206588745, + "learning_rate": 7.116701607267645e-06, + "loss": 0.0021, + "step": 4610 + }, + { + "epoch": 3.2285115303983227, + "grad_norm": 0.004587017931044102, + "learning_rate": 7.0887491264849764e-06, + "loss": 0.0556, + "step": 4620 + }, + { + "epoch": 3.23549965059399, + "grad_norm": 0.541749894618988, + "learning_rate": 7.060796645702307e-06, + "loss": 0.0277, + "step": 4630 + }, + { + "epoch": 3.2424877707896576, + "grad_norm": 0.020434614270925522, + "learning_rate": 7.032844164919638e-06, + "loss": 0.0006, + "step": 4640 + }, + { + "epoch": 3.249475890985325, + "grad_norm": 0.08088918775320053, + "learning_rate": 7.004891684136967e-06, + "loss": 0.0017, + "step": 4650 + }, + { + "epoch": 3.2564640111809924, + "grad_norm": 0.00651569152250886, + "learning_rate": 6.976939203354298e-06, + "loss": 0.0048, + "step": 4660 + }, + { + "epoch": 3.26345213137666, + "grad_norm": 0.012207509018480778, + "learning_rate": 6.948986722571629e-06, + "loss": 0.0527, + "step": 4670 + }, + { + "epoch": 3.270440251572327, + "grad_norm": 0.40066081285476685, + "learning_rate": 6.92103424178896e-06, + "loss": 0.0013, + "step": 4680 + }, + { + "epoch": 3.2774283717679946, + "grad_norm": 0.037062227725982666, + "learning_rate": 6.8930817610062896e-06, + "loss": 0.0767, + "step": 4690 + }, + { + "epoch": 3.2844164919636616, + "grad_norm": 0.002083055442199111, + "learning_rate": 6.86512928022362e-06, + "loss": 0.0313, + "step": 4700 + }, + { + "epoch": 3.291404612159329, + "grad_norm": 0.01235890295356512, + "learning_rate": 6.837176799440951e-06, + "loss": 0.0003, + "step": 4710 + }, + { + "epoch": 3.2983927323549964, + "grad_norm": 0.0049998946487903595, + "learning_rate": 6.809224318658282e-06, + "loss": 0.0005, + "step": 4720 + }, + { + "epoch": 3.305380852550664, + "grad_norm": 0.017238592728972435, + "learning_rate": 6.781271837875612e-06, + "loss": 0.0005, + "step": 4730 + }, + { + "epoch": 3.3123689727463312, + "grad_norm": 0.0085734399035573, + "learning_rate": 6.753319357092942e-06, + "loss": 0.0008, + "step": 4740 + }, + { + "epoch": 3.3193570929419987, + "grad_norm": 0.02260555699467659, + "learning_rate": 6.725366876310273e-06, + "loss": 0.0006, + "step": 4750 + }, + { + "epoch": 3.326345213137666, + "grad_norm": 0.006640274077653885, + "learning_rate": 6.6974143955276035e-06, + "loss": 0.0547, + "step": 4760 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.006710624787956476, + "learning_rate": 6.669461914744935e-06, + "loss": 0.0517, + "step": 4770 + }, + { + "epoch": 3.3403214535290005, + "grad_norm": 0.015057160519063473, + "learning_rate": 6.641509433962265e-06, + "loss": 0.0007, + "step": 4780 + }, + { + "epoch": 3.347309573724668, + "grad_norm": 0.01974908635020256, + "learning_rate": 6.613556953179595e-06, + "loss": 0.0005, + "step": 4790 + }, + { + "epoch": 3.3542976939203353, + "grad_norm": 0.005637224763631821, + "learning_rate": 6.585604472396926e-06, + "loss": 0.0327, + "step": 4800 + }, + { + "epoch": 3.3612858141160027, + "grad_norm": 0.0025775651447474957, + "learning_rate": 6.557651991614256e-06, + "loss": 0.0791, + "step": 4810 + }, + { + "epoch": 3.36827393431167, + "grad_norm": 0.005838762037456036, + "learning_rate": 6.529699510831586e-06, + "loss": 0.0004, + "step": 4820 + }, + { + "epoch": 3.3752620545073375, + "grad_norm": 0.3478299379348755, + "learning_rate": 6.5017470300489175e-06, + "loss": 0.0011, + "step": 4830 + }, + { + "epoch": 3.382250174703005, + "grad_norm": 0.005249143578112125, + "learning_rate": 6.473794549266248e-06, + "loss": 0.0006, + "step": 4840 + }, + { + "epoch": 3.3892382948986723, + "grad_norm": 1.6451876163482666, + "learning_rate": 6.445842068483579e-06, + "loss": 0.0598, + "step": 4850 + }, + { + "epoch": 3.3962264150943398, + "grad_norm": 0.005776327569037676, + "learning_rate": 6.417889587700909e-06, + "loss": 0.0008, + "step": 4860 + }, + { + "epoch": 3.403214535290007, + "grad_norm": 0.0033416105434298515, + "learning_rate": 6.389937106918239e-06, + "loss": 0.0572, + "step": 4870 + }, + { + "epoch": 3.4102026554856746, + "grad_norm": 0.002429103944450617, + "learning_rate": 6.36198462613557e-06, + "loss": 0.1072, + "step": 4880 + }, + { + "epoch": 3.4171907756813416, + "grad_norm": 8.708498001098633, + "learning_rate": 6.334032145352901e-06, + "loss": 0.1464, + "step": 4890 + }, + { + "epoch": 3.424178895877009, + "grad_norm": 0.0021170196123421192, + "learning_rate": 6.3060796645702315e-06, + "loss": 0.0004, + "step": 4900 + }, + { + "epoch": 3.4311670160726764, + "grad_norm": 0.012093136087059975, + "learning_rate": 6.278127183787561e-06, + "loss": 0.0619, + "step": 4910 + }, + { + "epoch": 3.438155136268344, + "grad_norm": 0.005036477465182543, + "learning_rate": 6.250174703004892e-06, + "loss": 0.0004, + "step": 4920 + }, + { + "epoch": 3.445143256464011, + "grad_norm": 0.004095244687050581, + "learning_rate": 6.222222222222223e-06, + "loss": 0.0004, + "step": 4930 + }, + { + "epoch": 3.4521313766596786, + "grad_norm": 0.019728029146790504, + "learning_rate": 6.194269741439554e-06, + "loss": 0.0406, + "step": 4940 + }, + { + "epoch": 3.459119496855346, + "grad_norm": 0.025431908667087555, + "learning_rate": 6.1663172606568835e-06, + "loss": 0.0018, + "step": 4950 + }, + { + "epoch": 3.4661076170510134, + "grad_norm": 0.0277637280523777, + "learning_rate": 6.138364779874214e-06, + "loss": 0.0029, + "step": 4960 + }, + { + "epoch": 3.4730957372466804, + "grad_norm": 0.4063994288444519, + "learning_rate": 6.110412299091545e-06, + "loss": 0.0864, + "step": 4970 + }, + { + "epoch": 3.480083857442348, + "grad_norm": 0.06079300493001938, + "learning_rate": 6.082459818308876e-06, + "loss": 0.003, + "step": 4980 + }, + { + "epoch": 3.4870719776380152, + "grad_norm": 30.48621368408203, + "learning_rate": 6.0545073375262066e-06, + "loss": 0.0506, + "step": 4990 + }, + { + "epoch": 3.4940600978336827, + "grad_norm": 0.0112813301384449, + "learning_rate": 6.026554856743536e-06, + "loss": 0.0003, + "step": 5000 + }, + { + "epoch": 3.50104821802935, + "grad_norm": 0.005831410177052021, + "learning_rate": 5.998602375960867e-06, + "loss": 0.0006, + "step": 5010 + }, + { + "epoch": 3.5080363382250175, + "grad_norm": 0.0054182917810976505, + "learning_rate": 5.970649895178197e-06, + "loss": 0.0004, + "step": 5020 + }, + { + "epoch": 3.515024458420685, + "grad_norm": 0.12203137576580048, + "learning_rate": 5.942697414395529e-06, + "loss": 0.1176, + "step": 5030 + }, + { + "epoch": 3.5220125786163523, + "grad_norm": 0.002840982051566243, + "learning_rate": 5.9147449336128585e-06, + "loss": 0.0133, + "step": 5040 + }, + { + "epoch": 3.5290006988120197, + "grad_norm": 0.026787206530570984, + "learning_rate": 5.886792452830189e-06, + "loss": 0.089, + "step": 5050 + }, + { + "epoch": 3.535988819007687, + "grad_norm": 0.06147437170147896, + "learning_rate": 5.85883997204752e-06, + "loss": 0.0035, + "step": 5060 + }, + { + "epoch": 3.5429769392033545, + "grad_norm": 9.293349266052246, + "learning_rate": 5.83088749126485e-06, + "loss": 0.0678, + "step": 5070 + }, + { + "epoch": 3.5499650593990215, + "grad_norm": 0.006805673241615295, + "learning_rate": 5.80293501048218e-06, + "loss": 0.0056, + "step": 5080 + }, + { + "epoch": 3.556953179594689, + "grad_norm": 0.006203502882272005, + "learning_rate": 5.774982529699511e-06, + "loss": 0.06, + "step": 5090 + }, + { + "epoch": 3.5639412997903563, + "grad_norm": 9.436474800109863, + "learning_rate": 5.747030048916842e-06, + "loss": 0.1251, + "step": 5100 + }, + { + "epoch": 3.5709294199860238, + "grad_norm": 0.007246874738484621, + "learning_rate": 5.7190775681341725e-06, + "loss": 0.0004, + "step": 5110 + }, + { + "epoch": 3.577917540181691, + "grad_norm": 0.0030432320199906826, + "learning_rate": 5.691125087351503e-06, + "loss": 0.0005, + "step": 5120 + }, + { + "epoch": 3.5849056603773586, + "grad_norm": 0.011605805717408657, + "learning_rate": 5.663172606568833e-06, + "loss": 0.0247, + "step": 5130 + }, + { + "epoch": 3.591893780573026, + "grad_norm": 0.06202976033091545, + "learning_rate": 5.635220125786164e-06, + "loss": 0.0597, + "step": 5140 + }, + { + "epoch": 3.598881900768693, + "grad_norm": 0.0037594842724502087, + "learning_rate": 5.607267645003495e-06, + "loss": 0.0011, + "step": 5150 + }, + { + "epoch": 3.6058700209643604, + "grad_norm": 0.006377144251018763, + "learning_rate": 5.579315164220825e-06, + "loss": 0.0016, + "step": 5160 + }, + { + "epoch": 3.612858141160028, + "grad_norm": 0.006404062733054161, + "learning_rate": 5.551362683438155e-06, + "loss": 0.1364, + "step": 5170 + }, + { + "epoch": 3.619846261355695, + "grad_norm": 0.00531811686232686, + "learning_rate": 5.523410202655486e-06, + "loss": 0.0012, + "step": 5180 + }, + { + "epoch": 3.6268343815513626, + "grad_norm": 0.002592286095023155, + "learning_rate": 5.495457721872817e-06, + "loss": 0.0004, + "step": 5190 + }, + { + "epoch": 3.63382250174703, + "grad_norm": 0.013801293447613716, + "learning_rate": 5.467505241090148e-06, + "loss": 0.0106, + "step": 5200 + }, + { + "epoch": 3.6408106219426974, + "grad_norm": 0.0015551424585282803, + "learning_rate": 5.439552760307478e-06, + "loss": 0.0004, + "step": 5210 + }, + { + "epoch": 3.647798742138365, + "grad_norm": 0.013982684351503849, + "learning_rate": 5.411600279524808e-06, + "loss": 0.0004, + "step": 5220 + }, + { + "epoch": 3.6547868623340323, + "grad_norm": 0.007577819749712944, + "learning_rate": 5.3836477987421385e-06, + "loss": 0.0201, + "step": 5230 + }, + { + "epoch": 3.6617749825296997, + "grad_norm": 0.0024994234554469585, + "learning_rate": 5.35569531795947e-06, + "loss": 0.0014, + "step": 5240 + }, + { + "epoch": 3.668763102725367, + "grad_norm": 0.303627073764801, + "learning_rate": 5.3277428371768004e-06, + "loss": 0.0006, + "step": 5250 + }, + { + "epoch": 3.6757512229210345, + "grad_norm": 0.0030300321523100138, + "learning_rate": 5.29979035639413e-06, + "loss": 0.0011, + "step": 5260 + }, + { + "epoch": 3.6827393431167015, + "grad_norm": 0.02062870003283024, + "learning_rate": 5.271837875611461e-06, + "loss": 0.0583, + "step": 5270 + }, + { + "epoch": 3.689727463312369, + "grad_norm": 0.005427168216556311, + "learning_rate": 5.243885394828791e-06, + "loss": 0.0355, + "step": 5280 + }, + { + "epoch": 3.6967155835080363, + "grad_norm": 0.14737765491008759, + "learning_rate": 5.215932914046123e-06, + "loss": 0.0011, + "step": 5290 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 0.047865480184555054, + "learning_rate": 5.1879804332634524e-06, + "loss": 0.0003, + "step": 5300 + }, + { + "epoch": 3.710691823899371, + "grad_norm": 0.005643247161060572, + "learning_rate": 5.160027952480783e-06, + "loss": 0.0703, + "step": 5310 + }, + { + "epoch": 3.7176799440950385, + "grad_norm": 0.002993800677359104, + "learning_rate": 5.1320754716981136e-06, + "loss": 0.0003, + "step": 5320 + }, + { + "epoch": 3.724668064290706, + "grad_norm": 0.0012869026977568865, + "learning_rate": 5.104122990915444e-06, + "loss": 0.0002, + "step": 5330 + }, + { + "epoch": 3.731656184486373, + "grad_norm": 0.015380630269646645, + "learning_rate": 5.0761705101327755e-06, + "loss": 0.0116, + "step": 5340 + }, + { + "epoch": 3.7386443046820403, + "grad_norm": 0.012133465148508549, + "learning_rate": 5.048218029350105e-06, + "loss": 0.0437, + "step": 5350 + }, + { + "epoch": 3.7456324248777078, + "grad_norm": 0.004281846806406975, + "learning_rate": 5.020265548567436e-06, + "loss": 0.0008, + "step": 5360 + }, + { + "epoch": 3.752620545073375, + "grad_norm": 7.5166401863098145, + "learning_rate": 4.992313067784766e-06, + "loss": 0.0455, + "step": 5370 + }, + { + "epoch": 3.7596086652690426, + "grad_norm": 0.04558609798550606, + "learning_rate": 4.964360587002097e-06, + "loss": 0.0005, + "step": 5380 + }, + { + "epoch": 3.76659678546471, + "grad_norm": 0.006235541310161352, + "learning_rate": 4.9364081062194275e-06, + "loss": 0.0413, + "step": 5390 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 0.003967160824686289, + "learning_rate": 4.908455625436758e-06, + "loss": 0.0004, + "step": 5400 + }, + { + "epoch": 3.780573025856045, + "grad_norm": 0.006097977515310049, + "learning_rate": 4.880503144654089e-06, + "loss": 0.0004, + "step": 5410 + }, + { + "epoch": 3.7875611460517122, + "grad_norm": 0.0025531253777444363, + "learning_rate": 4.852550663871418e-06, + "loss": 0.071, + "step": 5420 + }, + { + "epoch": 3.7945492662473796, + "grad_norm": 0.058856215327978134, + "learning_rate": 4.82459818308875e-06, + "loss": 0.0088, + "step": 5430 + }, + { + "epoch": 3.801537386443047, + "grad_norm": 0.0011401502415537834, + "learning_rate": 4.79664570230608e-06, + "loss": 0.0007, + "step": 5440 + }, + { + "epoch": 3.808525506638714, + "grad_norm": 0.18255279958248138, + "learning_rate": 4.768693221523411e-06, + "loss": 0.0009, + "step": 5450 + }, + { + "epoch": 3.8155136268343814, + "grad_norm": 0.0024249760899692774, + "learning_rate": 4.7407407407407415e-06, + "loss": 0.0001, + "step": 5460 + }, + { + "epoch": 3.822501747030049, + "grad_norm": 0.002625074004754424, + "learning_rate": 4.712788259958071e-06, + "loss": 0.0173, + "step": 5470 + }, + { + "epoch": 3.8294898672257163, + "grad_norm": 0.00629691407084465, + "learning_rate": 4.684835779175403e-06, + "loss": 0.0003, + "step": 5480 + }, + { + "epoch": 3.8364779874213837, + "grad_norm": 0.007325862068682909, + "learning_rate": 4.656883298392732e-06, + "loss": 0.0002, + "step": 5490 + }, + { + "epoch": 3.843466107617051, + "grad_norm": 1.3780497312545776, + "learning_rate": 4.628930817610064e-06, + "loss": 0.0025, + "step": 5500 + }, + { + "epoch": 3.8504542278127185, + "grad_norm": 0.002903768327087164, + "learning_rate": 4.6009783368273935e-06, + "loss": 0.0613, + "step": 5510 + }, + { + "epoch": 3.8574423480083855, + "grad_norm": 3.889617443084717, + "learning_rate": 4.573025856044724e-06, + "loss": 0.0677, + "step": 5520 + }, + { + "epoch": 3.864430468204053, + "grad_norm": 0.0012939589796587825, + "learning_rate": 4.545073375262055e-06, + "loss": 0.0002, + "step": 5530 + }, + { + "epoch": 3.8714185883997203, + "grad_norm": 0.0032597167883068323, + "learning_rate": 4.517120894479385e-06, + "loss": 0.0685, + "step": 5540 + }, + { + "epoch": 3.8784067085953877, + "grad_norm": 0.004127015825361013, + "learning_rate": 4.489168413696717e-06, + "loss": 0.006, + "step": 5550 + }, + { + "epoch": 3.885394828791055, + "grad_norm": 0.005040404852479696, + "learning_rate": 4.461215932914046e-06, + "loss": 0.0003, + "step": 5560 + }, + { + "epoch": 3.8923829489867225, + "grad_norm": 0.006766372825950384, + "learning_rate": 4.433263452131377e-06, + "loss": 0.0013, + "step": 5570 + }, + { + "epoch": 3.89937106918239, + "grad_norm": 0.014896499924361706, + "learning_rate": 4.4053109713487075e-06, + "loss": 0.1589, + "step": 5580 + }, + { + "epoch": 3.9063591893780574, + "grad_norm": 0.008297051303088665, + "learning_rate": 4.377358490566038e-06, + "loss": 0.0012, + "step": 5590 + }, + { + "epoch": 3.913347309573725, + "grad_norm": 0.015372090972959995, + "learning_rate": 4.349406009783369e-06, + "loss": 0.0002, + "step": 5600 + }, + { + "epoch": 3.920335429769392, + "grad_norm": 0.008451149798929691, + "learning_rate": 4.321453529000699e-06, + "loss": 0.0004, + "step": 5610 + }, + { + "epoch": 3.9273235499650596, + "grad_norm": 0.014892960898578167, + "learning_rate": 4.29350104821803e-06, + "loss": 0.0779, + "step": 5620 + }, + { + "epoch": 3.934311670160727, + "grad_norm": 0.008910837583243847, + "learning_rate": 4.26554856743536e-06, + "loss": 0.0491, + "step": 5630 + }, + { + "epoch": 3.941299790356394, + "grad_norm": 0.004672807175666094, + "learning_rate": 4.237596086652691e-06, + "loss": 0.0003, + "step": 5640 + }, + { + "epoch": 3.9482879105520614, + "grad_norm": 0.002980695106089115, + "learning_rate": 4.209643605870021e-06, + "loss": 0.0008, + "step": 5650 + }, + { + "epoch": 3.955276030747729, + "grad_norm": 0.0031832789536565542, + "learning_rate": 4.181691125087352e-06, + "loss": 0.0776, + "step": 5660 + }, + { + "epoch": 3.9622641509433962, + "grad_norm": 0.00444215489551425, + "learning_rate": 4.1537386443046825e-06, + "loss": 0.0002, + "step": 5670 + }, + { + "epoch": 3.9692522711390636, + "grad_norm": 0.005608571693301201, + "learning_rate": 4.125786163522013e-06, + "loss": 0.031, + "step": 5680 + }, + { + "epoch": 3.976240391334731, + "grad_norm": 0.009694025851786137, + "learning_rate": 4.097833682739344e-06, + "loss": 0.1584, + "step": 5690 + }, + { + "epoch": 3.9832285115303985, + "grad_norm": 26.081716537475586, + "learning_rate": 4.069881201956674e-06, + "loss": 0.0181, + "step": 5700 + }, + { + "epoch": 3.9902166317260654, + "grad_norm": 0.0018772291950881481, + "learning_rate": 4.041928721174004e-06, + "loss": 0.0108, + "step": 5710 + }, + { + "epoch": 3.997204751921733, + "grad_norm": 0.0026039350777864456, + "learning_rate": 4.013976240391335e-06, + "loss": 0.0536, + "step": 5720 + } + ], + "logging_steps": 10, + "max_steps": 7155, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6187647906816000.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}