diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6633499170812603, + "eval_steps": 500, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002763957987838585, + "grad_norm": 15.539673805236816, + "learning_rate": 5.000000000000001e-07, + "loss": 11.227, + "step": 5 + }, + { + "epoch": 0.000552791597567717, + "grad_norm": 16.613014221191406, + "learning_rate": 1.0000000000000002e-06, + "loss": 11.1624, + "step": 10 + }, + { + "epoch": 0.0008291873963515755, + "grad_norm": 16.244783401489258, + "learning_rate": 1.5e-06, + "loss": 11.0487, + "step": 15 + }, + { + "epoch": 0.001105583195135434, + "grad_norm": 12.021380424499512, + "learning_rate": 2.0000000000000003e-06, + "loss": 10.8436, + "step": 20 + }, + { + "epoch": 0.0013819789939192924, + "grad_norm": 11.764349937438965, + "learning_rate": 2.5e-06, + "loss": 10.637, + "step": 25 + }, + { + "epoch": 0.001658374792703151, + "grad_norm": 8.930374145507812, + "learning_rate": 3e-06, + "loss": 10.4451, + "step": 30 + }, + { + "epoch": 0.0019347705914870095, + "grad_norm": 6.179296016693115, + "learning_rate": 3.5000000000000004e-06, + "loss": 10.2835, + "step": 35 + }, + { + "epoch": 0.002211166390270868, + "grad_norm": 5.235034942626953, + "learning_rate": 4.000000000000001e-06, + "loss": 10.1911, + "step": 40 + }, + { + "epoch": 0.0024875621890547263, + "grad_norm": 4.519562244415283, + "learning_rate": 4.5e-06, + "loss": 10.1875, + "step": 45 + }, + { + "epoch": 0.002763957987838585, + "grad_norm": 4.634215354919434, + "learning_rate": 5e-06, + "loss": 9.9655, + "step": 50 + }, + { + "epoch": 0.0030403537866224434, + "grad_norm": 4.817412853240967, + "learning_rate": 5.500000000000001e-06, + "loss": 9.9965, + "step": 55 + }, + { + "epoch": 0.003316749585406302, + "grad_norm": 4.490515232086182, + "learning_rate": 6e-06, + "loss": 9.9253, + "step": 60 + }, + { + "epoch": 0.0035931453841901604, + "grad_norm": 3.931199789047241, + "learning_rate": 6.5000000000000004e-06, + "loss": 9.9455, + "step": 65 + }, + { + "epoch": 0.003869541182974019, + "grad_norm": 4.179060935974121, + "learning_rate": 7.000000000000001e-06, + "loss": 9.7904, + "step": 70 + }, + { + "epoch": 0.0041459369817578775, + "grad_norm": 4.031132698059082, + "learning_rate": 7.5e-06, + "loss": 9.7358, + "step": 75 + }, + { + "epoch": 0.004422332780541736, + "grad_norm": 3.7969465255737305, + "learning_rate": 8.000000000000001e-06, + "loss": 9.7051, + "step": 80 + }, + { + "epoch": 0.0046987285793255945, + "grad_norm": 4.057478904724121, + "learning_rate": 8.500000000000002e-06, + "loss": 9.6854, + "step": 85 + }, + { + "epoch": 0.004975124378109453, + "grad_norm": 4.059418201446533, + "learning_rate": 9e-06, + "loss": 9.6271, + "step": 90 + }, + { + "epoch": 0.005251520176893312, + "grad_norm": 4.01811408996582, + "learning_rate": 9.5e-06, + "loss": 9.614, + "step": 95 + }, + { + "epoch": 0.00552791597567717, + "grad_norm": 4.239936828613281, + "learning_rate": 1e-05, + "loss": 9.423, + "step": 100 + }, + { + "epoch": 0.005804311774461028, + "grad_norm": 3.8491415977478027, + "learning_rate": 1.05e-05, + "loss": 9.5076, + "step": 105 + }, + { + "epoch": 0.006080707573244887, + "grad_norm": 3.6000020503997803, + "learning_rate": 1.1000000000000001e-05, + "loss": 9.5322, + "step": 110 + }, + { + "epoch": 0.006357103372028745, + "grad_norm": 5.558045387268066, + "learning_rate": 1.1500000000000002e-05, + "loss": 9.2759, + "step": 115 + }, + { + "epoch": 0.006633499170812604, + "grad_norm": 4.2432637214660645, + "learning_rate": 1.2e-05, + "loss": 9.3174, + "step": 120 + }, + { + "epoch": 0.006909894969596462, + "grad_norm": 3.7061004638671875, + "learning_rate": 1.25e-05, + "loss": 9.3099, + "step": 125 + }, + { + "epoch": 0.007186290768380321, + "grad_norm": 3.863173007965088, + "learning_rate": 1.3000000000000001e-05, + "loss": 9.0892, + "step": 130 + }, + { + "epoch": 0.007462686567164179, + "grad_norm": 4.161376953125, + "learning_rate": 1.3500000000000001e-05, + "loss": 9.0976, + "step": 135 + }, + { + "epoch": 0.007739082365948038, + "grad_norm": 3.59145188331604, + "learning_rate": 1.4000000000000001e-05, + "loss": 9.2131, + "step": 140 + }, + { + "epoch": 0.008015478164731896, + "grad_norm": 3.4447615146636963, + "learning_rate": 1.45e-05, + "loss": 8.9092, + "step": 145 + }, + { + "epoch": 0.008291873963515755, + "grad_norm": 3.227489471435547, + "learning_rate": 1.5e-05, + "loss": 8.927, + "step": 150 + }, + { + "epoch": 0.008568269762299612, + "grad_norm": 3.0332329273223877, + "learning_rate": 1.55e-05, + "loss": 8.8161, + "step": 155 + }, + { + "epoch": 0.008844665561083471, + "grad_norm": 4.128123760223389, + "learning_rate": 1.6000000000000003e-05, + "loss": 8.8824, + "step": 160 + }, + { + "epoch": 0.00912106135986733, + "grad_norm": 3.798156499862671, + "learning_rate": 1.65e-05, + "loss": 8.7461, + "step": 165 + }, + { + "epoch": 0.009397457158651189, + "grad_norm": 4.008655548095703, + "learning_rate": 1.7000000000000003e-05, + "loss": 8.5756, + "step": 170 + }, + { + "epoch": 0.009673852957435046, + "grad_norm": 3.501603603363037, + "learning_rate": 1.75e-05, + "loss": 8.7104, + "step": 175 + }, + { + "epoch": 0.009950248756218905, + "grad_norm": 3.0541610717773438, + "learning_rate": 1.8e-05, + "loss": 8.6184, + "step": 180 + }, + { + "epoch": 0.010226644555002764, + "grad_norm": 3.2599024772644043, + "learning_rate": 1.85e-05, + "loss": 8.571, + "step": 185 + }, + { + "epoch": 0.010503040353786623, + "grad_norm": 2.8244316577911377, + "learning_rate": 1.9e-05, + "loss": 8.5187, + "step": 190 + }, + { + "epoch": 0.01077943615257048, + "grad_norm": 3.0465221405029297, + "learning_rate": 1.9500000000000003e-05, + "loss": 8.5168, + "step": 195 + }, + { + "epoch": 0.01105583195135434, + "grad_norm": 3.2354133129119873, + "learning_rate": 2e-05, + "loss": 8.2329, + "step": 200 + }, + { + "epoch": 0.011332227750138198, + "grad_norm": 2.9300291538238525, + "learning_rate": 2.05e-05, + "loss": 8.3095, + "step": 205 + }, + { + "epoch": 0.011608623548922056, + "grad_norm": 3.2358179092407227, + "learning_rate": 2.1e-05, + "loss": 8.3495, + "step": 210 + }, + { + "epoch": 0.011885019347705915, + "grad_norm": 3.006814479827881, + "learning_rate": 2.15e-05, + "loss": 8.068, + "step": 215 + }, + { + "epoch": 0.012161415146489774, + "grad_norm": 3.092186689376831, + "learning_rate": 2.2000000000000003e-05, + "loss": 8.0844, + "step": 220 + }, + { + "epoch": 0.012437810945273632, + "grad_norm": 2.7867186069488525, + "learning_rate": 2.25e-05, + "loss": 7.7844, + "step": 225 + }, + { + "epoch": 0.01271420674405749, + "grad_norm": 3.151169538497925, + "learning_rate": 2.3000000000000003e-05, + "loss": 8.0737, + "step": 230 + }, + { + "epoch": 0.012990602542841349, + "grad_norm": 2.7488925457000732, + "learning_rate": 2.35e-05, + "loss": 8.0835, + "step": 235 + }, + { + "epoch": 0.013266998341625208, + "grad_norm": 2.673896312713623, + "learning_rate": 2.4e-05, + "loss": 7.9807, + "step": 240 + }, + { + "epoch": 0.013543394140409067, + "grad_norm": 3.0482277870178223, + "learning_rate": 2.45e-05, + "loss": 7.7995, + "step": 245 + }, + { + "epoch": 0.013819789939192924, + "grad_norm": 2.6868321895599365, + "learning_rate": 2.5e-05, + "loss": 7.9454, + "step": 250 + }, + { + "epoch": 0.014096185737976783, + "grad_norm": 2.6580042839050293, + "learning_rate": 2.5500000000000003e-05, + "loss": 7.7582, + "step": 255 + }, + { + "epoch": 0.014372581536760642, + "grad_norm": 2.2857472896575928, + "learning_rate": 2.6000000000000002e-05, + "loss": 7.784, + "step": 260 + }, + { + "epoch": 0.014648977335544499, + "grad_norm": 2.3364319801330566, + "learning_rate": 2.6500000000000004e-05, + "loss": 7.8675, + "step": 265 + }, + { + "epoch": 0.014925373134328358, + "grad_norm": 2.896152973175049, + "learning_rate": 2.7000000000000002e-05, + "loss": 7.6433, + "step": 270 + }, + { + "epoch": 0.015201768933112217, + "grad_norm": 3.6049489974975586, + "learning_rate": 2.7500000000000004e-05, + "loss": 7.4608, + "step": 275 + }, + { + "epoch": 0.015478164731896076, + "grad_norm": 2.9428746700286865, + "learning_rate": 2.8000000000000003e-05, + "loss": 7.4888, + "step": 280 + }, + { + "epoch": 0.015754560530679935, + "grad_norm": 3.2422735691070557, + "learning_rate": 2.8499999999999998e-05, + "loss": 7.4838, + "step": 285 + }, + { + "epoch": 0.016030956329463792, + "grad_norm": 2.6515588760375977, + "learning_rate": 2.9e-05, + "loss": 7.4845, + "step": 290 + }, + { + "epoch": 0.01630735212824765, + "grad_norm": 2.2527542114257812, + "learning_rate": 2.95e-05, + "loss": 7.679, + "step": 295 + }, + { + "epoch": 0.01658374792703151, + "grad_norm": 2.841431140899658, + "learning_rate": 3e-05, + "loss": 7.5644, + "step": 300 + }, + { + "epoch": 0.016860143725815367, + "grad_norm": 3.071385383605957, + "learning_rate": 3.05e-05, + "loss": 7.3476, + "step": 305 + }, + { + "epoch": 0.017136539524599224, + "grad_norm": 2.392310380935669, + "learning_rate": 3.1e-05, + "loss": 7.4821, + "step": 310 + }, + { + "epoch": 0.017412935323383085, + "grad_norm": 2.3450920581817627, + "learning_rate": 3.15e-05, + "loss": 7.6444, + "step": 315 + }, + { + "epoch": 0.017689331122166942, + "grad_norm": 2.5122437477111816, + "learning_rate": 3.2000000000000005e-05, + "loss": 7.3695, + "step": 320 + }, + { + "epoch": 0.017965726920950803, + "grad_norm": 3.0981411933898926, + "learning_rate": 3.2500000000000004e-05, + "loss": 7.2349, + "step": 325 + }, + { + "epoch": 0.01824212271973466, + "grad_norm": 2.4662463665008545, + "learning_rate": 3.3e-05, + "loss": 7.4194, + "step": 330 + }, + { + "epoch": 0.018518518518518517, + "grad_norm": 2.5492987632751465, + "learning_rate": 3.35e-05, + "loss": 7.2674, + "step": 335 + }, + { + "epoch": 0.018794914317302378, + "grad_norm": 2.4681410789489746, + "learning_rate": 3.4000000000000007e-05, + "loss": 7.1909, + "step": 340 + }, + { + "epoch": 0.019071310116086235, + "grad_norm": 2.5183534622192383, + "learning_rate": 3.45e-05, + "loss": 7.3093, + "step": 345 + }, + { + "epoch": 0.019347705914870093, + "grad_norm": 2.3964405059814453, + "learning_rate": 3.5e-05, + "loss": 7.3742, + "step": 350 + }, + { + "epoch": 0.019624101713653953, + "grad_norm": 2.961442470550537, + "learning_rate": 3.55e-05, + "loss": 7.4874, + "step": 355 + }, + { + "epoch": 0.01990049751243781, + "grad_norm": 2.420823812484741, + "learning_rate": 3.6e-05, + "loss": 7.3551, + "step": 360 + }, + { + "epoch": 0.020176893311221668, + "grad_norm": 2.382369041442871, + "learning_rate": 3.65e-05, + "loss": 7.2977, + "step": 365 + }, + { + "epoch": 0.02045328911000553, + "grad_norm": 2.913240671157837, + "learning_rate": 3.7e-05, + "loss": 7.3576, + "step": 370 + }, + { + "epoch": 0.020729684908789386, + "grad_norm": 2.3969368934631348, + "learning_rate": 3.7500000000000003e-05, + "loss": 7.096, + "step": 375 + }, + { + "epoch": 0.021006080707573246, + "grad_norm": 3.0736069679260254, + "learning_rate": 3.8e-05, + "loss": 7.1848, + "step": 380 + }, + { + "epoch": 0.021282476506357104, + "grad_norm": 2.2005414962768555, + "learning_rate": 3.85e-05, + "loss": 7.082, + "step": 385 + }, + { + "epoch": 0.02155887230514096, + "grad_norm": 3.3333756923675537, + "learning_rate": 3.9000000000000006e-05, + "loss": 7.1535, + "step": 390 + }, + { + "epoch": 0.02183526810392482, + "grad_norm": 3.055220127105713, + "learning_rate": 3.9500000000000005e-05, + "loss": 6.8787, + "step": 395 + }, + { + "epoch": 0.02211166390270868, + "grad_norm": 2.4127659797668457, + "learning_rate": 4e-05, + "loss": 6.9886, + "step": 400 + }, + { + "epoch": 0.022388059701492536, + "grad_norm": 2.8047995567321777, + "learning_rate": 4.05e-05, + "loss": 7.0981, + "step": 405 + }, + { + "epoch": 0.022664455500276397, + "grad_norm": 2.9609084129333496, + "learning_rate": 4.1e-05, + "loss": 6.9062, + "step": 410 + }, + { + "epoch": 0.022940851299060254, + "grad_norm": 3.0030362606048584, + "learning_rate": 4.15e-05, + "loss": 7.1212, + "step": 415 + }, + { + "epoch": 0.02321724709784411, + "grad_norm": 3.864861011505127, + "learning_rate": 4.2e-05, + "loss": 7.2378, + "step": 420 + }, + { + "epoch": 0.023493642896627972, + "grad_norm": 2.880798816680908, + "learning_rate": 4.25e-05, + "loss": 7.073, + "step": 425 + }, + { + "epoch": 0.02377003869541183, + "grad_norm": 2.903695821762085, + "learning_rate": 4.3e-05, + "loss": 7.0507, + "step": 430 + }, + { + "epoch": 0.02404643449419569, + "grad_norm": 3.056842088699341, + "learning_rate": 4.35e-05, + "loss": 6.843, + "step": 435 + }, + { + "epoch": 0.024322830292979547, + "grad_norm": 2.9644432067871094, + "learning_rate": 4.4000000000000006e-05, + "loss": 7.0322, + "step": 440 + }, + { + "epoch": 0.024599226091763404, + "grad_norm": 2.783032178878784, + "learning_rate": 4.4500000000000004e-05, + "loss": 7.0572, + "step": 445 + }, + { + "epoch": 0.024875621890547265, + "grad_norm": 2.3176393508911133, + "learning_rate": 4.5e-05, + "loss": 6.8992, + "step": 450 + }, + { + "epoch": 0.025152017689331122, + "grad_norm": 2.587258815765381, + "learning_rate": 4.55e-05, + "loss": 6.9982, + "step": 455 + }, + { + "epoch": 0.02542841348811498, + "grad_norm": 3.1604034900665283, + "learning_rate": 4.600000000000001e-05, + "loss": 6.8609, + "step": 460 + }, + { + "epoch": 0.02570480928689884, + "grad_norm": 3.7053439617156982, + "learning_rate": 4.6500000000000005e-05, + "loss": 7.4181, + "step": 465 + }, + { + "epoch": 0.025981205085682697, + "grad_norm": 2.896162986755371, + "learning_rate": 4.7e-05, + "loss": 6.7783, + "step": 470 + }, + { + "epoch": 0.026257600884466555, + "grad_norm": 2.4549975395202637, + "learning_rate": 4.75e-05, + "loss": 6.8819, + "step": 475 + }, + { + "epoch": 0.026533996683250415, + "grad_norm": 2.6171348094940186, + "learning_rate": 4.8e-05, + "loss": 7.0355, + "step": 480 + }, + { + "epoch": 0.026810392482034272, + "grad_norm": 2.6561520099639893, + "learning_rate": 4.85e-05, + "loss": 6.8347, + "step": 485 + }, + { + "epoch": 0.027086788280818133, + "grad_norm": 2.9669582843780518, + "learning_rate": 4.9e-05, + "loss": 6.7402, + "step": 490 + }, + { + "epoch": 0.02736318407960199, + "grad_norm": 2.837339162826538, + "learning_rate": 4.9500000000000004e-05, + "loss": 6.9979, + "step": 495 + }, + { + "epoch": 0.027639579878385848, + "grad_norm": 2.7479424476623535, + "learning_rate": 5e-05, + "loss": 6.8928, + "step": 500 + }, + { + "epoch": 0.02791597567716971, + "grad_norm": 3.010474920272827, + "learning_rate": 4.9985787379192726e-05, + "loss": 7.0164, + "step": 505 + }, + { + "epoch": 0.028192371475953566, + "grad_norm": 2.812869071960449, + "learning_rate": 4.997157475838545e-05, + "loss": 6.9878, + "step": 510 + }, + { + "epoch": 0.028468767274737423, + "grad_norm": 2.1367461681365967, + "learning_rate": 4.9957362137578174e-05, + "loss": 7.0706, + "step": 515 + }, + { + "epoch": 0.028745163073521283, + "grad_norm": 2.864809989929199, + "learning_rate": 4.994314951677089e-05, + "loss": 6.9731, + "step": 520 + }, + { + "epoch": 0.02902155887230514, + "grad_norm": 2.8308370113372803, + "learning_rate": 4.9928936895963616e-05, + "loss": 6.8747, + "step": 525 + }, + { + "epoch": 0.029297954671088998, + "grad_norm": 3.112752676010132, + "learning_rate": 4.9914724275156346e-05, + "loss": 6.858, + "step": 530 + }, + { + "epoch": 0.02957435046987286, + "grad_norm": 2.711458683013916, + "learning_rate": 4.9900511654349063e-05, + "loss": 7.0695, + "step": 535 + }, + { + "epoch": 0.029850746268656716, + "grad_norm": 3.2045886516571045, + "learning_rate": 4.988629903354179e-05, + "loss": 7.0511, + "step": 540 + }, + { + "epoch": 0.030127142067440577, + "grad_norm": 2.655226707458496, + "learning_rate": 4.987208641273451e-05, + "loss": 7.1596, + "step": 545 + }, + { + "epoch": 0.030403537866224434, + "grad_norm": 3.0165657997131348, + "learning_rate": 4.9857873791927235e-05, + "loss": 7.2406, + "step": 550 + }, + { + "epoch": 0.03067993366500829, + "grad_norm": 3.2889368534088135, + "learning_rate": 4.984366117111995e-05, + "loss": 7.1024, + "step": 555 + }, + { + "epoch": 0.03095632946379215, + "grad_norm": 3.375981569290161, + "learning_rate": 4.9829448550312677e-05, + "loss": 6.8454, + "step": 560 + }, + { + "epoch": 0.03123272526257601, + "grad_norm": 3.135676383972168, + "learning_rate": 4.981523592950541e-05, + "loss": 7.3552, + "step": 565 + }, + { + "epoch": 0.03150912106135987, + "grad_norm": 3.073650598526001, + "learning_rate": 4.9801023308698125e-05, + "loss": 7.1633, + "step": 570 + }, + { + "epoch": 0.03178551686014373, + "grad_norm": 2.4921205043792725, + "learning_rate": 4.978681068789085e-05, + "loss": 7.0104, + "step": 575 + }, + { + "epoch": 0.032061912658927584, + "grad_norm": 2.602283477783203, + "learning_rate": 4.977259806708357e-05, + "loss": 6.7988, + "step": 580 + }, + { + "epoch": 0.03233830845771144, + "grad_norm": 2.6181817054748535, + "learning_rate": 4.9758385446276296e-05, + "loss": 6.9489, + "step": 585 + }, + { + "epoch": 0.0326147042564953, + "grad_norm": 2.7632694244384766, + "learning_rate": 4.974417282546902e-05, + "loss": 6.8527, + "step": 590 + }, + { + "epoch": 0.03289110005527916, + "grad_norm": 2.530285120010376, + "learning_rate": 4.972996020466174e-05, + "loss": 6.902, + "step": 595 + }, + { + "epoch": 0.03316749585406302, + "grad_norm": 2.339693784713745, + "learning_rate": 4.971574758385447e-05, + "loss": 6.7824, + "step": 600 + }, + { + "epoch": 0.03344389165284688, + "grad_norm": 2.539701223373413, + "learning_rate": 4.9701534963047186e-05, + "loss": 6.7379, + "step": 605 + }, + { + "epoch": 0.033720287451630734, + "grad_norm": 2.586127758026123, + "learning_rate": 4.968732234223991e-05, + "loss": 7.1041, + "step": 610 + }, + { + "epoch": 0.03399668325041459, + "grad_norm": 2.9324231147766113, + "learning_rate": 4.9673109721432634e-05, + "loss": 6.778, + "step": 615 + }, + { + "epoch": 0.03427307904919845, + "grad_norm": 2.5098533630371094, + "learning_rate": 4.965889710062536e-05, + "loss": 6.8565, + "step": 620 + }, + { + "epoch": 0.03454947484798231, + "grad_norm": 3.0486905574798584, + "learning_rate": 4.964468447981808e-05, + "loss": 6.6843, + "step": 625 + }, + { + "epoch": 0.03482587064676617, + "grad_norm": 2.632387161254883, + "learning_rate": 4.96304718590108e-05, + "loss": 6.7764, + "step": 630 + }, + { + "epoch": 0.03510226644555003, + "grad_norm": 2.5123403072357178, + "learning_rate": 4.961625923820353e-05, + "loss": 6.7861, + "step": 635 + }, + { + "epoch": 0.035378662244333885, + "grad_norm": 2.678485631942749, + "learning_rate": 4.9602046617396253e-05, + "loss": 6.7484, + "step": 640 + }, + { + "epoch": 0.03565505804311774, + "grad_norm": 2.7005465030670166, + "learning_rate": 4.958783399658897e-05, + "loss": 6.7331, + "step": 645 + }, + { + "epoch": 0.035931453841901606, + "grad_norm": 2.8947432041168213, + "learning_rate": 4.95736213757817e-05, + "loss": 6.7002, + "step": 650 + }, + { + "epoch": 0.03620784964068546, + "grad_norm": 2.788076162338257, + "learning_rate": 4.955940875497442e-05, + "loss": 6.9371, + "step": 655 + }, + { + "epoch": 0.03648424543946932, + "grad_norm": 2.3308234214782715, + "learning_rate": 4.954519613416714e-05, + "loss": 6.7643, + "step": 660 + }, + { + "epoch": 0.03676064123825318, + "grad_norm": 2.0851082801818848, + "learning_rate": 4.953098351335986e-05, + "loss": 6.5921, + "step": 665 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 2.5905802249908447, + "learning_rate": 4.951677089255259e-05, + "loss": 6.7768, + "step": 670 + }, + { + "epoch": 0.03731343283582089, + "grad_norm": 2.8827717304229736, + "learning_rate": 4.9502558271745315e-05, + "loss": 6.7569, + "step": 675 + }, + { + "epoch": 0.037589828634604756, + "grad_norm": 4.079367160797119, + "learning_rate": 4.948834565093803e-05, + "loss": 6.8087, + "step": 680 + }, + { + "epoch": 0.037866224433388614, + "grad_norm": 2.45166277885437, + "learning_rate": 4.947413303013076e-05, + "loss": 6.7949, + "step": 685 + }, + { + "epoch": 0.03814262023217247, + "grad_norm": 3.1089086532592773, + "learning_rate": 4.945992040932348e-05, + "loss": 6.5695, + "step": 690 + }, + { + "epoch": 0.03841901603095633, + "grad_norm": 2.3953475952148438, + "learning_rate": 4.9445707788516204e-05, + "loss": 6.7555, + "step": 695 + }, + { + "epoch": 0.038695411829740185, + "grad_norm": 3.1515231132507324, + "learning_rate": 4.943149516770893e-05, + "loss": 6.7233, + "step": 700 + }, + { + "epoch": 0.03897180762852405, + "grad_norm": 2.5655906200408936, + "learning_rate": 4.941728254690165e-05, + "loss": 6.8136, + "step": 705 + }, + { + "epoch": 0.03924820342730791, + "grad_norm": 2.360464096069336, + "learning_rate": 4.9403069926094376e-05, + "loss": 6.4072, + "step": 710 + }, + { + "epoch": 0.039524599226091764, + "grad_norm": 2.64111590385437, + "learning_rate": 4.938885730528709e-05, + "loss": 6.9907, + "step": 715 + }, + { + "epoch": 0.03980099502487562, + "grad_norm": 2.419311761856079, + "learning_rate": 4.9374644684479824e-05, + "loss": 6.6928, + "step": 720 + }, + { + "epoch": 0.04007739082365948, + "grad_norm": 3.4732820987701416, + "learning_rate": 4.936043206367255e-05, + "loss": 6.7389, + "step": 725 + }, + { + "epoch": 0.040353786622443336, + "grad_norm": 2.9171254634857178, + "learning_rate": 4.9346219442865265e-05, + "loss": 6.7079, + "step": 730 + }, + { + "epoch": 0.0406301824212272, + "grad_norm": 3.3903751373291016, + "learning_rate": 4.933200682205799e-05, + "loss": 6.4443, + "step": 735 + }, + { + "epoch": 0.04090657822001106, + "grad_norm": 3.01202130317688, + "learning_rate": 4.931779420125071e-05, + "loss": 7.024, + "step": 740 + }, + { + "epoch": 0.041182974018794914, + "grad_norm": 2.3956549167633057, + "learning_rate": 4.930358158044344e-05, + "loss": 6.4869, + "step": 745 + }, + { + "epoch": 0.04145936981757877, + "grad_norm": 2.6889407634735107, + "learning_rate": 4.928936895963616e-05, + "loss": 6.5392, + "step": 750 + }, + { + "epoch": 0.04173576561636263, + "grad_norm": 2.521833896636963, + "learning_rate": 4.9275156338828885e-05, + "loss": 6.5671, + "step": 755 + }, + { + "epoch": 0.04201216141514649, + "grad_norm": 2.741121530532837, + "learning_rate": 4.926094371802161e-05, + "loss": 6.6626, + "step": 760 + }, + { + "epoch": 0.04228855721393035, + "grad_norm": 2.8125641345977783, + "learning_rate": 4.9246731097214326e-05, + "loss": 6.961, + "step": 765 + }, + { + "epoch": 0.04256495301271421, + "grad_norm": 2.6478195190429688, + "learning_rate": 4.923251847640705e-05, + "loss": 6.6519, + "step": 770 + }, + { + "epoch": 0.042841348811498065, + "grad_norm": 2.3772130012512207, + "learning_rate": 4.9218305855599774e-05, + "loss": 6.5116, + "step": 775 + }, + { + "epoch": 0.04311774461028192, + "grad_norm": 2.921616554260254, + "learning_rate": 4.92040932347925e-05, + "loss": 6.7481, + "step": 780 + }, + { + "epoch": 0.04339414040906578, + "grad_norm": 2.6683406829833984, + "learning_rate": 4.918988061398522e-05, + "loss": 6.5871, + "step": 785 + }, + { + "epoch": 0.04367053620784964, + "grad_norm": 2.636958360671997, + "learning_rate": 4.9175667993177946e-05, + "loss": 6.353, + "step": 790 + }, + { + "epoch": 0.0439469320066335, + "grad_norm": 3.1544365882873535, + "learning_rate": 4.916145537237067e-05, + "loss": 6.7493, + "step": 795 + }, + { + "epoch": 0.04422332780541736, + "grad_norm": 3.5060462951660156, + "learning_rate": 4.914724275156339e-05, + "loss": 6.4544, + "step": 800 + }, + { + "epoch": 0.044499723604201215, + "grad_norm": 2.6805920600891113, + "learning_rate": 4.913303013075611e-05, + "loss": 6.736, + "step": 805 + }, + { + "epoch": 0.04477611940298507, + "grad_norm": 2.880324125289917, + "learning_rate": 4.911881750994884e-05, + "loss": 6.8972, + "step": 810 + }, + { + "epoch": 0.045052515201768936, + "grad_norm": 2.333005428314209, + "learning_rate": 4.910460488914156e-05, + "loss": 6.7566, + "step": 815 + }, + { + "epoch": 0.04532891100055279, + "grad_norm": 2.3347644805908203, + "learning_rate": 4.909039226833428e-05, + "loss": 6.4314, + "step": 820 + }, + { + "epoch": 0.04560530679933665, + "grad_norm": 3.022331476211548, + "learning_rate": 4.907617964752701e-05, + "loss": 6.773, + "step": 825 + }, + { + "epoch": 0.04588170259812051, + "grad_norm": 2.466000556945801, + "learning_rate": 4.906196702671973e-05, + "loss": 6.6999, + "step": 830 + }, + { + "epoch": 0.046158098396904365, + "grad_norm": 2.828768730163574, + "learning_rate": 4.9047754405912455e-05, + "loss": 6.92, + "step": 835 + }, + { + "epoch": 0.04643449419568822, + "grad_norm": 2.3771440982818604, + "learning_rate": 4.903354178510517e-05, + "loss": 6.8686, + "step": 840 + }, + { + "epoch": 0.046710889994472087, + "grad_norm": 2.7735869884490967, + "learning_rate": 4.90193291642979e-05, + "loss": 6.3304, + "step": 845 + }, + { + "epoch": 0.046987285793255944, + "grad_norm": 2.560903310775757, + "learning_rate": 4.900511654349062e-05, + "loss": 6.5665, + "step": 850 + }, + { + "epoch": 0.0472636815920398, + "grad_norm": 2.7346339225769043, + "learning_rate": 4.8990903922683344e-05, + "loss": 6.4082, + "step": 855 + }, + { + "epoch": 0.04754007739082366, + "grad_norm": 2.8105099201202393, + "learning_rate": 4.897669130187607e-05, + "loss": 6.55, + "step": 860 + }, + { + "epoch": 0.047816473189607515, + "grad_norm": 3.484123468399048, + "learning_rate": 4.896247868106879e-05, + "loss": 6.313, + "step": 865 + }, + { + "epoch": 0.04809286898839138, + "grad_norm": 2.645644426345825, + "learning_rate": 4.8948266060261516e-05, + "loss": 6.3278, + "step": 870 + }, + { + "epoch": 0.04836926478717524, + "grad_norm": 2.5998237133026123, + "learning_rate": 4.893405343945423e-05, + "loss": 6.6541, + "step": 875 + }, + { + "epoch": 0.048645660585959094, + "grad_norm": 2.4943838119506836, + "learning_rate": 4.8919840818646964e-05, + "loss": 6.6822, + "step": 880 + }, + { + "epoch": 0.04892205638474295, + "grad_norm": 3.156522035598755, + "learning_rate": 4.890562819783968e-05, + "loss": 6.5528, + "step": 885 + }, + { + "epoch": 0.04919845218352681, + "grad_norm": 3.012643337249756, + "learning_rate": 4.8891415577032405e-05, + "loss": 6.4346, + "step": 890 + }, + { + "epoch": 0.049474847982310666, + "grad_norm": 2.3925185203552246, + "learning_rate": 4.887720295622513e-05, + "loss": 6.7773, + "step": 895 + }, + { + "epoch": 0.04975124378109453, + "grad_norm": 2.846290349960327, + "learning_rate": 4.886299033541785e-05, + "loss": 6.3221, + "step": 900 + }, + { + "epoch": 0.05002763957987839, + "grad_norm": 2.715515375137329, + "learning_rate": 4.884877771461058e-05, + "loss": 6.4721, + "step": 905 + }, + { + "epoch": 0.050304035378662244, + "grad_norm": 2.458165407180786, + "learning_rate": 4.8834565093803294e-05, + "loss": 6.7841, + "step": 910 + }, + { + "epoch": 0.0505804311774461, + "grad_norm": 3.0126917362213135, + "learning_rate": 4.8820352472996025e-05, + "loss": 6.5939, + "step": 915 + }, + { + "epoch": 0.05085682697622996, + "grad_norm": 2.5035128593444824, + "learning_rate": 4.880613985218875e-05, + "loss": 6.4, + "step": 920 + }, + { + "epoch": 0.05113322277501382, + "grad_norm": 3.497965097427368, + "learning_rate": 4.8791927231381466e-05, + "loss": 6.7437, + "step": 925 + }, + { + "epoch": 0.05140961857379768, + "grad_norm": 2.88411021232605, + "learning_rate": 4.877771461057419e-05, + "loss": 6.4189, + "step": 930 + }, + { + "epoch": 0.05168601437258154, + "grad_norm": 2.9255616664886475, + "learning_rate": 4.8763501989766914e-05, + "loss": 6.7637, + "step": 935 + }, + { + "epoch": 0.051962410171365395, + "grad_norm": 3.0357139110565186, + "learning_rate": 4.874928936895964e-05, + "loss": 6.4721, + "step": 940 + }, + { + "epoch": 0.05223880597014925, + "grad_norm": 2.340343713760376, + "learning_rate": 4.873507674815236e-05, + "loss": 6.3291, + "step": 945 + }, + { + "epoch": 0.05251520176893311, + "grad_norm": 2.570775270462036, + "learning_rate": 4.8720864127345086e-05, + "loss": 6.2555, + "step": 950 + }, + { + "epoch": 0.05279159756771697, + "grad_norm": 2.6438934803009033, + "learning_rate": 4.870665150653781e-05, + "loss": 6.6452, + "step": 955 + }, + { + "epoch": 0.05306799336650083, + "grad_norm": 3.2586843967437744, + "learning_rate": 4.869243888573053e-05, + "loss": 6.6964, + "step": 960 + }, + { + "epoch": 0.05334438916528469, + "grad_norm": 2.744736433029175, + "learning_rate": 4.867822626492326e-05, + "loss": 6.3793, + "step": 965 + }, + { + "epoch": 0.053620784964068545, + "grad_norm": 2.517343044281006, + "learning_rate": 4.866401364411598e-05, + "loss": 6.6729, + "step": 970 + }, + { + "epoch": 0.0538971807628524, + "grad_norm": 2.725949287414551, + "learning_rate": 4.86498010233087e-05, + "loss": 6.341, + "step": 975 + }, + { + "epoch": 0.054173576561636266, + "grad_norm": 2.8635339736938477, + "learning_rate": 4.863558840250142e-05, + "loss": 6.4309, + "step": 980 + }, + { + "epoch": 0.054449972360420124, + "grad_norm": 2.70915150642395, + "learning_rate": 4.862137578169415e-05, + "loss": 6.3596, + "step": 985 + }, + { + "epoch": 0.05472636815920398, + "grad_norm": 3.162316083908081, + "learning_rate": 4.860716316088687e-05, + "loss": 6.6671, + "step": 990 + }, + { + "epoch": 0.05500276395798784, + "grad_norm": 2.084820508956909, + "learning_rate": 4.859295054007959e-05, + "loss": 6.5709, + "step": 995 + }, + { + "epoch": 0.055279159756771695, + "grad_norm": 2.8458499908447266, + "learning_rate": 4.857873791927232e-05, + "loss": 6.4753, + "step": 1000 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 3.0729644298553467, + "learning_rate": 4.856452529846504e-05, + "loss": 6.5452, + "step": 1005 + }, + { + "epoch": 0.05583195135433942, + "grad_norm": 2.171372890472412, + "learning_rate": 4.855031267765776e-05, + "loss": 6.2128, + "step": 1010 + }, + { + "epoch": 0.056108347153123274, + "grad_norm": 2.624799966812134, + "learning_rate": 4.8536100056850484e-05, + "loss": 6.4788, + "step": 1015 + }, + { + "epoch": 0.05638474295190713, + "grad_norm": 2.8430840969085693, + "learning_rate": 4.852188743604321e-05, + "loss": 6.3949, + "step": 1020 + }, + { + "epoch": 0.05666113875069099, + "grad_norm": 3.119079351425171, + "learning_rate": 4.850767481523593e-05, + "loss": 6.6569, + "step": 1025 + }, + { + "epoch": 0.056937534549474846, + "grad_norm": 2.7463550567626953, + "learning_rate": 4.8493462194428656e-05, + "loss": 6.5224, + "step": 1030 + }, + { + "epoch": 0.05721393034825871, + "grad_norm": 2.7679648399353027, + "learning_rate": 4.847924957362138e-05, + "loss": 6.5559, + "step": 1035 + }, + { + "epoch": 0.05749032614704257, + "grad_norm": 3.098196268081665, + "learning_rate": 4.8465036952814104e-05, + "loss": 6.4804, + "step": 1040 + }, + { + "epoch": 0.057766721945826424, + "grad_norm": 2.649479866027832, + "learning_rate": 4.845082433200682e-05, + "loss": 6.553, + "step": 1045 + }, + { + "epoch": 0.05804311774461028, + "grad_norm": 3.0194578170776367, + "learning_rate": 4.8436611711199545e-05, + "loss": 6.2782, + "step": 1050 + }, + { + "epoch": 0.05831951354339414, + "grad_norm": 2.331939220428467, + "learning_rate": 4.8422399090392276e-05, + "loss": 6.4653, + "step": 1055 + }, + { + "epoch": 0.058595909342177996, + "grad_norm": 2.84271502494812, + "learning_rate": 4.840818646958499e-05, + "loss": 6.5789, + "step": 1060 + }, + { + "epoch": 0.05887230514096186, + "grad_norm": 2.8978166580200195, + "learning_rate": 4.839397384877772e-05, + "loss": 6.495, + "step": 1065 + }, + { + "epoch": 0.05914870093974572, + "grad_norm": 2.4874818325042725, + "learning_rate": 4.837976122797044e-05, + "loss": 6.5368, + "step": 1070 + }, + { + "epoch": 0.059425096738529574, + "grad_norm": 3.212247848510742, + "learning_rate": 4.8365548607163165e-05, + "loss": 6.3398, + "step": 1075 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 2.3944737911224365, + "learning_rate": 4.835133598635589e-05, + "loss": 6.6459, + "step": 1080 + }, + { + "epoch": 0.05997788833609729, + "grad_norm": 3.152069091796875, + "learning_rate": 4.8337123365548606e-05, + "loss": 6.2656, + "step": 1085 + }, + { + "epoch": 0.06025428413488115, + "grad_norm": 3.0014097690582275, + "learning_rate": 4.832291074474134e-05, + "loss": 6.5379, + "step": 1090 + }, + { + "epoch": 0.06053067993366501, + "grad_norm": 2.4241628646850586, + "learning_rate": 4.8308698123934054e-05, + "loss": 6.518, + "step": 1095 + }, + { + "epoch": 0.06080707573244887, + "grad_norm": 2.194061040878296, + "learning_rate": 4.829448550312678e-05, + "loss": 6.2291, + "step": 1100 + }, + { + "epoch": 0.061083471531232725, + "grad_norm": 2.6672093868255615, + "learning_rate": 4.82802728823195e-05, + "loss": 6.2079, + "step": 1105 + }, + { + "epoch": 0.06135986733001658, + "grad_norm": 2.9733340740203857, + "learning_rate": 4.8266060261512226e-05, + "loss": 6.7986, + "step": 1110 + }, + { + "epoch": 0.06163626312880044, + "grad_norm": 2.6432945728302, + "learning_rate": 4.825184764070495e-05, + "loss": 6.3901, + "step": 1115 + }, + { + "epoch": 0.0619126589275843, + "grad_norm": 2.304767370223999, + "learning_rate": 4.823763501989767e-05, + "loss": 6.3117, + "step": 1120 + }, + { + "epoch": 0.06218905472636816, + "grad_norm": 2.7202680110931396, + "learning_rate": 4.82234223990904e-05, + "loss": 6.5007, + "step": 1125 + }, + { + "epoch": 0.06246545052515202, + "grad_norm": 2.201432943344116, + "learning_rate": 4.8209209778283115e-05, + "loss": 6.3724, + "step": 1130 + }, + { + "epoch": 0.06274184632393588, + "grad_norm": 2.9685938358306885, + "learning_rate": 4.819499715747584e-05, + "loss": 6.4615, + "step": 1135 + }, + { + "epoch": 0.06301824212271974, + "grad_norm": 2.2716915607452393, + "learning_rate": 4.818078453666856e-05, + "loss": 6.4481, + "step": 1140 + }, + { + "epoch": 0.0632946379215036, + "grad_norm": 2.6046628952026367, + "learning_rate": 4.816657191586129e-05, + "loss": 6.4018, + "step": 1145 + }, + { + "epoch": 0.06357103372028745, + "grad_norm": 3.6056673526763916, + "learning_rate": 4.815235929505401e-05, + "loss": 6.4416, + "step": 1150 + }, + { + "epoch": 0.06384742951907131, + "grad_norm": 3.628232002258301, + "learning_rate": 4.813814667424673e-05, + "loss": 6.5809, + "step": 1155 + }, + { + "epoch": 0.06412382531785517, + "grad_norm": 2.789034128189087, + "learning_rate": 4.812393405343946e-05, + "loss": 6.3966, + "step": 1160 + }, + { + "epoch": 0.06440022111663903, + "grad_norm": 2.2794246673583984, + "learning_rate": 4.810972143263218e-05, + "loss": 6.3319, + "step": 1165 + }, + { + "epoch": 0.06467661691542288, + "grad_norm": 2.4310152530670166, + "learning_rate": 4.80955088118249e-05, + "loss": 6.5324, + "step": 1170 + }, + { + "epoch": 0.06495301271420674, + "grad_norm": 2.6283979415893555, + "learning_rate": 4.8081296191017624e-05, + "loss": 6.5129, + "step": 1175 + }, + { + "epoch": 0.0652294085129906, + "grad_norm": 2.8399784564971924, + "learning_rate": 4.806708357021035e-05, + "loss": 6.5845, + "step": 1180 + }, + { + "epoch": 0.06550580431177445, + "grad_norm": 2.680716037750244, + "learning_rate": 4.805287094940307e-05, + "loss": 6.6554, + "step": 1185 + }, + { + "epoch": 0.06578220011055833, + "grad_norm": 3.5662529468536377, + "learning_rate": 4.8038658328595796e-05, + "loss": 6.3653, + "step": 1190 + }, + { + "epoch": 0.06605859590934218, + "grad_norm": 2.429107666015625, + "learning_rate": 4.802444570778852e-05, + "loss": 6.2625, + "step": 1195 + }, + { + "epoch": 0.06633499170812604, + "grad_norm": 2.9469411373138428, + "learning_rate": 4.8010233086981244e-05, + "loss": 6.3618, + "step": 1200 + }, + { + "epoch": 0.0666113875069099, + "grad_norm": 3.321293592453003, + "learning_rate": 4.799602046617396e-05, + "loss": 6.262, + "step": 1205 + }, + { + "epoch": 0.06688778330569375, + "grad_norm": 2.542504072189331, + "learning_rate": 4.7981807845366685e-05, + "loss": 6.2755, + "step": 1210 + }, + { + "epoch": 0.06716417910447761, + "grad_norm": 2.622391700744629, + "learning_rate": 4.796759522455941e-05, + "loss": 6.2732, + "step": 1215 + }, + { + "epoch": 0.06744057490326147, + "grad_norm": 2.2811062335968018, + "learning_rate": 4.7953382603752133e-05, + "loss": 6.4282, + "step": 1220 + }, + { + "epoch": 0.06771697070204533, + "grad_norm": 2.973114013671875, + "learning_rate": 4.793916998294486e-05, + "loss": 6.2921, + "step": 1225 + }, + { + "epoch": 0.06799336650082918, + "grad_norm": 2.5322935581207275, + "learning_rate": 4.792495736213758e-05, + "loss": 6.2762, + "step": 1230 + }, + { + "epoch": 0.06826976229961304, + "grad_norm": 3.3714489936828613, + "learning_rate": 4.7910744741330305e-05, + "loss": 6.4914, + "step": 1235 + }, + { + "epoch": 0.0685461580983969, + "grad_norm": 2.4342074394226074, + "learning_rate": 4.789653212052302e-05, + "loss": 6.1297, + "step": 1240 + }, + { + "epoch": 0.06882255389718077, + "grad_norm": 3.002720594406128, + "learning_rate": 4.7882319499715747e-05, + "loss": 6.4413, + "step": 1245 + }, + { + "epoch": 0.06909894969596463, + "grad_norm": 2.581346273422241, + "learning_rate": 4.786810687890848e-05, + "loss": 6.5842, + "step": 1250 + }, + { + "epoch": 0.06937534549474848, + "grad_norm": 2.6828501224517822, + "learning_rate": 4.7853894258101195e-05, + "loss": 6.3015, + "step": 1255 + }, + { + "epoch": 0.06965174129353234, + "grad_norm": 3.318371295928955, + "learning_rate": 4.783968163729392e-05, + "loss": 6.561, + "step": 1260 + }, + { + "epoch": 0.0699281370923162, + "grad_norm": 3.4630489349365234, + "learning_rate": 4.782546901648664e-05, + "loss": 6.7319, + "step": 1265 + }, + { + "epoch": 0.07020453289110005, + "grad_norm": 3.342996597290039, + "learning_rate": 4.7811256395679366e-05, + "loss": 6.2168, + "step": 1270 + }, + { + "epoch": 0.07048092868988391, + "grad_norm": 2.5235445499420166, + "learning_rate": 4.779704377487209e-05, + "loss": 6.5335, + "step": 1275 + }, + { + "epoch": 0.07075732448866777, + "grad_norm": 3.2131083011627197, + "learning_rate": 4.7782831154064814e-05, + "loss": 6.4834, + "step": 1280 + }, + { + "epoch": 0.07103372028745163, + "grad_norm": 2.924309015274048, + "learning_rate": 4.776861853325754e-05, + "loss": 6.3069, + "step": 1285 + }, + { + "epoch": 0.07131011608623548, + "grad_norm": 2.966257333755493, + "learning_rate": 4.7754405912450256e-05, + "loss": 6.1896, + "step": 1290 + }, + { + "epoch": 0.07158651188501934, + "grad_norm": 2.937412977218628, + "learning_rate": 4.774019329164298e-05, + "loss": 6.4074, + "step": 1295 + }, + { + "epoch": 0.07186290768380321, + "grad_norm": 2.4649055004119873, + "learning_rate": 4.772598067083571e-05, + "loss": 6.4064, + "step": 1300 + }, + { + "epoch": 0.07213930348258707, + "grad_norm": 2.4708802700042725, + "learning_rate": 4.771176805002843e-05, + "loss": 6.2426, + "step": 1305 + }, + { + "epoch": 0.07241569928137093, + "grad_norm": 2.374051094055176, + "learning_rate": 4.769755542922115e-05, + "loss": 6.1917, + "step": 1310 + }, + { + "epoch": 0.07269209508015478, + "grad_norm": 5.211256504058838, + "learning_rate": 4.7683342808413875e-05, + "loss": 6.0755, + "step": 1315 + }, + { + "epoch": 0.07296849087893864, + "grad_norm": 2.7231972217559814, + "learning_rate": 4.76691301876066e-05, + "loss": 6.2229, + "step": 1320 + }, + { + "epoch": 0.0732448866777225, + "grad_norm": 2.7478833198547363, + "learning_rate": 4.765491756679932e-05, + "loss": 6.2957, + "step": 1325 + }, + { + "epoch": 0.07352128247650636, + "grad_norm": 2.573377847671509, + "learning_rate": 4.764070494599204e-05, + "loss": 6.1357, + "step": 1330 + }, + { + "epoch": 0.07379767827529021, + "grad_norm": 2.7243764400482178, + "learning_rate": 4.762649232518477e-05, + "loss": 6.4513, + "step": 1335 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 2.9036333560943604, + "learning_rate": 4.761227970437749e-05, + "loss": 6.296, + "step": 1340 + }, + { + "epoch": 0.07435046987285793, + "grad_norm": 3.135991096496582, + "learning_rate": 4.759806708357021e-05, + "loss": 6.4276, + "step": 1345 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 2.428114175796509, + "learning_rate": 4.7583854462762937e-05, + "loss": 6.2983, + "step": 1350 + }, + { + "epoch": 0.07490326147042566, + "grad_norm": 2.220736026763916, + "learning_rate": 4.756964184195566e-05, + "loss": 6.2962, + "step": 1355 + }, + { + "epoch": 0.07517965726920951, + "grad_norm": 2.5993216037750244, + "learning_rate": 4.7555429221148385e-05, + "loss": 6.3815, + "step": 1360 + }, + { + "epoch": 0.07545605306799337, + "grad_norm": 2.7360010147094727, + "learning_rate": 4.75412166003411e-05, + "loss": 6.3496, + "step": 1365 + }, + { + "epoch": 0.07573244886677723, + "grad_norm": 2.4501261711120605, + "learning_rate": 4.752700397953383e-05, + "loss": 6.3474, + "step": 1370 + }, + { + "epoch": 0.07600884466556108, + "grad_norm": 2.4407808780670166, + "learning_rate": 4.751279135872655e-05, + "loss": 6.079, + "step": 1375 + }, + { + "epoch": 0.07628524046434494, + "grad_norm": 2.870891571044922, + "learning_rate": 4.7498578737919274e-05, + "loss": 6.009, + "step": 1380 + }, + { + "epoch": 0.0765616362631288, + "grad_norm": 2.539259910583496, + "learning_rate": 4.7484366117112e-05, + "loss": 6.4406, + "step": 1385 + }, + { + "epoch": 0.07683803206191266, + "grad_norm": 3.2458794116973877, + "learning_rate": 4.747015349630472e-05, + "loss": 6.1957, + "step": 1390 + }, + { + "epoch": 0.07711442786069651, + "grad_norm": 2.6678242683410645, + "learning_rate": 4.7455940875497446e-05, + "loss": 6.3314, + "step": 1395 + }, + { + "epoch": 0.07739082365948037, + "grad_norm": 2.6106460094451904, + "learning_rate": 4.744172825469016e-05, + "loss": 6.1362, + "step": 1400 + }, + { + "epoch": 0.07766721945826423, + "grad_norm": 2.8481876850128174, + "learning_rate": 4.7427515633882894e-05, + "loss": 6.3231, + "step": 1405 + }, + { + "epoch": 0.0779436152570481, + "grad_norm": 2.367493152618408, + "learning_rate": 4.741330301307562e-05, + "loss": 6.0713, + "step": 1410 + }, + { + "epoch": 0.07822001105583196, + "grad_norm": 3.546706199645996, + "learning_rate": 4.7399090392268335e-05, + "loss": 6.1802, + "step": 1415 + }, + { + "epoch": 0.07849640685461581, + "grad_norm": 2.659611701965332, + "learning_rate": 4.738487777146106e-05, + "loss": 6.2043, + "step": 1420 + }, + { + "epoch": 0.07877280265339967, + "grad_norm": 2.9869518280029297, + "learning_rate": 4.737066515065378e-05, + "loss": 6.354, + "step": 1425 + }, + { + "epoch": 0.07904919845218353, + "grad_norm": 2.5255351066589355, + "learning_rate": 4.735645252984651e-05, + "loss": 6.0689, + "step": 1430 + }, + { + "epoch": 0.07932559425096739, + "grad_norm": 3.4141147136688232, + "learning_rate": 4.7342239909039224e-05, + "loss": 6.2125, + "step": 1435 + }, + { + "epoch": 0.07960199004975124, + "grad_norm": 2.526256799697876, + "learning_rate": 4.7328027288231955e-05, + "loss": 6.055, + "step": 1440 + }, + { + "epoch": 0.0798783858485351, + "grad_norm": 2.8614776134490967, + "learning_rate": 4.731381466742468e-05, + "loss": 6.3068, + "step": 1445 + }, + { + "epoch": 0.08015478164731896, + "grad_norm": 2.7898523807525635, + "learning_rate": 4.7299602046617396e-05, + "loss": 6.2226, + "step": 1450 + }, + { + "epoch": 0.08043117744610281, + "grad_norm": 2.2415056228637695, + "learning_rate": 4.728538942581012e-05, + "loss": 6.1964, + "step": 1455 + }, + { + "epoch": 0.08070757324488667, + "grad_norm": 3.079174280166626, + "learning_rate": 4.7271176805002844e-05, + "loss": 6.2307, + "step": 1460 + }, + { + "epoch": 0.08098396904367054, + "grad_norm": 2.619187355041504, + "learning_rate": 4.725696418419557e-05, + "loss": 6.1481, + "step": 1465 + }, + { + "epoch": 0.0812603648424544, + "grad_norm": 3.067775249481201, + "learning_rate": 4.724275156338829e-05, + "loss": 6.3701, + "step": 1470 + }, + { + "epoch": 0.08153676064123826, + "grad_norm": 2.4046452045440674, + "learning_rate": 4.7228538942581016e-05, + "loss": 6.1519, + "step": 1475 + }, + { + "epoch": 0.08181315644002211, + "grad_norm": 2.766226291656494, + "learning_rate": 4.721432632177374e-05, + "loss": 6.1802, + "step": 1480 + }, + { + "epoch": 0.08208955223880597, + "grad_norm": 2.1929092407226562, + "learning_rate": 4.720011370096646e-05, + "loss": 6.4416, + "step": 1485 + }, + { + "epoch": 0.08236594803758983, + "grad_norm": 2.9942455291748047, + "learning_rate": 4.718590108015918e-05, + "loss": 6.1566, + "step": 1490 + }, + { + "epoch": 0.08264234383637369, + "grad_norm": 3.3760600090026855, + "learning_rate": 4.717168845935191e-05, + "loss": 6.2593, + "step": 1495 + }, + { + "epoch": 0.08291873963515754, + "grad_norm": 3.459946632385254, + "learning_rate": 4.715747583854463e-05, + "loss": 6.0409, + "step": 1500 + }, + { + "epoch": 0.0831951354339414, + "grad_norm": 2.9691433906555176, + "learning_rate": 4.714326321773735e-05, + "loss": 6.2912, + "step": 1505 + }, + { + "epoch": 0.08347153123272526, + "grad_norm": 2.3945460319519043, + "learning_rate": 4.712905059693008e-05, + "loss": 6.3243, + "step": 1510 + }, + { + "epoch": 0.08374792703150911, + "grad_norm": 3.2403388023376465, + "learning_rate": 4.71148379761228e-05, + "loss": 5.8076, + "step": 1515 + }, + { + "epoch": 0.08402432283029299, + "grad_norm": 2.4649736881256104, + "learning_rate": 4.710062535531552e-05, + "loss": 6.1595, + "step": 1520 + }, + { + "epoch": 0.08430071862907684, + "grad_norm": 2.9593818187713623, + "learning_rate": 4.708641273450824e-05, + "loss": 6.1091, + "step": 1525 + }, + { + "epoch": 0.0845771144278607, + "grad_norm": 2.794445753097534, + "learning_rate": 4.707220011370097e-05, + "loss": 6.1391, + "step": 1530 + }, + { + "epoch": 0.08485351022664456, + "grad_norm": 2.7484805583953857, + "learning_rate": 4.705798749289369e-05, + "loss": 6.0529, + "step": 1535 + }, + { + "epoch": 0.08512990602542841, + "grad_norm": 2.891587972640991, + "learning_rate": 4.7043774872086414e-05, + "loss": 6.1436, + "step": 1540 + }, + { + "epoch": 0.08540630182421227, + "grad_norm": 3.721700429916382, + "learning_rate": 4.702956225127914e-05, + "loss": 6.4103, + "step": 1545 + }, + { + "epoch": 0.08568269762299613, + "grad_norm": 2.4380359649658203, + "learning_rate": 4.701534963047186e-05, + "loss": 6.2798, + "step": 1550 + }, + { + "epoch": 0.08595909342177999, + "grad_norm": 3.160372257232666, + "learning_rate": 4.7001137009664586e-05, + "loss": 6.4028, + "step": 1555 + }, + { + "epoch": 0.08623548922056384, + "grad_norm": 2.3256590366363525, + "learning_rate": 4.69869243888573e-05, + "loss": 6.2657, + "step": 1560 + }, + { + "epoch": 0.0865118850193477, + "grad_norm": 2.191479444503784, + "learning_rate": 4.6972711768050034e-05, + "loss": 6.1302, + "step": 1565 + }, + { + "epoch": 0.08678828081813156, + "grad_norm": 2.7161953449249268, + "learning_rate": 4.695849914724275e-05, + "loss": 6.2056, + "step": 1570 + }, + { + "epoch": 0.08706467661691543, + "grad_norm": 2.7655863761901855, + "learning_rate": 4.6944286526435475e-05, + "loss": 6.1198, + "step": 1575 + }, + { + "epoch": 0.08734107241569929, + "grad_norm": 2.158665180206299, + "learning_rate": 4.6930073905628206e-05, + "loss": 6.2007, + "step": 1580 + }, + { + "epoch": 0.08761746821448314, + "grad_norm": 2.5755743980407715, + "learning_rate": 4.691586128482092e-05, + "loss": 6.2911, + "step": 1585 + }, + { + "epoch": 0.087893864013267, + "grad_norm": 2.5432612895965576, + "learning_rate": 4.690164866401365e-05, + "loss": 6.1612, + "step": 1590 + }, + { + "epoch": 0.08817025981205086, + "grad_norm": 2.5009267330169678, + "learning_rate": 4.688743604320637e-05, + "loss": 6.3085, + "step": 1595 + }, + { + "epoch": 0.08844665561083472, + "grad_norm": 2.381110429763794, + "learning_rate": 4.6873223422399095e-05, + "loss": 6.211, + "step": 1600 + }, + { + "epoch": 0.08872305140961857, + "grad_norm": 2.5844264030456543, + "learning_rate": 4.685901080159182e-05, + "loss": 6.0844, + "step": 1605 + }, + { + "epoch": 0.08899944720840243, + "grad_norm": 2.737116575241089, + "learning_rate": 4.6844798180784536e-05, + "loss": 6.2711, + "step": 1610 + }, + { + "epoch": 0.08927584300718629, + "grad_norm": 2.8457653522491455, + "learning_rate": 4.683058555997727e-05, + "loss": 6.3523, + "step": 1615 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 3.002805471420288, + "learning_rate": 4.6816372939169984e-05, + "loss": 6.3546, + "step": 1620 + }, + { + "epoch": 0.089828634604754, + "grad_norm": 3.3119866847991943, + "learning_rate": 4.680216031836271e-05, + "loss": 6.3509, + "step": 1625 + }, + { + "epoch": 0.09010503040353787, + "grad_norm": 2.4615001678466797, + "learning_rate": 4.678794769755543e-05, + "loss": 6.1708, + "step": 1630 + }, + { + "epoch": 0.09038142620232173, + "grad_norm": 2.660804271697998, + "learning_rate": 4.6773735076748156e-05, + "loss": 6.1863, + "step": 1635 + }, + { + "epoch": 0.09065782200110559, + "grad_norm": 2.9395289421081543, + "learning_rate": 4.675952245594088e-05, + "loss": 6.5699, + "step": 1640 + }, + { + "epoch": 0.09093421779988944, + "grad_norm": 3.1004257202148438, + "learning_rate": 4.67453098351336e-05, + "loss": 6.1951, + "step": 1645 + }, + { + "epoch": 0.0912106135986733, + "grad_norm": 3.4309775829315186, + "learning_rate": 4.673109721432633e-05, + "loss": 5.9744, + "step": 1650 + }, + { + "epoch": 0.09148700939745716, + "grad_norm": 2.9011757373809814, + "learning_rate": 4.6716884593519045e-05, + "loss": 6.1815, + "step": 1655 + }, + { + "epoch": 0.09176340519624102, + "grad_norm": 3.3524973392486572, + "learning_rate": 4.670267197271177e-05, + "loss": 6.1202, + "step": 1660 + }, + { + "epoch": 0.09203980099502487, + "grad_norm": 2.6807689666748047, + "learning_rate": 4.668845935190449e-05, + "loss": 5.9419, + "step": 1665 + }, + { + "epoch": 0.09231619679380873, + "grad_norm": 2.4009783267974854, + "learning_rate": 4.667424673109722e-05, + "loss": 6.2655, + "step": 1670 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 3.517772674560547, + "learning_rate": 4.666003411028994e-05, + "loss": 6.1189, + "step": 1675 + }, + { + "epoch": 0.09286898839137644, + "grad_norm": 2.677375555038452, + "learning_rate": 4.664582148948266e-05, + "loss": 6.2443, + "step": 1680 + }, + { + "epoch": 0.09314538419016032, + "grad_norm": 3.299330949783325, + "learning_rate": 4.663160886867539e-05, + "loss": 5.8239, + "step": 1685 + }, + { + "epoch": 0.09342177998894417, + "grad_norm": 2.366581916809082, + "learning_rate": 4.661739624786811e-05, + "loss": 6.244, + "step": 1690 + }, + { + "epoch": 0.09369817578772803, + "grad_norm": 2.2254860401153564, + "learning_rate": 4.660318362706083e-05, + "loss": 6.2237, + "step": 1695 + }, + { + "epoch": 0.09397457158651189, + "grad_norm": 2.3326756954193115, + "learning_rate": 4.6588971006253554e-05, + "loss": 6.008, + "step": 1700 + }, + { + "epoch": 0.09425096738529574, + "grad_norm": 2.39668345451355, + "learning_rate": 4.657475838544628e-05, + "loss": 6.4698, + "step": 1705 + }, + { + "epoch": 0.0945273631840796, + "grad_norm": 2.79451584815979, + "learning_rate": 4.6560545764639e-05, + "loss": 6.0543, + "step": 1710 + }, + { + "epoch": 0.09480375898286346, + "grad_norm": 3.0504002571105957, + "learning_rate": 4.6546333143831726e-05, + "loss": 5.9177, + "step": 1715 + }, + { + "epoch": 0.09508015478164732, + "grad_norm": 2.6137356758117676, + "learning_rate": 4.653212052302445e-05, + "loss": 6.0108, + "step": 1720 + }, + { + "epoch": 0.09535655058043117, + "grad_norm": 2.320517063140869, + "learning_rate": 4.6517907902217174e-05, + "loss": 5.9592, + "step": 1725 + }, + { + "epoch": 0.09563294637921503, + "grad_norm": 2.4836413860321045, + "learning_rate": 4.650369528140989e-05, + "loss": 6.1646, + "step": 1730 + }, + { + "epoch": 0.09590934217799889, + "grad_norm": 2.3726565837860107, + "learning_rate": 4.6489482660602615e-05, + "loss": 6.0548, + "step": 1735 + }, + { + "epoch": 0.09618573797678276, + "grad_norm": 3.755805492401123, + "learning_rate": 4.647527003979534e-05, + "loss": 6.1781, + "step": 1740 + }, + { + "epoch": 0.09646213377556662, + "grad_norm": 2.5394093990325928, + "learning_rate": 4.646105741898806e-05, + "loss": 5.9294, + "step": 1745 + }, + { + "epoch": 0.09673852957435047, + "grad_norm": 3.1130166053771973, + "learning_rate": 4.644684479818079e-05, + "loss": 6.0655, + "step": 1750 + }, + { + "epoch": 0.09701492537313433, + "grad_norm": 3.218590497970581, + "learning_rate": 4.643263217737351e-05, + "loss": 6.2316, + "step": 1755 + }, + { + "epoch": 0.09729132117191819, + "grad_norm": 3.100757598876953, + "learning_rate": 4.6418419556566235e-05, + "loss": 6.105, + "step": 1760 + }, + { + "epoch": 0.09756771697070205, + "grad_norm": 2.9586379528045654, + "learning_rate": 4.640420693575895e-05, + "loss": 6.3479, + "step": 1765 + }, + { + "epoch": 0.0978441127694859, + "grad_norm": 2.569976806640625, + "learning_rate": 4.6389994314951676e-05, + "loss": 6.2001, + "step": 1770 + }, + { + "epoch": 0.09812050856826976, + "grad_norm": 2.5895392894744873, + "learning_rate": 4.637578169414441e-05, + "loss": 6.2953, + "step": 1775 + }, + { + "epoch": 0.09839690436705362, + "grad_norm": 3.0334362983703613, + "learning_rate": 4.6361569073337124e-05, + "loss": 6.1325, + "step": 1780 + }, + { + "epoch": 0.09867330016583747, + "grad_norm": 3.2410378456115723, + "learning_rate": 4.634735645252985e-05, + "loss": 6.1757, + "step": 1785 + }, + { + "epoch": 0.09894969596462133, + "grad_norm": 2.2880196571350098, + "learning_rate": 4.633314383172257e-05, + "loss": 6.1355, + "step": 1790 + }, + { + "epoch": 0.0992260917634052, + "grad_norm": 3.8285810947418213, + "learning_rate": 4.6318931210915296e-05, + "loss": 6.078, + "step": 1795 + }, + { + "epoch": 0.09950248756218906, + "grad_norm": 2.5488243103027344, + "learning_rate": 4.630471859010802e-05, + "loss": 5.956, + "step": 1800 + }, + { + "epoch": 0.09977888336097292, + "grad_norm": 2.4518303871154785, + "learning_rate": 4.629050596930074e-05, + "loss": 6.0628, + "step": 1805 + }, + { + "epoch": 0.10005527915975677, + "grad_norm": 3.043971538543701, + "learning_rate": 4.627629334849347e-05, + "loss": 6.2971, + "step": 1810 + }, + { + "epoch": 0.10033167495854063, + "grad_norm": 3.071781873703003, + "learning_rate": 4.6262080727686185e-05, + "loss": 6.139, + "step": 1815 + }, + { + "epoch": 0.10060807075732449, + "grad_norm": 3.2873592376708984, + "learning_rate": 4.624786810687891e-05, + "loss": 6.1132, + "step": 1820 + }, + { + "epoch": 0.10088446655610835, + "grad_norm": 3.439819574356079, + "learning_rate": 4.623365548607163e-05, + "loss": 6.1686, + "step": 1825 + }, + { + "epoch": 0.1011608623548922, + "grad_norm": 3.1802821159362793, + "learning_rate": 4.621944286526436e-05, + "loss": 5.9691, + "step": 1830 + }, + { + "epoch": 0.10143725815367606, + "grad_norm": 2.665397882461548, + "learning_rate": 4.620523024445708e-05, + "loss": 6.2404, + "step": 1835 + }, + { + "epoch": 0.10171365395245992, + "grad_norm": 2.8647611141204834, + "learning_rate": 4.61910176236498e-05, + "loss": 6.1963, + "step": 1840 + }, + { + "epoch": 0.10199004975124377, + "grad_norm": 3.0262868404388428, + "learning_rate": 4.617680500284253e-05, + "loss": 6.1664, + "step": 1845 + }, + { + "epoch": 0.10226644555002765, + "grad_norm": 3.05643367767334, + "learning_rate": 4.6162592382035246e-05, + "loss": 6.2904, + "step": 1850 + }, + { + "epoch": 0.1025428413488115, + "grad_norm": 2.7000110149383545, + "learning_rate": 4.614837976122797e-05, + "loss": 6.0892, + "step": 1855 + }, + { + "epoch": 0.10281923714759536, + "grad_norm": 2.893402338027954, + "learning_rate": 4.6134167140420694e-05, + "loss": 5.8826, + "step": 1860 + }, + { + "epoch": 0.10309563294637922, + "grad_norm": 2.5244555473327637, + "learning_rate": 4.611995451961342e-05, + "loss": 5.9429, + "step": 1865 + }, + { + "epoch": 0.10337202874516307, + "grad_norm": 2.5196962356567383, + "learning_rate": 4.610574189880614e-05, + "loss": 5.8624, + "step": 1870 + }, + { + "epoch": 0.10364842454394693, + "grad_norm": 2.5912246704101562, + "learning_rate": 4.609152927799886e-05, + "loss": 6.1336, + "step": 1875 + }, + { + "epoch": 0.10392482034273079, + "grad_norm": 2.757858991622925, + "learning_rate": 4.607731665719159e-05, + "loss": 6.0385, + "step": 1880 + }, + { + "epoch": 0.10420121614151465, + "grad_norm": 2.6587064266204834, + "learning_rate": 4.6063104036384314e-05, + "loss": 6.0566, + "step": 1885 + }, + { + "epoch": 0.1044776119402985, + "grad_norm": 3.415855646133423, + "learning_rate": 4.604889141557703e-05, + "loss": 6.4943, + "step": 1890 + }, + { + "epoch": 0.10475400773908236, + "grad_norm": 2.9792914390563965, + "learning_rate": 4.603467879476976e-05, + "loss": 5.8498, + "step": 1895 + }, + { + "epoch": 0.10503040353786622, + "grad_norm": 3.112447500228882, + "learning_rate": 4.602046617396248e-05, + "loss": 6.2611, + "step": 1900 + }, + { + "epoch": 0.10530679933665009, + "grad_norm": 2.8653833866119385, + "learning_rate": 4.60062535531552e-05, + "loss": 6.0917, + "step": 1905 + }, + { + "epoch": 0.10558319513543395, + "grad_norm": 2.7462995052337646, + "learning_rate": 4.599204093234793e-05, + "loss": 6.2313, + "step": 1910 + }, + { + "epoch": 0.1058595909342178, + "grad_norm": 2.4381167888641357, + "learning_rate": 4.597782831154065e-05, + "loss": 6.1274, + "step": 1915 + }, + { + "epoch": 0.10613598673300166, + "grad_norm": 3.3550705909729004, + "learning_rate": 4.5963615690733375e-05, + "loss": 6.246, + "step": 1920 + }, + { + "epoch": 0.10641238253178552, + "grad_norm": 2.736597776412964, + "learning_rate": 4.594940306992609e-05, + "loss": 6.3576, + "step": 1925 + }, + { + "epoch": 0.10668877833056938, + "grad_norm": 2.4143283367156982, + "learning_rate": 4.593519044911882e-05, + "loss": 5.9825, + "step": 1930 + }, + { + "epoch": 0.10696517412935323, + "grad_norm": 2.622124195098877, + "learning_rate": 4.592097782831155e-05, + "loss": 6.1076, + "step": 1935 + }, + { + "epoch": 0.10724156992813709, + "grad_norm": 3.135380744934082, + "learning_rate": 4.5906765207504264e-05, + "loss": 6.0845, + "step": 1940 + }, + { + "epoch": 0.10751796572692095, + "grad_norm": 2.970335006713867, + "learning_rate": 4.589255258669699e-05, + "loss": 6.1974, + "step": 1945 + }, + { + "epoch": 0.1077943615257048, + "grad_norm": 4.032550811767578, + "learning_rate": 4.587833996588971e-05, + "loss": 6.0646, + "step": 1950 + }, + { + "epoch": 0.10807075732448866, + "grad_norm": 2.632209300994873, + "learning_rate": 4.5864127345082436e-05, + "loss": 5.8096, + "step": 1955 + }, + { + "epoch": 0.10834715312327253, + "grad_norm": 2.648122549057007, + "learning_rate": 4.5849914724275154e-05, + "loss": 6.139, + "step": 1960 + }, + { + "epoch": 0.10862354892205639, + "grad_norm": 2.7293598651885986, + "learning_rate": 4.5835702103467884e-05, + "loss": 5.9382, + "step": 1965 + }, + { + "epoch": 0.10889994472084025, + "grad_norm": 2.997490167617798, + "learning_rate": 4.582148948266061e-05, + "loss": 6.1471, + "step": 1970 + }, + { + "epoch": 0.1091763405196241, + "grad_norm": 3.3765926361083984, + "learning_rate": 4.5807276861853326e-05, + "loss": 6.2572, + "step": 1975 + }, + { + "epoch": 0.10945273631840796, + "grad_norm": 2.962548017501831, + "learning_rate": 4.579306424104605e-05, + "loss": 6.0159, + "step": 1980 + }, + { + "epoch": 0.10972913211719182, + "grad_norm": 2.896846294403076, + "learning_rate": 4.5778851620238773e-05, + "loss": 5.6825, + "step": 1985 + }, + { + "epoch": 0.11000552791597568, + "grad_norm": 3.3139047622680664, + "learning_rate": 4.57646389994315e-05, + "loss": 6.0216, + "step": 1990 + }, + { + "epoch": 0.11028192371475953, + "grad_norm": 2.7148447036743164, + "learning_rate": 4.575042637862422e-05, + "loss": 6.1073, + "step": 1995 + }, + { + "epoch": 0.11055831951354339, + "grad_norm": 2.597428798675537, + "learning_rate": 4.5736213757816945e-05, + "loss": 6.2397, + "step": 2000 + }, + { + "epoch": 0.11083471531232725, + "grad_norm": 3.059511184692383, + "learning_rate": 4.572200113700967e-05, + "loss": 6.2985, + "step": 2005 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 2.423731803894043, + "learning_rate": 4.570778851620239e-05, + "loss": 6.2662, + "step": 2010 + }, + { + "epoch": 0.11138750690989498, + "grad_norm": 2.665515899658203, + "learning_rate": 4.569357589539511e-05, + "loss": 6.1349, + "step": 2015 + }, + { + "epoch": 0.11166390270867883, + "grad_norm": 2.9362165927886963, + "learning_rate": 4.567936327458784e-05, + "loss": 6.1175, + "step": 2020 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 2.7762677669525146, + "learning_rate": 4.566515065378056e-05, + "loss": 5.9836, + "step": 2025 + }, + { + "epoch": 0.11221669430624655, + "grad_norm": 3.224057674407959, + "learning_rate": 4.565093803297328e-05, + "loss": 6.0051, + "step": 2030 + }, + { + "epoch": 0.1124930901050304, + "grad_norm": 3.2076215744018555, + "learning_rate": 4.5636725412166007e-05, + "loss": 6.1282, + "step": 2035 + }, + { + "epoch": 0.11276948590381426, + "grad_norm": 2.4749698638916016, + "learning_rate": 4.562251279135873e-05, + "loss": 6.0949, + "step": 2040 + }, + { + "epoch": 0.11304588170259812, + "grad_norm": 2.6076836585998535, + "learning_rate": 4.5608300170551454e-05, + "loss": 6.1536, + "step": 2045 + }, + { + "epoch": 0.11332227750138198, + "grad_norm": 2.7696752548217773, + "learning_rate": 4.559408754974417e-05, + "loss": 6.0666, + "step": 2050 + }, + { + "epoch": 0.11359867330016583, + "grad_norm": 2.964919328689575, + "learning_rate": 4.55798749289369e-05, + "loss": 6.0302, + "step": 2055 + }, + { + "epoch": 0.11387506909894969, + "grad_norm": 2.8098812103271484, + "learning_rate": 4.556566230812962e-05, + "loss": 6.0762, + "step": 2060 + }, + { + "epoch": 0.11415146489773355, + "grad_norm": 3.016932725906372, + "learning_rate": 4.5551449687322344e-05, + "loss": 5.993, + "step": 2065 + }, + { + "epoch": 0.11442786069651742, + "grad_norm": 2.170443058013916, + "learning_rate": 4.553723706651507e-05, + "loss": 6.0171, + "step": 2070 + }, + { + "epoch": 0.11470425649530128, + "grad_norm": 2.617326259613037, + "learning_rate": 4.552302444570779e-05, + "loss": 5.916, + "step": 2075 + }, + { + "epoch": 0.11498065229408513, + "grad_norm": 2.2992942333221436, + "learning_rate": 4.5508811824900516e-05, + "loss": 6.2716, + "step": 2080 + }, + { + "epoch": 0.11525704809286899, + "grad_norm": 2.6933460235595703, + "learning_rate": 4.549459920409323e-05, + "loss": 6.0187, + "step": 2085 + }, + { + "epoch": 0.11553344389165285, + "grad_norm": 2.333048105239868, + "learning_rate": 4.5480386583285964e-05, + "loss": 6.0797, + "step": 2090 + }, + { + "epoch": 0.1158098396904367, + "grad_norm": 2.7053120136260986, + "learning_rate": 4.546617396247868e-05, + "loss": 6.3752, + "step": 2095 + }, + { + "epoch": 0.11608623548922056, + "grad_norm": 2.7705891132354736, + "learning_rate": 4.5451961341671405e-05, + "loss": 5.9062, + "step": 2100 + }, + { + "epoch": 0.11636263128800442, + "grad_norm": 2.7516586780548096, + "learning_rate": 4.543774872086413e-05, + "loss": 5.8142, + "step": 2105 + }, + { + "epoch": 0.11663902708678828, + "grad_norm": 3.1628334522247314, + "learning_rate": 4.542353610005685e-05, + "loss": 6.151, + "step": 2110 + }, + { + "epoch": 0.11691542288557213, + "grad_norm": 2.7654128074645996, + "learning_rate": 4.540932347924958e-05, + "loss": 6.0071, + "step": 2115 + }, + { + "epoch": 0.11719181868435599, + "grad_norm": 2.2299458980560303, + "learning_rate": 4.5395110858442294e-05, + "loss": 5.9453, + "step": 2120 + }, + { + "epoch": 0.11746821448313986, + "grad_norm": 2.7742066383361816, + "learning_rate": 4.5380898237635025e-05, + "loss": 6.0043, + "step": 2125 + }, + { + "epoch": 0.11774461028192372, + "grad_norm": 2.242952585220337, + "learning_rate": 4.536668561682775e-05, + "loss": 5.8624, + "step": 2130 + }, + { + "epoch": 0.11802100608070758, + "grad_norm": 3.177030086517334, + "learning_rate": 4.5352472996020466e-05, + "loss": 5.9175, + "step": 2135 + }, + { + "epoch": 0.11829740187949143, + "grad_norm": 3.0436911582946777, + "learning_rate": 4.533826037521319e-05, + "loss": 6.0026, + "step": 2140 + }, + { + "epoch": 0.11857379767827529, + "grad_norm": 3.5062520503997803, + "learning_rate": 4.5324047754405914e-05, + "loss": 6.2474, + "step": 2145 + }, + { + "epoch": 0.11885019347705915, + "grad_norm": 2.291909694671631, + "learning_rate": 4.530983513359864e-05, + "loss": 6.2062, + "step": 2150 + }, + { + "epoch": 0.119126589275843, + "grad_norm": 2.3270986080169678, + "learning_rate": 4.529562251279136e-05, + "loss": 6.2895, + "step": 2155 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 3.0153403282165527, + "learning_rate": 4.5281409891984086e-05, + "loss": 6.1369, + "step": 2160 + }, + { + "epoch": 0.11967938087341072, + "grad_norm": 2.496553897857666, + "learning_rate": 4.526719727117681e-05, + "loss": 6.1861, + "step": 2165 + }, + { + "epoch": 0.11995577667219458, + "grad_norm": 2.5240025520324707, + "learning_rate": 4.525298465036953e-05, + "loss": 6.3099, + "step": 2170 + }, + { + "epoch": 0.12023217247097844, + "grad_norm": 2.7190871238708496, + "learning_rate": 4.523877202956225e-05, + "loss": 5.737, + "step": 2175 + }, + { + "epoch": 0.1205085682697623, + "grad_norm": 2.7631752490997314, + "learning_rate": 4.5224559408754975e-05, + "loss": 6.1848, + "step": 2180 + }, + { + "epoch": 0.12078496406854616, + "grad_norm": 2.1841232776641846, + "learning_rate": 4.52103467879477e-05, + "loss": 6.0631, + "step": 2185 + }, + { + "epoch": 0.12106135986733002, + "grad_norm": 2.531573534011841, + "learning_rate": 4.519613416714042e-05, + "loss": 6.1404, + "step": 2190 + }, + { + "epoch": 0.12133775566611388, + "grad_norm": 3.5655975341796875, + "learning_rate": 4.518192154633315e-05, + "loss": 6.1925, + "step": 2195 + }, + { + "epoch": 0.12161415146489774, + "grad_norm": 2.4978582859039307, + "learning_rate": 4.516770892552587e-05, + "loss": 5.9768, + "step": 2200 + }, + { + "epoch": 0.12189054726368159, + "grad_norm": 2.800384759902954, + "learning_rate": 4.515349630471859e-05, + "loss": 6.1647, + "step": 2205 + }, + { + "epoch": 0.12216694306246545, + "grad_norm": 3.2375316619873047, + "learning_rate": 4.513928368391132e-05, + "loss": 5.9686, + "step": 2210 + }, + { + "epoch": 0.1224433388612493, + "grad_norm": 3.0477945804595947, + "learning_rate": 4.512507106310404e-05, + "loss": 5.8481, + "step": 2215 + }, + { + "epoch": 0.12271973466003316, + "grad_norm": 2.7675116062164307, + "learning_rate": 4.511085844229676e-05, + "loss": 6.1965, + "step": 2220 + }, + { + "epoch": 0.12299613045881702, + "grad_norm": 3.220231056213379, + "learning_rate": 4.5096645821489484e-05, + "loss": 6.0191, + "step": 2225 + }, + { + "epoch": 0.12327252625760088, + "grad_norm": 2.473461389541626, + "learning_rate": 4.508243320068221e-05, + "loss": 5.7706, + "step": 2230 + }, + { + "epoch": 0.12354892205638475, + "grad_norm": 2.950104236602783, + "learning_rate": 4.506822057987493e-05, + "loss": 6.0164, + "step": 2235 + }, + { + "epoch": 0.1238253178551686, + "grad_norm": 3.0107226371765137, + "learning_rate": 4.5054007959067656e-05, + "loss": 5.9676, + "step": 2240 + }, + { + "epoch": 0.12410171365395246, + "grad_norm": 3.3894078731536865, + "learning_rate": 4.503979533826038e-05, + "loss": 5.9496, + "step": 2245 + }, + { + "epoch": 0.12437810945273632, + "grad_norm": 2.7371933460235596, + "learning_rate": 4.5025582717453104e-05, + "loss": 5.9917, + "step": 2250 + }, + { + "epoch": 0.12465450525152018, + "grad_norm": 2.829718589782715, + "learning_rate": 4.501137009664582e-05, + "loss": 5.881, + "step": 2255 + }, + { + "epoch": 0.12493090105030404, + "grad_norm": 2.5616791248321533, + "learning_rate": 4.4997157475838545e-05, + "loss": 5.8512, + "step": 2260 + }, + { + "epoch": 0.1252072968490879, + "grad_norm": 2.7623887062072754, + "learning_rate": 4.4982944855031276e-05, + "loss": 6.1254, + "step": 2265 + }, + { + "epoch": 0.12548369264787176, + "grad_norm": 2.115161895751953, + "learning_rate": 4.496873223422399e-05, + "loss": 5.9236, + "step": 2270 + }, + { + "epoch": 0.12576008844665562, + "grad_norm": 2.32071590423584, + "learning_rate": 4.495451961341672e-05, + "loss": 5.8425, + "step": 2275 + }, + { + "epoch": 0.12603648424543948, + "grad_norm": 3.4130492210388184, + "learning_rate": 4.494030699260944e-05, + "loss": 5.8947, + "step": 2280 + }, + { + "epoch": 0.12631288004422334, + "grad_norm": 2.7116572856903076, + "learning_rate": 4.4926094371802165e-05, + "loss": 6.2554, + "step": 2285 + }, + { + "epoch": 0.1265892758430072, + "grad_norm": 2.960740804672241, + "learning_rate": 4.491188175099488e-05, + "loss": 5.835, + "step": 2290 + }, + { + "epoch": 0.12686567164179105, + "grad_norm": 2.641920804977417, + "learning_rate": 4.4897669130187606e-05, + "loss": 6.0452, + "step": 2295 + }, + { + "epoch": 0.1271420674405749, + "grad_norm": 2.40816068649292, + "learning_rate": 4.488345650938034e-05, + "loss": 6.1682, + "step": 2300 + }, + { + "epoch": 0.12741846323935876, + "grad_norm": 2.953784942626953, + "learning_rate": 4.4869243888573054e-05, + "loss": 5.8982, + "step": 2305 + }, + { + "epoch": 0.12769485903814262, + "grad_norm": 2.7458655834198, + "learning_rate": 4.485503126776578e-05, + "loss": 5.7274, + "step": 2310 + }, + { + "epoch": 0.12797125483692648, + "grad_norm": 2.4384260177612305, + "learning_rate": 4.48408186469585e-05, + "loss": 5.8491, + "step": 2315 + }, + { + "epoch": 0.12824765063571034, + "grad_norm": 3.5330193042755127, + "learning_rate": 4.4826606026151226e-05, + "loss": 6.1105, + "step": 2320 + }, + { + "epoch": 0.1285240464344942, + "grad_norm": 3.598749876022339, + "learning_rate": 4.481239340534395e-05, + "loss": 5.7398, + "step": 2325 + }, + { + "epoch": 0.12880044223327805, + "grad_norm": 2.857062339782715, + "learning_rate": 4.479818078453667e-05, + "loss": 6.0652, + "step": 2330 + }, + { + "epoch": 0.1290768380320619, + "grad_norm": 2.578871250152588, + "learning_rate": 4.47839681637294e-05, + "loss": 5.9814, + "step": 2335 + }, + { + "epoch": 0.12935323383084577, + "grad_norm": 3.088256597518921, + "learning_rate": 4.4769755542922115e-05, + "loss": 5.8343, + "step": 2340 + }, + { + "epoch": 0.12962962962962962, + "grad_norm": 2.48786997795105, + "learning_rate": 4.475554292211484e-05, + "loss": 5.9264, + "step": 2345 + }, + { + "epoch": 0.12990602542841348, + "grad_norm": 2.87296199798584, + "learning_rate": 4.474133030130756e-05, + "loss": 5.8988, + "step": 2350 + }, + { + "epoch": 0.13018242122719734, + "grad_norm": 3.202390193939209, + "learning_rate": 4.472711768050029e-05, + "loss": 5.7166, + "step": 2355 + }, + { + "epoch": 0.1304588170259812, + "grad_norm": 3.6477270126342773, + "learning_rate": 4.471290505969301e-05, + "loss": 5.7914, + "step": 2360 + }, + { + "epoch": 0.13073521282476505, + "grad_norm": 2.6237668991088867, + "learning_rate": 4.469869243888573e-05, + "loss": 5.9484, + "step": 2365 + }, + { + "epoch": 0.1310116086235489, + "grad_norm": 2.782205104827881, + "learning_rate": 4.468447981807846e-05, + "loss": 6.0338, + "step": 2370 + }, + { + "epoch": 0.1312880044223328, + "grad_norm": 2.7849576473236084, + "learning_rate": 4.467026719727118e-05, + "loss": 6.1244, + "step": 2375 + }, + { + "epoch": 0.13156440022111665, + "grad_norm": 2.8187060356140137, + "learning_rate": 4.46560545764639e-05, + "loss": 5.6802, + "step": 2380 + }, + { + "epoch": 0.1318407960199005, + "grad_norm": 2.4502339363098145, + "learning_rate": 4.4641841955656624e-05, + "loss": 5.9567, + "step": 2385 + }, + { + "epoch": 0.13211719181868437, + "grad_norm": 2.743391275405884, + "learning_rate": 4.462762933484935e-05, + "loss": 5.737, + "step": 2390 + }, + { + "epoch": 0.13239358761746822, + "grad_norm": 2.580054998397827, + "learning_rate": 4.461341671404207e-05, + "loss": 5.7443, + "step": 2395 + }, + { + "epoch": 0.13266998341625208, + "grad_norm": 3.157968282699585, + "learning_rate": 4.459920409323479e-05, + "loss": 6.083, + "step": 2400 + }, + { + "epoch": 0.13294637921503594, + "grad_norm": 2.9599556922912598, + "learning_rate": 4.458499147242752e-05, + "loss": 6.0339, + "step": 2405 + }, + { + "epoch": 0.1332227750138198, + "grad_norm": 3.3958144187927246, + "learning_rate": 4.4570778851620244e-05, + "loss": 5.9857, + "step": 2410 + }, + { + "epoch": 0.13349917081260365, + "grad_norm": 3.3407669067382812, + "learning_rate": 4.455656623081296e-05, + "loss": 5.7594, + "step": 2415 + }, + { + "epoch": 0.1337755666113875, + "grad_norm": 2.816286563873291, + "learning_rate": 4.4542353610005685e-05, + "loss": 6.0003, + "step": 2420 + }, + { + "epoch": 0.13405196241017137, + "grad_norm": 2.553955554962158, + "learning_rate": 4.452814098919841e-05, + "loss": 6.0085, + "step": 2425 + }, + { + "epoch": 0.13432835820895522, + "grad_norm": 3.3297877311706543, + "learning_rate": 4.451392836839113e-05, + "loss": 5.8029, + "step": 2430 + }, + { + "epoch": 0.13460475400773908, + "grad_norm": 2.530499219894409, + "learning_rate": 4.449971574758386e-05, + "loss": 5.9603, + "step": 2435 + }, + { + "epoch": 0.13488114980652294, + "grad_norm": 2.859200954437256, + "learning_rate": 4.448550312677658e-05, + "loss": 5.9611, + "step": 2440 + }, + { + "epoch": 0.1351575456053068, + "grad_norm": 3.111984968185425, + "learning_rate": 4.4471290505969305e-05, + "loss": 5.9601, + "step": 2445 + }, + { + "epoch": 0.13543394140409065, + "grad_norm": 2.6724886894226074, + "learning_rate": 4.445707788516202e-05, + "loss": 5.9028, + "step": 2450 + }, + { + "epoch": 0.1357103372028745, + "grad_norm": 2.909550666809082, + "learning_rate": 4.4442865264354746e-05, + "loss": 6.1483, + "step": 2455 + }, + { + "epoch": 0.13598673300165837, + "grad_norm": 2.8380892276763916, + "learning_rate": 4.442865264354748e-05, + "loss": 5.9103, + "step": 2460 + }, + { + "epoch": 0.13626312880044222, + "grad_norm": 2.6073994636535645, + "learning_rate": 4.4414440022740194e-05, + "loss": 5.9974, + "step": 2465 + }, + { + "epoch": 0.13653952459922608, + "grad_norm": 3.0573458671569824, + "learning_rate": 4.440022740193292e-05, + "loss": 5.9833, + "step": 2470 + }, + { + "epoch": 0.13681592039800994, + "grad_norm": 2.835435152053833, + "learning_rate": 4.438601478112564e-05, + "loss": 5.9305, + "step": 2475 + }, + { + "epoch": 0.1370923161967938, + "grad_norm": 2.9409842491149902, + "learning_rate": 4.4371802160318366e-05, + "loss": 6.0945, + "step": 2480 + }, + { + "epoch": 0.13736871199557768, + "grad_norm": 2.7987194061279297, + "learning_rate": 4.435758953951108e-05, + "loss": 6.0813, + "step": 2485 + }, + { + "epoch": 0.13764510779436154, + "grad_norm": 2.926170825958252, + "learning_rate": 4.434337691870381e-05, + "loss": 6.1433, + "step": 2490 + }, + { + "epoch": 0.1379215035931454, + "grad_norm": 2.812005043029785, + "learning_rate": 4.432916429789654e-05, + "loss": 6.1284, + "step": 2495 + }, + { + "epoch": 0.13819789939192925, + "grad_norm": 4.218148231506348, + "learning_rate": 4.4314951677089255e-05, + "loss": 6.2463, + "step": 2500 + }, + { + "epoch": 0.1384742951907131, + "grad_norm": 3.2735049724578857, + "learning_rate": 4.430073905628198e-05, + "loss": 6.3637, + "step": 2505 + }, + { + "epoch": 0.13875069098949697, + "grad_norm": 2.975898265838623, + "learning_rate": 4.42865264354747e-05, + "loss": 5.8744, + "step": 2510 + }, + { + "epoch": 0.13902708678828082, + "grad_norm": 2.5579276084899902, + "learning_rate": 4.427231381466743e-05, + "loss": 5.8671, + "step": 2515 + }, + { + "epoch": 0.13930348258706468, + "grad_norm": 2.9788095951080322, + "learning_rate": 4.425810119386015e-05, + "loss": 5.7345, + "step": 2520 + }, + { + "epoch": 0.13957987838584854, + "grad_norm": 2.781212568283081, + "learning_rate": 4.4243888573052875e-05, + "loss": 6.0089, + "step": 2525 + }, + { + "epoch": 0.1398562741846324, + "grad_norm": 3.1490135192871094, + "learning_rate": 4.42296759522456e-05, + "loss": 5.9384, + "step": 2530 + }, + { + "epoch": 0.14013266998341625, + "grad_norm": 3.2188096046447754, + "learning_rate": 4.4215463331438316e-05, + "loss": 6.0045, + "step": 2535 + }, + { + "epoch": 0.1404090657822001, + "grad_norm": 3.2777388095855713, + "learning_rate": 4.420125071063104e-05, + "loss": 5.956, + "step": 2540 + }, + { + "epoch": 0.14068546158098397, + "grad_norm": 3.3653669357299805, + "learning_rate": 4.418703808982377e-05, + "loss": 5.9132, + "step": 2545 + }, + { + "epoch": 0.14096185737976782, + "grad_norm": 2.6247451305389404, + "learning_rate": 4.417282546901649e-05, + "loss": 5.8262, + "step": 2550 + }, + { + "epoch": 0.14123825317855168, + "grad_norm": 2.3984429836273193, + "learning_rate": 4.415861284820921e-05, + "loss": 5.8723, + "step": 2555 + }, + { + "epoch": 0.14151464897733554, + "grad_norm": 2.4766428470611572, + "learning_rate": 4.4144400227401936e-05, + "loss": 6.0487, + "step": 2560 + }, + { + "epoch": 0.1417910447761194, + "grad_norm": 2.6634018421173096, + "learning_rate": 4.413018760659466e-05, + "loss": 6.0009, + "step": 2565 + }, + { + "epoch": 0.14206744057490325, + "grad_norm": 3.0467846393585205, + "learning_rate": 4.4115974985787384e-05, + "loss": 6.0557, + "step": 2570 + }, + { + "epoch": 0.1423438363736871, + "grad_norm": 2.9271202087402344, + "learning_rate": 4.41017623649801e-05, + "loss": 5.7928, + "step": 2575 + }, + { + "epoch": 0.14262023217247097, + "grad_norm": 2.9135117530822754, + "learning_rate": 4.408754974417283e-05, + "loss": 5.6288, + "step": 2580 + }, + { + "epoch": 0.14289662797125482, + "grad_norm": 2.667640209197998, + "learning_rate": 4.407333712336555e-05, + "loss": 5.9423, + "step": 2585 + }, + { + "epoch": 0.14317302377003868, + "grad_norm": 2.59460711479187, + "learning_rate": 4.405912450255827e-05, + "loss": 6.1813, + "step": 2590 + }, + { + "epoch": 0.14344941956882257, + "grad_norm": 3.593357801437378, + "learning_rate": 4.4044911881751e-05, + "loss": 5.8255, + "step": 2595 + }, + { + "epoch": 0.14372581536760642, + "grad_norm": 2.5840964317321777, + "learning_rate": 4.403069926094372e-05, + "loss": 5.9867, + "step": 2600 + }, + { + "epoch": 0.14400221116639028, + "grad_norm": 3.780487060546875, + "learning_rate": 4.4016486640136445e-05, + "loss": 5.734, + "step": 2605 + }, + { + "epoch": 0.14427860696517414, + "grad_norm": 2.3756935596466064, + "learning_rate": 4.400227401932916e-05, + "loss": 6.0119, + "step": 2610 + }, + { + "epoch": 0.144555002763958, + "grad_norm": 2.420318126678467, + "learning_rate": 4.398806139852189e-05, + "loss": 5.8429, + "step": 2615 + }, + { + "epoch": 0.14483139856274185, + "grad_norm": 2.4723665714263916, + "learning_rate": 4.397384877771461e-05, + "loss": 5.9102, + "step": 2620 + }, + { + "epoch": 0.1451077943615257, + "grad_norm": 2.8111989498138428, + "learning_rate": 4.3959636156907334e-05, + "loss": 5.796, + "step": 2625 + }, + { + "epoch": 0.14538419016030957, + "grad_norm": 2.4505462646484375, + "learning_rate": 4.394542353610006e-05, + "loss": 5.7925, + "step": 2630 + }, + { + "epoch": 0.14566058595909342, + "grad_norm": 2.9195985794067383, + "learning_rate": 4.393121091529278e-05, + "loss": 5.9991, + "step": 2635 + }, + { + "epoch": 0.14593698175787728, + "grad_norm": 3.3594954013824463, + "learning_rate": 4.3916998294485506e-05, + "loss": 6.0823, + "step": 2640 + }, + { + "epoch": 0.14621337755666114, + "grad_norm": 2.6641616821289062, + "learning_rate": 4.3902785673678224e-05, + "loss": 6.0646, + "step": 2645 + }, + { + "epoch": 0.146489773355445, + "grad_norm": 3.2339580059051514, + "learning_rate": 4.3888573052870954e-05, + "loss": 6.2217, + "step": 2650 + }, + { + "epoch": 0.14676616915422885, + "grad_norm": 2.6006035804748535, + "learning_rate": 4.387436043206368e-05, + "loss": 5.8923, + "step": 2655 + }, + { + "epoch": 0.1470425649530127, + "grad_norm": 2.918951988220215, + "learning_rate": 4.3860147811256395e-05, + "loss": 5.8821, + "step": 2660 + }, + { + "epoch": 0.14731896075179657, + "grad_norm": 3.5756337642669678, + "learning_rate": 4.384593519044912e-05, + "loss": 5.9689, + "step": 2665 + }, + { + "epoch": 0.14759535655058043, + "grad_norm": 2.472301483154297, + "learning_rate": 4.3831722569641843e-05, + "loss": 6.1298, + "step": 2670 + }, + { + "epoch": 0.14787175234936428, + "grad_norm": 3.1976962089538574, + "learning_rate": 4.381750994883457e-05, + "loss": 5.9328, + "step": 2675 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 2.672852039337158, + "learning_rate": 4.380329732802729e-05, + "loss": 5.7058, + "step": 2680 + }, + { + "epoch": 0.148424543946932, + "grad_norm": 2.8173885345458984, + "learning_rate": 4.3789084707220015e-05, + "loss": 5.9733, + "step": 2685 + }, + { + "epoch": 0.14870093974571585, + "grad_norm": 2.9812614917755127, + "learning_rate": 4.377487208641274e-05, + "loss": 5.9916, + "step": 2690 + }, + { + "epoch": 0.1489773355444997, + "grad_norm": 2.7552757263183594, + "learning_rate": 4.3760659465605457e-05, + "loss": 5.4035, + "step": 2695 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 2.684389114379883, + "learning_rate": 4.374644684479818e-05, + "loss": 5.9404, + "step": 2700 + }, + { + "epoch": 0.14953012714206745, + "grad_norm": 3.3270440101623535, + "learning_rate": 4.3732234223990905e-05, + "loss": 5.9081, + "step": 2705 + }, + { + "epoch": 0.1498065229408513, + "grad_norm": 3.212458848953247, + "learning_rate": 4.371802160318363e-05, + "loss": 6.2057, + "step": 2710 + }, + { + "epoch": 0.15008291873963517, + "grad_norm": 2.9287021160125732, + "learning_rate": 4.370380898237635e-05, + "loss": 6.2109, + "step": 2715 + }, + { + "epoch": 0.15035931453841903, + "grad_norm": 3.4137637615203857, + "learning_rate": 4.3689596361569076e-05, + "loss": 5.805, + "step": 2720 + }, + { + "epoch": 0.15063571033720288, + "grad_norm": 2.5240910053253174, + "learning_rate": 4.36753837407618e-05, + "loss": 5.8864, + "step": 2725 + }, + { + "epoch": 0.15091210613598674, + "grad_norm": 3.12414288520813, + "learning_rate": 4.366117111995452e-05, + "loss": 6.0336, + "step": 2730 + }, + { + "epoch": 0.1511885019347706, + "grad_norm": 3.1242873668670654, + "learning_rate": 4.364695849914724e-05, + "loss": 6.021, + "step": 2735 + }, + { + "epoch": 0.15146489773355445, + "grad_norm": 3.223360538482666, + "learning_rate": 4.363274587833997e-05, + "loss": 6.0656, + "step": 2740 + }, + { + "epoch": 0.1517412935323383, + "grad_norm": 3.3459601402282715, + "learning_rate": 4.361853325753269e-05, + "loss": 6.0403, + "step": 2745 + }, + { + "epoch": 0.15201768933112217, + "grad_norm": 2.8621153831481934, + "learning_rate": 4.3604320636725414e-05, + "loss": 5.6734, + "step": 2750 + }, + { + "epoch": 0.15229408512990603, + "grad_norm": 2.841547966003418, + "learning_rate": 4.359010801591814e-05, + "loss": 5.8039, + "step": 2755 + }, + { + "epoch": 0.15257048092868988, + "grad_norm": 2.655369758605957, + "learning_rate": 4.357589539511086e-05, + "loss": 5.8627, + "step": 2760 + }, + { + "epoch": 0.15284687672747374, + "grad_norm": 3.5070695877075195, + "learning_rate": 4.3561682774303586e-05, + "loss": 5.7962, + "step": 2765 + }, + { + "epoch": 0.1531232725262576, + "grad_norm": 2.460430860519409, + "learning_rate": 4.35474701534963e-05, + "loss": 5.9058, + "step": 2770 + }, + { + "epoch": 0.15339966832504145, + "grad_norm": 2.9735209941864014, + "learning_rate": 4.3533257532689033e-05, + "loss": 5.9459, + "step": 2775 + }, + { + "epoch": 0.1536760641238253, + "grad_norm": 2.6049184799194336, + "learning_rate": 4.351904491188175e-05, + "loss": 5.8831, + "step": 2780 + }, + { + "epoch": 0.15395245992260917, + "grad_norm": 3.0773181915283203, + "learning_rate": 4.3504832291074475e-05, + "loss": 5.8567, + "step": 2785 + }, + { + "epoch": 0.15422885572139303, + "grad_norm": 2.9767990112304688, + "learning_rate": 4.34906196702672e-05, + "loss": 5.884, + "step": 2790 + }, + { + "epoch": 0.15450525152017688, + "grad_norm": 2.9856009483337402, + "learning_rate": 4.347640704945992e-05, + "loss": 5.7991, + "step": 2795 + }, + { + "epoch": 0.15478164731896074, + "grad_norm": 2.7694013118743896, + "learning_rate": 4.3462194428652647e-05, + "loss": 5.9117, + "step": 2800 + }, + { + "epoch": 0.1550580431177446, + "grad_norm": 2.524625778198242, + "learning_rate": 4.3447981807845364e-05, + "loss": 5.8089, + "step": 2805 + }, + { + "epoch": 0.15533443891652846, + "grad_norm": 2.265674591064453, + "learning_rate": 4.3433769187038095e-05, + "loss": 5.7669, + "step": 2810 + }, + { + "epoch": 0.15561083471531234, + "grad_norm": 2.9891350269317627, + "learning_rate": 4.341955656623081e-05, + "loss": 5.7408, + "step": 2815 + }, + { + "epoch": 0.1558872305140962, + "grad_norm": 2.8526082038879395, + "learning_rate": 4.3405343945423536e-05, + "loss": 5.5601, + "step": 2820 + }, + { + "epoch": 0.15616362631288006, + "grad_norm": 2.755467653274536, + "learning_rate": 4.3391131324616266e-05, + "loss": 5.6325, + "step": 2825 + }, + { + "epoch": 0.1564400221116639, + "grad_norm": 3.075253963470459, + "learning_rate": 4.3376918703808984e-05, + "loss": 6.0243, + "step": 2830 + }, + { + "epoch": 0.15671641791044777, + "grad_norm": 2.6494526863098145, + "learning_rate": 4.336270608300171e-05, + "loss": 5.7588, + "step": 2835 + }, + { + "epoch": 0.15699281370923163, + "grad_norm": 3.440871238708496, + "learning_rate": 4.334849346219443e-05, + "loss": 5.9682, + "step": 2840 + }, + { + "epoch": 0.15726920950801548, + "grad_norm": 3.129068374633789, + "learning_rate": 4.3334280841387156e-05, + "loss": 5.8397, + "step": 2845 + }, + { + "epoch": 0.15754560530679934, + "grad_norm": 2.8854775428771973, + "learning_rate": 4.332006822057988e-05, + "loss": 5.6775, + "step": 2850 + }, + { + "epoch": 0.1578220011055832, + "grad_norm": 2.4169180393218994, + "learning_rate": 4.33058555997726e-05, + "loss": 5.7348, + "step": 2855 + }, + { + "epoch": 0.15809839690436706, + "grad_norm": 2.6656081676483154, + "learning_rate": 4.329164297896533e-05, + "loss": 5.5804, + "step": 2860 + }, + { + "epoch": 0.1583747927031509, + "grad_norm": 2.1529061794281006, + "learning_rate": 4.3277430358158045e-05, + "loss": 6.0426, + "step": 2865 + }, + { + "epoch": 0.15865118850193477, + "grad_norm": 2.7012405395507812, + "learning_rate": 4.326321773735077e-05, + "loss": 5.8865, + "step": 2870 + }, + { + "epoch": 0.15892758430071863, + "grad_norm": 3.113957643508911, + "learning_rate": 4.324900511654349e-05, + "loss": 5.7905, + "step": 2875 + }, + { + "epoch": 0.15920398009950248, + "grad_norm": 2.826045274734497, + "learning_rate": 4.323479249573622e-05, + "loss": 5.8868, + "step": 2880 + }, + { + "epoch": 0.15948037589828634, + "grad_norm": 2.5294294357299805, + "learning_rate": 4.322057987492894e-05, + "loss": 5.6514, + "step": 2885 + }, + { + "epoch": 0.1597567716970702, + "grad_norm": 2.923967123031616, + "learning_rate": 4.320636725412166e-05, + "loss": 6.0134, + "step": 2890 + }, + { + "epoch": 0.16003316749585406, + "grad_norm": 3.1587777137756348, + "learning_rate": 4.319215463331439e-05, + "loss": 6.1809, + "step": 2895 + }, + { + "epoch": 0.1603095632946379, + "grad_norm": 2.6380045413970947, + "learning_rate": 4.317794201250711e-05, + "loss": 5.828, + "step": 2900 + }, + { + "epoch": 0.16058595909342177, + "grad_norm": 3.320361852645874, + "learning_rate": 4.316372939169983e-05, + "loss": 5.8606, + "step": 2905 + }, + { + "epoch": 0.16086235489220563, + "grad_norm": 2.4899818897247314, + "learning_rate": 4.3149516770892554e-05, + "loss": 5.7003, + "step": 2910 + }, + { + "epoch": 0.16113875069098949, + "grad_norm": 3.366610050201416, + "learning_rate": 4.313530415008528e-05, + "loss": 5.7348, + "step": 2915 + }, + { + "epoch": 0.16141514648977334, + "grad_norm": 3.16973876953125, + "learning_rate": 4.3121091529278e-05, + "loss": 5.6955, + "step": 2920 + }, + { + "epoch": 0.16169154228855723, + "grad_norm": 3.0917766094207764, + "learning_rate": 4.310687890847072e-05, + "loss": 5.8221, + "step": 2925 + }, + { + "epoch": 0.16196793808734108, + "grad_norm": 3.0360071659088135, + "learning_rate": 4.309266628766345e-05, + "loss": 5.7854, + "step": 2930 + }, + { + "epoch": 0.16224433388612494, + "grad_norm": 2.421694040298462, + "learning_rate": 4.3078453666856174e-05, + "loss": 5.5865, + "step": 2935 + }, + { + "epoch": 0.1625207296849088, + "grad_norm": 2.4142487049102783, + "learning_rate": 4.306424104604889e-05, + "loss": 5.6557, + "step": 2940 + }, + { + "epoch": 0.16279712548369266, + "grad_norm": 2.854003429412842, + "learning_rate": 4.3050028425241615e-05, + "loss": 5.8642, + "step": 2945 + }, + { + "epoch": 0.1630735212824765, + "grad_norm": 2.4394283294677734, + "learning_rate": 4.303581580443434e-05, + "loss": 5.5967, + "step": 2950 + }, + { + "epoch": 0.16334991708126037, + "grad_norm": 3.362593650817871, + "learning_rate": 4.302160318362706e-05, + "loss": 5.9702, + "step": 2955 + }, + { + "epoch": 0.16362631288004423, + "grad_norm": 2.2404420375823975, + "learning_rate": 4.300739056281979e-05, + "loss": 5.6981, + "step": 2960 + }, + { + "epoch": 0.16390270867882809, + "grad_norm": 2.7826569080352783, + "learning_rate": 4.299317794201251e-05, + "loss": 5.7644, + "step": 2965 + }, + { + "epoch": 0.16417910447761194, + "grad_norm": 2.6472930908203125, + "learning_rate": 4.2978965321205235e-05, + "loss": 6.105, + "step": 2970 + }, + { + "epoch": 0.1644555002763958, + "grad_norm": 3.248873472213745, + "learning_rate": 4.296475270039795e-05, + "loss": 5.7841, + "step": 2975 + }, + { + "epoch": 0.16473189607517966, + "grad_norm": 3.790102958679199, + "learning_rate": 4.2950540079590676e-05, + "loss": 5.7012, + "step": 2980 + }, + { + "epoch": 0.16500829187396351, + "grad_norm": 3.1310534477233887, + "learning_rate": 4.293632745878341e-05, + "loss": 5.7268, + "step": 2985 + }, + { + "epoch": 0.16528468767274737, + "grad_norm": 2.8928165435791016, + "learning_rate": 4.2922114837976124e-05, + "loss": 5.7898, + "step": 2990 + }, + { + "epoch": 0.16556108347153123, + "grad_norm": 2.956012010574341, + "learning_rate": 4.290790221716885e-05, + "loss": 6.1025, + "step": 2995 + }, + { + "epoch": 0.16583747927031509, + "grad_norm": 3.146536111831665, + "learning_rate": 4.289368959636157e-05, + "loss": 5.8821, + "step": 3000 + }, + { + "epoch": 0.16611387506909894, + "grad_norm": 3.373002767562866, + "learning_rate": 4.2879476975554296e-05, + "loss": 5.433, + "step": 3005 + }, + { + "epoch": 0.1663902708678828, + "grad_norm": 3.2501721382141113, + "learning_rate": 4.286526435474702e-05, + "loss": 6.0184, + "step": 3010 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 2.656000852584839, + "learning_rate": 4.285105173393974e-05, + "loss": 5.8359, + "step": 3015 + }, + { + "epoch": 0.16694306246545051, + "grad_norm": 3.513477325439453, + "learning_rate": 4.283683911313247e-05, + "loss": 6.0025, + "step": 3020 + }, + { + "epoch": 0.16721945826423437, + "grad_norm": 3.080552816390991, + "learning_rate": 4.2822626492325185e-05, + "loss": 5.7096, + "step": 3025 + }, + { + "epoch": 0.16749585406301823, + "grad_norm": 2.202702045440674, + "learning_rate": 4.280841387151791e-05, + "loss": 5.9594, + "step": 3030 + }, + { + "epoch": 0.16777224986180211, + "grad_norm": 2.7904155254364014, + "learning_rate": 4.279420125071063e-05, + "loss": 5.749, + "step": 3035 + }, + { + "epoch": 0.16804864566058597, + "grad_norm": 3.466552495956421, + "learning_rate": 4.277998862990336e-05, + "loss": 5.8486, + "step": 3040 + }, + { + "epoch": 0.16832504145936983, + "grad_norm": 2.64650297164917, + "learning_rate": 4.276577600909608e-05, + "loss": 5.8657, + "step": 3045 + }, + { + "epoch": 0.16860143725815369, + "grad_norm": 3.313481569290161, + "learning_rate": 4.27515633882888e-05, + "loss": 5.5925, + "step": 3050 + }, + { + "epoch": 0.16887783305693754, + "grad_norm": 3.433079957962036, + "learning_rate": 4.273735076748153e-05, + "loss": 5.6123, + "step": 3055 + }, + { + "epoch": 0.1691542288557214, + "grad_norm": 3.088146686553955, + "learning_rate": 4.2723138146674246e-05, + "loss": 5.6197, + "step": 3060 + }, + { + "epoch": 0.16943062465450526, + "grad_norm": 2.703796863555908, + "learning_rate": 4.270892552586697e-05, + "loss": 5.8104, + "step": 3065 + }, + { + "epoch": 0.16970702045328911, + "grad_norm": 2.511690139770508, + "learning_rate": 4.2694712905059694e-05, + "loss": 5.5404, + "step": 3070 + }, + { + "epoch": 0.16998341625207297, + "grad_norm": 2.678618907928467, + "learning_rate": 4.268050028425242e-05, + "loss": 5.8477, + "step": 3075 + }, + { + "epoch": 0.17025981205085683, + "grad_norm": 2.9002177715301514, + "learning_rate": 4.266628766344514e-05, + "loss": 5.907, + "step": 3080 + }, + { + "epoch": 0.1705362078496407, + "grad_norm": 3.1903774738311768, + "learning_rate": 4.265207504263786e-05, + "loss": 5.8601, + "step": 3085 + }, + { + "epoch": 0.17081260364842454, + "grad_norm": 3.3715593814849854, + "learning_rate": 4.263786242183059e-05, + "loss": 5.8326, + "step": 3090 + }, + { + "epoch": 0.1710889994472084, + "grad_norm": 2.6727895736694336, + "learning_rate": 4.2623649801023314e-05, + "loss": 5.6508, + "step": 3095 + }, + { + "epoch": 0.17136539524599226, + "grad_norm": 2.429774761199951, + "learning_rate": 4.260943718021603e-05, + "loss": 5.7209, + "step": 3100 + }, + { + "epoch": 0.17164179104477612, + "grad_norm": 2.418609619140625, + "learning_rate": 4.2595224559408755e-05, + "loss": 5.6107, + "step": 3105 + }, + { + "epoch": 0.17191818684355997, + "grad_norm": 3.0767714977264404, + "learning_rate": 4.258101193860148e-05, + "loss": 5.6858, + "step": 3110 + }, + { + "epoch": 0.17219458264234383, + "grad_norm": 3.297687530517578, + "learning_rate": 4.25667993177942e-05, + "loss": 5.9651, + "step": 3115 + }, + { + "epoch": 0.1724709784411277, + "grad_norm": 3.491100549697876, + "learning_rate": 4.255258669698693e-05, + "loss": 5.6576, + "step": 3120 + }, + { + "epoch": 0.17274737423991154, + "grad_norm": 2.744009256362915, + "learning_rate": 4.253837407617965e-05, + "loss": 5.6868, + "step": 3125 + }, + { + "epoch": 0.1730237700386954, + "grad_norm": 3.076695680618286, + "learning_rate": 4.2524161455372375e-05, + "loss": 5.7896, + "step": 3130 + }, + { + "epoch": 0.17330016583747926, + "grad_norm": 2.9809587001800537, + "learning_rate": 4.250994883456509e-05, + "loss": 5.7975, + "step": 3135 + }, + { + "epoch": 0.17357656163626312, + "grad_norm": 2.666238307952881, + "learning_rate": 4.249573621375782e-05, + "loss": 6.0512, + "step": 3140 + }, + { + "epoch": 0.173852957435047, + "grad_norm": 2.905074119567871, + "learning_rate": 4.248152359295054e-05, + "loss": 5.5453, + "step": 3145 + }, + { + "epoch": 0.17412935323383086, + "grad_norm": 3.2428863048553467, + "learning_rate": 4.2467310972143264e-05, + "loss": 5.5774, + "step": 3150 + }, + { + "epoch": 0.17440574903261472, + "grad_norm": 2.5885746479034424, + "learning_rate": 4.245309835133599e-05, + "loss": 5.8335, + "step": 3155 + }, + { + "epoch": 0.17468214483139857, + "grad_norm": 2.6241211891174316, + "learning_rate": 4.243888573052871e-05, + "loss": 5.8666, + "step": 3160 + }, + { + "epoch": 0.17495854063018243, + "grad_norm": 2.9486637115478516, + "learning_rate": 4.2424673109721436e-05, + "loss": 5.4234, + "step": 3165 + }, + { + "epoch": 0.1752349364289663, + "grad_norm": 3.1195502281188965, + "learning_rate": 4.241046048891415e-05, + "loss": 5.7667, + "step": 3170 + }, + { + "epoch": 0.17551133222775014, + "grad_norm": 2.9421496391296387, + "learning_rate": 4.2396247868106884e-05, + "loss": 5.6917, + "step": 3175 + }, + { + "epoch": 0.175787728026534, + "grad_norm": 3.021730422973633, + "learning_rate": 4.238203524729961e-05, + "loss": 6.0067, + "step": 3180 + }, + { + "epoch": 0.17606412382531786, + "grad_norm": 2.6247682571411133, + "learning_rate": 4.2367822626492325e-05, + "loss": 5.7843, + "step": 3185 + }, + { + "epoch": 0.17634051962410172, + "grad_norm": 2.8806023597717285, + "learning_rate": 4.235361000568505e-05, + "loss": 5.8293, + "step": 3190 + }, + { + "epoch": 0.17661691542288557, + "grad_norm": 3.3191425800323486, + "learning_rate": 4.233939738487777e-05, + "loss": 5.8644, + "step": 3195 + }, + { + "epoch": 0.17689331122166943, + "grad_norm": 2.656522274017334, + "learning_rate": 4.23251847640705e-05, + "loss": 5.8543, + "step": 3200 + }, + { + "epoch": 0.1771697070204533, + "grad_norm": 3.2881433963775635, + "learning_rate": 4.231097214326322e-05, + "loss": 5.5148, + "step": 3205 + }, + { + "epoch": 0.17744610281923714, + "grad_norm": 2.557159423828125, + "learning_rate": 4.2296759522455945e-05, + "loss": 5.8617, + "step": 3210 + }, + { + "epoch": 0.177722498618021, + "grad_norm": 3.0635502338409424, + "learning_rate": 4.228254690164867e-05, + "loss": 5.7063, + "step": 3215 + }, + { + "epoch": 0.17799889441680486, + "grad_norm": 2.6538784503936768, + "learning_rate": 4.2268334280841386e-05, + "loss": 5.6792, + "step": 3220 + }, + { + "epoch": 0.17827529021558872, + "grad_norm": 3.1617372035980225, + "learning_rate": 4.225412166003411e-05, + "loss": 5.9972, + "step": 3225 + }, + { + "epoch": 0.17855168601437257, + "grad_norm": 2.5315380096435547, + "learning_rate": 4.223990903922684e-05, + "loss": 5.7918, + "step": 3230 + }, + { + "epoch": 0.17882808181315643, + "grad_norm": 3.2303788661956787, + "learning_rate": 4.222569641841956e-05, + "loss": 5.7051, + "step": 3235 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 2.689042568206787, + "learning_rate": 4.221148379761228e-05, + "loss": 5.7993, + "step": 3240 + }, + { + "epoch": 0.17938087341072415, + "grad_norm": 2.5231776237487793, + "learning_rate": 4.2197271176805006e-05, + "loss": 5.9223, + "step": 3245 + }, + { + "epoch": 0.179657269209508, + "grad_norm": 3.4282543659210205, + "learning_rate": 4.218305855599773e-05, + "loss": 5.6935, + "step": 3250 + }, + { + "epoch": 0.1799336650082919, + "grad_norm": 2.655792236328125, + "learning_rate": 4.216884593519045e-05, + "loss": 5.6622, + "step": 3255 + }, + { + "epoch": 0.18021006080707574, + "grad_norm": 3.699481248855591, + "learning_rate": 4.215463331438317e-05, + "loss": 5.6104, + "step": 3260 + }, + { + "epoch": 0.1804864566058596, + "grad_norm": 2.7211077213287354, + "learning_rate": 4.21404206935759e-05, + "loss": 5.7032, + "step": 3265 + }, + { + "epoch": 0.18076285240464346, + "grad_norm": 3.4669954776763916, + "learning_rate": 4.212620807276862e-05, + "loss": 5.4362, + "step": 3270 + }, + { + "epoch": 0.18103924820342732, + "grad_norm": 2.743119716644287, + "learning_rate": 4.211199545196134e-05, + "loss": 5.5355, + "step": 3275 + }, + { + "epoch": 0.18131564400221117, + "grad_norm": 2.8650245666503906, + "learning_rate": 4.209778283115407e-05, + "loss": 5.6457, + "step": 3280 + }, + { + "epoch": 0.18159203980099503, + "grad_norm": 2.683289051055908, + "learning_rate": 4.208357021034679e-05, + "loss": 5.8724, + "step": 3285 + }, + { + "epoch": 0.1818684355997789, + "grad_norm": 2.849991798400879, + "learning_rate": 4.2069357589539515e-05, + "loss": 5.8989, + "step": 3290 + }, + { + "epoch": 0.18214483139856275, + "grad_norm": 3.0860819816589355, + "learning_rate": 4.205514496873223e-05, + "loss": 5.9586, + "step": 3295 + }, + { + "epoch": 0.1824212271973466, + "grad_norm": 3.3663172721862793, + "learning_rate": 4.204093234792496e-05, + "loss": 5.7115, + "step": 3300 + }, + { + "epoch": 0.18269762299613046, + "grad_norm": 3.262286901473999, + "learning_rate": 4.202671972711768e-05, + "loss": 5.9303, + "step": 3305 + }, + { + "epoch": 0.18297401879491432, + "grad_norm": 2.6756162643432617, + "learning_rate": 4.2012507106310404e-05, + "loss": 6.0514, + "step": 3310 + }, + { + "epoch": 0.18325041459369817, + "grad_norm": 2.981628179550171, + "learning_rate": 4.199829448550313e-05, + "loss": 5.5975, + "step": 3315 + }, + { + "epoch": 0.18352681039248203, + "grad_norm": 3.121140956878662, + "learning_rate": 4.198408186469585e-05, + "loss": 5.8209, + "step": 3320 + }, + { + "epoch": 0.1838032061912659, + "grad_norm": 2.5124523639678955, + "learning_rate": 4.1969869243888576e-05, + "loss": 5.7161, + "step": 3325 + }, + { + "epoch": 0.18407960199004975, + "grad_norm": 2.389613389968872, + "learning_rate": 4.1955656623081294e-05, + "loss": 5.5582, + "step": 3330 + }, + { + "epoch": 0.1843559977888336, + "grad_norm": 3.512432336807251, + "learning_rate": 4.1941444002274024e-05, + "loss": 5.5329, + "step": 3335 + }, + { + "epoch": 0.18463239358761746, + "grad_norm": 2.90018367767334, + "learning_rate": 4.192723138146675e-05, + "loss": 5.9161, + "step": 3340 + }, + { + "epoch": 0.18490878938640132, + "grad_norm": 3.013446092605591, + "learning_rate": 4.1913018760659465e-05, + "loss": 5.9448, + "step": 3345 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 2.8273327350616455, + "learning_rate": 4.189880613985219e-05, + "loss": 5.5355, + "step": 3350 + }, + { + "epoch": 0.18546158098396903, + "grad_norm": 3.1768956184387207, + "learning_rate": 4.1884593519044913e-05, + "loss": 6.1034, + "step": 3355 + }, + { + "epoch": 0.1857379767827529, + "grad_norm": 3.083205461502075, + "learning_rate": 4.187038089823764e-05, + "loss": 5.7557, + "step": 3360 + }, + { + "epoch": 0.18601437258153677, + "grad_norm": 3.252182960510254, + "learning_rate": 4.1856168277430355e-05, + "loss": 5.6727, + "step": 3365 + }, + { + "epoch": 0.18629076838032063, + "grad_norm": 2.590454578399658, + "learning_rate": 4.1841955656623085e-05, + "loss": 6.1319, + "step": 3370 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 2.937279224395752, + "learning_rate": 4.182774303581581e-05, + "loss": 5.8309, + "step": 3375 + }, + { + "epoch": 0.18684355997788835, + "grad_norm": 2.746546745300293, + "learning_rate": 4.1813530415008527e-05, + "loss": 5.6252, + "step": 3380 + }, + { + "epoch": 0.1871199557766722, + "grad_norm": 3.1793739795684814, + "learning_rate": 4.179931779420125e-05, + "loss": 5.7403, + "step": 3385 + }, + { + "epoch": 0.18739635157545606, + "grad_norm": 3.3896684646606445, + "learning_rate": 4.1785105173393974e-05, + "loss": 6.023, + "step": 3390 + }, + { + "epoch": 0.18767274737423992, + "grad_norm": 2.4643993377685547, + "learning_rate": 4.17708925525867e-05, + "loss": 5.9285, + "step": 3395 + }, + { + "epoch": 0.18794914317302377, + "grad_norm": 2.6043946743011475, + "learning_rate": 4.175667993177942e-05, + "loss": 5.655, + "step": 3400 + }, + { + "epoch": 0.18822553897180763, + "grad_norm": 2.694716691970825, + "learning_rate": 4.1742467310972146e-05, + "loss": 5.7065, + "step": 3405 + }, + { + "epoch": 0.1885019347705915, + "grad_norm": 2.6867024898529053, + "learning_rate": 4.172825469016487e-05, + "loss": 5.8445, + "step": 3410 + }, + { + "epoch": 0.18877833056937535, + "grad_norm": 2.5469698905944824, + "learning_rate": 4.171404206935759e-05, + "loss": 5.7215, + "step": 3415 + }, + { + "epoch": 0.1890547263681592, + "grad_norm": 2.410991668701172, + "learning_rate": 4.169982944855031e-05, + "loss": 5.4893, + "step": 3420 + }, + { + "epoch": 0.18933112216694306, + "grad_norm": 3.104326009750366, + "learning_rate": 4.168561682774304e-05, + "loss": 5.4989, + "step": 3425 + }, + { + "epoch": 0.18960751796572692, + "grad_norm": 2.4258954524993896, + "learning_rate": 4.167140420693576e-05, + "loss": 5.6876, + "step": 3430 + }, + { + "epoch": 0.18988391376451078, + "grad_norm": 2.748441219329834, + "learning_rate": 4.1657191586128484e-05, + "loss": 5.5542, + "step": 3435 + }, + { + "epoch": 0.19016030956329463, + "grad_norm": 2.7649223804473877, + "learning_rate": 4.164297896532121e-05, + "loss": 6.136, + "step": 3440 + }, + { + "epoch": 0.1904367053620785, + "grad_norm": 2.782585620880127, + "learning_rate": 4.162876634451393e-05, + "loss": 5.882, + "step": 3445 + }, + { + "epoch": 0.19071310116086235, + "grad_norm": 2.9212682247161865, + "learning_rate": 4.161455372370665e-05, + "loss": 5.9311, + "step": 3450 + }, + { + "epoch": 0.1909894969596462, + "grad_norm": 3.2894344329833984, + "learning_rate": 4.160034110289938e-05, + "loss": 5.734, + "step": 3455 + }, + { + "epoch": 0.19126589275843006, + "grad_norm": 2.266057014465332, + "learning_rate": 4.1586128482092103e-05, + "loss": 5.4366, + "step": 3460 + }, + { + "epoch": 0.19154228855721392, + "grad_norm": 3.4635226726531982, + "learning_rate": 4.157191586128482e-05, + "loss": 5.5668, + "step": 3465 + }, + { + "epoch": 0.19181868435599778, + "grad_norm": 2.7193620204925537, + "learning_rate": 4.1557703240477545e-05, + "loss": 5.8679, + "step": 3470 + }, + { + "epoch": 0.19209508015478166, + "grad_norm": 2.5179836750030518, + "learning_rate": 4.154349061967027e-05, + "loss": 5.5888, + "step": 3475 + }, + { + "epoch": 0.19237147595356552, + "grad_norm": 2.735239267349243, + "learning_rate": 4.152927799886299e-05, + "loss": 5.6179, + "step": 3480 + }, + { + "epoch": 0.19264787175234938, + "grad_norm": 2.676177501678467, + "learning_rate": 4.1515065378055717e-05, + "loss": 5.7376, + "step": 3485 + }, + { + "epoch": 0.19292426755113323, + "grad_norm": 3.584538221359253, + "learning_rate": 4.150085275724844e-05, + "loss": 5.9845, + "step": 3490 + }, + { + "epoch": 0.1932006633499171, + "grad_norm": 3.5439796447753906, + "learning_rate": 4.1486640136441164e-05, + "loss": 5.8218, + "step": 3495 + }, + { + "epoch": 0.19347705914870095, + "grad_norm": 3.0446224212646484, + "learning_rate": 4.147242751563388e-05, + "loss": 5.5155, + "step": 3500 + }, + { + "epoch": 0.1937534549474848, + "grad_norm": 2.8608455657958984, + "learning_rate": 4.1458214894826606e-05, + "loss": 5.9078, + "step": 3505 + }, + { + "epoch": 0.19402985074626866, + "grad_norm": 3.2207865715026855, + "learning_rate": 4.1444002274019336e-05, + "loss": 5.7971, + "step": 3510 + }, + { + "epoch": 0.19430624654505252, + "grad_norm": 3.01859188079834, + "learning_rate": 4.1429789653212054e-05, + "loss": 5.7289, + "step": 3515 + }, + { + "epoch": 0.19458264234383638, + "grad_norm": 3.373577117919922, + "learning_rate": 4.141557703240478e-05, + "loss": 5.3903, + "step": 3520 + }, + { + "epoch": 0.19485903814262023, + "grad_norm": 2.7322349548339844, + "learning_rate": 4.14013644115975e-05, + "loss": 5.3543, + "step": 3525 + }, + { + "epoch": 0.1951354339414041, + "grad_norm": 2.5114071369171143, + "learning_rate": 4.1387151790790226e-05, + "loss": 5.7759, + "step": 3530 + }, + { + "epoch": 0.19541182974018795, + "grad_norm": 2.954470634460449, + "learning_rate": 4.137293916998295e-05, + "loss": 5.8468, + "step": 3535 + }, + { + "epoch": 0.1956882255389718, + "grad_norm": 2.818599224090576, + "learning_rate": 4.135872654917567e-05, + "loss": 5.4117, + "step": 3540 + }, + { + "epoch": 0.19596462133775566, + "grad_norm": 3.0054855346679688, + "learning_rate": 4.13445139283684e-05, + "loss": 5.8437, + "step": 3545 + }, + { + "epoch": 0.19624101713653952, + "grad_norm": 2.919210433959961, + "learning_rate": 4.1330301307561115e-05, + "loss": 5.9989, + "step": 3550 + }, + { + "epoch": 0.19651741293532338, + "grad_norm": 2.6916286945343018, + "learning_rate": 4.131608868675384e-05, + "loss": 5.8415, + "step": 3555 + }, + { + "epoch": 0.19679380873410723, + "grad_norm": 2.8010470867156982, + "learning_rate": 4.130187606594656e-05, + "loss": 5.5132, + "step": 3560 + }, + { + "epoch": 0.1970702045328911, + "grad_norm": 3.2105109691619873, + "learning_rate": 4.128766344513929e-05, + "loss": 5.7389, + "step": 3565 + }, + { + "epoch": 0.19734660033167495, + "grad_norm": 2.8323161602020264, + "learning_rate": 4.127345082433201e-05, + "loss": 5.5191, + "step": 3570 + }, + { + "epoch": 0.1976229961304588, + "grad_norm": 2.6223807334899902, + "learning_rate": 4.125923820352473e-05, + "loss": 5.6713, + "step": 3575 + }, + { + "epoch": 0.19789939192924266, + "grad_norm": 3.166440963745117, + "learning_rate": 4.124502558271746e-05, + "loss": 5.4685, + "step": 3580 + }, + { + "epoch": 0.19817578772802655, + "grad_norm": 2.5099334716796875, + "learning_rate": 4.1230812961910176e-05, + "loss": 5.7664, + "step": 3585 + }, + { + "epoch": 0.1984521835268104, + "grad_norm": 2.3826889991760254, + "learning_rate": 4.12166003411029e-05, + "loss": 5.4451, + "step": 3590 + }, + { + "epoch": 0.19872857932559426, + "grad_norm": 2.5310139656066895, + "learning_rate": 4.1202387720295624e-05, + "loss": 5.748, + "step": 3595 + }, + { + "epoch": 0.19900497512437812, + "grad_norm": 2.9146652221679688, + "learning_rate": 4.118817509948835e-05, + "loss": 5.4966, + "step": 3600 + }, + { + "epoch": 0.19928137092316198, + "grad_norm": 2.7887117862701416, + "learning_rate": 4.117396247868107e-05, + "loss": 5.6988, + "step": 3605 + }, + { + "epoch": 0.19955776672194583, + "grad_norm": 2.6834566593170166, + "learning_rate": 4.115974985787379e-05, + "loss": 5.4005, + "step": 3610 + }, + { + "epoch": 0.1998341625207297, + "grad_norm": 2.7212436199188232, + "learning_rate": 4.114553723706652e-05, + "loss": 5.4062, + "step": 3615 + }, + { + "epoch": 0.20011055831951355, + "grad_norm": 2.5989837646484375, + "learning_rate": 4.1131324616259244e-05, + "loss": 5.7674, + "step": 3620 + }, + { + "epoch": 0.2003869541182974, + "grad_norm": 2.8077232837677, + "learning_rate": 4.111711199545196e-05, + "loss": 5.7709, + "step": 3625 + }, + { + "epoch": 0.20066334991708126, + "grad_norm": 2.3831634521484375, + "learning_rate": 4.1102899374644685e-05, + "loss": 5.6026, + "step": 3630 + }, + { + "epoch": 0.20093974571586512, + "grad_norm": 3.917057991027832, + "learning_rate": 4.108868675383741e-05, + "loss": 5.6208, + "step": 3635 + }, + { + "epoch": 0.20121614151464898, + "grad_norm": 2.841247081756592, + "learning_rate": 4.107447413303013e-05, + "loss": 5.6398, + "step": 3640 + }, + { + "epoch": 0.20149253731343283, + "grad_norm": 2.5820930004119873, + "learning_rate": 4.106026151222286e-05, + "loss": 5.7411, + "step": 3645 + }, + { + "epoch": 0.2017689331122167, + "grad_norm": 2.7610599994659424, + "learning_rate": 4.104604889141558e-05, + "loss": 5.5423, + "step": 3650 + }, + { + "epoch": 0.20204532891100055, + "grad_norm": 3.1269173622131348, + "learning_rate": 4.1031836270608305e-05, + "loss": 5.5471, + "step": 3655 + }, + { + "epoch": 0.2023217247097844, + "grad_norm": 2.523005247116089, + "learning_rate": 4.101762364980102e-05, + "loss": 5.8834, + "step": 3660 + }, + { + "epoch": 0.20259812050856826, + "grad_norm": 2.9817779064178467, + "learning_rate": 4.1003411028993746e-05, + "loss": 5.7866, + "step": 3665 + }, + { + "epoch": 0.20287451630735212, + "grad_norm": 2.738621950149536, + "learning_rate": 4.098919840818647e-05, + "loss": 5.7785, + "step": 3670 + }, + { + "epoch": 0.20315091210613598, + "grad_norm": 2.758222818374634, + "learning_rate": 4.0974985787379194e-05, + "loss": 5.4694, + "step": 3675 + }, + { + "epoch": 0.20342730790491984, + "grad_norm": 3.3197033405303955, + "learning_rate": 4.096077316657192e-05, + "loss": 5.6767, + "step": 3680 + }, + { + "epoch": 0.2037037037037037, + "grad_norm": 3.1457207202911377, + "learning_rate": 4.094656054576464e-05, + "loss": 5.8161, + "step": 3685 + }, + { + "epoch": 0.20398009950248755, + "grad_norm": 2.59710955619812, + "learning_rate": 4.0932347924957366e-05, + "loss": 5.3963, + "step": 3690 + }, + { + "epoch": 0.2042564953012714, + "grad_norm": 3.7148916721343994, + "learning_rate": 4.091813530415008e-05, + "loss": 5.7489, + "step": 3695 + }, + { + "epoch": 0.2045328911000553, + "grad_norm": 2.7009363174438477, + "learning_rate": 4.090392268334281e-05, + "loss": 5.619, + "step": 3700 + }, + { + "epoch": 0.20480928689883915, + "grad_norm": 2.5814390182495117, + "learning_rate": 4.088971006253554e-05, + "loss": 5.4951, + "step": 3705 + }, + { + "epoch": 0.205085682697623, + "grad_norm": 2.773533344268799, + "learning_rate": 4.0875497441728255e-05, + "loss": 5.763, + "step": 3710 + }, + { + "epoch": 0.20536207849640686, + "grad_norm": 3.0854074954986572, + "learning_rate": 4.086128482092098e-05, + "loss": 5.3959, + "step": 3715 + }, + { + "epoch": 0.20563847429519072, + "grad_norm": 2.8978331089019775, + "learning_rate": 4.08470722001137e-05, + "loss": 5.7336, + "step": 3720 + }, + { + "epoch": 0.20591487009397458, + "grad_norm": 2.4324569702148438, + "learning_rate": 4.083285957930643e-05, + "loss": 5.832, + "step": 3725 + }, + { + "epoch": 0.20619126589275844, + "grad_norm": 2.3226120471954346, + "learning_rate": 4.081864695849915e-05, + "loss": 5.6816, + "step": 3730 + }, + { + "epoch": 0.2064676616915423, + "grad_norm": 2.733042001724243, + "learning_rate": 4.080443433769187e-05, + "loss": 5.6433, + "step": 3735 + }, + { + "epoch": 0.20674405749032615, + "grad_norm": 2.787104845046997, + "learning_rate": 4.07902217168846e-05, + "loss": 5.7111, + "step": 3740 + }, + { + "epoch": 0.20702045328911, + "grad_norm": 2.508944511413574, + "learning_rate": 4.0776009096077316e-05, + "loss": 5.7261, + "step": 3745 + }, + { + "epoch": 0.20729684908789386, + "grad_norm": 3.51177978515625, + "learning_rate": 4.076179647527004e-05, + "loss": 5.3499, + "step": 3750 + }, + { + "epoch": 0.20757324488667772, + "grad_norm": 2.7052156925201416, + "learning_rate": 4.074758385446277e-05, + "loss": 5.7292, + "step": 3755 + }, + { + "epoch": 0.20784964068546158, + "grad_norm": 2.662250280380249, + "learning_rate": 4.073337123365549e-05, + "loss": 5.4517, + "step": 3760 + }, + { + "epoch": 0.20812603648424544, + "grad_norm": 2.365370750427246, + "learning_rate": 4.071915861284821e-05, + "loss": 5.7968, + "step": 3765 + }, + { + "epoch": 0.2084024322830293, + "grad_norm": 3.118715763092041, + "learning_rate": 4.0704945992040936e-05, + "loss": 5.5743, + "step": 3770 + }, + { + "epoch": 0.20867882808181315, + "grad_norm": 3.200096607208252, + "learning_rate": 4.069073337123366e-05, + "loss": 6.0694, + "step": 3775 + }, + { + "epoch": 0.208955223880597, + "grad_norm": 3.215636730194092, + "learning_rate": 4.067652075042638e-05, + "loss": 5.6844, + "step": 3780 + }, + { + "epoch": 0.20923161967938086, + "grad_norm": 3.387874126434326, + "learning_rate": 4.06623081296191e-05, + "loss": 5.5597, + "step": 3785 + }, + { + "epoch": 0.20950801547816472, + "grad_norm": 3.392089605331421, + "learning_rate": 4.064809550881183e-05, + "loss": 5.8506, + "step": 3790 + }, + { + "epoch": 0.20978441127694858, + "grad_norm": 2.8729498386383057, + "learning_rate": 4.063388288800455e-05, + "loss": 5.4329, + "step": 3795 + }, + { + "epoch": 0.21006080707573244, + "grad_norm": 2.7015790939331055, + "learning_rate": 4.061967026719727e-05, + "loss": 5.6762, + "step": 3800 + }, + { + "epoch": 0.2103372028745163, + "grad_norm": 3.2836618423461914, + "learning_rate": 4.060545764639e-05, + "loss": 5.6025, + "step": 3805 + }, + { + "epoch": 0.21061359867330018, + "grad_norm": 2.447291612625122, + "learning_rate": 4.059124502558272e-05, + "loss": 5.6676, + "step": 3810 + }, + { + "epoch": 0.21088999447208404, + "grad_norm": 2.9781346321105957, + "learning_rate": 4.0577032404775445e-05, + "loss": 5.6131, + "step": 3815 + }, + { + "epoch": 0.2111663902708679, + "grad_norm": 3.1656429767608643, + "learning_rate": 4.056281978396816e-05, + "loss": 5.7342, + "step": 3820 + }, + { + "epoch": 0.21144278606965175, + "grad_norm": 2.7425482273101807, + "learning_rate": 4.054860716316089e-05, + "loss": 5.6741, + "step": 3825 + }, + { + "epoch": 0.2117191818684356, + "grad_norm": 3.253185510635376, + "learning_rate": 4.053439454235361e-05, + "loss": 5.801, + "step": 3830 + }, + { + "epoch": 0.21199557766721946, + "grad_norm": 3.0883877277374268, + "learning_rate": 4.0520181921546334e-05, + "loss": 5.6809, + "step": 3835 + }, + { + "epoch": 0.21227197346600332, + "grad_norm": 3.222590446472168, + "learning_rate": 4.050596930073906e-05, + "loss": 5.4639, + "step": 3840 + }, + { + "epoch": 0.21254836926478718, + "grad_norm": 2.510165214538574, + "learning_rate": 4.049175667993178e-05, + "loss": 5.6433, + "step": 3845 + }, + { + "epoch": 0.21282476506357104, + "grad_norm": 3.1436879634857178, + "learning_rate": 4.0477544059124506e-05, + "loss": 5.3783, + "step": 3850 + }, + { + "epoch": 0.2131011608623549, + "grad_norm": 2.9798269271850586, + "learning_rate": 4.046333143831722e-05, + "loss": 5.6385, + "step": 3855 + }, + { + "epoch": 0.21337755666113875, + "grad_norm": 2.9167640209198, + "learning_rate": 4.0449118817509954e-05, + "loss": 5.6914, + "step": 3860 + }, + { + "epoch": 0.2136539524599226, + "grad_norm": 3.467259645462036, + "learning_rate": 4.043490619670268e-05, + "loss": 5.8466, + "step": 3865 + }, + { + "epoch": 0.21393034825870647, + "grad_norm": 2.689114809036255, + "learning_rate": 4.0420693575895395e-05, + "loss": 5.9034, + "step": 3870 + }, + { + "epoch": 0.21420674405749032, + "grad_norm": 2.7007036209106445, + "learning_rate": 4.040648095508812e-05, + "loss": 5.6707, + "step": 3875 + }, + { + "epoch": 0.21448313985627418, + "grad_norm": 2.5844995975494385, + "learning_rate": 4.039226833428084e-05, + "loss": 5.726, + "step": 3880 + }, + { + "epoch": 0.21475953565505804, + "grad_norm": 2.7026453018188477, + "learning_rate": 4.037805571347357e-05, + "loss": 5.6363, + "step": 3885 + }, + { + "epoch": 0.2150359314538419, + "grad_norm": 3.0205512046813965, + "learning_rate": 4.0363843092666284e-05, + "loss": 5.6149, + "step": 3890 + }, + { + "epoch": 0.21531232725262575, + "grad_norm": 2.615713596343994, + "learning_rate": 4.0349630471859015e-05, + "loss": 5.4971, + "step": 3895 + }, + { + "epoch": 0.2155887230514096, + "grad_norm": 3.4436938762664795, + "learning_rate": 4.033541785105174e-05, + "loss": 5.9492, + "step": 3900 + }, + { + "epoch": 0.21586511885019347, + "grad_norm": 2.6737794876098633, + "learning_rate": 4.0321205230244456e-05, + "loss": 5.5133, + "step": 3905 + }, + { + "epoch": 0.21614151464897732, + "grad_norm": 2.6840312480926514, + "learning_rate": 4.030699260943718e-05, + "loss": 5.9368, + "step": 3910 + }, + { + "epoch": 0.21641791044776118, + "grad_norm": 2.652055501937866, + "learning_rate": 4.0292779988629904e-05, + "loss": 5.7533, + "step": 3915 + }, + { + "epoch": 0.21669430624654507, + "grad_norm": 3.5088555812835693, + "learning_rate": 4.027856736782263e-05, + "loss": 5.6949, + "step": 3920 + }, + { + "epoch": 0.21697070204532892, + "grad_norm": 3.445812940597534, + "learning_rate": 4.026435474701535e-05, + "loss": 5.5864, + "step": 3925 + }, + { + "epoch": 0.21724709784411278, + "grad_norm": 3.1001298427581787, + "learning_rate": 4.0250142126208076e-05, + "loss": 5.8767, + "step": 3930 + }, + { + "epoch": 0.21752349364289664, + "grad_norm": 2.6372926235198975, + "learning_rate": 4.02359295054008e-05, + "loss": 5.6745, + "step": 3935 + }, + { + "epoch": 0.2177998894416805, + "grad_norm": 2.3404037952423096, + "learning_rate": 4.022171688459352e-05, + "loss": 5.6526, + "step": 3940 + }, + { + "epoch": 0.21807628524046435, + "grad_norm": 3.181204080581665, + "learning_rate": 4.020750426378624e-05, + "loss": 5.5995, + "step": 3945 + }, + { + "epoch": 0.2183526810392482, + "grad_norm": 3.0466437339782715, + "learning_rate": 4.019329164297897e-05, + "loss": 5.7707, + "step": 3950 + }, + { + "epoch": 0.21862907683803207, + "grad_norm": 2.544200897216797, + "learning_rate": 4.017907902217169e-05, + "loss": 5.9233, + "step": 3955 + }, + { + "epoch": 0.21890547263681592, + "grad_norm": 3.109133720397949, + "learning_rate": 4.016486640136441e-05, + "loss": 5.7499, + "step": 3960 + }, + { + "epoch": 0.21918186843559978, + "grad_norm": 2.555316209793091, + "learning_rate": 4.015065378055714e-05, + "loss": 5.3721, + "step": 3965 + }, + { + "epoch": 0.21945826423438364, + "grad_norm": 2.7728629112243652, + "learning_rate": 4.013644115974986e-05, + "loss": 5.7565, + "step": 3970 + }, + { + "epoch": 0.2197346600331675, + "grad_norm": 3.0431509017944336, + "learning_rate": 4.0122228538942585e-05, + "loss": 5.5652, + "step": 3975 + }, + { + "epoch": 0.22001105583195135, + "grad_norm": 3.0363245010375977, + "learning_rate": 4.01080159181353e-05, + "loss": 5.6372, + "step": 3980 + }, + { + "epoch": 0.2202874516307352, + "grad_norm": 2.699867010116577, + "learning_rate": 4.009380329732803e-05, + "loss": 5.5497, + "step": 3985 + }, + { + "epoch": 0.22056384742951907, + "grad_norm": 2.5388708114624023, + "learning_rate": 4.007959067652075e-05, + "loss": 5.4853, + "step": 3990 + }, + { + "epoch": 0.22084024322830292, + "grad_norm": 2.6334452629089355, + "learning_rate": 4.0065378055713474e-05, + "loss": 5.5215, + "step": 3995 + }, + { + "epoch": 0.22111663902708678, + "grad_norm": 2.968801498413086, + "learning_rate": 4.00511654349062e-05, + "loss": 5.5953, + "step": 4000 + }, + { + "epoch": 0.22139303482587064, + "grad_norm": 3.2419137954711914, + "learning_rate": 4.003695281409892e-05, + "loss": 5.8652, + "step": 4005 + }, + { + "epoch": 0.2216694306246545, + "grad_norm": 3.5117664337158203, + "learning_rate": 4.0022740193291646e-05, + "loss": 5.1776, + "step": 4010 + }, + { + "epoch": 0.22194582642343835, + "grad_norm": 3.309300661087036, + "learning_rate": 4.0008527572484363e-05, + "loss": 5.9688, + "step": 4015 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.4260222911834717, + "learning_rate": 3.9994314951677094e-05, + "loss": 5.6959, + "step": 4020 + }, + { + "epoch": 0.22249861802100607, + "grad_norm": 2.736858606338501, + "learning_rate": 3.998010233086981e-05, + "loss": 5.69, + "step": 4025 + }, + { + "epoch": 0.22277501381978995, + "grad_norm": 3.037052869796753, + "learning_rate": 3.9965889710062535e-05, + "loss": 5.3564, + "step": 4030 + }, + { + "epoch": 0.2230514096185738, + "grad_norm": 2.519865036010742, + "learning_rate": 3.995167708925526e-05, + "loss": 5.6226, + "step": 4035 + }, + { + "epoch": 0.22332780541735767, + "grad_norm": 2.619724988937378, + "learning_rate": 3.993746446844798e-05, + "loss": 5.8328, + "step": 4040 + }, + { + "epoch": 0.22360420121614152, + "grad_norm": 2.6823482513427734, + "learning_rate": 3.992325184764071e-05, + "loss": 5.2537, + "step": 4045 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 2.5863189697265625, + "learning_rate": 3.9909039226833425e-05, + "loss": 5.139, + "step": 4050 + }, + { + "epoch": 0.22415699281370924, + "grad_norm": 2.9364101886749268, + "learning_rate": 3.9894826606026155e-05, + "loss": 5.544, + "step": 4055 + }, + { + "epoch": 0.2244333886124931, + "grad_norm": 2.504804849624634, + "learning_rate": 3.988061398521888e-05, + "loss": 5.7221, + "step": 4060 + }, + { + "epoch": 0.22470978441127695, + "grad_norm": 2.4577889442443848, + "learning_rate": 3.9866401364411596e-05, + "loss": 5.6893, + "step": 4065 + }, + { + "epoch": 0.2249861802100608, + "grad_norm": 2.8523504734039307, + "learning_rate": 3.985218874360433e-05, + "loss": 5.4065, + "step": 4070 + }, + { + "epoch": 0.22526257600884467, + "grad_norm": 2.8694982528686523, + "learning_rate": 3.9837976122797044e-05, + "loss": 5.7412, + "step": 4075 + }, + { + "epoch": 0.22553897180762852, + "grad_norm": 2.2998769283294678, + "learning_rate": 3.982376350198977e-05, + "loss": 5.3882, + "step": 4080 + }, + { + "epoch": 0.22581536760641238, + "grad_norm": 2.82647442817688, + "learning_rate": 3.980955088118249e-05, + "loss": 5.5734, + "step": 4085 + }, + { + "epoch": 0.22609176340519624, + "grad_norm": 2.7505252361297607, + "learning_rate": 3.9795338260375216e-05, + "loss": 5.7028, + "step": 4090 + }, + { + "epoch": 0.2263681592039801, + "grad_norm": 3.391613483428955, + "learning_rate": 3.978112563956794e-05, + "loss": 5.731, + "step": 4095 + }, + { + "epoch": 0.22664455500276395, + "grad_norm": 3.5600905418395996, + "learning_rate": 3.976691301876066e-05, + "loss": 5.5761, + "step": 4100 + }, + { + "epoch": 0.2269209508015478, + "grad_norm": 3.087388515472412, + "learning_rate": 3.975270039795339e-05, + "loss": 6.0301, + "step": 4105 + }, + { + "epoch": 0.22719734660033167, + "grad_norm": 2.921119213104248, + "learning_rate": 3.9738487777146106e-05, + "loss": 5.5959, + "step": 4110 + }, + { + "epoch": 0.22747374239911552, + "grad_norm": 3.0585193634033203, + "learning_rate": 3.972427515633883e-05, + "loss": 5.61, + "step": 4115 + }, + { + "epoch": 0.22775013819789938, + "grad_norm": 3.3914947509765625, + "learning_rate": 3.9710062535531553e-05, + "loss": 5.3884, + "step": 4120 + }, + { + "epoch": 0.22802653399668324, + "grad_norm": 3.621385097503662, + "learning_rate": 3.969584991472428e-05, + "loss": 5.7627, + "step": 4125 + }, + { + "epoch": 0.2283029297954671, + "grad_norm": 2.755340099334717, + "learning_rate": 3.9681637293917e-05, + "loss": 5.6035, + "step": 4130 + }, + { + "epoch": 0.22857932559425095, + "grad_norm": 2.5260558128356934, + "learning_rate": 3.966742467310972e-05, + "loss": 5.795, + "step": 4135 + }, + { + "epoch": 0.22885572139303484, + "grad_norm": 2.8096625804901123, + "learning_rate": 3.965321205230245e-05, + "loss": 5.4867, + "step": 4140 + }, + { + "epoch": 0.2291321171918187, + "grad_norm": 2.795567274093628, + "learning_rate": 3.963899943149517e-05, + "loss": 5.6046, + "step": 4145 + }, + { + "epoch": 0.22940851299060255, + "grad_norm": 2.43609881401062, + "learning_rate": 3.962478681068789e-05, + "loss": 5.4769, + "step": 4150 + }, + { + "epoch": 0.2296849087893864, + "grad_norm": 3.2818362712860107, + "learning_rate": 3.9610574189880615e-05, + "loss": 5.4739, + "step": 4155 + }, + { + "epoch": 0.22996130458817027, + "grad_norm": 3.0869808197021484, + "learning_rate": 3.959636156907334e-05, + "loss": 5.6502, + "step": 4160 + }, + { + "epoch": 0.23023770038695412, + "grad_norm": 2.792592763900757, + "learning_rate": 3.958214894826606e-05, + "loss": 5.707, + "step": 4165 + }, + { + "epoch": 0.23051409618573798, + "grad_norm": 3.079249858856201, + "learning_rate": 3.9567936327458786e-05, + "loss": 5.8627, + "step": 4170 + }, + { + "epoch": 0.23079049198452184, + "grad_norm": 2.5989654064178467, + "learning_rate": 3.955372370665151e-05, + "loss": 5.6885, + "step": 4175 + }, + { + "epoch": 0.2310668877833057, + "grad_norm": 2.6450164318084717, + "learning_rate": 3.9539511085844234e-05, + "loss": 5.4694, + "step": 4180 + }, + { + "epoch": 0.23134328358208955, + "grad_norm": 2.6688590049743652, + "learning_rate": 3.952529846503695e-05, + "loss": 5.5172, + "step": 4185 + }, + { + "epoch": 0.2316196793808734, + "grad_norm": 2.876706600189209, + "learning_rate": 3.9511085844229676e-05, + "loss": 5.6633, + "step": 4190 + }, + { + "epoch": 0.23189607517965727, + "grad_norm": 2.682499408721924, + "learning_rate": 3.9496873223422406e-05, + "loss": 5.6945, + "step": 4195 + }, + { + "epoch": 0.23217247097844113, + "grad_norm": 2.5121068954467773, + "learning_rate": 3.9482660602615124e-05, + "loss": 5.6058, + "step": 4200 + }, + { + "epoch": 0.23244886677722498, + "grad_norm": 3.3784987926483154, + "learning_rate": 3.946844798180785e-05, + "loss": 5.4177, + "step": 4205 + }, + { + "epoch": 0.23272526257600884, + "grad_norm": 2.7231504917144775, + "learning_rate": 3.945423536100057e-05, + "loss": 5.3509, + "step": 4210 + }, + { + "epoch": 0.2330016583747927, + "grad_norm": 3.0387542247772217, + "learning_rate": 3.9440022740193296e-05, + "loss": 5.7707, + "step": 4215 + }, + { + "epoch": 0.23327805417357655, + "grad_norm": 2.9296815395355225, + "learning_rate": 3.942581011938601e-05, + "loss": 5.7334, + "step": 4220 + }, + { + "epoch": 0.2335544499723604, + "grad_norm": 2.396350622177124, + "learning_rate": 3.941159749857874e-05, + "loss": 5.254, + "step": 4225 + }, + { + "epoch": 0.23383084577114427, + "grad_norm": 3.36370587348938, + "learning_rate": 3.939738487777147e-05, + "loss": 5.8167, + "step": 4230 + }, + { + "epoch": 0.23410724156992813, + "grad_norm": 3.5464701652526855, + "learning_rate": 3.9383172256964185e-05, + "loss": 5.742, + "step": 4235 + }, + { + "epoch": 0.23438363736871198, + "grad_norm": 2.342013120651245, + "learning_rate": 3.936895963615691e-05, + "loss": 5.5515, + "step": 4240 + }, + { + "epoch": 0.23466003316749584, + "grad_norm": 2.8633322715759277, + "learning_rate": 3.935474701534963e-05, + "loss": 5.4822, + "step": 4245 + }, + { + "epoch": 0.23493642896627973, + "grad_norm": 2.7258920669555664, + "learning_rate": 3.9340534394542357e-05, + "loss": 5.2667, + "step": 4250 + }, + { + "epoch": 0.23521282476506358, + "grad_norm": 2.6543519496917725, + "learning_rate": 3.932632177373508e-05, + "loss": 5.5115, + "step": 4255 + }, + { + "epoch": 0.23548922056384744, + "grad_norm": 3.5072028636932373, + "learning_rate": 3.93121091529278e-05, + "loss": 5.4936, + "step": 4260 + }, + { + "epoch": 0.2357656163626313, + "grad_norm": 2.486929178237915, + "learning_rate": 3.929789653212053e-05, + "loss": 5.719, + "step": 4265 + }, + { + "epoch": 0.23604201216141515, + "grad_norm": 3.5531532764434814, + "learning_rate": 3.9283683911313246e-05, + "loss": 5.6424, + "step": 4270 + }, + { + "epoch": 0.236318407960199, + "grad_norm": 3.207571506500244, + "learning_rate": 3.926947129050597e-05, + "loss": 5.6972, + "step": 4275 + }, + { + "epoch": 0.23659480375898287, + "grad_norm": 3.538468599319458, + "learning_rate": 3.9255258669698694e-05, + "loss": 5.8425, + "step": 4280 + }, + { + "epoch": 0.23687119955776673, + "grad_norm": 3.163320541381836, + "learning_rate": 3.924104604889142e-05, + "loss": 5.2799, + "step": 4285 + }, + { + "epoch": 0.23714759535655058, + "grad_norm": 4.0374274253845215, + "learning_rate": 3.922683342808414e-05, + "loss": 5.5183, + "step": 4290 + }, + { + "epoch": 0.23742399115533444, + "grad_norm": 3.0995869636535645, + "learning_rate": 3.921262080727686e-05, + "loss": 5.7765, + "step": 4295 + }, + { + "epoch": 0.2377003869541183, + "grad_norm": 2.8097715377807617, + "learning_rate": 3.919840818646959e-05, + "loss": 5.7519, + "step": 4300 + }, + { + "epoch": 0.23797678275290216, + "grad_norm": 2.7227861881256104, + "learning_rate": 3.9184195565662314e-05, + "loss": 5.3365, + "step": 4305 + }, + { + "epoch": 0.238253178551686, + "grad_norm": 2.6679046154022217, + "learning_rate": 3.916998294485503e-05, + "loss": 5.4229, + "step": 4310 + }, + { + "epoch": 0.23852957435046987, + "grad_norm": 2.4163801670074463, + "learning_rate": 3.9155770324047755e-05, + "loss": 5.1811, + "step": 4315 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 2.996711254119873, + "learning_rate": 3.914155770324048e-05, + "loss": 5.8349, + "step": 4320 + }, + { + "epoch": 0.23908236594803758, + "grad_norm": 2.91500186920166, + "learning_rate": 3.91273450824332e-05, + "loss": 5.3502, + "step": 4325 + }, + { + "epoch": 0.23935876174682144, + "grad_norm": 2.5672600269317627, + "learning_rate": 3.911313246162592e-05, + "loss": 5.4671, + "step": 4330 + }, + { + "epoch": 0.2396351575456053, + "grad_norm": 2.848611831665039, + "learning_rate": 3.909891984081865e-05, + "loss": 5.6904, + "step": 4335 + }, + { + "epoch": 0.23991155334438916, + "grad_norm": 3.451052665710449, + "learning_rate": 3.9084707220011375e-05, + "loss": 5.4361, + "step": 4340 + }, + { + "epoch": 0.240187949143173, + "grad_norm": 2.8216664791107178, + "learning_rate": 3.907049459920409e-05, + "loss": 5.6986, + "step": 4345 + }, + { + "epoch": 0.24046434494195687, + "grad_norm": 2.856027841567993, + "learning_rate": 3.9056281978396816e-05, + "loss": 5.9277, + "step": 4350 + }, + { + "epoch": 0.24074074074074073, + "grad_norm": 2.87872576713562, + "learning_rate": 3.904206935758954e-05, + "loss": 5.6129, + "step": 4355 + }, + { + "epoch": 0.2410171365395246, + "grad_norm": 2.844909906387329, + "learning_rate": 3.9027856736782264e-05, + "loss": 5.5358, + "step": 4360 + }, + { + "epoch": 0.24129353233830847, + "grad_norm": 3.371220111846924, + "learning_rate": 3.901364411597499e-05, + "loss": 5.2788, + "step": 4365 + }, + { + "epoch": 0.24156992813709233, + "grad_norm": 2.4404332637786865, + "learning_rate": 3.899943149516771e-05, + "loss": 5.4483, + "step": 4370 + }, + { + "epoch": 0.24184632393587618, + "grad_norm": 2.876338481903076, + "learning_rate": 3.8985218874360436e-05, + "loss": 5.4887, + "step": 4375 + }, + { + "epoch": 0.24212271973466004, + "grad_norm": 3.4181478023529053, + "learning_rate": 3.897100625355315e-05, + "loss": 5.6298, + "step": 4380 + }, + { + "epoch": 0.2423991155334439, + "grad_norm": 2.8636157512664795, + "learning_rate": 3.8956793632745884e-05, + "loss": 5.7623, + "step": 4385 + }, + { + "epoch": 0.24267551133222776, + "grad_norm": 2.881701707839966, + "learning_rate": 3.894258101193861e-05, + "loss": 5.5819, + "step": 4390 + }, + { + "epoch": 0.2429519071310116, + "grad_norm": 2.676478385925293, + "learning_rate": 3.8928368391131325e-05, + "loss": 5.5775, + "step": 4395 + }, + { + "epoch": 0.24322830292979547, + "grad_norm": 3.2318267822265625, + "learning_rate": 3.891415577032405e-05, + "loss": 5.416, + "step": 4400 + }, + { + "epoch": 0.24350469872857933, + "grad_norm": 2.6567542552948, + "learning_rate": 3.889994314951677e-05, + "loss": 5.6167, + "step": 4405 + }, + { + "epoch": 0.24378109452736318, + "grad_norm": 2.6973955631256104, + "learning_rate": 3.88857305287095e-05, + "loss": 5.5814, + "step": 4410 + }, + { + "epoch": 0.24405749032614704, + "grad_norm": 2.4810473918914795, + "learning_rate": 3.8871517907902214e-05, + "loss": 5.61, + "step": 4415 + }, + { + "epoch": 0.2443338861249309, + "grad_norm": 2.4649670124053955, + "learning_rate": 3.8857305287094945e-05, + "loss": 5.3495, + "step": 4420 + }, + { + "epoch": 0.24461028192371476, + "grad_norm": 2.682745933532715, + "learning_rate": 3.884309266628767e-05, + "loss": 5.498, + "step": 4425 + }, + { + "epoch": 0.2448866777224986, + "grad_norm": 3.384434700012207, + "learning_rate": 3.8828880045480386e-05, + "loss": 5.4192, + "step": 4430 + }, + { + "epoch": 0.24516307352128247, + "grad_norm": 3.0535390377044678, + "learning_rate": 3.881466742467311e-05, + "loss": 5.4866, + "step": 4435 + }, + { + "epoch": 0.24543946932006633, + "grad_norm": 2.8525378704071045, + "learning_rate": 3.8800454803865834e-05, + "loss": 5.5854, + "step": 4440 + }, + { + "epoch": 0.24571586511885019, + "grad_norm": 2.7716307640075684, + "learning_rate": 3.878624218305856e-05, + "loss": 5.6654, + "step": 4445 + }, + { + "epoch": 0.24599226091763404, + "grad_norm": 2.2745914459228516, + "learning_rate": 3.877202956225128e-05, + "loss": 5.5521, + "step": 4450 + }, + { + "epoch": 0.2462686567164179, + "grad_norm": 3.1492974758148193, + "learning_rate": 3.8757816941444006e-05, + "loss": 5.5735, + "step": 4455 + }, + { + "epoch": 0.24654505251520176, + "grad_norm": 3.3391618728637695, + "learning_rate": 3.874360432063673e-05, + "loss": 5.3698, + "step": 4460 + }, + { + "epoch": 0.24682144831398561, + "grad_norm": 2.884638547897339, + "learning_rate": 3.872939169982945e-05, + "loss": 5.7967, + "step": 4465 + }, + { + "epoch": 0.2470978441127695, + "grad_norm": 2.4962894916534424, + "learning_rate": 3.871517907902217e-05, + "loss": 5.2797, + "step": 4470 + }, + { + "epoch": 0.24737423991155336, + "grad_norm": 3.132397413253784, + "learning_rate": 3.87009664582149e-05, + "loss": 5.6922, + "step": 4475 + }, + { + "epoch": 0.2476506357103372, + "grad_norm": 2.939412832260132, + "learning_rate": 3.868675383740762e-05, + "loss": 5.7016, + "step": 4480 + }, + { + "epoch": 0.24792703150912107, + "grad_norm": 2.6651360988616943, + "learning_rate": 3.867254121660034e-05, + "loss": 5.4948, + "step": 4485 + }, + { + "epoch": 0.24820342730790493, + "grad_norm": 3.104663372039795, + "learning_rate": 3.865832859579307e-05, + "loss": 5.6978, + "step": 4490 + }, + { + "epoch": 0.24847982310668879, + "grad_norm": 3.229151725769043, + "learning_rate": 3.864411597498579e-05, + "loss": 5.7317, + "step": 4495 + }, + { + "epoch": 0.24875621890547264, + "grad_norm": 2.4405629634857178, + "learning_rate": 3.8629903354178515e-05, + "loss": 5.7138, + "step": 4500 + }, + { + "epoch": 0.2490326147042565, + "grad_norm": 2.657789707183838, + "learning_rate": 3.861569073337123e-05, + "loss": 5.674, + "step": 4505 + }, + { + "epoch": 0.24930901050304036, + "grad_norm": 2.536699056625366, + "learning_rate": 3.860147811256396e-05, + "loss": 5.6521, + "step": 4510 + }, + { + "epoch": 0.24958540630182421, + "grad_norm": 2.947270631790161, + "learning_rate": 3.858726549175668e-05, + "loss": 5.6656, + "step": 4515 + }, + { + "epoch": 0.24986180210060807, + "grad_norm": 3.1712615489959717, + "learning_rate": 3.8573052870949404e-05, + "loss": 5.4181, + "step": 4520 + }, + { + "epoch": 0.25013819789939196, + "grad_norm": 2.742157459259033, + "learning_rate": 3.855884025014213e-05, + "loss": 5.5457, + "step": 4525 + }, + { + "epoch": 0.2504145936981758, + "grad_norm": 2.4692294597625732, + "learning_rate": 3.854462762933485e-05, + "loss": 5.5266, + "step": 4530 + }, + { + "epoch": 0.25069098949695967, + "grad_norm": 2.7221648693084717, + "learning_rate": 3.8530415008527576e-05, + "loss": 5.6376, + "step": 4535 + }, + { + "epoch": 0.25096738529574353, + "grad_norm": 2.809128999710083, + "learning_rate": 3.851620238772029e-05, + "loss": 5.4378, + "step": 4540 + }, + { + "epoch": 0.2512437810945274, + "grad_norm": 2.9492316246032715, + "learning_rate": 3.8501989766913024e-05, + "loss": 5.7847, + "step": 4545 + }, + { + "epoch": 0.25152017689331124, + "grad_norm": 3.0182037353515625, + "learning_rate": 3.848777714610574e-05, + "loss": 5.6268, + "step": 4550 + }, + { + "epoch": 0.2517965726920951, + "grad_norm": 3.395892381668091, + "learning_rate": 3.8473564525298465e-05, + "loss": 5.5913, + "step": 4555 + }, + { + "epoch": 0.25207296849087896, + "grad_norm": 2.6461169719696045, + "learning_rate": 3.845935190449119e-05, + "loss": 5.655, + "step": 4560 + }, + { + "epoch": 0.2523493642896628, + "grad_norm": 3.7275054454803467, + "learning_rate": 3.844513928368391e-05, + "loss": 5.5196, + "step": 4565 + }, + { + "epoch": 0.25262576008844667, + "grad_norm": 3.1424355506896973, + "learning_rate": 3.843092666287664e-05, + "loss": 5.3785, + "step": 4570 + }, + { + "epoch": 0.25290215588723053, + "grad_norm": 2.984804153442383, + "learning_rate": 3.8416714042069354e-05, + "loss": 5.4338, + "step": 4575 + }, + { + "epoch": 0.2531785516860144, + "grad_norm": 3.411898136138916, + "learning_rate": 3.8402501421262085e-05, + "loss": 5.7189, + "step": 4580 + }, + { + "epoch": 0.25345494748479824, + "grad_norm": 3.0697154998779297, + "learning_rate": 3.838828880045481e-05, + "loss": 5.554, + "step": 4585 + }, + { + "epoch": 0.2537313432835821, + "grad_norm": 2.617682695388794, + "learning_rate": 3.8374076179647526e-05, + "loss": 5.4068, + "step": 4590 + }, + { + "epoch": 0.25400773908236596, + "grad_norm": 2.9888644218444824, + "learning_rate": 3.835986355884025e-05, + "loss": 5.9172, + "step": 4595 + }, + { + "epoch": 0.2542841348811498, + "grad_norm": 2.7901175022125244, + "learning_rate": 3.8345650938032974e-05, + "loss": 5.3799, + "step": 4600 + }, + { + "epoch": 0.25456053067993367, + "grad_norm": 2.9775073528289795, + "learning_rate": 3.83314383172257e-05, + "loss": 5.9651, + "step": 4605 + }, + { + "epoch": 0.25483692647871753, + "grad_norm": 3.188081979751587, + "learning_rate": 3.831722569641842e-05, + "loss": 5.5412, + "step": 4610 + }, + { + "epoch": 0.2551133222775014, + "grad_norm": 3.1348631381988525, + "learning_rate": 3.8303013075611146e-05, + "loss": 5.4277, + "step": 4615 + }, + { + "epoch": 0.25538971807628524, + "grad_norm": 2.817936658859253, + "learning_rate": 3.828880045480387e-05, + "loss": 5.6873, + "step": 4620 + }, + { + "epoch": 0.2556661138750691, + "grad_norm": 2.883748769760132, + "learning_rate": 3.827458783399659e-05, + "loss": 5.4839, + "step": 4625 + }, + { + "epoch": 0.25594250967385296, + "grad_norm": 2.654115915298462, + "learning_rate": 3.826037521318931e-05, + "loss": 5.7442, + "step": 4630 + }, + { + "epoch": 0.2562189054726368, + "grad_norm": 2.976658821105957, + "learning_rate": 3.8246162592382035e-05, + "loss": 5.374, + "step": 4635 + }, + { + "epoch": 0.2564953012714207, + "grad_norm": 2.345407009124756, + "learning_rate": 3.823194997157476e-05, + "loss": 5.551, + "step": 4640 + }, + { + "epoch": 0.25677169707020453, + "grad_norm": 3.0773231983184814, + "learning_rate": 3.821773735076748e-05, + "loss": 5.7161, + "step": 4645 + }, + { + "epoch": 0.2570480928689884, + "grad_norm": 2.6813673973083496, + "learning_rate": 3.820352472996021e-05, + "loss": 5.2693, + "step": 4650 + }, + { + "epoch": 0.25732448866777224, + "grad_norm": 2.9697906970977783, + "learning_rate": 3.818931210915293e-05, + "loss": 5.2863, + "step": 4655 + }, + { + "epoch": 0.2576008844665561, + "grad_norm": 2.9038257598876953, + "learning_rate": 3.817509948834565e-05, + "loss": 5.4955, + "step": 4660 + }, + { + "epoch": 0.25787728026533996, + "grad_norm": 3.2474894523620605, + "learning_rate": 3.816088686753837e-05, + "loss": 5.316, + "step": 4665 + }, + { + "epoch": 0.2581536760641238, + "grad_norm": 3.7227532863616943, + "learning_rate": 3.81466742467311e-05, + "loss": 5.6569, + "step": 4670 + }, + { + "epoch": 0.2584300718629077, + "grad_norm": 3.429311513900757, + "learning_rate": 3.813246162592382e-05, + "loss": 5.6594, + "step": 4675 + }, + { + "epoch": 0.25870646766169153, + "grad_norm": 2.9430253505706787, + "learning_rate": 3.8118249005116544e-05, + "loss": 5.3997, + "step": 4680 + }, + { + "epoch": 0.2589828634604754, + "grad_norm": 3.2652957439422607, + "learning_rate": 3.810403638430927e-05, + "loss": 5.7445, + "step": 4685 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 3.255427360534668, + "learning_rate": 3.808982376350199e-05, + "loss": 5.3571, + "step": 4690 + }, + { + "epoch": 0.2595356550580431, + "grad_norm": 2.7767574787139893, + "learning_rate": 3.8075611142694716e-05, + "loss": 5.5315, + "step": 4695 + }, + { + "epoch": 0.25981205085682696, + "grad_norm": 2.555504560470581, + "learning_rate": 3.806139852188744e-05, + "loss": 5.4223, + "step": 4700 + }, + { + "epoch": 0.2600884466556108, + "grad_norm": 2.6958086490631104, + "learning_rate": 3.8047185901080164e-05, + "loss": 5.5566, + "step": 4705 + }, + { + "epoch": 0.2603648424543947, + "grad_norm": 3.072174310684204, + "learning_rate": 3.803297328027288e-05, + "loss": 5.9995, + "step": 4710 + }, + { + "epoch": 0.26064123825317853, + "grad_norm": 3.545971632003784, + "learning_rate": 3.8018760659465605e-05, + "loss": 5.5504, + "step": 4715 + }, + { + "epoch": 0.2609176340519624, + "grad_norm": 2.8304150104522705, + "learning_rate": 3.8004548038658336e-05, + "loss": 5.6472, + "step": 4720 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 3.0823028087615967, + "learning_rate": 3.799033541785105e-05, + "loss": 5.3301, + "step": 4725 + }, + { + "epoch": 0.2614704256495301, + "grad_norm": 3.2477657794952393, + "learning_rate": 3.797612279704378e-05, + "loss": 5.4662, + "step": 4730 + }, + { + "epoch": 0.26174682144831396, + "grad_norm": 2.6965558528900146, + "learning_rate": 3.79619101762365e-05, + "loss": 5.2598, + "step": 4735 + }, + { + "epoch": 0.2620232172470978, + "grad_norm": 3.41056752204895, + "learning_rate": 3.7947697555429225e-05, + "loss": 5.4817, + "step": 4740 + }, + { + "epoch": 0.26229961304588173, + "grad_norm": 3.00512433052063, + "learning_rate": 3.793348493462194e-05, + "loss": 5.3896, + "step": 4745 + }, + { + "epoch": 0.2625760088446656, + "grad_norm": 3.316678047180176, + "learning_rate": 3.7919272313814666e-05, + "loss": 6.0564, + "step": 4750 + }, + { + "epoch": 0.26285240464344944, + "grad_norm": 2.8839213848114014, + "learning_rate": 3.79050596930074e-05, + "loss": 5.715, + "step": 4755 + }, + { + "epoch": 0.2631288004422333, + "grad_norm": 3.264241933822632, + "learning_rate": 3.7890847072200114e-05, + "loss": 5.5688, + "step": 4760 + }, + { + "epoch": 0.26340519624101716, + "grad_norm": 2.51202130317688, + "learning_rate": 3.787663445139284e-05, + "loss": 5.4865, + "step": 4765 + }, + { + "epoch": 0.263681592039801, + "grad_norm": 2.940673828125, + "learning_rate": 3.786242183058556e-05, + "loss": 5.6147, + "step": 4770 + }, + { + "epoch": 0.2639579878385849, + "grad_norm": 2.740130662918091, + "learning_rate": 3.7848209209778286e-05, + "loss": 5.7488, + "step": 4775 + }, + { + "epoch": 0.26423438363736873, + "grad_norm": 2.5281081199645996, + "learning_rate": 3.783399658897101e-05, + "loss": 5.7751, + "step": 4780 + }, + { + "epoch": 0.2645107794361526, + "grad_norm": 2.7616381645202637, + "learning_rate": 3.781978396816373e-05, + "loss": 5.5114, + "step": 4785 + }, + { + "epoch": 0.26478717523493644, + "grad_norm": 3.112661838531494, + "learning_rate": 3.780557134735646e-05, + "loss": 5.7249, + "step": 4790 + }, + { + "epoch": 0.2650635710337203, + "grad_norm": 2.2833783626556396, + "learning_rate": 3.7791358726549175e-05, + "loss": 5.0357, + "step": 4795 + }, + { + "epoch": 0.26533996683250416, + "grad_norm": 2.7864673137664795, + "learning_rate": 3.77771461057419e-05, + "loss": 5.68, + "step": 4800 + }, + { + "epoch": 0.265616362631288, + "grad_norm": 2.8897244930267334, + "learning_rate": 3.7762933484934623e-05, + "loss": 5.5812, + "step": 4805 + }, + { + "epoch": 0.2658927584300719, + "grad_norm": 3.157487154006958, + "learning_rate": 3.774872086412735e-05, + "loss": 5.5694, + "step": 4810 + }, + { + "epoch": 0.26616915422885573, + "grad_norm": 3.268732786178589, + "learning_rate": 3.773450824332007e-05, + "loss": 5.3811, + "step": 4815 + }, + { + "epoch": 0.2664455500276396, + "grad_norm": 2.7463014125823975, + "learning_rate": 3.772029562251279e-05, + "loss": 5.4335, + "step": 4820 + }, + { + "epoch": 0.26672194582642345, + "grad_norm": 3.34234619140625, + "learning_rate": 3.770608300170552e-05, + "loss": 5.401, + "step": 4825 + }, + { + "epoch": 0.2669983416252073, + "grad_norm": 4.338949680328369, + "learning_rate": 3.769187038089824e-05, + "loss": 5.5645, + "step": 4830 + }, + { + "epoch": 0.26727473742399116, + "grad_norm": 2.9800288677215576, + "learning_rate": 3.767765776009096e-05, + "loss": 5.1976, + "step": 4835 + }, + { + "epoch": 0.267551133222775, + "grad_norm": 3.4728004932403564, + "learning_rate": 3.7663445139283685e-05, + "loss": 5.6093, + "step": 4840 + }, + { + "epoch": 0.2678275290215589, + "grad_norm": 3.496291399002075, + "learning_rate": 3.764923251847641e-05, + "loss": 5.5747, + "step": 4845 + }, + { + "epoch": 0.26810392482034273, + "grad_norm": 2.850144386291504, + "learning_rate": 3.763501989766913e-05, + "loss": 5.1295, + "step": 4850 + }, + { + "epoch": 0.2683803206191266, + "grad_norm": 2.8431968688964844, + "learning_rate": 3.762080727686185e-05, + "loss": 5.5363, + "step": 4855 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 2.7617273330688477, + "learning_rate": 3.760659465605458e-05, + "loss": 5.7338, + "step": 4860 + }, + { + "epoch": 0.2689331122166943, + "grad_norm": 2.990050792694092, + "learning_rate": 3.7592382035247304e-05, + "loss": 5.2071, + "step": 4865 + }, + { + "epoch": 0.26920950801547816, + "grad_norm": 3.1524643898010254, + "learning_rate": 3.757816941444002e-05, + "loss": 5.6822, + "step": 4870 + }, + { + "epoch": 0.269485903814262, + "grad_norm": 2.6048810482025146, + "learning_rate": 3.7563956793632746e-05, + "loss": 5.6043, + "step": 4875 + }, + { + "epoch": 0.2697622996130459, + "grad_norm": 2.8759994506835938, + "learning_rate": 3.754974417282547e-05, + "loss": 5.4891, + "step": 4880 + }, + { + "epoch": 0.27003869541182973, + "grad_norm": 2.8228535652160645, + "learning_rate": 3.7535531552018194e-05, + "loss": 5.5018, + "step": 4885 + }, + { + "epoch": 0.2703150912106136, + "grad_norm": 3.0626003742218018, + "learning_rate": 3.752131893121092e-05, + "loss": 5.3939, + "step": 4890 + }, + { + "epoch": 0.27059148700939745, + "grad_norm": 2.5288095474243164, + "learning_rate": 3.750710631040364e-05, + "loss": 5.6604, + "step": 4895 + }, + { + "epoch": 0.2708678828081813, + "grad_norm": 2.9323790073394775, + "learning_rate": 3.7492893689596365e-05, + "loss": 5.4378, + "step": 4900 + }, + { + "epoch": 0.27114427860696516, + "grad_norm": 2.815352439880371, + "learning_rate": 3.747868106878908e-05, + "loss": 5.6052, + "step": 4905 + }, + { + "epoch": 0.271420674405749, + "grad_norm": 2.981050968170166, + "learning_rate": 3.746446844798181e-05, + "loss": 5.5689, + "step": 4910 + }, + { + "epoch": 0.2716970702045329, + "grad_norm": 3.062026262283325, + "learning_rate": 3.745025582717454e-05, + "loss": 5.4102, + "step": 4915 + }, + { + "epoch": 0.27197346600331673, + "grad_norm": 3.0991098880767822, + "learning_rate": 3.7436043206367255e-05, + "loss": 5.4325, + "step": 4920 + }, + { + "epoch": 0.2722498618021006, + "grad_norm": 3.309856414794922, + "learning_rate": 3.742183058555998e-05, + "loss": 5.2731, + "step": 4925 + }, + { + "epoch": 0.27252625760088445, + "grad_norm": 3.4550788402557373, + "learning_rate": 3.74076179647527e-05, + "loss": 5.5482, + "step": 4930 + }, + { + "epoch": 0.2728026533996683, + "grad_norm": 2.121727705001831, + "learning_rate": 3.7393405343945427e-05, + "loss": 5.5073, + "step": 4935 + }, + { + "epoch": 0.27307904919845216, + "grad_norm": 2.894728660583496, + "learning_rate": 3.737919272313815e-05, + "loss": 5.693, + "step": 4940 + }, + { + "epoch": 0.273355444997236, + "grad_norm": 3.626741647720337, + "learning_rate": 3.736498010233087e-05, + "loss": 5.7241, + "step": 4945 + }, + { + "epoch": 0.2736318407960199, + "grad_norm": 2.86633563041687, + "learning_rate": 3.73507674815236e-05, + "loss": 5.4346, + "step": 4950 + }, + { + "epoch": 0.27390823659480373, + "grad_norm": 3.4413692951202393, + "learning_rate": 3.7336554860716316e-05, + "loss": 5.4228, + "step": 4955 + }, + { + "epoch": 0.2741846323935876, + "grad_norm": 2.800236225128174, + "learning_rate": 3.732234223990904e-05, + "loss": 5.3015, + "step": 4960 + }, + { + "epoch": 0.2744610281923715, + "grad_norm": 3.2357637882232666, + "learning_rate": 3.7308129619101764e-05, + "loss": 5.7801, + "step": 4965 + }, + { + "epoch": 0.27473742399115536, + "grad_norm": 2.5633909702301025, + "learning_rate": 3.729391699829449e-05, + "loss": 5.6144, + "step": 4970 + }, + { + "epoch": 0.2750138197899392, + "grad_norm": 2.8940200805664062, + "learning_rate": 3.727970437748721e-05, + "loss": 5.442, + "step": 4975 + }, + { + "epoch": 0.2752902155887231, + "grad_norm": 3.922053337097168, + "learning_rate": 3.726549175667993e-05, + "loss": 5.4147, + "step": 4980 + }, + { + "epoch": 0.27556661138750693, + "grad_norm": 3.393669366836548, + "learning_rate": 3.725127913587266e-05, + "loss": 5.6151, + "step": 4985 + }, + { + "epoch": 0.2758430071862908, + "grad_norm": 3.1016077995300293, + "learning_rate": 3.723706651506538e-05, + "loss": 5.5034, + "step": 4990 + }, + { + "epoch": 0.27611940298507465, + "grad_norm": 4.4078145027160645, + "learning_rate": 3.72228538942581e-05, + "loss": 5.5826, + "step": 4995 + }, + { + "epoch": 0.2763957987838585, + "grad_norm": 3.1527042388916016, + "learning_rate": 3.720864127345083e-05, + "loss": 5.4405, + "step": 5000 + }, + { + "epoch": 0.27667219458264236, + "grad_norm": 3.0095982551574707, + "learning_rate": 3.719442865264355e-05, + "loss": 5.6309, + "step": 5005 + }, + { + "epoch": 0.2769485903814262, + "grad_norm": 3.327120542526245, + "learning_rate": 3.718021603183627e-05, + "loss": 5.5869, + "step": 5010 + }, + { + "epoch": 0.2772249861802101, + "grad_norm": 3.148468017578125, + "learning_rate": 3.7166003411029e-05, + "loss": 5.3626, + "step": 5015 + }, + { + "epoch": 0.27750138197899393, + "grad_norm": 2.774047374725342, + "learning_rate": 3.715179079022172e-05, + "loss": 5.0038, + "step": 5020 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 2.5972516536712646, + "learning_rate": 3.7137578169414445e-05, + "loss": 5.8164, + "step": 5025 + }, + { + "epoch": 0.27805417357656165, + "grad_norm": 3.2580530643463135, + "learning_rate": 3.712336554860716e-05, + "loss": 5.5117, + "step": 5030 + }, + { + "epoch": 0.2783305693753455, + "grad_norm": 2.4557392597198486, + "learning_rate": 3.710915292779989e-05, + "loss": 5.4633, + "step": 5035 + }, + { + "epoch": 0.27860696517412936, + "grad_norm": 2.765587091445923, + "learning_rate": 3.709494030699261e-05, + "loss": 5.3116, + "step": 5040 + }, + { + "epoch": 0.2788833609729132, + "grad_norm": 3.075885057449341, + "learning_rate": 3.7080727686185334e-05, + "loss": 5.4564, + "step": 5045 + }, + { + "epoch": 0.2791597567716971, + "grad_norm": 2.8484435081481934, + "learning_rate": 3.706651506537806e-05, + "loss": 5.5371, + "step": 5050 + }, + { + "epoch": 0.27943615257048093, + "grad_norm": 2.929783344268799, + "learning_rate": 3.705230244457078e-05, + "loss": 5.4535, + "step": 5055 + }, + { + "epoch": 0.2797125483692648, + "grad_norm": 2.6549274921417236, + "learning_rate": 3.7038089823763506e-05, + "loss": 5.4736, + "step": 5060 + }, + { + "epoch": 0.27998894416804865, + "grad_norm": 3.388282537460327, + "learning_rate": 3.702387720295622e-05, + "loss": 5.3436, + "step": 5065 + }, + { + "epoch": 0.2802653399668325, + "grad_norm": 3.1960010528564453, + "learning_rate": 3.7009664582148954e-05, + "loss": 5.8104, + "step": 5070 + }, + { + "epoch": 0.28054173576561636, + "grad_norm": 2.6687440872192383, + "learning_rate": 3.699545196134167e-05, + "loss": 5.2787, + "step": 5075 + }, + { + "epoch": 0.2808181315644002, + "grad_norm": 3.2794923782348633, + "learning_rate": 3.6981239340534395e-05, + "loss": 5.5187, + "step": 5080 + }, + { + "epoch": 0.2810945273631841, + "grad_norm": 2.8185513019561768, + "learning_rate": 3.696702671972712e-05, + "loss": 5.3756, + "step": 5085 + }, + { + "epoch": 0.28137092316196793, + "grad_norm": 3.3090620040893555, + "learning_rate": 3.695281409891984e-05, + "loss": 5.6135, + "step": 5090 + }, + { + "epoch": 0.2816473189607518, + "grad_norm": 2.823322057723999, + "learning_rate": 3.693860147811257e-05, + "loss": 5.7067, + "step": 5095 + }, + { + "epoch": 0.28192371475953565, + "grad_norm": 4.680220603942871, + "learning_rate": 3.6924388857305284e-05, + "loss": 5.3231, + "step": 5100 + }, + { + "epoch": 0.2822001105583195, + "grad_norm": 3.1721339225769043, + "learning_rate": 3.6910176236498015e-05, + "loss": 5.3679, + "step": 5105 + }, + { + "epoch": 0.28247650635710336, + "grad_norm": 3.343784809112549, + "learning_rate": 3.689596361569074e-05, + "loss": 5.5471, + "step": 5110 + }, + { + "epoch": 0.2827529021558872, + "grad_norm": 2.900912284851074, + "learning_rate": 3.6881750994883456e-05, + "loss": 5.4448, + "step": 5115 + }, + { + "epoch": 0.2830292979546711, + "grad_norm": 2.725116729736328, + "learning_rate": 3.686753837407618e-05, + "loss": 5.1778, + "step": 5120 + }, + { + "epoch": 0.28330569375345493, + "grad_norm": 2.9266819953918457, + "learning_rate": 3.6853325753268904e-05, + "loss": 5.706, + "step": 5125 + }, + { + "epoch": 0.2835820895522388, + "grad_norm": 3.6640381813049316, + "learning_rate": 3.683911313246163e-05, + "loss": 5.355, + "step": 5130 + }, + { + "epoch": 0.28385848535102265, + "grad_norm": 3.3055994510650635, + "learning_rate": 3.682490051165435e-05, + "loss": 5.2927, + "step": 5135 + }, + { + "epoch": 0.2841348811498065, + "grad_norm": 3.171983242034912, + "learning_rate": 3.6810687890847076e-05, + "loss": 5.625, + "step": 5140 + }, + { + "epoch": 0.28441127694859036, + "grad_norm": 3.0071423053741455, + "learning_rate": 3.67964752700398e-05, + "loss": 5.2521, + "step": 5145 + }, + { + "epoch": 0.2846876727473742, + "grad_norm": 2.843132972717285, + "learning_rate": 3.678226264923252e-05, + "loss": 5.6159, + "step": 5150 + }, + { + "epoch": 0.2849640685461581, + "grad_norm": 2.8676323890686035, + "learning_rate": 3.676805002842524e-05, + "loss": 5.4452, + "step": 5155 + }, + { + "epoch": 0.28524046434494194, + "grad_norm": 2.7447266578674316, + "learning_rate": 3.675383740761797e-05, + "loss": 5.4009, + "step": 5160 + }, + { + "epoch": 0.2855168601437258, + "grad_norm": 3.2596964836120605, + "learning_rate": 3.673962478681069e-05, + "loss": 5.7716, + "step": 5165 + }, + { + "epoch": 0.28579325594250965, + "grad_norm": 2.8645823001861572, + "learning_rate": 3.672541216600341e-05, + "loss": 5.5613, + "step": 5170 + }, + { + "epoch": 0.2860696517412935, + "grad_norm": 2.947704315185547, + "learning_rate": 3.671119954519614e-05, + "loss": 5.343, + "step": 5175 + }, + { + "epoch": 0.28634604754007736, + "grad_norm": 3.6739652156829834, + "learning_rate": 3.669698692438886e-05, + "loss": 5.3991, + "step": 5180 + }, + { + "epoch": 0.2866224433388613, + "grad_norm": 3.433134078979492, + "learning_rate": 3.668277430358158e-05, + "loss": 5.6598, + "step": 5185 + }, + { + "epoch": 0.28689883913764513, + "grad_norm": 2.977222204208374, + "learning_rate": 3.66685616827743e-05, + "loss": 5.4816, + "step": 5190 + }, + { + "epoch": 0.287175234936429, + "grad_norm": 2.5906851291656494, + "learning_rate": 3.665434906196703e-05, + "loss": 5.3199, + "step": 5195 + }, + { + "epoch": 0.28745163073521285, + "grad_norm": 2.8117191791534424, + "learning_rate": 3.664013644115975e-05, + "loss": 5.4231, + "step": 5200 + }, + { + "epoch": 0.2877280265339967, + "grad_norm": 2.9381093978881836, + "learning_rate": 3.6625923820352474e-05, + "loss": 5.4396, + "step": 5205 + }, + { + "epoch": 0.28800442233278056, + "grad_norm": 3.0122227668762207, + "learning_rate": 3.66117111995452e-05, + "loss": 5.2438, + "step": 5210 + }, + { + "epoch": 0.2882808181315644, + "grad_norm": 2.6538515090942383, + "learning_rate": 3.659749857873792e-05, + "loss": 5.3428, + "step": 5215 + }, + { + "epoch": 0.2885572139303483, + "grad_norm": 3.465743064880371, + "learning_rate": 3.6583285957930646e-05, + "loss": 5.4583, + "step": 5220 + }, + { + "epoch": 0.28883360972913213, + "grad_norm": 3.0137925148010254, + "learning_rate": 3.656907333712336e-05, + "loss": 5.6031, + "step": 5225 + }, + { + "epoch": 0.289110005527916, + "grad_norm": 3.7536206245422363, + "learning_rate": 3.6554860716316094e-05, + "loss": 5.3189, + "step": 5230 + }, + { + "epoch": 0.28938640132669985, + "grad_norm": 2.687974691390991, + "learning_rate": 3.654064809550881e-05, + "loss": 5.1424, + "step": 5235 + }, + { + "epoch": 0.2896627971254837, + "grad_norm": 2.5427982807159424, + "learning_rate": 3.6526435474701535e-05, + "loss": 5.5922, + "step": 5240 + }, + { + "epoch": 0.28993919292426756, + "grad_norm": 2.6604325771331787, + "learning_rate": 3.651222285389426e-05, + "loss": 5.4598, + "step": 5245 + }, + { + "epoch": 0.2902155887230514, + "grad_norm": 2.8820483684539795, + "learning_rate": 3.649801023308698e-05, + "loss": 5.3329, + "step": 5250 + }, + { + "epoch": 0.2904919845218353, + "grad_norm": 4.090969085693359, + "learning_rate": 3.648379761227971e-05, + "loss": 5.5375, + "step": 5255 + }, + { + "epoch": 0.29076838032061914, + "grad_norm": 3.3297054767608643, + "learning_rate": 3.6469584991472424e-05, + "loss": 5.217, + "step": 5260 + }, + { + "epoch": 0.291044776119403, + "grad_norm": 2.8751707077026367, + "learning_rate": 3.6455372370665155e-05, + "loss": 5.3424, + "step": 5265 + }, + { + "epoch": 0.29132117191818685, + "grad_norm": 2.618412971496582, + "learning_rate": 3.644115974985788e-05, + "loss": 5.4186, + "step": 5270 + }, + { + "epoch": 0.2915975677169707, + "grad_norm": 3.5012338161468506, + "learning_rate": 3.6426947129050596e-05, + "loss": 5.5651, + "step": 5275 + }, + { + "epoch": 0.29187396351575456, + "grad_norm": 3.1529078483581543, + "learning_rate": 3.641273450824333e-05, + "loss": 5.4826, + "step": 5280 + }, + { + "epoch": 0.2921503593145384, + "grad_norm": 3.4742085933685303, + "learning_rate": 3.6398521887436044e-05, + "loss": 5.3871, + "step": 5285 + }, + { + "epoch": 0.2924267551133223, + "grad_norm": 3.666706085205078, + "learning_rate": 3.638430926662877e-05, + "loss": 5.6392, + "step": 5290 + }, + { + "epoch": 0.29270315091210614, + "grad_norm": 3.182141065597534, + "learning_rate": 3.6370096645821485e-05, + "loss": 5.3455, + "step": 5295 + }, + { + "epoch": 0.29297954671089, + "grad_norm": 3.0457582473754883, + "learning_rate": 3.6355884025014216e-05, + "loss": 5.7264, + "step": 5300 + }, + { + "epoch": 0.29325594250967385, + "grad_norm": 2.9654111862182617, + "learning_rate": 3.634167140420694e-05, + "loss": 5.3143, + "step": 5305 + }, + { + "epoch": 0.2935323383084577, + "grad_norm": 3.9714043140411377, + "learning_rate": 3.632745878339966e-05, + "loss": 5.6, + "step": 5310 + }, + { + "epoch": 0.29380873410724156, + "grad_norm": 3.186901092529297, + "learning_rate": 3.631324616259239e-05, + "loss": 5.4677, + "step": 5315 + }, + { + "epoch": 0.2940851299060254, + "grad_norm": 3.43259334564209, + "learning_rate": 3.6299033541785105e-05, + "loss": 5.5161, + "step": 5320 + }, + { + "epoch": 0.2943615257048093, + "grad_norm": 2.980299711227417, + "learning_rate": 3.628482092097783e-05, + "loss": 5.4279, + "step": 5325 + }, + { + "epoch": 0.29463792150359314, + "grad_norm": 2.5553932189941406, + "learning_rate": 3.627060830017055e-05, + "loss": 5.2727, + "step": 5330 + }, + { + "epoch": 0.294914317302377, + "grad_norm": 2.9207799434661865, + "learning_rate": 3.625639567936328e-05, + "loss": 5.5978, + "step": 5335 + }, + { + "epoch": 0.29519071310116085, + "grad_norm": 2.6809563636779785, + "learning_rate": 3.6242183058556e-05, + "loss": 5.6545, + "step": 5340 + }, + { + "epoch": 0.2954671088999447, + "grad_norm": 3.1138181686401367, + "learning_rate": 3.622797043774872e-05, + "loss": 5.3426, + "step": 5345 + }, + { + "epoch": 0.29574350469872857, + "grad_norm": 3.2143821716308594, + "learning_rate": 3.621375781694145e-05, + "loss": 5.5284, + "step": 5350 + }, + { + "epoch": 0.2960199004975124, + "grad_norm": 3.2884023189544678, + "learning_rate": 3.619954519613417e-05, + "loss": 5.3533, + "step": 5355 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 2.707595109939575, + "learning_rate": 3.618533257532689e-05, + "loss": 5.4062, + "step": 5360 + }, + { + "epoch": 0.29657269209508014, + "grad_norm": 2.758535146713257, + "learning_rate": 3.6171119954519614e-05, + "loss": 5.3409, + "step": 5365 + }, + { + "epoch": 0.296849087893864, + "grad_norm": 3.2588820457458496, + "learning_rate": 3.615690733371234e-05, + "loss": 5.4564, + "step": 5370 + }, + { + "epoch": 0.29712548369264785, + "grad_norm": 3.819500207901001, + "learning_rate": 3.614269471290506e-05, + "loss": 5.5978, + "step": 5375 + }, + { + "epoch": 0.2974018794914317, + "grad_norm": 3.1476078033447266, + "learning_rate": 3.6128482092097786e-05, + "loss": 5.1632, + "step": 5380 + }, + { + "epoch": 0.29767827529021557, + "grad_norm": 2.8561973571777344, + "learning_rate": 3.611426947129051e-05, + "loss": 5.5271, + "step": 5385 + }, + { + "epoch": 0.2979546710889994, + "grad_norm": 2.6981911659240723, + "learning_rate": 3.6100056850483234e-05, + "loss": 5.5108, + "step": 5390 + }, + { + "epoch": 0.2982310668877833, + "grad_norm": 3.0033581256866455, + "learning_rate": 3.608584422967595e-05, + "loss": 5.2921, + "step": 5395 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 3.7791874408721924, + "learning_rate": 3.6071631608868675e-05, + "loss": 5.5129, + "step": 5400 + }, + { + "epoch": 0.29878385848535105, + "grad_norm": 2.5647666454315186, + "learning_rate": 3.60574189880614e-05, + "loss": 5.7754, + "step": 5405 + }, + { + "epoch": 0.2990602542841349, + "grad_norm": 2.5552051067352295, + "learning_rate": 3.604320636725412e-05, + "loss": 5.2537, + "step": 5410 + }, + { + "epoch": 0.29933665008291876, + "grad_norm": 3.173316478729248, + "learning_rate": 3.602899374644685e-05, + "loss": 5.7515, + "step": 5415 + }, + { + "epoch": 0.2996130458817026, + "grad_norm": 2.4736528396606445, + "learning_rate": 3.601478112563957e-05, + "loss": 5.0059, + "step": 5420 + }, + { + "epoch": 0.2998894416804865, + "grad_norm": 3.348816394805908, + "learning_rate": 3.6000568504832295e-05, + "loss": 5.477, + "step": 5425 + }, + { + "epoch": 0.30016583747927034, + "grad_norm": 2.6805813312530518, + "learning_rate": 3.598635588402501e-05, + "loss": 5.5837, + "step": 5430 + }, + { + "epoch": 0.3004422332780542, + "grad_norm": 3.264838218688965, + "learning_rate": 3.5972143263217736e-05, + "loss": 5.6984, + "step": 5435 + }, + { + "epoch": 0.30071862907683805, + "grad_norm": 2.8391873836517334, + "learning_rate": 3.595793064241047e-05, + "loss": 5.2333, + "step": 5440 + }, + { + "epoch": 0.3009950248756219, + "grad_norm": 3.601405382156372, + "learning_rate": 3.5943718021603184e-05, + "loss": 5.2084, + "step": 5445 + }, + { + "epoch": 0.30127142067440577, + "grad_norm": 3.3208744525909424, + "learning_rate": 3.592950540079591e-05, + "loss": 5.18, + "step": 5450 + }, + { + "epoch": 0.3015478164731896, + "grad_norm": 3.2721526622772217, + "learning_rate": 3.591529277998863e-05, + "loss": 5.6478, + "step": 5455 + }, + { + "epoch": 0.3018242122719735, + "grad_norm": 3.912829637527466, + "learning_rate": 3.5901080159181356e-05, + "loss": 5.4232, + "step": 5460 + }, + { + "epoch": 0.30210060807075734, + "grad_norm": 3.12005615234375, + "learning_rate": 3.588686753837408e-05, + "loss": 5.6941, + "step": 5465 + }, + { + "epoch": 0.3023770038695412, + "grad_norm": 3.007559061050415, + "learning_rate": 3.58726549175668e-05, + "loss": 5.3598, + "step": 5470 + }, + { + "epoch": 0.30265339966832505, + "grad_norm": 3.6492421627044678, + "learning_rate": 3.585844229675953e-05, + "loss": 5.6721, + "step": 5475 + }, + { + "epoch": 0.3029297954671089, + "grad_norm": 3.9073736667633057, + "learning_rate": 3.5844229675952245e-05, + "loss": 5.4005, + "step": 5480 + }, + { + "epoch": 0.30320619126589277, + "grad_norm": 3.031067371368408, + "learning_rate": 3.583001705514497e-05, + "loss": 5.3484, + "step": 5485 + }, + { + "epoch": 0.3034825870646766, + "grad_norm": 3.599224090576172, + "learning_rate": 3.581580443433769e-05, + "loss": 5.4078, + "step": 5490 + }, + { + "epoch": 0.3037589828634605, + "grad_norm": 3.183856964111328, + "learning_rate": 3.580159181353042e-05, + "loss": 5.5047, + "step": 5495 + }, + { + "epoch": 0.30403537866224434, + "grad_norm": 3.2303097248077393, + "learning_rate": 3.578737919272314e-05, + "loss": 5.5218, + "step": 5500 + }, + { + "epoch": 0.3043117744610282, + "grad_norm": 3.168322801589966, + "learning_rate": 3.577316657191586e-05, + "loss": 5.4699, + "step": 5505 + }, + { + "epoch": 0.30458817025981205, + "grad_norm": 2.8198087215423584, + "learning_rate": 3.575895395110859e-05, + "loss": 4.9456, + "step": 5510 + }, + { + "epoch": 0.3048645660585959, + "grad_norm": 3.3285229206085205, + "learning_rate": 3.5744741330301307e-05, + "loss": 5.5881, + "step": 5515 + }, + { + "epoch": 0.30514096185737977, + "grad_norm": 3.381110668182373, + "learning_rate": 3.573052870949403e-05, + "loss": 5.8306, + "step": 5520 + }, + { + "epoch": 0.3054173576561636, + "grad_norm": 2.8767781257629395, + "learning_rate": 3.5716316088686754e-05, + "loss": 5.2034, + "step": 5525 + }, + { + "epoch": 0.3056937534549475, + "grad_norm": 4.404655456542969, + "learning_rate": 3.570210346787948e-05, + "loss": 5.4832, + "step": 5530 + }, + { + "epoch": 0.30597014925373134, + "grad_norm": 2.4534425735473633, + "learning_rate": 3.56878908470722e-05, + "loss": 5.1591, + "step": 5535 + }, + { + "epoch": 0.3062465450525152, + "grad_norm": 2.891896963119507, + "learning_rate": 3.567367822626492e-05, + "loss": 5.253, + "step": 5540 + }, + { + "epoch": 0.30652294085129905, + "grad_norm": 2.3647806644439697, + "learning_rate": 3.565946560545765e-05, + "loss": 5.4199, + "step": 5545 + }, + { + "epoch": 0.3067993366500829, + "grad_norm": 3.0437514781951904, + "learning_rate": 3.5645252984650374e-05, + "loss": 5.4336, + "step": 5550 + }, + { + "epoch": 0.30707573244886677, + "grad_norm": 3.7891740798950195, + "learning_rate": 3.563104036384309e-05, + "loss": 5.5751, + "step": 5555 + }, + { + "epoch": 0.3073521282476506, + "grad_norm": 3.7321300506591797, + "learning_rate": 3.5616827743035816e-05, + "loss": 5.319, + "step": 5560 + }, + { + "epoch": 0.3076285240464345, + "grad_norm": 3.2207529544830322, + "learning_rate": 3.560261512222854e-05, + "loss": 5.74, + "step": 5565 + }, + { + "epoch": 0.30790491984521834, + "grad_norm": 3.2805068492889404, + "learning_rate": 3.5588402501421263e-05, + "loss": 5.2347, + "step": 5570 + }, + { + "epoch": 0.3081813156440022, + "grad_norm": 2.671035051345825, + "learning_rate": 3.557418988061399e-05, + "loss": 5.4821, + "step": 5575 + }, + { + "epoch": 0.30845771144278605, + "grad_norm": 2.748237133026123, + "learning_rate": 3.555997725980671e-05, + "loss": 5.2953, + "step": 5580 + }, + { + "epoch": 0.3087341072415699, + "grad_norm": 3.34366512298584, + "learning_rate": 3.5545764638999435e-05, + "loss": 5.336, + "step": 5585 + }, + { + "epoch": 0.30901050304035377, + "grad_norm": 3.0358309745788574, + "learning_rate": 3.553155201819215e-05, + "loss": 5.6294, + "step": 5590 + }, + { + "epoch": 0.3092868988391376, + "grad_norm": 2.842928171157837, + "learning_rate": 3.5517339397384883e-05, + "loss": 5.2573, + "step": 5595 + }, + { + "epoch": 0.3095632946379215, + "grad_norm": 3.203237533569336, + "learning_rate": 3.55031267765776e-05, + "loss": 5.4097, + "step": 5600 + }, + { + "epoch": 0.30983969043670534, + "grad_norm": 3.44087290763855, + "learning_rate": 3.5488914155770325e-05, + "loss": 5.252, + "step": 5605 + }, + { + "epoch": 0.3101160862354892, + "grad_norm": 2.9737708568573, + "learning_rate": 3.547470153496305e-05, + "loss": 5.3748, + "step": 5610 + }, + { + "epoch": 0.31039248203427305, + "grad_norm": 2.6989824771881104, + "learning_rate": 3.546048891415577e-05, + "loss": 5.612, + "step": 5615 + }, + { + "epoch": 0.3106688778330569, + "grad_norm": 3.0308315753936768, + "learning_rate": 3.5446276293348497e-05, + "loss": 5.5512, + "step": 5620 + }, + { + "epoch": 0.31094527363184077, + "grad_norm": 3.383033275604248, + "learning_rate": 3.5432063672541214e-05, + "loss": 5.4498, + "step": 5625 + }, + { + "epoch": 0.3112216694306247, + "grad_norm": 2.5991249084472656, + "learning_rate": 3.5417851051733944e-05, + "loss": 5.3198, + "step": 5630 + }, + { + "epoch": 0.31149806522940854, + "grad_norm": 3.038034439086914, + "learning_rate": 3.540363843092667e-05, + "loss": 5.6306, + "step": 5635 + }, + { + "epoch": 0.3117744610281924, + "grad_norm": 3.4985435009002686, + "learning_rate": 3.5389425810119386e-05, + "loss": 5.7529, + "step": 5640 + }, + { + "epoch": 0.31205085682697625, + "grad_norm": 3.8740835189819336, + "learning_rate": 3.537521318931211e-05, + "loss": 5.5961, + "step": 5645 + }, + { + "epoch": 0.3123272526257601, + "grad_norm": 2.630774974822998, + "learning_rate": 3.5361000568504834e-05, + "loss": 5.4196, + "step": 5650 + }, + { + "epoch": 0.31260364842454397, + "grad_norm": 2.864548683166504, + "learning_rate": 3.534678794769756e-05, + "loss": 5.4804, + "step": 5655 + }, + { + "epoch": 0.3128800442233278, + "grad_norm": 2.944843053817749, + "learning_rate": 3.533257532689028e-05, + "loss": 5.6182, + "step": 5660 + }, + { + "epoch": 0.3131564400221117, + "grad_norm": 2.843414545059204, + "learning_rate": 3.5318362706083006e-05, + "loss": 5.3153, + "step": 5665 + }, + { + "epoch": 0.31343283582089554, + "grad_norm": 2.6151304244995117, + "learning_rate": 3.530415008527573e-05, + "loss": 5.2085, + "step": 5670 + }, + { + "epoch": 0.3137092316196794, + "grad_norm": 3.2855122089385986, + "learning_rate": 3.528993746446845e-05, + "loss": 5.1338, + "step": 5675 + }, + { + "epoch": 0.31398562741846325, + "grad_norm": 2.8010687828063965, + "learning_rate": 3.527572484366117e-05, + "loss": 5.644, + "step": 5680 + }, + { + "epoch": 0.3142620232172471, + "grad_norm": 3.428339958190918, + "learning_rate": 3.52615122228539e-05, + "loss": 5.4467, + "step": 5685 + }, + { + "epoch": 0.31453841901603097, + "grad_norm": 2.4719932079315186, + "learning_rate": 3.524729960204662e-05, + "loss": 5.5976, + "step": 5690 + }, + { + "epoch": 0.3148148148148148, + "grad_norm": 3.2966175079345703, + "learning_rate": 3.523308698123934e-05, + "loss": 5.6092, + "step": 5695 + }, + { + "epoch": 0.3150912106135987, + "grad_norm": 3.152592658996582, + "learning_rate": 3.521887436043207e-05, + "loss": 5.3276, + "step": 5700 + }, + { + "epoch": 0.31536760641238254, + "grad_norm": 2.5936498641967773, + "learning_rate": 3.520466173962479e-05, + "loss": 5.5664, + "step": 5705 + }, + { + "epoch": 0.3156440022111664, + "grad_norm": 3.2857308387756348, + "learning_rate": 3.519044911881751e-05, + "loss": 5.0564, + "step": 5710 + }, + { + "epoch": 0.31592039800995025, + "grad_norm": 3.022956371307373, + "learning_rate": 3.517623649801023e-05, + "loss": 5.3535, + "step": 5715 + }, + { + "epoch": 0.3161967938087341, + "grad_norm": 2.88362455368042, + "learning_rate": 3.516202387720296e-05, + "loss": 5.6187, + "step": 5720 + }, + { + "epoch": 0.31647318960751797, + "grad_norm": 3.379072427749634, + "learning_rate": 3.514781125639568e-05, + "loss": 5.6233, + "step": 5725 + }, + { + "epoch": 0.3167495854063018, + "grad_norm": 2.9026386737823486, + "learning_rate": 3.5133598635588404e-05, + "loss": 5.5723, + "step": 5730 + }, + { + "epoch": 0.3170259812050857, + "grad_norm": 3.1735050678253174, + "learning_rate": 3.511938601478113e-05, + "loss": 5.3733, + "step": 5735 + }, + { + "epoch": 0.31730237700386954, + "grad_norm": 2.6387147903442383, + "learning_rate": 3.510517339397385e-05, + "loss": 5.5433, + "step": 5740 + }, + { + "epoch": 0.3175787728026534, + "grad_norm": 2.5088794231414795, + "learning_rate": 3.5090960773166576e-05, + "loss": 5.3205, + "step": 5745 + }, + { + "epoch": 0.31785516860143725, + "grad_norm": 3.163351535797119, + "learning_rate": 3.507674815235929e-05, + "loss": 5.4311, + "step": 5750 + }, + { + "epoch": 0.3181315644002211, + "grad_norm": 3.1320881843566895, + "learning_rate": 3.5062535531552024e-05, + "loss": 5.3945, + "step": 5755 + }, + { + "epoch": 0.31840796019900497, + "grad_norm": 2.6302387714385986, + "learning_rate": 3.504832291074474e-05, + "loss": 5.2206, + "step": 5760 + }, + { + "epoch": 0.3186843559977888, + "grad_norm": 2.8344478607177734, + "learning_rate": 3.5034110289937465e-05, + "loss": 5.2453, + "step": 5765 + }, + { + "epoch": 0.3189607517965727, + "grad_norm": 3.076878786087036, + "learning_rate": 3.501989766913019e-05, + "loss": 5.6607, + "step": 5770 + }, + { + "epoch": 0.31923714759535654, + "grad_norm": 2.7953310012817383, + "learning_rate": 3.500568504832291e-05, + "loss": 5.4197, + "step": 5775 + }, + { + "epoch": 0.3195135433941404, + "grad_norm": 3.1712965965270996, + "learning_rate": 3.499147242751564e-05, + "loss": 5.5598, + "step": 5780 + }, + { + "epoch": 0.31978993919292426, + "grad_norm": 4.416378021240234, + "learning_rate": 3.4977259806708354e-05, + "loss": 5.5493, + "step": 5785 + }, + { + "epoch": 0.3200663349917081, + "grad_norm": 3.5662577152252197, + "learning_rate": 3.4963047185901085e-05, + "loss": 5.9117, + "step": 5790 + }, + { + "epoch": 0.32034273079049197, + "grad_norm": 3.769775390625, + "learning_rate": 3.494883456509381e-05, + "loss": 5.3641, + "step": 5795 + }, + { + "epoch": 0.3206191265892758, + "grad_norm": 3.322216749191284, + "learning_rate": 3.4934621944286526e-05, + "loss": 5.5554, + "step": 5800 + }, + { + "epoch": 0.3208955223880597, + "grad_norm": 2.9367949962615967, + "learning_rate": 3.492040932347925e-05, + "loss": 5.403, + "step": 5805 + }, + { + "epoch": 0.32117191818684354, + "grad_norm": 2.9416840076446533, + "learning_rate": 3.4906196702671974e-05, + "loss": 5.3599, + "step": 5810 + }, + { + "epoch": 0.3214483139856274, + "grad_norm": 3.0525002479553223, + "learning_rate": 3.48919840818647e-05, + "loss": 5.3231, + "step": 5815 + }, + { + "epoch": 0.32172470978441126, + "grad_norm": 3.840785503387451, + "learning_rate": 3.4877771461057415e-05, + "loss": 5.6919, + "step": 5820 + }, + { + "epoch": 0.3220011055831951, + "grad_norm": 2.738666296005249, + "learning_rate": 3.4863558840250146e-05, + "loss": 5.6685, + "step": 5825 + }, + { + "epoch": 0.32227750138197897, + "grad_norm": 4.050598621368408, + "learning_rate": 3.484934621944287e-05, + "loss": 5.4941, + "step": 5830 + }, + { + "epoch": 0.3225538971807628, + "grad_norm": 3.0634477138519287, + "learning_rate": 3.483513359863559e-05, + "loss": 5.3485, + "step": 5835 + }, + { + "epoch": 0.3228302929795467, + "grad_norm": 3.11482572555542, + "learning_rate": 3.482092097782831e-05, + "loss": 5.1452, + "step": 5840 + }, + { + "epoch": 0.32310668877833054, + "grad_norm": 2.8689663410186768, + "learning_rate": 3.4806708357021035e-05, + "loss": 5.397, + "step": 5845 + }, + { + "epoch": 0.32338308457711445, + "grad_norm": 3.4078924655914307, + "learning_rate": 3.479249573621376e-05, + "loss": 5.6103, + "step": 5850 + }, + { + "epoch": 0.3236594803758983, + "grad_norm": 3.1890652179718018, + "learning_rate": 3.477828311540648e-05, + "loss": 5.482, + "step": 5855 + }, + { + "epoch": 0.32393587617468217, + "grad_norm": 3.211745023727417, + "learning_rate": 3.476407049459921e-05, + "loss": 5.2123, + "step": 5860 + }, + { + "epoch": 0.324212271973466, + "grad_norm": 3.373166799545288, + "learning_rate": 3.474985787379193e-05, + "loss": 5.4259, + "step": 5865 + }, + { + "epoch": 0.3244886677722499, + "grad_norm": 3.0592551231384277, + "learning_rate": 3.473564525298465e-05, + "loss": 5.4406, + "step": 5870 + }, + { + "epoch": 0.32476506357103374, + "grad_norm": 3.850008010864258, + "learning_rate": 3.472143263217737e-05, + "loss": 5.4733, + "step": 5875 + }, + { + "epoch": 0.3250414593698176, + "grad_norm": 3.4215028285980225, + "learning_rate": 3.47072200113701e-05, + "loss": 5.6001, + "step": 5880 + }, + { + "epoch": 0.32531785516860146, + "grad_norm": 2.7632102966308594, + "learning_rate": 3.469300739056282e-05, + "loss": 5.3779, + "step": 5885 + }, + { + "epoch": 0.3255942509673853, + "grad_norm": 2.7702176570892334, + "learning_rate": 3.4678794769755544e-05, + "loss": 5.3826, + "step": 5890 + }, + { + "epoch": 0.32587064676616917, + "grad_norm": 2.870290517807007, + "learning_rate": 3.466458214894827e-05, + "loss": 5.4052, + "step": 5895 + }, + { + "epoch": 0.326147042564953, + "grad_norm": 3.039081573486328, + "learning_rate": 3.465036952814099e-05, + "loss": 5.2297, + "step": 5900 + }, + { + "epoch": 0.3264234383637369, + "grad_norm": 3.192931890487671, + "learning_rate": 3.4636156907333716e-05, + "loss": 5.424, + "step": 5905 + }, + { + "epoch": 0.32669983416252074, + "grad_norm": 2.7073121070861816, + "learning_rate": 3.462194428652644e-05, + "loss": 5.4958, + "step": 5910 + }, + { + "epoch": 0.3269762299613046, + "grad_norm": 3.2365293502807617, + "learning_rate": 3.4607731665719164e-05, + "loss": 5.099, + "step": 5915 + }, + { + "epoch": 0.32725262576008846, + "grad_norm": 3.3595356941223145, + "learning_rate": 3.459351904491188e-05, + "loss": 5.5425, + "step": 5920 + }, + { + "epoch": 0.3275290215588723, + "grad_norm": 2.8691320419311523, + "learning_rate": 3.4579306424104605e-05, + "loss": 5.4369, + "step": 5925 + }, + { + "epoch": 0.32780541735765617, + "grad_norm": 3.031155586242676, + "learning_rate": 3.456509380329733e-05, + "loss": 5.4964, + "step": 5930 + }, + { + "epoch": 0.32808181315644, + "grad_norm": 2.736032247543335, + "learning_rate": 3.455088118249005e-05, + "loss": 5.3402, + "step": 5935 + }, + { + "epoch": 0.3283582089552239, + "grad_norm": 3.4750332832336426, + "learning_rate": 3.453666856168278e-05, + "loss": 5.2481, + "step": 5940 + }, + { + "epoch": 0.32863460475400774, + "grad_norm": 2.969907522201538, + "learning_rate": 3.45224559408755e-05, + "loss": 5.2518, + "step": 5945 + }, + { + "epoch": 0.3289110005527916, + "grad_norm": 3.4692533016204834, + "learning_rate": 3.4508243320068225e-05, + "loss": 5.7448, + "step": 5950 + }, + { + "epoch": 0.32918739635157546, + "grad_norm": 2.769881010055542, + "learning_rate": 3.449403069926094e-05, + "loss": 5.5949, + "step": 5955 + }, + { + "epoch": 0.3294637921503593, + "grad_norm": 2.9238779544830322, + "learning_rate": 3.4479818078453666e-05, + "loss": 5.495, + "step": 5960 + }, + { + "epoch": 0.32974018794914317, + "grad_norm": 3.0299391746520996, + "learning_rate": 3.44656054576464e-05, + "loss": 5.2859, + "step": 5965 + }, + { + "epoch": 0.33001658374792703, + "grad_norm": 3.4507875442504883, + "learning_rate": 3.4451392836839114e-05, + "loss": 5.1926, + "step": 5970 + }, + { + "epoch": 0.3302929795467109, + "grad_norm": 3.147733688354492, + "learning_rate": 3.443718021603184e-05, + "loss": 5.4821, + "step": 5975 + }, + { + "epoch": 0.33056937534549474, + "grad_norm": 2.713547945022583, + "learning_rate": 3.442296759522456e-05, + "loss": 5.3562, + "step": 5980 + }, + { + "epoch": 0.3308457711442786, + "grad_norm": 3.38211727142334, + "learning_rate": 3.4408754974417286e-05, + "loss": 5.69, + "step": 5985 + }, + { + "epoch": 0.33112216694306246, + "grad_norm": 3.258375644683838, + "learning_rate": 3.439454235361001e-05, + "loss": 5.45, + "step": 5990 + }, + { + "epoch": 0.3313985627418463, + "grad_norm": 3.048330307006836, + "learning_rate": 3.438032973280273e-05, + "loss": 5.2408, + "step": 5995 + }, + { + "epoch": 0.33167495854063017, + "grad_norm": 3.6864914894104004, + "learning_rate": 3.436611711199546e-05, + "loss": 5.5333, + "step": 6000 + }, + { + "epoch": 0.33195135433941403, + "grad_norm": 3.327286958694458, + "learning_rate": 3.4351904491188175e-05, + "loss": 5.4176, + "step": 6005 + }, + { + "epoch": 0.3322277501381979, + "grad_norm": 2.82293438911438, + "learning_rate": 3.43376918703809e-05, + "loss": 5.7049, + "step": 6010 + }, + { + "epoch": 0.33250414593698174, + "grad_norm": 2.9626595973968506, + "learning_rate": 3.432347924957362e-05, + "loss": 5.7763, + "step": 6015 + }, + { + "epoch": 0.3327805417357656, + "grad_norm": 3.8101632595062256, + "learning_rate": 3.430926662876635e-05, + "loss": 5.1365, + "step": 6020 + }, + { + "epoch": 0.33305693753454946, + "grad_norm": 3.6021718978881836, + "learning_rate": 3.429505400795907e-05, + "loss": 5.3274, + "step": 6025 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.758277654647827, + "learning_rate": 3.428084138715179e-05, + "loss": 5.4596, + "step": 6030 + }, + { + "epoch": 0.33360972913211717, + "grad_norm": 3.2654664516448975, + "learning_rate": 3.426662876634452e-05, + "loss": 5.4182, + "step": 6035 + }, + { + "epoch": 0.33388612493090103, + "grad_norm": 3.1895666122436523, + "learning_rate": 3.4252416145537236e-05, + "loss": 5.4077, + "step": 6040 + }, + { + "epoch": 0.3341625207296849, + "grad_norm": 2.5739054679870605, + "learning_rate": 3.423820352472996e-05, + "loss": 5.4277, + "step": 6045 + }, + { + "epoch": 0.33443891652846874, + "grad_norm": 2.6572439670562744, + "learning_rate": 3.4223990903922684e-05, + "loss": 5.7119, + "step": 6050 + }, + { + "epoch": 0.3347153123272526, + "grad_norm": 3.490938663482666, + "learning_rate": 3.420977828311541e-05, + "loss": 5.3248, + "step": 6055 + }, + { + "epoch": 0.33499170812603646, + "grad_norm": 3.159533977508545, + "learning_rate": 3.419556566230813e-05, + "loss": 5.3683, + "step": 6060 + }, + { + "epoch": 0.3352681039248203, + "grad_norm": 2.4630796909332275, + "learning_rate": 3.418135304150085e-05, + "loss": 5.6767, + "step": 6065 + }, + { + "epoch": 0.33554449972360423, + "grad_norm": 2.582796812057495, + "learning_rate": 3.416714042069358e-05, + "loss": 5.3356, + "step": 6070 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 2.7957217693328857, + "learning_rate": 3.4152927799886304e-05, + "loss": 5.3454, + "step": 6075 + }, + { + "epoch": 0.33609729132117194, + "grad_norm": 2.7028167247772217, + "learning_rate": 3.413871517907902e-05, + "loss": 5.0287, + "step": 6080 + }, + { + "epoch": 0.3363736871199558, + "grad_norm": 2.7556872367858887, + "learning_rate": 3.4124502558271745e-05, + "loss": 5.5228, + "step": 6085 + }, + { + "epoch": 0.33665008291873966, + "grad_norm": 3.9431374073028564, + "learning_rate": 3.411028993746447e-05, + "loss": 5.4459, + "step": 6090 + }, + { + "epoch": 0.3369264787175235, + "grad_norm": 3.6302506923675537, + "learning_rate": 3.409607731665719e-05, + "loss": 5.7821, + "step": 6095 + }, + { + "epoch": 0.33720287451630737, + "grad_norm": 2.845989227294922, + "learning_rate": 3.408186469584992e-05, + "loss": 5.214, + "step": 6100 + }, + { + "epoch": 0.33747927031509123, + "grad_norm": 2.877218008041382, + "learning_rate": 3.406765207504264e-05, + "loss": 5.2843, + "step": 6105 + }, + { + "epoch": 0.3377556661138751, + "grad_norm": 2.90212082862854, + "learning_rate": 3.4053439454235365e-05, + "loss": 5.1521, + "step": 6110 + }, + { + "epoch": 0.33803206191265894, + "grad_norm": 3.701557159423828, + "learning_rate": 3.403922683342808e-05, + "loss": 5.4906, + "step": 6115 + }, + { + "epoch": 0.3383084577114428, + "grad_norm": 2.650724411010742, + "learning_rate": 3.4025014212620806e-05, + "loss": 5.339, + "step": 6120 + }, + { + "epoch": 0.33858485351022666, + "grad_norm": 3.82738995552063, + "learning_rate": 3.401080159181354e-05, + "loss": 5.489, + "step": 6125 + }, + { + "epoch": 0.3388612493090105, + "grad_norm": 3.1155989170074463, + "learning_rate": 3.3996588971006254e-05, + "loss": 4.9967, + "step": 6130 + }, + { + "epoch": 0.33913764510779437, + "grad_norm": 2.9814980030059814, + "learning_rate": 3.398237635019898e-05, + "loss": 5.3042, + "step": 6135 + }, + { + "epoch": 0.33941404090657823, + "grad_norm": 4.761054515838623, + "learning_rate": 3.39681637293917e-05, + "loss": 5.4005, + "step": 6140 + }, + { + "epoch": 0.3396904367053621, + "grad_norm": 2.9018607139587402, + "learning_rate": 3.3953951108584426e-05, + "loss": 5.2394, + "step": 6145 + }, + { + "epoch": 0.33996683250414594, + "grad_norm": 2.923269748687744, + "learning_rate": 3.3939738487777143e-05, + "loss": 5.3934, + "step": 6150 + }, + { + "epoch": 0.3402432283029298, + "grad_norm": 3.082612991333008, + "learning_rate": 3.392552586696987e-05, + "loss": 5.3588, + "step": 6155 + }, + { + "epoch": 0.34051962410171366, + "grad_norm": 2.4649646282196045, + "learning_rate": 3.39113132461626e-05, + "loss": 4.7427, + "step": 6160 + }, + { + "epoch": 0.3407960199004975, + "grad_norm": 4.229916095733643, + "learning_rate": 3.3897100625355315e-05, + "loss": 5.5868, + "step": 6165 + }, + { + "epoch": 0.3410724156992814, + "grad_norm": 2.851255416870117, + "learning_rate": 3.388288800454804e-05, + "loss": 5.097, + "step": 6170 + }, + { + "epoch": 0.34134881149806523, + "grad_norm": 3.1447596549987793, + "learning_rate": 3.386867538374076e-05, + "loss": 5.5086, + "step": 6175 + }, + { + "epoch": 0.3416252072968491, + "grad_norm": 3.191239833831787, + "learning_rate": 3.385446276293349e-05, + "loss": 5.7917, + "step": 6180 + }, + { + "epoch": 0.34190160309563294, + "grad_norm": 3.4441893100738525, + "learning_rate": 3.384025014212621e-05, + "loss": 5.2703, + "step": 6185 + }, + { + "epoch": 0.3421779988944168, + "grad_norm": 2.9748077392578125, + "learning_rate": 3.382603752131893e-05, + "loss": 5.4914, + "step": 6190 + }, + { + "epoch": 0.34245439469320066, + "grad_norm": 2.859182834625244, + "learning_rate": 3.381182490051166e-05, + "loss": 5.1884, + "step": 6195 + }, + { + "epoch": 0.3427307904919845, + "grad_norm": 3.529243230819702, + "learning_rate": 3.3797612279704376e-05, + "loss": 5.2572, + "step": 6200 + }, + { + "epoch": 0.3430071862907684, + "grad_norm": 3.0292012691497803, + "learning_rate": 3.37833996588971e-05, + "loss": 5.4534, + "step": 6205 + }, + { + "epoch": 0.34328358208955223, + "grad_norm": 2.9836554527282715, + "learning_rate": 3.376918703808983e-05, + "loss": 5.3841, + "step": 6210 + }, + { + "epoch": 0.3435599778883361, + "grad_norm": 2.9530041217803955, + "learning_rate": 3.375497441728255e-05, + "loss": 5.4718, + "step": 6215 + }, + { + "epoch": 0.34383637368711994, + "grad_norm": 4.38737678527832, + "learning_rate": 3.374076179647527e-05, + "loss": 5.5249, + "step": 6220 + }, + { + "epoch": 0.3441127694859038, + "grad_norm": 2.9150431156158447, + "learning_rate": 3.3726549175667996e-05, + "loss": 5.4543, + "step": 6225 + }, + { + "epoch": 0.34438916528468766, + "grad_norm": 2.641899824142456, + "learning_rate": 3.371233655486072e-05, + "loss": 5.3053, + "step": 6230 + }, + { + "epoch": 0.3446655610834715, + "grad_norm": 2.8919003009796143, + "learning_rate": 3.3698123934053444e-05, + "loss": 5.5736, + "step": 6235 + }, + { + "epoch": 0.3449419568822554, + "grad_norm": 4.31245756149292, + "learning_rate": 3.368391131324616e-05, + "loss": 5.4375, + "step": 6240 + }, + { + "epoch": 0.34521835268103923, + "grad_norm": 2.8349204063415527, + "learning_rate": 3.366969869243889e-05, + "loss": 5.4986, + "step": 6245 + }, + { + "epoch": 0.3454947484798231, + "grad_norm": 3.073392629623413, + "learning_rate": 3.365548607163161e-05, + "loss": 5.345, + "step": 6250 + }, + { + "epoch": 0.34577114427860695, + "grad_norm": 3.0366547107696533, + "learning_rate": 3.3641273450824333e-05, + "loss": 5.484, + "step": 6255 + }, + { + "epoch": 0.3460475400773908, + "grad_norm": 2.6022896766662598, + "learning_rate": 3.362706083001706e-05, + "loss": 5.1542, + "step": 6260 + }, + { + "epoch": 0.34632393587617466, + "grad_norm": 2.9307637214660645, + "learning_rate": 3.361284820920978e-05, + "loss": 5.3493, + "step": 6265 + }, + { + "epoch": 0.3466003316749585, + "grad_norm": 3.0827174186706543, + "learning_rate": 3.3598635588402505e-05, + "loss": 5.039, + "step": 6270 + }, + { + "epoch": 0.3468767274737424, + "grad_norm": 3.8074021339416504, + "learning_rate": 3.358442296759522e-05, + "loss": 5.382, + "step": 6275 + }, + { + "epoch": 0.34715312327252623, + "grad_norm": 3.806523561477661, + "learning_rate": 3.357021034678795e-05, + "loss": 5.5725, + "step": 6280 + }, + { + "epoch": 0.3474295190713101, + "grad_norm": 2.643564224243164, + "learning_rate": 3.355599772598067e-05, + "loss": 5.5118, + "step": 6285 + }, + { + "epoch": 0.347705914870094, + "grad_norm": 3.194028377532959, + "learning_rate": 3.3541785105173395e-05, + "loss": 5.5817, + "step": 6290 + }, + { + "epoch": 0.34798231066887786, + "grad_norm": 2.6427788734436035, + "learning_rate": 3.352757248436612e-05, + "loss": 5.3618, + "step": 6295 + }, + { + "epoch": 0.3482587064676617, + "grad_norm": 2.87471604347229, + "learning_rate": 3.351335986355884e-05, + "loss": 5.3261, + "step": 6300 + }, + { + "epoch": 0.3485351022664456, + "grad_norm": 3.181290864944458, + "learning_rate": 3.3499147242751566e-05, + "loss": 5.2144, + "step": 6305 + }, + { + "epoch": 0.34881149806522943, + "grad_norm": 3.502073049545288, + "learning_rate": 3.3484934621944284e-05, + "loss": 5.4402, + "step": 6310 + }, + { + "epoch": 0.3490878938640133, + "grad_norm": 3.2654974460601807, + "learning_rate": 3.3470722001137014e-05, + "loss": 5.5757, + "step": 6315 + }, + { + "epoch": 0.34936428966279715, + "grad_norm": 2.6679444313049316, + "learning_rate": 3.345650938032974e-05, + "loss": 5.2295, + "step": 6320 + }, + { + "epoch": 0.349640685461581, + "grad_norm": 3.1147098541259766, + "learning_rate": 3.3442296759522456e-05, + "loss": 5.3529, + "step": 6325 + }, + { + "epoch": 0.34991708126036486, + "grad_norm": 2.5592527389526367, + "learning_rate": 3.342808413871518e-05, + "loss": 5.1588, + "step": 6330 + }, + { + "epoch": 0.3501934770591487, + "grad_norm": 2.633533000946045, + "learning_rate": 3.3413871517907904e-05, + "loss": 5.4131, + "step": 6335 + }, + { + "epoch": 0.3504698728579326, + "grad_norm": 3.3299381732940674, + "learning_rate": 3.339965889710063e-05, + "loss": 5.1725, + "step": 6340 + }, + { + "epoch": 0.35074626865671643, + "grad_norm": 2.768136978149414, + "learning_rate": 3.338544627629335e-05, + "loss": 5.6988, + "step": 6345 + }, + { + "epoch": 0.3510226644555003, + "grad_norm": 3.701432228088379, + "learning_rate": 3.3371233655486076e-05, + "loss": 5.5261, + "step": 6350 + }, + { + "epoch": 0.35129906025428415, + "grad_norm": 4.025837421417236, + "learning_rate": 3.33570210346788e-05, + "loss": 5.1812, + "step": 6355 + }, + { + "epoch": 0.351575456053068, + "grad_norm": 2.9328114986419678, + "learning_rate": 3.334280841387152e-05, + "loss": 5.3862, + "step": 6360 + }, + { + "epoch": 0.35185185185185186, + "grad_norm": 2.7843148708343506, + "learning_rate": 3.332859579306424e-05, + "loss": 5.6493, + "step": 6365 + }, + { + "epoch": 0.3521282476506357, + "grad_norm": 3.647606134414673, + "learning_rate": 3.3314383172256965e-05, + "loss": 5.7016, + "step": 6370 + }, + { + "epoch": 0.3524046434494196, + "grad_norm": 2.85952091217041, + "learning_rate": 3.330017055144969e-05, + "loss": 5.287, + "step": 6375 + }, + { + "epoch": 0.35268103924820343, + "grad_norm": 2.931126356124878, + "learning_rate": 3.328595793064241e-05, + "loss": 5.2523, + "step": 6380 + }, + { + "epoch": 0.3529574350469873, + "grad_norm": 3.174170970916748, + "learning_rate": 3.3271745309835137e-05, + "loss": 5.1911, + "step": 6385 + }, + { + "epoch": 0.35323383084577115, + "grad_norm": 3.573742389678955, + "learning_rate": 3.325753268902786e-05, + "loss": 4.888, + "step": 6390 + }, + { + "epoch": 0.353510226644555, + "grad_norm": 2.854358673095703, + "learning_rate": 3.324332006822058e-05, + "loss": 5.3531, + "step": 6395 + }, + { + "epoch": 0.35378662244333886, + "grad_norm": 3.4788529872894287, + "learning_rate": 3.32291074474133e-05, + "loss": 5.5047, + "step": 6400 + }, + { + "epoch": 0.3540630182421227, + "grad_norm": 3.1905970573425293, + "learning_rate": 3.321489482660603e-05, + "loss": 5.311, + "step": 6405 + }, + { + "epoch": 0.3543394140409066, + "grad_norm": 3.261735439300537, + "learning_rate": 3.320068220579875e-05, + "loss": 5.3405, + "step": 6410 + }, + { + "epoch": 0.35461580983969043, + "grad_norm": 3.806363344192505, + "learning_rate": 3.3186469584991474e-05, + "loss": 5.2041, + "step": 6415 + }, + { + "epoch": 0.3548922056384743, + "grad_norm": 3.2285566329956055, + "learning_rate": 3.31722569641842e-05, + "loss": 5.2239, + "step": 6420 + }, + { + "epoch": 0.35516860143725815, + "grad_norm": 3.194967269897461, + "learning_rate": 3.315804434337692e-05, + "loss": 5.4333, + "step": 6425 + }, + { + "epoch": 0.355444997236042, + "grad_norm": 3.9801723957061768, + "learning_rate": 3.3143831722569646e-05, + "loss": 5.4375, + "step": 6430 + }, + { + "epoch": 0.35572139303482586, + "grad_norm": 2.690793514251709, + "learning_rate": 3.312961910176236e-05, + "loss": 5.3818, + "step": 6435 + }, + { + "epoch": 0.3559977888336097, + "grad_norm": 3.140059232711792, + "learning_rate": 3.3115406480955094e-05, + "loss": 5.2608, + "step": 6440 + }, + { + "epoch": 0.3562741846323936, + "grad_norm": 3.5246164798736572, + "learning_rate": 3.310119386014781e-05, + "loss": 5.4467, + "step": 6445 + }, + { + "epoch": 0.35655058043117743, + "grad_norm": 3.004757881164551, + "learning_rate": 3.3086981239340535e-05, + "loss": 5.6051, + "step": 6450 + }, + { + "epoch": 0.3568269762299613, + "grad_norm": 2.6584270000457764, + "learning_rate": 3.307276861853326e-05, + "loss": 5.4181, + "step": 6455 + }, + { + "epoch": 0.35710337202874515, + "grad_norm": 2.8345117568969727, + "learning_rate": 3.305855599772598e-05, + "loss": 5.5933, + "step": 6460 + }, + { + "epoch": 0.357379767827529, + "grad_norm": 2.8853676319122314, + "learning_rate": 3.304434337691871e-05, + "loss": 5.1924, + "step": 6465 + }, + { + "epoch": 0.35765616362631286, + "grad_norm": 2.9934706687927246, + "learning_rate": 3.3030130756111424e-05, + "loss": 5.5229, + "step": 6470 + }, + { + "epoch": 0.3579325594250967, + "grad_norm": 3.472885847091675, + "learning_rate": 3.3015918135304155e-05, + "loss": 5.6357, + "step": 6475 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 3.2652275562286377, + "learning_rate": 3.300170551449687e-05, + "loss": 5.2829, + "step": 6480 + }, + { + "epoch": 0.35848535102266443, + "grad_norm": 3.120035171508789, + "learning_rate": 3.2987492893689596e-05, + "loss": 5.3109, + "step": 6485 + }, + { + "epoch": 0.3587617468214483, + "grad_norm": 2.817810297012329, + "learning_rate": 3.297328027288232e-05, + "loss": 5.243, + "step": 6490 + }, + { + "epoch": 0.35903814262023215, + "grad_norm": 2.679710626602173, + "learning_rate": 3.2959067652075044e-05, + "loss": 5.6024, + "step": 6495 + }, + { + "epoch": 0.359314538419016, + "grad_norm": 3.362010955810547, + "learning_rate": 3.294485503126777e-05, + "loss": 5.3039, + "step": 6500 + }, + { + "epoch": 0.35959093421779986, + "grad_norm": 3.8571290969848633, + "learning_rate": 3.2930642410460485e-05, + "loss": 5.4973, + "step": 6505 + }, + { + "epoch": 0.3598673300165838, + "grad_norm": 3.4675230979919434, + "learning_rate": 3.2916429789653216e-05, + "loss": 5.2216, + "step": 6510 + }, + { + "epoch": 0.36014372581536763, + "grad_norm": 2.4890525341033936, + "learning_rate": 3.290221716884594e-05, + "loss": 5.2977, + "step": 6515 + }, + { + "epoch": 0.3604201216141515, + "grad_norm": 4.0375518798828125, + "learning_rate": 3.288800454803866e-05, + "loss": 5.5202, + "step": 6520 + }, + { + "epoch": 0.36069651741293535, + "grad_norm": 3.2754342555999756, + "learning_rate": 3.287379192723139e-05, + "loss": 5.082, + "step": 6525 + }, + { + "epoch": 0.3609729132117192, + "grad_norm": 2.7093911170959473, + "learning_rate": 3.2859579306424105e-05, + "loss": 5.4033, + "step": 6530 + }, + { + "epoch": 0.36124930901050306, + "grad_norm": 4.006176948547363, + "learning_rate": 3.284536668561683e-05, + "loss": 5.5329, + "step": 6535 + }, + { + "epoch": 0.3615257048092869, + "grad_norm": 4.503550052642822, + "learning_rate": 3.283115406480955e-05, + "loss": 5.3618, + "step": 6540 + }, + { + "epoch": 0.3618021006080708, + "grad_norm": 3.1193487644195557, + "learning_rate": 3.281694144400228e-05, + "loss": 5.3315, + "step": 6545 + }, + { + "epoch": 0.36207849640685463, + "grad_norm": 2.99080228805542, + "learning_rate": 3.2802728823195e-05, + "loss": 5.4393, + "step": 6550 + }, + { + "epoch": 0.3623548922056385, + "grad_norm": 3.051684617996216, + "learning_rate": 3.278851620238772e-05, + "loss": 5.2381, + "step": 6555 + }, + { + "epoch": 0.36263128800442235, + "grad_norm": 3.2539477348327637, + "learning_rate": 3.277430358158045e-05, + "loss": 5.1091, + "step": 6560 + }, + { + "epoch": 0.3629076838032062, + "grad_norm": 2.8935561180114746, + "learning_rate": 3.2760090960773166e-05, + "loss": 5.3882, + "step": 6565 + }, + { + "epoch": 0.36318407960199006, + "grad_norm": 3.101651430130005, + "learning_rate": 3.274587833996589e-05, + "loss": 5.5792, + "step": 6570 + }, + { + "epoch": 0.3634604754007739, + "grad_norm": 2.6292476654052734, + "learning_rate": 3.2731665719158614e-05, + "loss": 5.1631, + "step": 6575 + }, + { + "epoch": 0.3637368711995578, + "grad_norm": 3.1118204593658447, + "learning_rate": 3.271745309835134e-05, + "loss": 5.7077, + "step": 6580 + }, + { + "epoch": 0.36401326699834163, + "grad_norm": 3.469219207763672, + "learning_rate": 3.270324047754406e-05, + "loss": 5.4348, + "step": 6585 + }, + { + "epoch": 0.3642896627971255, + "grad_norm": 3.5681636333465576, + "learning_rate": 3.268902785673678e-05, + "loss": 5.1626, + "step": 6590 + }, + { + "epoch": 0.36456605859590935, + "grad_norm": 3.1905977725982666, + "learning_rate": 3.267481523592951e-05, + "loss": 5.0887, + "step": 6595 + }, + { + "epoch": 0.3648424543946932, + "grad_norm": 3.0895135402679443, + "learning_rate": 3.2660602615122234e-05, + "loss": 5.3645, + "step": 6600 + }, + { + "epoch": 0.36511885019347706, + "grad_norm": 2.88983154296875, + "learning_rate": 3.264638999431495e-05, + "loss": 5.0051, + "step": 6605 + }, + { + "epoch": 0.3653952459922609, + "grad_norm": 2.7491235733032227, + "learning_rate": 3.2632177373507675e-05, + "loss": 5.4856, + "step": 6610 + }, + { + "epoch": 0.3656716417910448, + "grad_norm": 2.5615084171295166, + "learning_rate": 3.26179647527004e-05, + "loss": 5.5509, + "step": 6615 + }, + { + "epoch": 0.36594803758982863, + "grad_norm": 3.6004726886749268, + "learning_rate": 3.260375213189312e-05, + "loss": 4.955, + "step": 6620 + }, + { + "epoch": 0.3662244333886125, + "grad_norm": 3.0466911792755127, + "learning_rate": 3.258953951108585e-05, + "loss": 5.3448, + "step": 6625 + }, + { + "epoch": 0.36650082918739635, + "grad_norm": 3.489597797393799, + "learning_rate": 3.257532689027857e-05, + "loss": 5.239, + "step": 6630 + }, + { + "epoch": 0.3667772249861802, + "grad_norm": 3.6642837524414062, + "learning_rate": 3.2561114269471295e-05, + "loss": 5.7073, + "step": 6635 + }, + { + "epoch": 0.36705362078496406, + "grad_norm": 3.3257694244384766, + "learning_rate": 3.254690164866401e-05, + "loss": 5.5338, + "step": 6640 + }, + { + "epoch": 0.3673300165837479, + "grad_norm": 2.889758825302124, + "learning_rate": 3.2532689027856736e-05, + "loss": 5.4763, + "step": 6645 + }, + { + "epoch": 0.3676064123825318, + "grad_norm": 3.0709822177886963, + "learning_rate": 3.251847640704947e-05, + "loss": 5.4343, + "step": 6650 + }, + { + "epoch": 0.36788280818131563, + "grad_norm": 3.2317843437194824, + "learning_rate": 3.2504263786242184e-05, + "loss": 5.48, + "step": 6655 + }, + { + "epoch": 0.3681592039800995, + "grad_norm": 2.843003034591675, + "learning_rate": 3.249005116543491e-05, + "loss": 5.3191, + "step": 6660 + }, + { + "epoch": 0.36843559977888335, + "grad_norm": 2.82918119430542, + "learning_rate": 3.247583854462763e-05, + "loss": 5.2936, + "step": 6665 + }, + { + "epoch": 0.3687119955776672, + "grad_norm": 3.3583202362060547, + "learning_rate": 3.2461625923820356e-05, + "loss": 5.5645, + "step": 6670 + }, + { + "epoch": 0.36898839137645106, + "grad_norm": 3.859200954437256, + "learning_rate": 3.244741330301307e-05, + "loss": 5.3669, + "step": 6675 + }, + { + "epoch": 0.3692647871752349, + "grad_norm": 2.8152244091033936, + "learning_rate": 3.24332006822058e-05, + "loss": 5.3654, + "step": 6680 + }, + { + "epoch": 0.3695411829740188, + "grad_norm": 2.6449549198150635, + "learning_rate": 3.241898806139853e-05, + "loss": 5.3025, + "step": 6685 + }, + { + "epoch": 0.36981757877280264, + "grad_norm": 2.791011095046997, + "learning_rate": 3.2404775440591245e-05, + "loss": 5.2969, + "step": 6690 + }, + { + "epoch": 0.3700939745715865, + "grad_norm": 3.3841540813446045, + "learning_rate": 3.239056281978397e-05, + "loss": 5.2648, + "step": 6695 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 2.6592512130737305, + "learning_rate": 3.237635019897669e-05, + "loss": 5.0054, + "step": 6700 + }, + { + "epoch": 0.3706467661691542, + "grad_norm": 3.7593088150024414, + "learning_rate": 3.236213757816942e-05, + "loss": 5.3497, + "step": 6705 + }, + { + "epoch": 0.37092316196793806, + "grad_norm": 3.4678800106048584, + "learning_rate": 3.234792495736214e-05, + "loss": 5.5969, + "step": 6710 + }, + { + "epoch": 0.3711995577667219, + "grad_norm": 2.172253370285034, + "learning_rate": 3.233371233655486e-05, + "loss": 5.2902, + "step": 6715 + }, + { + "epoch": 0.3714759535655058, + "grad_norm": 3.2627487182617188, + "learning_rate": 3.231949971574759e-05, + "loss": 5.5915, + "step": 6720 + }, + { + "epoch": 0.37175234936428964, + "grad_norm": 3.7664973735809326, + "learning_rate": 3.2305287094940306e-05, + "loss": 5.2726, + "step": 6725 + }, + { + "epoch": 0.37202874516307355, + "grad_norm": 4.006364345550537, + "learning_rate": 3.229107447413303e-05, + "loss": 5.1299, + "step": 6730 + }, + { + "epoch": 0.3723051409618574, + "grad_norm": 3.697758436203003, + "learning_rate": 3.2276861853325754e-05, + "loss": 5.4359, + "step": 6735 + }, + { + "epoch": 0.37258153676064126, + "grad_norm": 3.6661410331726074, + "learning_rate": 3.226264923251848e-05, + "loss": 5.3954, + "step": 6740 + }, + { + "epoch": 0.3728579325594251, + "grad_norm": 2.565558910369873, + "learning_rate": 3.22484366117112e-05, + "loss": 5.2579, + "step": 6745 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 4.1350483894348145, + "learning_rate": 3.223422399090392e-05, + "loss": 4.9195, + "step": 6750 + }, + { + "epoch": 0.37341072415699283, + "grad_norm": 2.775804042816162, + "learning_rate": 3.222001137009665e-05, + "loss": 5.2634, + "step": 6755 + }, + { + "epoch": 0.3736871199557767, + "grad_norm": 3.0823562145233154, + "learning_rate": 3.2205798749289374e-05, + "loss": 5.2957, + "step": 6760 + }, + { + "epoch": 0.37396351575456055, + "grad_norm": 2.7218778133392334, + "learning_rate": 3.219158612848209e-05, + "loss": 5.2254, + "step": 6765 + }, + { + "epoch": 0.3742399115533444, + "grad_norm": 2.774827718734741, + "learning_rate": 3.2177373507674815e-05, + "loss": 5.5636, + "step": 6770 + }, + { + "epoch": 0.37451630735212826, + "grad_norm": 3.3417789936065674, + "learning_rate": 3.216316088686754e-05, + "loss": 5.0876, + "step": 6775 + }, + { + "epoch": 0.3747927031509121, + "grad_norm": 2.745445489883423, + "learning_rate": 3.214894826606026e-05, + "loss": 5.5064, + "step": 6780 + }, + { + "epoch": 0.375069098949696, + "grad_norm": 3.0469281673431396, + "learning_rate": 3.213473564525298e-05, + "loss": 5.238, + "step": 6785 + }, + { + "epoch": 0.37534549474847984, + "grad_norm": 3.1263833045959473, + "learning_rate": 3.212052302444571e-05, + "loss": 5.1806, + "step": 6790 + }, + { + "epoch": 0.3756218905472637, + "grad_norm": 3.453270435333252, + "learning_rate": 3.2106310403638435e-05, + "loss": 5.2468, + "step": 6795 + }, + { + "epoch": 0.37589828634604755, + "grad_norm": 4.632005214691162, + "learning_rate": 3.209209778283115e-05, + "loss": 5.4471, + "step": 6800 + }, + { + "epoch": 0.3761746821448314, + "grad_norm": 3.877882242202759, + "learning_rate": 3.2077885162023876e-05, + "loss": 5.5113, + "step": 6805 + }, + { + "epoch": 0.37645107794361526, + "grad_norm": 3.9937760829925537, + "learning_rate": 3.20636725412166e-05, + "loss": 5.4494, + "step": 6810 + }, + { + "epoch": 0.3767274737423991, + "grad_norm": 2.7250454425811768, + "learning_rate": 3.2049459920409324e-05, + "loss": 5.1192, + "step": 6815 + }, + { + "epoch": 0.377003869541183, + "grad_norm": 2.984262466430664, + "learning_rate": 3.203524729960205e-05, + "loss": 5.1072, + "step": 6820 + }, + { + "epoch": 0.37728026533996684, + "grad_norm": 3.4100584983825684, + "learning_rate": 3.202103467879477e-05, + "loss": 5.2542, + "step": 6825 + }, + { + "epoch": 0.3775566611387507, + "grad_norm": 3.297354221343994, + "learning_rate": 3.2006822057987496e-05, + "loss": 5.2157, + "step": 6830 + }, + { + "epoch": 0.37783305693753455, + "grad_norm": 3.0777430534362793, + "learning_rate": 3.1992609437180213e-05, + "loss": 5.516, + "step": 6835 + }, + { + "epoch": 0.3781094527363184, + "grad_norm": 3.4484028816223145, + "learning_rate": 3.1978396816372944e-05, + "loss": 5.5581, + "step": 6840 + }, + { + "epoch": 0.37838584853510226, + "grad_norm": 3.6601834297180176, + "learning_rate": 3.196418419556567e-05, + "loss": 5.2055, + "step": 6845 + }, + { + "epoch": 0.3786622443338861, + "grad_norm": 3.5895004272460938, + "learning_rate": 3.1949971574758385e-05, + "loss": 5.1609, + "step": 6850 + }, + { + "epoch": 0.37893864013267, + "grad_norm": 2.961439371109009, + "learning_rate": 3.193575895395111e-05, + "loss": 5.4183, + "step": 6855 + }, + { + "epoch": 0.37921503593145384, + "grad_norm": 4.192780494689941, + "learning_rate": 3.192154633314383e-05, + "loss": 5.4031, + "step": 6860 + }, + { + "epoch": 0.3794914317302377, + "grad_norm": 3.4532811641693115, + "learning_rate": 3.190733371233656e-05, + "loss": 5.3026, + "step": 6865 + }, + { + "epoch": 0.37976782752902155, + "grad_norm": 3.2710492610931396, + "learning_rate": 3.189312109152928e-05, + "loss": 5.061, + "step": 6870 + }, + { + "epoch": 0.3800442233278054, + "grad_norm": 3.0584661960601807, + "learning_rate": 3.1878908470722005e-05, + "loss": 5.2326, + "step": 6875 + }, + { + "epoch": 0.38032061912658927, + "grad_norm": 2.271395206451416, + "learning_rate": 3.186469584991473e-05, + "loss": 4.9213, + "step": 6880 + }, + { + "epoch": 0.3805970149253731, + "grad_norm": 3.3883402347564697, + "learning_rate": 3.1850483229107446e-05, + "loss": 5.1073, + "step": 6885 + }, + { + "epoch": 0.380873410724157, + "grad_norm": 3.4375452995300293, + "learning_rate": 3.183627060830017e-05, + "loss": 5.5934, + "step": 6890 + }, + { + "epoch": 0.38114980652294084, + "grad_norm": 2.793851137161255, + "learning_rate": 3.1822057987492894e-05, + "loss": 5.0503, + "step": 6895 + }, + { + "epoch": 0.3814262023217247, + "grad_norm": 3.7272047996520996, + "learning_rate": 3.180784536668562e-05, + "loss": 5.3093, + "step": 6900 + }, + { + "epoch": 0.38170259812050855, + "grad_norm": 3.7055723667144775, + "learning_rate": 3.179363274587834e-05, + "loss": 5.3679, + "step": 6905 + }, + { + "epoch": 0.3819789939192924, + "grad_norm": 5.639072418212891, + "learning_rate": 3.1779420125071066e-05, + "loss": 5.4524, + "step": 6910 + }, + { + "epoch": 0.38225538971807627, + "grad_norm": 2.6650073528289795, + "learning_rate": 3.176520750426379e-05, + "loss": 5.022, + "step": 6915 + }, + { + "epoch": 0.3825317855168601, + "grad_norm": 3.388301372528076, + "learning_rate": 3.175099488345651e-05, + "loss": 5.1516, + "step": 6920 + }, + { + "epoch": 0.382808181315644, + "grad_norm": 4.096010208129883, + "learning_rate": 3.173678226264923e-05, + "loss": 5.127, + "step": 6925 + }, + { + "epoch": 0.38308457711442784, + "grad_norm": 3.0948524475097656, + "learning_rate": 3.172256964184196e-05, + "loss": 5.192, + "step": 6930 + }, + { + "epoch": 0.3833609729132117, + "grad_norm": 3.4886627197265625, + "learning_rate": 3.170835702103468e-05, + "loss": 5.19, + "step": 6935 + }, + { + "epoch": 0.38363736871199555, + "grad_norm": 3.2240042686462402, + "learning_rate": 3.1694144400227403e-05, + "loss": 5.6154, + "step": 6940 + }, + { + "epoch": 0.3839137645107794, + "grad_norm": 2.617563247680664, + "learning_rate": 3.167993177942013e-05, + "loss": 5.3793, + "step": 6945 + }, + { + "epoch": 0.3841901603095633, + "grad_norm": 3.2528016567230225, + "learning_rate": 3.166571915861285e-05, + "loss": 5.3784, + "step": 6950 + }, + { + "epoch": 0.3844665561083472, + "grad_norm": 3.294482707977295, + "learning_rate": 3.1651506537805575e-05, + "loss": 5.2618, + "step": 6955 + }, + { + "epoch": 0.38474295190713104, + "grad_norm": 2.874786615371704, + "learning_rate": 3.163729391699829e-05, + "loss": 5.4889, + "step": 6960 + }, + { + "epoch": 0.3850193477059149, + "grad_norm": 2.737448215484619, + "learning_rate": 3.162308129619102e-05, + "loss": 5.2753, + "step": 6965 + }, + { + "epoch": 0.38529574350469875, + "grad_norm": 3.391822338104248, + "learning_rate": 3.160886867538374e-05, + "loss": 5.3451, + "step": 6970 + }, + { + "epoch": 0.3855721393034826, + "grad_norm": 3.012768268585205, + "learning_rate": 3.1594656054576464e-05, + "loss": 5.1552, + "step": 6975 + }, + { + "epoch": 0.38584853510226647, + "grad_norm": 2.8616573810577393, + "learning_rate": 3.158044343376919e-05, + "loss": 5.3288, + "step": 6980 + }, + { + "epoch": 0.3861249309010503, + "grad_norm": 2.7854886054992676, + "learning_rate": 3.156623081296191e-05, + "loss": 5.2121, + "step": 6985 + }, + { + "epoch": 0.3864013266998342, + "grad_norm": 2.713104009628296, + "learning_rate": 3.1552018192154636e-05, + "loss": 5.1097, + "step": 6990 + }, + { + "epoch": 0.38667772249861804, + "grad_norm": 3.1725711822509766, + "learning_rate": 3.1537805571347354e-05, + "loss": 5.2764, + "step": 6995 + }, + { + "epoch": 0.3869541182974019, + "grad_norm": 2.950223684310913, + "learning_rate": 3.1523592950540084e-05, + "loss": 5.3748, + "step": 7000 + }, + { + "epoch": 0.38723051409618575, + "grad_norm": 3.557173252105713, + "learning_rate": 3.15093803297328e-05, + "loss": 5.7142, + "step": 7005 + }, + { + "epoch": 0.3875069098949696, + "grad_norm": 3.303514242172241, + "learning_rate": 3.1495167708925526e-05, + "loss": 5.4981, + "step": 7010 + }, + { + "epoch": 0.38778330569375347, + "grad_norm": 3.2154033184051514, + "learning_rate": 3.148095508811825e-05, + "loss": 5.3371, + "step": 7015 + }, + { + "epoch": 0.3880597014925373, + "grad_norm": 3.158167600631714, + "learning_rate": 3.1466742467310974e-05, + "loss": 5.2136, + "step": 7020 + }, + { + "epoch": 0.3883360972913212, + "grad_norm": 2.4815750122070312, + "learning_rate": 3.14525298465037e-05, + "loss": 5.3322, + "step": 7025 + }, + { + "epoch": 0.38861249309010504, + "grad_norm": 2.9701693058013916, + "learning_rate": 3.1438317225696415e-05, + "loss": 5.0189, + "step": 7030 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 2.927236318588257, + "learning_rate": 3.1424104604889145e-05, + "loss": 5.224, + "step": 7035 + }, + { + "epoch": 0.38916528468767275, + "grad_norm": 3.0851492881774902, + "learning_rate": 3.140989198408187e-05, + "loss": 5.1077, + "step": 7040 + }, + { + "epoch": 0.3894416804864566, + "grad_norm": 3.0460946559906006, + "learning_rate": 3.139567936327459e-05, + "loss": 4.9981, + "step": 7045 + }, + { + "epoch": 0.38971807628524047, + "grad_norm": 3.0376229286193848, + "learning_rate": 3.138146674246731e-05, + "loss": 5.4228, + "step": 7050 + }, + { + "epoch": 0.3899944720840243, + "grad_norm": 3.2579638957977295, + "learning_rate": 3.1367254121660035e-05, + "loss": 5.3469, + "step": 7055 + }, + { + "epoch": 0.3902708678828082, + "grad_norm": 3.601778984069824, + "learning_rate": 3.135304150085276e-05, + "loss": 5.3353, + "step": 7060 + }, + { + "epoch": 0.39054726368159204, + "grad_norm": 3.496253252029419, + "learning_rate": 3.133882888004548e-05, + "loss": 5.2927, + "step": 7065 + }, + { + "epoch": 0.3908236594803759, + "grad_norm": 2.954893112182617, + "learning_rate": 3.1324616259238207e-05, + "loss": 5.1752, + "step": 7070 + }, + { + "epoch": 0.39110005527915975, + "grad_norm": 3.126713752746582, + "learning_rate": 3.131040363843093e-05, + "loss": 5.1443, + "step": 7075 + }, + { + "epoch": 0.3913764510779436, + "grad_norm": 3.489055633544922, + "learning_rate": 3.129619101762365e-05, + "loss": 5.1166, + "step": 7080 + }, + { + "epoch": 0.39165284687672747, + "grad_norm": 2.989750623703003, + "learning_rate": 3.128197839681637e-05, + "loss": 5.2128, + "step": 7085 + }, + { + "epoch": 0.3919292426755113, + "grad_norm": 3.5772876739501953, + "learning_rate": 3.12677657760091e-05, + "loss": 5.371, + "step": 7090 + }, + { + "epoch": 0.3922056384742952, + "grad_norm": 3.1399166584014893, + "learning_rate": 3.125355315520182e-05, + "loss": 5.0302, + "step": 7095 + }, + { + "epoch": 0.39248203427307904, + "grad_norm": 3.717059373855591, + "learning_rate": 3.1239340534394544e-05, + "loss": 5.1814, + "step": 7100 + }, + { + "epoch": 0.3927584300718629, + "grad_norm": 3.603480815887451, + "learning_rate": 3.122512791358727e-05, + "loss": 5.2066, + "step": 7105 + }, + { + "epoch": 0.39303482587064675, + "grad_norm": 2.86629319190979, + "learning_rate": 3.121091529277999e-05, + "loss": 5.0585, + "step": 7110 + }, + { + "epoch": 0.3933112216694306, + "grad_norm": 3.81514573097229, + "learning_rate": 3.119670267197271e-05, + "loss": 5.4419, + "step": 7115 + }, + { + "epoch": 0.39358761746821447, + "grad_norm": 3.7256438732147217, + "learning_rate": 3.118249005116543e-05, + "loss": 5.4916, + "step": 7120 + }, + { + "epoch": 0.3938640132669983, + "grad_norm": 2.9998602867126465, + "learning_rate": 3.1168277430358164e-05, + "loss": 5.3949, + "step": 7125 + }, + { + "epoch": 0.3941404090657822, + "grad_norm": 3.518113613128662, + "learning_rate": 3.115406480955088e-05, + "loss": 5.4088, + "step": 7130 + }, + { + "epoch": 0.39441680486456604, + "grad_norm": 2.6121208667755127, + "learning_rate": 3.1139852188743605e-05, + "loss": 5.4199, + "step": 7135 + }, + { + "epoch": 0.3946932006633499, + "grad_norm": 3.279484510421753, + "learning_rate": 3.112563956793633e-05, + "loss": 5.1075, + "step": 7140 + }, + { + "epoch": 0.39496959646213375, + "grad_norm": 3.5185775756835938, + "learning_rate": 3.111142694712905e-05, + "loss": 5.2746, + "step": 7145 + }, + { + "epoch": 0.3952459922609176, + "grad_norm": 3.7516870498657227, + "learning_rate": 3.109721432632178e-05, + "loss": 5.1902, + "step": 7150 + }, + { + "epoch": 0.39552238805970147, + "grad_norm": 3.0849173069000244, + "learning_rate": 3.10830017055145e-05, + "loss": 5.1215, + "step": 7155 + }, + { + "epoch": 0.3957987838584853, + "grad_norm": 2.995088577270508, + "learning_rate": 3.1068789084707225e-05, + "loss": 5.2936, + "step": 7160 + }, + { + "epoch": 0.3960751796572692, + "grad_norm": 3.2057669162750244, + "learning_rate": 3.105457646389994e-05, + "loss": 5.5044, + "step": 7165 + }, + { + "epoch": 0.3963515754560531, + "grad_norm": 4.289910793304443, + "learning_rate": 3.1040363843092666e-05, + "loss": 5.522, + "step": 7170 + }, + { + "epoch": 0.39662797125483695, + "grad_norm": 2.816082000732422, + "learning_rate": 3.1026151222285397e-05, + "loss": 5.1474, + "step": 7175 + }, + { + "epoch": 0.3969043670536208, + "grad_norm": 3.6746461391448975, + "learning_rate": 3.1011938601478114e-05, + "loss": 5.5855, + "step": 7180 + }, + { + "epoch": 0.39718076285240467, + "grad_norm": 3.0824356079101562, + "learning_rate": 3.099772598067084e-05, + "loss": 5.1413, + "step": 7185 + }, + { + "epoch": 0.3974571586511885, + "grad_norm": 3.6951475143432617, + "learning_rate": 3.098351335986356e-05, + "loss": 5.3582, + "step": 7190 + }, + { + "epoch": 0.3977335544499724, + "grad_norm": 3.4342403411865234, + "learning_rate": 3.0969300739056286e-05, + "loss": 5.4512, + "step": 7195 + }, + { + "epoch": 0.39800995024875624, + "grad_norm": 2.806812047958374, + "learning_rate": 3.095508811824901e-05, + "loss": 4.7637, + "step": 7200 + }, + { + "epoch": 0.3982863460475401, + "grad_norm": 3.490265130996704, + "learning_rate": 3.094087549744173e-05, + "loss": 5.6499, + "step": 7205 + }, + { + "epoch": 0.39856274184632395, + "grad_norm": 2.978156566619873, + "learning_rate": 3.092666287663446e-05, + "loss": 5.4028, + "step": 7210 + }, + { + "epoch": 0.3988391376451078, + "grad_norm": 2.9881980419158936, + "learning_rate": 3.0912450255827175e-05, + "loss": 5.4023, + "step": 7215 + }, + { + "epoch": 0.39911553344389167, + "grad_norm": 3.111189365386963, + "learning_rate": 3.08982376350199e-05, + "loss": 5.2614, + "step": 7220 + }, + { + "epoch": 0.3993919292426755, + "grad_norm": 2.9220035076141357, + "learning_rate": 3.088402501421262e-05, + "loss": 5.1271, + "step": 7225 + }, + { + "epoch": 0.3996683250414594, + "grad_norm": 3.135206460952759, + "learning_rate": 3.086981239340535e-05, + "loss": 5.0486, + "step": 7230 + }, + { + "epoch": 0.39994472084024324, + "grad_norm": 3.383234739303589, + "learning_rate": 3.085559977259807e-05, + "loss": 4.99, + "step": 7235 + }, + { + "epoch": 0.4002211166390271, + "grad_norm": 3.352858066558838, + "learning_rate": 3.084138715179079e-05, + "loss": 5.206, + "step": 7240 + }, + { + "epoch": 0.40049751243781095, + "grad_norm": 2.6485846042633057, + "learning_rate": 3.082717453098352e-05, + "loss": 5.3789, + "step": 7245 + }, + { + "epoch": 0.4007739082365948, + "grad_norm": 2.586437702178955, + "learning_rate": 3.0812961910176236e-05, + "loss": 5.2531, + "step": 7250 + }, + { + "epoch": 0.40105030403537867, + "grad_norm": 3.3769021034240723, + "learning_rate": 3.079874928936896e-05, + "loss": 5.3068, + "step": 7255 + }, + { + "epoch": 0.4013266998341625, + "grad_norm": 2.9882419109344482, + "learning_rate": 3.0784536668561684e-05, + "loss": 5.1777, + "step": 7260 + }, + { + "epoch": 0.4016030956329464, + "grad_norm": 3.543286085128784, + "learning_rate": 3.077032404775441e-05, + "loss": 5.4205, + "step": 7265 + }, + { + "epoch": 0.40187949143173024, + "grad_norm": 3.5167641639709473, + "learning_rate": 3.075611142694713e-05, + "loss": 5.2147, + "step": 7270 + }, + { + "epoch": 0.4021558872305141, + "grad_norm": 4.086061477661133, + "learning_rate": 3.074189880613985e-05, + "loss": 5.3399, + "step": 7275 + }, + { + "epoch": 0.40243228302929795, + "grad_norm": 2.9162538051605225, + "learning_rate": 3.072768618533258e-05, + "loss": 5.2173, + "step": 7280 + }, + { + "epoch": 0.4027086788280818, + "grad_norm": 3.2344117164611816, + "learning_rate": 3.0713473564525304e-05, + "loss": 5.446, + "step": 7285 + }, + { + "epoch": 0.40298507462686567, + "grad_norm": 2.547792673110962, + "learning_rate": 3.069926094371802e-05, + "loss": 5.2879, + "step": 7290 + }, + { + "epoch": 0.4032614704256495, + "grad_norm": 2.705897092819214, + "learning_rate": 3.0685048322910745e-05, + "loss": 5.243, + "step": 7295 + }, + { + "epoch": 0.4035378662244334, + "grad_norm": 3.052686929702759, + "learning_rate": 3.067083570210347e-05, + "loss": 5.5547, + "step": 7300 + }, + { + "epoch": 0.40381426202321724, + "grad_norm": 2.7619709968566895, + "learning_rate": 3.065662308129619e-05, + "loss": 5.1307, + "step": 7305 + }, + { + "epoch": 0.4040906578220011, + "grad_norm": 2.8688414096832275, + "learning_rate": 3.064241046048892e-05, + "loss": 5.1096, + "step": 7310 + }, + { + "epoch": 0.40436705362078496, + "grad_norm": 3.4232749938964844, + "learning_rate": 3.062819783968164e-05, + "loss": 5.318, + "step": 7315 + }, + { + "epoch": 0.4046434494195688, + "grad_norm": 4.028477191925049, + "learning_rate": 3.0613985218874365e-05, + "loss": 5.4557, + "step": 7320 + }, + { + "epoch": 0.40491984521835267, + "grad_norm": 3.430436134338379, + "learning_rate": 3.059977259806708e-05, + "loss": 5.3102, + "step": 7325 + }, + { + "epoch": 0.4051962410171365, + "grad_norm": 2.725626230239868, + "learning_rate": 3.0585559977259806e-05, + "loss": 5.3974, + "step": 7330 + }, + { + "epoch": 0.4054726368159204, + "grad_norm": 3.0082714557647705, + "learning_rate": 3.057134735645253e-05, + "loss": 5.2993, + "step": 7335 + }, + { + "epoch": 0.40574903261470424, + "grad_norm": 3.552365303039551, + "learning_rate": 3.0557134735645254e-05, + "loss": 5.2335, + "step": 7340 + }, + { + "epoch": 0.4060254284134881, + "grad_norm": 3.0008647441864014, + "learning_rate": 3.054292211483798e-05, + "loss": 5.179, + "step": 7345 + }, + { + "epoch": 0.40630182421227196, + "grad_norm": 3.4368932247161865, + "learning_rate": 3.05287094940307e-05, + "loss": 5.1877, + "step": 7350 + }, + { + "epoch": 0.4065782200110558, + "grad_norm": 2.775163173675537, + "learning_rate": 3.0514496873223426e-05, + "loss": 5.3432, + "step": 7355 + }, + { + "epoch": 0.40685461580983967, + "grad_norm": 3.331177234649658, + "learning_rate": 3.0500284252416143e-05, + "loss": 5.6045, + "step": 7360 + }, + { + "epoch": 0.4071310116086235, + "grad_norm": 3.2628567218780518, + "learning_rate": 3.048607163160887e-05, + "loss": 5.3768, + "step": 7365 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 3.5824832916259766, + "learning_rate": 3.0471859010801594e-05, + "loss": 5.5101, + "step": 7370 + }, + { + "epoch": 0.40768380320619124, + "grad_norm": 2.661316394805908, + "learning_rate": 3.0457646389994315e-05, + "loss": 5.4241, + "step": 7375 + }, + { + "epoch": 0.4079601990049751, + "grad_norm": 3.2753899097442627, + "learning_rate": 3.0443433769187042e-05, + "loss": 5.1645, + "step": 7380 + }, + { + "epoch": 0.40823659480375896, + "grad_norm": 3.1966748237609863, + "learning_rate": 3.042922114837976e-05, + "loss": 5.3537, + "step": 7385 + }, + { + "epoch": 0.4085129906025428, + "grad_norm": 3.2661285400390625, + "learning_rate": 3.0415008527572487e-05, + "loss": 5.3202, + "step": 7390 + }, + { + "epoch": 0.4087893864013267, + "grad_norm": 2.994544506072998, + "learning_rate": 3.040079590676521e-05, + "loss": 5.3407, + "step": 7395 + }, + { + "epoch": 0.4090657822001106, + "grad_norm": 3.363358974456787, + "learning_rate": 3.038658328595793e-05, + "loss": 5.3274, + "step": 7400 + }, + { + "epoch": 0.40934217799889444, + "grad_norm": 3.417901039123535, + "learning_rate": 3.0372370665150656e-05, + "loss": 5.2245, + "step": 7405 + }, + { + "epoch": 0.4096185737976783, + "grad_norm": 3.499995470046997, + "learning_rate": 3.0358158044343376e-05, + "loss": 5.043, + "step": 7410 + }, + { + "epoch": 0.40989496959646216, + "grad_norm": 3.142343044281006, + "learning_rate": 3.0343945423536104e-05, + "loss": 5.2181, + "step": 7415 + }, + { + "epoch": 0.410171365395246, + "grad_norm": 2.7809529304504395, + "learning_rate": 3.032973280272882e-05, + "loss": 5.3378, + "step": 7420 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 3.567267656326294, + "learning_rate": 3.0315520181921548e-05, + "loss": 5.3781, + "step": 7425 + }, + { + "epoch": 0.4107241569928137, + "grad_norm": 4.007451057434082, + "learning_rate": 3.0301307561114272e-05, + "loss": 5.1938, + "step": 7430 + }, + { + "epoch": 0.4110005527915976, + "grad_norm": 3.4437944889068604, + "learning_rate": 3.0287094940306993e-05, + "loss": 5.0837, + "step": 7435 + }, + { + "epoch": 0.41127694859038144, + "grad_norm": 3.051156997680664, + "learning_rate": 3.0272882319499717e-05, + "loss": 5.3, + "step": 7440 + }, + { + "epoch": 0.4115533443891653, + "grad_norm": 3.0758109092712402, + "learning_rate": 3.0258669698692437e-05, + "loss": 5.0707, + "step": 7445 + }, + { + "epoch": 0.41182974018794916, + "grad_norm": 3.035916328430176, + "learning_rate": 3.0244457077885165e-05, + "loss": 5.1578, + "step": 7450 + }, + { + "epoch": 0.412106135986733, + "grad_norm": 3.541250228881836, + "learning_rate": 3.023024445707789e-05, + "loss": 4.8507, + "step": 7455 + }, + { + "epoch": 0.41238253178551687, + "grad_norm": 4.51204252243042, + "learning_rate": 3.021603183627061e-05, + "loss": 5.1498, + "step": 7460 + }, + { + "epoch": 0.4126589275843007, + "grad_norm": 2.9354135990142822, + "learning_rate": 3.0201819215463333e-05, + "loss": 5.0576, + "step": 7465 + }, + { + "epoch": 0.4129353233830846, + "grad_norm": 2.9570817947387695, + "learning_rate": 3.0187606594656054e-05, + "loss": 5.3592, + "step": 7470 + }, + { + "epoch": 0.41321171918186844, + "grad_norm": 2.823472499847412, + "learning_rate": 3.0173393973848778e-05, + "loss": 4.9364, + "step": 7475 + }, + { + "epoch": 0.4134881149806523, + "grad_norm": 2.96844744682312, + "learning_rate": 3.0159181353041505e-05, + "loss": 5.3399, + "step": 7480 + }, + { + "epoch": 0.41376451077943616, + "grad_norm": 3.269542694091797, + "learning_rate": 3.0144968732234226e-05, + "loss": 5.2783, + "step": 7485 + }, + { + "epoch": 0.41404090657822, + "grad_norm": 2.990311622619629, + "learning_rate": 3.013075611142695e-05, + "loss": 5.4059, + "step": 7490 + }, + { + "epoch": 0.41431730237700387, + "grad_norm": 2.818763494491577, + "learning_rate": 3.011654349061967e-05, + "loss": 5.3341, + "step": 7495 + }, + { + "epoch": 0.41459369817578773, + "grad_norm": 4.055943489074707, + "learning_rate": 3.0102330869812394e-05, + "loss": 5.2108, + "step": 7500 + }, + { + "epoch": 0.4148700939745716, + "grad_norm": 3.448615789413452, + "learning_rate": 3.008811824900512e-05, + "loss": 5.0803, + "step": 7505 + }, + { + "epoch": 0.41514648977335544, + "grad_norm": 2.7099623680114746, + "learning_rate": 3.007390562819784e-05, + "loss": 5.1597, + "step": 7510 + }, + { + "epoch": 0.4154228855721393, + "grad_norm": 2.860887050628662, + "learning_rate": 3.0059693007390566e-05, + "loss": 5.0554, + "step": 7515 + }, + { + "epoch": 0.41569928137092316, + "grad_norm": 3.508277177810669, + "learning_rate": 3.0045480386583287e-05, + "loss": 5.2466, + "step": 7520 + }, + { + "epoch": 0.415975677169707, + "grad_norm": 3.1834499835968018, + "learning_rate": 3.003126776577601e-05, + "loss": 5.4212, + "step": 7525 + }, + { + "epoch": 0.41625207296849087, + "grad_norm": 3.1005613803863525, + "learning_rate": 3.001705514496873e-05, + "loss": 5.4608, + "step": 7530 + }, + { + "epoch": 0.41652846876727473, + "grad_norm": 3.1550991535186768, + "learning_rate": 3.0002842524161455e-05, + "loss": 5.2565, + "step": 7535 + }, + { + "epoch": 0.4168048645660586, + "grad_norm": 3.293140172958374, + "learning_rate": 2.9988629903354183e-05, + "loss": 5.5235, + "step": 7540 + }, + { + "epoch": 0.41708126036484244, + "grad_norm": 3.3371574878692627, + "learning_rate": 2.9974417282546903e-05, + "loss": 5.3766, + "step": 7545 + }, + { + "epoch": 0.4173576561636263, + "grad_norm": 3.0962579250335693, + "learning_rate": 2.9960204661739627e-05, + "loss": 5.1429, + "step": 7550 + }, + { + "epoch": 0.41763405196241016, + "grad_norm": 2.632319927215576, + "learning_rate": 2.9945992040932348e-05, + "loss": 5.2072, + "step": 7555 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 3.5730085372924805, + "learning_rate": 2.9931779420125072e-05, + "loss": 5.3899, + "step": 7560 + }, + { + "epoch": 0.41818684355997787, + "grad_norm": 3.1113953590393066, + "learning_rate": 2.99175667993178e-05, + "loss": 5.2587, + "step": 7565 + }, + { + "epoch": 0.41846323935876173, + "grad_norm": 2.9837839603424072, + "learning_rate": 2.9903354178510516e-05, + "loss": 5.1953, + "step": 7570 + }, + { + "epoch": 0.4187396351575456, + "grad_norm": 3.156015634536743, + "learning_rate": 2.9889141557703244e-05, + "loss": 5.0675, + "step": 7575 + }, + { + "epoch": 0.41901603095632944, + "grad_norm": 2.8620293140411377, + "learning_rate": 2.9874928936895964e-05, + "loss": 5.2232, + "step": 7580 + }, + { + "epoch": 0.4192924267551133, + "grad_norm": 2.7657828330993652, + "learning_rate": 2.9860716316088688e-05, + "loss": 5.0537, + "step": 7585 + }, + { + "epoch": 0.41956882255389716, + "grad_norm": 2.991236448287964, + "learning_rate": 2.9846503695281412e-05, + "loss": 5.3795, + "step": 7590 + }, + { + "epoch": 0.419845218352681, + "grad_norm": 2.935781717300415, + "learning_rate": 2.9832291074474133e-05, + "loss": 5.0797, + "step": 7595 + }, + { + "epoch": 0.4201216141514649, + "grad_norm": 3.5586657524108887, + "learning_rate": 2.981807845366686e-05, + "loss": 5.3075, + "step": 7600 + }, + { + "epoch": 0.42039800995024873, + "grad_norm": 3.6937458515167236, + "learning_rate": 2.9803865832859577e-05, + "loss": 5.2942, + "step": 7605 + }, + { + "epoch": 0.4206744057490326, + "grad_norm": 3.0351436138153076, + "learning_rate": 2.9789653212052305e-05, + "loss": 5.1602, + "step": 7610 + }, + { + "epoch": 0.4209508015478165, + "grad_norm": 2.95098614692688, + "learning_rate": 2.977544059124503e-05, + "loss": 5.2933, + "step": 7615 + }, + { + "epoch": 0.42122719734660036, + "grad_norm": 2.8165371417999268, + "learning_rate": 2.976122797043775e-05, + "loss": 5.1234, + "step": 7620 + }, + { + "epoch": 0.4215035931453842, + "grad_norm": 3.436764717102051, + "learning_rate": 2.9747015349630473e-05, + "loss": 5.2743, + "step": 7625 + }, + { + "epoch": 0.42177998894416807, + "grad_norm": 3.4942984580993652, + "learning_rate": 2.9732802728823194e-05, + "loss": 5.5733, + "step": 7630 + }, + { + "epoch": 0.42205638474295193, + "grad_norm": 3.76875901222229, + "learning_rate": 2.971859010801592e-05, + "loss": 5.3345, + "step": 7635 + }, + { + "epoch": 0.4223327805417358, + "grad_norm": 3.2860209941864014, + "learning_rate": 2.970437748720864e-05, + "loss": 5.3476, + "step": 7640 + }, + { + "epoch": 0.42260917634051964, + "grad_norm": 3.403754949569702, + "learning_rate": 2.9690164866401366e-05, + "loss": 5.1001, + "step": 7645 + }, + { + "epoch": 0.4228855721393035, + "grad_norm": 3.613391876220703, + "learning_rate": 2.967595224559409e-05, + "loss": 5.0587, + "step": 7650 + }, + { + "epoch": 0.42316196793808736, + "grad_norm": 3.401045083999634, + "learning_rate": 2.966173962478681e-05, + "loss": 4.985, + "step": 7655 + }, + { + "epoch": 0.4234383637368712, + "grad_norm": 3.0673391819000244, + "learning_rate": 2.9647527003979534e-05, + "loss": 5.115, + "step": 7660 + }, + { + "epoch": 0.42371475953565507, + "grad_norm": 3.054534673690796, + "learning_rate": 2.9633314383172255e-05, + "loss": 5.1765, + "step": 7665 + }, + { + "epoch": 0.42399115533443893, + "grad_norm": 4.545713424682617, + "learning_rate": 2.9619101762364982e-05, + "loss": 5.0729, + "step": 7670 + }, + { + "epoch": 0.4242675511332228, + "grad_norm": 3.014993190765381, + "learning_rate": 2.9604889141557706e-05, + "loss": 5.4135, + "step": 7675 + }, + { + "epoch": 0.42454394693200664, + "grad_norm": 2.8630824089050293, + "learning_rate": 2.9590676520750427e-05, + "loss": 5.1434, + "step": 7680 + }, + { + "epoch": 0.4248203427307905, + "grad_norm": 3.143050193786621, + "learning_rate": 2.957646389994315e-05, + "loss": 5.2194, + "step": 7685 + }, + { + "epoch": 0.42509673852957436, + "grad_norm": 3.1831743717193604, + "learning_rate": 2.956225127913587e-05, + "loss": 5.4145, + "step": 7690 + }, + { + "epoch": 0.4253731343283582, + "grad_norm": 3.281905174255371, + "learning_rate": 2.95480386583286e-05, + "loss": 5.2458, + "step": 7695 + }, + { + "epoch": 0.4256495301271421, + "grad_norm": 3.2459661960601807, + "learning_rate": 2.9533826037521323e-05, + "loss": 5.1643, + "step": 7700 + }, + { + "epoch": 0.42592592592592593, + "grad_norm": 3.014573097229004, + "learning_rate": 2.9519613416714043e-05, + "loss": 5.3188, + "step": 7705 + }, + { + "epoch": 0.4262023217247098, + "grad_norm": 3.489968776702881, + "learning_rate": 2.9505400795906767e-05, + "loss": 5.359, + "step": 7710 + }, + { + "epoch": 0.42647871752349364, + "grad_norm": 2.6818461418151855, + "learning_rate": 2.9491188175099488e-05, + "loss": 5.2443, + "step": 7715 + }, + { + "epoch": 0.4267551133222775, + "grad_norm": 3.4301328659057617, + "learning_rate": 2.9476975554292212e-05, + "loss": 5.3404, + "step": 7720 + }, + { + "epoch": 0.42703150912106136, + "grad_norm": 2.975738525390625, + "learning_rate": 2.946276293348494e-05, + "loss": 5.5118, + "step": 7725 + }, + { + "epoch": 0.4273079049198452, + "grad_norm": 3.66871976852417, + "learning_rate": 2.944855031267766e-05, + "loss": 5.2737, + "step": 7730 + }, + { + "epoch": 0.4275843007186291, + "grad_norm": 2.812068462371826, + "learning_rate": 2.9434337691870384e-05, + "loss": 5.0211, + "step": 7735 + }, + { + "epoch": 0.42786069651741293, + "grad_norm": 3.8144164085388184, + "learning_rate": 2.9420125071063105e-05, + "loss": 5.3493, + "step": 7740 + }, + { + "epoch": 0.4281370923161968, + "grad_norm": 3.173715353012085, + "learning_rate": 2.940591245025583e-05, + "loss": 5.144, + "step": 7745 + }, + { + "epoch": 0.42841348811498065, + "grad_norm": 3.548175573348999, + "learning_rate": 2.939169982944855e-05, + "loss": 5.4388, + "step": 7750 + }, + { + "epoch": 0.4286898839137645, + "grad_norm": 2.4336702823638916, + "learning_rate": 2.9377487208641273e-05, + "loss": 5.1597, + "step": 7755 + }, + { + "epoch": 0.42896627971254836, + "grad_norm": 3.0814414024353027, + "learning_rate": 2.9363274587834e-05, + "loss": 5.2932, + "step": 7760 + }, + { + "epoch": 0.4292426755113322, + "grad_norm": 2.701829671859741, + "learning_rate": 2.934906196702672e-05, + "loss": 5.2772, + "step": 7765 + }, + { + "epoch": 0.4295190713101161, + "grad_norm": 2.623934268951416, + "learning_rate": 2.9334849346219445e-05, + "loss": 4.9666, + "step": 7770 + }, + { + "epoch": 0.42979546710889993, + "grad_norm": 3.162205696105957, + "learning_rate": 2.9320636725412166e-05, + "loss": 5.5104, + "step": 7775 + }, + { + "epoch": 0.4300718629076838, + "grad_norm": 3.1001057624816895, + "learning_rate": 2.930642410460489e-05, + "loss": 5.2675, + "step": 7780 + }, + { + "epoch": 0.43034825870646765, + "grad_norm": 3.4800355434417725, + "learning_rate": 2.9292211483797617e-05, + "loss": 5.246, + "step": 7785 + }, + { + "epoch": 0.4306246545052515, + "grad_norm": 3.1043055057525635, + "learning_rate": 2.9277998862990334e-05, + "loss": 4.9439, + "step": 7790 + }, + { + "epoch": 0.43090105030403536, + "grad_norm": 2.9988179206848145, + "learning_rate": 2.926378624218306e-05, + "loss": 5.1472, + "step": 7795 + }, + { + "epoch": 0.4311774461028192, + "grad_norm": 3.2718756198883057, + "learning_rate": 2.9249573621375782e-05, + "loss": 5.3382, + "step": 7800 + }, + { + "epoch": 0.4314538419016031, + "grad_norm": 2.6709251403808594, + "learning_rate": 2.9235361000568506e-05, + "loss": 5.1973, + "step": 7805 + }, + { + "epoch": 0.43173023770038693, + "grad_norm": 2.908034086227417, + "learning_rate": 2.922114837976123e-05, + "loss": 5.0482, + "step": 7810 + }, + { + "epoch": 0.4320066334991708, + "grad_norm": 2.8936550617218018, + "learning_rate": 2.920693575895395e-05, + "loss": 5.223, + "step": 7815 + }, + { + "epoch": 0.43228302929795465, + "grad_norm": 3.31170654296875, + "learning_rate": 2.9192723138146678e-05, + "loss": 5.218, + "step": 7820 + }, + { + "epoch": 0.4325594250967385, + "grad_norm": 3.3530709743499756, + "learning_rate": 2.9178510517339395e-05, + "loss": 5.1359, + "step": 7825 + }, + { + "epoch": 0.43283582089552236, + "grad_norm": 2.950207233428955, + "learning_rate": 2.9164297896532123e-05, + "loss": 5.538, + "step": 7830 + }, + { + "epoch": 0.4331122166943063, + "grad_norm": 3.7131307125091553, + "learning_rate": 2.9150085275724847e-05, + "loss": 5.2753, + "step": 7835 + }, + { + "epoch": 0.43338861249309013, + "grad_norm": 3.750091075897217, + "learning_rate": 2.9135872654917567e-05, + "loss": 5.0678, + "step": 7840 + }, + { + "epoch": 0.433665008291874, + "grad_norm": 2.7476789951324463, + "learning_rate": 2.9121660034110295e-05, + "loss": 5.3974, + "step": 7845 + }, + { + "epoch": 0.43394140409065785, + "grad_norm": 3.338824510574341, + "learning_rate": 2.9107447413303012e-05, + "loss": 4.9142, + "step": 7850 + }, + { + "epoch": 0.4342177998894417, + "grad_norm": 3.1479055881500244, + "learning_rate": 2.909323479249574e-05, + "loss": 5.0689, + "step": 7855 + }, + { + "epoch": 0.43449419568822556, + "grad_norm": 2.765441656112671, + "learning_rate": 2.907902217168846e-05, + "loss": 5.2719, + "step": 7860 + }, + { + "epoch": 0.4347705914870094, + "grad_norm": 3.1556508541107178, + "learning_rate": 2.9064809550881184e-05, + "loss": 5.391, + "step": 7865 + }, + { + "epoch": 0.4350469872857933, + "grad_norm": 3.2448055744171143, + "learning_rate": 2.9050596930073908e-05, + "loss": 5.3468, + "step": 7870 + }, + { + "epoch": 0.43532338308457713, + "grad_norm": 3.5034596920013428, + "learning_rate": 2.9036384309266628e-05, + "loss": 5.1951, + "step": 7875 + }, + { + "epoch": 0.435599778883361, + "grad_norm": 3.018533945083618, + "learning_rate": 2.9022171688459356e-05, + "loss": 5.1914, + "step": 7880 + }, + { + "epoch": 0.43587617468214485, + "grad_norm": 2.8897950649261475, + "learning_rate": 2.9007959067652073e-05, + "loss": 5.6935, + "step": 7885 + }, + { + "epoch": 0.4361525704809287, + "grad_norm": 2.643296957015991, + "learning_rate": 2.89937464468448e-05, + "loss": 5.1013, + "step": 7890 + }, + { + "epoch": 0.43642896627971256, + "grad_norm": 3.041177988052368, + "learning_rate": 2.8979533826037524e-05, + "loss": 5.3812, + "step": 7895 + }, + { + "epoch": 0.4367053620784964, + "grad_norm": 3.4055657386779785, + "learning_rate": 2.8965321205230245e-05, + "loss": 5.5113, + "step": 7900 + }, + { + "epoch": 0.4369817578772803, + "grad_norm": 4.0776495933532715, + "learning_rate": 2.895110858442297e-05, + "loss": 5.4321, + "step": 7905 + }, + { + "epoch": 0.43725815367606413, + "grad_norm": 3.4030396938323975, + "learning_rate": 2.893689596361569e-05, + "loss": 5.091, + "step": 7910 + }, + { + "epoch": 0.437534549474848, + "grad_norm": 3.5989456176757812, + "learning_rate": 2.8922683342808417e-05, + "loss": 5.48, + "step": 7915 + }, + { + "epoch": 0.43781094527363185, + "grad_norm": 2.3618738651275635, + "learning_rate": 2.890847072200114e-05, + "loss": 5.0508, + "step": 7920 + }, + { + "epoch": 0.4380873410724157, + "grad_norm": 3.1228749752044678, + "learning_rate": 2.889425810119386e-05, + "loss": 5.3904, + "step": 7925 + }, + { + "epoch": 0.43836373687119956, + "grad_norm": 3.0725607872009277, + "learning_rate": 2.8880045480386585e-05, + "loss": 5.4847, + "step": 7930 + }, + { + "epoch": 0.4386401326699834, + "grad_norm": 2.9387130737304688, + "learning_rate": 2.8865832859579306e-05, + "loss": 4.9448, + "step": 7935 + }, + { + "epoch": 0.4389165284687673, + "grad_norm": 3.3646342754364014, + "learning_rate": 2.885162023877203e-05, + "loss": 5.3713, + "step": 7940 + }, + { + "epoch": 0.43919292426755113, + "grad_norm": 3.699636697769165, + "learning_rate": 2.8837407617964757e-05, + "loss": 5.2954, + "step": 7945 + }, + { + "epoch": 0.439469320066335, + "grad_norm": 3.304562568664551, + "learning_rate": 2.8823194997157478e-05, + "loss": 5.1658, + "step": 7950 + }, + { + "epoch": 0.43974571586511885, + "grad_norm": 2.908881902694702, + "learning_rate": 2.8808982376350202e-05, + "loss": 5.0667, + "step": 7955 + }, + { + "epoch": 0.4400221116639027, + "grad_norm": 3.2574126720428467, + "learning_rate": 2.8794769755542922e-05, + "loss": 5.1662, + "step": 7960 + }, + { + "epoch": 0.44029850746268656, + "grad_norm": 3.021467924118042, + "learning_rate": 2.8780557134735646e-05, + "loss": 5.0305, + "step": 7965 + }, + { + "epoch": 0.4405749032614704, + "grad_norm": 2.6084797382354736, + "learning_rate": 2.8766344513928367e-05, + "loss": 5.3558, + "step": 7970 + }, + { + "epoch": 0.4408512990602543, + "grad_norm": 3.0199179649353027, + "learning_rate": 2.875213189312109e-05, + "loss": 5.2878, + "step": 7975 + }, + { + "epoch": 0.44112769485903813, + "grad_norm": 3.2974507808685303, + "learning_rate": 2.8737919272313818e-05, + "loss": 5.3241, + "step": 7980 + }, + { + "epoch": 0.441404090657822, + "grad_norm": 3.1707582473754883, + "learning_rate": 2.872370665150654e-05, + "loss": 4.94, + "step": 7985 + }, + { + "epoch": 0.44168048645660585, + "grad_norm": 2.6704931259155273, + "learning_rate": 2.8709494030699263e-05, + "loss": 5.1259, + "step": 7990 + }, + { + "epoch": 0.4419568822553897, + "grad_norm": 4.149257659912109, + "learning_rate": 2.8695281409891983e-05, + "loss": 5.4461, + "step": 7995 + }, + { + "epoch": 0.44223327805417356, + "grad_norm": 3.305680274963379, + "learning_rate": 2.8681068789084707e-05, + "loss": 5.1553, + "step": 8000 + }, + { + "epoch": 0.4425096738529574, + "grad_norm": 3.288146734237671, + "learning_rate": 2.8666856168277435e-05, + "loss": 5.2207, + "step": 8005 + }, + { + "epoch": 0.4427860696517413, + "grad_norm": 4.230424880981445, + "learning_rate": 2.8652643547470155e-05, + "loss": 5.3263, + "step": 8010 + }, + { + "epoch": 0.44306246545052513, + "grad_norm": 3.5503792762756348, + "learning_rate": 2.863843092666288e-05, + "loss": 5.2507, + "step": 8015 + }, + { + "epoch": 0.443338861249309, + "grad_norm": 2.99174427986145, + "learning_rate": 2.86242183058556e-05, + "loss": 5.2627, + "step": 8020 + }, + { + "epoch": 0.44361525704809285, + "grad_norm": 3.6940457820892334, + "learning_rate": 2.8610005685048324e-05, + "loss": 4.9232, + "step": 8025 + }, + { + "epoch": 0.4438916528468767, + "grad_norm": 3.4294097423553467, + "learning_rate": 2.859579306424105e-05, + "loss": 5.2177, + "step": 8030 + }, + { + "epoch": 0.44416804864566056, + "grad_norm": 3.6956839561462402, + "learning_rate": 2.858158044343377e-05, + "loss": 5.3431, + "step": 8035 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 3.1307976245880127, + "learning_rate": 2.8567367822626496e-05, + "loss": 5.3812, + "step": 8040 + }, + { + "epoch": 0.4447208402432283, + "grad_norm": 2.7979776859283447, + "learning_rate": 2.8553155201819216e-05, + "loss": 5.6822, + "step": 8045 + }, + { + "epoch": 0.44499723604201213, + "grad_norm": 2.8679885864257812, + "learning_rate": 2.853894258101194e-05, + "loss": 5.278, + "step": 8050 + }, + { + "epoch": 0.44527363184079605, + "grad_norm": 2.877314805984497, + "learning_rate": 2.8524729960204664e-05, + "loss": 5.2747, + "step": 8055 + }, + { + "epoch": 0.4455500276395799, + "grad_norm": 3.238851308822632, + "learning_rate": 2.8510517339397385e-05, + "loss": 5.3333, + "step": 8060 + }, + { + "epoch": 0.44582642343836376, + "grad_norm": 2.7757999897003174, + "learning_rate": 2.8496304718590112e-05, + "loss": 5.0878, + "step": 8065 + }, + { + "epoch": 0.4461028192371476, + "grad_norm": 3.3367576599121094, + "learning_rate": 2.848209209778283e-05, + "loss": 5.2714, + "step": 8070 + }, + { + "epoch": 0.4463792150359315, + "grad_norm": 2.983865976333618, + "learning_rate": 2.8467879476975557e-05, + "loss": 5.4693, + "step": 8075 + }, + { + "epoch": 0.44665561083471533, + "grad_norm": 3.211333751678467, + "learning_rate": 2.8453666856168278e-05, + "loss": 5.1891, + "step": 8080 + }, + { + "epoch": 0.4469320066334992, + "grad_norm": 3.608457565307617, + "learning_rate": 2.8439454235361e-05, + "loss": 4.9361, + "step": 8085 + }, + { + "epoch": 0.44720840243228305, + "grad_norm": 3.1287999153137207, + "learning_rate": 2.8425241614553725e-05, + "loss": 5.5907, + "step": 8090 + }, + { + "epoch": 0.4474847982310669, + "grad_norm": 3.5823168754577637, + "learning_rate": 2.8411028993746446e-05, + "loss": 5.1325, + "step": 8095 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 4.0038299560546875, + "learning_rate": 2.8396816372939173e-05, + "loss": 5.0001, + "step": 8100 + }, + { + "epoch": 0.4480375898286346, + "grad_norm": 3.3478615283966064, + "learning_rate": 2.838260375213189e-05, + "loss": 5.1217, + "step": 8105 + }, + { + "epoch": 0.4483139856274185, + "grad_norm": 3.304795503616333, + "learning_rate": 2.8368391131324618e-05, + "loss": 5.5211, + "step": 8110 + }, + { + "epoch": 0.44859038142620233, + "grad_norm": 3.8344953060150146, + "learning_rate": 2.8354178510517342e-05, + "loss": 5.2864, + "step": 8115 + }, + { + "epoch": 0.4488667772249862, + "grad_norm": 3.359548807144165, + "learning_rate": 2.8339965889710063e-05, + "loss": 5.5551, + "step": 8120 + }, + { + "epoch": 0.44914317302377005, + "grad_norm": 3.1302855014801025, + "learning_rate": 2.8325753268902787e-05, + "loss": 5.1885, + "step": 8125 + }, + { + "epoch": 0.4494195688225539, + "grad_norm": 3.1038334369659424, + "learning_rate": 2.8311540648095507e-05, + "loss": 5.1696, + "step": 8130 + }, + { + "epoch": 0.44969596462133776, + "grad_norm": 4.253241062164307, + "learning_rate": 2.8297328027288235e-05, + "loss": 5.5296, + "step": 8135 + }, + { + "epoch": 0.4499723604201216, + "grad_norm": 3.2857799530029297, + "learning_rate": 2.828311540648096e-05, + "loss": 5.2782, + "step": 8140 + }, + { + "epoch": 0.4502487562189055, + "grad_norm": 3.255506753921509, + "learning_rate": 2.826890278567368e-05, + "loss": 5.2445, + "step": 8145 + }, + { + "epoch": 0.45052515201768933, + "grad_norm": 4.212364673614502, + "learning_rate": 2.8254690164866403e-05, + "loss": 5.1744, + "step": 8150 + }, + { + "epoch": 0.4508015478164732, + "grad_norm": 3.6351518630981445, + "learning_rate": 2.8240477544059124e-05, + "loss": 5.3138, + "step": 8155 + }, + { + "epoch": 0.45107794361525705, + "grad_norm": 2.9749603271484375, + "learning_rate": 2.822626492325185e-05, + "loss": 5.2675, + "step": 8160 + }, + { + "epoch": 0.4513543394140409, + "grad_norm": 3.6853575706481934, + "learning_rate": 2.8212052302444575e-05, + "loss": 5.2495, + "step": 8165 + }, + { + "epoch": 0.45163073521282476, + "grad_norm": 3.170837640762329, + "learning_rate": 2.8197839681637296e-05, + "loss": 5.2689, + "step": 8170 + }, + { + "epoch": 0.4519071310116086, + "grad_norm": 3.9865872859954834, + "learning_rate": 2.818362706083002e-05, + "loss": 5.0987, + "step": 8175 + }, + { + "epoch": 0.4521835268103925, + "grad_norm": 3.1320040225982666, + "learning_rate": 2.816941444002274e-05, + "loss": 5.2081, + "step": 8180 + }, + { + "epoch": 0.45245992260917633, + "grad_norm": 3.1708438396453857, + "learning_rate": 2.8155201819215464e-05, + "loss": 5.1956, + "step": 8185 + }, + { + "epoch": 0.4527363184079602, + "grad_norm": 2.9454898834228516, + "learning_rate": 2.8140989198408185e-05, + "loss": 5.2115, + "step": 8190 + }, + { + "epoch": 0.45301271420674405, + "grad_norm": 3.4514904022216797, + "learning_rate": 2.8126776577600912e-05, + "loss": 5.1788, + "step": 8195 + }, + { + "epoch": 0.4532891100055279, + "grad_norm": 3.5502679347991943, + "learning_rate": 2.8112563956793636e-05, + "loss": 5.1326, + "step": 8200 + }, + { + "epoch": 0.45356550580431176, + "grad_norm": 3.267252206802368, + "learning_rate": 2.8098351335986357e-05, + "loss": 5.2808, + "step": 8205 + }, + { + "epoch": 0.4538419016030956, + "grad_norm": 3.928968667984009, + "learning_rate": 2.808413871517908e-05, + "loss": 5.262, + "step": 8210 + }, + { + "epoch": 0.4541182974018795, + "grad_norm": 3.202829360961914, + "learning_rate": 2.80699260943718e-05, + "loss": 5.2817, + "step": 8215 + }, + { + "epoch": 0.45439469320066334, + "grad_norm": 3.040146589279175, + "learning_rate": 2.8055713473564525e-05, + "loss": 5.3738, + "step": 8220 + }, + { + "epoch": 0.4546710889994472, + "grad_norm": 2.829195499420166, + "learning_rate": 2.8041500852757253e-05, + "loss": 4.9243, + "step": 8225 + }, + { + "epoch": 0.45494748479823105, + "grad_norm": 2.964588165283203, + "learning_rate": 2.8027288231949973e-05, + "loss": 5.2799, + "step": 8230 + }, + { + "epoch": 0.4552238805970149, + "grad_norm": 3.4491324424743652, + "learning_rate": 2.8013075611142697e-05, + "loss": 5.3802, + "step": 8235 + }, + { + "epoch": 0.45550027639579876, + "grad_norm": 3.446174383163452, + "learning_rate": 2.7998862990335418e-05, + "loss": 5.2512, + "step": 8240 + }, + { + "epoch": 0.4557766721945826, + "grad_norm": 3.0602688789367676, + "learning_rate": 2.7984650369528142e-05, + "loss": 5.281, + "step": 8245 + }, + { + "epoch": 0.4560530679933665, + "grad_norm": 2.8950977325439453, + "learning_rate": 2.797043774872087e-05, + "loss": 4.9311, + "step": 8250 + }, + { + "epoch": 0.45632946379215034, + "grad_norm": 3.147683620452881, + "learning_rate": 2.7956225127913586e-05, + "loss": 5.1642, + "step": 8255 + }, + { + "epoch": 0.4566058595909342, + "grad_norm": 3.45888090133667, + "learning_rate": 2.7942012507106314e-05, + "loss": 5.1064, + "step": 8260 + }, + { + "epoch": 0.45688225538971805, + "grad_norm": 3.435093641281128, + "learning_rate": 2.7927799886299034e-05, + "loss": 5.4801, + "step": 8265 + }, + { + "epoch": 0.4571586511885019, + "grad_norm": 3.746147394180298, + "learning_rate": 2.7913587265491758e-05, + "loss": 5.3686, + "step": 8270 + }, + { + "epoch": 0.4574350469872858, + "grad_norm": 3.0401806831359863, + "learning_rate": 2.7899374644684482e-05, + "loss": 5.0805, + "step": 8275 + }, + { + "epoch": 0.4577114427860697, + "grad_norm": 4.183225154876709, + "learning_rate": 2.7885162023877203e-05, + "loss": 5.5761, + "step": 8280 + }, + { + "epoch": 0.45798783858485353, + "grad_norm": 3.05330491065979, + "learning_rate": 2.787094940306993e-05, + "loss": 5.195, + "step": 8285 + }, + { + "epoch": 0.4582642343836374, + "grad_norm": 3.5704240798950195, + "learning_rate": 2.7856736782262647e-05, + "loss": 5.01, + "step": 8290 + }, + { + "epoch": 0.45854063018242125, + "grad_norm": 3.817938804626465, + "learning_rate": 2.7842524161455375e-05, + "loss": 5.2507, + "step": 8295 + }, + { + "epoch": 0.4588170259812051, + "grad_norm": 2.9435267448425293, + "learning_rate": 2.7828311540648095e-05, + "loss": 5.3353, + "step": 8300 + }, + { + "epoch": 0.45909342177998896, + "grad_norm": 3.9453604221343994, + "learning_rate": 2.781409891984082e-05, + "loss": 5.2214, + "step": 8305 + }, + { + "epoch": 0.4593698175787728, + "grad_norm": 3.7097527980804443, + "learning_rate": 2.7799886299033547e-05, + "loss": 4.9059, + "step": 8310 + }, + { + "epoch": 0.4596462133775567, + "grad_norm": 2.8728573322296143, + "learning_rate": 2.7785673678226264e-05, + "loss": 5.2303, + "step": 8315 + }, + { + "epoch": 0.45992260917634054, + "grad_norm": 2.946532726287842, + "learning_rate": 2.777146105741899e-05, + "loss": 5.1568, + "step": 8320 + }, + { + "epoch": 0.4601990049751244, + "grad_norm": 2.8694427013397217, + "learning_rate": 2.7757248436611712e-05, + "loss": 5.2556, + "step": 8325 + }, + { + "epoch": 0.46047540077390825, + "grad_norm": 2.947298049926758, + "learning_rate": 2.7743035815804436e-05, + "loss": 5.269, + "step": 8330 + }, + { + "epoch": 0.4607517965726921, + "grad_norm": 2.9417827129364014, + "learning_rate": 2.772882319499716e-05, + "loss": 5.0295, + "step": 8335 + }, + { + "epoch": 0.46102819237147596, + "grad_norm": 3.7710001468658447, + "learning_rate": 2.771461057418988e-05, + "loss": 5.403, + "step": 8340 + }, + { + "epoch": 0.4613045881702598, + "grad_norm": 3.4150478839874268, + "learning_rate": 2.7700397953382608e-05, + "loss": 5.2718, + "step": 8345 + }, + { + "epoch": 0.4615809839690437, + "grad_norm": 3.6512367725372314, + "learning_rate": 2.7686185332575325e-05, + "loss": 5.1832, + "step": 8350 + }, + { + "epoch": 0.46185737976782754, + "grad_norm": 3.080050468444824, + "learning_rate": 2.7671972711768052e-05, + "loss": 5.4398, + "step": 8355 + }, + { + "epoch": 0.4621337755666114, + "grad_norm": 3.351170778274536, + "learning_rate": 2.7657760090960776e-05, + "loss": 5.3517, + "step": 8360 + }, + { + "epoch": 0.46241017136539525, + "grad_norm": 3.467395782470703, + "learning_rate": 2.7643547470153497e-05, + "loss": 5.3345, + "step": 8365 + }, + { + "epoch": 0.4626865671641791, + "grad_norm": 3.781153440475464, + "learning_rate": 2.762933484934622e-05, + "loss": 5.2006, + "step": 8370 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 3.0200061798095703, + "learning_rate": 2.761512222853894e-05, + "loss": 4.8137, + "step": 8375 + }, + { + "epoch": 0.4632393587617468, + "grad_norm": 3.3280367851257324, + "learning_rate": 2.760090960773167e-05, + "loss": 5.0456, + "step": 8380 + }, + { + "epoch": 0.4635157545605307, + "grad_norm": 3.1769330501556396, + "learning_rate": 2.7586696986924386e-05, + "loss": 5.3049, + "step": 8385 + }, + { + "epoch": 0.46379215035931454, + "grad_norm": 4.280707836151123, + "learning_rate": 2.7572484366117113e-05, + "loss": 5.2953, + "step": 8390 + }, + { + "epoch": 0.4640685461580984, + "grad_norm": 2.720076322555542, + "learning_rate": 2.7558271745309837e-05, + "loss": 5.269, + "step": 8395 + }, + { + "epoch": 0.46434494195688225, + "grad_norm": 3.4472744464874268, + "learning_rate": 2.7544059124502558e-05, + "loss": 5.1512, + "step": 8400 + }, + { + "epoch": 0.4646213377556661, + "grad_norm": 3.2112371921539307, + "learning_rate": 2.7529846503695282e-05, + "loss": 5.3101, + "step": 8405 + }, + { + "epoch": 0.46489773355444997, + "grad_norm": 3.637249708175659, + "learning_rate": 2.7515633882888003e-05, + "loss": 5.1246, + "step": 8410 + }, + { + "epoch": 0.4651741293532338, + "grad_norm": 2.8876161575317383, + "learning_rate": 2.750142126208073e-05, + "loss": 5.4245, + "step": 8415 + }, + { + "epoch": 0.4654505251520177, + "grad_norm": 3.1725854873657227, + "learning_rate": 2.7487208641273454e-05, + "loss": 5.0848, + "step": 8420 + }, + { + "epoch": 0.46572692095080154, + "grad_norm": 2.7039942741394043, + "learning_rate": 2.7472996020466175e-05, + "loss": 5.2807, + "step": 8425 + }, + { + "epoch": 0.4660033167495854, + "grad_norm": 2.8240089416503906, + "learning_rate": 2.74587833996589e-05, + "loss": 5.4056, + "step": 8430 + }, + { + "epoch": 0.46627971254836925, + "grad_norm": 3.1696321964263916, + "learning_rate": 2.744457077885162e-05, + "loss": 5.4693, + "step": 8435 + }, + { + "epoch": 0.4665561083471531, + "grad_norm": 2.855938196182251, + "learning_rate": 2.7430358158044343e-05, + "loss": 5.1644, + "step": 8440 + }, + { + "epoch": 0.46683250414593697, + "grad_norm": 3.3975577354431152, + "learning_rate": 2.741614553723707e-05, + "loss": 5.1207, + "step": 8445 + }, + { + "epoch": 0.4671088999447208, + "grad_norm": 3.4103002548217773, + "learning_rate": 2.740193291642979e-05, + "loss": 5.2251, + "step": 8450 + }, + { + "epoch": 0.4673852957435047, + "grad_norm": 3.4105570316314697, + "learning_rate": 2.7387720295622515e-05, + "loss": 5.1505, + "step": 8455 + }, + { + "epoch": 0.46766169154228854, + "grad_norm": 3.4730947017669678, + "learning_rate": 2.7373507674815236e-05, + "loss": 5.056, + "step": 8460 + }, + { + "epoch": 0.4679380873410724, + "grad_norm": 3.272139549255371, + "learning_rate": 2.735929505400796e-05, + "loss": 5.3615, + "step": 8465 + }, + { + "epoch": 0.46821448313985625, + "grad_norm": 2.538783073425293, + "learning_rate": 2.7345082433200687e-05, + "loss": 5.3061, + "step": 8470 + }, + { + "epoch": 0.4684908789386401, + "grad_norm": 2.9536359310150146, + "learning_rate": 2.7330869812393408e-05, + "loss": 5.2621, + "step": 8475 + }, + { + "epoch": 0.46876727473742397, + "grad_norm": 4.758022308349609, + "learning_rate": 2.731665719158613e-05, + "loss": 5.0966, + "step": 8480 + }, + { + "epoch": 0.4690436705362078, + "grad_norm": 3.3707218170166016, + "learning_rate": 2.7302444570778852e-05, + "loss": 5.4979, + "step": 8485 + }, + { + "epoch": 0.4693200663349917, + "grad_norm": 2.765289783477783, + "learning_rate": 2.7288231949971576e-05, + "loss": 5.4462, + "step": 8490 + }, + { + "epoch": 0.4695964621337756, + "grad_norm": 2.88114070892334, + "learning_rate": 2.7274019329164297e-05, + "loss": 5.7333, + "step": 8495 + }, + { + "epoch": 0.46987285793255945, + "grad_norm": 3.3201422691345215, + "learning_rate": 2.725980670835702e-05, + "loss": 5.2429, + "step": 8500 + }, + { + "epoch": 0.4701492537313433, + "grad_norm": 3.3583285808563232, + "learning_rate": 2.7245594087549748e-05, + "loss": 5.0037, + "step": 8505 + }, + { + "epoch": 0.47042564953012717, + "grad_norm": 3.485041618347168, + "learning_rate": 2.723138146674247e-05, + "loss": 5.1081, + "step": 8510 + }, + { + "epoch": 0.470702045328911, + "grad_norm": 3.287743330001831, + "learning_rate": 2.7217168845935193e-05, + "loss": 5.1691, + "step": 8515 + }, + { + "epoch": 0.4709784411276949, + "grad_norm": 3.2845332622528076, + "learning_rate": 2.7202956225127913e-05, + "loss": 5.0228, + "step": 8520 + }, + { + "epoch": 0.47125483692647874, + "grad_norm": 3.746253728866577, + "learning_rate": 2.7188743604320637e-05, + "loss": 4.901, + "step": 8525 + }, + { + "epoch": 0.4715312327252626, + "grad_norm": 3.2223970890045166, + "learning_rate": 2.7174530983513365e-05, + "loss": 4.915, + "step": 8530 + }, + { + "epoch": 0.47180762852404645, + "grad_norm": 3.4350712299346924, + "learning_rate": 2.7160318362706082e-05, + "loss": 5.2032, + "step": 8535 + }, + { + "epoch": 0.4720840243228303, + "grad_norm": 3.5283148288726807, + "learning_rate": 2.714610574189881e-05, + "loss": 5.0011, + "step": 8540 + }, + { + "epoch": 0.47236042012161417, + "grad_norm": 3.2964229583740234, + "learning_rate": 2.713189312109153e-05, + "loss": 5.1857, + "step": 8545 + }, + { + "epoch": 0.472636815920398, + "grad_norm": 2.962416172027588, + "learning_rate": 2.7117680500284254e-05, + "loss": 4.9636, + "step": 8550 + }, + { + "epoch": 0.4729132117191819, + "grad_norm": 3.4900479316711426, + "learning_rate": 2.7103467879476978e-05, + "loss": 4.8722, + "step": 8555 + }, + { + "epoch": 0.47318960751796574, + "grad_norm": 3.9601972103118896, + "learning_rate": 2.7089255258669698e-05, + "loss": 5.3924, + "step": 8560 + }, + { + "epoch": 0.4734660033167496, + "grad_norm": 3.298952102661133, + "learning_rate": 2.7075042637862426e-05, + "loss": 4.971, + "step": 8565 + }, + { + "epoch": 0.47374239911553345, + "grad_norm": 3.38555645942688, + "learning_rate": 2.7060830017055143e-05, + "loss": 5.0353, + "step": 8570 + }, + { + "epoch": 0.4740187949143173, + "grad_norm": 2.793022632598877, + "learning_rate": 2.704661739624787e-05, + "loss": 5.0542, + "step": 8575 + }, + { + "epoch": 0.47429519071310117, + "grad_norm": 2.7033803462982178, + "learning_rate": 2.7032404775440594e-05, + "loss": 5.1854, + "step": 8580 + }, + { + "epoch": 0.474571586511885, + "grad_norm": 3.3331148624420166, + "learning_rate": 2.7018192154633315e-05, + "loss": 5.3229, + "step": 8585 + }, + { + "epoch": 0.4748479823106689, + "grad_norm": 3.3170993328094482, + "learning_rate": 2.700397953382604e-05, + "loss": 5.2236, + "step": 8590 + }, + { + "epoch": 0.47512437810945274, + "grad_norm": 3.055724859237671, + "learning_rate": 2.698976691301876e-05, + "loss": 5.447, + "step": 8595 + }, + { + "epoch": 0.4754007739082366, + "grad_norm": 3.705082654953003, + "learning_rate": 2.6975554292211487e-05, + "loss": 5.2754, + "step": 8600 + }, + { + "epoch": 0.47567716970702045, + "grad_norm": 3.6504838466644287, + "learning_rate": 2.6961341671404204e-05, + "loss": 5.1527, + "step": 8605 + }, + { + "epoch": 0.4759535655058043, + "grad_norm": 3.557499647140503, + "learning_rate": 2.694712905059693e-05, + "loss": 5.1805, + "step": 8610 + }, + { + "epoch": 0.47622996130458817, + "grad_norm": 3.1619584560394287, + "learning_rate": 2.6932916429789655e-05, + "loss": 5.0161, + "step": 8615 + }, + { + "epoch": 0.476506357103372, + "grad_norm": 3.1936593055725098, + "learning_rate": 2.6918703808982376e-05, + "loss": 5.0251, + "step": 8620 + }, + { + "epoch": 0.4767827529021559, + "grad_norm": 3.5965256690979004, + "learning_rate": 2.6904491188175103e-05, + "loss": 4.8984, + "step": 8625 + }, + { + "epoch": 0.47705914870093974, + "grad_norm": 3.2955245971679688, + "learning_rate": 2.689027856736782e-05, + "loss": 5.1065, + "step": 8630 + }, + { + "epoch": 0.4773355444997236, + "grad_norm": 2.86582350730896, + "learning_rate": 2.6876065946560548e-05, + "loss": 5.3203, + "step": 8635 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 2.8077163696289062, + "learning_rate": 2.6861853325753272e-05, + "loss": 5.1535, + "step": 8640 + }, + { + "epoch": 0.4778883360972913, + "grad_norm": 3.2445852756500244, + "learning_rate": 2.6847640704945992e-05, + "loss": 5.3598, + "step": 8645 + }, + { + "epoch": 0.47816473189607517, + "grad_norm": 2.902517557144165, + "learning_rate": 2.6833428084138716e-05, + "loss": 5.1782, + "step": 8650 + }, + { + "epoch": 0.478441127694859, + "grad_norm": 3.0056138038635254, + "learning_rate": 2.6819215463331437e-05, + "loss": 5.2646, + "step": 8655 + }, + { + "epoch": 0.4787175234936429, + "grad_norm": 3.432762861251831, + "learning_rate": 2.6805002842524164e-05, + "loss": 5.3692, + "step": 8660 + }, + { + "epoch": 0.47899391929242674, + "grad_norm": 2.9771177768707275, + "learning_rate": 2.6790790221716888e-05, + "loss": 5.2922, + "step": 8665 + }, + { + "epoch": 0.4792703150912106, + "grad_norm": 2.8784945011138916, + "learning_rate": 2.677657760090961e-05, + "loss": 4.8865, + "step": 8670 + }, + { + "epoch": 0.47954671088999445, + "grad_norm": 3.0475375652313232, + "learning_rate": 2.6762364980102333e-05, + "loss": 5.0122, + "step": 8675 + }, + { + "epoch": 0.4798231066887783, + "grad_norm": 3.4154276847839355, + "learning_rate": 2.6748152359295053e-05, + "loss": 5.1384, + "step": 8680 + }, + { + "epoch": 0.48009950248756217, + "grad_norm": 2.841428279876709, + "learning_rate": 2.6733939738487777e-05, + "loss": 5.3496, + "step": 8685 + }, + { + "epoch": 0.480375898286346, + "grad_norm": 3.4203035831451416, + "learning_rate": 2.6719727117680505e-05, + "loss": 5.2612, + "step": 8690 + }, + { + "epoch": 0.4806522940851299, + "grad_norm": 2.9312570095062256, + "learning_rate": 2.6705514496873225e-05, + "loss": 5.065, + "step": 8695 + }, + { + "epoch": 0.48092868988391374, + "grad_norm": 4.5229668617248535, + "learning_rate": 2.669130187606595e-05, + "loss": 5.2577, + "step": 8700 + }, + { + "epoch": 0.4812050856826976, + "grad_norm": 4.566796779632568, + "learning_rate": 2.667708925525867e-05, + "loss": 5.4462, + "step": 8705 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 3.907799005508423, + "learning_rate": 2.6662876634451394e-05, + "loss": 5.3402, + "step": 8710 + }, + { + "epoch": 0.48175787728026537, + "grad_norm": 4.1701836585998535, + "learning_rate": 2.6648664013644114e-05, + "loss": 5.3027, + "step": 8715 + }, + { + "epoch": 0.4820342730790492, + "grad_norm": 2.956747055053711, + "learning_rate": 2.663445139283684e-05, + "loss": 5.2394, + "step": 8720 + }, + { + "epoch": 0.4823106688778331, + "grad_norm": 3.6254959106445312, + "learning_rate": 2.6620238772029566e-05, + "loss": 5.1001, + "step": 8725 + }, + { + "epoch": 0.48258706467661694, + "grad_norm": 3.541015386581421, + "learning_rate": 2.6606026151222286e-05, + "loss": 5.2681, + "step": 8730 + }, + { + "epoch": 0.4828634604754008, + "grad_norm": 2.8555665016174316, + "learning_rate": 2.659181353041501e-05, + "loss": 5.1359, + "step": 8735 + }, + { + "epoch": 0.48313985627418465, + "grad_norm": 3.2647249698638916, + "learning_rate": 2.657760090960773e-05, + "loss": 5.1848, + "step": 8740 + }, + { + "epoch": 0.4834162520729685, + "grad_norm": 3.473285675048828, + "learning_rate": 2.6563388288800455e-05, + "loss": 5.1448, + "step": 8745 + }, + { + "epoch": 0.48369264787175237, + "grad_norm": 4.414984226226807, + "learning_rate": 2.6549175667993182e-05, + "loss": 4.8777, + "step": 8750 + }, + { + "epoch": 0.4839690436705362, + "grad_norm": 3.5118448734283447, + "learning_rate": 2.65349630471859e-05, + "loss": 4.973, + "step": 8755 + }, + { + "epoch": 0.4842454394693201, + "grad_norm": 3.8873729705810547, + "learning_rate": 2.6520750426378627e-05, + "loss": 5.1884, + "step": 8760 + }, + { + "epoch": 0.48452183526810394, + "grad_norm": 2.8551294803619385, + "learning_rate": 2.6506537805571347e-05, + "loss": 5.2593, + "step": 8765 + }, + { + "epoch": 0.4847982310668878, + "grad_norm": 3.043487310409546, + "learning_rate": 2.649232518476407e-05, + "loss": 5.3543, + "step": 8770 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 2.915309429168701, + "learning_rate": 2.64781125639568e-05, + "loss": 5.1256, + "step": 8775 + }, + { + "epoch": 0.4853510226644555, + "grad_norm": 3.0859193801879883, + "learning_rate": 2.6463899943149516e-05, + "loss": 5.1491, + "step": 8780 + }, + { + "epoch": 0.48562741846323937, + "grad_norm": 3.597959518432617, + "learning_rate": 2.6449687322342243e-05, + "loss": 5.2776, + "step": 8785 + }, + { + "epoch": 0.4859038142620232, + "grad_norm": 3.311375617980957, + "learning_rate": 2.6435474701534964e-05, + "loss": 5.0004, + "step": 8790 + }, + { + "epoch": 0.4861802100608071, + "grad_norm": 2.9994888305664062, + "learning_rate": 2.6421262080727688e-05, + "loss": 5.2205, + "step": 8795 + }, + { + "epoch": 0.48645660585959094, + "grad_norm": 3.1175289154052734, + "learning_rate": 2.6407049459920412e-05, + "loss": 4.9917, + "step": 8800 + }, + { + "epoch": 0.4867330016583748, + "grad_norm": 3.356494426727295, + "learning_rate": 2.6392836839113133e-05, + "loss": 5.119, + "step": 8805 + }, + { + "epoch": 0.48700939745715865, + "grad_norm": 2.93296480178833, + "learning_rate": 2.637862421830586e-05, + "loss": 5.1587, + "step": 8810 + }, + { + "epoch": 0.4872857932559425, + "grad_norm": 3.3215854167938232, + "learning_rate": 2.6364411597498577e-05, + "loss": 5.3971, + "step": 8815 + }, + { + "epoch": 0.48756218905472637, + "grad_norm": 2.9889779090881348, + "learning_rate": 2.6350198976691304e-05, + "loss": 5.023, + "step": 8820 + }, + { + "epoch": 0.4878385848535102, + "grad_norm": 4.003651142120361, + "learning_rate": 2.6335986355884025e-05, + "loss": 5.3191, + "step": 8825 + }, + { + "epoch": 0.4881149806522941, + "grad_norm": 4.0643744468688965, + "learning_rate": 2.632177373507675e-05, + "loss": 5.4204, + "step": 8830 + }, + { + "epoch": 0.48839137645107794, + "grad_norm": 3.0099825859069824, + "learning_rate": 2.6307561114269473e-05, + "loss": 5.0147, + "step": 8835 + }, + { + "epoch": 0.4886677722498618, + "grad_norm": 2.961827516555786, + "learning_rate": 2.6293348493462194e-05, + "loss": 4.8997, + "step": 8840 + }, + { + "epoch": 0.48894416804864566, + "grad_norm": 2.8388969898223877, + "learning_rate": 2.627913587265492e-05, + "loss": 5.0859, + "step": 8845 + }, + { + "epoch": 0.4892205638474295, + "grad_norm": 3.230729341506958, + "learning_rate": 2.6264923251847638e-05, + "loss": 5.2606, + "step": 8850 + }, + { + "epoch": 0.48949695964621337, + "grad_norm": 2.873793363571167, + "learning_rate": 2.6250710631040366e-05, + "loss": 5.5914, + "step": 8855 + }, + { + "epoch": 0.4897733554449972, + "grad_norm": 3.9639649391174316, + "learning_rate": 2.623649801023309e-05, + "loss": 4.8028, + "step": 8860 + }, + { + "epoch": 0.4900497512437811, + "grad_norm": 3.220949411392212, + "learning_rate": 2.622228538942581e-05, + "loss": 5.2824, + "step": 8865 + }, + { + "epoch": 0.49032614704256494, + "grad_norm": 4.145904541015625, + "learning_rate": 2.6208072768618534e-05, + "loss": 5.3472, + "step": 8870 + }, + { + "epoch": 0.4906025428413488, + "grad_norm": 3.1538095474243164, + "learning_rate": 2.6193860147811255e-05, + "loss": 5.2069, + "step": 8875 + }, + { + "epoch": 0.49087893864013266, + "grad_norm": 3.5244972705841064, + "learning_rate": 2.6179647527003982e-05, + "loss": 5.048, + "step": 8880 + }, + { + "epoch": 0.4911553344389165, + "grad_norm": 3.0251150131225586, + "learning_rate": 2.6165434906196706e-05, + "loss": 5.1505, + "step": 8885 + }, + { + "epoch": 0.49143173023770037, + "grad_norm": 2.746638298034668, + "learning_rate": 2.6151222285389427e-05, + "loss": 5.2146, + "step": 8890 + }, + { + "epoch": 0.4917081260364842, + "grad_norm": 2.9804468154907227, + "learning_rate": 2.613700966458215e-05, + "loss": 5.1231, + "step": 8895 + }, + { + "epoch": 0.4919845218352681, + "grad_norm": 3.2078375816345215, + "learning_rate": 2.612279704377487e-05, + "loss": 4.9723, + "step": 8900 + }, + { + "epoch": 0.49226091763405194, + "grad_norm": 3.1018552780151367, + "learning_rate": 2.6108584422967595e-05, + "loss": 5.1348, + "step": 8905 + }, + { + "epoch": 0.4925373134328358, + "grad_norm": 2.991896390914917, + "learning_rate": 2.6094371802160323e-05, + "loss": 5.1945, + "step": 8910 + }, + { + "epoch": 0.49281370923161966, + "grad_norm": 3.790259599685669, + "learning_rate": 2.6080159181353043e-05, + "loss": 5.2229, + "step": 8915 + }, + { + "epoch": 0.4930901050304035, + "grad_norm": 2.9007620811462402, + "learning_rate": 2.6065946560545767e-05, + "loss": 4.7525, + "step": 8920 + }, + { + "epoch": 0.49336650082918737, + "grad_norm": 3.5712127685546875, + "learning_rate": 2.6051733939738488e-05, + "loss": 5.2762, + "step": 8925 + }, + { + "epoch": 0.49364289662797123, + "grad_norm": 3.451338291168213, + "learning_rate": 2.6037521318931212e-05, + "loss": 4.9248, + "step": 8930 + }, + { + "epoch": 0.49391929242675514, + "grad_norm": 3.268249988555908, + "learning_rate": 2.6023308698123932e-05, + "loss": 5.2842, + "step": 8935 + }, + { + "epoch": 0.494195688225539, + "grad_norm": 2.8075647354125977, + "learning_rate": 2.600909607731666e-05, + "loss": 5.4364, + "step": 8940 + }, + { + "epoch": 0.49447208402432286, + "grad_norm": 3.8568859100341797, + "learning_rate": 2.5994883456509384e-05, + "loss": 5.1617, + "step": 8945 + }, + { + "epoch": 0.4947484798231067, + "grad_norm": 4.422865867614746, + "learning_rate": 2.5980670835702104e-05, + "loss": 5.2414, + "step": 8950 + }, + { + "epoch": 0.49502487562189057, + "grad_norm": 3.7072761058807373, + "learning_rate": 2.5966458214894828e-05, + "loss": 4.9748, + "step": 8955 + }, + { + "epoch": 0.4953012714206744, + "grad_norm": 3.6415674686431885, + "learning_rate": 2.595224559408755e-05, + "loss": 5.3649, + "step": 8960 + }, + { + "epoch": 0.4955776672194583, + "grad_norm": 3.360414505004883, + "learning_rate": 2.5938032973280273e-05, + "loss": 4.9524, + "step": 8965 + }, + { + "epoch": 0.49585406301824214, + "grad_norm": 3.060049057006836, + "learning_rate": 2.5923820352473e-05, + "loss": 5.1118, + "step": 8970 + }, + { + "epoch": 0.496130458817026, + "grad_norm": 4.066228866577148, + "learning_rate": 2.590960773166572e-05, + "loss": 5.4221, + "step": 8975 + }, + { + "epoch": 0.49640685461580986, + "grad_norm": 3.416016101837158, + "learning_rate": 2.5895395110858445e-05, + "loss": 5.1152, + "step": 8980 + }, + { + "epoch": 0.4966832504145937, + "grad_norm": 3.874455451965332, + "learning_rate": 2.5881182490051165e-05, + "loss": 5.1813, + "step": 8985 + }, + { + "epoch": 0.49695964621337757, + "grad_norm": 3.0348269939422607, + "learning_rate": 2.586696986924389e-05, + "loss": 5.1617, + "step": 8990 + }, + { + "epoch": 0.4972360420121614, + "grad_norm": 3.266130208969116, + "learning_rate": 2.5852757248436617e-05, + "loss": 5.2492, + "step": 8995 + }, + { + "epoch": 0.4975124378109453, + "grad_norm": 3.8973846435546875, + "learning_rate": 2.5838544627629334e-05, + "loss": 5.1134, + "step": 9000 + }, + { + "epoch": 0.49778883360972914, + "grad_norm": 3.322026491165161, + "learning_rate": 2.582433200682206e-05, + "loss": 5.2699, + "step": 9005 + }, + { + "epoch": 0.498065229408513, + "grad_norm": 3.2580060958862305, + "learning_rate": 2.5810119386014782e-05, + "loss": 5.0649, + "step": 9010 + }, + { + "epoch": 0.49834162520729686, + "grad_norm": 3.0472872257232666, + "learning_rate": 2.5795906765207506e-05, + "loss": 5.1669, + "step": 9015 + }, + { + "epoch": 0.4986180210060807, + "grad_norm": 3.3452460765838623, + "learning_rate": 2.578169414440023e-05, + "loss": 5.4016, + "step": 9020 + }, + { + "epoch": 0.49889441680486457, + "grad_norm": 3.5364160537719727, + "learning_rate": 2.576748152359295e-05, + "loss": 5.2059, + "step": 9025 + }, + { + "epoch": 0.49917081260364843, + "grad_norm": 3.439129114151001, + "learning_rate": 2.5753268902785678e-05, + "loss": 5.4018, + "step": 9030 + }, + { + "epoch": 0.4994472084024323, + "grad_norm": 4.166476249694824, + "learning_rate": 2.5739056281978395e-05, + "loss": 4.9097, + "step": 9035 + }, + { + "epoch": 0.49972360420121614, + "grad_norm": 2.5692834854125977, + "learning_rate": 2.5724843661171122e-05, + "loss": 5.2335, + "step": 9040 + }, + { + "epoch": 0.5, + "grad_norm": 3.3796117305755615, + "learning_rate": 2.5710631040363843e-05, + "loss": 5.1816, + "step": 9045 + }, + { + "epoch": 0.5002763957987839, + "grad_norm": 3.1776998043060303, + "learning_rate": 2.5696418419556567e-05, + "loss": 5.1573, + "step": 9050 + }, + { + "epoch": 0.5005527915975677, + "grad_norm": 3.153939723968506, + "learning_rate": 2.5682205798749294e-05, + "loss": 5.2645, + "step": 9055 + }, + { + "epoch": 0.5008291873963516, + "grad_norm": 2.713528633117676, + "learning_rate": 2.566799317794201e-05, + "loss": 4.9227, + "step": 9060 + }, + { + "epoch": 0.5011055831951354, + "grad_norm": 3.3882882595062256, + "learning_rate": 2.565378055713474e-05, + "loss": 5.1723, + "step": 9065 + }, + { + "epoch": 0.5013819789939193, + "grad_norm": 3.6385247707366943, + "learning_rate": 2.5639567936327456e-05, + "loss": 5.0634, + "step": 9070 + }, + { + "epoch": 0.5016583747927031, + "grad_norm": 3.6225831508636475, + "learning_rate": 2.5625355315520183e-05, + "loss": 4.8257, + "step": 9075 + }, + { + "epoch": 0.5019347705914871, + "grad_norm": 3.2693662643432617, + "learning_rate": 2.5611142694712907e-05, + "loss": 5.0228, + "step": 9080 + }, + { + "epoch": 0.5022111663902709, + "grad_norm": 3.6240193843841553, + "learning_rate": 2.5596930073905628e-05, + "loss": 5.0373, + "step": 9085 + }, + { + "epoch": 0.5024875621890548, + "grad_norm": 3.114393472671509, + "learning_rate": 2.5582717453098355e-05, + "loss": 5.4028, + "step": 9090 + }, + { + "epoch": 0.5027639579878386, + "grad_norm": 3.7577624320983887, + "learning_rate": 2.5568504832291073e-05, + "loss": 5.0717, + "step": 9095 + }, + { + "epoch": 0.5030403537866225, + "grad_norm": 3.0983474254608154, + "learning_rate": 2.55542922114838e-05, + "loss": 5.5069, + "step": 9100 + }, + { + "epoch": 0.5033167495854063, + "grad_norm": 3.490330696105957, + "learning_rate": 2.5540079590676524e-05, + "loss": 5.1826, + "step": 9105 + }, + { + "epoch": 0.5035931453841902, + "grad_norm": 4.066411018371582, + "learning_rate": 2.5525866969869244e-05, + "loss": 5.2115, + "step": 9110 + }, + { + "epoch": 0.503869541182974, + "grad_norm": 3.8119757175445557, + "learning_rate": 2.551165434906197e-05, + "loss": 5.2514, + "step": 9115 + }, + { + "epoch": 0.5041459369817579, + "grad_norm": 3.7238693237304688, + "learning_rate": 2.549744172825469e-05, + "loss": 4.7289, + "step": 9120 + }, + { + "epoch": 0.5044223327805417, + "grad_norm": 3.154292583465576, + "learning_rate": 2.5483229107447416e-05, + "loss": 5.1676, + "step": 9125 + }, + { + "epoch": 0.5046987285793256, + "grad_norm": 3.4549925327301025, + "learning_rate": 2.546901648664014e-05, + "loss": 5.1312, + "step": 9130 + }, + { + "epoch": 0.5049751243781094, + "grad_norm": 3.0969717502593994, + "learning_rate": 2.545480386583286e-05, + "loss": 5.2531, + "step": 9135 + }, + { + "epoch": 0.5052515201768933, + "grad_norm": 6.122801780700684, + "learning_rate": 2.5440591245025585e-05, + "loss": 5.0781, + "step": 9140 + }, + { + "epoch": 0.5055279159756771, + "grad_norm": 3.613739013671875, + "learning_rate": 2.5426378624218306e-05, + "loss": 5.2013, + "step": 9145 + }, + { + "epoch": 0.5058043117744611, + "grad_norm": 3.0882883071899414, + "learning_rate": 2.541216600341103e-05, + "loss": 5.1607, + "step": 9150 + }, + { + "epoch": 0.5060807075732449, + "grad_norm": 3.7881696224212646, + "learning_rate": 2.539795338260375e-05, + "loss": 5.0951, + "step": 9155 + }, + { + "epoch": 0.5063571033720288, + "grad_norm": 3.161926507949829, + "learning_rate": 2.5383740761796477e-05, + "loss": 5.1998, + "step": 9160 + }, + { + "epoch": 0.5066334991708126, + "grad_norm": 3.223527193069458, + "learning_rate": 2.53695281409892e-05, + "loss": 5.2554, + "step": 9165 + }, + { + "epoch": 0.5069098949695965, + "grad_norm": 3.8171050548553467, + "learning_rate": 2.5355315520181922e-05, + "loss": 5.2001, + "step": 9170 + }, + { + "epoch": 0.5071862907683803, + "grad_norm": 2.8175442218780518, + "learning_rate": 2.5341102899374646e-05, + "loss": 5.1805, + "step": 9175 + }, + { + "epoch": 0.5074626865671642, + "grad_norm": 3.0957937240600586, + "learning_rate": 2.5326890278567367e-05, + "loss": 5.4076, + "step": 9180 + }, + { + "epoch": 0.507739082365948, + "grad_norm": 2.975939989089966, + "learning_rate": 2.531267765776009e-05, + "loss": 5.2884, + "step": 9185 + }, + { + "epoch": 0.5080154781647319, + "grad_norm": 4.01318359375, + "learning_rate": 2.5298465036952818e-05, + "loss": 5.4881, + "step": 9190 + }, + { + "epoch": 0.5082918739635157, + "grad_norm": 2.6178340911865234, + "learning_rate": 2.528425241614554e-05, + "loss": 5.0587, + "step": 9195 + }, + { + "epoch": 0.5085682697622996, + "grad_norm": 2.9565234184265137, + "learning_rate": 2.5270039795338263e-05, + "loss": 5.16, + "step": 9200 + }, + { + "epoch": 0.5088446655610834, + "grad_norm": 3.489684820175171, + "learning_rate": 2.5255827174530983e-05, + "loss": 5.0378, + "step": 9205 + }, + { + "epoch": 0.5091210613598673, + "grad_norm": 2.891176700592041, + "learning_rate": 2.5241614553723707e-05, + "loss": 5.0495, + "step": 9210 + }, + { + "epoch": 0.5093974571586511, + "grad_norm": 3.31199312210083, + "learning_rate": 2.5227401932916434e-05, + "loss": 5.2217, + "step": 9215 + }, + { + "epoch": 0.5096738529574351, + "grad_norm": 2.859729290008545, + "learning_rate": 2.521318931210915e-05, + "loss": 5.2588, + "step": 9220 + }, + { + "epoch": 0.5099502487562189, + "grad_norm": 3.0680506229400635, + "learning_rate": 2.519897669130188e-05, + "loss": 4.9947, + "step": 9225 + }, + { + "epoch": 0.5102266445550028, + "grad_norm": 3.4097602367401123, + "learning_rate": 2.51847640704946e-05, + "loss": 5.156, + "step": 9230 + }, + { + "epoch": 0.5105030403537866, + "grad_norm": 3.063750743865967, + "learning_rate": 2.5170551449687324e-05, + "loss": 5.1035, + "step": 9235 + }, + { + "epoch": 0.5107794361525705, + "grad_norm": 3.8064587116241455, + "learning_rate": 2.515633882888005e-05, + "loss": 5.3786, + "step": 9240 + }, + { + "epoch": 0.5110558319513543, + "grad_norm": 2.8018996715545654, + "learning_rate": 2.5142126208072768e-05, + "loss": 5.0211, + "step": 9245 + }, + { + "epoch": 0.5113322277501382, + "grad_norm": 2.8001601696014404, + "learning_rate": 2.5127913587265496e-05, + "loss": 5.3029, + "step": 9250 + }, + { + "epoch": 0.511608623548922, + "grad_norm": 3.171142339706421, + "learning_rate": 2.5113700966458216e-05, + "loss": 4.9662, + "step": 9255 + }, + { + "epoch": 0.5118850193477059, + "grad_norm": 2.9701507091522217, + "learning_rate": 2.509948834565094e-05, + "loss": 4.8072, + "step": 9260 + }, + { + "epoch": 0.5121614151464897, + "grad_norm": 3.0788140296936035, + "learning_rate": 2.508527572484366e-05, + "loss": 4.9462, + "step": 9265 + }, + { + "epoch": 0.5124378109452736, + "grad_norm": 3.2449519634246826, + "learning_rate": 2.5071063104036385e-05, + "loss": 5.1457, + "step": 9270 + }, + { + "epoch": 0.5127142067440575, + "grad_norm": 3.2045812606811523, + "learning_rate": 2.5056850483229112e-05, + "loss": 5.3124, + "step": 9275 + }, + { + "epoch": 0.5129906025428413, + "grad_norm": 3.2076756954193115, + "learning_rate": 2.504263786242183e-05, + "loss": 5.187, + "step": 9280 + }, + { + "epoch": 0.5132669983416253, + "grad_norm": 3.471508264541626, + "learning_rate": 2.5028425241614557e-05, + "loss": 5.3885, + "step": 9285 + }, + { + "epoch": 0.5135433941404091, + "grad_norm": 3.8236048221588135, + "learning_rate": 2.5014212620807277e-05, + "loss": 5.1868, + "step": 9290 + }, + { + "epoch": 0.513819789939193, + "grad_norm": 3.278804302215576, + "learning_rate": 2.5e-05, + "loss": 5.2836, + "step": 9295 + }, + { + "epoch": 0.5140961857379768, + "grad_norm": 3.5111825466156006, + "learning_rate": 2.4985787379192725e-05, + "loss": 5.3669, + "step": 9300 + }, + { + "epoch": 0.5143725815367607, + "grad_norm": 3.7894766330718994, + "learning_rate": 2.4971574758385446e-05, + "loss": 4.9516, + "step": 9305 + }, + { + "epoch": 0.5146489773355445, + "grad_norm": 2.5481996536254883, + "learning_rate": 2.4957362137578173e-05, + "loss": 5.1002, + "step": 9310 + }, + { + "epoch": 0.5149253731343284, + "grad_norm": 3.0527219772338867, + "learning_rate": 2.4943149516770894e-05, + "loss": 5.3666, + "step": 9315 + }, + { + "epoch": 0.5152017689331122, + "grad_norm": 3.6825625896453857, + "learning_rate": 2.4928936895963618e-05, + "loss": 5.0798, + "step": 9320 + }, + { + "epoch": 0.5154781647318961, + "grad_norm": 3.366027593612671, + "learning_rate": 2.4914724275156338e-05, + "loss": 5.2866, + "step": 9325 + }, + { + "epoch": 0.5157545605306799, + "grad_norm": 3.307328462600708, + "learning_rate": 2.4900511654349062e-05, + "loss": 4.9818, + "step": 9330 + }, + { + "epoch": 0.5160309563294638, + "grad_norm": 3.386768102645874, + "learning_rate": 2.4886299033541786e-05, + "loss": 5.1491, + "step": 9335 + }, + { + "epoch": 0.5163073521282476, + "grad_norm": 3.7182559967041016, + "learning_rate": 2.487208641273451e-05, + "loss": 5.2406, + "step": 9340 + }, + { + "epoch": 0.5165837479270315, + "grad_norm": 4.657354354858398, + "learning_rate": 2.4857873791927234e-05, + "loss": 4.9486, + "step": 9345 + }, + { + "epoch": 0.5168601437258153, + "grad_norm": 3.5175914764404297, + "learning_rate": 2.4843661171119955e-05, + "loss": 5.1466, + "step": 9350 + }, + { + "epoch": 0.5171365395245993, + "grad_norm": 3.5786592960357666, + "learning_rate": 2.482944855031268e-05, + "loss": 5.1679, + "step": 9355 + }, + { + "epoch": 0.5174129353233831, + "grad_norm": 2.8684964179992676, + "learning_rate": 2.48152359295054e-05, + "loss": 4.9781, + "step": 9360 + }, + { + "epoch": 0.517689331122167, + "grad_norm": 3.130319356918335, + "learning_rate": 2.4801023308698127e-05, + "loss": 5.1694, + "step": 9365 + }, + { + "epoch": 0.5179657269209508, + "grad_norm": 3.6741483211517334, + "learning_rate": 2.478681068789085e-05, + "loss": 4.9329, + "step": 9370 + }, + { + "epoch": 0.5182421227197347, + "grad_norm": 3.3555543422698975, + "learning_rate": 2.477259806708357e-05, + "loss": 5.0482, + "step": 9375 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 3.1766357421875, + "learning_rate": 2.4758385446276295e-05, + "loss": 5.0693, + "step": 9380 + }, + { + "epoch": 0.5187949143173024, + "grad_norm": 3.1514878273010254, + "learning_rate": 2.4744172825469016e-05, + "loss": 4.7144, + "step": 9385 + }, + { + "epoch": 0.5190713101160862, + "grad_norm": 3.2456977367401123, + "learning_rate": 2.472996020466174e-05, + "loss": 4.9498, + "step": 9390 + }, + { + "epoch": 0.5193477059148701, + "grad_norm": 3.266662836074829, + "learning_rate": 2.4715747583854464e-05, + "loss": 5.3611, + "step": 9395 + }, + { + "epoch": 0.5196241017136539, + "grad_norm": 2.9359843730926514, + "learning_rate": 2.4701534963047188e-05, + "loss": 5.0735, + "step": 9400 + }, + { + "epoch": 0.5199004975124378, + "grad_norm": 2.956838369369507, + "learning_rate": 2.4687322342239912e-05, + "loss": 5.2266, + "step": 9405 + }, + { + "epoch": 0.5201768933112216, + "grad_norm": 3.2221481800079346, + "learning_rate": 2.4673109721432632e-05, + "loss": 4.8549, + "step": 9410 + }, + { + "epoch": 0.5204532891100055, + "grad_norm": 4.0131635665893555, + "learning_rate": 2.4658897100625356e-05, + "loss": 5.0946, + "step": 9415 + }, + { + "epoch": 0.5207296849087893, + "grad_norm": 3.1099162101745605, + "learning_rate": 2.464468447981808e-05, + "loss": 5.4147, + "step": 9420 + }, + { + "epoch": 0.5210060807075733, + "grad_norm": 2.945998430252075, + "learning_rate": 2.4630471859010804e-05, + "loss": 5.0275, + "step": 9425 + }, + { + "epoch": 0.5212824765063571, + "grad_norm": 3.6736671924591064, + "learning_rate": 2.4616259238203525e-05, + "loss": 5.0205, + "step": 9430 + }, + { + "epoch": 0.521558872305141, + "grad_norm": 3.3827338218688965, + "learning_rate": 2.460204661739625e-05, + "loss": 5.2464, + "step": 9435 + }, + { + "epoch": 0.5218352681039248, + "grad_norm": 3.2636919021606445, + "learning_rate": 2.4587833996588973e-05, + "loss": 4.7875, + "step": 9440 + }, + { + "epoch": 0.5221116639027087, + "grad_norm": 2.948478937149048, + "learning_rate": 2.4573621375781693e-05, + "loss": 5.0072, + "step": 9445 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 3.093000888824463, + "learning_rate": 2.455940875497442e-05, + "loss": 4.9569, + "step": 9450 + }, + { + "epoch": 0.5226644555002764, + "grad_norm": 3.29803466796875, + "learning_rate": 2.454519613416714e-05, + "loss": 5.0124, + "step": 9455 + }, + { + "epoch": 0.5229408512990602, + "grad_norm": 2.948936700820923, + "learning_rate": 2.4530983513359865e-05, + "loss": 5.0278, + "step": 9460 + }, + { + "epoch": 0.5232172470978441, + "grad_norm": 3.211668014526367, + "learning_rate": 2.4516770892552586e-05, + "loss": 5.1416, + "step": 9465 + }, + { + "epoch": 0.5234936428966279, + "grad_norm": 4.215364456176758, + "learning_rate": 2.450255827174531e-05, + "loss": 4.8566, + "step": 9470 + }, + { + "epoch": 0.5237700386954118, + "grad_norm": 3.8499324321746826, + "learning_rate": 2.4488345650938034e-05, + "loss": 4.9764, + "step": 9475 + }, + { + "epoch": 0.5240464344941956, + "grad_norm": 3.159186840057373, + "learning_rate": 2.4474133030130758e-05, + "loss": 5.1786, + "step": 9480 + }, + { + "epoch": 0.5243228302929795, + "grad_norm": 3.4853217601776123, + "learning_rate": 2.4459920409323482e-05, + "loss": 5.2136, + "step": 9485 + }, + { + "epoch": 0.5245992260917635, + "grad_norm": 3.228022813796997, + "learning_rate": 2.4445707788516203e-05, + "loss": 5.498, + "step": 9490 + }, + { + "epoch": 0.5248756218905473, + "grad_norm": 2.8631765842437744, + "learning_rate": 2.4431495167708926e-05, + "loss": 5.1078, + "step": 9495 + }, + { + "epoch": 0.5251520176893312, + "grad_norm": 3.351837396621704, + "learning_rate": 2.4417282546901647e-05, + "loss": 5.2389, + "step": 9500 + }, + { + "epoch": 0.525428413488115, + "grad_norm": 4.084488391876221, + "learning_rate": 2.4403069926094374e-05, + "loss": 5.1804, + "step": 9505 + }, + { + "epoch": 0.5257048092868989, + "grad_norm": 4.037084579467773, + "learning_rate": 2.4388857305287095e-05, + "loss": 5.162, + "step": 9510 + }, + { + "epoch": 0.5259812050856827, + "grad_norm": 3.0944149494171143, + "learning_rate": 2.437464468447982e-05, + "loss": 5.0158, + "step": 9515 + }, + { + "epoch": 0.5262576008844666, + "grad_norm": 3.6755268573760986, + "learning_rate": 2.4360432063672543e-05, + "loss": 5.1988, + "step": 9520 + }, + { + "epoch": 0.5265339966832504, + "grad_norm": 3.344731330871582, + "learning_rate": 2.4346219442865264e-05, + "loss": 5.2125, + "step": 9525 + }, + { + "epoch": 0.5268103924820343, + "grad_norm": 4.068390369415283, + "learning_rate": 2.433200682205799e-05, + "loss": 5.4192, + "step": 9530 + }, + { + "epoch": 0.5270867882808181, + "grad_norm": 2.7342958450317383, + "learning_rate": 2.431779420125071e-05, + "loss": 5.1935, + "step": 9535 + }, + { + "epoch": 0.527363184079602, + "grad_norm": 3.347498893737793, + "learning_rate": 2.4303581580443436e-05, + "loss": 4.9082, + "step": 9540 + }, + { + "epoch": 0.5276395798783858, + "grad_norm": 3.082366466522217, + "learning_rate": 2.428936895963616e-05, + "loss": 5.1855, + "step": 9545 + }, + { + "epoch": 0.5279159756771697, + "grad_norm": 3.0448951721191406, + "learning_rate": 2.427515633882888e-05, + "loss": 5.3123, + "step": 9550 + }, + { + "epoch": 0.5281923714759535, + "grad_norm": 3.1284632682800293, + "learning_rate": 2.4260943718021604e-05, + "loss": 5.0266, + "step": 9555 + }, + { + "epoch": 0.5284687672747375, + "grad_norm": 3.3635149002075195, + "learning_rate": 2.4246731097214328e-05, + "loss": 5.1604, + "step": 9560 + }, + { + "epoch": 0.5287451630735213, + "grad_norm": 4.114374160766602, + "learning_rate": 2.4232518476407052e-05, + "loss": 5.027, + "step": 9565 + }, + { + "epoch": 0.5290215588723052, + "grad_norm": 2.970109701156616, + "learning_rate": 2.4218305855599773e-05, + "loss": 5.0342, + "step": 9570 + }, + { + "epoch": 0.529297954671089, + "grad_norm": 3.081437110900879, + "learning_rate": 2.4204093234792497e-05, + "loss": 4.9838, + "step": 9575 + }, + { + "epoch": 0.5295743504698729, + "grad_norm": 3.3371689319610596, + "learning_rate": 2.418988061398522e-05, + "loss": 5.0638, + "step": 9580 + }, + { + "epoch": 0.5298507462686567, + "grad_norm": 3.4338042736053467, + "learning_rate": 2.4175667993177945e-05, + "loss": 5.2537, + "step": 9585 + }, + { + "epoch": 0.5301271420674406, + "grad_norm": 2.9125192165374756, + "learning_rate": 2.416145537237067e-05, + "loss": 5.1985, + "step": 9590 + }, + { + "epoch": 0.5304035378662244, + "grad_norm": 2.703503131866455, + "learning_rate": 2.414724275156339e-05, + "loss": 5.0267, + "step": 9595 + }, + { + "epoch": 0.5306799336650083, + "grad_norm": 2.609586238861084, + "learning_rate": 2.4133030130756113e-05, + "loss": 5.181, + "step": 9600 + }, + { + "epoch": 0.5309563294637921, + "grad_norm": 3.443605661392212, + "learning_rate": 2.4118817509948834e-05, + "loss": 5.0051, + "step": 9605 + }, + { + "epoch": 0.531232725262576, + "grad_norm": 3.419793128967285, + "learning_rate": 2.4104604889141558e-05, + "loss": 5.0689, + "step": 9610 + }, + { + "epoch": 0.5315091210613598, + "grad_norm": 3.239157199859619, + "learning_rate": 2.409039226833428e-05, + "loss": 5.023, + "step": 9615 + }, + { + "epoch": 0.5317855168601437, + "grad_norm": 3.928612470626831, + "learning_rate": 2.4076179647527006e-05, + "loss": 5.0672, + "step": 9620 + }, + { + "epoch": 0.5320619126589275, + "grad_norm": 3.0533933639526367, + "learning_rate": 2.406196702671973e-05, + "loss": 4.8787, + "step": 9625 + }, + { + "epoch": 0.5323383084577115, + "grad_norm": 2.9441888332366943, + "learning_rate": 2.404775440591245e-05, + "loss": 4.7492, + "step": 9630 + }, + { + "epoch": 0.5326147042564953, + "grad_norm": 3.513918399810791, + "learning_rate": 2.4033541785105174e-05, + "loss": 5.1696, + "step": 9635 + }, + { + "epoch": 0.5328911000552792, + "grad_norm": 3.191592216491699, + "learning_rate": 2.4019329164297898e-05, + "loss": 5.0818, + "step": 9640 + }, + { + "epoch": 0.533167495854063, + "grad_norm": 3.9577038288116455, + "learning_rate": 2.4005116543490622e-05, + "loss": 5.0411, + "step": 9645 + }, + { + "epoch": 0.5334438916528469, + "grad_norm": 3.297288656234741, + "learning_rate": 2.3990903922683343e-05, + "loss": 5.009, + "step": 9650 + }, + { + "epoch": 0.5337202874516307, + "grad_norm": 2.9676246643066406, + "learning_rate": 2.3976691301876067e-05, + "loss": 5.2229, + "step": 9655 + }, + { + "epoch": 0.5339966832504146, + "grad_norm": 3.918351650238037, + "learning_rate": 2.396247868106879e-05, + "loss": 5.3025, + "step": 9660 + }, + { + "epoch": 0.5342730790491984, + "grad_norm": 3.9034459590911865, + "learning_rate": 2.394826606026151e-05, + "loss": 4.985, + "step": 9665 + }, + { + "epoch": 0.5345494748479823, + "grad_norm": 3.6231179237365723, + "learning_rate": 2.393405343945424e-05, + "loss": 4.8507, + "step": 9670 + }, + { + "epoch": 0.5348258706467661, + "grad_norm": 2.965517044067383, + "learning_rate": 2.391984081864696e-05, + "loss": 5.2807, + "step": 9675 + }, + { + "epoch": 0.53510226644555, + "grad_norm": 3.329669713973999, + "learning_rate": 2.3905628197839683e-05, + "loss": 5.1108, + "step": 9680 + }, + { + "epoch": 0.5353786622443338, + "grad_norm": 3.134333372116089, + "learning_rate": 2.3891415577032407e-05, + "loss": 5.1821, + "step": 9685 + }, + { + "epoch": 0.5356550580431177, + "grad_norm": 2.873486042022705, + "learning_rate": 2.3877202956225128e-05, + "loss": 5.0137, + "step": 9690 + }, + { + "epoch": 0.5359314538419016, + "grad_norm": 3.7581984996795654, + "learning_rate": 2.3862990335417855e-05, + "loss": 5.1051, + "step": 9695 + }, + { + "epoch": 0.5362078496406855, + "grad_norm": 4.077391147613525, + "learning_rate": 2.3848777714610576e-05, + "loss": 5.2823, + "step": 9700 + }, + { + "epoch": 0.5364842454394693, + "grad_norm": 3.292609930038452, + "learning_rate": 2.38345650938033e-05, + "loss": 5.0104, + "step": 9705 + }, + { + "epoch": 0.5367606412382532, + "grad_norm": 3.4207916259765625, + "learning_rate": 2.382035247299602e-05, + "loss": 5.2349, + "step": 9710 + }, + { + "epoch": 0.5370370370370371, + "grad_norm": 3.6559712886810303, + "learning_rate": 2.3806139852188744e-05, + "loss": 5.3262, + "step": 9715 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 2.987576484680176, + "learning_rate": 2.3791927231381468e-05, + "loss": 5.1957, + "step": 9720 + }, + { + "epoch": 0.5375898286346048, + "grad_norm": 3.834031105041504, + "learning_rate": 2.3777714610574192e-05, + "loss": 5.2105, + "step": 9725 + }, + { + "epoch": 0.5378662244333886, + "grad_norm": 2.903601884841919, + "learning_rate": 2.3763501989766916e-05, + "loss": 5.0302, + "step": 9730 + }, + { + "epoch": 0.5381426202321725, + "grad_norm": 4.139296531677246, + "learning_rate": 2.3749289368959637e-05, + "loss": 5.1185, + "step": 9735 + }, + { + "epoch": 0.5384190160309563, + "grad_norm": 3.0780575275421143, + "learning_rate": 2.373507674815236e-05, + "loss": 5.0262, + "step": 9740 + }, + { + "epoch": 0.5386954118297402, + "grad_norm": 2.722825288772583, + "learning_rate": 2.372086412734508e-05, + "loss": 4.9768, + "step": 9745 + }, + { + "epoch": 0.538971807628524, + "grad_norm": 2.6656720638275146, + "learning_rate": 2.370665150653781e-05, + "loss": 5.0315, + "step": 9750 + }, + { + "epoch": 0.539248203427308, + "grad_norm": 3.2058663368225098, + "learning_rate": 2.369243888573053e-05, + "loss": 4.9424, + "step": 9755 + }, + { + "epoch": 0.5395245992260918, + "grad_norm": 3.5652551651000977, + "learning_rate": 2.3678226264923253e-05, + "loss": 4.9202, + "step": 9760 + }, + { + "epoch": 0.5398009950248757, + "grad_norm": 4.415959358215332, + "learning_rate": 2.3664013644115977e-05, + "loss": 5.3065, + "step": 9765 + }, + { + "epoch": 0.5400773908236595, + "grad_norm": 3.7161080837249756, + "learning_rate": 2.3649801023308698e-05, + "loss": 5.3898, + "step": 9770 + }, + { + "epoch": 0.5403537866224434, + "grad_norm": 4.07873010635376, + "learning_rate": 2.3635588402501422e-05, + "loss": 5.0935, + "step": 9775 + }, + { + "epoch": 0.5406301824212272, + "grad_norm": 4.138256549835205, + "learning_rate": 2.3621375781694146e-05, + "loss": 4.8865, + "step": 9780 + }, + { + "epoch": 0.5409065782200111, + "grad_norm": 3.804762601852417, + "learning_rate": 2.360716316088687e-05, + "loss": 5.1764, + "step": 9785 + }, + { + "epoch": 0.5411829740187949, + "grad_norm": 3.2528018951416016, + "learning_rate": 2.359295054007959e-05, + "loss": 5.3817, + "step": 9790 + }, + { + "epoch": 0.5414593698175788, + "grad_norm": 3.966082811355591, + "learning_rate": 2.3578737919272314e-05, + "loss": 5.1415, + "step": 9795 + }, + { + "epoch": 0.5417357656163626, + "grad_norm": 3.4930150508880615, + "learning_rate": 2.356452529846504e-05, + "loss": 5.2255, + "step": 9800 + }, + { + "epoch": 0.5420121614151465, + "grad_norm": 3.7764456272125244, + "learning_rate": 2.355031267765776e-05, + "loss": 5.3015, + "step": 9805 + }, + { + "epoch": 0.5422885572139303, + "grad_norm": 3.298708200454712, + "learning_rate": 2.3536100056850486e-05, + "loss": 4.8978, + "step": 9810 + }, + { + "epoch": 0.5425649530127142, + "grad_norm": 3.262017011642456, + "learning_rate": 2.3521887436043207e-05, + "loss": 5.2939, + "step": 9815 + }, + { + "epoch": 0.542841348811498, + "grad_norm": 3.5957589149475098, + "learning_rate": 2.350767481523593e-05, + "loss": 5.3993, + "step": 9820 + }, + { + "epoch": 0.543117744610282, + "grad_norm": 3.324526071548462, + "learning_rate": 2.349346219442865e-05, + "loss": 4.8801, + "step": 9825 + }, + { + "epoch": 0.5433941404090658, + "grad_norm": 2.6901962757110596, + "learning_rate": 2.3479249573621375e-05, + "loss": 4.8502, + "step": 9830 + }, + { + "epoch": 0.5436705362078497, + "grad_norm": 3.8433539867401123, + "learning_rate": 2.3465036952814103e-05, + "loss": 5.1071, + "step": 9835 + }, + { + "epoch": 0.5439469320066335, + "grad_norm": 3.26355242729187, + "learning_rate": 2.3450824332006823e-05, + "loss": 5.0674, + "step": 9840 + }, + { + "epoch": 0.5442233278054174, + "grad_norm": 3.1263980865478516, + "learning_rate": 2.3436611711199547e-05, + "loss": 5.2538, + "step": 9845 + }, + { + "epoch": 0.5444997236042012, + "grad_norm": 3.164491891860962, + "learning_rate": 2.3422399090392268e-05, + "loss": 5.1808, + "step": 9850 + }, + { + "epoch": 0.5447761194029851, + "grad_norm": 2.887119770050049, + "learning_rate": 2.3408186469584992e-05, + "loss": 4.8469, + "step": 9855 + }, + { + "epoch": 0.5450525152017689, + "grad_norm": 3.320695161819458, + "learning_rate": 2.3393973848777716e-05, + "loss": 4.9163, + "step": 9860 + }, + { + "epoch": 0.5453289110005528, + "grad_norm": 3.467114210128784, + "learning_rate": 2.337976122797044e-05, + "loss": 5.0402, + "step": 9865 + }, + { + "epoch": 0.5456053067993366, + "grad_norm": 3.7807321548461914, + "learning_rate": 2.3365548607163164e-05, + "loss": 5.4195, + "step": 9870 + }, + { + "epoch": 0.5458817025981205, + "grad_norm": 3.7920944690704346, + "learning_rate": 2.3351335986355885e-05, + "loss": 4.8416, + "step": 9875 + }, + { + "epoch": 0.5461580983969043, + "grad_norm": 3.7439687252044678, + "learning_rate": 2.333712336554861e-05, + "loss": 5.0364, + "step": 9880 + }, + { + "epoch": 0.5464344941956882, + "grad_norm": 3.955336093902588, + "learning_rate": 2.332291074474133e-05, + "loss": 5.5306, + "step": 9885 + }, + { + "epoch": 0.546710889994472, + "grad_norm": 3.5154666900634766, + "learning_rate": 2.3308698123934056e-05, + "loss": 5.1119, + "step": 9890 + }, + { + "epoch": 0.546987285793256, + "grad_norm": 3.323625326156616, + "learning_rate": 2.3294485503126777e-05, + "loss": 5.1608, + "step": 9895 + }, + { + "epoch": 0.5472636815920398, + "grad_norm": 2.7888081073760986, + "learning_rate": 2.32802728823195e-05, + "loss": 5.173, + "step": 9900 + }, + { + "epoch": 0.5475400773908237, + "grad_norm": 3.7210378646850586, + "learning_rate": 2.3266060261512225e-05, + "loss": 5.3936, + "step": 9905 + }, + { + "epoch": 0.5478164731896075, + "grad_norm": 3.4708786010742188, + "learning_rate": 2.3251847640704946e-05, + "loss": 5.1886, + "step": 9910 + }, + { + "epoch": 0.5480928689883914, + "grad_norm": 3.851804494857788, + "learning_rate": 2.323763501989767e-05, + "loss": 5.0378, + "step": 9915 + }, + { + "epoch": 0.5483692647871752, + "grad_norm": 3.7334494590759277, + "learning_rate": 2.3223422399090394e-05, + "loss": 4.8619, + "step": 9920 + }, + { + "epoch": 0.5486456605859591, + "grad_norm": 3.100353479385376, + "learning_rate": 2.3209209778283118e-05, + "loss": 4.8678, + "step": 9925 + }, + { + "epoch": 0.548922056384743, + "grad_norm": 3.700317621231079, + "learning_rate": 2.3194997157475838e-05, + "loss": 5.2288, + "step": 9930 + }, + { + "epoch": 0.5491984521835268, + "grad_norm": 2.94370174407959, + "learning_rate": 2.3180784536668562e-05, + "loss": 5.097, + "step": 9935 + }, + { + "epoch": 0.5494748479823107, + "grad_norm": 2.9802255630493164, + "learning_rate": 2.3166571915861286e-05, + "loss": 5.1708, + "step": 9940 + }, + { + "epoch": 0.5497512437810945, + "grad_norm": 3.3415024280548096, + "learning_rate": 2.315235929505401e-05, + "loss": 5.2635, + "step": 9945 + }, + { + "epoch": 0.5500276395798784, + "grad_norm": 2.9417765140533447, + "learning_rate": 2.3138146674246734e-05, + "loss": 5.2387, + "step": 9950 + }, + { + "epoch": 0.5503040353786622, + "grad_norm": 3.112800359725952, + "learning_rate": 2.3123934053439455e-05, + "loss": 5.0443, + "step": 9955 + }, + { + "epoch": 0.5505804311774462, + "grad_norm": 2.728961944580078, + "learning_rate": 2.310972143263218e-05, + "loss": 5.0209, + "step": 9960 + }, + { + "epoch": 0.55085682697623, + "grad_norm": 3.2710981369018555, + "learning_rate": 2.30955088118249e-05, + "loss": 5.2521, + "step": 9965 + }, + { + "epoch": 0.5511332227750139, + "grad_norm": 3.748812675476074, + "learning_rate": 2.3081296191017623e-05, + "loss": 5.3927, + "step": 9970 + }, + { + "epoch": 0.5514096185737977, + "grad_norm": 3.3356282711029053, + "learning_rate": 2.3067083570210347e-05, + "loss": 5.1706, + "step": 9975 + }, + { + "epoch": 0.5516860143725816, + "grad_norm": 2.5249369144439697, + "learning_rate": 2.305287094940307e-05, + "loss": 5.0159, + "step": 9980 + }, + { + "epoch": 0.5519624101713654, + "grad_norm": 3.26520037651062, + "learning_rate": 2.3038658328595795e-05, + "loss": 5.1818, + "step": 9985 + }, + { + "epoch": 0.5522388059701493, + "grad_norm": 3.965653896331787, + "learning_rate": 2.3024445707788516e-05, + "loss": 4.8768, + "step": 9990 + }, + { + "epoch": 0.5525152017689331, + "grad_norm": 4.425838947296143, + "learning_rate": 2.301023308698124e-05, + "loss": 5.2812, + "step": 9995 + }, + { + "epoch": 0.552791597567717, + "grad_norm": 3.3867270946502686, + "learning_rate": 2.2996020466173964e-05, + "loss": 5.1826, + "step": 10000 + }, + { + "epoch": 0.5530679933665008, + "grad_norm": 3.590095043182373, + "learning_rate": 2.2981807845366688e-05, + "loss": 5.2498, + "step": 10005 + }, + { + "epoch": 0.5533443891652847, + "grad_norm": 3.230135440826416, + "learning_rate": 2.296759522455941e-05, + "loss": 5.0662, + "step": 10010 + }, + { + "epoch": 0.5536207849640685, + "grad_norm": 3.142805337905884, + "learning_rate": 2.2953382603752132e-05, + "loss": 5.1993, + "step": 10015 + }, + { + "epoch": 0.5538971807628524, + "grad_norm": 4.192418098449707, + "learning_rate": 2.2939169982944856e-05, + "loss": 5.1139, + "step": 10020 + }, + { + "epoch": 0.5541735765616362, + "grad_norm": 3.831026315689087, + "learning_rate": 2.2924957362137577e-05, + "loss": 5.1316, + "step": 10025 + }, + { + "epoch": 0.5544499723604202, + "grad_norm": 3.54022216796875, + "learning_rate": 2.2910744741330304e-05, + "loss": 5.0901, + "step": 10030 + }, + { + "epoch": 0.554726368159204, + "grad_norm": 4.166489601135254, + "learning_rate": 2.2896532120523025e-05, + "loss": 5.1592, + "step": 10035 + }, + { + "epoch": 0.5550027639579879, + "grad_norm": 3.5303165912628174, + "learning_rate": 2.288231949971575e-05, + "loss": 4.9975, + "step": 10040 + }, + { + "epoch": 0.5552791597567717, + "grad_norm": 3.065279245376587, + "learning_rate": 2.2868106878908473e-05, + "loss": 5.1, + "step": 10045 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 4.913681507110596, + "learning_rate": 2.2853894258101193e-05, + "loss": 5.207, + "step": 10050 + }, + { + "epoch": 0.5558319513543394, + "grad_norm": 3.3215198516845703, + "learning_rate": 2.283968163729392e-05, + "loss": 5.0969, + "step": 10055 + }, + { + "epoch": 0.5561083471531233, + "grad_norm": 3.600184440612793, + "learning_rate": 2.282546901648664e-05, + "loss": 4.8922, + "step": 10060 + }, + { + "epoch": 0.5563847429519071, + "grad_norm": 3.041132926940918, + "learning_rate": 2.2811256395679365e-05, + "loss": 5.2081, + "step": 10065 + }, + { + "epoch": 0.556661138750691, + "grad_norm": 3.7934274673461914, + "learning_rate": 2.2797043774872086e-05, + "loss": 5.1229, + "step": 10070 + }, + { + "epoch": 0.5569375345494748, + "grad_norm": 2.82381010055542, + "learning_rate": 2.278283115406481e-05, + "loss": 5.2129, + "step": 10075 + }, + { + "epoch": 0.5572139303482587, + "grad_norm": 3.184391736984253, + "learning_rate": 2.2768618533257534e-05, + "loss": 5.199, + "step": 10080 + }, + { + "epoch": 0.5574903261470425, + "grad_norm": 2.923619031906128, + "learning_rate": 2.2754405912450258e-05, + "loss": 5.2113, + "step": 10085 + }, + { + "epoch": 0.5577667219458264, + "grad_norm": 3.289689540863037, + "learning_rate": 2.2740193291642982e-05, + "loss": 5.3625, + "step": 10090 + }, + { + "epoch": 0.5580431177446102, + "grad_norm": 3.128100633621216, + "learning_rate": 2.2725980670835702e-05, + "loss": 4.9404, + "step": 10095 + }, + { + "epoch": 0.5583195135433942, + "grad_norm": 3.116595983505249, + "learning_rate": 2.2711768050028426e-05, + "loss": 4.6508, + "step": 10100 + }, + { + "epoch": 0.558595909342178, + "grad_norm": 3.509528636932373, + "learning_rate": 2.2697555429221147e-05, + "loss": 5.3815, + "step": 10105 + }, + { + "epoch": 0.5588723051409619, + "grad_norm": 2.830974578857422, + "learning_rate": 2.2683342808413874e-05, + "loss": 5.0021, + "step": 10110 + }, + { + "epoch": 0.5591487009397457, + "grad_norm": 3.5547916889190674, + "learning_rate": 2.2669130187606595e-05, + "loss": 5.1721, + "step": 10115 + }, + { + "epoch": 0.5594250967385296, + "grad_norm": 3.2966182231903076, + "learning_rate": 2.265491756679932e-05, + "loss": 5.1488, + "step": 10120 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 3.2352070808410645, + "learning_rate": 2.2640704945992043e-05, + "loss": 4.8284, + "step": 10125 + }, + { + "epoch": 0.5599778883360973, + "grad_norm": 3.2427544593811035, + "learning_rate": 2.2626492325184763e-05, + "loss": 5.1183, + "step": 10130 + }, + { + "epoch": 0.5602542841348811, + "grad_norm": 2.5661115646362305, + "learning_rate": 2.2612279704377487e-05, + "loss": 5.1111, + "step": 10135 + }, + { + "epoch": 0.560530679933665, + "grad_norm": 3.292429208755493, + "learning_rate": 2.259806708357021e-05, + "loss": 5.2935, + "step": 10140 + }, + { + "epoch": 0.5608070757324488, + "grad_norm": 3.2697348594665527, + "learning_rate": 2.2583854462762935e-05, + "loss": 5.3207, + "step": 10145 + }, + { + "epoch": 0.5610834715312327, + "grad_norm": 2.9878225326538086, + "learning_rate": 2.256964184195566e-05, + "loss": 4.8464, + "step": 10150 + }, + { + "epoch": 0.5613598673300166, + "grad_norm": 3.4964377880096436, + "learning_rate": 2.255542922114838e-05, + "loss": 5.2375, + "step": 10155 + }, + { + "epoch": 0.5616362631288004, + "grad_norm": 3.2263262271881104, + "learning_rate": 2.2541216600341104e-05, + "loss": 5.1451, + "step": 10160 + }, + { + "epoch": 0.5619126589275844, + "grad_norm": 2.8740015029907227, + "learning_rate": 2.2527003979533828e-05, + "loss": 4.9799, + "step": 10165 + }, + { + "epoch": 0.5621890547263682, + "grad_norm": 3.3548688888549805, + "learning_rate": 2.2512791358726552e-05, + "loss": 5.0609, + "step": 10170 + }, + { + "epoch": 0.5624654505251521, + "grad_norm": 3.348917245864868, + "learning_rate": 2.2498578737919272e-05, + "loss": 5.0375, + "step": 10175 + }, + { + "epoch": 0.5627418463239359, + "grad_norm": 2.9906487464904785, + "learning_rate": 2.2484366117111996e-05, + "loss": 5.109, + "step": 10180 + }, + { + "epoch": 0.5630182421227198, + "grad_norm": 3.7559046745300293, + "learning_rate": 2.247015349630472e-05, + "loss": 4.9875, + "step": 10185 + }, + { + "epoch": 0.5632946379215036, + "grad_norm": 4.169631004333496, + "learning_rate": 2.245594087549744e-05, + "loss": 5.0137, + "step": 10190 + }, + { + "epoch": 0.5635710337202875, + "grad_norm": 3.692657709121704, + "learning_rate": 2.244172825469017e-05, + "loss": 5.1208, + "step": 10195 + }, + { + "epoch": 0.5638474295190713, + "grad_norm": 4.431186199188232, + "learning_rate": 2.242751563388289e-05, + "loss": 5.0208, + "step": 10200 + }, + { + "epoch": 0.5641238253178552, + "grad_norm": 3.045299530029297, + "learning_rate": 2.2413303013075613e-05, + "loss": 4.8395, + "step": 10205 + }, + { + "epoch": 0.564400221116639, + "grad_norm": 2.9593913555145264, + "learning_rate": 2.2399090392268334e-05, + "loss": 5.0236, + "step": 10210 + }, + { + "epoch": 0.5646766169154229, + "grad_norm": 3.3709757328033447, + "learning_rate": 2.2384877771461058e-05, + "loss": 5.3689, + "step": 10215 + }, + { + "epoch": 0.5649530127142067, + "grad_norm": 3.7486398220062256, + "learning_rate": 2.237066515065378e-05, + "loss": 5.1189, + "step": 10220 + }, + { + "epoch": 0.5652294085129906, + "grad_norm": 2.766789674758911, + "learning_rate": 2.2356452529846505e-05, + "loss": 5.1984, + "step": 10225 + }, + { + "epoch": 0.5655058043117744, + "grad_norm": 3.7790632247924805, + "learning_rate": 2.234223990903923e-05, + "loss": 5.0615, + "step": 10230 + }, + { + "epoch": 0.5657822001105584, + "grad_norm": 3.1158392429351807, + "learning_rate": 2.232802728823195e-05, + "loss": 5.3859, + "step": 10235 + }, + { + "epoch": 0.5660585959093422, + "grad_norm": 3.7767012119293213, + "learning_rate": 2.2313814667424674e-05, + "loss": 5.1952, + "step": 10240 + }, + { + "epoch": 0.5663349917081261, + "grad_norm": 3.6504740715026855, + "learning_rate": 2.2299602046617395e-05, + "loss": 4.9556, + "step": 10245 + }, + { + "epoch": 0.5666113875069099, + "grad_norm": 3.328760862350464, + "learning_rate": 2.2285389425810122e-05, + "loss": 5.1943, + "step": 10250 + }, + { + "epoch": 0.5668877833056938, + "grad_norm": 2.9243085384368896, + "learning_rate": 2.2271176805002843e-05, + "loss": 4.821, + "step": 10255 + }, + { + "epoch": 0.5671641791044776, + "grad_norm": 3.004983425140381, + "learning_rate": 2.2256964184195567e-05, + "loss": 5.1676, + "step": 10260 + }, + { + "epoch": 0.5674405749032615, + "grad_norm": 3.309654474258423, + "learning_rate": 2.224275156338829e-05, + "loss": 4.9507, + "step": 10265 + }, + { + "epoch": 0.5677169707020453, + "grad_norm": 2.664963960647583, + "learning_rate": 2.222853894258101e-05, + "loss": 4.8282, + "step": 10270 + }, + { + "epoch": 0.5679933665008292, + "grad_norm": 3.0774271488189697, + "learning_rate": 2.221432632177374e-05, + "loss": 4.9935, + "step": 10275 + }, + { + "epoch": 0.568269762299613, + "grad_norm": 3.3895280361175537, + "learning_rate": 2.220011370096646e-05, + "loss": 5.2864, + "step": 10280 + }, + { + "epoch": 0.5685461580983969, + "grad_norm": 4.060614109039307, + "learning_rate": 2.2185901080159183e-05, + "loss": 5.4247, + "step": 10285 + }, + { + "epoch": 0.5688225538971807, + "grad_norm": 4.643860816955566, + "learning_rate": 2.2171688459351904e-05, + "loss": 5.0794, + "step": 10290 + }, + { + "epoch": 0.5690989496959646, + "grad_norm": 3.1240062713623047, + "learning_rate": 2.2157475838544628e-05, + "loss": 5.1919, + "step": 10295 + }, + { + "epoch": 0.5693753454947484, + "grad_norm": 2.9779624938964844, + "learning_rate": 2.214326321773735e-05, + "loss": 5.066, + "step": 10300 + }, + { + "epoch": 0.5696517412935324, + "grad_norm": 3.8186936378479004, + "learning_rate": 2.2129050596930076e-05, + "loss": 5.2885, + "step": 10305 + }, + { + "epoch": 0.5699281370923162, + "grad_norm": 3.409640073776245, + "learning_rate": 2.21148379761228e-05, + "loss": 4.9745, + "step": 10310 + }, + { + "epoch": 0.5702045328911001, + "grad_norm": 4.826499938964844, + "learning_rate": 2.210062535531552e-05, + "loss": 5.1617, + "step": 10315 + }, + { + "epoch": 0.5704809286898839, + "grad_norm": 3.2550175189971924, + "learning_rate": 2.2086412734508244e-05, + "loss": 5.0947, + "step": 10320 + }, + { + "epoch": 0.5707573244886678, + "grad_norm": 3.287470579147339, + "learning_rate": 2.2072200113700968e-05, + "loss": 4.8948, + "step": 10325 + }, + { + "epoch": 0.5710337202874516, + "grad_norm": 2.8862740993499756, + "learning_rate": 2.2057987492893692e-05, + "loss": 4.8842, + "step": 10330 + }, + { + "epoch": 0.5713101160862355, + "grad_norm": 3.2922773361206055, + "learning_rate": 2.2043774872086416e-05, + "loss": 5.13, + "step": 10335 + }, + { + "epoch": 0.5715865118850193, + "grad_norm": 3.6037707328796387, + "learning_rate": 2.2029562251279137e-05, + "loss": 4.9092, + "step": 10340 + }, + { + "epoch": 0.5718629076838032, + "grad_norm": 3.365394115447998, + "learning_rate": 2.201534963047186e-05, + "loss": 5.3031, + "step": 10345 + }, + { + "epoch": 0.572139303482587, + "grad_norm": 4.060079574584961, + "learning_rate": 2.200113700966458e-05, + "loss": 5.4657, + "step": 10350 + }, + { + "epoch": 0.5724156992813709, + "grad_norm": 3.4800689220428467, + "learning_rate": 2.1986924388857305e-05, + "loss": 5.0092, + "step": 10355 + }, + { + "epoch": 0.5726920950801547, + "grad_norm": 3.4629600048065186, + "learning_rate": 2.197271176805003e-05, + "loss": 5.0127, + "step": 10360 + }, + { + "epoch": 0.5729684908789386, + "grad_norm": 3.4687135219573975, + "learning_rate": 2.1958499147242753e-05, + "loss": 5.2457, + "step": 10365 + }, + { + "epoch": 0.5732448866777226, + "grad_norm": 3.8498566150665283, + "learning_rate": 2.1944286526435477e-05, + "loss": 5.4415, + "step": 10370 + }, + { + "epoch": 0.5735212824765064, + "grad_norm": 3.0555355548858643, + "learning_rate": 2.1930073905628198e-05, + "loss": 4.9743, + "step": 10375 + }, + { + "epoch": 0.5737976782752903, + "grad_norm": 3.5073928833007812, + "learning_rate": 2.1915861284820922e-05, + "loss": 5.0117, + "step": 10380 + }, + { + "epoch": 0.5740740740740741, + "grad_norm": 3.9393508434295654, + "learning_rate": 2.1901648664013646e-05, + "loss": 5.0825, + "step": 10385 + }, + { + "epoch": 0.574350469872858, + "grad_norm": 3.139087438583374, + "learning_rate": 2.188743604320637e-05, + "loss": 4.9374, + "step": 10390 + }, + { + "epoch": 0.5746268656716418, + "grad_norm": 3.968113660812378, + "learning_rate": 2.187322342239909e-05, + "loss": 5.1305, + "step": 10395 + }, + { + "epoch": 0.5749032614704257, + "grad_norm": 3.1973788738250732, + "learning_rate": 2.1859010801591814e-05, + "loss": 4.7387, + "step": 10400 + }, + { + "epoch": 0.5751796572692095, + "grad_norm": 3.473482847213745, + "learning_rate": 2.1844798180784538e-05, + "loss": 4.8958, + "step": 10405 + }, + { + "epoch": 0.5754560530679934, + "grad_norm": 3.418353796005249, + "learning_rate": 2.183058555997726e-05, + "loss": 4.8574, + "step": 10410 + }, + { + "epoch": 0.5757324488667772, + "grad_norm": 2.979325532913208, + "learning_rate": 2.1816372939169986e-05, + "loss": 5.1611, + "step": 10415 + }, + { + "epoch": 0.5760088446655611, + "grad_norm": 3.108954668045044, + "learning_rate": 2.1802160318362707e-05, + "loss": 5.1672, + "step": 10420 + }, + { + "epoch": 0.5762852404643449, + "grad_norm": 2.963421106338501, + "learning_rate": 2.178794769755543e-05, + "loss": 4.4638, + "step": 10425 + }, + { + "epoch": 0.5765616362631288, + "grad_norm": 4.1467461585998535, + "learning_rate": 2.177373507674815e-05, + "loss": 4.8556, + "step": 10430 + }, + { + "epoch": 0.5768380320619126, + "grad_norm": 4.003355503082275, + "learning_rate": 2.1759522455940875e-05, + "loss": 5.0152, + "step": 10435 + }, + { + "epoch": 0.5771144278606966, + "grad_norm": 3.502562999725342, + "learning_rate": 2.17453098351336e-05, + "loss": 5.0242, + "step": 10440 + }, + { + "epoch": 0.5773908236594804, + "grad_norm": 3.727094888687134, + "learning_rate": 2.1731097214326323e-05, + "loss": 5.2192, + "step": 10445 + }, + { + "epoch": 0.5776672194582643, + "grad_norm": 3.4509377479553223, + "learning_rate": 2.1716884593519047e-05, + "loss": 5.3475, + "step": 10450 + }, + { + "epoch": 0.5779436152570481, + "grad_norm": 3.1824676990509033, + "learning_rate": 2.1702671972711768e-05, + "loss": 4.8505, + "step": 10455 + }, + { + "epoch": 0.578220011055832, + "grad_norm": 4.357178211212158, + "learning_rate": 2.1688459351904492e-05, + "loss": 5.2004, + "step": 10460 + }, + { + "epoch": 0.5784964068546158, + "grad_norm": 4.844539642333984, + "learning_rate": 2.1674246731097216e-05, + "loss": 5.1357, + "step": 10465 + }, + { + "epoch": 0.5787728026533997, + "grad_norm": 3.694406509399414, + "learning_rate": 2.166003411028994e-05, + "loss": 4.9931, + "step": 10470 + }, + { + "epoch": 0.5790491984521835, + "grad_norm": 3.6863090991973877, + "learning_rate": 2.1645821489482664e-05, + "loss": 5.4604, + "step": 10475 + }, + { + "epoch": 0.5793255942509674, + "grad_norm": 3.620305299758911, + "learning_rate": 2.1631608868675384e-05, + "loss": 4.958, + "step": 10480 + }, + { + "epoch": 0.5796019900497512, + "grad_norm": 3.349443197250366, + "learning_rate": 2.161739624786811e-05, + "loss": 5.0294, + "step": 10485 + }, + { + "epoch": 0.5798783858485351, + "grad_norm": 3.268993854522705, + "learning_rate": 2.160318362706083e-05, + "loss": 5.0303, + "step": 10490 + }, + { + "epoch": 0.5801547816473189, + "grad_norm": 3.0426461696624756, + "learning_rate": 2.1588971006253556e-05, + "loss": 5.0315, + "step": 10495 + }, + { + "epoch": 0.5804311774461028, + "grad_norm": 3.8381333351135254, + "learning_rate": 2.1574758385446277e-05, + "loss": 5.1518, + "step": 10500 + }, + { + "epoch": 0.5807075732448866, + "grad_norm": 4.0485687255859375, + "learning_rate": 2.1560545764639e-05, + "loss": 5.1779, + "step": 10505 + }, + { + "epoch": 0.5809839690436706, + "grad_norm": 3.4191527366638184, + "learning_rate": 2.1546333143831725e-05, + "loss": 5.2983, + "step": 10510 + }, + { + "epoch": 0.5812603648424544, + "grad_norm": 3.35133695602417, + "learning_rate": 2.1532120523024445e-05, + "loss": 5.1839, + "step": 10515 + }, + { + "epoch": 0.5815367606412383, + "grad_norm": 2.7311925888061523, + "learning_rate": 2.151790790221717e-05, + "loss": 4.8678, + "step": 10520 + }, + { + "epoch": 0.5818131564400221, + "grad_norm": 4.017518043518066, + "learning_rate": 2.1503695281409893e-05, + "loss": 4.8128, + "step": 10525 + }, + { + "epoch": 0.582089552238806, + "grad_norm": 3.112291097640991, + "learning_rate": 2.1489482660602617e-05, + "loss": 5.3122, + "step": 10530 + }, + { + "epoch": 0.5823659480375898, + "grad_norm": 3.437387704849243, + "learning_rate": 2.1475270039795338e-05, + "loss": 4.9359, + "step": 10535 + }, + { + "epoch": 0.5826423438363737, + "grad_norm": 3.4737443923950195, + "learning_rate": 2.1461057418988062e-05, + "loss": 5.0828, + "step": 10540 + }, + { + "epoch": 0.5829187396351575, + "grad_norm": 4.825445175170898, + "learning_rate": 2.1446844798180786e-05, + "loss": 5.0617, + "step": 10545 + }, + { + "epoch": 0.5831951354339414, + "grad_norm": 2.9588332176208496, + "learning_rate": 2.143263217737351e-05, + "loss": 5.124, + "step": 10550 + }, + { + "epoch": 0.5834715312327252, + "grad_norm": 3.551666736602783, + "learning_rate": 2.1418419556566234e-05, + "loss": 5.1687, + "step": 10555 + }, + { + "epoch": 0.5837479270315091, + "grad_norm": 3.1426239013671875, + "learning_rate": 2.1404206935758954e-05, + "loss": 4.7921, + "step": 10560 + }, + { + "epoch": 0.5840243228302929, + "grad_norm": 3.088836193084717, + "learning_rate": 2.138999431495168e-05, + "loss": 5.3397, + "step": 10565 + }, + { + "epoch": 0.5843007186290768, + "grad_norm": 3.6306614875793457, + "learning_rate": 2.13757816941444e-05, + "loss": 5.3874, + "step": 10570 + }, + { + "epoch": 0.5845771144278606, + "grad_norm": 4.261897563934326, + "learning_rate": 2.1361569073337123e-05, + "loss": 5.0643, + "step": 10575 + }, + { + "epoch": 0.5848535102266446, + "grad_norm": 3.7982075214385986, + "learning_rate": 2.1347356452529847e-05, + "loss": 4.9988, + "step": 10580 + }, + { + "epoch": 0.5851299060254284, + "grad_norm": 3.575862407684326, + "learning_rate": 2.133314383172257e-05, + "loss": 5.3502, + "step": 10585 + }, + { + "epoch": 0.5854063018242123, + "grad_norm": 3.2091081142425537, + "learning_rate": 2.1318931210915295e-05, + "loss": 4.929, + "step": 10590 + }, + { + "epoch": 0.5856826976229962, + "grad_norm": 3.6131644248962402, + "learning_rate": 2.1304718590108016e-05, + "loss": 4.9555, + "step": 10595 + }, + { + "epoch": 0.58595909342178, + "grad_norm": 3.6742172241210938, + "learning_rate": 2.129050596930074e-05, + "loss": 5.0453, + "step": 10600 + }, + { + "epoch": 0.5862354892205639, + "grad_norm": 3.404371738433838, + "learning_rate": 2.1276293348493464e-05, + "loss": 4.9043, + "step": 10605 + }, + { + "epoch": 0.5865118850193477, + "grad_norm": 3.1416611671447754, + "learning_rate": 2.1262080727686187e-05, + "loss": 5.0373, + "step": 10610 + }, + { + "epoch": 0.5867882808181316, + "grad_norm": 3.7025134563446045, + "learning_rate": 2.124786810687891e-05, + "loss": 4.9599, + "step": 10615 + }, + { + "epoch": 0.5870646766169154, + "grad_norm": 3.171035051345825, + "learning_rate": 2.1233655486071632e-05, + "loss": 5.4027, + "step": 10620 + }, + { + "epoch": 0.5873410724156993, + "grad_norm": 3.402039051055908, + "learning_rate": 2.1219442865264356e-05, + "loss": 5.0835, + "step": 10625 + }, + { + "epoch": 0.5876174682144831, + "grad_norm": 3.2764732837677, + "learning_rate": 2.1205230244457077e-05, + "loss": 4.7431, + "step": 10630 + }, + { + "epoch": 0.587893864013267, + "grad_norm": 3.389270305633545, + "learning_rate": 2.1191017623649804e-05, + "loss": 5.0233, + "step": 10635 + }, + { + "epoch": 0.5881702598120508, + "grad_norm": 3.1218948364257812, + "learning_rate": 2.1176805002842525e-05, + "loss": 5.0945, + "step": 10640 + }, + { + "epoch": 0.5884466556108348, + "grad_norm": 3.973499298095703, + "learning_rate": 2.116259238203525e-05, + "loss": 4.7494, + "step": 10645 + }, + { + "epoch": 0.5887230514096186, + "grad_norm": 3.017624616622925, + "learning_rate": 2.1148379761227973e-05, + "loss": 4.8494, + "step": 10650 + }, + { + "epoch": 0.5889994472084025, + "grad_norm": 4.036647796630859, + "learning_rate": 2.1134167140420693e-05, + "loss": 5.053, + "step": 10655 + }, + { + "epoch": 0.5892758430071863, + "grad_norm": 3.0895462036132812, + "learning_rate": 2.111995451961342e-05, + "loss": 4.7882, + "step": 10660 + }, + { + "epoch": 0.5895522388059702, + "grad_norm": 3.1549055576324463, + "learning_rate": 2.110574189880614e-05, + "loss": 4.8596, + "step": 10665 + }, + { + "epoch": 0.589828634604754, + "grad_norm": 3.222073793411255, + "learning_rate": 2.1091529277998865e-05, + "loss": 5.1663, + "step": 10670 + }, + { + "epoch": 0.5901050304035379, + "grad_norm": 3.191425323486328, + "learning_rate": 2.1077316657191586e-05, + "loss": 5.1247, + "step": 10675 + }, + { + "epoch": 0.5903814262023217, + "grad_norm": 3.3451499938964844, + "learning_rate": 2.106310403638431e-05, + "loss": 5.0468, + "step": 10680 + }, + { + "epoch": 0.5906578220011056, + "grad_norm": 3.7087557315826416, + "learning_rate": 2.1048891415577034e-05, + "loss": 5.2902, + "step": 10685 + }, + { + "epoch": 0.5909342177998894, + "grad_norm": 3.4653618335723877, + "learning_rate": 2.1034678794769758e-05, + "loss": 4.8757, + "step": 10690 + }, + { + "epoch": 0.5912106135986733, + "grad_norm": 3.368715763092041, + "learning_rate": 2.102046617396248e-05, + "loss": 5.0481, + "step": 10695 + }, + { + "epoch": 0.5914870093974571, + "grad_norm": 3.7154407501220703, + "learning_rate": 2.1006253553155202e-05, + "loss": 4.8707, + "step": 10700 + }, + { + "epoch": 0.591763405196241, + "grad_norm": 2.9506330490112305, + "learning_rate": 2.0992040932347926e-05, + "loss": 4.9962, + "step": 10705 + }, + { + "epoch": 0.5920398009950248, + "grad_norm": 2.9487926959991455, + "learning_rate": 2.0977828311540647e-05, + "loss": 4.864, + "step": 10710 + }, + { + "epoch": 0.5923161967938088, + "grad_norm": 2.97090482711792, + "learning_rate": 2.0963615690733374e-05, + "loss": 4.7697, + "step": 10715 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 3.8541948795318604, + "learning_rate": 2.0949403069926095e-05, + "loss": 5.1119, + "step": 10720 + }, + { + "epoch": 0.5928689883913765, + "grad_norm": 3.0364882946014404, + "learning_rate": 2.093519044911882e-05, + "loss": 5.2197, + "step": 10725 + }, + { + "epoch": 0.5931453841901603, + "grad_norm": 3.167233943939209, + "learning_rate": 2.0920977828311543e-05, + "loss": 5.1334, + "step": 10730 + }, + { + "epoch": 0.5934217799889442, + "grad_norm": 2.8649866580963135, + "learning_rate": 2.0906765207504263e-05, + "loss": 4.886, + "step": 10735 + }, + { + "epoch": 0.593698175787728, + "grad_norm": 4.133547782897949, + "learning_rate": 2.0892552586696987e-05, + "loss": 5.1874, + "step": 10740 + }, + { + "epoch": 0.5939745715865119, + "grad_norm": 3.462965488433838, + "learning_rate": 2.087833996588971e-05, + "loss": 5.0722, + "step": 10745 + }, + { + "epoch": 0.5942509673852957, + "grad_norm": 3.6210412979125977, + "learning_rate": 2.0864127345082435e-05, + "loss": 4.7343, + "step": 10750 + }, + { + "epoch": 0.5945273631840796, + "grad_norm": 3.1893310546875, + "learning_rate": 2.0849914724275156e-05, + "loss": 4.9851, + "step": 10755 + }, + { + "epoch": 0.5948037589828634, + "grad_norm": 2.8404297828674316, + "learning_rate": 2.083570210346788e-05, + "loss": 5.1774, + "step": 10760 + }, + { + "epoch": 0.5950801547816473, + "grad_norm": 3.4504430294036865, + "learning_rate": 2.0821489482660604e-05, + "loss": 5.1506, + "step": 10765 + }, + { + "epoch": 0.5953565505804311, + "grad_norm": 3.756342649459839, + "learning_rate": 2.0807276861853324e-05, + "loss": 5.1785, + "step": 10770 + }, + { + "epoch": 0.595632946379215, + "grad_norm": 2.698963165283203, + "learning_rate": 2.0793064241046052e-05, + "loss": 5.1673, + "step": 10775 + }, + { + "epoch": 0.5959093421779988, + "grad_norm": 3.2271690368652344, + "learning_rate": 2.0778851620238772e-05, + "loss": 5.037, + "step": 10780 + }, + { + "epoch": 0.5961857379767828, + "grad_norm": 3.4815831184387207, + "learning_rate": 2.0764638999431496e-05, + "loss": 4.9544, + "step": 10785 + }, + { + "epoch": 0.5964621337755666, + "grad_norm": 3.6066627502441406, + "learning_rate": 2.075042637862422e-05, + "loss": 5.0425, + "step": 10790 + }, + { + "epoch": 0.5967385295743505, + "grad_norm": 3.649170160293579, + "learning_rate": 2.073621375781694e-05, + "loss": 5.303, + "step": 10795 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 3.320713996887207, + "learning_rate": 2.0722001137009668e-05, + "loss": 4.8067, + "step": 10800 + }, + { + "epoch": 0.5972913211719182, + "grad_norm": 2.9411003589630127, + "learning_rate": 2.070778851620239e-05, + "loss": 5.1317, + "step": 10805 + }, + { + "epoch": 0.5975677169707021, + "grad_norm": 3.370591878890991, + "learning_rate": 2.0693575895395113e-05, + "loss": 5.191, + "step": 10810 + }, + { + "epoch": 0.5978441127694859, + "grad_norm": 3.282804250717163, + "learning_rate": 2.0679363274587833e-05, + "loss": 5.2755, + "step": 10815 + }, + { + "epoch": 0.5981205085682698, + "grad_norm": 3.2764978408813477, + "learning_rate": 2.0665150653780557e-05, + "loss": 5.0115, + "step": 10820 + }, + { + "epoch": 0.5983969043670536, + "grad_norm": 3.380523920059204, + "learning_rate": 2.065093803297328e-05, + "loss": 4.9544, + "step": 10825 + }, + { + "epoch": 0.5986733001658375, + "grad_norm": 3.9343714714050293, + "learning_rate": 2.0636725412166005e-05, + "loss": 4.836, + "step": 10830 + }, + { + "epoch": 0.5989496959646213, + "grad_norm": 2.9544639587402344, + "learning_rate": 2.062251279135873e-05, + "loss": 4.939, + "step": 10835 + }, + { + "epoch": 0.5992260917634052, + "grad_norm": 3.6150293350219727, + "learning_rate": 2.060830017055145e-05, + "loss": 5.26, + "step": 10840 + }, + { + "epoch": 0.599502487562189, + "grad_norm": 3.480836868286133, + "learning_rate": 2.0594087549744174e-05, + "loss": 5.2313, + "step": 10845 + }, + { + "epoch": 0.599778883360973, + "grad_norm": 3.0056374073028564, + "learning_rate": 2.0579874928936894e-05, + "loss": 4.9642, + "step": 10850 + }, + { + "epoch": 0.6000552791597568, + "grad_norm": 3.1670284271240234, + "learning_rate": 2.0565662308129622e-05, + "loss": 5.4859, + "step": 10855 + }, + { + "epoch": 0.6003316749585407, + "grad_norm": 3.991672992706299, + "learning_rate": 2.0551449687322342e-05, + "loss": 5.167, + "step": 10860 + }, + { + "epoch": 0.6006080707573245, + "grad_norm": 3.690068244934082, + "learning_rate": 2.0537237066515066e-05, + "loss": 5.0009, + "step": 10865 + }, + { + "epoch": 0.6008844665561084, + "grad_norm": 3.5538177490234375, + "learning_rate": 2.052302444570779e-05, + "loss": 4.9969, + "step": 10870 + }, + { + "epoch": 0.6011608623548922, + "grad_norm": 3.6092166900634766, + "learning_rate": 2.050881182490051e-05, + "loss": 5.2605, + "step": 10875 + }, + { + "epoch": 0.6014372581536761, + "grad_norm": 3.576063871383667, + "learning_rate": 2.0494599204093235e-05, + "loss": 5.2605, + "step": 10880 + }, + { + "epoch": 0.6017136539524599, + "grad_norm": 3.104238271713257, + "learning_rate": 2.048038658328596e-05, + "loss": 4.956, + "step": 10885 + }, + { + "epoch": 0.6019900497512438, + "grad_norm": 2.788604974746704, + "learning_rate": 2.0466173962478683e-05, + "loss": 5.0503, + "step": 10890 + }, + { + "epoch": 0.6022664455500276, + "grad_norm": 3.3958215713500977, + "learning_rate": 2.0451961341671403e-05, + "loss": 5.4058, + "step": 10895 + }, + { + "epoch": 0.6025428413488115, + "grad_norm": 3.638075113296509, + "learning_rate": 2.0437748720864127e-05, + "loss": 5.0203, + "step": 10900 + }, + { + "epoch": 0.6028192371475953, + "grad_norm": 3.6704213619232178, + "learning_rate": 2.042353610005685e-05, + "loss": 5.0472, + "step": 10905 + }, + { + "epoch": 0.6030956329463792, + "grad_norm": 4.042377471923828, + "learning_rate": 2.0409323479249575e-05, + "loss": 4.8711, + "step": 10910 + }, + { + "epoch": 0.603372028745163, + "grad_norm": 3.991793632507324, + "learning_rate": 2.03951108584423e-05, + "loss": 5.3127, + "step": 10915 + }, + { + "epoch": 0.603648424543947, + "grad_norm": 3.0052876472473145, + "learning_rate": 2.038089823763502e-05, + "loss": 5.3358, + "step": 10920 + }, + { + "epoch": 0.6039248203427308, + "grad_norm": 3.848688840866089, + "learning_rate": 2.0366685616827744e-05, + "loss": 5.1195, + "step": 10925 + }, + { + "epoch": 0.6042012161415147, + "grad_norm": 3.1843132972717285, + "learning_rate": 2.0352472996020468e-05, + "loss": 5.0706, + "step": 10930 + }, + { + "epoch": 0.6044776119402985, + "grad_norm": 3.1511127948760986, + "learning_rate": 2.033826037521319e-05, + "loss": 5.0114, + "step": 10935 + }, + { + "epoch": 0.6047540077390824, + "grad_norm": 3.199019432067871, + "learning_rate": 2.0324047754405916e-05, + "loss": 4.9726, + "step": 10940 + }, + { + "epoch": 0.6050304035378662, + "grad_norm": 3.4878625869750977, + "learning_rate": 2.0309835133598637e-05, + "loss": 5.249, + "step": 10945 + }, + { + "epoch": 0.6053067993366501, + "grad_norm": 2.9809420108795166, + "learning_rate": 2.029562251279136e-05, + "loss": 5.2818, + "step": 10950 + }, + { + "epoch": 0.6055831951354339, + "grad_norm": 3.647505521774292, + "learning_rate": 2.028140989198408e-05, + "loss": 5.1207, + "step": 10955 + }, + { + "epoch": 0.6058595909342178, + "grad_norm": 3.185030698776245, + "learning_rate": 2.0267197271176805e-05, + "loss": 5.0107, + "step": 10960 + }, + { + "epoch": 0.6061359867330016, + "grad_norm": 4.0943803787231445, + "learning_rate": 2.025298465036953e-05, + "loss": 5.0815, + "step": 10965 + }, + { + "epoch": 0.6064123825317855, + "grad_norm": 2.887913703918457, + "learning_rate": 2.0238772029562253e-05, + "loss": 4.8405, + "step": 10970 + }, + { + "epoch": 0.6066887783305693, + "grad_norm": 3.262972831726074, + "learning_rate": 2.0224559408754977e-05, + "loss": 4.8914, + "step": 10975 + }, + { + "epoch": 0.6069651741293532, + "grad_norm": 4.215073585510254, + "learning_rate": 2.0210346787947698e-05, + "loss": 4.9772, + "step": 10980 + }, + { + "epoch": 0.607241569928137, + "grad_norm": 3.0118231773376465, + "learning_rate": 2.019613416714042e-05, + "loss": 4.9951, + "step": 10985 + }, + { + "epoch": 0.607517965726921, + "grad_norm": 4.327718257904053, + "learning_rate": 2.0181921546333142e-05, + "loss": 5.2855, + "step": 10990 + }, + { + "epoch": 0.6077943615257048, + "grad_norm": 3.5684874057769775, + "learning_rate": 2.016770892552587e-05, + "loss": 5.302, + "step": 10995 + }, + { + "epoch": 0.6080707573244887, + "grad_norm": 3.2276368141174316, + "learning_rate": 2.015349630471859e-05, + "loss": 5.1272, + "step": 11000 + }, + { + "epoch": 0.6083471531232725, + "grad_norm": 3.3742012977600098, + "learning_rate": 2.0139283683911314e-05, + "loss": 5.1438, + "step": 11005 + }, + { + "epoch": 0.6086235489220564, + "grad_norm": 3.5114636421203613, + "learning_rate": 2.0125071063104038e-05, + "loss": 4.9084, + "step": 11010 + }, + { + "epoch": 0.6088999447208402, + "grad_norm": 3.3797311782836914, + "learning_rate": 2.011085844229676e-05, + "loss": 4.9402, + "step": 11015 + }, + { + "epoch": 0.6091763405196241, + "grad_norm": 3.398404121398926, + "learning_rate": 2.0096645821489486e-05, + "loss": 4.8323, + "step": 11020 + }, + { + "epoch": 0.6094527363184079, + "grad_norm": 3.7108492851257324, + "learning_rate": 2.0082433200682207e-05, + "loss": 4.7783, + "step": 11025 + }, + { + "epoch": 0.6097291321171918, + "grad_norm": 3.121354341506958, + "learning_rate": 2.006822057987493e-05, + "loss": 5.045, + "step": 11030 + }, + { + "epoch": 0.6100055279159757, + "grad_norm": 4.430671691894531, + "learning_rate": 2.005400795906765e-05, + "loss": 5.0018, + "step": 11035 + }, + { + "epoch": 0.6102819237147595, + "grad_norm": 3.9316344261169434, + "learning_rate": 2.0039795338260375e-05, + "loss": 5.2804, + "step": 11040 + }, + { + "epoch": 0.6105583195135434, + "grad_norm": 3.610534906387329, + "learning_rate": 2.00255827174531e-05, + "loss": 5.0137, + "step": 11045 + }, + { + "epoch": 0.6108347153123272, + "grad_norm": 3.248753786087036, + "learning_rate": 2.0011370096645823e-05, + "loss": 4.9726, + "step": 11050 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 4.037750244140625, + "learning_rate": 1.9997157475838547e-05, + "loss": 5.3961, + "step": 11055 + }, + { + "epoch": 0.611387506909895, + "grad_norm": 3.1049892902374268, + "learning_rate": 1.9982944855031268e-05, + "loss": 5.026, + "step": 11060 + }, + { + "epoch": 0.6116639027086789, + "grad_norm": 3.5560545921325684, + "learning_rate": 1.996873223422399e-05, + "loss": 5.3866, + "step": 11065 + }, + { + "epoch": 0.6119402985074627, + "grad_norm": 3.1912312507629395, + "learning_rate": 1.9954519613416712e-05, + "loss": 5.1017, + "step": 11070 + }, + { + "epoch": 0.6122166943062466, + "grad_norm": 3.402047634124756, + "learning_rate": 1.994030699260944e-05, + "loss": 5.0437, + "step": 11075 + }, + { + "epoch": 0.6124930901050304, + "grad_norm": 4.292905807495117, + "learning_rate": 1.9926094371802164e-05, + "loss": 5.2194, + "step": 11080 + }, + { + "epoch": 0.6127694859038143, + "grad_norm": 3.1720829010009766, + "learning_rate": 1.9911881750994884e-05, + "loss": 4.989, + "step": 11085 + }, + { + "epoch": 0.6130458817025981, + "grad_norm": 3.152600049972534, + "learning_rate": 1.9897669130187608e-05, + "loss": 5.4581, + "step": 11090 + }, + { + "epoch": 0.613322277501382, + "grad_norm": 3.7568509578704834, + "learning_rate": 1.988345650938033e-05, + "loss": 5.183, + "step": 11095 + }, + { + "epoch": 0.6135986733001658, + "grad_norm": 3.13639497756958, + "learning_rate": 1.9869243888573053e-05, + "loss": 5.2017, + "step": 11100 + }, + { + "epoch": 0.6138750690989497, + "grad_norm": 3.1426844596862793, + "learning_rate": 1.9855031267765777e-05, + "loss": 4.9344, + "step": 11105 + }, + { + "epoch": 0.6141514648977335, + "grad_norm": 3.382380723953247, + "learning_rate": 1.98408186469585e-05, + "loss": 5.0299, + "step": 11110 + }, + { + "epoch": 0.6144278606965174, + "grad_norm": 3.4279749393463135, + "learning_rate": 1.9826606026151225e-05, + "loss": 5.0376, + "step": 11115 + }, + { + "epoch": 0.6147042564953012, + "grad_norm": 2.9934186935424805, + "learning_rate": 1.9812393405343945e-05, + "loss": 4.8845, + "step": 11120 + }, + { + "epoch": 0.6149806522940852, + "grad_norm": 3.1613216400146484, + "learning_rate": 1.979818078453667e-05, + "loss": 5.2844, + "step": 11125 + }, + { + "epoch": 0.615257048092869, + "grad_norm": 2.9240121841430664, + "learning_rate": 1.9783968163729393e-05, + "loss": 5.0547, + "step": 11130 + }, + { + "epoch": 0.6155334438916529, + "grad_norm": 3.2536087036132812, + "learning_rate": 1.9769755542922117e-05, + "loss": 4.9044, + "step": 11135 + }, + { + "epoch": 0.6158098396904367, + "grad_norm": 3.3475849628448486, + "learning_rate": 1.9755542922114838e-05, + "loss": 5.041, + "step": 11140 + }, + { + "epoch": 0.6160862354892206, + "grad_norm": 3.322903871536255, + "learning_rate": 1.9741330301307562e-05, + "loss": 4.9652, + "step": 11145 + }, + { + "epoch": 0.6163626312880044, + "grad_norm": 3.3216490745544434, + "learning_rate": 1.9727117680500286e-05, + "loss": 4.9637, + "step": 11150 + }, + { + "epoch": 0.6166390270867883, + "grad_norm": 3.5304057598114014, + "learning_rate": 1.9712905059693006e-05, + "loss": 5.2, + "step": 11155 + }, + { + "epoch": 0.6169154228855721, + "grad_norm": 3.0232765674591064, + "learning_rate": 1.9698692438885734e-05, + "loss": 4.9661, + "step": 11160 + }, + { + "epoch": 0.617191818684356, + "grad_norm": 3.3707964420318604, + "learning_rate": 1.9684479818078454e-05, + "loss": 5.2281, + "step": 11165 + }, + { + "epoch": 0.6174682144831398, + "grad_norm": 3.333315134048462, + "learning_rate": 1.9670267197271178e-05, + "loss": 5.0546, + "step": 11170 + }, + { + "epoch": 0.6177446102819237, + "grad_norm": 3.7123754024505615, + "learning_rate": 1.96560545764639e-05, + "loss": 4.9796, + "step": 11175 + }, + { + "epoch": 0.6180210060807075, + "grad_norm": 3.348001480102539, + "learning_rate": 1.9641841955656623e-05, + "loss": 5.06, + "step": 11180 + }, + { + "epoch": 0.6182974018794914, + "grad_norm": 3.643882989883423, + "learning_rate": 1.9627629334849347e-05, + "loss": 4.9581, + "step": 11185 + }, + { + "epoch": 0.6185737976782753, + "grad_norm": 4.164022922515869, + "learning_rate": 1.961341671404207e-05, + "loss": 5.1829, + "step": 11190 + }, + { + "epoch": 0.6188501934770592, + "grad_norm": 2.8525750637054443, + "learning_rate": 1.9599204093234795e-05, + "loss": 5.1341, + "step": 11195 + }, + { + "epoch": 0.619126589275843, + "grad_norm": 3.13280987739563, + "learning_rate": 1.9584991472427515e-05, + "loss": 4.6817, + "step": 11200 + }, + { + "epoch": 0.6194029850746269, + "grad_norm": 3.3129093647003174, + "learning_rate": 1.957077885162024e-05, + "loss": 4.7727, + "step": 11205 + }, + { + "epoch": 0.6196793808734107, + "grad_norm": 3.110025405883789, + "learning_rate": 1.955656623081296e-05, + "loss": 5.0595, + "step": 11210 + }, + { + "epoch": 0.6199557766721946, + "grad_norm": 3.4862754344940186, + "learning_rate": 1.9542353610005687e-05, + "loss": 4.9615, + "step": 11215 + }, + { + "epoch": 0.6202321724709784, + "grad_norm": 2.9289820194244385, + "learning_rate": 1.9528140989198408e-05, + "loss": 5.0799, + "step": 11220 + }, + { + "epoch": 0.6205085682697623, + "grad_norm": 3.868337631225586, + "learning_rate": 1.9513928368391132e-05, + "loss": 5.0724, + "step": 11225 + }, + { + "epoch": 0.6207849640685461, + "grad_norm": 3.828287124633789, + "learning_rate": 1.9499715747583856e-05, + "loss": 4.9354, + "step": 11230 + }, + { + "epoch": 0.62106135986733, + "grad_norm": 3.591191291809082, + "learning_rate": 1.9485503126776576e-05, + "loss": 5.0352, + "step": 11235 + }, + { + "epoch": 0.6213377556661138, + "grad_norm": 3.2487423419952393, + "learning_rate": 1.9471290505969304e-05, + "loss": 4.6987, + "step": 11240 + }, + { + "epoch": 0.6216141514648977, + "grad_norm": 3.1650702953338623, + "learning_rate": 1.9457077885162024e-05, + "loss": 5.0245, + "step": 11245 + }, + { + "epoch": 0.6218905472636815, + "grad_norm": 3.4682087898254395, + "learning_rate": 1.944286526435475e-05, + "loss": 4.9166, + "step": 11250 + }, + { + "epoch": 0.6221669430624654, + "grad_norm": 3.3193416595458984, + "learning_rate": 1.9428652643547472e-05, + "loss": 5.1592, + "step": 11255 + }, + { + "epoch": 0.6224433388612494, + "grad_norm": 3.075951337814331, + "learning_rate": 1.9414440022740193e-05, + "loss": 5.0866, + "step": 11260 + }, + { + "epoch": 0.6227197346600332, + "grad_norm": 3.391075372695923, + "learning_rate": 1.9400227401932917e-05, + "loss": 5.1489, + "step": 11265 + }, + { + "epoch": 0.6229961304588171, + "grad_norm": 3.094604015350342, + "learning_rate": 1.938601478112564e-05, + "loss": 5.0992, + "step": 11270 + }, + { + "epoch": 0.6232725262576009, + "grad_norm": 3.3130276203155518, + "learning_rate": 1.9371802160318365e-05, + "loss": 4.941, + "step": 11275 + }, + { + "epoch": 0.6235489220563848, + "grad_norm": 4.256862640380859, + "learning_rate": 1.9357589539511086e-05, + "loss": 5.0641, + "step": 11280 + }, + { + "epoch": 0.6238253178551686, + "grad_norm": 3.174084186553955, + "learning_rate": 1.934337691870381e-05, + "loss": 5.2618, + "step": 11285 + }, + { + "epoch": 0.6241017136539525, + "grad_norm": 4.607797145843506, + "learning_rate": 1.9329164297896533e-05, + "loss": 4.6887, + "step": 11290 + }, + { + "epoch": 0.6243781094527363, + "grad_norm": 3.27817964553833, + "learning_rate": 1.9314951677089257e-05, + "loss": 5.25, + "step": 11295 + }, + { + "epoch": 0.6246545052515202, + "grad_norm": 3.901153564453125, + "learning_rate": 1.930073905628198e-05, + "loss": 5.1067, + "step": 11300 + }, + { + "epoch": 0.624930901050304, + "grad_norm": 4.034906387329102, + "learning_rate": 1.9286526435474702e-05, + "loss": 5.0124, + "step": 11305 + }, + { + "epoch": 0.6252072968490879, + "grad_norm": 3.6741175651550293, + "learning_rate": 1.9272313814667426e-05, + "loss": 5.1274, + "step": 11310 + }, + { + "epoch": 0.6254836926478717, + "grad_norm": 3.136146068572998, + "learning_rate": 1.9258101193860147e-05, + "loss": 4.9906, + "step": 11315 + }, + { + "epoch": 0.6257600884466556, + "grad_norm": 2.70194673538208, + "learning_rate": 1.924388857305287e-05, + "loss": 4.8283, + "step": 11320 + }, + { + "epoch": 0.6260364842454395, + "grad_norm": 3.145467758178711, + "learning_rate": 1.9229675952245595e-05, + "loss": 5.2228, + "step": 11325 + }, + { + "epoch": 0.6263128800442234, + "grad_norm": 3.2286715507507324, + "learning_rate": 1.921546333143832e-05, + "loss": 4.9371, + "step": 11330 + }, + { + "epoch": 0.6265892758430072, + "grad_norm": 3.4829323291778564, + "learning_rate": 1.9201250710631043e-05, + "loss": 4.8272, + "step": 11335 + }, + { + "epoch": 0.6268656716417911, + "grad_norm": 3.874124050140381, + "learning_rate": 1.9187038089823763e-05, + "loss": 5.1033, + "step": 11340 + }, + { + "epoch": 0.6271420674405749, + "grad_norm": 3.66743540763855, + "learning_rate": 1.9172825469016487e-05, + "loss": 5.012, + "step": 11345 + }, + { + "epoch": 0.6274184632393588, + "grad_norm": 3.3614590167999268, + "learning_rate": 1.915861284820921e-05, + "loss": 5.2151, + "step": 11350 + }, + { + "epoch": 0.6276948590381426, + "grad_norm": 3.108370780944824, + "learning_rate": 1.9144400227401935e-05, + "loss": 4.9544, + "step": 11355 + }, + { + "epoch": 0.6279712548369265, + "grad_norm": 4.2630391120910645, + "learning_rate": 1.9130187606594656e-05, + "loss": 4.6829, + "step": 11360 + }, + { + "epoch": 0.6282476506357103, + "grad_norm": 4.369514465332031, + "learning_rate": 1.911597498578738e-05, + "loss": 5.0551, + "step": 11365 + }, + { + "epoch": 0.6285240464344942, + "grad_norm": 3.091458559036255, + "learning_rate": 1.9101762364980104e-05, + "loss": 5.2453, + "step": 11370 + }, + { + "epoch": 0.628800442233278, + "grad_norm": 3.507791757583618, + "learning_rate": 1.9087549744172824e-05, + "loss": 4.803, + "step": 11375 + }, + { + "epoch": 0.6290768380320619, + "grad_norm": 3.9560585021972656, + "learning_rate": 1.907333712336555e-05, + "loss": 4.9198, + "step": 11380 + }, + { + "epoch": 0.6293532338308457, + "grad_norm": 3.2745492458343506, + "learning_rate": 1.9059124502558272e-05, + "loss": 5.143, + "step": 11385 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 3.749288558959961, + "learning_rate": 1.9044911881750996e-05, + "loss": 5.0508, + "step": 11390 + }, + { + "epoch": 0.6299060254284135, + "grad_norm": 3.2741405963897705, + "learning_rate": 1.903069926094372e-05, + "loss": 4.849, + "step": 11395 + }, + { + "epoch": 0.6301824212271974, + "grad_norm": 3.3491995334625244, + "learning_rate": 1.901648664013644e-05, + "loss": 5.0678, + "step": 11400 + }, + { + "epoch": 0.6304588170259812, + "grad_norm": 3.6836395263671875, + "learning_rate": 1.9002274019329168e-05, + "loss": 5.2173, + "step": 11405 + }, + { + "epoch": 0.6307352128247651, + "grad_norm": 3.4828968048095703, + "learning_rate": 1.898806139852189e-05, + "loss": 4.8904, + "step": 11410 + }, + { + "epoch": 0.6310116086235489, + "grad_norm": 3.472628355026245, + "learning_rate": 1.8973848777714613e-05, + "loss": 5.1827, + "step": 11415 + }, + { + "epoch": 0.6312880044223328, + "grad_norm": 3.0321156978607178, + "learning_rate": 1.8959636156907333e-05, + "loss": 4.8185, + "step": 11420 + }, + { + "epoch": 0.6315644002211166, + "grad_norm": 3.027168035507202, + "learning_rate": 1.8945423536100057e-05, + "loss": 5.1646, + "step": 11425 + }, + { + "epoch": 0.6318407960199005, + "grad_norm": 3.633697748184204, + "learning_rate": 1.893121091529278e-05, + "loss": 4.8653, + "step": 11430 + }, + { + "epoch": 0.6321171918186843, + "grad_norm": 3.8917782306671143, + "learning_rate": 1.8916998294485505e-05, + "loss": 5.1636, + "step": 11435 + }, + { + "epoch": 0.6323935876174682, + "grad_norm": 3.511700391769409, + "learning_rate": 1.890278567367823e-05, + "loss": 5.0657, + "step": 11440 + }, + { + "epoch": 0.632669983416252, + "grad_norm": 3.8616366386413574, + "learning_rate": 1.888857305287095e-05, + "loss": 5.085, + "step": 11445 + }, + { + "epoch": 0.6329463792150359, + "grad_norm": 3.9748528003692627, + "learning_rate": 1.8874360432063674e-05, + "loss": 5.2986, + "step": 11450 + }, + { + "epoch": 0.6332227750138197, + "grad_norm": 3.7701029777526855, + "learning_rate": 1.8860147811256394e-05, + "loss": 5.0137, + "step": 11455 + }, + { + "epoch": 0.6334991708126037, + "grad_norm": 3.903155565261841, + "learning_rate": 1.884593519044912e-05, + "loss": 5.0449, + "step": 11460 + }, + { + "epoch": 0.6337755666113875, + "grad_norm": 4.124213218688965, + "learning_rate": 1.8831722569641842e-05, + "loss": 4.9746, + "step": 11465 + }, + { + "epoch": 0.6340519624101714, + "grad_norm": 3.3113105297088623, + "learning_rate": 1.8817509948834566e-05, + "loss": 5.3676, + "step": 11470 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 3.800441265106201, + "learning_rate": 1.880329732802729e-05, + "loss": 4.9518, + "step": 11475 + }, + { + "epoch": 0.6346047540077391, + "grad_norm": 3.8035666942596436, + "learning_rate": 1.878908470722001e-05, + "loss": 5.1185, + "step": 11480 + }, + { + "epoch": 0.634881149806523, + "grad_norm": 3.5756587982177734, + "learning_rate": 1.8774872086412735e-05, + "loss": 5.027, + "step": 11485 + }, + { + "epoch": 0.6351575456053068, + "grad_norm": 4.0546875, + "learning_rate": 1.876065946560546e-05, + "loss": 5.163, + "step": 11490 + }, + { + "epoch": 0.6354339414040907, + "grad_norm": 3.088939666748047, + "learning_rate": 1.8746446844798183e-05, + "loss": 5.0475, + "step": 11495 + }, + { + "epoch": 0.6357103372028745, + "grad_norm": 3.1749298572540283, + "learning_rate": 1.8732234223990903e-05, + "loss": 4.7649, + "step": 11500 + }, + { + "epoch": 0.6359867330016584, + "grad_norm": 3.7194294929504395, + "learning_rate": 1.8718021603183627e-05, + "loss": 4.9005, + "step": 11505 + }, + { + "epoch": 0.6362631288004422, + "grad_norm": 3.6377458572387695, + "learning_rate": 1.870380898237635e-05, + "loss": 4.8569, + "step": 11510 + }, + { + "epoch": 0.6365395245992261, + "grad_norm": 3.6304149627685547, + "learning_rate": 1.8689596361569075e-05, + "loss": 5.1913, + "step": 11515 + }, + { + "epoch": 0.6368159203980099, + "grad_norm": 3.7646942138671875, + "learning_rate": 1.86753837407618e-05, + "loss": 4.9044, + "step": 11520 + }, + { + "epoch": 0.6370923161967939, + "grad_norm": 2.9954702854156494, + "learning_rate": 1.866117111995452e-05, + "loss": 4.9666, + "step": 11525 + }, + { + "epoch": 0.6373687119955777, + "grad_norm": 3.282142162322998, + "learning_rate": 1.8646958499147244e-05, + "loss": 4.8544, + "step": 11530 + }, + { + "epoch": 0.6376451077943616, + "grad_norm": 3.545342206954956, + "learning_rate": 1.8632745878339964e-05, + "loss": 4.9769, + "step": 11535 + }, + { + "epoch": 0.6379215035931454, + "grad_norm": 3.873244285583496, + "learning_rate": 1.861853325753269e-05, + "loss": 5.1524, + "step": 11540 + }, + { + "epoch": 0.6381978993919293, + "grad_norm": 3.273556709289551, + "learning_rate": 1.8604320636725416e-05, + "loss": 4.9264, + "step": 11545 + }, + { + "epoch": 0.6384742951907131, + "grad_norm": 4.054984092712402, + "learning_rate": 1.8590108015918136e-05, + "loss": 4.9509, + "step": 11550 + }, + { + "epoch": 0.638750690989497, + "grad_norm": 4.351573467254639, + "learning_rate": 1.857589539511086e-05, + "loss": 5.0652, + "step": 11555 + }, + { + "epoch": 0.6390270867882808, + "grad_norm": 3.1132781505584717, + "learning_rate": 1.856168277430358e-05, + "loss": 5.0784, + "step": 11560 + }, + { + "epoch": 0.6393034825870647, + "grad_norm": 3.2859065532684326, + "learning_rate": 1.8547470153496305e-05, + "loss": 5.099, + "step": 11565 + }, + { + "epoch": 0.6395798783858485, + "grad_norm": 3.499131441116333, + "learning_rate": 1.853325753268903e-05, + "loss": 5.0266, + "step": 11570 + }, + { + "epoch": 0.6398562741846324, + "grad_norm": 3.202064037322998, + "learning_rate": 1.8519044911881753e-05, + "loss": 4.8385, + "step": 11575 + }, + { + "epoch": 0.6401326699834162, + "grad_norm": 3.816030502319336, + "learning_rate": 1.8504832291074477e-05, + "loss": 4.8693, + "step": 11580 + }, + { + "epoch": 0.6404090657822001, + "grad_norm": 3.5522069931030273, + "learning_rate": 1.8490619670267197e-05, + "loss": 5.2284, + "step": 11585 + }, + { + "epoch": 0.6406854615809839, + "grad_norm": 2.9385523796081543, + "learning_rate": 1.847640704945992e-05, + "loss": 4.9548, + "step": 11590 + }, + { + "epoch": 0.6409618573797679, + "grad_norm": 3.6049933433532715, + "learning_rate": 1.8462194428652642e-05, + "loss": 4.9881, + "step": 11595 + }, + { + "epoch": 0.6412382531785517, + "grad_norm": 2.8920090198516846, + "learning_rate": 1.844798180784537e-05, + "loss": 4.9215, + "step": 11600 + }, + { + "epoch": 0.6415146489773356, + "grad_norm": 3.008737564086914, + "learning_rate": 1.843376918703809e-05, + "loss": 4.9579, + "step": 11605 + }, + { + "epoch": 0.6417910447761194, + "grad_norm": 3.340202808380127, + "learning_rate": 1.8419556566230814e-05, + "loss": 4.9546, + "step": 11610 + }, + { + "epoch": 0.6420674405749033, + "grad_norm": 4.0580973625183105, + "learning_rate": 1.8405343945423538e-05, + "loss": 5.3337, + "step": 11615 + }, + { + "epoch": 0.6423438363736871, + "grad_norm": 3.040929079055786, + "learning_rate": 1.839113132461626e-05, + "loss": 4.8372, + "step": 11620 + }, + { + "epoch": 0.642620232172471, + "grad_norm": 3.111201524734497, + "learning_rate": 1.8376918703808986e-05, + "loss": 5.1435, + "step": 11625 + }, + { + "epoch": 0.6428966279712548, + "grad_norm": 3.6376893520355225, + "learning_rate": 1.8362706083001706e-05, + "loss": 4.9107, + "step": 11630 + }, + { + "epoch": 0.6431730237700387, + "grad_norm": 3.81954288482666, + "learning_rate": 1.834849346219443e-05, + "loss": 5.0791, + "step": 11635 + }, + { + "epoch": 0.6434494195688225, + "grad_norm": 2.9065468311309814, + "learning_rate": 1.833428084138715e-05, + "loss": 5.0171, + "step": 11640 + }, + { + "epoch": 0.6437258153676064, + "grad_norm": 3.3919105529785156, + "learning_rate": 1.8320068220579875e-05, + "loss": 4.9347, + "step": 11645 + }, + { + "epoch": 0.6440022111663902, + "grad_norm": 2.9932496547698975, + "learning_rate": 1.83058555997726e-05, + "loss": 5.207, + "step": 11650 + }, + { + "epoch": 0.6442786069651741, + "grad_norm": 3.7456681728363037, + "learning_rate": 1.8291642978965323e-05, + "loss": 4.9296, + "step": 11655 + }, + { + "epoch": 0.6445550027639579, + "grad_norm": 3.1489369869232178, + "learning_rate": 1.8277430358158047e-05, + "loss": 4.8545, + "step": 11660 + }, + { + "epoch": 0.6448313985627419, + "grad_norm": 3.770603895187378, + "learning_rate": 1.8263217737350768e-05, + "loss": 4.9182, + "step": 11665 + }, + { + "epoch": 0.6451077943615257, + "grad_norm": 3.3885293006896973, + "learning_rate": 1.824900511654349e-05, + "loss": 5.0724, + "step": 11670 + }, + { + "epoch": 0.6453841901603096, + "grad_norm": 3.6072707176208496, + "learning_rate": 1.8234792495736212e-05, + "loss": 4.9864, + "step": 11675 + }, + { + "epoch": 0.6456605859590934, + "grad_norm": 3.027808904647827, + "learning_rate": 1.822057987492894e-05, + "loss": 5.3012, + "step": 11680 + }, + { + "epoch": 0.6459369817578773, + "grad_norm": 3.8706979751586914, + "learning_rate": 1.8206367254121663e-05, + "loss": 5.0479, + "step": 11685 + }, + { + "epoch": 0.6462133775566611, + "grad_norm": 2.9442152976989746, + "learning_rate": 1.8192154633314384e-05, + "loss": 5.153, + "step": 11690 + }, + { + "epoch": 0.646489773355445, + "grad_norm": 3.645040512084961, + "learning_rate": 1.8177942012507108e-05, + "loss": 5.0203, + "step": 11695 + }, + { + "epoch": 0.6467661691542289, + "grad_norm": 3.7537355422973633, + "learning_rate": 1.816372939169983e-05, + "loss": 5.155, + "step": 11700 + }, + { + "epoch": 0.6470425649530127, + "grad_norm": 3.36226749420166, + "learning_rate": 1.8149516770892553e-05, + "loss": 5.1796, + "step": 11705 + }, + { + "epoch": 0.6473189607517966, + "grad_norm": 3.477741241455078, + "learning_rate": 1.8135304150085277e-05, + "loss": 4.8361, + "step": 11710 + }, + { + "epoch": 0.6475953565505804, + "grad_norm": 3.8094239234924316, + "learning_rate": 1.8121091529278e-05, + "loss": 4.9998, + "step": 11715 + }, + { + "epoch": 0.6478717523493643, + "grad_norm": 3.418806791305542, + "learning_rate": 1.8106878908470725e-05, + "loss": 5.1284, + "step": 11720 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 3.108595132827759, + "learning_rate": 1.8092666287663445e-05, + "loss": 5.0816, + "step": 11725 + }, + { + "epoch": 0.648424543946932, + "grad_norm": 3.583040475845337, + "learning_rate": 1.807845366685617e-05, + "loss": 5.0151, + "step": 11730 + }, + { + "epoch": 0.6487009397457159, + "grad_norm": 2.922956705093384, + "learning_rate": 1.8064241046048893e-05, + "loss": 5.2425, + "step": 11735 + }, + { + "epoch": 0.6489773355444998, + "grad_norm": 3.604804039001465, + "learning_rate": 1.8050028425241617e-05, + "loss": 4.9848, + "step": 11740 + }, + { + "epoch": 0.6492537313432836, + "grad_norm": 3.4737608432769775, + "learning_rate": 1.8035815804434338e-05, + "loss": 5.0245, + "step": 11745 + }, + { + "epoch": 0.6495301271420675, + "grad_norm": 3.2414677143096924, + "learning_rate": 1.802160318362706e-05, + "loss": 4.8838, + "step": 11750 + }, + { + "epoch": 0.6498065229408513, + "grad_norm": 3.7715156078338623, + "learning_rate": 1.8007390562819786e-05, + "loss": 4.8937, + "step": 11755 + }, + { + "epoch": 0.6500829187396352, + "grad_norm": 3.0882837772369385, + "learning_rate": 1.7993177942012506e-05, + "loss": 5.4637, + "step": 11760 + }, + { + "epoch": 0.650359314538419, + "grad_norm": 3.7343342304229736, + "learning_rate": 1.7978965321205234e-05, + "loss": 4.9524, + "step": 11765 + }, + { + "epoch": 0.6506357103372029, + "grad_norm": 3.9432291984558105, + "learning_rate": 1.7964752700397954e-05, + "loss": 5.1691, + "step": 11770 + }, + { + "epoch": 0.6509121061359867, + "grad_norm": 3.1902172565460205, + "learning_rate": 1.7950540079590678e-05, + "loss": 5.2231, + "step": 11775 + }, + { + "epoch": 0.6511885019347706, + "grad_norm": 3.733781099319458, + "learning_rate": 1.79363274587834e-05, + "loss": 5.1506, + "step": 11780 + }, + { + "epoch": 0.6514648977335544, + "grad_norm": 3.6218392848968506, + "learning_rate": 1.7922114837976123e-05, + "loss": 5.1997, + "step": 11785 + }, + { + "epoch": 0.6517412935323383, + "grad_norm": 4.095831394195557, + "learning_rate": 1.7907902217168847e-05, + "loss": 4.934, + "step": 11790 + }, + { + "epoch": 0.6520176893311221, + "grad_norm": 2.9039041996002197, + "learning_rate": 1.789368959636157e-05, + "loss": 4.8885, + "step": 11795 + }, + { + "epoch": 0.652294085129906, + "grad_norm": 2.837620496749878, + "learning_rate": 1.7879476975554295e-05, + "loss": 4.7736, + "step": 11800 + }, + { + "epoch": 0.6525704809286899, + "grad_norm": 4.38239049911499, + "learning_rate": 1.7865264354747015e-05, + "loss": 5.1993, + "step": 11805 + }, + { + "epoch": 0.6528468767274738, + "grad_norm": 3.1232762336730957, + "learning_rate": 1.785105173393974e-05, + "loss": 4.8217, + "step": 11810 + }, + { + "epoch": 0.6531232725262576, + "grad_norm": 3.8106727600097656, + "learning_rate": 1.783683911313246e-05, + "loss": 5.0121, + "step": 11815 + }, + { + "epoch": 0.6533996683250415, + "grad_norm": 3.4995906352996826, + "learning_rate": 1.7822626492325187e-05, + "loss": 5.0138, + "step": 11820 + }, + { + "epoch": 0.6536760641238253, + "grad_norm": 3.8078784942626953, + "learning_rate": 1.7808413871517908e-05, + "loss": 5.102, + "step": 11825 + }, + { + "epoch": 0.6539524599226092, + "grad_norm": 3.3027868270874023, + "learning_rate": 1.7794201250710632e-05, + "loss": 4.8747, + "step": 11830 + }, + { + "epoch": 0.654228855721393, + "grad_norm": 3.113663673400879, + "learning_rate": 1.7779988629903356e-05, + "loss": 4.9301, + "step": 11835 + }, + { + "epoch": 0.6545052515201769, + "grad_norm": 3.6533520221710205, + "learning_rate": 1.7765776009096076e-05, + "loss": 4.8014, + "step": 11840 + }, + { + "epoch": 0.6547816473189607, + "grad_norm": 3.131390333175659, + "learning_rate": 1.77515633882888e-05, + "loss": 4.8878, + "step": 11845 + }, + { + "epoch": 0.6550580431177446, + "grad_norm": 3.265279769897461, + "learning_rate": 1.7737350767481524e-05, + "loss": 5.1533, + "step": 11850 + }, + { + "epoch": 0.6553344389165284, + "grad_norm": 3.4372310638427734, + "learning_rate": 1.7723138146674248e-05, + "loss": 4.766, + "step": 11855 + }, + { + "epoch": 0.6556108347153123, + "grad_norm": 3.4847805500030518, + "learning_rate": 1.7708925525866972e-05, + "loss": 4.963, + "step": 11860 + }, + { + "epoch": 0.6558872305140961, + "grad_norm": 4.266146659851074, + "learning_rate": 1.7694712905059693e-05, + "loss": 4.8469, + "step": 11865 + }, + { + "epoch": 0.65616362631288, + "grad_norm": 3.8087639808654785, + "learning_rate": 1.7680500284252417e-05, + "loss": 5.1229, + "step": 11870 + }, + { + "epoch": 0.6564400221116639, + "grad_norm": 3.209784507751465, + "learning_rate": 1.766628766344514e-05, + "loss": 5.0318, + "step": 11875 + }, + { + "epoch": 0.6567164179104478, + "grad_norm": 3.074418306350708, + "learning_rate": 1.7652075042637865e-05, + "loss": 4.9064, + "step": 11880 + }, + { + "epoch": 0.6569928137092316, + "grad_norm": 3.3850672245025635, + "learning_rate": 1.7637862421830585e-05, + "loss": 4.882, + "step": 11885 + }, + { + "epoch": 0.6572692095080155, + "grad_norm": 2.94960618019104, + "learning_rate": 1.762364980102331e-05, + "loss": 5.1099, + "step": 11890 + }, + { + "epoch": 0.6575456053067993, + "grad_norm": 3.0878140926361084, + "learning_rate": 1.7609437180216033e-05, + "loss": 5.1642, + "step": 11895 + }, + { + "epoch": 0.6578220011055832, + "grad_norm": 3.899074077606201, + "learning_rate": 1.7595224559408754e-05, + "loss": 5.0711, + "step": 11900 + }, + { + "epoch": 0.658098396904367, + "grad_norm": 3.6305460929870605, + "learning_rate": 1.758101193860148e-05, + "loss": 4.9042, + "step": 11905 + }, + { + "epoch": 0.6583747927031509, + "grad_norm": 3.7980971336364746, + "learning_rate": 1.7566799317794202e-05, + "loss": 5.0719, + "step": 11910 + }, + { + "epoch": 0.6586511885019348, + "grad_norm": 3.5552990436553955, + "learning_rate": 1.7552586696986926e-05, + "loss": 5.0205, + "step": 11915 + }, + { + "epoch": 0.6589275843007186, + "grad_norm": 3.524707794189453, + "learning_rate": 1.7538374076179646e-05, + "loss": 4.978, + "step": 11920 + }, + { + "epoch": 0.6592039800995025, + "grad_norm": 3.4494073390960693, + "learning_rate": 1.752416145537237e-05, + "loss": 4.9419, + "step": 11925 + }, + { + "epoch": 0.6594803758982863, + "grad_norm": 2.977220058441162, + "learning_rate": 1.7509948834565094e-05, + "loss": 5.1477, + "step": 11930 + }, + { + "epoch": 0.6597567716970703, + "grad_norm": 4.73137903213501, + "learning_rate": 1.749573621375782e-05, + "loss": 4.9816, + "step": 11935 + }, + { + "epoch": 0.6600331674958541, + "grad_norm": 3.988065719604492, + "learning_rate": 1.7481523592950542e-05, + "loss": 4.8524, + "step": 11940 + }, + { + "epoch": 0.660309563294638, + "grad_norm": 3.581176996231079, + "learning_rate": 1.7467310972143263e-05, + "loss": 5.3324, + "step": 11945 + }, + { + "epoch": 0.6605859590934218, + "grad_norm": 3.1877663135528564, + "learning_rate": 1.7453098351335987e-05, + "loss": 4.9964, + "step": 11950 + }, + { + "epoch": 0.6608623548922057, + "grad_norm": 3.8775217533111572, + "learning_rate": 1.7438885730528708e-05, + "loss": 4.9952, + "step": 11955 + }, + { + "epoch": 0.6611387506909895, + "grad_norm": 3.670485496520996, + "learning_rate": 1.7424673109721435e-05, + "loss": 4.9151, + "step": 11960 + }, + { + "epoch": 0.6614151464897734, + "grad_norm": 5.177221298217773, + "learning_rate": 1.7410460488914155e-05, + "loss": 5.3336, + "step": 11965 + }, + { + "epoch": 0.6616915422885572, + "grad_norm": 3.854691982269287, + "learning_rate": 1.739624786810688e-05, + "loss": 5.2463, + "step": 11970 + }, + { + "epoch": 0.6619679380873411, + "grad_norm": 3.330227851867676, + "learning_rate": 1.7382035247299603e-05, + "loss": 4.4221, + "step": 11975 + }, + { + "epoch": 0.6622443338861249, + "grad_norm": 3.3024230003356934, + "learning_rate": 1.7367822626492324e-05, + "loss": 4.9856, + "step": 11980 + }, + { + "epoch": 0.6625207296849088, + "grad_norm": 3.446526527404785, + "learning_rate": 1.735361000568505e-05, + "loss": 4.9725, + "step": 11985 + }, + { + "epoch": 0.6627971254836926, + "grad_norm": 3.300661087036133, + "learning_rate": 1.7339397384877772e-05, + "loss": 4.965, + "step": 11990 + }, + { + "epoch": 0.6630735212824765, + "grad_norm": 3.3495724201202393, + "learning_rate": 1.7325184764070496e-05, + "loss": 5.1418, + "step": 11995 + }, + { + "epoch": 0.6633499170812603, + "grad_norm": 3.493957281112671, + "learning_rate": 1.731097214326322e-05, + "loss": 4.8989, + "step": 12000 + } + ], + "logging_steps": 5, + "max_steps": 18090, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1151619342811136e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}