{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.984819734345351, "eval_steps": 500, "global_step": 10500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003795066413662239, "grad_norm": 1.7131669521331787, "learning_rate": 1.6666666666666667e-06, "loss": 0.6804, "step": 10 }, { "epoch": 0.007590132827324478, "grad_norm": 1.7053213119506836, "learning_rate": 3.3333333333333333e-06, "loss": 0.6854, "step": 20 }, { "epoch": 0.011385199240986717, "grad_norm": 1.8932372331619263, "learning_rate": 5e-06, "loss": 0.6597, "step": 30 }, { "epoch": 0.015180265654648957, "grad_norm": 1.516872763633728, "learning_rate": 6.666666666666667e-06, "loss": 0.6243, "step": 40 }, { "epoch": 0.018975332068311195, "grad_norm": 1.5149081945419312, "learning_rate": 8.333333333333334e-06, "loss": 0.5632, "step": 50 }, { "epoch": 0.022770398481973434, "grad_norm": 1.0706552267074585, "learning_rate": 1e-05, "loss": 0.5289, "step": 60 }, { "epoch": 0.026565464895635674, "grad_norm": 1.102160930633545, "learning_rate": 1.1666666666666668e-05, "loss": 0.5263, "step": 70 }, { "epoch": 0.030360531309297913, "grad_norm": 1.2059059143066406, "learning_rate": 1.3333333333333333e-05, "loss": 0.5466, "step": 80 }, { "epoch": 0.03415559772296015, "grad_norm": 1.0622307062149048, "learning_rate": 1.5e-05, "loss": 0.4918, "step": 90 }, { "epoch": 0.03795066413662239, "grad_norm": 1.5696407556533813, "learning_rate": 1.6666666666666667e-05, "loss": 0.516, "step": 100 }, { "epoch": 0.04174573055028463, "grad_norm": 1.49858820438385, "learning_rate": 1.8333333333333333e-05, "loss": 0.5024, "step": 110 }, { "epoch": 0.04554079696394687, "grad_norm": 1.5996527671813965, "learning_rate": 2e-05, "loss": 0.4775, "step": 120 }, { "epoch": 0.04933586337760911, "grad_norm": 1.6391699314117432, "learning_rate": 2.1666666666666667e-05, "loss": 0.5028, "step": 130 }, { "epoch": 0.05313092979127135, "grad_norm": 1.5045441389083862, "learning_rate": 2.3333333333333336e-05, "loss": 0.472, "step": 140 }, { "epoch": 0.056925996204933584, "grad_norm": 1.1791646480560303, "learning_rate": 2.5e-05, "loss": 0.4606, "step": 150 }, { "epoch": 0.06072106261859583, "grad_norm": 1.3659300804138184, "learning_rate": 2.6666666666666667e-05, "loss": 0.527, "step": 160 }, { "epoch": 0.06451612903225806, "grad_norm": 0.9830155968666077, "learning_rate": 2.8333333333333335e-05, "loss": 0.458, "step": 170 }, { "epoch": 0.0683111954459203, "grad_norm": 1.6211776733398438, "learning_rate": 3e-05, "loss": 0.4613, "step": 180 }, { "epoch": 0.07210626185958255, "grad_norm": 1.9507710933685303, "learning_rate": 3.1666666666666666e-05, "loss": 0.4531, "step": 190 }, { "epoch": 0.07590132827324478, "grad_norm": 1.312615156173706, "learning_rate": 3.3333333333333335e-05, "loss": 0.4384, "step": 200 }, { "epoch": 0.07969639468690702, "grad_norm": 2.034919261932373, "learning_rate": 3.5e-05, "loss": 0.4747, "step": 210 }, { "epoch": 0.08349146110056926, "grad_norm": 2.045759677886963, "learning_rate": 3.6666666666666666e-05, "loss": 0.4153, "step": 220 }, { "epoch": 0.0872865275142315, "grad_norm": 2.0934813022613525, "learning_rate": 3.8333333333333334e-05, "loss": 0.3829, "step": 230 }, { "epoch": 0.09108159392789374, "grad_norm": 2.4255552291870117, "learning_rate": 4e-05, "loss": 0.3816, "step": 240 }, { "epoch": 0.09487666034155598, "grad_norm": 1.42184579372406, "learning_rate": 4.166666666666667e-05, "loss": 0.3948, "step": 250 }, { "epoch": 0.09867172675521822, "grad_norm": 1.6787000894546509, "learning_rate": 4.3333333333333334e-05, "loss": 0.3877, "step": 260 }, { "epoch": 0.10246679316888045, "grad_norm": 2.121290445327759, "learning_rate": 4.5e-05, "loss": 0.3732, "step": 270 }, { "epoch": 0.1062618595825427, "grad_norm": 1.5, "learning_rate": 4.666666666666667e-05, "loss": 0.3567, "step": 280 }, { "epoch": 0.11005692599620494, "grad_norm": 3.0193252563476562, "learning_rate": 4.8333333333333334e-05, "loss": 0.3916, "step": 290 }, { "epoch": 0.11385199240986717, "grad_norm": 2.7301666736602783, "learning_rate": 5e-05, "loss": 0.3723, "step": 300 }, { "epoch": 0.11764705882352941, "grad_norm": 1.8423070907592773, "learning_rate": 4.9951171875e-05, "loss": 0.3214, "step": 310 }, { "epoch": 0.12144212523719165, "grad_norm": 1.204102873802185, "learning_rate": 4.990234375e-05, "loss": 0.3251, "step": 320 }, { "epoch": 0.1252371916508539, "grad_norm": 1.803913950920105, "learning_rate": 4.9853515625000005e-05, "loss": 0.3942, "step": 330 }, { "epoch": 0.12903225806451613, "grad_norm": 3.175114154815674, "learning_rate": 4.9804687500000004e-05, "loss": 0.39, "step": 340 }, { "epoch": 0.13282732447817835, "grad_norm": 2.4476590156555176, "learning_rate": 4.9755859375e-05, "loss": 0.349, "step": 350 }, { "epoch": 0.1366223908918406, "grad_norm": 1.2592339515686035, "learning_rate": 4.970703125e-05, "loss": 0.3315, "step": 360 }, { "epoch": 0.14041745730550284, "grad_norm": 1.6238622665405273, "learning_rate": 4.9658203125e-05, "loss": 0.3307, "step": 370 }, { "epoch": 0.1442125237191651, "grad_norm": 1.3984373807907104, "learning_rate": 4.9609375000000005e-05, "loss": 0.294, "step": 380 }, { "epoch": 0.14800759013282733, "grad_norm": 3.1960623264312744, "learning_rate": 4.9560546875e-05, "loss": 0.3314, "step": 390 }, { "epoch": 0.15180265654648956, "grad_norm": 1.5345971584320068, "learning_rate": 4.951171875e-05, "loss": 0.3438, "step": 400 }, { "epoch": 0.1555977229601518, "grad_norm": 3.1037323474884033, "learning_rate": 4.9462890625e-05, "loss": 0.3246, "step": 410 }, { "epoch": 0.15939278937381404, "grad_norm": 3.519519805908203, "learning_rate": 4.94140625e-05, "loss": 0.3087, "step": 420 }, { "epoch": 0.16318785578747627, "grad_norm": 1.347273826599121, "learning_rate": 4.9365234375000005e-05, "loss": 0.3303, "step": 430 }, { "epoch": 0.16698292220113853, "grad_norm": 1.2372374534606934, "learning_rate": 4.931640625e-05, "loss": 0.3225, "step": 440 }, { "epoch": 0.17077798861480076, "grad_norm": 0.9122889637947083, "learning_rate": 4.9267578125e-05, "loss": 0.3081, "step": 450 }, { "epoch": 0.174573055028463, "grad_norm": 3.7750535011291504, "learning_rate": 4.921875e-05, "loss": 0.2785, "step": 460 }, { "epoch": 0.17836812144212524, "grad_norm": 1.0529924631118774, "learning_rate": 4.9169921875000006e-05, "loss": 0.283, "step": 470 }, { "epoch": 0.18216318785578747, "grad_norm": 1.5323132276535034, "learning_rate": 4.9121093750000004e-05, "loss": 0.2982, "step": 480 }, { "epoch": 0.1859582542694497, "grad_norm": 1.1751055717468262, "learning_rate": 4.9072265625e-05, "loss": 0.2639, "step": 490 }, { "epoch": 0.18975332068311196, "grad_norm": 1.0208653211593628, "learning_rate": 4.90234375e-05, "loss": 0.2651, "step": 500 }, { "epoch": 0.1935483870967742, "grad_norm": 1.7089987993240356, "learning_rate": 4.8974609375e-05, "loss": 0.2572, "step": 510 }, { "epoch": 0.19734345351043645, "grad_norm": 4.918070316314697, "learning_rate": 4.8925781250000006e-05, "loss": 0.299, "step": 520 }, { "epoch": 0.20113851992409867, "grad_norm": 1.117162823677063, "learning_rate": 4.8876953125000004e-05, "loss": 0.2699, "step": 530 }, { "epoch": 0.2049335863377609, "grad_norm": 1.813411831855774, "learning_rate": 4.8828125e-05, "loss": 0.2391, "step": 540 }, { "epoch": 0.20872865275142316, "grad_norm": 3.368643283843994, "learning_rate": 4.8779296875e-05, "loss": 0.3022, "step": 550 }, { "epoch": 0.2125237191650854, "grad_norm": 16.486289978027344, "learning_rate": 4.873046875e-05, "loss": 0.2837, "step": 560 }, { "epoch": 0.21631878557874762, "grad_norm": 1.3590037822723389, "learning_rate": 4.8681640625000005e-05, "loss": 0.2182, "step": 570 }, { "epoch": 0.22011385199240988, "grad_norm": 1.8672986030578613, "learning_rate": 4.8632812500000004e-05, "loss": 0.2925, "step": 580 }, { "epoch": 0.2239089184060721, "grad_norm": 2.350752592086792, "learning_rate": 4.8583984375e-05, "loss": 0.2585, "step": 590 }, { "epoch": 0.22770398481973433, "grad_norm": 2.4918649196624756, "learning_rate": 4.853515625e-05, "loss": 0.2824, "step": 600 }, { "epoch": 0.2314990512333966, "grad_norm": 2.4856553077697754, "learning_rate": 4.8486328125e-05, "loss": 0.2444, "step": 610 }, { "epoch": 0.23529411764705882, "grad_norm": 1.87199866771698, "learning_rate": 4.8437500000000005e-05, "loss": 0.256, "step": 620 }, { "epoch": 0.23908918406072105, "grad_norm": 1.0694291591644287, "learning_rate": 4.8388671875000004e-05, "loss": 0.245, "step": 630 }, { "epoch": 0.2428842504743833, "grad_norm": 0.7904035449028015, "learning_rate": 4.833984375e-05, "loss": 0.2588, "step": 640 }, { "epoch": 0.24667931688804554, "grad_norm": 2.714871883392334, "learning_rate": 4.8291015625e-05, "loss": 0.2741, "step": 650 }, { "epoch": 0.2504743833017078, "grad_norm": 3.948547124862671, "learning_rate": 4.82421875e-05, "loss": 0.2335, "step": 660 }, { "epoch": 0.25426944971537, "grad_norm": 1.6354694366455078, "learning_rate": 4.8193359375000005e-05, "loss": 0.2298, "step": 670 }, { "epoch": 0.25806451612903225, "grad_norm": 1.1305994987487793, "learning_rate": 4.8144531250000003e-05, "loss": 0.2279, "step": 680 }, { "epoch": 0.2618595825426945, "grad_norm": 1.804825782775879, "learning_rate": 4.8095703125e-05, "loss": 0.2401, "step": 690 }, { "epoch": 0.2656546489563567, "grad_norm": 1.0778950452804565, "learning_rate": 4.8046875e-05, "loss": 0.2498, "step": 700 }, { "epoch": 0.269449715370019, "grad_norm": 2.672403335571289, "learning_rate": 4.7998046875e-05, "loss": 0.2521, "step": 710 }, { "epoch": 0.2732447817836812, "grad_norm": 1.0559144020080566, "learning_rate": 4.7949218750000005e-05, "loss": 0.1855, "step": 720 }, { "epoch": 0.27703984819734345, "grad_norm": 1.3226491212844849, "learning_rate": 4.7900390625e-05, "loss": 0.21, "step": 730 }, { "epoch": 0.2808349146110057, "grad_norm": 2.1266074180603027, "learning_rate": 4.78515625e-05, "loss": 0.2232, "step": 740 }, { "epoch": 0.2846299810246679, "grad_norm": 2.9967539310455322, "learning_rate": 4.7802734375e-05, "loss": 0.2554, "step": 750 }, { "epoch": 0.2884250474383302, "grad_norm": 2.6614627838134766, "learning_rate": 4.775390625e-05, "loss": 0.2811, "step": 760 }, { "epoch": 0.2922201138519924, "grad_norm": 1.64667546749115, "learning_rate": 4.7705078125000004e-05, "loss": 0.2102, "step": 770 }, { "epoch": 0.29601518026565465, "grad_norm": 2.339608669281006, "learning_rate": 4.765625e-05, "loss": 0.2125, "step": 780 }, { "epoch": 0.2998102466793169, "grad_norm": 1.6804083585739136, "learning_rate": 4.7607421875e-05, "loss": 0.2722, "step": 790 }, { "epoch": 0.3036053130929791, "grad_norm": 2.6005263328552246, "learning_rate": 4.755859375e-05, "loss": 0.2067, "step": 800 }, { "epoch": 0.30740037950664134, "grad_norm": 5.113396167755127, "learning_rate": 4.7509765625000006e-05, "loss": 0.1988, "step": 810 }, { "epoch": 0.3111954459203036, "grad_norm": 1.9176031351089478, "learning_rate": 4.7460937500000004e-05, "loss": 0.2416, "step": 820 }, { "epoch": 0.31499051233396586, "grad_norm": 1.5946362018585205, "learning_rate": 4.7412109375e-05, "loss": 0.2416, "step": 830 }, { "epoch": 0.3187855787476281, "grad_norm": 1.6692804098129272, "learning_rate": 4.736328125e-05, "loss": 0.2139, "step": 840 }, { "epoch": 0.3225806451612903, "grad_norm": 4.5298285484313965, "learning_rate": 4.7314453125e-05, "loss": 0.2285, "step": 850 }, { "epoch": 0.32637571157495254, "grad_norm": 1.9948817491531372, "learning_rate": 4.7265625000000005e-05, "loss": 0.2453, "step": 860 }, { "epoch": 0.3301707779886148, "grad_norm": 2.5353565216064453, "learning_rate": 4.7216796875000004e-05, "loss": 0.2259, "step": 870 }, { "epoch": 0.33396584440227706, "grad_norm": 5.23643684387207, "learning_rate": 4.716796875e-05, "loss": 0.2318, "step": 880 }, { "epoch": 0.3377609108159393, "grad_norm": 3.062701463699341, "learning_rate": 4.7119140625e-05, "loss": 0.1835, "step": 890 }, { "epoch": 0.3415559772296015, "grad_norm": 1.5771597623825073, "learning_rate": 4.70703125e-05, "loss": 0.2195, "step": 900 }, { "epoch": 0.34535104364326374, "grad_norm": 0.9039077162742615, "learning_rate": 4.7021484375000005e-05, "loss": 0.1545, "step": 910 }, { "epoch": 0.349146110056926, "grad_norm": 2.7035298347473145, "learning_rate": 4.6972656250000004e-05, "loss": 0.2221, "step": 920 }, { "epoch": 0.35294117647058826, "grad_norm": 2.3225386142730713, "learning_rate": 4.6923828125e-05, "loss": 0.1912, "step": 930 }, { "epoch": 0.3567362428842505, "grad_norm": 1.1066793203353882, "learning_rate": 4.6875e-05, "loss": 0.2003, "step": 940 }, { "epoch": 0.3605313092979127, "grad_norm": 1.2358715534210205, "learning_rate": 4.6826171875e-05, "loss": 0.1944, "step": 950 }, { "epoch": 0.36432637571157495, "grad_norm": 0.5866732001304626, "learning_rate": 4.6777343750000005e-05, "loss": 0.1885, "step": 960 }, { "epoch": 0.3681214421252372, "grad_norm": 1.436168909072876, "learning_rate": 4.6728515625000004e-05, "loss": 0.182, "step": 970 }, { "epoch": 0.3719165085388994, "grad_norm": 1.5037955045700073, "learning_rate": 4.66796875e-05, "loss": 0.2024, "step": 980 }, { "epoch": 0.3757115749525617, "grad_norm": 1.4837393760681152, "learning_rate": 4.6630859375e-05, "loss": 0.2249, "step": 990 }, { "epoch": 0.3795066413662239, "grad_norm": 12.082221031188965, "learning_rate": 4.658203125e-05, "loss": 0.2191, "step": 1000 }, { "epoch": 0.38330170777988615, "grad_norm": 0.7743262648582458, "learning_rate": 4.6533203125000005e-05, "loss": 0.1654, "step": 1010 }, { "epoch": 0.3870967741935484, "grad_norm": 3.7393670082092285, "learning_rate": 4.6484375e-05, "loss": 0.1595, "step": 1020 }, { "epoch": 0.3908918406072106, "grad_norm": 1.2153229713439941, "learning_rate": 4.6435546875e-05, "loss": 0.2276, "step": 1030 }, { "epoch": 0.3946869070208729, "grad_norm": 0.9271629452705383, "learning_rate": 4.638671875e-05, "loss": 0.2039, "step": 1040 }, { "epoch": 0.3984819734345351, "grad_norm": 1.0829685926437378, "learning_rate": 4.6337890625e-05, "loss": 0.1731, "step": 1050 }, { "epoch": 0.40227703984819735, "grad_norm": 1.2705596685409546, "learning_rate": 4.6289062500000005e-05, "loss": 0.1359, "step": 1060 }, { "epoch": 0.4060721062618596, "grad_norm": 4.376911163330078, "learning_rate": 4.6240234375e-05, "loss": 0.2095, "step": 1070 }, { "epoch": 0.4098671726755218, "grad_norm": 2.1292335987091064, "learning_rate": 4.619140625e-05, "loss": 0.1916, "step": 1080 }, { "epoch": 0.41366223908918404, "grad_norm": 1.6525979042053223, "learning_rate": 4.6142578125e-05, "loss": 0.173, "step": 1090 }, { "epoch": 0.4174573055028463, "grad_norm": 4.228000164031982, "learning_rate": 4.609375e-05, "loss": 0.2117, "step": 1100 }, { "epoch": 0.42125237191650855, "grad_norm": 5.334222316741943, "learning_rate": 4.6044921875000004e-05, "loss": 0.185, "step": 1110 }, { "epoch": 0.4250474383301708, "grad_norm": 1.7326403856277466, "learning_rate": 4.599609375e-05, "loss": 0.1875, "step": 1120 }, { "epoch": 0.428842504743833, "grad_norm": 2.4292402267456055, "learning_rate": 4.5947265625e-05, "loss": 0.1747, "step": 1130 }, { "epoch": 0.43263757115749524, "grad_norm": 1.6561298370361328, "learning_rate": 4.58984375e-05, "loss": 0.2017, "step": 1140 }, { "epoch": 0.4364326375711575, "grad_norm": 2.659874439239502, "learning_rate": 4.5849609375000005e-05, "loss": 0.2415, "step": 1150 }, { "epoch": 0.44022770398481975, "grad_norm": 2.743425130844116, "learning_rate": 4.5800781250000004e-05, "loss": 0.2332, "step": 1160 }, { "epoch": 0.444022770398482, "grad_norm": 2.3197848796844482, "learning_rate": 4.5751953125e-05, "loss": 0.1946, "step": 1170 }, { "epoch": 0.4478178368121442, "grad_norm": 2.110534191131592, "learning_rate": 4.5703125e-05, "loss": 0.1948, "step": 1180 }, { "epoch": 0.45161290322580644, "grad_norm": 1.3609685897827148, "learning_rate": 4.5654296875e-05, "loss": 0.1801, "step": 1190 }, { "epoch": 0.45540796963946867, "grad_norm": 3.159426689147949, "learning_rate": 4.5605468750000005e-05, "loss": 0.2184, "step": 1200 }, { "epoch": 0.45920303605313095, "grad_norm": 1.7927987575531006, "learning_rate": 4.5556640625000004e-05, "loss": 0.1604, "step": 1210 }, { "epoch": 0.4629981024667932, "grad_norm": 1.5928328037261963, "learning_rate": 4.55078125e-05, "loss": 0.1693, "step": 1220 }, { "epoch": 0.4667931688804554, "grad_norm": 0.8145284056663513, "learning_rate": 4.5458984375e-05, "loss": 0.1761, "step": 1230 }, { "epoch": 0.47058823529411764, "grad_norm": 0.7765156030654907, "learning_rate": 4.541015625e-05, "loss": 0.1799, "step": 1240 }, { "epoch": 0.47438330170777987, "grad_norm": 1.8456169366836548, "learning_rate": 4.5361328125000005e-05, "loss": 0.168, "step": 1250 }, { "epoch": 0.4781783681214421, "grad_norm": 1.6953251361846924, "learning_rate": 4.5312500000000004e-05, "loss": 0.1945, "step": 1260 }, { "epoch": 0.4819734345351044, "grad_norm": 1.5285083055496216, "learning_rate": 4.5263671875e-05, "loss": 0.2075, "step": 1270 }, { "epoch": 0.4857685009487666, "grad_norm": 2.95650577545166, "learning_rate": 4.521484375e-05, "loss": 0.1601, "step": 1280 }, { "epoch": 0.48956356736242884, "grad_norm": 0.7677034735679626, "learning_rate": 4.5166015625e-05, "loss": 0.1695, "step": 1290 }, { "epoch": 0.49335863377609107, "grad_norm": 1.9959975481033325, "learning_rate": 4.5117187500000005e-05, "loss": 0.2183, "step": 1300 }, { "epoch": 0.4971537001897533, "grad_norm": 1.8000417947769165, "learning_rate": 4.5068359375000003e-05, "loss": 0.175, "step": 1310 }, { "epoch": 0.5009487666034156, "grad_norm": 1.400612473487854, "learning_rate": 4.501953125e-05, "loss": 0.2085, "step": 1320 }, { "epoch": 0.5047438330170778, "grad_norm": 1.6406989097595215, "learning_rate": 4.4970703125e-05, "loss": 0.1537, "step": 1330 }, { "epoch": 0.50853889943074, "grad_norm": 2.0849852561950684, "learning_rate": 4.4921875e-05, "loss": 0.1579, "step": 1340 }, { "epoch": 0.5123339658444023, "grad_norm": 2.6497225761413574, "learning_rate": 4.4873046875000005e-05, "loss": 0.1888, "step": 1350 }, { "epoch": 0.5161290322580645, "grad_norm": 2.2594399452209473, "learning_rate": 4.482421875e-05, "loss": 0.1645, "step": 1360 }, { "epoch": 0.5199240986717267, "grad_norm": 1.3591111898422241, "learning_rate": 4.4775390625e-05, "loss": 0.1876, "step": 1370 }, { "epoch": 0.523719165085389, "grad_norm": 5.060487747192383, "learning_rate": 4.47265625e-05, "loss": 0.1946, "step": 1380 }, { "epoch": 0.5275142314990512, "grad_norm": 1.7694716453552246, "learning_rate": 4.4677734375e-05, "loss": 0.0966, "step": 1390 }, { "epoch": 0.5313092979127134, "grad_norm": 2.8661625385284424, "learning_rate": 4.4628906250000004e-05, "loss": 0.1614, "step": 1400 }, { "epoch": 0.5351043643263758, "grad_norm": 2.2955727577209473, "learning_rate": 4.4580078125e-05, "loss": 0.193, "step": 1410 }, { "epoch": 0.538899430740038, "grad_norm": 1.4596924781799316, "learning_rate": 4.453125e-05, "loss": 0.1971, "step": 1420 }, { "epoch": 0.5426944971537002, "grad_norm": 1.039890170097351, "learning_rate": 4.4482421875e-05, "loss": 0.1909, "step": 1430 }, { "epoch": 0.5464895635673624, "grad_norm": 1.433979868888855, "learning_rate": 4.443359375e-05, "loss": 0.1832, "step": 1440 }, { "epoch": 0.5502846299810247, "grad_norm": 1.306391954421997, "learning_rate": 4.4384765625000004e-05, "loss": 0.1867, "step": 1450 }, { "epoch": 0.5540796963946869, "grad_norm": 1.2681069374084473, "learning_rate": 4.43359375e-05, "loss": 0.1506, "step": 1460 }, { "epoch": 0.5578747628083491, "grad_norm": 3.947502613067627, "learning_rate": 4.4287109375e-05, "loss": 0.1343, "step": 1470 }, { "epoch": 0.5616698292220114, "grad_norm": 4.928821563720703, "learning_rate": 4.423828125e-05, "loss": 0.2057, "step": 1480 }, { "epoch": 0.5654648956356736, "grad_norm": 2.162473201751709, "learning_rate": 4.4189453125000005e-05, "loss": 0.1942, "step": 1490 }, { "epoch": 0.5692599620493358, "grad_norm": 5.402246475219727, "learning_rate": 4.4140625000000004e-05, "loss": 0.1727, "step": 1500 }, { "epoch": 0.573055028462998, "grad_norm": 0.2728889286518097, "learning_rate": 4.4091796875e-05, "loss": 0.1345, "step": 1510 }, { "epoch": 0.5768500948766604, "grad_norm": 2.027841567993164, "learning_rate": 4.404296875e-05, "loss": 0.213, "step": 1520 }, { "epoch": 0.5806451612903226, "grad_norm": 1.3224737644195557, "learning_rate": 4.3994140625e-05, "loss": 0.1735, "step": 1530 }, { "epoch": 0.5844402277039848, "grad_norm": 2.3124992847442627, "learning_rate": 4.3945312500000005e-05, "loss": 0.2177, "step": 1540 }, { "epoch": 0.5882352941176471, "grad_norm": 1.2521787881851196, "learning_rate": 4.3896484375000004e-05, "loss": 0.1332, "step": 1550 }, { "epoch": 0.5920303605313093, "grad_norm": 2.5216283798217773, "learning_rate": 4.384765625e-05, "loss": 0.1318, "step": 1560 }, { "epoch": 0.5958254269449715, "grad_norm": 1.8268439769744873, "learning_rate": 4.3798828125e-05, "loss": 0.1269, "step": 1570 }, { "epoch": 0.5996204933586338, "grad_norm": 0.6268766522407532, "learning_rate": 4.375e-05, "loss": 0.1381, "step": 1580 }, { "epoch": 0.603415559772296, "grad_norm": 1.979546308517456, "learning_rate": 4.3701171875000005e-05, "loss": 0.1351, "step": 1590 }, { "epoch": 0.6072106261859582, "grad_norm": 1.5526436567306519, "learning_rate": 4.3652343750000004e-05, "loss": 0.2163, "step": 1600 }, { "epoch": 0.6110056925996205, "grad_norm": 0.9428083896636963, "learning_rate": 4.3603515625e-05, "loss": 0.1398, "step": 1610 }, { "epoch": 0.6148007590132827, "grad_norm": 2.1224870681762695, "learning_rate": 4.35546875e-05, "loss": 0.1891, "step": 1620 }, { "epoch": 0.618595825426945, "grad_norm": 0.3401525914669037, "learning_rate": 4.3505859375e-05, "loss": 0.1068, "step": 1630 }, { "epoch": 0.6223908918406073, "grad_norm": 1.1070092916488647, "learning_rate": 4.3457031250000005e-05, "loss": 0.1407, "step": 1640 }, { "epoch": 0.6261859582542695, "grad_norm": 1.1588579416275024, "learning_rate": 4.3408203125e-05, "loss": 0.2238, "step": 1650 }, { "epoch": 0.6299810246679317, "grad_norm": 1.3201090097427368, "learning_rate": 4.3359375e-05, "loss": 0.2135, "step": 1660 }, { "epoch": 0.6337760910815939, "grad_norm": 1.2257441282272339, "learning_rate": 4.3310546875e-05, "loss": 0.1261, "step": 1670 }, { "epoch": 0.6375711574952562, "grad_norm": 1.4213567972183228, "learning_rate": 4.326171875e-05, "loss": 0.1439, "step": 1680 }, { "epoch": 0.6413662239089184, "grad_norm": 1.0983916521072388, "learning_rate": 4.3212890625000004e-05, "loss": 0.1356, "step": 1690 }, { "epoch": 0.6451612903225806, "grad_norm": 1.6485854387283325, "learning_rate": 4.31640625e-05, "loss": 0.1549, "step": 1700 }, { "epoch": 0.6489563567362429, "grad_norm": 5.49334716796875, "learning_rate": 4.3115234375e-05, "loss": 0.1519, "step": 1710 }, { "epoch": 0.6527514231499051, "grad_norm": 0.26703280210494995, "learning_rate": 4.306640625e-05, "loss": 0.1499, "step": 1720 }, { "epoch": 0.6565464895635673, "grad_norm": 1.5822151899337769, "learning_rate": 4.3017578125e-05, "loss": 0.1733, "step": 1730 }, { "epoch": 0.6603415559772297, "grad_norm": 1.1510590314865112, "learning_rate": 4.2968750000000004e-05, "loss": 0.1665, "step": 1740 }, { "epoch": 0.6641366223908919, "grad_norm": 2.48427152633667, "learning_rate": 4.2919921875e-05, "loss": 0.1598, "step": 1750 }, { "epoch": 0.6679316888045541, "grad_norm": 2.0076019763946533, "learning_rate": 4.287109375e-05, "loss": 0.1642, "step": 1760 }, { "epoch": 0.6717267552182163, "grad_norm": 2.1611413955688477, "learning_rate": 4.2822265625e-05, "loss": 0.1538, "step": 1770 }, { "epoch": 0.6755218216318786, "grad_norm": 2.476008415222168, "learning_rate": 4.27734375e-05, "loss": 0.1193, "step": 1780 }, { "epoch": 0.6793168880455408, "grad_norm": 2.426025867462158, "learning_rate": 4.2724609375000004e-05, "loss": 0.161, "step": 1790 }, { "epoch": 0.683111954459203, "grad_norm": 2.2168385982513428, "learning_rate": 4.267578125e-05, "loss": 0.1429, "step": 1800 }, { "epoch": 0.6869070208728653, "grad_norm": 1.63054358959198, "learning_rate": 4.2626953125e-05, "loss": 0.1561, "step": 1810 }, { "epoch": 0.6907020872865275, "grad_norm": 5.170077323913574, "learning_rate": 4.2578125e-05, "loss": 0.1685, "step": 1820 }, { "epoch": 0.6944971537001897, "grad_norm": 2.700263023376465, "learning_rate": 4.2529296875000005e-05, "loss": 0.1601, "step": 1830 }, { "epoch": 0.698292220113852, "grad_norm": 1.6965094804763794, "learning_rate": 4.2480468750000004e-05, "loss": 0.1046, "step": 1840 }, { "epoch": 0.7020872865275142, "grad_norm": 5.461817264556885, "learning_rate": 4.2431640625e-05, "loss": 0.1421, "step": 1850 }, { "epoch": 0.7058823529411765, "grad_norm": 1.584050178527832, "learning_rate": 4.23828125e-05, "loss": 0.1781, "step": 1860 }, { "epoch": 0.7096774193548387, "grad_norm": 2.42586088180542, "learning_rate": 4.2333984375e-05, "loss": 0.1274, "step": 1870 }, { "epoch": 0.713472485768501, "grad_norm": 3.151433229446411, "learning_rate": 4.2285156250000005e-05, "loss": 0.1825, "step": 1880 }, { "epoch": 0.7172675521821632, "grad_norm": 1.1808427572250366, "learning_rate": 4.2236328125000004e-05, "loss": 0.2085, "step": 1890 }, { "epoch": 0.7210626185958254, "grad_norm": 1.981814980506897, "learning_rate": 4.21875e-05, "loss": 0.1718, "step": 1900 }, { "epoch": 0.7248576850094877, "grad_norm": 0.9719598293304443, "learning_rate": 4.2138671875e-05, "loss": 0.1461, "step": 1910 }, { "epoch": 0.7286527514231499, "grad_norm": 1.493422031402588, "learning_rate": 4.208984375e-05, "loss": 0.1902, "step": 1920 }, { "epoch": 0.7324478178368121, "grad_norm": 1.4552210569381714, "learning_rate": 4.2041015625000005e-05, "loss": 0.1253, "step": 1930 }, { "epoch": 0.7362428842504743, "grad_norm": 2.0822556018829346, "learning_rate": 4.1992187500000003e-05, "loss": 0.144, "step": 1940 }, { "epoch": 0.7400379506641366, "grad_norm": 2.461090326309204, "learning_rate": 4.1943359375e-05, "loss": 0.2084, "step": 1950 }, { "epoch": 0.7438330170777988, "grad_norm": 1.8043471574783325, "learning_rate": 4.189453125e-05, "loss": 0.1904, "step": 1960 }, { "epoch": 0.7476280834914611, "grad_norm": 1.6388760805130005, "learning_rate": 4.1845703125e-05, "loss": 0.2071, "step": 1970 }, { "epoch": 0.7514231499051234, "grad_norm": 2.5029492378234863, "learning_rate": 4.1796875000000005e-05, "loss": 0.1881, "step": 1980 }, { "epoch": 0.7552182163187856, "grad_norm": 1.3092814683914185, "learning_rate": 4.1748046875e-05, "loss": 0.1356, "step": 1990 }, { "epoch": 0.7590132827324478, "grad_norm": 1.2208425998687744, "learning_rate": 4.169921875e-05, "loss": 0.1378, "step": 2000 }, { "epoch": 0.7628083491461101, "grad_norm": 3.214336633682251, "learning_rate": 4.1650390625e-05, "loss": 0.1954, "step": 2010 }, { "epoch": 0.7666034155597723, "grad_norm": 4.104292392730713, "learning_rate": 4.16015625e-05, "loss": 0.1886, "step": 2020 }, { "epoch": 0.7703984819734345, "grad_norm": 2.170186996459961, "learning_rate": 4.1552734375000004e-05, "loss": 0.1705, "step": 2030 }, { "epoch": 0.7741935483870968, "grad_norm": 2.6494083404541016, "learning_rate": 4.150390625e-05, "loss": 0.1986, "step": 2040 }, { "epoch": 0.777988614800759, "grad_norm": 0.7542719841003418, "learning_rate": 4.1455078125e-05, "loss": 0.1255, "step": 2050 }, { "epoch": 0.7817836812144212, "grad_norm": 3.126569986343384, "learning_rate": 4.140625e-05, "loss": 0.1576, "step": 2060 }, { "epoch": 0.7855787476280834, "grad_norm": 1.0665310621261597, "learning_rate": 4.1357421875e-05, "loss": 0.174, "step": 2070 }, { "epoch": 0.7893738140417458, "grad_norm": 1.3480401039123535, "learning_rate": 4.1308593750000004e-05, "loss": 0.1203, "step": 2080 }, { "epoch": 0.793168880455408, "grad_norm": 2.358405113220215, "learning_rate": 4.1259765625e-05, "loss": 0.1394, "step": 2090 }, { "epoch": 0.7969639468690702, "grad_norm": 3.2337498664855957, "learning_rate": 4.12109375e-05, "loss": 0.1711, "step": 2100 }, { "epoch": 0.8007590132827325, "grad_norm": 2.7708380222320557, "learning_rate": 4.1162109375e-05, "loss": 0.1265, "step": 2110 }, { "epoch": 0.8045540796963947, "grad_norm": 3.3023488521575928, "learning_rate": 4.1113281250000005e-05, "loss": 0.1706, "step": 2120 }, { "epoch": 0.8083491461100569, "grad_norm": 1.758325219154358, "learning_rate": 4.1064453125000004e-05, "loss": 0.1371, "step": 2130 }, { "epoch": 0.8121442125237192, "grad_norm": 1.5623672008514404, "learning_rate": 4.1015625e-05, "loss": 0.1756, "step": 2140 }, { "epoch": 0.8159392789373814, "grad_norm": 1.3145450353622437, "learning_rate": 4.0966796875e-05, "loss": 0.1328, "step": 2150 }, { "epoch": 0.8197343453510436, "grad_norm": 2.432619094848633, "learning_rate": 4.091796875e-05, "loss": 0.1286, "step": 2160 }, { "epoch": 0.8235294117647058, "grad_norm": 0.4147840142250061, "learning_rate": 4.0869140625000005e-05, "loss": 0.1509, "step": 2170 }, { "epoch": 0.8273244781783681, "grad_norm": 1.6098836660385132, "learning_rate": 4.0820312500000004e-05, "loss": 0.1746, "step": 2180 }, { "epoch": 0.8311195445920304, "grad_norm": 2.5355212688446045, "learning_rate": 4.0771484375e-05, "loss": 0.1238, "step": 2190 }, { "epoch": 0.8349146110056926, "grad_norm": 1.5544086694717407, "learning_rate": 4.072265625e-05, "loss": 0.2168, "step": 2200 }, { "epoch": 0.8387096774193549, "grad_norm": 2.1792962551116943, "learning_rate": 4.0673828125e-05, "loss": 0.1338, "step": 2210 }, { "epoch": 0.8425047438330171, "grad_norm": 2.667340040206909, "learning_rate": 4.0625000000000005e-05, "loss": 0.1505, "step": 2220 }, { "epoch": 0.8462998102466793, "grad_norm": 0.8551260232925415, "learning_rate": 4.0576171875000004e-05, "loss": 0.1081, "step": 2230 }, { "epoch": 0.8500948766603416, "grad_norm": 2.8773763179779053, "learning_rate": 4.052734375e-05, "loss": 0.1089, "step": 2240 }, { "epoch": 0.8538899430740038, "grad_norm": 2.12497878074646, "learning_rate": 4.0478515625e-05, "loss": 0.1268, "step": 2250 }, { "epoch": 0.857685009487666, "grad_norm": 1.8039929866790771, "learning_rate": 4.04296875e-05, "loss": 0.1544, "step": 2260 }, { "epoch": 0.8614800759013282, "grad_norm": 0.4839627742767334, "learning_rate": 4.0380859375000005e-05, "loss": 0.1421, "step": 2270 }, { "epoch": 0.8652751423149905, "grad_norm": 3.672240734100342, "learning_rate": 4.033203125e-05, "loss": 0.134, "step": 2280 }, { "epoch": 0.8690702087286527, "grad_norm": 2.4371728897094727, "learning_rate": 4.0283203125e-05, "loss": 0.1419, "step": 2290 }, { "epoch": 0.872865275142315, "grad_norm": 1.8469904661178589, "learning_rate": 4.0234375e-05, "loss": 0.1846, "step": 2300 }, { "epoch": 0.8766603415559773, "grad_norm": 0.7639700174331665, "learning_rate": 4.0185546875e-05, "loss": 0.106, "step": 2310 }, { "epoch": 0.8804554079696395, "grad_norm": 1.4450427293777466, "learning_rate": 4.0136718750000004e-05, "loss": 0.1408, "step": 2320 }, { "epoch": 0.8842504743833017, "grad_norm": 1.3033993244171143, "learning_rate": 4.0087890625e-05, "loss": 0.1456, "step": 2330 }, { "epoch": 0.888045540796964, "grad_norm": 1.3045791387557983, "learning_rate": 4.00390625e-05, "loss": 0.1531, "step": 2340 }, { "epoch": 0.8918406072106262, "grad_norm": 3.4357423782348633, "learning_rate": 3.9990234375e-05, "loss": 0.1417, "step": 2350 }, { "epoch": 0.8956356736242884, "grad_norm": 3.5311038494110107, "learning_rate": 3.994140625e-05, "loss": 0.1317, "step": 2360 }, { "epoch": 0.8994307400379506, "grad_norm": 4.028538227081299, "learning_rate": 3.9892578125000004e-05, "loss": 0.1644, "step": 2370 }, { "epoch": 0.9032258064516129, "grad_norm": 1.4089256525039673, "learning_rate": 3.984375e-05, "loss": 0.1087, "step": 2380 }, { "epoch": 0.9070208728652751, "grad_norm": 0.2230881005525589, "learning_rate": 3.9794921875e-05, "loss": 0.1387, "step": 2390 }, { "epoch": 0.9108159392789373, "grad_norm": 2.5647592544555664, "learning_rate": 3.974609375e-05, "loss": 0.1475, "step": 2400 }, { "epoch": 0.9146110056925996, "grad_norm": 1.2803542613983154, "learning_rate": 3.9697265625e-05, "loss": 0.126, "step": 2410 }, { "epoch": 0.9184060721062619, "grad_norm": 3.2023112773895264, "learning_rate": 3.9648437500000004e-05, "loss": 0.1458, "step": 2420 }, { "epoch": 0.9222011385199241, "grad_norm": 3.615530252456665, "learning_rate": 3.9599609375e-05, "loss": 0.1297, "step": 2430 }, { "epoch": 0.9259962049335864, "grad_norm": 3.396568536758423, "learning_rate": 3.955078125e-05, "loss": 0.1486, "step": 2440 }, { "epoch": 0.9297912713472486, "grad_norm": 1.7030583620071411, "learning_rate": 3.9501953125e-05, "loss": 0.1464, "step": 2450 }, { "epoch": 0.9335863377609108, "grad_norm": 1.0317497253417969, "learning_rate": 3.9453125000000005e-05, "loss": 0.1658, "step": 2460 }, { "epoch": 0.937381404174573, "grad_norm": 1.1268532276153564, "learning_rate": 3.9404296875000004e-05, "loss": 0.1425, "step": 2470 }, { "epoch": 0.9411764705882353, "grad_norm": 0.9238561391830444, "learning_rate": 3.935546875e-05, "loss": 0.1565, "step": 2480 }, { "epoch": 0.9449715370018975, "grad_norm": 1.4960806369781494, "learning_rate": 3.9306640625e-05, "loss": 0.1681, "step": 2490 }, { "epoch": 0.9487666034155597, "grad_norm": 1.306814193725586, "learning_rate": 3.92578125e-05, "loss": 0.1719, "step": 2500 }, { "epoch": 0.952561669829222, "grad_norm": 0.391342431306839, "learning_rate": 3.9208984375000005e-05, "loss": 0.1497, "step": 2510 }, { "epoch": 0.9563567362428842, "grad_norm": 1.9634449481964111, "learning_rate": 3.9160156250000004e-05, "loss": 0.124, "step": 2520 }, { "epoch": 0.9601518026565465, "grad_norm": 2.7319021224975586, "learning_rate": 3.9111328125e-05, "loss": 0.1029, "step": 2530 }, { "epoch": 0.9639468690702088, "grad_norm": 1.062157392501831, "learning_rate": 3.90625e-05, "loss": 0.1612, "step": 2540 }, { "epoch": 0.967741935483871, "grad_norm": 2.737459182739258, "learning_rate": 3.9013671875e-05, "loss": 0.1817, "step": 2550 }, { "epoch": 0.9715370018975332, "grad_norm": 1.4106887578964233, "learning_rate": 3.8964843750000005e-05, "loss": 0.1875, "step": 2560 }, { "epoch": 0.9753320683111955, "grad_norm": 7.118113040924072, "learning_rate": 3.8916015625000003e-05, "loss": 0.2243, "step": 2570 }, { "epoch": 0.9791271347248577, "grad_norm": 2.956235647201538, "learning_rate": 3.88671875e-05, "loss": 0.1059, "step": 2580 }, { "epoch": 0.9829222011385199, "grad_norm": 1.2888784408569336, "learning_rate": 3.8818359375e-05, "loss": 0.1546, "step": 2590 }, { "epoch": 0.9867172675521821, "grad_norm": 2.5757930278778076, "learning_rate": 3.876953125e-05, "loss": 0.115, "step": 2600 }, { "epoch": 0.9905123339658444, "grad_norm": 0.7105236053466797, "learning_rate": 3.8720703125000005e-05, "loss": 0.1218, "step": 2610 }, { "epoch": 0.9943074003795066, "grad_norm": 2.5876383781433105, "learning_rate": 3.8671875e-05, "loss": 0.1487, "step": 2620 }, { "epoch": 0.9981024667931688, "grad_norm": 0.2208087146282196, "learning_rate": 3.8623046875e-05, "loss": 0.1429, "step": 2630 }, { "epoch": 1.0018975332068312, "grad_norm": 0.6170036196708679, "learning_rate": 3.857421875e-05, "loss": 0.128, "step": 2640 }, { "epoch": 1.0056925996204933, "grad_norm": 1.1868369579315186, "learning_rate": 3.8525390625e-05, "loss": 0.0923, "step": 2650 }, { "epoch": 1.0094876660341556, "grad_norm": 3.0359079837799072, "learning_rate": 3.8476562500000004e-05, "loss": 0.1104, "step": 2660 }, { "epoch": 1.0132827324478177, "grad_norm": 0.6559151411056519, "learning_rate": 3.8427734375e-05, "loss": 0.1089, "step": 2670 }, { "epoch": 1.01707779886148, "grad_norm": 10.784985542297363, "learning_rate": 3.837890625e-05, "loss": 0.1408, "step": 2680 }, { "epoch": 1.0208728652751424, "grad_norm": 1.7095699310302734, "learning_rate": 3.8330078125e-05, "loss": 0.1358, "step": 2690 }, { "epoch": 1.0246679316888045, "grad_norm": 1.3584043979644775, "learning_rate": 3.828125e-05, "loss": 0.1248, "step": 2700 }, { "epoch": 1.0284629981024669, "grad_norm": 5.567887783050537, "learning_rate": 3.8232421875000004e-05, "loss": 0.0992, "step": 2710 }, { "epoch": 1.032258064516129, "grad_norm": 1.6698075532913208, "learning_rate": 3.818359375e-05, "loss": 0.1503, "step": 2720 }, { "epoch": 1.0360531309297913, "grad_norm": 0.29519161581993103, "learning_rate": 3.8134765625e-05, "loss": 0.1247, "step": 2730 }, { "epoch": 1.0398481973434535, "grad_norm": 2.3616697788238525, "learning_rate": 3.80859375e-05, "loss": 0.1459, "step": 2740 }, { "epoch": 1.0436432637571158, "grad_norm": 1.219618320465088, "learning_rate": 3.8037109375e-05, "loss": 0.1036, "step": 2750 }, { "epoch": 1.047438330170778, "grad_norm": 1.3592404127120972, "learning_rate": 3.7988281250000004e-05, "loss": 0.1399, "step": 2760 }, { "epoch": 1.0512333965844403, "grad_norm": 1.2837351560592651, "learning_rate": 3.7939453125e-05, "loss": 0.1581, "step": 2770 }, { "epoch": 1.0550284629981024, "grad_norm": 1.3627588748931885, "learning_rate": 3.7890625e-05, "loss": 0.1093, "step": 2780 }, { "epoch": 1.0588235294117647, "grad_norm": 4.571230888366699, "learning_rate": 3.7841796875e-05, "loss": 0.1693, "step": 2790 }, { "epoch": 1.0626185958254268, "grad_norm": 1.575040578842163, "learning_rate": 3.7792968750000005e-05, "loss": 0.1646, "step": 2800 }, { "epoch": 1.0664136622390892, "grad_norm": 2.594174861907959, "learning_rate": 3.7744140625000004e-05, "loss": 0.0976, "step": 2810 }, { "epoch": 1.0702087286527515, "grad_norm": 4.076402187347412, "learning_rate": 3.76953125e-05, "loss": 0.1301, "step": 2820 }, { "epoch": 1.0740037950664136, "grad_norm": 2.7510082721710205, "learning_rate": 3.7646484375e-05, "loss": 0.1337, "step": 2830 }, { "epoch": 1.077798861480076, "grad_norm": 0.8219005465507507, "learning_rate": 3.759765625e-05, "loss": 0.1122, "step": 2840 }, { "epoch": 1.081593927893738, "grad_norm": 1.9153568744659424, "learning_rate": 3.7548828125000005e-05, "loss": 0.1428, "step": 2850 }, { "epoch": 1.0853889943074004, "grad_norm": 2.93013858795166, "learning_rate": 3.7500000000000003e-05, "loss": 0.1872, "step": 2860 }, { "epoch": 1.0891840607210626, "grad_norm": 0.7126034498214722, "learning_rate": 3.7451171875e-05, "loss": 0.1106, "step": 2870 }, { "epoch": 1.092979127134725, "grad_norm": 1.8968008756637573, "learning_rate": 3.740234375e-05, "loss": 0.1131, "step": 2880 }, { "epoch": 1.096774193548387, "grad_norm": 5.133113861083984, "learning_rate": 3.7353515625e-05, "loss": 0.0884, "step": 2890 }, { "epoch": 1.1005692599620494, "grad_norm": 3.756060838699341, "learning_rate": 3.7304687500000005e-05, "loss": 0.1373, "step": 2900 }, { "epoch": 1.1043643263757117, "grad_norm": 7.563070297241211, "learning_rate": 3.7255859375e-05, "loss": 0.1353, "step": 2910 }, { "epoch": 1.1081593927893738, "grad_norm": 4.473198413848877, "learning_rate": 3.720703125e-05, "loss": 0.1639, "step": 2920 }, { "epoch": 1.1119544592030361, "grad_norm": 2.689405679702759, "learning_rate": 3.7158203125e-05, "loss": 0.1117, "step": 2930 }, { "epoch": 1.1157495256166983, "grad_norm": 0.2793045938014984, "learning_rate": 3.7109375e-05, "loss": 0.1073, "step": 2940 }, { "epoch": 1.1195445920303606, "grad_norm": 1.4892089366912842, "learning_rate": 3.7060546875000004e-05, "loss": 0.1541, "step": 2950 }, { "epoch": 1.1233396584440227, "grad_norm": 1.1303538084030151, "learning_rate": 3.701171875e-05, "loss": 0.0961, "step": 2960 }, { "epoch": 1.127134724857685, "grad_norm": 0.6085264682769775, "learning_rate": 3.6962890625e-05, "loss": 0.111, "step": 2970 }, { "epoch": 1.1309297912713472, "grad_norm": 0.44500744342803955, "learning_rate": 3.69140625e-05, "loss": 0.0939, "step": 2980 }, { "epoch": 1.1347248576850095, "grad_norm": 1.8215651512145996, "learning_rate": 3.6865234375e-05, "loss": 0.1112, "step": 2990 }, { "epoch": 1.1385199240986716, "grad_norm": 0.7494792938232422, "learning_rate": 3.6816406250000004e-05, "loss": 0.1407, "step": 3000 }, { "epoch": 1.142314990512334, "grad_norm": 1.2958310842514038, "learning_rate": 3.6767578125e-05, "loss": 0.086, "step": 3010 }, { "epoch": 1.146110056925996, "grad_norm": 1.223376989364624, "learning_rate": 3.671875e-05, "loss": 0.1152, "step": 3020 }, { "epoch": 1.1499051233396584, "grad_norm": 5.232940196990967, "learning_rate": 3.6669921875e-05, "loss": 0.1308, "step": 3030 }, { "epoch": 1.1537001897533208, "grad_norm": 1.4690934419631958, "learning_rate": 3.662109375e-05, "loss": 0.1275, "step": 3040 }, { "epoch": 1.157495256166983, "grad_norm": 0.8882303833961487, "learning_rate": 3.6572265625000004e-05, "loss": 0.0709, "step": 3050 }, { "epoch": 1.1612903225806452, "grad_norm": 7.125335216522217, "learning_rate": 3.65234375e-05, "loss": 0.0991, "step": 3060 }, { "epoch": 1.1650853889943074, "grad_norm": 2.321225881576538, "learning_rate": 3.6474609375e-05, "loss": 0.1986, "step": 3070 }, { "epoch": 1.1688804554079697, "grad_norm": 2.8146891593933105, "learning_rate": 3.642578125e-05, "loss": 0.1497, "step": 3080 }, { "epoch": 1.1726755218216318, "grad_norm": 2.781428575515747, "learning_rate": 3.6376953125e-05, "loss": 0.1075, "step": 3090 }, { "epoch": 1.1764705882352942, "grad_norm": 7.027383327484131, "learning_rate": 3.6328125000000004e-05, "loss": 0.0921, "step": 3100 }, { "epoch": 1.1802656546489563, "grad_norm": 2.3189167976379395, "learning_rate": 3.6279296875e-05, "loss": 0.0784, "step": 3110 }, { "epoch": 1.1840607210626186, "grad_norm": 3.060039758682251, "learning_rate": 3.623046875e-05, "loss": 0.1262, "step": 3120 }, { "epoch": 1.187855787476281, "grad_norm": 6.099356174468994, "learning_rate": 3.6181640625e-05, "loss": 0.1506, "step": 3130 }, { "epoch": 1.191650853889943, "grad_norm": 3.1299543380737305, "learning_rate": 3.6132812500000005e-05, "loss": 0.1431, "step": 3140 }, { "epoch": 1.1954459203036052, "grad_norm": 1.5676418542861938, "learning_rate": 3.6083984375000004e-05, "loss": 0.1018, "step": 3150 }, { "epoch": 1.1992409867172675, "grad_norm": 0.786465585231781, "learning_rate": 3.603515625e-05, "loss": 0.1471, "step": 3160 }, { "epoch": 1.2030360531309299, "grad_norm": 0.6863810420036316, "learning_rate": 3.5986328125e-05, "loss": 0.1144, "step": 3170 }, { "epoch": 1.206831119544592, "grad_norm": 6.13245964050293, "learning_rate": 3.59375e-05, "loss": 0.1378, "step": 3180 }, { "epoch": 1.2106261859582543, "grad_norm": 0.9144377112388611, "learning_rate": 3.5888671875000005e-05, "loss": 0.1024, "step": 3190 }, { "epoch": 1.2144212523719164, "grad_norm": 13.092443466186523, "learning_rate": 3.583984375e-05, "loss": 0.1241, "step": 3200 }, { "epoch": 1.2182163187855788, "grad_norm": 5.453747272491455, "learning_rate": 3.5791015625e-05, "loss": 0.1307, "step": 3210 }, { "epoch": 1.222011385199241, "grad_norm": 5.696516036987305, "learning_rate": 3.57421875e-05, "loss": 0.1661, "step": 3220 }, { "epoch": 1.2258064516129032, "grad_norm": 1.4154207706451416, "learning_rate": 3.5693359375e-05, "loss": 0.1017, "step": 3230 }, { "epoch": 1.2296015180265654, "grad_norm": 3.1260204315185547, "learning_rate": 3.5644531250000005e-05, "loss": 0.1224, "step": 3240 }, { "epoch": 1.2333965844402277, "grad_norm": 1.4753592014312744, "learning_rate": 3.5595703125e-05, "loss": 0.1, "step": 3250 }, { "epoch": 1.23719165085389, "grad_norm": 2.7512917518615723, "learning_rate": 3.5546875e-05, "loss": 0.152, "step": 3260 }, { "epoch": 1.2409867172675522, "grad_norm": 0.1835506409406662, "learning_rate": 3.5498046875e-05, "loss": 0.0897, "step": 3270 }, { "epoch": 1.2447817836812145, "grad_norm": 2.484245777130127, "learning_rate": 3.544921875e-05, "loss": 0.1284, "step": 3280 }, { "epoch": 1.2485768500948766, "grad_norm": 2.778939962387085, "learning_rate": 3.5400390625000004e-05, "loss": 0.1225, "step": 3290 }, { "epoch": 1.252371916508539, "grad_norm": 4.067395210266113, "learning_rate": 3.53515625e-05, "loss": 0.1687, "step": 3300 }, { "epoch": 1.256166982922201, "grad_norm": 0.2922412157058716, "learning_rate": 3.5302734375e-05, "loss": 0.066, "step": 3310 }, { "epoch": 1.2599620493358634, "grad_norm": 2.992678165435791, "learning_rate": 3.525390625e-05, "loss": 0.1016, "step": 3320 }, { "epoch": 1.2637571157495255, "grad_norm": 0.5019288063049316, "learning_rate": 3.5205078125e-05, "loss": 0.0877, "step": 3330 }, { "epoch": 1.2675521821631879, "grad_norm": 5.55689811706543, "learning_rate": 3.5156250000000004e-05, "loss": 0.1191, "step": 3340 }, { "epoch": 1.2713472485768502, "grad_norm": 3.2791213989257812, "learning_rate": 3.5107421875e-05, "loss": 0.1086, "step": 3350 }, { "epoch": 1.2751423149905123, "grad_norm": 7.413064956665039, "learning_rate": 3.505859375e-05, "loss": 0.1063, "step": 3360 }, { "epoch": 1.2789373814041745, "grad_norm": 4.541271686553955, "learning_rate": 3.5009765625e-05, "loss": 0.0959, "step": 3370 }, { "epoch": 1.2827324478178368, "grad_norm": 2.8879811763763428, "learning_rate": 3.49609375e-05, "loss": 0.1178, "step": 3380 }, { "epoch": 1.2865275142314991, "grad_norm": 3.210865020751953, "learning_rate": 3.4912109375000004e-05, "loss": 0.1464, "step": 3390 }, { "epoch": 1.2903225806451613, "grad_norm": 0.654231071472168, "learning_rate": 3.486328125e-05, "loss": 0.1404, "step": 3400 }, { "epoch": 1.2941176470588236, "grad_norm": 2.9404890537261963, "learning_rate": 3.4814453125e-05, "loss": 0.1213, "step": 3410 }, { "epoch": 1.2979127134724857, "grad_norm": 2.2991085052490234, "learning_rate": 3.4765625e-05, "loss": 0.1131, "step": 3420 }, { "epoch": 1.301707779886148, "grad_norm": 0.30925440788269043, "learning_rate": 3.4716796875e-05, "loss": 0.1166, "step": 3430 }, { "epoch": 1.3055028462998102, "grad_norm": 1.3804266452789307, "learning_rate": 3.4667968750000004e-05, "loss": 0.0634, "step": 3440 }, { "epoch": 1.3092979127134725, "grad_norm": 3.1803112030029297, "learning_rate": 3.4619140625e-05, "loss": 0.1916, "step": 3450 }, { "epoch": 1.3130929791271346, "grad_norm": 2.8847222328186035, "learning_rate": 3.45703125e-05, "loss": 0.1856, "step": 3460 }, { "epoch": 1.316888045540797, "grad_norm": 7.0924973487854, "learning_rate": 3.4521484375e-05, "loss": 0.1292, "step": 3470 }, { "epoch": 1.3206831119544593, "grad_norm": 4.695943355560303, "learning_rate": 3.4472656250000005e-05, "loss": 0.1518, "step": 3480 }, { "epoch": 1.3244781783681214, "grad_norm": 4.995908260345459, "learning_rate": 3.4423828125000003e-05, "loss": 0.12, "step": 3490 }, { "epoch": 1.3282732447817835, "grad_norm": 4.585287570953369, "learning_rate": 3.4375e-05, "loss": 0.0933, "step": 3500 }, { "epoch": 1.3320683111954459, "grad_norm": 1.5841524600982666, "learning_rate": 3.4326171875e-05, "loss": 0.1172, "step": 3510 }, { "epoch": 1.3358633776091082, "grad_norm": 3.6837852001190186, "learning_rate": 3.427734375e-05, "loss": 0.1164, "step": 3520 }, { "epoch": 1.3396584440227703, "grad_norm": 2.470222234725952, "learning_rate": 3.4228515625000005e-05, "loss": 0.1258, "step": 3530 }, { "epoch": 1.3434535104364327, "grad_norm": 1.8782237768173218, "learning_rate": 3.41796875e-05, "loss": 0.1078, "step": 3540 }, { "epoch": 1.3472485768500948, "grad_norm": 0.29535171389579773, "learning_rate": 3.4130859375e-05, "loss": 0.1658, "step": 3550 }, { "epoch": 1.3510436432637571, "grad_norm": 3.8535208702087402, "learning_rate": 3.408203125e-05, "loss": 0.1632, "step": 3560 }, { "epoch": 1.3548387096774195, "grad_norm": 2.0340235233306885, "learning_rate": 3.4033203125e-05, "loss": 0.1498, "step": 3570 }, { "epoch": 1.3586337760910816, "grad_norm": 3.015774726867676, "learning_rate": 3.3984375000000004e-05, "loss": 0.1099, "step": 3580 }, { "epoch": 1.3624288425047437, "grad_norm": 5.396883487701416, "learning_rate": 3.3935546875e-05, "loss": 0.1308, "step": 3590 }, { "epoch": 1.366223908918406, "grad_norm": 4.15665864944458, "learning_rate": 3.388671875e-05, "loss": 0.0893, "step": 3600 }, { "epoch": 1.3700189753320684, "grad_norm": 2.0461652278900146, "learning_rate": 3.3837890625e-05, "loss": 0.1157, "step": 3610 }, { "epoch": 1.3738140417457305, "grad_norm": 1.5953052043914795, "learning_rate": 3.37890625e-05, "loss": 0.1611, "step": 3620 }, { "epoch": 1.3776091081593929, "grad_norm": 3.8149826526641846, "learning_rate": 3.3740234375000004e-05, "loss": 0.1582, "step": 3630 }, { "epoch": 1.381404174573055, "grad_norm": 5.658437252044678, "learning_rate": 3.369140625e-05, "loss": 0.1481, "step": 3640 }, { "epoch": 1.3851992409867173, "grad_norm": 0.47566506266593933, "learning_rate": 3.3642578125e-05, "loss": 0.1336, "step": 3650 }, { "epoch": 1.3889943074003794, "grad_norm": 2.9851224422454834, "learning_rate": 3.359375e-05, "loss": 0.1274, "step": 3660 }, { "epoch": 1.3927893738140418, "grad_norm": 2.3793752193450928, "learning_rate": 3.3544921875e-05, "loss": 0.1189, "step": 3670 }, { "epoch": 1.396584440227704, "grad_norm": 0.35333120822906494, "learning_rate": 3.3496093750000004e-05, "loss": 0.1021, "step": 3680 }, { "epoch": 1.4003795066413662, "grad_norm": 2.170039653778076, "learning_rate": 3.3447265625e-05, "loss": 0.1016, "step": 3690 }, { "epoch": 1.4041745730550286, "grad_norm": 3.225989818572998, "learning_rate": 3.33984375e-05, "loss": 0.1559, "step": 3700 }, { "epoch": 1.4079696394686907, "grad_norm": 5.81306266784668, "learning_rate": 3.3349609375e-05, "loss": 0.1378, "step": 3710 }, { "epoch": 1.4117647058823528, "grad_norm": 0.839579701423645, "learning_rate": 3.330078125e-05, "loss": 0.0981, "step": 3720 }, { "epoch": 1.4155597722960152, "grad_norm": 2.421964645385742, "learning_rate": 3.3251953125000004e-05, "loss": 0.1267, "step": 3730 }, { "epoch": 1.4193548387096775, "grad_norm": 0.298155814409256, "learning_rate": 3.3203125e-05, "loss": 0.1619, "step": 3740 }, { "epoch": 1.4231499051233396, "grad_norm": 5.643527030944824, "learning_rate": 3.3154296875e-05, "loss": 0.0844, "step": 3750 }, { "epoch": 1.426944971537002, "grad_norm": 1.7513082027435303, "learning_rate": 3.310546875e-05, "loss": 0.133, "step": 3760 }, { "epoch": 1.430740037950664, "grad_norm": 1.2837634086608887, "learning_rate": 3.3056640625000005e-05, "loss": 0.1241, "step": 3770 }, { "epoch": 1.4345351043643264, "grad_norm": 0.7017351984977722, "learning_rate": 3.3007812500000004e-05, "loss": 0.1123, "step": 3780 }, { "epoch": 1.4383301707779887, "grad_norm": 6.043475151062012, "learning_rate": 3.2958984375e-05, "loss": 0.1249, "step": 3790 }, { "epoch": 1.4421252371916509, "grad_norm": 4.449422359466553, "learning_rate": 3.291015625e-05, "loss": 0.173, "step": 3800 }, { "epoch": 1.445920303605313, "grad_norm": 1.7111449241638184, "learning_rate": 3.2861328125e-05, "loss": 0.1473, "step": 3810 }, { "epoch": 1.4497153700189753, "grad_norm": 1.3379569053649902, "learning_rate": 3.2812500000000005e-05, "loss": 0.1119, "step": 3820 }, { "epoch": 1.4535104364326377, "grad_norm": 7.154158115386963, "learning_rate": 3.2763671875e-05, "loss": 0.1273, "step": 3830 }, { "epoch": 1.4573055028462998, "grad_norm": 1.2248731851577759, "learning_rate": 3.271484375e-05, "loss": 0.1081, "step": 3840 }, { "epoch": 1.4611005692599621, "grad_norm": 1.219230055809021, "learning_rate": 3.2666015625e-05, "loss": 0.0945, "step": 3850 }, { "epoch": 1.4648956356736242, "grad_norm": 4.3124189376831055, "learning_rate": 3.26171875e-05, "loss": 0.1039, "step": 3860 }, { "epoch": 1.4686907020872866, "grad_norm": 2.915302038192749, "learning_rate": 3.2568359375000005e-05, "loss": 0.1236, "step": 3870 }, { "epoch": 1.4724857685009487, "grad_norm": 0.3403218984603882, "learning_rate": 3.251953125e-05, "loss": 0.146, "step": 3880 }, { "epoch": 1.476280834914611, "grad_norm": 1.74779212474823, "learning_rate": 3.2470703125e-05, "loss": 0.1096, "step": 3890 }, { "epoch": 1.4800759013282732, "grad_norm": 2.724412202835083, "learning_rate": 3.2421875e-05, "loss": 0.1147, "step": 3900 }, { "epoch": 1.4838709677419355, "grad_norm": 3.6029605865478516, "learning_rate": 3.2373046875e-05, "loss": 0.1293, "step": 3910 }, { "epoch": 1.4876660341555978, "grad_norm": 1.7680699825286865, "learning_rate": 3.2324218750000004e-05, "loss": 0.0891, "step": 3920 }, { "epoch": 1.49146110056926, "grad_norm": 0.7916316390037537, "learning_rate": 3.2275390625e-05, "loss": 0.1223, "step": 3930 }, { "epoch": 1.495256166982922, "grad_norm": 0.9054811596870422, "learning_rate": 3.22265625e-05, "loss": 0.0934, "step": 3940 }, { "epoch": 1.4990512333965844, "grad_norm": 0.14054611325263977, "learning_rate": 3.2177734375e-05, "loss": 0.0494, "step": 3950 }, { "epoch": 1.5028462998102468, "grad_norm": 3.1943421363830566, "learning_rate": 3.212890625e-05, "loss": 0.1156, "step": 3960 }, { "epoch": 1.5066413662239089, "grad_norm": 1.0965791940689087, "learning_rate": 3.2080078125000004e-05, "loss": 0.1016, "step": 3970 }, { "epoch": 1.510436432637571, "grad_norm": 1.3087248802185059, "learning_rate": 3.203125e-05, "loss": 0.0764, "step": 3980 }, { "epoch": 1.5142314990512333, "grad_norm": 2.760798692703247, "learning_rate": 3.1982421875e-05, "loss": 0.114, "step": 3990 }, { "epoch": 1.5180265654648957, "grad_norm": 0.1450069397687912, "learning_rate": 3.193359375e-05, "loss": 0.1192, "step": 4000 }, { "epoch": 1.521821631878558, "grad_norm": 4.504504680633545, "learning_rate": 3.1884765625e-05, "loss": 0.1046, "step": 4010 }, { "epoch": 1.5256166982922201, "grad_norm": 0.7182434797286987, "learning_rate": 3.1835937500000004e-05, "loss": 0.0932, "step": 4020 }, { "epoch": 1.5294117647058822, "grad_norm": 4.370609283447266, "learning_rate": 3.1787109375e-05, "loss": 0.144, "step": 4030 }, { "epoch": 1.5332068311195446, "grad_norm": 3.8300323486328125, "learning_rate": 3.173828125e-05, "loss": 0.0982, "step": 4040 }, { "epoch": 1.537001897533207, "grad_norm": 0.25771814584732056, "learning_rate": 3.1689453125e-05, "loss": 0.0691, "step": 4050 }, { "epoch": 1.540796963946869, "grad_norm": 2.758225917816162, "learning_rate": 3.1640625e-05, "loss": 0.1308, "step": 4060 }, { "epoch": 1.5445920303605312, "grad_norm": 2.7619638442993164, "learning_rate": 3.1591796875000004e-05, "loss": 0.094, "step": 4070 }, { "epoch": 1.5483870967741935, "grad_norm": 0.9765902757644653, "learning_rate": 3.154296875e-05, "loss": 0.0811, "step": 4080 }, { "epoch": 1.5521821631878558, "grad_norm": 4.361360549926758, "learning_rate": 3.1494140625e-05, "loss": 0.1742, "step": 4090 }, { "epoch": 1.5559772296015182, "grad_norm": 2.249197244644165, "learning_rate": 3.14453125e-05, "loss": 0.0807, "step": 4100 }, { "epoch": 1.5597722960151803, "grad_norm": 3.4518532752990723, "learning_rate": 3.1396484375000005e-05, "loss": 0.1422, "step": 4110 }, { "epoch": 1.5635673624288424, "grad_norm": 0.6679037809371948, "learning_rate": 3.1347656250000003e-05, "loss": 0.1214, "step": 4120 }, { "epoch": 1.5673624288425048, "grad_norm": 3.879596710205078, "learning_rate": 3.1298828125e-05, "loss": 0.1084, "step": 4130 }, { "epoch": 1.571157495256167, "grad_norm": 5.232009410858154, "learning_rate": 3.125e-05, "loss": 0.1192, "step": 4140 }, { "epoch": 1.5749525616698292, "grad_norm": 3.875843048095703, "learning_rate": 3.1201171875e-05, "loss": 0.1099, "step": 4150 }, { "epoch": 1.5787476280834913, "grad_norm": 0.17772170901298523, "learning_rate": 3.1152343750000005e-05, "loss": 0.1001, "step": 4160 }, { "epoch": 1.5825426944971537, "grad_norm": 0.6866888403892517, "learning_rate": 3.1103515625e-05, "loss": 0.1598, "step": 4170 }, { "epoch": 1.586337760910816, "grad_norm": 2.2445452213287354, "learning_rate": 3.10546875e-05, "loss": 0.1532, "step": 4180 }, { "epoch": 1.5901328273244781, "grad_norm": 1.2135056257247925, "learning_rate": 3.1005859375e-05, "loss": 0.1337, "step": 4190 }, { "epoch": 1.5939278937381403, "grad_norm": 0.8548033833503723, "learning_rate": 3.095703125e-05, "loss": 0.1142, "step": 4200 }, { "epoch": 1.5977229601518026, "grad_norm": 1.7404321432113647, "learning_rate": 3.0908203125000004e-05, "loss": 0.1195, "step": 4210 }, { "epoch": 1.601518026565465, "grad_norm": 1.4047428369522095, "learning_rate": 3.0859375e-05, "loss": 0.1853, "step": 4220 }, { "epoch": 1.6053130929791273, "grad_norm": 2.793487071990967, "learning_rate": 3.0810546875e-05, "loss": 0.1231, "step": 4230 }, { "epoch": 1.6091081593927894, "grad_norm": 0.928959310054779, "learning_rate": 3.076171875e-05, "loss": 0.0891, "step": 4240 }, { "epoch": 1.6129032258064515, "grad_norm": 1.1571967601776123, "learning_rate": 3.0712890625e-05, "loss": 0.1119, "step": 4250 }, { "epoch": 1.6166982922201139, "grad_norm": 3.0740041732788086, "learning_rate": 3.0664062500000004e-05, "loss": 0.1518, "step": 4260 }, { "epoch": 1.6204933586337762, "grad_norm": 5.726138114929199, "learning_rate": 3.0615234375e-05, "loss": 0.1121, "step": 4270 }, { "epoch": 1.6242884250474383, "grad_norm": 3.900777816772461, "learning_rate": 3.056640625e-05, "loss": 0.1513, "step": 4280 }, { "epoch": 1.6280834914611004, "grad_norm": 3.43808913230896, "learning_rate": 3.0517578125e-05, "loss": 0.1259, "step": 4290 }, { "epoch": 1.6318785578747628, "grad_norm": 1.2054848670959473, "learning_rate": 3.0468750000000002e-05, "loss": 0.1446, "step": 4300 }, { "epoch": 1.635673624288425, "grad_norm": 3.756579875946045, "learning_rate": 3.0419921875e-05, "loss": 0.1348, "step": 4310 }, { "epoch": 1.6394686907020875, "grad_norm": 1.4033925533294678, "learning_rate": 3.0371093750000003e-05, "loss": 0.1053, "step": 4320 }, { "epoch": 1.6432637571157496, "grad_norm": 1.6513621807098389, "learning_rate": 3.0322265625e-05, "loss": 0.1217, "step": 4330 }, { "epoch": 1.6470588235294117, "grad_norm": 1.9821256399154663, "learning_rate": 3.02734375e-05, "loss": 0.0959, "step": 4340 }, { "epoch": 1.650853889943074, "grad_norm": 7.50634241104126, "learning_rate": 3.0224609375000002e-05, "loss": 0.1487, "step": 4350 }, { "epoch": 1.6546489563567364, "grad_norm": 1.1505802869796753, "learning_rate": 3.017578125e-05, "loss": 0.1246, "step": 4360 }, { "epoch": 1.6584440227703985, "grad_norm": 1.774200677871704, "learning_rate": 3.0126953125000002e-05, "loss": 0.086, "step": 4370 }, { "epoch": 1.6622390891840606, "grad_norm": 1.566748023033142, "learning_rate": 3.0078125e-05, "loss": 0.1088, "step": 4380 }, { "epoch": 1.666034155597723, "grad_norm": 2.8167648315429688, "learning_rate": 3.0029296875000003e-05, "loss": 0.122, "step": 4390 }, { "epoch": 1.6698292220113853, "grad_norm": 1.7637346982955933, "learning_rate": 2.998046875e-05, "loss": 0.1036, "step": 4400 }, { "epoch": 1.6736242884250474, "grad_norm": 0.3347111642360687, "learning_rate": 2.9931640625e-05, "loss": 0.1259, "step": 4410 }, { "epoch": 1.6774193548387095, "grad_norm": 4.920076370239258, "learning_rate": 2.9882812500000002e-05, "loss": 0.1594, "step": 4420 }, { "epoch": 1.6812144212523719, "grad_norm": 3.4409444332122803, "learning_rate": 2.9833984375e-05, "loss": 0.1541, "step": 4430 }, { "epoch": 1.6850094876660342, "grad_norm": 0.639980673789978, "learning_rate": 2.9785156250000003e-05, "loss": 0.0826, "step": 4440 }, { "epoch": 1.6888045540796965, "grad_norm": 3.240345001220703, "learning_rate": 2.9736328125e-05, "loss": 0.1473, "step": 4450 }, { "epoch": 1.6925996204933587, "grad_norm": 2.2682647705078125, "learning_rate": 2.96875e-05, "loss": 0.0959, "step": 4460 }, { "epoch": 1.6963946869070208, "grad_norm": 2.3791496753692627, "learning_rate": 2.9638671875000002e-05, "loss": 0.0953, "step": 4470 }, { "epoch": 1.7001897533206831, "grad_norm": 1.5654246807098389, "learning_rate": 2.958984375e-05, "loss": 0.113, "step": 4480 }, { "epoch": 1.7039848197343455, "grad_norm": 5.17665958404541, "learning_rate": 2.9541015625000003e-05, "loss": 0.1164, "step": 4490 }, { "epoch": 1.7077798861480076, "grad_norm": 18.226165771484375, "learning_rate": 2.94921875e-05, "loss": 0.1293, "step": 4500 }, { "epoch": 1.7115749525616697, "grad_norm": 3.5760374069213867, "learning_rate": 2.9443359375e-05, "loss": 0.0931, "step": 4510 }, { "epoch": 1.715370018975332, "grad_norm": 2.9964776039123535, "learning_rate": 2.9394531250000002e-05, "loss": 0.0932, "step": 4520 }, { "epoch": 1.7191650853889944, "grad_norm": 10.505178451538086, "learning_rate": 2.9345703125e-05, "loss": 0.139, "step": 4530 }, { "epoch": 1.7229601518026565, "grad_norm": 0.9944730997085571, "learning_rate": 2.9296875000000002e-05, "loss": 0.159, "step": 4540 }, { "epoch": 1.7267552182163188, "grad_norm": 1.2323939800262451, "learning_rate": 2.9248046875e-05, "loss": 0.118, "step": 4550 }, { "epoch": 1.730550284629981, "grad_norm": 0.8581392765045166, "learning_rate": 2.9199218750000003e-05, "loss": 0.1165, "step": 4560 }, { "epoch": 1.7343453510436433, "grad_norm": 2.196648120880127, "learning_rate": 2.9150390625e-05, "loss": 0.0803, "step": 4570 }, { "epoch": 1.7381404174573056, "grad_norm": 3.5112388134002686, "learning_rate": 2.91015625e-05, "loss": 0.1348, "step": 4580 }, { "epoch": 1.7419354838709677, "grad_norm": 1.1738495826721191, "learning_rate": 2.9052734375000002e-05, "loss": 0.1114, "step": 4590 }, { "epoch": 1.7457305502846299, "grad_norm": 1.6850240230560303, "learning_rate": 2.900390625e-05, "loss": 0.1457, "step": 4600 }, { "epoch": 1.7495256166982922, "grad_norm": 1.4865467548370361, "learning_rate": 2.8955078125000003e-05, "loss": 0.1078, "step": 4610 }, { "epoch": 1.7533206831119545, "grad_norm": 1.445610523223877, "learning_rate": 2.890625e-05, "loss": 0.0839, "step": 4620 }, { "epoch": 1.7571157495256167, "grad_norm": 1.649983525276184, "learning_rate": 2.8857421875e-05, "loss": 0.1028, "step": 4630 }, { "epoch": 1.7609108159392788, "grad_norm": 2.717585802078247, "learning_rate": 2.8808593750000002e-05, "loss": 0.1127, "step": 4640 }, { "epoch": 1.7647058823529411, "grad_norm": 2.902244806289673, "learning_rate": 2.8759765625e-05, "loss": 0.0743, "step": 4650 }, { "epoch": 1.7685009487666035, "grad_norm": 1.8880512714385986, "learning_rate": 2.8710937500000002e-05, "loss": 0.0875, "step": 4660 }, { "epoch": 1.7722960151802658, "grad_norm": 1.119419813156128, "learning_rate": 2.8662109375e-05, "loss": 0.1028, "step": 4670 }, { "epoch": 1.776091081593928, "grad_norm": 2.3372507095336914, "learning_rate": 2.8613281250000003e-05, "loss": 0.161, "step": 4680 }, { "epoch": 1.77988614800759, "grad_norm": 0.6809380054473877, "learning_rate": 2.8564453125e-05, "loss": 0.091, "step": 4690 }, { "epoch": 1.7836812144212524, "grad_norm": 4.871325969696045, "learning_rate": 2.8515625e-05, "loss": 0.1495, "step": 4700 }, { "epoch": 1.7874762808349147, "grad_norm": 10.103543281555176, "learning_rate": 2.8466796875000002e-05, "loss": 0.0847, "step": 4710 }, { "epoch": 1.7912713472485768, "grad_norm": 0.719699501991272, "learning_rate": 2.841796875e-05, "loss": 0.0991, "step": 4720 }, { "epoch": 1.795066413662239, "grad_norm": 2.012406826019287, "learning_rate": 2.8369140625000003e-05, "loss": 0.069, "step": 4730 }, { "epoch": 1.7988614800759013, "grad_norm": 2.038810968399048, "learning_rate": 2.83203125e-05, "loss": 0.0946, "step": 4740 }, { "epoch": 1.8026565464895636, "grad_norm": 1.991003394126892, "learning_rate": 2.8271484375e-05, "loss": 0.1033, "step": 4750 }, { "epoch": 1.8064516129032258, "grad_norm": 1.9379823207855225, "learning_rate": 2.8222656250000002e-05, "loss": 0.0738, "step": 4760 }, { "epoch": 1.810246679316888, "grad_norm": 0.9378390312194824, "learning_rate": 2.8173828125e-05, "loss": 0.0907, "step": 4770 }, { "epoch": 1.8140417457305502, "grad_norm": 2.5683369636535645, "learning_rate": 2.8125000000000003e-05, "loss": 0.1156, "step": 4780 }, { "epoch": 1.8178368121442126, "grad_norm": 2.95536470413208, "learning_rate": 2.8076171875e-05, "loss": 0.0959, "step": 4790 }, { "epoch": 1.821631878557875, "grad_norm": 11.215580940246582, "learning_rate": 2.802734375e-05, "loss": 0.0812, "step": 4800 }, { "epoch": 1.825426944971537, "grad_norm": 0.4500042498111725, "learning_rate": 2.7978515625000002e-05, "loss": 0.1114, "step": 4810 }, { "epoch": 1.8292220113851991, "grad_norm": 0.5829250812530518, "learning_rate": 2.79296875e-05, "loss": 0.1284, "step": 4820 }, { "epoch": 1.8330170777988615, "grad_norm": 3.114776134490967, "learning_rate": 2.7880859375000002e-05, "loss": 0.1283, "step": 4830 }, { "epoch": 1.8368121442125238, "grad_norm": 0.47552067041397095, "learning_rate": 2.783203125e-05, "loss": 0.0752, "step": 4840 }, { "epoch": 1.840607210626186, "grad_norm": 4.794514179229736, "learning_rate": 2.7783203125000003e-05, "loss": 0.1012, "step": 4850 }, { "epoch": 1.844402277039848, "grad_norm": 5.392133712768555, "learning_rate": 2.7734375e-05, "loss": 0.178, "step": 4860 }, { "epoch": 1.8481973434535104, "grad_norm": 1.1505749225616455, "learning_rate": 2.7685546875e-05, "loss": 0.126, "step": 4870 }, { "epoch": 1.8519924098671727, "grad_norm": 1.1924586296081543, "learning_rate": 2.7636718750000002e-05, "loss": 0.1109, "step": 4880 }, { "epoch": 1.855787476280835, "grad_norm": 0.12782755494117737, "learning_rate": 2.7587890625e-05, "loss": 0.0732, "step": 4890 }, { "epoch": 1.8595825426944972, "grad_norm": 1.1095064878463745, "learning_rate": 2.7539062500000003e-05, "loss": 0.0802, "step": 4900 }, { "epoch": 1.8633776091081593, "grad_norm": 8.920310020446777, "learning_rate": 2.7490234375e-05, "loss": 0.0964, "step": 4910 }, { "epoch": 1.8671726755218216, "grad_norm": 1.8678808212280273, "learning_rate": 2.744140625e-05, "loss": 0.1072, "step": 4920 }, { "epoch": 1.870967741935484, "grad_norm": 1.8633017539978027, "learning_rate": 2.7392578125000002e-05, "loss": 0.0835, "step": 4930 }, { "epoch": 1.874762808349146, "grad_norm": 1.7576115131378174, "learning_rate": 2.734375e-05, "loss": 0.1327, "step": 4940 }, { "epoch": 1.8785578747628082, "grad_norm": 3.504157304763794, "learning_rate": 2.7294921875000003e-05, "loss": 0.1609, "step": 4950 }, { "epoch": 1.8823529411764706, "grad_norm": 1.7668483257293701, "learning_rate": 2.724609375e-05, "loss": 0.1316, "step": 4960 }, { "epoch": 1.886148007590133, "grad_norm": 0.659870982170105, "learning_rate": 2.7197265625e-05, "loss": 0.0913, "step": 4970 }, { "epoch": 1.889943074003795, "grad_norm": 1.428725004196167, "learning_rate": 2.7148437500000002e-05, "loss": 0.118, "step": 4980 }, { "epoch": 1.8937381404174574, "grad_norm": 1.8446964025497437, "learning_rate": 2.7099609375e-05, "loss": 0.1203, "step": 4990 }, { "epoch": 1.8975332068311195, "grad_norm": 2.9335217475891113, "learning_rate": 2.7050781250000002e-05, "loss": 0.1301, "step": 5000 }, { "epoch": 1.9013282732447818, "grad_norm": 0.8534810543060303, "learning_rate": 2.7001953125e-05, "loss": 0.0555, "step": 5010 }, { "epoch": 1.9051233396584442, "grad_norm": 0.5556221604347229, "learning_rate": 2.6953125000000003e-05, "loss": 0.1036, "step": 5020 }, { "epoch": 1.9089184060721063, "grad_norm": 1.7097387313842773, "learning_rate": 2.6904296875e-05, "loss": 0.0869, "step": 5030 }, { "epoch": 1.9127134724857684, "grad_norm": 2.324669122695923, "learning_rate": 2.685546875e-05, "loss": 0.1233, "step": 5040 }, { "epoch": 1.9165085388994307, "grad_norm": 2.4764981269836426, "learning_rate": 2.6806640625000002e-05, "loss": 0.1379, "step": 5050 }, { "epoch": 1.920303605313093, "grad_norm": 4.731557846069336, "learning_rate": 2.67578125e-05, "loss": 0.189, "step": 5060 }, { "epoch": 1.9240986717267552, "grad_norm": 0.4868462383747101, "learning_rate": 2.6708984375000003e-05, "loss": 0.0765, "step": 5070 }, { "epoch": 1.9278937381404173, "grad_norm": 1.3497892618179321, "learning_rate": 2.666015625e-05, "loss": 0.1039, "step": 5080 }, { "epoch": 1.9316888045540797, "grad_norm": 15.007429122924805, "learning_rate": 2.6611328125e-05, "loss": 0.0996, "step": 5090 }, { "epoch": 1.935483870967742, "grad_norm": 8.113617897033691, "learning_rate": 2.6562500000000002e-05, "loss": 0.1316, "step": 5100 }, { "epoch": 1.9392789373814043, "grad_norm": 0.4574742913246155, "learning_rate": 2.6513671875e-05, "loss": 0.1044, "step": 5110 }, { "epoch": 1.9430740037950665, "grad_norm": 2.1475601196289062, "learning_rate": 2.6464843750000002e-05, "loss": 0.1236, "step": 5120 }, { "epoch": 1.9468690702087286, "grad_norm": 2.370619058609009, "learning_rate": 2.6416015625e-05, "loss": 0.1358, "step": 5130 }, { "epoch": 1.950664136622391, "grad_norm": 0.7283152937889099, "learning_rate": 2.63671875e-05, "loss": 0.1348, "step": 5140 }, { "epoch": 1.9544592030360532, "grad_norm": 2.8883001804351807, "learning_rate": 2.6318359375e-05, "loss": 0.083, "step": 5150 }, { "epoch": 1.9582542694497154, "grad_norm": 0.26794353127479553, "learning_rate": 2.626953125e-05, "loss": 0.1229, "step": 5160 }, { "epoch": 1.9620493358633775, "grad_norm": 0.10836785286664963, "learning_rate": 2.6220703125000002e-05, "loss": 0.0731, "step": 5170 }, { "epoch": 1.9658444022770398, "grad_norm": 1.5825821161270142, "learning_rate": 2.6171875e-05, "loss": 0.1394, "step": 5180 }, { "epoch": 1.9696394686907022, "grad_norm": 2.9467551708221436, "learning_rate": 2.6123046875000003e-05, "loss": 0.0986, "step": 5190 }, { "epoch": 1.9734345351043643, "grad_norm": 0.14293566346168518, "learning_rate": 2.607421875e-05, "loss": 0.0824, "step": 5200 }, { "epoch": 1.9772296015180264, "grad_norm": 0.4912210702896118, "learning_rate": 2.6025390625e-05, "loss": 0.0863, "step": 5210 }, { "epoch": 1.9810246679316887, "grad_norm": 0.2447841614484787, "learning_rate": 2.5976562500000002e-05, "loss": 0.0877, "step": 5220 }, { "epoch": 1.984819734345351, "grad_norm": 0.13301405310630798, "learning_rate": 2.5927734375e-05, "loss": 0.104, "step": 5230 }, { "epoch": 1.9886148007590134, "grad_norm": 3.25866961479187, "learning_rate": 2.5878906250000003e-05, "loss": 0.0806, "step": 5240 }, { "epoch": 1.9924098671726755, "grad_norm": 3.9567527770996094, "learning_rate": 2.5830078125e-05, "loss": 0.1226, "step": 5250 }, { "epoch": 1.9962049335863377, "grad_norm": 3.6540729999542236, "learning_rate": 2.578125e-05, "loss": 0.0628, "step": 5260 }, { "epoch": 2.0, "grad_norm": 2.9958958625793457, "learning_rate": 2.5732421875000002e-05, "loss": 0.1229, "step": 5270 }, { "epoch": 2.0037950664136623, "grad_norm": 4.634014129638672, "learning_rate": 2.568359375e-05, "loss": 0.1, "step": 5280 }, { "epoch": 2.0075901328273247, "grad_norm": 1.0794429779052734, "learning_rate": 2.5634765625000002e-05, "loss": 0.1, "step": 5290 }, { "epoch": 2.0113851992409866, "grad_norm": 2.6222951412200928, "learning_rate": 2.55859375e-05, "loss": 0.057, "step": 5300 }, { "epoch": 2.015180265654649, "grad_norm": 1.499935507774353, "learning_rate": 2.5537109375e-05, "loss": 0.0766, "step": 5310 }, { "epoch": 2.0189753320683113, "grad_norm": 2.614969491958618, "learning_rate": 2.548828125e-05, "loss": 0.1003, "step": 5320 }, { "epoch": 2.0227703984819736, "grad_norm": 1.4524706602096558, "learning_rate": 2.5439453125e-05, "loss": 0.1681, "step": 5330 }, { "epoch": 2.0265654648956355, "grad_norm": 1.5427693128585815, "learning_rate": 2.5390625000000002e-05, "loss": 0.0745, "step": 5340 }, { "epoch": 2.030360531309298, "grad_norm": 0.6060462594032288, "learning_rate": 2.5341796875e-05, "loss": 0.0557, "step": 5350 }, { "epoch": 2.03415559772296, "grad_norm": 2.1763222217559814, "learning_rate": 2.5292968750000003e-05, "loss": 0.0962, "step": 5360 }, { "epoch": 2.0379506641366225, "grad_norm": 0.9857283234596252, "learning_rate": 2.5244140625e-05, "loss": 0.0646, "step": 5370 }, { "epoch": 2.041745730550285, "grad_norm": 0.14561018347740173, "learning_rate": 2.51953125e-05, "loss": 0.0686, "step": 5380 }, { "epoch": 2.0455407969639468, "grad_norm": 5.825016498565674, "learning_rate": 2.5146484375000002e-05, "loss": 0.1106, "step": 5390 }, { "epoch": 2.049335863377609, "grad_norm": 0.4656510353088379, "learning_rate": 2.509765625e-05, "loss": 0.0793, "step": 5400 }, { "epoch": 2.0531309297912714, "grad_norm": 5.336658954620361, "learning_rate": 2.5048828125000003e-05, "loss": 0.1136, "step": 5410 }, { "epoch": 2.0569259962049338, "grad_norm": 1.3186858892440796, "learning_rate": 2.5e-05, "loss": 0.0908, "step": 5420 }, { "epoch": 2.0607210626185957, "grad_norm": 2.3468871116638184, "learning_rate": 2.4951171875e-05, "loss": 0.1127, "step": 5430 }, { "epoch": 2.064516129032258, "grad_norm": 1.6484739780426025, "learning_rate": 2.4902343750000002e-05, "loss": 0.0921, "step": 5440 }, { "epoch": 2.0683111954459203, "grad_norm": 1.97286856174469, "learning_rate": 2.4853515625e-05, "loss": 0.064, "step": 5450 }, { "epoch": 2.0721062618595827, "grad_norm": 0.7309706211090088, "learning_rate": 2.4804687500000002e-05, "loss": 0.1256, "step": 5460 }, { "epoch": 2.0759013282732446, "grad_norm": 3.2271645069122314, "learning_rate": 2.4755859375e-05, "loss": 0.0889, "step": 5470 }, { "epoch": 2.079696394686907, "grad_norm": 18.506216049194336, "learning_rate": 2.470703125e-05, "loss": 0.1328, "step": 5480 }, { "epoch": 2.0834914611005693, "grad_norm": 1.2257277965545654, "learning_rate": 2.4658203125e-05, "loss": 0.0673, "step": 5490 }, { "epoch": 2.0872865275142316, "grad_norm": 0.1906469613313675, "learning_rate": 2.4609375e-05, "loss": 0.0808, "step": 5500 }, { "epoch": 2.091081593927894, "grad_norm": 0.9694260954856873, "learning_rate": 2.4560546875000002e-05, "loss": 0.0558, "step": 5510 }, { "epoch": 2.094876660341556, "grad_norm": 5.630046844482422, "learning_rate": 2.451171875e-05, "loss": 0.1262, "step": 5520 }, { "epoch": 2.098671726755218, "grad_norm": 0.13950304687023163, "learning_rate": 2.4462890625000003e-05, "loss": 0.0711, "step": 5530 }, { "epoch": 2.1024667931688805, "grad_norm": 0.424904465675354, "learning_rate": 2.44140625e-05, "loss": 0.0841, "step": 5540 }, { "epoch": 2.106261859582543, "grad_norm": 7.330411434173584, "learning_rate": 2.4365234375e-05, "loss": 0.1482, "step": 5550 }, { "epoch": 2.1100569259962048, "grad_norm": 0.2741791009902954, "learning_rate": 2.4316406250000002e-05, "loss": 0.0945, "step": 5560 }, { "epoch": 2.113851992409867, "grad_norm": 1.025099277496338, "learning_rate": 2.4267578125e-05, "loss": 0.0981, "step": 5570 }, { "epoch": 2.1176470588235294, "grad_norm": 2.723508596420288, "learning_rate": 2.4218750000000003e-05, "loss": 0.067, "step": 5580 }, { "epoch": 2.121442125237192, "grad_norm": 0.18666787445545197, "learning_rate": 2.4169921875e-05, "loss": 0.077, "step": 5590 }, { "epoch": 2.1252371916508537, "grad_norm": 2.304980754852295, "learning_rate": 2.412109375e-05, "loss": 0.1016, "step": 5600 }, { "epoch": 2.129032258064516, "grad_norm": 1.6174981594085693, "learning_rate": 2.4072265625000002e-05, "loss": 0.0735, "step": 5610 }, { "epoch": 2.1328273244781784, "grad_norm": 5.401015758514404, "learning_rate": 2.40234375e-05, "loss": 0.087, "step": 5620 }, { "epoch": 2.1366223908918407, "grad_norm": 2.5387024879455566, "learning_rate": 2.3974609375000002e-05, "loss": 0.1006, "step": 5630 }, { "epoch": 2.140417457305503, "grad_norm": 4.753091812133789, "learning_rate": 2.392578125e-05, "loss": 0.1013, "step": 5640 }, { "epoch": 2.144212523719165, "grad_norm": 3.540262460708618, "learning_rate": 2.3876953125e-05, "loss": 0.0697, "step": 5650 }, { "epoch": 2.1480075901328273, "grad_norm": 1.53217613697052, "learning_rate": 2.3828125e-05, "loss": 0.0812, "step": 5660 }, { "epoch": 2.1518026565464896, "grad_norm": 2.652308940887451, "learning_rate": 2.3779296875e-05, "loss": 0.092, "step": 5670 }, { "epoch": 2.155597722960152, "grad_norm": 2.7964372634887695, "learning_rate": 2.3730468750000002e-05, "loss": 0.0658, "step": 5680 }, { "epoch": 2.159392789373814, "grad_norm": 0.11225280165672302, "learning_rate": 2.3681640625e-05, "loss": 0.0939, "step": 5690 }, { "epoch": 2.163187855787476, "grad_norm": 1.5736573934555054, "learning_rate": 2.3632812500000003e-05, "loss": 0.0727, "step": 5700 }, { "epoch": 2.1669829222011385, "grad_norm": 2.087057113647461, "learning_rate": 2.3583984375e-05, "loss": 0.0654, "step": 5710 }, { "epoch": 2.170777988614801, "grad_norm": 1.598823070526123, "learning_rate": 2.353515625e-05, "loss": 0.0874, "step": 5720 }, { "epoch": 2.174573055028463, "grad_norm": 1.7258918285369873, "learning_rate": 2.3486328125000002e-05, "loss": 0.0703, "step": 5730 }, { "epoch": 2.178368121442125, "grad_norm": 12.662415504455566, "learning_rate": 2.34375e-05, "loss": 0.0998, "step": 5740 }, { "epoch": 2.1821631878557874, "grad_norm": 5.9703803062438965, "learning_rate": 2.3388671875000002e-05, "loss": 0.1021, "step": 5750 }, { "epoch": 2.18595825426945, "grad_norm": 1.9118971824645996, "learning_rate": 2.333984375e-05, "loss": 0.0574, "step": 5760 }, { "epoch": 2.189753320683112, "grad_norm": 2.8925118446350098, "learning_rate": 2.3291015625e-05, "loss": 0.0804, "step": 5770 }, { "epoch": 2.193548387096774, "grad_norm": 0.9911293387413025, "learning_rate": 2.32421875e-05, "loss": 0.0673, "step": 5780 }, { "epoch": 2.1973434535104364, "grad_norm": 3.4294886589050293, "learning_rate": 2.3193359375e-05, "loss": 0.0729, "step": 5790 }, { "epoch": 2.2011385199240987, "grad_norm": 5.382150650024414, "learning_rate": 2.3144531250000002e-05, "loss": 0.1117, "step": 5800 }, { "epoch": 2.204933586337761, "grad_norm": 3.5237820148468018, "learning_rate": 2.3095703125e-05, "loss": 0.0674, "step": 5810 }, { "epoch": 2.2087286527514234, "grad_norm": 5.6236772537231445, "learning_rate": 2.3046875e-05, "loss": 0.0279, "step": 5820 }, { "epoch": 2.2125237191650853, "grad_norm": 1.1168630123138428, "learning_rate": 2.2998046875e-05, "loss": 0.0773, "step": 5830 }, { "epoch": 2.2163187855787476, "grad_norm": 1.0353121757507324, "learning_rate": 2.294921875e-05, "loss": 0.062, "step": 5840 }, { "epoch": 2.22011385199241, "grad_norm": 1.4820594787597656, "learning_rate": 2.2900390625000002e-05, "loss": 0.0778, "step": 5850 }, { "epoch": 2.2239089184060723, "grad_norm": 8.295422554016113, "learning_rate": 2.28515625e-05, "loss": 0.1192, "step": 5860 }, { "epoch": 2.227703984819734, "grad_norm": 1.5980597734451294, "learning_rate": 2.2802734375000003e-05, "loss": 0.0648, "step": 5870 }, { "epoch": 2.2314990512333965, "grad_norm": 0.2760424315929413, "learning_rate": 2.275390625e-05, "loss": 0.0722, "step": 5880 }, { "epoch": 2.235294117647059, "grad_norm": 0.8219416737556458, "learning_rate": 2.2705078125e-05, "loss": 0.0935, "step": 5890 }, { "epoch": 2.239089184060721, "grad_norm": 0.16338910162448883, "learning_rate": 2.2656250000000002e-05, "loss": 0.0876, "step": 5900 }, { "epoch": 2.242884250474383, "grad_norm": 0.5857824683189392, "learning_rate": 2.2607421875e-05, "loss": 0.117, "step": 5910 }, { "epoch": 2.2466793168880455, "grad_norm": 0.1616586148738861, "learning_rate": 2.2558593750000002e-05, "loss": 0.072, "step": 5920 }, { "epoch": 2.250474383301708, "grad_norm": 0.26469337940216064, "learning_rate": 2.2509765625e-05, "loss": 0.0902, "step": 5930 }, { "epoch": 2.25426944971537, "grad_norm": 3.576016426086426, "learning_rate": 2.24609375e-05, "loss": 0.1647, "step": 5940 }, { "epoch": 2.258064516129032, "grad_norm": 6.523315906524658, "learning_rate": 2.2412109375e-05, "loss": 0.0705, "step": 5950 }, { "epoch": 2.2618595825426944, "grad_norm": 4.0901689529418945, "learning_rate": 2.236328125e-05, "loss": 0.0786, "step": 5960 }, { "epoch": 2.2656546489563567, "grad_norm": 0.5081945061683655, "learning_rate": 2.2314453125000002e-05, "loss": 0.1158, "step": 5970 }, { "epoch": 2.269449715370019, "grad_norm": 0.10847347974777222, "learning_rate": 2.2265625e-05, "loss": 0.0825, "step": 5980 }, { "epoch": 2.2732447817836814, "grad_norm": 9.521303176879883, "learning_rate": 2.2216796875e-05, "loss": 0.0875, "step": 5990 }, { "epoch": 2.2770398481973433, "grad_norm": 6.0424580574035645, "learning_rate": 2.216796875e-05, "loss": 0.0994, "step": 6000 }, { "epoch": 2.2808349146110056, "grad_norm": 0.3634886145591736, "learning_rate": 2.2119140625e-05, "loss": 0.0813, "step": 6010 }, { "epoch": 2.284629981024668, "grad_norm": 1.929626703262329, "learning_rate": 2.2070312500000002e-05, "loss": 0.0705, "step": 6020 }, { "epoch": 2.2884250474383303, "grad_norm": 4.993653297424316, "learning_rate": 2.2021484375e-05, "loss": 0.0731, "step": 6030 }, { "epoch": 2.292220113851992, "grad_norm": 0.4869803190231323, "learning_rate": 2.1972656250000003e-05, "loss": 0.1123, "step": 6040 }, { "epoch": 2.2960151802656545, "grad_norm": 1.1776117086410522, "learning_rate": 2.1923828125e-05, "loss": 0.0643, "step": 6050 }, { "epoch": 2.299810246679317, "grad_norm": 1.7794570922851562, "learning_rate": 2.1875e-05, "loss": 0.0852, "step": 6060 }, { "epoch": 2.3036053130929792, "grad_norm": 2.7579660415649414, "learning_rate": 2.1826171875000002e-05, "loss": 0.0975, "step": 6070 }, { "epoch": 2.3074003795066416, "grad_norm": 2.9852662086486816, "learning_rate": 2.177734375e-05, "loss": 0.0724, "step": 6080 }, { "epoch": 2.3111954459203035, "grad_norm": 3.543381452560425, "learning_rate": 2.1728515625000002e-05, "loss": 0.1108, "step": 6090 }, { "epoch": 2.314990512333966, "grad_norm": 6.476046085357666, "learning_rate": 2.16796875e-05, "loss": 0.1231, "step": 6100 }, { "epoch": 2.318785578747628, "grad_norm": 3.2935097217559814, "learning_rate": 2.1630859375e-05, "loss": 0.1052, "step": 6110 }, { "epoch": 2.3225806451612905, "grad_norm": 1.1247642040252686, "learning_rate": 2.158203125e-05, "loss": 0.0817, "step": 6120 }, { "epoch": 2.3263757115749524, "grad_norm": 6.793920993804932, "learning_rate": 2.1533203125e-05, "loss": 0.0623, "step": 6130 }, { "epoch": 2.3301707779886147, "grad_norm": 0.12885475158691406, "learning_rate": 2.1484375000000002e-05, "loss": 0.0942, "step": 6140 }, { "epoch": 2.333965844402277, "grad_norm": 1.4963340759277344, "learning_rate": 2.1435546875e-05, "loss": 0.0549, "step": 6150 }, { "epoch": 2.3377609108159394, "grad_norm": 1.460093379020691, "learning_rate": 2.138671875e-05, "loss": 0.094, "step": 6160 }, { "epoch": 2.3415559772296017, "grad_norm": 4.440692901611328, "learning_rate": 2.1337890625e-05, "loss": 0.1673, "step": 6170 }, { "epoch": 2.3453510436432636, "grad_norm": 2.9689061641693115, "learning_rate": 2.12890625e-05, "loss": 0.0772, "step": 6180 }, { "epoch": 2.349146110056926, "grad_norm": 8.890856742858887, "learning_rate": 2.1240234375000002e-05, "loss": 0.0588, "step": 6190 }, { "epoch": 2.3529411764705883, "grad_norm": 0.12126415222883224, "learning_rate": 2.119140625e-05, "loss": 0.0624, "step": 6200 }, { "epoch": 2.3567362428842507, "grad_norm": 0.5167102217674255, "learning_rate": 2.1142578125000003e-05, "loss": 0.0732, "step": 6210 }, { "epoch": 2.3605313092979125, "grad_norm": 0.18846435844898224, "learning_rate": 2.109375e-05, "loss": 0.1007, "step": 6220 }, { "epoch": 2.364326375711575, "grad_norm": 1.9389616250991821, "learning_rate": 2.1044921875e-05, "loss": 0.0912, "step": 6230 }, { "epoch": 2.3681214421252372, "grad_norm": 5.2946457862854, "learning_rate": 2.0996093750000002e-05, "loss": 0.057, "step": 6240 }, { "epoch": 2.3719165085388996, "grad_norm": 0.13522082567214966, "learning_rate": 2.0947265625e-05, "loss": 0.0877, "step": 6250 }, { "epoch": 2.375711574952562, "grad_norm": 0.43759119510650635, "learning_rate": 2.0898437500000002e-05, "loss": 0.0791, "step": 6260 }, { "epoch": 2.379506641366224, "grad_norm": 4.369633197784424, "learning_rate": 2.0849609375e-05, "loss": 0.0793, "step": 6270 }, { "epoch": 2.383301707779886, "grad_norm": 3.1445748805999756, "learning_rate": 2.080078125e-05, "loss": 0.0994, "step": 6280 }, { "epoch": 2.3870967741935485, "grad_norm": 0.5459542274475098, "learning_rate": 2.0751953125e-05, "loss": 0.0493, "step": 6290 }, { "epoch": 2.3908918406072104, "grad_norm": 0.8807210326194763, "learning_rate": 2.0703125e-05, "loss": 0.0669, "step": 6300 }, { "epoch": 2.3946869070208727, "grad_norm": 2.931506872177124, "learning_rate": 2.0654296875000002e-05, "loss": 0.1014, "step": 6310 }, { "epoch": 2.398481973434535, "grad_norm": 1.1972861289978027, "learning_rate": 2.060546875e-05, "loss": 0.0643, "step": 6320 }, { "epoch": 2.4022770398481974, "grad_norm": 2.670483112335205, "learning_rate": 2.0556640625000003e-05, "loss": 0.0651, "step": 6330 }, { "epoch": 2.4060721062618597, "grad_norm": 2.790907382965088, "learning_rate": 2.05078125e-05, "loss": 0.0979, "step": 6340 }, { "epoch": 2.4098671726755216, "grad_norm": 1.7010408639907837, "learning_rate": 2.0458984375e-05, "loss": 0.0616, "step": 6350 }, { "epoch": 2.413662239089184, "grad_norm": 2.3590617179870605, "learning_rate": 2.0410156250000002e-05, "loss": 0.0877, "step": 6360 }, { "epoch": 2.4174573055028463, "grad_norm": 0.7550681829452515, "learning_rate": 2.0361328125e-05, "loss": 0.0351, "step": 6370 }, { "epoch": 2.4212523719165087, "grad_norm": 2.2927632331848145, "learning_rate": 2.0312500000000002e-05, "loss": 0.102, "step": 6380 }, { "epoch": 2.4250474383301706, "grad_norm": 8.239547729492188, "learning_rate": 2.0263671875e-05, "loss": 0.1315, "step": 6390 }, { "epoch": 2.428842504743833, "grad_norm": 0.12305755913257599, "learning_rate": 2.021484375e-05, "loss": 0.0508, "step": 6400 }, { "epoch": 2.4326375711574952, "grad_norm": 0.24204160273075104, "learning_rate": 2.0166015625e-05, "loss": 0.1154, "step": 6410 }, { "epoch": 2.4364326375711576, "grad_norm": 1.9680283069610596, "learning_rate": 2.01171875e-05, "loss": 0.0576, "step": 6420 }, { "epoch": 2.44022770398482, "grad_norm": 2.9172940254211426, "learning_rate": 2.0068359375000002e-05, "loss": 0.0457, "step": 6430 }, { "epoch": 2.444022770398482, "grad_norm": 4.63267707824707, "learning_rate": 2.001953125e-05, "loss": 0.0544, "step": 6440 }, { "epoch": 2.447817836812144, "grad_norm": 1.447266936302185, "learning_rate": 1.9970703125e-05, "loss": 0.0885, "step": 6450 }, { "epoch": 2.4516129032258065, "grad_norm": 2.839066505432129, "learning_rate": 1.9921875e-05, "loss": 0.1266, "step": 6460 }, { "epoch": 2.455407969639469, "grad_norm": 2.1036999225616455, "learning_rate": 1.9873046875e-05, "loss": 0.1107, "step": 6470 }, { "epoch": 2.4592030360531307, "grad_norm": 2.6435329914093018, "learning_rate": 1.9824218750000002e-05, "loss": 0.0539, "step": 6480 }, { "epoch": 2.462998102466793, "grad_norm": 0.2627769112586975, "learning_rate": 1.9775390625e-05, "loss": 0.0713, "step": 6490 }, { "epoch": 2.4667931688804554, "grad_norm": 3.5408475399017334, "learning_rate": 1.9726562500000003e-05, "loss": 0.1061, "step": 6500 }, { "epoch": 2.4705882352941178, "grad_norm": 2.456315279006958, "learning_rate": 1.9677734375e-05, "loss": 0.0782, "step": 6510 }, { "epoch": 2.47438330170778, "grad_norm": 5.217021942138672, "learning_rate": 1.962890625e-05, "loss": 0.1009, "step": 6520 }, { "epoch": 2.478178368121442, "grad_norm": 4.218019962310791, "learning_rate": 1.9580078125000002e-05, "loss": 0.0663, "step": 6530 }, { "epoch": 2.4819734345351043, "grad_norm": 2.7066123485565186, "learning_rate": 1.953125e-05, "loss": 0.0891, "step": 6540 }, { "epoch": 2.4857685009487667, "grad_norm": 0.1062941625714302, "learning_rate": 1.9482421875000002e-05, "loss": 0.1085, "step": 6550 }, { "epoch": 2.489563567362429, "grad_norm": 5.984579086303711, "learning_rate": 1.943359375e-05, "loss": 0.092, "step": 6560 }, { "epoch": 2.493358633776091, "grad_norm": 0.7308592796325684, "learning_rate": 1.9384765625e-05, "loss": 0.072, "step": 6570 }, { "epoch": 2.4971537001897532, "grad_norm": 0.8086015582084656, "learning_rate": 1.93359375e-05, "loss": 0.1052, "step": 6580 }, { "epoch": 2.5009487666034156, "grad_norm": 1.8991528749465942, "learning_rate": 1.9287109375e-05, "loss": 0.0737, "step": 6590 }, { "epoch": 2.504743833017078, "grad_norm": 6.63985013961792, "learning_rate": 1.9238281250000002e-05, "loss": 0.1096, "step": 6600 }, { "epoch": 2.5085388994307403, "grad_norm": 0.17855627834796906, "learning_rate": 1.9189453125e-05, "loss": 0.0624, "step": 6610 }, { "epoch": 2.512333965844402, "grad_norm": 4.877336502075195, "learning_rate": 1.9140625e-05, "loss": 0.1211, "step": 6620 }, { "epoch": 2.5161290322580645, "grad_norm": 0.27590852975845337, "learning_rate": 1.9091796875e-05, "loss": 0.0521, "step": 6630 }, { "epoch": 2.519924098671727, "grad_norm": 0.45393088459968567, "learning_rate": 1.904296875e-05, "loss": 0.0707, "step": 6640 }, { "epoch": 2.5237191650853887, "grad_norm": 2.1049611568450928, "learning_rate": 1.8994140625000002e-05, "loss": 0.1105, "step": 6650 }, { "epoch": 2.527514231499051, "grad_norm": 1.805330753326416, "learning_rate": 1.89453125e-05, "loss": 0.068, "step": 6660 }, { "epoch": 2.5313092979127134, "grad_norm": 1.1227184534072876, "learning_rate": 1.8896484375000003e-05, "loss": 0.0572, "step": 6670 }, { "epoch": 2.5351043643263758, "grad_norm": 2.483306646347046, "learning_rate": 1.884765625e-05, "loss": 0.1095, "step": 6680 }, { "epoch": 2.538899430740038, "grad_norm": 0.1452198177576065, "learning_rate": 1.8798828125e-05, "loss": 0.0401, "step": 6690 }, { "epoch": 2.5426944971537004, "grad_norm": 0.14945687353610992, "learning_rate": 1.8750000000000002e-05, "loss": 0.0796, "step": 6700 }, { "epoch": 2.5464895635673623, "grad_norm": 1.3936477899551392, "learning_rate": 1.8701171875e-05, "loss": 0.0688, "step": 6710 }, { "epoch": 2.5502846299810247, "grad_norm": 0.16819104552268982, "learning_rate": 1.8652343750000002e-05, "loss": 0.0454, "step": 6720 }, { "epoch": 2.554079696394687, "grad_norm": 1.2239612340927124, "learning_rate": 1.8603515625e-05, "loss": 0.0588, "step": 6730 }, { "epoch": 2.557874762808349, "grad_norm": 7.471010684967041, "learning_rate": 1.85546875e-05, "loss": 0.0528, "step": 6740 }, { "epoch": 2.5616698292220113, "grad_norm": 4.900544166564941, "learning_rate": 1.8505859375e-05, "loss": 0.0858, "step": 6750 }, { "epoch": 2.5654648956356736, "grad_norm": 3.8821702003479004, "learning_rate": 1.845703125e-05, "loss": 0.046, "step": 6760 }, { "epoch": 2.569259962049336, "grad_norm": 0.17730577290058136, "learning_rate": 1.8408203125000002e-05, "loss": 0.0673, "step": 6770 }, { "epoch": 2.5730550284629983, "grad_norm": 3.4757065773010254, "learning_rate": 1.8359375e-05, "loss": 0.094, "step": 6780 }, { "epoch": 2.5768500948766606, "grad_norm": 3.2091782093048096, "learning_rate": 1.8310546875e-05, "loss": 0.08, "step": 6790 }, { "epoch": 2.5806451612903225, "grad_norm": 5.548855304718018, "learning_rate": 1.826171875e-05, "loss": 0.0996, "step": 6800 }, { "epoch": 2.584440227703985, "grad_norm": 0.17017248272895813, "learning_rate": 1.8212890625e-05, "loss": 0.0828, "step": 6810 }, { "epoch": 2.588235294117647, "grad_norm": 9.512433052062988, "learning_rate": 1.8164062500000002e-05, "loss": 0.0696, "step": 6820 }, { "epoch": 2.592030360531309, "grad_norm": 0.9737806916236877, "learning_rate": 1.8115234375e-05, "loss": 0.0881, "step": 6830 }, { "epoch": 2.5958254269449714, "grad_norm": 7.027744293212891, "learning_rate": 1.8066406250000002e-05, "loss": 0.06, "step": 6840 }, { "epoch": 2.5996204933586338, "grad_norm": 2.162301778793335, "learning_rate": 1.8017578125e-05, "loss": 0.0833, "step": 6850 }, { "epoch": 2.603415559772296, "grad_norm": 0.30585893988609314, "learning_rate": 1.796875e-05, "loss": 0.0794, "step": 6860 }, { "epoch": 2.6072106261859584, "grad_norm": 0.22574108839035034, "learning_rate": 1.7919921875e-05, "loss": 0.0965, "step": 6870 }, { "epoch": 2.6110056925996203, "grad_norm": 0.6627634763717651, "learning_rate": 1.787109375e-05, "loss": 0.0622, "step": 6880 }, { "epoch": 2.6148007590132827, "grad_norm": 0.17045138776302338, "learning_rate": 1.7822265625000002e-05, "loss": 0.0471, "step": 6890 }, { "epoch": 2.618595825426945, "grad_norm": 0.31901392340660095, "learning_rate": 1.77734375e-05, "loss": 0.0607, "step": 6900 }, { "epoch": 2.6223908918406074, "grad_norm": 0.21171316504478455, "learning_rate": 1.7724609375e-05, "loss": 0.0789, "step": 6910 }, { "epoch": 2.6261859582542693, "grad_norm": 0.8109591007232666, "learning_rate": 1.767578125e-05, "loss": 0.0973, "step": 6920 }, { "epoch": 2.6299810246679316, "grad_norm": 2.583545446395874, "learning_rate": 1.7626953125e-05, "loss": 0.0512, "step": 6930 }, { "epoch": 2.633776091081594, "grad_norm": 1.5937598943710327, "learning_rate": 1.7578125000000002e-05, "loss": 0.0861, "step": 6940 }, { "epoch": 2.6375711574952563, "grad_norm": 1.3143688440322876, "learning_rate": 1.7529296875e-05, "loss": 0.098, "step": 6950 }, { "epoch": 2.6413662239089186, "grad_norm": 2.390667676925659, "learning_rate": 1.748046875e-05, "loss": 0.0621, "step": 6960 }, { "epoch": 2.6451612903225805, "grad_norm": 0.30924805998802185, "learning_rate": 1.7431640625e-05, "loss": 0.0807, "step": 6970 }, { "epoch": 2.648956356736243, "grad_norm": 1.6821314096450806, "learning_rate": 1.73828125e-05, "loss": 0.0598, "step": 6980 }, { "epoch": 2.652751423149905, "grad_norm": 1.8624871969223022, "learning_rate": 1.7333984375000002e-05, "loss": 0.0841, "step": 6990 }, { "epoch": 2.656546489563567, "grad_norm": 1.0055333375930786, "learning_rate": 1.728515625e-05, "loss": 0.0853, "step": 7000 }, { "epoch": 2.6603415559772294, "grad_norm": 0.11686267703771591, "learning_rate": 1.7236328125000002e-05, "loss": 0.0455, "step": 7010 }, { "epoch": 2.6641366223908918, "grad_norm": 5.000795841217041, "learning_rate": 1.71875e-05, "loss": 0.1102, "step": 7020 }, { "epoch": 2.667931688804554, "grad_norm": 5.362839221954346, "learning_rate": 1.7138671875e-05, "loss": 0.0864, "step": 7030 }, { "epoch": 2.6717267552182165, "grad_norm": 4.031505584716797, "learning_rate": 1.708984375e-05, "loss": 0.0753, "step": 7040 }, { "epoch": 2.675521821631879, "grad_norm": 3.553187608718872, "learning_rate": 1.7041015625e-05, "loss": 0.0802, "step": 7050 }, { "epoch": 2.6793168880455407, "grad_norm": 2.1504125595092773, "learning_rate": 1.6992187500000002e-05, "loss": 0.0798, "step": 7060 }, { "epoch": 2.683111954459203, "grad_norm": 0.17360809445381165, "learning_rate": 1.6943359375e-05, "loss": 0.1064, "step": 7070 }, { "epoch": 2.6869070208728654, "grad_norm": 0.16311465203762054, "learning_rate": 1.689453125e-05, "loss": 0.1194, "step": 7080 }, { "epoch": 2.6907020872865273, "grad_norm": 3.6088805198669434, "learning_rate": 1.6845703125e-05, "loss": 0.0586, "step": 7090 }, { "epoch": 2.6944971537001896, "grad_norm": 5.143406867980957, "learning_rate": 1.6796875e-05, "loss": 0.0892, "step": 7100 }, { "epoch": 2.698292220113852, "grad_norm": 27.002168655395508, "learning_rate": 1.6748046875000002e-05, "loss": 0.089, "step": 7110 }, { "epoch": 2.7020872865275143, "grad_norm": 1.443231225013733, "learning_rate": 1.669921875e-05, "loss": 0.1328, "step": 7120 }, { "epoch": 2.7058823529411766, "grad_norm": 7.007279396057129, "learning_rate": 1.6650390625e-05, "loss": 0.0652, "step": 7130 }, { "epoch": 2.709677419354839, "grad_norm": 0.25469958782196045, "learning_rate": 1.66015625e-05, "loss": 0.045, "step": 7140 }, { "epoch": 2.713472485768501, "grad_norm": 4.693950653076172, "learning_rate": 1.6552734375e-05, "loss": 0.1245, "step": 7150 }, { "epoch": 2.717267552182163, "grad_norm": 0.3287486732006073, "learning_rate": 1.6503906250000002e-05, "loss": 0.068, "step": 7160 }, { "epoch": 2.7210626185958255, "grad_norm": 9.82812786102295, "learning_rate": 1.6455078125e-05, "loss": 0.0909, "step": 7170 }, { "epoch": 2.7248576850094874, "grad_norm": 14.501320838928223, "learning_rate": 1.6406250000000002e-05, "loss": 0.0972, "step": 7180 }, { "epoch": 2.72865275142315, "grad_norm": 5.130281448364258, "learning_rate": 1.6357421875e-05, "loss": 0.1253, "step": 7190 }, { "epoch": 2.732447817836812, "grad_norm": 3.5541763305664062, "learning_rate": 1.630859375e-05, "loss": 0.0822, "step": 7200 }, { "epoch": 2.7362428842504745, "grad_norm": 0.9670690894126892, "learning_rate": 1.6259765625e-05, "loss": 0.0231, "step": 7210 }, { "epoch": 2.740037950664137, "grad_norm": 0.676513135433197, "learning_rate": 1.62109375e-05, "loss": 0.0972, "step": 7220 }, { "epoch": 2.7438330170777987, "grad_norm": 7.5943217277526855, "learning_rate": 1.6162109375000002e-05, "loss": 0.0989, "step": 7230 }, { "epoch": 2.747628083491461, "grad_norm": 0.20399871468544006, "learning_rate": 1.611328125e-05, "loss": 0.1036, "step": 7240 }, { "epoch": 2.7514231499051234, "grad_norm": 0.43629199266433716, "learning_rate": 1.6064453125e-05, "loss": 0.0311, "step": 7250 }, { "epoch": 2.7552182163187857, "grad_norm": 1.144394040107727, "learning_rate": 1.6015625e-05, "loss": 0.0815, "step": 7260 }, { "epoch": 2.7590132827324476, "grad_norm": 0.06812827289104462, "learning_rate": 1.5966796875e-05, "loss": 0.0539, "step": 7270 }, { "epoch": 2.76280834914611, "grad_norm": 2.913031578063965, "learning_rate": 1.5917968750000002e-05, "loss": 0.0443, "step": 7280 }, { "epoch": 2.7666034155597723, "grad_norm": 2.4026944637298584, "learning_rate": 1.5869140625e-05, "loss": 0.0957, "step": 7290 }, { "epoch": 2.7703984819734346, "grad_norm": 3.89658784866333, "learning_rate": 1.58203125e-05, "loss": 0.1125, "step": 7300 }, { "epoch": 2.774193548387097, "grad_norm": 0.4522351920604706, "learning_rate": 1.5771484375e-05, "loss": 0.0889, "step": 7310 }, { "epoch": 2.777988614800759, "grad_norm": 5.769268989562988, "learning_rate": 1.572265625e-05, "loss": 0.0631, "step": 7320 }, { "epoch": 2.781783681214421, "grad_norm": 1.7276089191436768, "learning_rate": 1.5673828125000002e-05, "loss": 0.091, "step": 7330 }, { "epoch": 2.7855787476280836, "grad_norm": 2.0759644508361816, "learning_rate": 1.5625e-05, "loss": 0.0655, "step": 7340 }, { "epoch": 2.789373814041746, "grad_norm": 0.7582204937934875, "learning_rate": 1.5576171875000002e-05, "loss": 0.0541, "step": 7350 }, { "epoch": 2.793168880455408, "grad_norm": 16.55638885498047, "learning_rate": 1.552734375e-05, "loss": 0.1178, "step": 7360 }, { "epoch": 2.79696394686907, "grad_norm": 0.7026536464691162, "learning_rate": 1.5478515625e-05, "loss": 0.0459, "step": 7370 }, { "epoch": 2.8007590132827325, "grad_norm": 4.089038372039795, "learning_rate": 1.54296875e-05, "loss": 0.0663, "step": 7380 }, { "epoch": 2.804554079696395, "grad_norm": 3.8286547660827637, "learning_rate": 1.5380859375e-05, "loss": 0.1096, "step": 7390 }, { "epoch": 2.808349146110057, "grad_norm": 2.5993642807006836, "learning_rate": 1.5332031250000002e-05, "loss": 0.0685, "step": 7400 }, { "epoch": 2.812144212523719, "grad_norm": 1.0880334377288818, "learning_rate": 1.5283203125e-05, "loss": 0.0631, "step": 7410 }, { "epoch": 2.8159392789373814, "grad_norm": 1.036834478378296, "learning_rate": 1.5234375000000001e-05, "loss": 0.086, "step": 7420 }, { "epoch": 2.8197343453510437, "grad_norm": 5.436180114746094, "learning_rate": 1.5185546875000001e-05, "loss": 0.1121, "step": 7430 }, { "epoch": 2.8235294117647056, "grad_norm": 3.7009427547454834, "learning_rate": 1.513671875e-05, "loss": 0.0764, "step": 7440 }, { "epoch": 2.827324478178368, "grad_norm": 2.5197298526763916, "learning_rate": 1.5087890625e-05, "loss": 0.082, "step": 7450 }, { "epoch": 2.8311195445920303, "grad_norm": 3.15004563331604, "learning_rate": 1.50390625e-05, "loss": 0.112, "step": 7460 }, { "epoch": 2.8349146110056926, "grad_norm": 2.9666614532470703, "learning_rate": 1.4990234375e-05, "loss": 0.0872, "step": 7470 }, { "epoch": 2.838709677419355, "grad_norm": 6.0326385498046875, "learning_rate": 1.4941406250000001e-05, "loss": 0.0817, "step": 7480 }, { "epoch": 2.8425047438330173, "grad_norm": 1.699873685836792, "learning_rate": 1.4892578125000001e-05, "loss": 0.0816, "step": 7490 }, { "epoch": 2.846299810246679, "grad_norm": 0.14119946956634521, "learning_rate": 1.484375e-05, "loss": 0.0725, "step": 7500 }, { "epoch": 2.8500948766603416, "grad_norm": 6.737262725830078, "learning_rate": 1.4794921875e-05, "loss": 0.1205, "step": 7510 }, { "epoch": 2.853889943074004, "grad_norm": 4.460575103759766, "learning_rate": 1.474609375e-05, "loss": 0.123, "step": 7520 }, { "epoch": 2.857685009487666, "grad_norm": 0.09714975953102112, "learning_rate": 1.4697265625000001e-05, "loss": 0.0687, "step": 7530 }, { "epoch": 2.861480075901328, "grad_norm": 3.972470760345459, "learning_rate": 1.4648437500000001e-05, "loss": 0.1089, "step": 7540 }, { "epoch": 2.8652751423149905, "grad_norm": 2.0776712894439697, "learning_rate": 1.4599609375000001e-05, "loss": 0.1318, "step": 7550 }, { "epoch": 2.869070208728653, "grad_norm": 0.21448436379432678, "learning_rate": 1.455078125e-05, "loss": 0.0639, "step": 7560 }, { "epoch": 2.872865275142315, "grad_norm": 0.19727276265621185, "learning_rate": 1.4501953125e-05, "loss": 0.0464, "step": 7570 }, { "epoch": 2.8766603415559775, "grad_norm": 2.9958267211914062, "learning_rate": 1.4453125e-05, "loss": 0.0715, "step": 7580 }, { "epoch": 2.8804554079696394, "grad_norm": 1.823538064956665, "learning_rate": 1.4404296875000001e-05, "loss": 0.0781, "step": 7590 }, { "epoch": 2.8842504743833017, "grad_norm": 2.5351407527923584, "learning_rate": 1.4355468750000001e-05, "loss": 0.0888, "step": 7600 }, { "epoch": 2.888045540796964, "grad_norm": 4.274851322174072, "learning_rate": 1.4306640625000002e-05, "loss": 0.0228, "step": 7610 }, { "epoch": 2.891840607210626, "grad_norm": 4.665604591369629, "learning_rate": 1.42578125e-05, "loss": 0.083, "step": 7620 }, { "epoch": 2.8956356736242883, "grad_norm": 4.373048782348633, "learning_rate": 1.4208984375e-05, "loss": 0.0936, "step": 7630 }, { "epoch": 2.8994307400379506, "grad_norm": 1.5743074417114258, "learning_rate": 1.416015625e-05, "loss": 0.0414, "step": 7640 }, { "epoch": 2.903225806451613, "grad_norm": 2.3043341636657715, "learning_rate": 1.4111328125000001e-05, "loss": 0.0739, "step": 7650 }, { "epoch": 2.9070208728652753, "grad_norm": 2.980686902999878, "learning_rate": 1.4062500000000001e-05, "loss": 0.0755, "step": 7660 }, { "epoch": 2.9108159392789372, "grad_norm": 0.5928072929382324, "learning_rate": 1.4013671875e-05, "loss": 0.116, "step": 7670 }, { "epoch": 2.9146110056925996, "grad_norm": 0.14647921919822693, "learning_rate": 1.396484375e-05, "loss": 0.0367, "step": 7680 }, { "epoch": 2.918406072106262, "grad_norm": 6.466022968292236, "learning_rate": 1.3916015625e-05, "loss": 0.0365, "step": 7690 }, { "epoch": 2.9222011385199242, "grad_norm": 13.139077186584473, "learning_rate": 1.38671875e-05, "loss": 0.1295, "step": 7700 }, { "epoch": 2.925996204933586, "grad_norm": 0.3945586383342743, "learning_rate": 1.3818359375000001e-05, "loss": 0.0559, "step": 7710 }, { "epoch": 2.9297912713472485, "grad_norm": 0.04980861395597458, "learning_rate": 1.3769531250000001e-05, "loss": 0.0485, "step": 7720 }, { "epoch": 2.933586337760911, "grad_norm": 2.388545513153076, "learning_rate": 1.3720703125e-05, "loss": 0.0542, "step": 7730 }, { "epoch": 2.937381404174573, "grad_norm": 2.4082882404327393, "learning_rate": 1.3671875e-05, "loss": 0.0939, "step": 7740 }, { "epoch": 2.9411764705882355, "grad_norm": 4.933741569519043, "learning_rate": 1.3623046875e-05, "loss": 0.1409, "step": 7750 }, { "epoch": 2.9449715370018974, "grad_norm": 5.57550573348999, "learning_rate": 1.3574218750000001e-05, "loss": 0.0646, "step": 7760 }, { "epoch": 2.9487666034155597, "grad_norm": 1.8403911590576172, "learning_rate": 1.3525390625000001e-05, "loss": 0.0694, "step": 7770 }, { "epoch": 2.952561669829222, "grad_norm": 6.1294331550598145, "learning_rate": 1.3476562500000001e-05, "loss": 0.0476, "step": 7780 }, { "epoch": 2.956356736242884, "grad_norm": 0.0652192234992981, "learning_rate": 1.3427734375e-05, "loss": 0.0634, "step": 7790 }, { "epoch": 2.9601518026565463, "grad_norm": 2.2705845832824707, "learning_rate": 1.337890625e-05, "loss": 0.0577, "step": 7800 }, { "epoch": 2.9639468690702087, "grad_norm": 0.12686532735824585, "learning_rate": 1.3330078125e-05, "loss": 0.0948, "step": 7810 }, { "epoch": 2.967741935483871, "grad_norm": 3.2810075283050537, "learning_rate": 1.3281250000000001e-05, "loss": 0.0813, "step": 7820 }, { "epoch": 2.9715370018975333, "grad_norm": 2.2181339263916016, "learning_rate": 1.3232421875000001e-05, "loss": 0.1022, "step": 7830 }, { "epoch": 2.9753320683111957, "grad_norm": 1.6737946271896362, "learning_rate": 1.318359375e-05, "loss": 0.0557, "step": 7840 }, { "epoch": 2.9791271347248576, "grad_norm": 7.780960559844971, "learning_rate": 1.3134765625e-05, "loss": 0.0978, "step": 7850 }, { "epoch": 2.98292220113852, "grad_norm": 8.983189582824707, "learning_rate": 1.30859375e-05, "loss": 0.0601, "step": 7860 }, { "epoch": 2.9867172675521823, "grad_norm": 4.744899272918701, "learning_rate": 1.3037109375e-05, "loss": 0.0418, "step": 7870 }, { "epoch": 2.990512333965844, "grad_norm": 2.1875483989715576, "learning_rate": 1.2988281250000001e-05, "loss": 0.0746, "step": 7880 }, { "epoch": 2.9943074003795065, "grad_norm": 1.506842017173767, "learning_rate": 1.2939453125000001e-05, "loss": 0.0868, "step": 7890 }, { "epoch": 2.998102466793169, "grad_norm": 2.1302731037139893, "learning_rate": 1.2890625e-05, "loss": 0.0687, "step": 7900 }, { "epoch": 3.001897533206831, "grad_norm": 2.632828950881958, "learning_rate": 1.2841796875e-05, "loss": 0.0705, "step": 7910 }, { "epoch": 3.0056925996204935, "grad_norm": 0.15800461173057556, "learning_rate": 1.279296875e-05, "loss": 0.0522, "step": 7920 }, { "epoch": 3.0094876660341554, "grad_norm": 0.13846412301063538, "learning_rate": 1.2744140625e-05, "loss": 0.0363, "step": 7930 }, { "epoch": 3.0132827324478177, "grad_norm": 4.117944717407227, "learning_rate": 1.2695312500000001e-05, "loss": 0.0605, "step": 7940 }, { "epoch": 3.01707779886148, "grad_norm": 1.4927798509597778, "learning_rate": 1.2646484375000001e-05, "loss": 0.0346, "step": 7950 }, { "epoch": 3.0208728652751424, "grad_norm": 4.367966175079346, "learning_rate": 1.259765625e-05, "loss": 0.0458, "step": 7960 }, { "epoch": 3.0246679316888048, "grad_norm": 2.0026087760925293, "learning_rate": 1.2548828125e-05, "loss": 0.0749, "step": 7970 }, { "epoch": 3.0284629981024667, "grad_norm": 2.106546640396118, "learning_rate": 1.25e-05, "loss": 0.065, "step": 7980 }, { "epoch": 3.032258064516129, "grad_norm": 4.122467994689941, "learning_rate": 1.2451171875000001e-05, "loss": 0.0475, "step": 7990 }, { "epoch": 3.0360531309297913, "grad_norm": 0.08205808699131012, "learning_rate": 1.2402343750000001e-05, "loss": 0.0692, "step": 8000 }, { "epoch": 3.0398481973434537, "grad_norm": 1.0389831066131592, "learning_rate": 1.2353515625e-05, "loss": 0.0514, "step": 8010 }, { "epoch": 3.0436432637571156, "grad_norm": 0.1080293357372284, "learning_rate": 1.23046875e-05, "loss": 0.0385, "step": 8020 }, { "epoch": 3.047438330170778, "grad_norm": 0.2515338361263275, "learning_rate": 1.2255859375e-05, "loss": 0.0835, "step": 8030 }, { "epoch": 3.0512333965844403, "grad_norm": 1.1087881326675415, "learning_rate": 1.220703125e-05, "loss": 0.0559, "step": 8040 }, { "epoch": 3.0550284629981026, "grad_norm": 1.1088217496871948, "learning_rate": 1.2158203125000001e-05, "loss": 0.075, "step": 8050 }, { "epoch": 3.0588235294117645, "grad_norm": 3.310959577560425, "learning_rate": 1.2109375000000001e-05, "loss": 0.0596, "step": 8060 }, { "epoch": 3.062618595825427, "grad_norm": 1.186274766921997, "learning_rate": 1.2060546875e-05, "loss": 0.0399, "step": 8070 }, { "epoch": 3.066413662239089, "grad_norm": 3.054225444793701, "learning_rate": 1.201171875e-05, "loss": 0.0352, "step": 8080 }, { "epoch": 3.0702087286527515, "grad_norm": 0.3610187768936157, "learning_rate": 1.1962890625e-05, "loss": 0.0519, "step": 8090 }, { "epoch": 3.074003795066414, "grad_norm": 1.7858855724334717, "learning_rate": 1.19140625e-05, "loss": 0.0712, "step": 8100 }, { "epoch": 3.0777988614800758, "grad_norm": 3.144697666168213, "learning_rate": 1.1865234375000001e-05, "loss": 0.0343, "step": 8110 }, { "epoch": 3.081593927893738, "grad_norm": 1.743668556213379, "learning_rate": 1.1816406250000001e-05, "loss": 0.0611, "step": 8120 }, { "epoch": 3.0853889943074004, "grad_norm": 0.6149533987045288, "learning_rate": 1.1767578125e-05, "loss": 0.0512, "step": 8130 }, { "epoch": 3.0891840607210628, "grad_norm": 6.247795581817627, "learning_rate": 1.171875e-05, "loss": 0.0741, "step": 8140 }, { "epoch": 3.0929791271347247, "grad_norm": 0.8566815853118896, "learning_rate": 1.1669921875e-05, "loss": 0.0699, "step": 8150 }, { "epoch": 3.096774193548387, "grad_norm": 3.2794229984283447, "learning_rate": 1.162109375e-05, "loss": 0.0296, "step": 8160 }, { "epoch": 3.1005692599620494, "grad_norm": 0.10005365312099457, "learning_rate": 1.1572265625000001e-05, "loss": 0.0645, "step": 8170 }, { "epoch": 3.1043643263757117, "grad_norm": 2.8992691040039062, "learning_rate": 1.15234375e-05, "loss": 0.0456, "step": 8180 }, { "epoch": 3.108159392789374, "grad_norm": 3.6778674125671387, "learning_rate": 1.1474609375e-05, "loss": 0.0351, "step": 8190 }, { "epoch": 3.111954459203036, "grad_norm": 1.5398664474487305, "learning_rate": 1.142578125e-05, "loss": 0.042, "step": 8200 }, { "epoch": 3.1157495256166983, "grad_norm": 0.05135444924235344, "learning_rate": 1.1376953125e-05, "loss": 0.0478, "step": 8210 }, { "epoch": 3.1195445920303606, "grad_norm": 0.6804483532905579, "learning_rate": 1.1328125000000001e-05, "loss": 0.058, "step": 8220 }, { "epoch": 3.123339658444023, "grad_norm": 0.10011663287878036, "learning_rate": 1.1279296875000001e-05, "loss": 0.0456, "step": 8230 }, { "epoch": 3.127134724857685, "grad_norm": 0.466981440782547, "learning_rate": 1.123046875e-05, "loss": 0.0449, "step": 8240 }, { "epoch": 3.130929791271347, "grad_norm": 2.163849353790283, "learning_rate": 1.1181640625e-05, "loss": 0.0595, "step": 8250 }, { "epoch": 3.1347248576850095, "grad_norm": 1.1013680696487427, "learning_rate": 1.11328125e-05, "loss": 0.0708, "step": 8260 }, { "epoch": 3.138519924098672, "grad_norm": 8.969820022583008, "learning_rate": 1.1083984375e-05, "loss": 0.064, "step": 8270 }, { "epoch": 3.1423149905123338, "grad_norm": 1.1106621026992798, "learning_rate": 1.1035156250000001e-05, "loss": 0.1007, "step": 8280 }, { "epoch": 3.146110056925996, "grad_norm": 0.1508377343416214, "learning_rate": 1.0986328125000001e-05, "loss": 0.0464, "step": 8290 }, { "epoch": 3.1499051233396584, "grad_norm": 0.07330877333879471, "learning_rate": 1.09375e-05, "loss": 0.0797, "step": 8300 }, { "epoch": 3.153700189753321, "grad_norm": 1.6159915924072266, "learning_rate": 1.0888671875e-05, "loss": 0.0527, "step": 8310 }, { "epoch": 3.157495256166983, "grad_norm": 0.5196408629417419, "learning_rate": 1.083984375e-05, "loss": 0.0433, "step": 8320 }, { "epoch": 3.161290322580645, "grad_norm": 2.486041307449341, "learning_rate": 1.0791015625e-05, "loss": 0.0651, "step": 8330 }, { "epoch": 3.1650853889943074, "grad_norm": 1.0713788270950317, "learning_rate": 1.0742187500000001e-05, "loss": 0.0695, "step": 8340 }, { "epoch": 3.1688804554079697, "grad_norm": 0.19154168665409088, "learning_rate": 1.0693359375e-05, "loss": 0.0364, "step": 8350 }, { "epoch": 3.172675521821632, "grad_norm": 0.31223466992378235, "learning_rate": 1.064453125e-05, "loss": 0.0267, "step": 8360 }, { "epoch": 3.176470588235294, "grad_norm": 0.7767817378044128, "learning_rate": 1.0595703125e-05, "loss": 0.0635, "step": 8370 }, { "epoch": 3.1802656546489563, "grad_norm": 2.4257445335388184, "learning_rate": 1.0546875e-05, "loss": 0.0588, "step": 8380 }, { "epoch": 3.1840607210626186, "grad_norm": 1.2349954843521118, "learning_rate": 1.0498046875000001e-05, "loss": 0.0557, "step": 8390 }, { "epoch": 3.187855787476281, "grad_norm": 3.209284543991089, "learning_rate": 1.0449218750000001e-05, "loss": 0.0459, "step": 8400 }, { "epoch": 3.191650853889943, "grad_norm": 0.16265904903411865, "learning_rate": 1.0400390625e-05, "loss": 0.0525, "step": 8410 }, { "epoch": 3.195445920303605, "grad_norm": 0.6664568781852722, "learning_rate": 1.03515625e-05, "loss": 0.0727, "step": 8420 }, { "epoch": 3.1992409867172675, "grad_norm": 0.9481377005577087, "learning_rate": 1.0302734375e-05, "loss": 0.0215, "step": 8430 }, { "epoch": 3.20303605313093, "grad_norm": 5.600297451019287, "learning_rate": 1.025390625e-05, "loss": 0.0385, "step": 8440 }, { "epoch": 3.206831119544592, "grad_norm": 0.15000663697719574, "learning_rate": 1.0205078125000001e-05, "loss": 0.0659, "step": 8450 }, { "epoch": 3.210626185958254, "grad_norm": 0.6691407561302185, "learning_rate": 1.0156250000000001e-05, "loss": 0.0666, "step": 8460 }, { "epoch": 3.2144212523719164, "grad_norm": 1.3882899284362793, "learning_rate": 1.0107421875e-05, "loss": 0.0815, "step": 8470 }, { "epoch": 3.218216318785579, "grad_norm": 1.0314580202102661, "learning_rate": 1.005859375e-05, "loss": 0.0178, "step": 8480 }, { "epoch": 3.222011385199241, "grad_norm": 3.9537134170532227, "learning_rate": 1.0009765625e-05, "loss": 0.0631, "step": 8490 }, { "epoch": 3.225806451612903, "grad_norm": 5.446588039398193, "learning_rate": 9.9609375e-06, "loss": 0.0548, "step": 8500 }, { "epoch": 3.2296015180265654, "grad_norm": 8.026607513427734, "learning_rate": 9.912109375000001e-06, "loss": 0.0353, "step": 8510 }, { "epoch": 3.2333965844402277, "grad_norm": 0.1389143019914627, "learning_rate": 9.863281250000001e-06, "loss": 0.0419, "step": 8520 }, { "epoch": 3.23719165085389, "grad_norm": 1.255216121673584, "learning_rate": 9.814453125e-06, "loss": 0.0697, "step": 8530 }, { "epoch": 3.2409867172675524, "grad_norm": 4.600146770477295, "learning_rate": 9.765625e-06, "loss": 0.0879, "step": 8540 }, { "epoch": 3.2447817836812143, "grad_norm": 0.09613824635744095, "learning_rate": 9.716796875e-06, "loss": 0.0122, "step": 8550 }, { "epoch": 3.2485768500948766, "grad_norm": 1.0265446901321411, "learning_rate": 9.66796875e-06, "loss": 0.0227, "step": 8560 }, { "epoch": 3.252371916508539, "grad_norm": 2.185931444168091, "learning_rate": 9.619140625000001e-06, "loss": 0.1162, "step": 8570 }, { "epoch": 3.2561669829222013, "grad_norm": 0.1482323259115219, "learning_rate": 9.5703125e-06, "loss": 0.0581, "step": 8580 }, { "epoch": 3.259962049335863, "grad_norm": 0.17460452020168304, "learning_rate": 9.521484375e-06, "loss": 0.0399, "step": 8590 }, { "epoch": 3.2637571157495255, "grad_norm": 1.6274187564849854, "learning_rate": 9.47265625e-06, "loss": 0.0537, "step": 8600 }, { "epoch": 3.267552182163188, "grad_norm": 8.227033615112305, "learning_rate": 9.423828125e-06, "loss": 0.0646, "step": 8610 }, { "epoch": 3.27134724857685, "grad_norm": 0.08734069019556046, "learning_rate": 9.375000000000001e-06, "loss": 0.0675, "step": 8620 }, { "epoch": 3.2751423149905126, "grad_norm": 0.5700662732124329, "learning_rate": 9.326171875000001e-06, "loss": 0.0744, "step": 8630 }, { "epoch": 3.2789373814041745, "grad_norm": 2.089008092880249, "learning_rate": 9.27734375e-06, "loss": 0.0812, "step": 8640 }, { "epoch": 3.282732447817837, "grad_norm": 0.11990799009799957, "learning_rate": 9.228515625e-06, "loss": 0.071, "step": 8650 }, { "epoch": 3.286527514231499, "grad_norm": 0.5663464665412903, "learning_rate": 9.1796875e-06, "loss": 0.0279, "step": 8660 }, { "epoch": 3.2903225806451615, "grad_norm": 0.8847103118896484, "learning_rate": 9.130859375e-06, "loss": 0.0473, "step": 8670 }, { "epoch": 3.2941176470588234, "grad_norm": 0.08891147375106812, "learning_rate": 9.082031250000001e-06, "loss": 0.041, "step": 8680 }, { "epoch": 3.2979127134724857, "grad_norm": 0.0875004231929779, "learning_rate": 9.033203125000001e-06, "loss": 0.0284, "step": 8690 }, { "epoch": 3.301707779886148, "grad_norm": 0.353773832321167, "learning_rate": 8.984375e-06, "loss": 0.0451, "step": 8700 }, { "epoch": 3.3055028462998104, "grad_norm": 0.03987530991435051, "learning_rate": 8.935546875e-06, "loss": 0.0803, "step": 8710 }, { "epoch": 3.3092979127134727, "grad_norm": 2.087677001953125, "learning_rate": 8.88671875e-06, "loss": 0.0257, "step": 8720 }, { "epoch": 3.3130929791271346, "grad_norm": 4.051992893218994, "learning_rate": 8.837890625e-06, "loss": 0.0345, "step": 8730 }, { "epoch": 3.316888045540797, "grad_norm": 3.694368362426758, "learning_rate": 8.789062500000001e-06, "loss": 0.0824, "step": 8740 }, { "epoch": 3.3206831119544593, "grad_norm": 0.09131748974323273, "learning_rate": 8.740234375e-06, "loss": 0.0295, "step": 8750 }, { "epoch": 3.324478178368121, "grad_norm": 0.05908443033695221, "learning_rate": 8.69140625e-06, "loss": 0.0282, "step": 8760 }, { "epoch": 3.3282732447817835, "grad_norm": 1.863980770111084, "learning_rate": 8.642578125e-06, "loss": 0.0442, "step": 8770 }, { "epoch": 3.332068311195446, "grad_norm": 1.2207703590393066, "learning_rate": 8.59375e-06, "loss": 0.0316, "step": 8780 }, { "epoch": 3.3358633776091082, "grad_norm": 2.562156915664673, "learning_rate": 8.544921875e-06, "loss": 0.0598, "step": 8790 }, { "epoch": 3.3396584440227706, "grad_norm": 5.533409595489502, "learning_rate": 8.496093750000001e-06, "loss": 0.0432, "step": 8800 }, { "epoch": 3.3434535104364325, "grad_norm": 0.47492659091949463, "learning_rate": 8.447265625e-06, "loss": 0.0528, "step": 8810 }, { "epoch": 3.347248576850095, "grad_norm": 1.0108855962753296, "learning_rate": 8.3984375e-06, "loss": 0.0552, "step": 8820 }, { "epoch": 3.351043643263757, "grad_norm": 1.780705451965332, "learning_rate": 8.349609375e-06, "loss": 0.0252, "step": 8830 }, { "epoch": 3.3548387096774195, "grad_norm": 0.3152208924293518, "learning_rate": 8.30078125e-06, "loss": 0.0915, "step": 8840 }, { "epoch": 3.3586337760910814, "grad_norm": 1.9720813035964966, "learning_rate": 8.251953125000001e-06, "loss": 0.0571, "step": 8850 }, { "epoch": 3.3624288425047437, "grad_norm": 0.5636972784996033, "learning_rate": 8.203125000000001e-06, "loss": 0.0716, "step": 8860 }, { "epoch": 3.366223908918406, "grad_norm": 9.523944854736328, "learning_rate": 8.154296875e-06, "loss": 0.0649, "step": 8870 }, { "epoch": 3.3700189753320684, "grad_norm": 1.868201732635498, "learning_rate": 8.10546875e-06, "loss": 0.1055, "step": 8880 }, { "epoch": 3.3738140417457307, "grad_norm": 4.064790725708008, "learning_rate": 8.056640625e-06, "loss": 0.0681, "step": 8890 }, { "epoch": 3.3776091081593926, "grad_norm": 5.854636192321777, "learning_rate": 8.0078125e-06, "loss": 0.0755, "step": 8900 }, { "epoch": 3.381404174573055, "grad_norm": 0.47955596446990967, "learning_rate": 7.958984375000001e-06, "loss": 0.0832, "step": 8910 }, { "epoch": 3.3851992409867173, "grad_norm": 0.48627012968063354, "learning_rate": 7.91015625e-06, "loss": 0.0487, "step": 8920 }, { "epoch": 3.3889943074003797, "grad_norm": 1.4986870288848877, "learning_rate": 7.861328125e-06, "loss": 0.0769, "step": 8930 }, { "epoch": 3.3927893738140416, "grad_norm": 1.139615774154663, "learning_rate": 7.8125e-06, "loss": 0.0238, "step": 8940 }, { "epoch": 3.396584440227704, "grad_norm": 0.17134952545166016, "learning_rate": 7.763671875e-06, "loss": 0.072, "step": 8950 }, { "epoch": 3.4003795066413662, "grad_norm": 0.15060165524482727, "learning_rate": 7.71484375e-06, "loss": 0.0607, "step": 8960 }, { "epoch": 3.4041745730550286, "grad_norm": 1.0973819494247437, "learning_rate": 7.666015625000001e-06, "loss": 0.0914, "step": 8970 }, { "epoch": 3.407969639468691, "grad_norm": 4.7881951332092285, "learning_rate": 7.6171875000000005e-06, "loss": 0.0515, "step": 8980 }, { "epoch": 3.411764705882353, "grad_norm": 2.9025986194610596, "learning_rate": 7.568359375e-06, "loss": 0.0576, "step": 8990 }, { "epoch": 3.415559772296015, "grad_norm": 0.07781478762626648, "learning_rate": 7.51953125e-06, "loss": 0.0318, "step": 9000 }, { "epoch": 3.4193548387096775, "grad_norm": 2.8141448497772217, "learning_rate": 7.4707031250000005e-06, "loss": 0.0598, "step": 9010 }, { "epoch": 3.42314990512334, "grad_norm": 1.2371045351028442, "learning_rate": 7.421875e-06, "loss": 0.1014, "step": 9020 }, { "epoch": 3.4269449715370017, "grad_norm": 0.11280115693807602, "learning_rate": 7.373046875e-06, "loss": 0.0571, "step": 9030 }, { "epoch": 3.430740037950664, "grad_norm": 0.07071410119533539, "learning_rate": 7.3242187500000006e-06, "loss": 0.0289, "step": 9040 }, { "epoch": 3.4345351043643264, "grad_norm": 0.07948953658342361, "learning_rate": 7.275390625e-06, "loss": 0.0328, "step": 9050 }, { "epoch": 3.4383301707779887, "grad_norm": 6.166849613189697, "learning_rate": 7.2265625e-06, "loss": 0.0501, "step": 9060 }, { "epoch": 3.442125237191651, "grad_norm": 0.3815774619579315, "learning_rate": 7.177734375000001e-06, "loss": 0.0449, "step": 9070 }, { "epoch": 3.445920303605313, "grad_norm": 0.21274378895759583, "learning_rate": 7.12890625e-06, "loss": 0.0871, "step": 9080 }, { "epoch": 3.4497153700189753, "grad_norm": 0.5041061043739319, "learning_rate": 7.080078125e-06, "loss": 0.0451, "step": 9090 }, { "epoch": 3.4535104364326377, "grad_norm": 2.4566073417663574, "learning_rate": 7.031250000000001e-06, "loss": 0.0622, "step": 9100 }, { "epoch": 3.4573055028462996, "grad_norm": 5.31998872756958, "learning_rate": 6.982421875e-06, "loss": 0.0545, "step": 9110 }, { "epoch": 3.461100569259962, "grad_norm": 0.2531034052371979, "learning_rate": 6.93359375e-06, "loss": 0.0449, "step": 9120 }, { "epoch": 3.4648956356736242, "grad_norm": 0.03640067204833031, "learning_rate": 6.884765625000001e-06, "loss": 0.0944, "step": 9130 }, { "epoch": 3.4686907020872866, "grad_norm": 0.9717852473258972, "learning_rate": 6.8359375e-06, "loss": 0.0165, "step": 9140 }, { "epoch": 3.472485768500949, "grad_norm": 1.4924548864364624, "learning_rate": 6.7871093750000004e-06, "loss": 0.069, "step": 9150 }, { "epoch": 3.476280834914611, "grad_norm": 2.620271682739258, "learning_rate": 6.738281250000001e-06, "loss": 0.0967, "step": 9160 }, { "epoch": 3.480075901328273, "grad_norm": 2.279548406600952, "learning_rate": 6.689453125e-06, "loss": 0.0257, "step": 9170 }, { "epoch": 3.4838709677419355, "grad_norm": 0.08608423173427582, "learning_rate": 6.6406250000000005e-06, "loss": 0.0359, "step": 9180 }, { "epoch": 3.487666034155598, "grad_norm": 5.201995849609375, "learning_rate": 6.591796875e-06, "loss": 0.0349, "step": 9190 }, { "epoch": 3.4914611005692597, "grad_norm": 0.6848796606063843, "learning_rate": 6.54296875e-06, "loss": 0.0473, "step": 9200 }, { "epoch": 3.495256166982922, "grad_norm": 1.0673704147338867, "learning_rate": 6.4941406250000005e-06, "loss": 0.0751, "step": 9210 }, { "epoch": 3.4990512333965844, "grad_norm": 6.374655723571777, "learning_rate": 6.4453125e-06, "loss": 0.0672, "step": 9220 }, { "epoch": 3.5028462998102468, "grad_norm": 3.0670387744903564, "learning_rate": 6.396484375e-06, "loss": 0.1047, "step": 9230 }, { "epoch": 3.506641366223909, "grad_norm": 2.0058538913726807, "learning_rate": 6.3476562500000006e-06, "loss": 0.0571, "step": 9240 }, { "epoch": 3.510436432637571, "grad_norm": 0.8808121681213379, "learning_rate": 6.298828125e-06, "loss": 0.0742, "step": 9250 }, { "epoch": 3.5142314990512333, "grad_norm": 0.1013035699725151, "learning_rate": 6.25e-06, "loss": 0.0506, "step": 9260 }, { "epoch": 3.5180265654648957, "grad_norm": 1.1379400491714478, "learning_rate": 6.201171875000001e-06, "loss": 0.0466, "step": 9270 }, { "epoch": 3.521821631878558, "grad_norm": 0.44777366518974304, "learning_rate": 6.15234375e-06, "loss": 0.0425, "step": 9280 }, { "epoch": 3.52561669829222, "grad_norm": 0.6099011301994324, "learning_rate": 6.103515625e-06, "loss": 0.0368, "step": 9290 }, { "epoch": 3.5294117647058822, "grad_norm": 10.134333610534668, "learning_rate": 6.054687500000001e-06, "loss": 0.0459, "step": 9300 }, { "epoch": 3.5332068311195446, "grad_norm": 10.301962852478027, "learning_rate": 6.005859375e-06, "loss": 0.0712, "step": 9310 }, { "epoch": 3.537001897533207, "grad_norm": 2.240419864654541, "learning_rate": 5.95703125e-06, "loss": 0.0496, "step": 9320 }, { "epoch": 3.5407969639468693, "grad_norm": 9.403803825378418, "learning_rate": 5.908203125000001e-06, "loss": 0.0551, "step": 9330 }, { "epoch": 3.544592030360531, "grad_norm": 0.0765363797545433, "learning_rate": 5.859375e-06, "loss": 0.0382, "step": 9340 }, { "epoch": 3.5483870967741935, "grad_norm": 0.6216185688972473, "learning_rate": 5.810546875e-06, "loss": 0.0723, "step": 9350 }, { "epoch": 3.552182163187856, "grad_norm": 6.577167987823486, "learning_rate": 5.76171875e-06, "loss": 0.0626, "step": 9360 }, { "epoch": 3.555977229601518, "grad_norm": 0.15332098305225372, "learning_rate": 5.712890625e-06, "loss": 0.0419, "step": 9370 }, { "epoch": 3.55977229601518, "grad_norm": 3.2923789024353027, "learning_rate": 5.6640625000000005e-06, "loss": 0.0894, "step": 9380 }, { "epoch": 3.5635673624288424, "grad_norm": 1.0206191539764404, "learning_rate": 5.615234375e-06, "loss": 0.0477, "step": 9390 }, { "epoch": 3.5673624288425048, "grad_norm": 5.454959869384766, "learning_rate": 5.56640625e-06, "loss": 0.0315, "step": 9400 }, { "epoch": 3.571157495256167, "grad_norm": 0.3191007673740387, "learning_rate": 5.5175781250000005e-06, "loss": 0.068, "step": 9410 }, { "epoch": 3.5749525616698294, "grad_norm": 12.383304595947266, "learning_rate": 5.46875e-06, "loss": 0.0444, "step": 9420 }, { "epoch": 3.5787476280834913, "grad_norm": 1.9023758172988892, "learning_rate": 5.419921875e-06, "loss": 0.0942, "step": 9430 }, { "epoch": 3.5825426944971537, "grad_norm": 0.06706677377223969, "learning_rate": 5.3710937500000005e-06, "loss": 0.0512, "step": 9440 }, { "epoch": 3.586337760910816, "grad_norm": 0.32390040159225464, "learning_rate": 5.322265625e-06, "loss": 0.0603, "step": 9450 }, { "epoch": 3.590132827324478, "grad_norm": 1.5318775177001953, "learning_rate": 5.2734375e-06, "loss": 0.0491, "step": 9460 }, { "epoch": 3.5939278937381403, "grad_norm": 0.5909900665283203, "learning_rate": 5.2246093750000006e-06, "loss": 0.0294, "step": 9470 }, { "epoch": 3.5977229601518026, "grad_norm": 1.5226948261260986, "learning_rate": 5.17578125e-06, "loss": 0.0621, "step": 9480 }, { "epoch": 3.601518026565465, "grad_norm": 0.24643893539905548, "learning_rate": 5.126953125e-06, "loss": 0.0293, "step": 9490 }, { "epoch": 3.6053130929791273, "grad_norm": 7.143110752105713, "learning_rate": 5.078125000000001e-06, "loss": 0.0592, "step": 9500 }, { "epoch": 3.6091081593927896, "grad_norm": 3.5135350227355957, "learning_rate": 5.029296875e-06, "loss": 0.0705, "step": 9510 }, { "epoch": 3.6129032258064515, "grad_norm": 4.653140544891357, "learning_rate": 4.98046875e-06, "loss": 0.0624, "step": 9520 }, { "epoch": 3.616698292220114, "grad_norm": 0.044525645673274994, "learning_rate": 4.931640625000001e-06, "loss": 0.0449, "step": 9530 }, { "epoch": 3.620493358633776, "grad_norm": 7.338439464569092, "learning_rate": 4.8828125e-06, "loss": 0.0536, "step": 9540 }, { "epoch": 3.624288425047438, "grad_norm": 0.4086396396160126, "learning_rate": 4.833984375e-06, "loss": 0.038, "step": 9550 }, { "epoch": 3.6280834914611004, "grad_norm": 0.05038388445973396, "learning_rate": 4.78515625e-06, "loss": 0.0458, "step": 9560 }, { "epoch": 3.6318785578747628, "grad_norm": 0.09961717575788498, "learning_rate": 4.736328125e-06, "loss": 0.0468, "step": 9570 }, { "epoch": 3.635673624288425, "grad_norm": 0.27485185861587524, "learning_rate": 4.6875000000000004e-06, "loss": 0.0675, "step": 9580 }, { "epoch": 3.6394686907020875, "grad_norm": 4.295794486999512, "learning_rate": 4.638671875e-06, "loss": 0.0519, "step": 9590 }, { "epoch": 3.64326375711575, "grad_norm": 1.9907684326171875, "learning_rate": 4.58984375e-06, "loss": 0.0422, "step": 9600 }, { "epoch": 3.6470588235294117, "grad_norm": 0.12039614468812943, "learning_rate": 4.5410156250000005e-06, "loss": 0.044, "step": 9610 }, { "epoch": 3.650853889943074, "grad_norm": 0.4942443072795868, "learning_rate": 4.4921875e-06, "loss": 0.0828, "step": 9620 }, { "epoch": 3.6546489563567364, "grad_norm": 0.8744149804115295, "learning_rate": 4.443359375e-06, "loss": 0.0514, "step": 9630 }, { "epoch": 3.6584440227703983, "grad_norm": 1.8012325763702393, "learning_rate": 4.3945312500000005e-06, "loss": 0.0389, "step": 9640 }, { "epoch": 3.6622390891840606, "grad_norm": 0.09957607835531235, "learning_rate": 4.345703125e-06, "loss": 0.0512, "step": 9650 }, { "epoch": 3.666034155597723, "grad_norm": 0.0749269425868988, "learning_rate": 4.296875e-06, "loss": 0.0278, "step": 9660 }, { "epoch": 3.6698292220113853, "grad_norm": 0.04859253391623497, "learning_rate": 4.2480468750000006e-06, "loss": 0.0813, "step": 9670 }, { "epoch": 3.6736242884250476, "grad_norm": 3.236546277999878, "learning_rate": 4.19921875e-06, "loss": 0.0408, "step": 9680 }, { "epoch": 3.6774193548387095, "grad_norm": 2.782500743865967, "learning_rate": 4.150390625e-06, "loss": 0.0365, "step": 9690 }, { "epoch": 3.681214421252372, "grad_norm": 0.2516065835952759, "learning_rate": 4.101562500000001e-06, "loss": 0.0541, "step": 9700 }, { "epoch": 3.685009487666034, "grad_norm": 0.0802445337176323, "learning_rate": 4.052734375e-06, "loss": 0.0296, "step": 9710 }, { "epoch": 3.6888045540796965, "grad_norm": 0.7485657930374146, "learning_rate": 4.00390625e-06, "loss": 0.0194, "step": 9720 }, { "epoch": 3.6925996204933584, "grad_norm": 0.05877687409520149, "learning_rate": 3.955078125e-06, "loss": 0.0547, "step": 9730 }, { "epoch": 3.6963946869070208, "grad_norm": 3.6818785667419434, "learning_rate": 3.90625e-06, "loss": 0.0801, "step": 9740 }, { "epoch": 3.700189753320683, "grad_norm": 0.22303463518619537, "learning_rate": 3.857421875e-06, "loss": 0.0326, "step": 9750 }, { "epoch": 3.7039848197343455, "grad_norm": 0.16665808856487274, "learning_rate": 3.8085937500000002e-06, "loss": 0.0664, "step": 9760 }, { "epoch": 3.707779886148008, "grad_norm": 0.2113623172044754, "learning_rate": 3.759765625e-06, "loss": 0.0495, "step": 9770 }, { "epoch": 3.7115749525616697, "grad_norm": 1.9400161504745483, "learning_rate": 3.7109375e-06, "loss": 0.062, "step": 9780 }, { "epoch": 3.715370018975332, "grad_norm": 2.147211790084839, "learning_rate": 3.6621093750000003e-06, "loss": 0.0408, "step": 9790 }, { "epoch": 3.7191650853889944, "grad_norm": 0.17818136513233185, "learning_rate": 3.61328125e-06, "loss": 0.0376, "step": 9800 }, { "epoch": 3.7229601518026563, "grad_norm": 0.2646294832229614, "learning_rate": 3.564453125e-06, "loss": 0.0488, "step": 9810 }, { "epoch": 3.7267552182163186, "grad_norm": 0.07648167759180069, "learning_rate": 3.5156250000000003e-06, "loss": 0.0618, "step": 9820 }, { "epoch": 3.730550284629981, "grad_norm": 4.988431930541992, "learning_rate": 3.466796875e-06, "loss": 0.0438, "step": 9830 }, { "epoch": 3.7343453510436433, "grad_norm": 4.025431156158447, "learning_rate": 3.41796875e-06, "loss": 0.0663, "step": 9840 }, { "epoch": 3.7381404174573056, "grad_norm": 0.7877894043922424, "learning_rate": 3.3691406250000004e-06, "loss": 0.0261, "step": 9850 }, { "epoch": 3.741935483870968, "grad_norm": 1.7883660793304443, "learning_rate": 3.3203125000000002e-06, "loss": 0.0481, "step": 9860 }, { "epoch": 3.74573055028463, "grad_norm": 2.136960029602051, "learning_rate": 3.271484375e-06, "loss": 0.052, "step": 9870 }, { "epoch": 3.749525616698292, "grad_norm": 0.9067153930664062, "learning_rate": 3.22265625e-06, "loss": 0.0567, "step": 9880 }, { "epoch": 3.7533206831119545, "grad_norm": 1.2437059879302979, "learning_rate": 3.1738281250000003e-06, "loss": 0.053, "step": 9890 }, { "epoch": 3.7571157495256164, "grad_norm": 2.1223294734954834, "learning_rate": 3.125e-06, "loss": 0.0484, "step": 9900 }, { "epoch": 3.760910815939279, "grad_norm": 8.40434455871582, "learning_rate": 3.076171875e-06, "loss": 0.0451, "step": 9910 }, { "epoch": 3.764705882352941, "grad_norm": 2.565584421157837, "learning_rate": 3.0273437500000003e-06, "loss": 0.0589, "step": 9920 }, { "epoch": 3.7685009487666035, "grad_norm": 5.559597492218018, "learning_rate": 2.978515625e-06, "loss": 0.0396, "step": 9930 }, { "epoch": 3.772296015180266, "grad_norm": 0.5843867659568787, "learning_rate": 2.9296875e-06, "loss": 0.0682, "step": 9940 }, { "epoch": 3.776091081593928, "grad_norm": 1.6344566345214844, "learning_rate": 2.880859375e-06, "loss": 0.0892, "step": 9950 }, { "epoch": 3.77988614800759, "grad_norm": 5.6130051612854, "learning_rate": 2.8320312500000002e-06, "loss": 0.0439, "step": 9960 }, { "epoch": 3.7836812144212524, "grad_norm": 3.700528144836426, "learning_rate": 2.783203125e-06, "loss": 0.0228, "step": 9970 }, { "epoch": 3.7874762808349147, "grad_norm": 2.797687530517578, "learning_rate": 2.734375e-06, "loss": 0.0247, "step": 9980 }, { "epoch": 3.7912713472485766, "grad_norm": 1.7192658185958862, "learning_rate": 2.6855468750000003e-06, "loss": 0.0792, "step": 9990 }, { "epoch": 3.795066413662239, "grad_norm": 0.0573776513338089, "learning_rate": 2.63671875e-06, "loss": 0.0136, "step": 10000 }, { "epoch": 3.7988614800759013, "grad_norm": 0.07321004569530487, "learning_rate": 2.587890625e-06, "loss": 0.0461, "step": 10010 }, { "epoch": 3.8026565464895636, "grad_norm": 0.045114945620298386, "learning_rate": 2.5390625000000003e-06, "loss": 0.0658, "step": 10020 }, { "epoch": 3.806451612903226, "grad_norm": 0.3899228870868683, "learning_rate": 2.490234375e-06, "loss": 0.0389, "step": 10030 }, { "epoch": 3.8102466793168883, "grad_norm": 0.6319021582603455, "learning_rate": 2.44140625e-06, "loss": 0.0247, "step": 10040 }, { "epoch": 3.81404174573055, "grad_norm": 1.4026541709899902, "learning_rate": 2.392578125e-06, "loss": 0.0161, "step": 10050 }, { "epoch": 3.8178368121442126, "grad_norm": 4.106344699859619, "learning_rate": 2.3437500000000002e-06, "loss": 0.042, "step": 10060 }, { "epoch": 3.821631878557875, "grad_norm": 0.5673054456710815, "learning_rate": 2.294921875e-06, "loss": 0.0589, "step": 10070 }, { "epoch": 3.825426944971537, "grad_norm": 0.057744644582271576, "learning_rate": 2.24609375e-06, "loss": 0.0305, "step": 10080 }, { "epoch": 3.829222011385199, "grad_norm": 3.3453450202941895, "learning_rate": 2.1972656250000003e-06, "loss": 0.0317, "step": 10090 }, { "epoch": 3.8330170777988615, "grad_norm": 0.08820886164903641, "learning_rate": 2.1484375e-06, "loss": 0.0355, "step": 10100 }, { "epoch": 3.836812144212524, "grad_norm": 1.522764801979065, "learning_rate": 2.099609375e-06, "loss": 0.0496, "step": 10110 }, { "epoch": 3.840607210626186, "grad_norm": 0.9732184410095215, "learning_rate": 2.0507812500000003e-06, "loss": 0.0303, "step": 10120 }, { "epoch": 3.844402277039848, "grad_norm": 0.1131846010684967, "learning_rate": 2.001953125e-06, "loss": 0.0359, "step": 10130 }, { "epoch": 3.8481973434535104, "grad_norm": 3.4666688442230225, "learning_rate": 1.953125e-06, "loss": 0.0542, "step": 10140 }, { "epoch": 3.8519924098671727, "grad_norm": 3.6389381885528564, "learning_rate": 1.9042968750000001e-06, "loss": 0.0328, "step": 10150 }, { "epoch": 3.855787476280835, "grad_norm": 0.7695565819740295, "learning_rate": 1.85546875e-06, "loss": 0.0294, "step": 10160 }, { "epoch": 3.859582542694497, "grad_norm": 5.1775593757629395, "learning_rate": 1.806640625e-06, "loss": 0.055, "step": 10170 }, { "epoch": 3.8633776091081593, "grad_norm": 0.46061795949935913, "learning_rate": 1.7578125000000002e-06, "loss": 0.0376, "step": 10180 }, { "epoch": 3.8671726755218216, "grad_norm": 0.16866852343082428, "learning_rate": 1.708984375e-06, "loss": 0.092, "step": 10190 }, { "epoch": 3.870967741935484, "grad_norm": 2.495349168777466, "learning_rate": 1.6601562500000001e-06, "loss": 0.0205, "step": 10200 }, { "epoch": 3.8747628083491463, "grad_norm": 4.127594470977783, "learning_rate": 1.611328125e-06, "loss": 0.0376, "step": 10210 }, { "epoch": 3.878557874762808, "grad_norm": 0.0868837833404541, "learning_rate": 1.5625e-06, "loss": 0.0715, "step": 10220 }, { "epoch": 3.8823529411764706, "grad_norm": 2.7866268157958984, "learning_rate": 1.5136718750000002e-06, "loss": 0.0623, "step": 10230 }, { "epoch": 3.886148007590133, "grad_norm": 0.5652477741241455, "learning_rate": 1.46484375e-06, "loss": 0.0521, "step": 10240 }, { "epoch": 3.889943074003795, "grad_norm": 0.13568060100078583, "learning_rate": 1.4160156250000001e-06, "loss": 0.0373, "step": 10250 }, { "epoch": 3.893738140417457, "grad_norm": 7.213637828826904, "learning_rate": 1.3671875e-06, "loss": 0.1189, "step": 10260 }, { "epoch": 3.8975332068311195, "grad_norm": 4.795431613922119, "learning_rate": 1.318359375e-06, "loss": 0.0368, "step": 10270 }, { "epoch": 3.901328273244782, "grad_norm": 4.8751220703125, "learning_rate": 1.2695312500000002e-06, "loss": 0.0972, "step": 10280 }, { "epoch": 3.905123339658444, "grad_norm": 0.5513148307800293, "learning_rate": 1.220703125e-06, "loss": 0.0287, "step": 10290 }, { "epoch": 3.9089184060721065, "grad_norm": 0.16232678294181824, "learning_rate": 1.1718750000000001e-06, "loss": 0.0651, "step": 10300 }, { "epoch": 3.9127134724857684, "grad_norm": 3.053624391555786, "learning_rate": 1.123046875e-06, "loss": 0.0358, "step": 10310 }, { "epoch": 3.9165085388994307, "grad_norm": 0.1307297945022583, "learning_rate": 1.07421875e-06, "loss": 0.0171, "step": 10320 }, { "epoch": 3.920303605313093, "grad_norm": 5.61918306350708, "learning_rate": 1.0253906250000001e-06, "loss": 0.0383, "step": 10330 }, { "epoch": 3.924098671726755, "grad_norm": 4.017998695373535, "learning_rate": 9.765625e-07, "loss": 0.0547, "step": 10340 }, { "epoch": 3.9278937381404173, "grad_norm": 8.339895248413086, "learning_rate": 9.27734375e-07, "loss": 0.059, "step": 10350 }, { "epoch": 3.9316888045540797, "grad_norm": 0.5986772179603577, "learning_rate": 8.789062500000001e-07, "loss": 0.0773, "step": 10360 }, { "epoch": 3.935483870967742, "grad_norm": 0.0516970194876194, "learning_rate": 8.300781250000001e-07, "loss": 0.0697, "step": 10370 }, { "epoch": 3.9392789373814043, "grad_norm": 1.0691931247711182, "learning_rate": 7.8125e-07, "loss": 0.0382, "step": 10380 }, { "epoch": 3.9430740037950667, "grad_norm": 1.0503530502319336, "learning_rate": 7.32421875e-07, "loss": 0.0781, "step": 10390 }, { "epoch": 3.9468690702087286, "grad_norm": 4.003793239593506, "learning_rate": 6.8359375e-07, "loss": 0.1007, "step": 10400 }, { "epoch": 3.950664136622391, "grad_norm": 0.04315977543592453, "learning_rate": 6.347656250000001e-07, "loss": 0.0553, "step": 10410 }, { "epoch": 3.9544592030360532, "grad_norm": 4.378900051116943, "learning_rate": 5.859375000000001e-07, "loss": 0.0239, "step": 10420 }, { "epoch": 3.958254269449715, "grad_norm": 0.17604303359985352, "learning_rate": 5.37109375e-07, "loss": 0.0338, "step": 10430 }, { "epoch": 3.9620493358633775, "grad_norm": 0.040019456297159195, "learning_rate": 4.8828125e-07, "loss": 0.0088, "step": 10440 }, { "epoch": 3.96584440227704, "grad_norm": 4.001920700073242, "learning_rate": 4.3945312500000004e-07, "loss": 0.0395, "step": 10450 }, { "epoch": 3.969639468690702, "grad_norm": 4.805160999298096, "learning_rate": 3.90625e-07, "loss": 0.0713, "step": 10460 }, { "epoch": 3.9734345351043645, "grad_norm": 0.0865137130022049, "learning_rate": 3.41796875e-07, "loss": 0.0394, "step": 10470 }, { "epoch": 3.9772296015180264, "grad_norm": 2.695357322692871, "learning_rate": 2.9296875000000003e-07, "loss": 0.0582, "step": 10480 }, { "epoch": 3.9810246679316887, "grad_norm": 0.9629122018814087, "learning_rate": 2.44140625e-07, "loss": 0.0201, "step": 10490 }, { "epoch": 3.984819734345351, "grad_norm": 0.8045425415039062, "learning_rate": 1.953125e-07, "loss": 0.0653, "step": 10500 } ], "logging_steps": 10, "max_steps": 10540, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2762272477794816.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }