diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7383 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.984819734345351, + "eval_steps": 500, + "global_step": 10500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003795066413662239, + "grad_norm": 1.7131669521331787, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.6804, + "step": 10 + }, + { + "epoch": 0.007590132827324478, + "grad_norm": 1.7053213119506836, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.6854, + "step": 20 + }, + { + "epoch": 0.011385199240986717, + "grad_norm": 1.8932372331619263, + "learning_rate": 5e-06, + "loss": 0.6597, + "step": 30 + }, + { + "epoch": 0.015180265654648957, + "grad_norm": 1.516872763633728, + "learning_rate": 6.666666666666667e-06, + "loss": 0.6243, + "step": 40 + }, + { + "epoch": 0.018975332068311195, + "grad_norm": 1.5149081945419312, + "learning_rate": 8.333333333333334e-06, + "loss": 0.5632, + "step": 50 + }, + { + "epoch": 0.022770398481973434, + "grad_norm": 1.0706552267074585, + "learning_rate": 1e-05, + "loss": 0.5289, + "step": 60 + }, + { + "epoch": 0.026565464895635674, + "grad_norm": 1.102160930633545, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.5263, + "step": 70 + }, + { + "epoch": 0.030360531309297913, + "grad_norm": 1.2059059143066406, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.5466, + "step": 80 + }, + { + "epoch": 0.03415559772296015, + "grad_norm": 1.0622307062149048, + "learning_rate": 1.5e-05, + "loss": 0.4918, + "step": 90 + }, + { + "epoch": 0.03795066413662239, + "grad_norm": 1.5696407556533813, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.516, + "step": 100 + }, + { + "epoch": 0.04174573055028463, + "grad_norm": 1.49858820438385, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.5024, + "step": 110 + }, + { + "epoch": 0.04554079696394687, + "grad_norm": 1.5996527671813965, + "learning_rate": 2e-05, + "loss": 0.4775, + "step": 120 + }, + { + "epoch": 0.04933586337760911, + "grad_norm": 1.6391699314117432, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.5028, + "step": 130 + }, + { + "epoch": 0.05313092979127135, + "grad_norm": 1.5045441389083862, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.472, + "step": 140 + }, + { + "epoch": 0.056925996204933584, + "grad_norm": 1.1791646480560303, + "learning_rate": 2.5e-05, + "loss": 0.4606, + "step": 150 + }, + { + "epoch": 0.06072106261859583, + "grad_norm": 1.3659300804138184, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.527, + "step": 160 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 0.9830155968666077, + "learning_rate": 2.8333333333333335e-05, + "loss": 0.458, + "step": 170 + }, + { + "epoch": 0.0683111954459203, + "grad_norm": 1.6211776733398438, + "learning_rate": 3e-05, + "loss": 0.4613, + "step": 180 + }, + { + "epoch": 0.07210626185958255, + "grad_norm": 1.9507710933685303, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.4531, + "step": 190 + }, + { + "epoch": 0.07590132827324478, + "grad_norm": 1.312615156173706, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.4384, + "step": 200 + }, + { + "epoch": 0.07969639468690702, + "grad_norm": 2.034919261932373, + "learning_rate": 3.5e-05, + "loss": 0.4747, + "step": 210 + }, + { + "epoch": 0.08349146110056926, + "grad_norm": 2.045759677886963, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.4153, + "step": 220 + }, + { + "epoch": 0.0872865275142315, + "grad_norm": 2.0934813022613525, + "learning_rate": 3.8333333333333334e-05, + "loss": 0.3829, + "step": 230 + }, + { + "epoch": 0.09108159392789374, + "grad_norm": 2.4255552291870117, + "learning_rate": 4e-05, + "loss": 0.3816, + "step": 240 + }, + { + "epoch": 0.09487666034155598, + "grad_norm": 1.42184579372406, + "learning_rate": 4.166666666666667e-05, + "loss": 0.3948, + "step": 250 + }, + { + "epoch": 0.09867172675521822, + "grad_norm": 1.6787000894546509, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.3877, + "step": 260 + }, + { + "epoch": 0.10246679316888045, + "grad_norm": 2.121290445327759, + "learning_rate": 4.5e-05, + "loss": 0.3732, + "step": 270 + }, + { + "epoch": 0.1062618595825427, + "grad_norm": 1.5, + "learning_rate": 4.666666666666667e-05, + "loss": 0.3567, + "step": 280 + }, + { + "epoch": 0.11005692599620494, + "grad_norm": 3.0193252563476562, + "learning_rate": 4.8333333333333334e-05, + "loss": 0.3916, + "step": 290 + }, + { + "epoch": 0.11385199240986717, + "grad_norm": 2.7301666736602783, + "learning_rate": 5e-05, + "loss": 0.3723, + "step": 300 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 1.8423070907592773, + "learning_rate": 4.9951171875e-05, + "loss": 0.3214, + "step": 310 + }, + { + "epoch": 0.12144212523719165, + "grad_norm": 1.204102873802185, + "learning_rate": 4.990234375e-05, + "loss": 0.3251, + "step": 320 + }, + { + "epoch": 0.1252371916508539, + "grad_norm": 1.803913950920105, + "learning_rate": 4.9853515625000005e-05, + "loss": 0.3942, + "step": 330 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 3.175114154815674, + "learning_rate": 4.9804687500000004e-05, + "loss": 0.39, + "step": 340 + }, + { + "epoch": 0.13282732447817835, + "grad_norm": 2.4476590156555176, + "learning_rate": 4.9755859375e-05, + "loss": 0.349, + "step": 350 + }, + { + "epoch": 0.1366223908918406, + "grad_norm": 1.2592339515686035, + "learning_rate": 4.970703125e-05, + "loss": 0.3315, + "step": 360 + }, + { + "epoch": 0.14041745730550284, + "grad_norm": 1.6238622665405273, + "learning_rate": 4.9658203125e-05, + "loss": 0.3307, + "step": 370 + }, + { + "epoch": 0.1442125237191651, + "grad_norm": 1.3984373807907104, + "learning_rate": 4.9609375000000005e-05, + "loss": 0.294, + "step": 380 + }, + { + "epoch": 0.14800759013282733, + "grad_norm": 3.1960623264312744, + "learning_rate": 4.9560546875e-05, + "loss": 0.3314, + "step": 390 + }, + { + "epoch": 0.15180265654648956, + "grad_norm": 1.5345971584320068, + "learning_rate": 4.951171875e-05, + "loss": 0.3438, + "step": 400 + }, + { + "epoch": 0.1555977229601518, + "grad_norm": 3.1037323474884033, + "learning_rate": 4.9462890625e-05, + "loss": 0.3246, + "step": 410 + }, + { + "epoch": 0.15939278937381404, + "grad_norm": 3.519519805908203, + "learning_rate": 4.94140625e-05, + "loss": 0.3087, + "step": 420 + }, + { + "epoch": 0.16318785578747627, + "grad_norm": 1.347273826599121, + "learning_rate": 4.9365234375000005e-05, + "loss": 0.3303, + "step": 430 + }, + { + "epoch": 0.16698292220113853, + "grad_norm": 1.2372374534606934, + "learning_rate": 4.931640625e-05, + "loss": 0.3225, + "step": 440 + }, + { + "epoch": 0.17077798861480076, + "grad_norm": 0.9122889637947083, + "learning_rate": 4.9267578125e-05, + "loss": 0.3081, + "step": 450 + }, + { + "epoch": 0.174573055028463, + "grad_norm": 3.7750535011291504, + "learning_rate": 4.921875e-05, + "loss": 0.2785, + "step": 460 + }, + { + "epoch": 0.17836812144212524, + "grad_norm": 1.0529924631118774, + "learning_rate": 4.9169921875000006e-05, + "loss": 0.283, + "step": 470 + }, + { + "epoch": 0.18216318785578747, + "grad_norm": 1.5323132276535034, + "learning_rate": 4.9121093750000004e-05, + "loss": 0.2982, + "step": 480 + }, + { + "epoch": 0.1859582542694497, + "grad_norm": 1.1751055717468262, + "learning_rate": 4.9072265625e-05, + "loss": 0.2639, + "step": 490 + }, + { + "epoch": 0.18975332068311196, + "grad_norm": 1.0208653211593628, + "learning_rate": 4.90234375e-05, + "loss": 0.2651, + "step": 500 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 1.7089987993240356, + "learning_rate": 4.8974609375e-05, + "loss": 0.2572, + "step": 510 + }, + { + "epoch": 0.19734345351043645, + "grad_norm": 4.918070316314697, + "learning_rate": 4.8925781250000006e-05, + "loss": 0.299, + "step": 520 + }, + { + "epoch": 0.20113851992409867, + "grad_norm": 1.117162823677063, + "learning_rate": 4.8876953125000004e-05, + "loss": 0.2699, + "step": 530 + }, + { + "epoch": 0.2049335863377609, + "grad_norm": 1.813411831855774, + "learning_rate": 4.8828125e-05, + "loss": 0.2391, + "step": 540 + }, + { + "epoch": 0.20872865275142316, + "grad_norm": 3.368643283843994, + "learning_rate": 4.8779296875e-05, + "loss": 0.3022, + "step": 550 + }, + { + "epoch": 0.2125237191650854, + "grad_norm": 16.486289978027344, + "learning_rate": 4.873046875e-05, + "loss": 0.2837, + "step": 560 + }, + { + "epoch": 0.21631878557874762, + "grad_norm": 1.3590037822723389, + "learning_rate": 4.8681640625000005e-05, + "loss": 0.2182, + "step": 570 + }, + { + "epoch": 0.22011385199240988, + "grad_norm": 1.8672986030578613, + "learning_rate": 4.8632812500000004e-05, + "loss": 0.2925, + "step": 580 + }, + { + "epoch": 0.2239089184060721, + "grad_norm": 2.350752592086792, + "learning_rate": 4.8583984375e-05, + "loss": 0.2585, + "step": 590 + }, + { + "epoch": 0.22770398481973433, + "grad_norm": 2.4918649196624756, + "learning_rate": 4.853515625e-05, + "loss": 0.2824, + "step": 600 + }, + { + "epoch": 0.2314990512333966, + "grad_norm": 2.4856553077697754, + "learning_rate": 4.8486328125e-05, + "loss": 0.2444, + "step": 610 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 1.87199866771698, + "learning_rate": 4.8437500000000005e-05, + "loss": 0.256, + "step": 620 + }, + { + "epoch": 0.23908918406072105, + "grad_norm": 1.0694291591644287, + "learning_rate": 4.8388671875000004e-05, + "loss": 0.245, + "step": 630 + }, + { + "epoch": 0.2428842504743833, + "grad_norm": 0.7904035449028015, + "learning_rate": 4.833984375e-05, + "loss": 0.2588, + "step": 640 + }, + { + "epoch": 0.24667931688804554, + "grad_norm": 2.714871883392334, + "learning_rate": 4.8291015625e-05, + "loss": 0.2741, + "step": 650 + }, + { + "epoch": 0.2504743833017078, + "grad_norm": 3.948547124862671, + "learning_rate": 4.82421875e-05, + "loss": 0.2335, + "step": 660 + }, + { + "epoch": 0.25426944971537, + "grad_norm": 1.6354694366455078, + "learning_rate": 4.8193359375000005e-05, + "loss": 0.2298, + "step": 670 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 1.1305994987487793, + "learning_rate": 4.8144531250000003e-05, + "loss": 0.2279, + "step": 680 + }, + { + "epoch": 0.2618595825426945, + "grad_norm": 1.804825782775879, + "learning_rate": 4.8095703125e-05, + "loss": 0.2401, + "step": 690 + }, + { + "epoch": 0.2656546489563567, + "grad_norm": 1.0778950452804565, + "learning_rate": 4.8046875e-05, + "loss": 0.2498, + "step": 700 + }, + { + "epoch": 0.269449715370019, + "grad_norm": 2.672403335571289, + "learning_rate": 4.7998046875e-05, + "loss": 0.2521, + "step": 710 + }, + { + "epoch": 0.2732447817836812, + "grad_norm": 1.0559144020080566, + "learning_rate": 4.7949218750000005e-05, + "loss": 0.1855, + "step": 720 + }, + { + "epoch": 0.27703984819734345, + "grad_norm": 1.3226491212844849, + "learning_rate": 4.7900390625e-05, + "loss": 0.21, + "step": 730 + }, + { + "epoch": 0.2808349146110057, + "grad_norm": 2.1266074180603027, + "learning_rate": 4.78515625e-05, + "loss": 0.2232, + "step": 740 + }, + { + "epoch": 0.2846299810246679, + "grad_norm": 2.9967539310455322, + "learning_rate": 4.7802734375e-05, + "loss": 0.2554, + "step": 750 + }, + { + "epoch": 0.2884250474383302, + "grad_norm": 2.6614627838134766, + "learning_rate": 4.775390625e-05, + "loss": 0.2811, + "step": 760 + }, + { + "epoch": 0.2922201138519924, + "grad_norm": 1.64667546749115, + "learning_rate": 4.7705078125000004e-05, + "loss": 0.2102, + "step": 770 + }, + { + "epoch": 0.29601518026565465, + "grad_norm": 2.339608669281006, + "learning_rate": 4.765625e-05, + "loss": 0.2125, + "step": 780 + }, + { + "epoch": 0.2998102466793169, + "grad_norm": 1.6804083585739136, + "learning_rate": 4.7607421875e-05, + "loss": 0.2722, + "step": 790 + }, + { + "epoch": 0.3036053130929791, + "grad_norm": 2.6005263328552246, + "learning_rate": 4.755859375e-05, + "loss": 0.2067, + "step": 800 + }, + { + "epoch": 0.30740037950664134, + "grad_norm": 5.113396167755127, + "learning_rate": 4.7509765625000006e-05, + "loss": 0.1988, + "step": 810 + }, + { + "epoch": 0.3111954459203036, + "grad_norm": 1.9176031351089478, + "learning_rate": 4.7460937500000004e-05, + "loss": 0.2416, + "step": 820 + }, + { + "epoch": 0.31499051233396586, + "grad_norm": 1.5946362018585205, + "learning_rate": 4.7412109375e-05, + "loss": 0.2416, + "step": 830 + }, + { + "epoch": 0.3187855787476281, + "grad_norm": 1.6692804098129272, + "learning_rate": 4.736328125e-05, + "loss": 0.2139, + "step": 840 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 4.5298285484313965, + "learning_rate": 4.7314453125e-05, + "loss": 0.2285, + "step": 850 + }, + { + "epoch": 0.32637571157495254, + "grad_norm": 1.9948817491531372, + "learning_rate": 4.7265625000000005e-05, + "loss": 0.2453, + "step": 860 + }, + { + "epoch": 0.3301707779886148, + "grad_norm": 2.5353565216064453, + "learning_rate": 4.7216796875000004e-05, + "loss": 0.2259, + "step": 870 + }, + { + "epoch": 0.33396584440227706, + "grad_norm": 5.23643684387207, + "learning_rate": 4.716796875e-05, + "loss": 0.2318, + "step": 880 + }, + { + "epoch": 0.3377609108159393, + "grad_norm": 3.062701463699341, + "learning_rate": 4.7119140625e-05, + "loss": 0.1835, + "step": 890 + }, + { + "epoch": 0.3415559772296015, + "grad_norm": 1.5771597623825073, + "learning_rate": 4.70703125e-05, + "loss": 0.2195, + "step": 900 + }, + { + "epoch": 0.34535104364326374, + "grad_norm": 0.9039077162742615, + "learning_rate": 4.7021484375000005e-05, + "loss": 0.1545, + "step": 910 + }, + { + "epoch": 0.349146110056926, + "grad_norm": 2.7035298347473145, + "learning_rate": 4.6972656250000004e-05, + "loss": 0.2221, + "step": 920 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 2.3225386142730713, + "learning_rate": 4.6923828125e-05, + "loss": 0.1912, + "step": 930 + }, + { + "epoch": 0.3567362428842505, + "grad_norm": 1.1066793203353882, + "learning_rate": 4.6875e-05, + "loss": 0.2003, + "step": 940 + }, + { + "epoch": 0.3605313092979127, + "grad_norm": 1.2358715534210205, + "learning_rate": 4.6826171875e-05, + "loss": 0.1944, + "step": 950 + }, + { + "epoch": 0.36432637571157495, + "grad_norm": 0.5866732001304626, + "learning_rate": 4.6777343750000005e-05, + "loss": 0.1885, + "step": 960 + }, + { + "epoch": 0.3681214421252372, + "grad_norm": 1.436168909072876, + "learning_rate": 4.6728515625000004e-05, + "loss": 0.182, + "step": 970 + }, + { + "epoch": 0.3719165085388994, + "grad_norm": 1.5037955045700073, + "learning_rate": 4.66796875e-05, + "loss": 0.2024, + "step": 980 + }, + { + "epoch": 0.3757115749525617, + "grad_norm": 1.4837393760681152, + "learning_rate": 4.6630859375e-05, + "loss": 0.2249, + "step": 990 + }, + { + "epoch": 0.3795066413662239, + "grad_norm": 12.082221031188965, + "learning_rate": 4.658203125e-05, + "loss": 0.2191, + "step": 1000 + }, + { + "epoch": 0.38330170777988615, + "grad_norm": 0.7743262648582458, + "learning_rate": 4.6533203125000005e-05, + "loss": 0.1654, + "step": 1010 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 3.7393670082092285, + "learning_rate": 4.6484375e-05, + "loss": 0.1595, + "step": 1020 + }, + { + "epoch": 0.3908918406072106, + "grad_norm": 1.2153229713439941, + "learning_rate": 4.6435546875e-05, + "loss": 0.2276, + "step": 1030 + }, + { + "epoch": 0.3946869070208729, + "grad_norm": 0.9271629452705383, + "learning_rate": 4.638671875e-05, + "loss": 0.2039, + "step": 1040 + }, + { + "epoch": 0.3984819734345351, + "grad_norm": 1.0829685926437378, + "learning_rate": 4.6337890625e-05, + "loss": 0.1731, + "step": 1050 + }, + { + "epoch": 0.40227703984819735, + "grad_norm": 1.2705596685409546, + "learning_rate": 4.6289062500000005e-05, + "loss": 0.1359, + "step": 1060 + }, + { + "epoch": 0.4060721062618596, + "grad_norm": 4.376911163330078, + "learning_rate": 4.6240234375e-05, + "loss": 0.2095, + "step": 1070 + }, + { + "epoch": 0.4098671726755218, + "grad_norm": 2.1292335987091064, + "learning_rate": 4.619140625e-05, + "loss": 0.1916, + "step": 1080 + }, + { + "epoch": 0.41366223908918404, + "grad_norm": 1.6525979042053223, + "learning_rate": 4.6142578125e-05, + "loss": 0.173, + "step": 1090 + }, + { + "epoch": 0.4174573055028463, + "grad_norm": 4.228000164031982, + "learning_rate": 4.609375e-05, + "loss": 0.2117, + "step": 1100 + }, + { + "epoch": 0.42125237191650855, + "grad_norm": 5.334222316741943, + "learning_rate": 4.6044921875000004e-05, + "loss": 0.185, + "step": 1110 + }, + { + "epoch": 0.4250474383301708, + "grad_norm": 1.7326403856277466, + "learning_rate": 4.599609375e-05, + "loss": 0.1875, + "step": 1120 + }, + { + "epoch": 0.428842504743833, + "grad_norm": 2.4292402267456055, + "learning_rate": 4.5947265625e-05, + "loss": 0.1747, + "step": 1130 + }, + { + "epoch": 0.43263757115749524, + "grad_norm": 1.6561298370361328, + "learning_rate": 4.58984375e-05, + "loss": 0.2017, + "step": 1140 + }, + { + "epoch": 0.4364326375711575, + "grad_norm": 2.659874439239502, + "learning_rate": 4.5849609375000005e-05, + "loss": 0.2415, + "step": 1150 + }, + { + "epoch": 0.44022770398481975, + "grad_norm": 2.743425130844116, + "learning_rate": 4.5800781250000004e-05, + "loss": 0.2332, + "step": 1160 + }, + { + "epoch": 0.444022770398482, + "grad_norm": 2.3197848796844482, + "learning_rate": 4.5751953125e-05, + "loss": 0.1946, + "step": 1170 + }, + { + "epoch": 0.4478178368121442, + "grad_norm": 2.110534191131592, + "learning_rate": 4.5703125e-05, + "loss": 0.1948, + "step": 1180 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 1.3609685897827148, + "learning_rate": 4.5654296875e-05, + "loss": 0.1801, + "step": 1190 + }, + { + "epoch": 0.45540796963946867, + "grad_norm": 3.159426689147949, + "learning_rate": 4.5605468750000005e-05, + "loss": 0.2184, + "step": 1200 + }, + { + "epoch": 0.45920303605313095, + "grad_norm": 1.7927987575531006, + "learning_rate": 4.5556640625000004e-05, + "loss": 0.1604, + "step": 1210 + }, + { + "epoch": 0.4629981024667932, + "grad_norm": 1.5928328037261963, + "learning_rate": 4.55078125e-05, + "loss": 0.1693, + "step": 1220 + }, + { + "epoch": 0.4667931688804554, + "grad_norm": 0.8145284056663513, + "learning_rate": 4.5458984375e-05, + "loss": 0.1761, + "step": 1230 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.7765156030654907, + "learning_rate": 4.541015625e-05, + "loss": 0.1799, + "step": 1240 + }, + { + "epoch": 0.47438330170777987, + "grad_norm": 1.8456169366836548, + "learning_rate": 4.5361328125000005e-05, + "loss": 0.168, + "step": 1250 + }, + { + "epoch": 0.4781783681214421, + "grad_norm": 1.6953251361846924, + "learning_rate": 4.5312500000000004e-05, + "loss": 0.1945, + "step": 1260 + }, + { + "epoch": 0.4819734345351044, + "grad_norm": 1.5285083055496216, + "learning_rate": 4.5263671875e-05, + "loss": 0.2075, + "step": 1270 + }, + { + "epoch": 0.4857685009487666, + "grad_norm": 2.95650577545166, + "learning_rate": 4.521484375e-05, + "loss": 0.1601, + "step": 1280 + }, + { + "epoch": 0.48956356736242884, + "grad_norm": 0.7677034735679626, + "learning_rate": 4.5166015625e-05, + "loss": 0.1695, + "step": 1290 + }, + { + "epoch": 0.49335863377609107, + "grad_norm": 1.9959975481033325, + "learning_rate": 4.5117187500000005e-05, + "loss": 0.2183, + "step": 1300 + }, + { + "epoch": 0.4971537001897533, + "grad_norm": 1.8000417947769165, + "learning_rate": 4.5068359375000003e-05, + "loss": 0.175, + "step": 1310 + }, + { + "epoch": 0.5009487666034156, + "grad_norm": 1.400612473487854, + "learning_rate": 4.501953125e-05, + "loss": 0.2085, + "step": 1320 + }, + { + "epoch": 0.5047438330170778, + "grad_norm": 1.6406989097595215, + "learning_rate": 4.4970703125e-05, + "loss": 0.1537, + "step": 1330 + }, + { + "epoch": 0.50853889943074, + "grad_norm": 2.0849852561950684, + "learning_rate": 4.4921875e-05, + "loss": 0.1579, + "step": 1340 + }, + { + "epoch": 0.5123339658444023, + "grad_norm": 2.6497225761413574, + "learning_rate": 4.4873046875000005e-05, + "loss": 0.1888, + "step": 1350 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 2.2594399452209473, + "learning_rate": 4.482421875e-05, + "loss": 0.1645, + "step": 1360 + }, + { + "epoch": 0.5199240986717267, + "grad_norm": 1.3591111898422241, + "learning_rate": 4.4775390625e-05, + "loss": 0.1876, + "step": 1370 + }, + { + "epoch": 0.523719165085389, + "grad_norm": 5.060487747192383, + "learning_rate": 4.47265625e-05, + "loss": 0.1946, + "step": 1380 + }, + { + "epoch": 0.5275142314990512, + "grad_norm": 1.7694716453552246, + "learning_rate": 4.4677734375e-05, + "loss": 0.0966, + "step": 1390 + }, + { + "epoch": 0.5313092979127134, + "grad_norm": 2.8661625385284424, + "learning_rate": 4.4628906250000004e-05, + "loss": 0.1614, + "step": 1400 + }, + { + "epoch": 0.5351043643263758, + "grad_norm": 2.2955727577209473, + "learning_rate": 4.4580078125e-05, + "loss": 0.193, + "step": 1410 + }, + { + "epoch": 0.538899430740038, + "grad_norm": 1.4596924781799316, + "learning_rate": 4.453125e-05, + "loss": 0.1971, + "step": 1420 + }, + { + "epoch": 0.5426944971537002, + "grad_norm": 1.039890170097351, + "learning_rate": 4.4482421875e-05, + "loss": 0.1909, + "step": 1430 + }, + { + "epoch": 0.5464895635673624, + "grad_norm": 1.433979868888855, + "learning_rate": 4.443359375e-05, + "loss": 0.1832, + "step": 1440 + }, + { + "epoch": 0.5502846299810247, + "grad_norm": 1.306391954421997, + "learning_rate": 4.4384765625000004e-05, + "loss": 0.1867, + "step": 1450 + }, + { + "epoch": 0.5540796963946869, + "grad_norm": 1.2681069374084473, + "learning_rate": 4.43359375e-05, + "loss": 0.1506, + "step": 1460 + }, + { + "epoch": 0.5578747628083491, + "grad_norm": 3.947502613067627, + "learning_rate": 4.4287109375e-05, + "loss": 0.1343, + "step": 1470 + }, + { + "epoch": 0.5616698292220114, + "grad_norm": 4.928821563720703, + "learning_rate": 4.423828125e-05, + "loss": 0.2057, + "step": 1480 + }, + { + "epoch": 0.5654648956356736, + "grad_norm": 2.162473201751709, + "learning_rate": 4.4189453125000005e-05, + "loss": 0.1942, + "step": 1490 + }, + { + "epoch": 0.5692599620493358, + "grad_norm": 5.402246475219727, + "learning_rate": 4.4140625000000004e-05, + "loss": 0.1727, + "step": 1500 + }, + { + "epoch": 0.573055028462998, + "grad_norm": 0.2728889286518097, + "learning_rate": 4.4091796875e-05, + "loss": 0.1345, + "step": 1510 + }, + { + "epoch": 0.5768500948766604, + "grad_norm": 2.027841567993164, + "learning_rate": 4.404296875e-05, + "loss": 0.213, + "step": 1520 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 1.3224737644195557, + "learning_rate": 4.3994140625e-05, + "loss": 0.1735, + "step": 1530 + }, + { + "epoch": 0.5844402277039848, + "grad_norm": 2.3124992847442627, + "learning_rate": 4.3945312500000005e-05, + "loss": 0.2177, + "step": 1540 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.2521787881851196, + "learning_rate": 4.3896484375000004e-05, + "loss": 0.1332, + "step": 1550 + }, + { + "epoch": 0.5920303605313093, + "grad_norm": 2.5216283798217773, + "learning_rate": 4.384765625e-05, + "loss": 0.1318, + "step": 1560 + }, + { + "epoch": 0.5958254269449715, + "grad_norm": 1.8268439769744873, + "learning_rate": 4.3798828125e-05, + "loss": 0.1269, + "step": 1570 + }, + { + "epoch": 0.5996204933586338, + "grad_norm": 0.6268766522407532, + "learning_rate": 4.375e-05, + "loss": 0.1381, + "step": 1580 + }, + { + "epoch": 0.603415559772296, + "grad_norm": 1.979546308517456, + "learning_rate": 4.3701171875000005e-05, + "loss": 0.1351, + "step": 1590 + }, + { + "epoch": 0.6072106261859582, + "grad_norm": 1.5526436567306519, + "learning_rate": 4.3652343750000004e-05, + "loss": 0.2163, + "step": 1600 + }, + { + "epoch": 0.6110056925996205, + "grad_norm": 0.9428083896636963, + "learning_rate": 4.3603515625e-05, + "loss": 0.1398, + "step": 1610 + }, + { + "epoch": 0.6148007590132827, + "grad_norm": 2.1224870681762695, + "learning_rate": 4.35546875e-05, + "loss": 0.1891, + "step": 1620 + }, + { + "epoch": 0.618595825426945, + "grad_norm": 0.3401525914669037, + "learning_rate": 4.3505859375e-05, + "loss": 0.1068, + "step": 1630 + }, + { + "epoch": 0.6223908918406073, + "grad_norm": 1.1070092916488647, + "learning_rate": 4.3457031250000005e-05, + "loss": 0.1407, + "step": 1640 + }, + { + "epoch": 0.6261859582542695, + "grad_norm": 1.1588579416275024, + "learning_rate": 4.3408203125e-05, + "loss": 0.2238, + "step": 1650 + }, + { + "epoch": 0.6299810246679317, + "grad_norm": 1.3201090097427368, + "learning_rate": 4.3359375e-05, + "loss": 0.2135, + "step": 1660 + }, + { + "epoch": 0.6337760910815939, + "grad_norm": 1.2257441282272339, + "learning_rate": 4.3310546875e-05, + "loss": 0.1261, + "step": 1670 + }, + { + "epoch": 0.6375711574952562, + "grad_norm": 1.4213567972183228, + "learning_rate": 4.326171875e-05, + "loss": 0.1439, + "step": 1680 + }, + { + "epoch": 0.6413662239089184, + "grad_norm": 1.0983916521072388, + "learning_rate": 4.3212890625000004e-05, + "loss": 0.1356, + "step": 1690 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 1.6485854387283325, + "learning_rate": 4.31640625e-05, + "loss": 0.1549, + "step": 1700 + }, + { + "epoch": 0.6489563567362429, + "grad_norm": 5.49334716796875, + "learning_rate": 4.3115234375e-05, + "loss": 0.1519, + "step": 1710 + }, + { + "epoch": 0.6527514231499051, + "grad_norm": 0.26703280210494995, + "learning_rate": 4.306640625e-05, + "loss": 0.1499, + "step": 1720 + }, + { + "epoch": 0.6565464895635673, + "grad_norm": 1.5822151899337769, + "learning_rate": 4.3017578125e-05, + "loss": 0.1733, + "step": 1730 + }, + { + "epoch": 0.6603415559772297, + "grad_norm": 1.1510590314865112, + "learning_rate": 4.2968750000000004e-05, + "loss": 0.1665, + "step": 1740 + }, + { + "epoch": 0.6641366223908919, + "grad_norm": 2.48427152633667, + "learning_rate": 4.2919921875e-05, + "loss": 0.1598, + "step": 1750 + }, + { + "epoch": 0.6679316888045541, + "grad_norm": 2.0076019763946533, + "learning_rate": 4.287109375e-05, + "loss": 0.1642, + "step": 1760 + }, + { + "epoch": 0.6717267552182163, + "grad_norm": 2.1611413955688477, + "learning_rate": 4.2822265625e-05, + "loss": 0.1538, + "step": 1770 + }, + { + "epoch": 0.6755218216318786, + "grad_norm": 2.476008415222168, + "learning_rate": 4.27734375e-05, + "loss": 0.1193, + "step": 1780 + }, + { + "epoch": 0.6793168880455408, + "grad_norm": 2.426025867462158, + "learning_rate": 4.2724609375000004e-05, + "loss": 0.161, + "step": 1790 + }, + { + "epoch": 0.683111954459203, + "grad_norm": 2.2168385982513428, + "learning_rate": 4.267578125e-05, + "loss": 0.1429, + "step": 1800 + }, + { + "epoch": 0.6869070208728653, + "grad_norm": 1.63054358959198, + "learning_rate": 4.2626953125e-05, + "loss": 0.1561, + "step": 1810 + }, + { + "epoch": 0.6907020872865275, + "grad_norm": 5.170077323913574, + "learning_rate": 4.2578125e-05, + "loss": 0.1685, + "step": 1820 + }, + { + "epoch": 0.6944971537001897, + "grad_norm": 2.700263023376465, + "learning_rate": 4.2529296875000005e-05, + "loss": 0.1601, + "step": 1830 + }, + { + "epoch": 0.698292220113852, + "grad_norm": 1.6965094804763794, + "learning_rate": 4.2480468750000004e-05, + "loss": 0.1046, + "step": 1840 + }, + { + "epoch": 0.7020872865275142, + "grad_norm": 5.461817264556885, + "learning_rate": 4.2431640625e-05, + "loss": 0.1421, + "step": 1850 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 1.584050178527832, + "learning_rate": 4.23828125e-05, + "loss": 0.1781, + "step": 1860 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 2.42586088180542, + "learning_rate": 4.2333984375e-05, + "loss": 0.1274, + "step": 1870 + }, + { + "epoch": 0.713472485768501, + "grad_norm": 3.151433229446411, + "learning_rate": 4.2285156250000005e-05, + "loss": 0.1825, + "step": 1880 + }, + { + "epoch": 0.7172675521821632, + "grad_norm": 1.1808427572250366, + "learning_rate": 4.2236328125000004e-05, + "loss": 0.2085, + "step": 1890 + }, + { + "epoch": 0.7210626185958254, + "grad_norm": 1.981814980506897, + "learning_rate": 4.21875e-05, + "loss": 0.1718, + "step": 1900 + }, + { + "epoch": 0.7248576850094877, + "grad_norm": 0.9719598293304443, + "learning_rate": 4.2138671875e-05, + "loss": 0.1461, + "step": 1910 + }, + { + "epoch": 0.7286527514231499, + "grad_norm": 1.493422031402588, + "learning_rate": 4.208984375e-05, + "loss": 0.1902, + "step": 1920 + }, + { + "epoch": 0.7324478178368121, + "grad_norm": 1.4552210569381714, + "learning_rate": 4.2041015625000005e-05, + "loss": 0.1253, + "step": 1930 + }, + { + "epoch": 0.7362428842504743, + "grad_norm": 2.0822556018829346, + "learning_rate": 4.1992187500000003e-05, + "loss": 0.144, + "step": 1940 + }, + { + "epoch": 0.7400379506641366, + "grad_norm": 2.461090326309204, + "learning_rate": 4.1943359375e-05, + "loss": 0.2084, + "step": 1950 + }, + { + "epoch": 0.7438330170777988, + "grad_norm": 1.8043471574783325, + "learning_rate": 4.189453125e-05, + "loss": 0.1904, + "step": 1960 + }, + { + "epoch": 0.7476280834914611, + "grad_norm": 1.6388760805130005, + "learning_rate": 4.1845703125e-05, + "loss": 0.2071, + "step": 1970 + }, + { + "epoch": 0.7514231499051234, + "grad_norm": 2.5029492378234863, + "learning_rate": 4.1796875000000005e-05, + "loss": 0.1881, + "step": 1980 + }, + { + "epoch": 0.7552182163187856, + "grad_norm": 1.3092814683914185, + "learning_rate": 4.1748046875e-05, + "loss": 0.1356, + "step": 1990 + }, + { + "epoch": 0.7590132827324478, + "grad_norm": 1.2208425998687744, + "learning_rate": 4.169921875e-05, + "loss": 0.1378, + "step": 2000 + }, + { + "epoch": 0.7628083491461101, + "grad_norm": 3.214336633682251, + "learning_rate": 4.1650390625e-05, + "loss": 0.1954, + "step": 2010 + }, + { + "epoch": 0.7666034155597723, + "grad_norm": 4.104292392730713, + "learning_rate": 4.16015625e-05, + "loss": 0.1886, + "step": 2020 + }, + { + "epoch": 0.7703984819734345, + "grad_norm": 2.170186996459961, + "learning_rate": 4.1552734375000004e-05, + "loss": 0.1705, + "step": 2030 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 2.6494083404541016, + "learning_rate": 4.150390625e-05, + "loss": 0.1986, + "step": 2040 + }, + { + "epoch": 0.777988614800759, + "grad_norm": 0.7542719841003418, + "learning_rate": 4.1455078125e-05, + "loss": 0.1255, + "step": 2050 + }, + { + "epoch": 0.7817836812144212, + "grad_norm": 3.126569986343384, + "learning_rate": 4.140625e-05, + "loss": 0.1576, + "step": 2060 + }, + { + "epoch": 0.7855787476280834, + "grad_norm": 1.0665310621261597, + "learning_rate": 4.1357421875e-05, + "loss": 0.174, + "step": 2070 + }, + { + "epoch": 0.7893738140417458, + "grad_norm": 1.3480401039123535, + "learning_rate": 4.1308593750000004e-05, + "loss": 0.1203, + "step": 2080 + }, + { + "epoch": 0.793168880455408, + "grad_norm": 2.358405113220215, + "learning_rate": 4.1259765625e-05, + "loss": 0.1394, + "step": 2090 + }, + { + "epoch": 0.7969639468690702, + "grad_norm": 3.2337498664855957, + "learning_rate": 4.12109375e-05, + "loss": 0.1711, + "step": 2100 + }, + { + "epoch": 0.8007590132827325, + "grad_norm": 2.7708380222320557, + "learning_rate": 4.1162109375e-05, + "loss": 0.1265, + "step": 2110 + }, + { + "epoch": 0.8045540796963947, + "grad_norm": 3.3023488521575928, + "learning_rate": 4.1113281250000005e-05, + "loss": 0.1706, + "step": 2120 + }, + { + "epoch": 0.8083491461100569, + "grad_norm": 1.758325219154358, + "learning_rate": 4.1064453125000004e-05, + "loss": 0.1371, + "step": 2130 + }, + { + "epoch": 0.8121442125237192, + "grad_norm": 1.5623672008514404, + "learning_rate": 4.1015625e-05, + "loss": 0.1756, + "step": 2140 + }, + { + "epoch": 0.8159392789373814, + "grad_norm": 1.3145450353622437, + "learning_rate": 4.0966796875e-05, + "loss": 0.1328, + "step": 2150 + }, + { + "epoch": 0.8197343453510436, + "grad_norm": 2.432619094848633, + "learning_rate": 4.091796875e-05, + "loss": 0.1286, + "step": 2160 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.4147840142250061, + "learning_rate": 4.0869140625000005e-05, + "loss": 0.1509, + "step": 2170 + }, + { + "epoch": 0.8273244781783681, + "grad_norm": 1.6098836660385132, + "learning_rate": 4.0820312500000004e-05, + "loss": 0.1746, + "step": 2180 + }, + { + "epoch": 0.8311195445920304, + "grad_norm": 2.5355212688446045, + "learning_rate": 4.0771484375e-05, + "loss": 0.1238, + "step": 2190 + }, + { + "epoch": 0.8349146110056926, + "grad_norm": 1.5544086694717407, + "learning_rate": 4.072265625e-05, + "loss": 0.2168, + "step": 2200 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 2.1792962551116943, + "learning_rate": 4.0673828125e-05, + "loss": 0.1338, + "step": 2210 + }, + { + "epoch": 0.8425047438330171, + "grad_norm": 2.667340040206909, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.1505, + "step": 2220 + }, + { + "epoch": 0.8462998102466793, + "grad_norm": 0.8551260232925415, + "learning_rate": 4.0576171875000004e-05, + "loss": 0.1081, + "step": 2230 + }, + { + "epoch": 0.8500948766603416, + "grad_norm": 2.8773763179779053, + "learning_rate": 4.052734375e-05, + "loss": 0.1089, + "step": 2240 + }, + { + "epoch": 0.8538899430740038, + "grad_norm": 2.12497878074646, + "learning_rate": 4.0478515625e-05, + "loss": 0.1268, + "step": 2250 + }, + { + "epoch": 0.857685009487666, + "grad_norm": 1.8039929866790771, + "learning_rate": 4.04296875e-05, + "loss": 0.1544, + "step": 2260 + }, + { + "epoch": 0.8614800759013282, + "grad_norm": 0.4839627742767334, + "learning_rate": 4.0380859375000005e-05, + "loss": 0.1421, + "step": 2270 + }, + { + "epoch": 0.8652751423149905, + "grad_norm": 3.672240734100342, + "learning_rate": 4.033203125e-05, + "loss": 0.134, + "step": 2280 + }, + { + "epoch": 0.8690702087286527, + "grad_norm": 2.4371728897094727, + "learning_rate": 4.0283203125e-05, + "loss": 0.1419, + "step": 2290 + }, + { + "epoch": 0.872865275142315, + "grad_norm": 1.8469904661178589, + "learning_rate": 4.0234375e-05, + "loss": 0.1846, + "step": 2300 + }, + { + "epoch": 0.8766603415559773, + "grad_norm": 0.7639700174331665, + "learning_rate": 4.0185546875e-05, + "loss": 0.106, + "step": 2310 + }, + { + "epoch": 0.8804554079696395, + "grad_norm": 1.4450427293777466, + "learning_rate": 4.0136718750000004e-05, + "loss": 0.1408, + "step": 2320 + }, + { + "epoch": 0.8842504743833017, + "grad_norm": 1.3033993244171143, + "learning_rate": 4.0087890625e-05, + "loss": 0.1456, + "step": 2330 + }, + { + "epoch": 0.888045540796964, + "grad_norm": 1.3045791387557983, + "learning_rate": 4.00390625e-05, + "loss": 0.1531, + "step": 2340 + }, + { + "epoch": 0.8918406072106262, + "grad_norm": 3.4357423782348633, + "learning_rate": 3.9990234375e-05, + "loss": 0.1417, + "step": 2350 + }, + { + "epoch": 0.8956356736242884, + "grad_norm": 3.5311038494110107, + "learning_rate": 3.994140625e-05, + "loss": 0.1317, + "step": 2360 + }, + { + "epoch": 0.8994307400379506, + "grad_norm": 4.028538227081299, + "learning_rate": 3.9892578125000004e-05, + "loss": 0.1644, + "step": 2370 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 1.4089256525039673, + "learning_rate": 3.984375e-05, + "loss": 0.1087, + "step": 2380 + }, + { + "epoch": 0.9070208728652751, + "grad_norm": 0.2230881005525589, + "learning_rate": 3.9794921875e-05, + "loss": 0.1387, + "step": 2390 + }, + { + "epoch": 0.9108159392789373, + "grad_norm": 2.5647592544555664, + "learning_rate": 3.974609375e-05, + "loss": 0.1475, + "step": 2400 + }, + { + "epoch": 0.9146110056925996, + "grad_norm": 1.2803542613983154, + "learning_rate": 3.9697265625e-05, + "loss": 0.126, + "step": 2410 + }, + { + "epoch": 0.9184060721062619, + "grad_norm": 3.2023112773895264, + "learning_rate": 3.9648437500000004e-05, + "loss": 0.1458, + "step": 2420 + }, + { + "epoch": 0.9222011385199241, + "grad_norm": 3.615530252456665, + "learning_rate": 3.9599609375e-05, + "loss": 0.1297, + "step": 2430 + }, + { + "epoch": 0.9259962049335864, + "grad_norm": 3.396568536758423, + "learning_rate": 3.955078125e-05, + "loss": 0.1486, + "step": 2440 + }, + { + "epoch": 0.9297912713472486, + "grad_norm": 1.7030583620071411, + "learning_rate": 3.9501953125e-05, + "loss": 0.1464, + "step": 2450 + }, + { + "epoch": 0.9335863377609108, + "grad_norm": 1.0317497253417969, + "learning_rate": 3.9453125000000005e-05, + "loss": 0.1658, + "step": 2460 + }, + { + "epoch": 0.937381404174573, + "grad_norm": 1.1268532276153564, + "learning_rate": 3.9404296875000004e-05, + "loss": 0.1425, + "step": 2470 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.9238561391830444, + "learning_rate": 3.935546875e-05, + "loss": 0.1565, + "step": 2480 + }, + { + "epoch": 0.9449715370018975, + "grad_norm": 1.4960806369781494, + "learning_rate": 3.9306640625e-05, + "loss": 0.1681, + "step": 2490 + }, + { + "epoch": 0.9487666034155597, + "grad_norm": 1.306814193725586, + "learning_rate": 3.92578125e-05, + "loss": 0.1719, + "step": 2500 + }, + { + "epoch": 0.952561669829222, + "grad_norm": 0.391342431306839, + "learning_rate": 3.9208984375000005e-05, + "loss": 0.1497, + "step": 2510 + }, + { + "epoch": 0.9563567362428842, + "grad_norm": 1.9634449481964111, + "learning_rate": 3.9160156250000004e-05, + "loss": 0.124, + "step": 2520 + }, + { + "epoch": 0.9601518026565465, + "grad_norm": 2.7319021224975586, + "learning_rate": 3.9111328125e-05, + "loss": 0.1029, + "step": 2530 + }, + { + "epoch": 0.9639468690702088, + "grad_norm": 1.062157392501831, + "learning_rate": 3.90625e-05, + "loss": 0.1612, + "step": 2540 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 2.737459182739258, + "learning_rate": 3.9013671875e-05, + "loss": 0.1817, + "step": 2550 + }, + { + "epoch": 0.9715370018975332, + "grad_norm": 1.4106887578964233, + "learning_rate": 3.8964843750000005e-05, + "loss": 0.1875, + "step": 2560 + }, + { + "epoch": 0.9753320683111955, + "grad_norm": 7.118113040924072, + "learning_rate": 3.8916015625000003e-05, + "loss": 0.2243, + "step": 2570 + }, + { + "epoch": 0.9791271347248577, + "grad_norm": 2.956235647201538, + "learning_rate": 3.88671875e-05, + "loss": 0.1059, + "step": 2580 + }, + { + "epoch": 0.9829222011385199, + "grad_norm": 1.2888784408569336, + "learning_rate": 3.8818359375e-05, + "loss": 0.1546, + "step": 2590 + }, + { + "epoch": 0.9867172675521821, + "grad_norm": 2.5757930278778076, + "learning_rate": 3.876953125e-05, + "loss": 0.115, + "step": 2600 + }, + { + "epoch": 0.9905123339658444, + "grad_norm": 0.7105236053466797, + "learning_rate": 3.8720703125000005e-05, + "loss": 0.1218, + "step": 2610 + }, + { + "epoch": 0.9943074003795066, + "grad_norm": 2.5876383781433105, + "learning_rate": 3.8671875e-05, + "loss": 0.1487, + "step": 2620 + }, + { + "epoch": 0.9981024667931688, + "grad_norm": 0.2208087146282196, + "learning_rate": 3.8623046875e-05, + "loss": 0.1429, + "step": 2630 + }, + { + "epoch": 1.0018975332068312, + "grad_norm": 0.6170036196708679, + "learning_rate": 3.857421875e-05, + "loss": 0.128, + "step": 2640 + }, + { + "epoch": 1.0056925996204933, + "grad_norm": 1.1868369579315186, + "learning_rate": 3.8525390625e-05, + "loss": 0.0923, + "step": 2650 + }, + { + "epoch": 1.0094876660341556, + "grad_norm": 3.0359079837799072, + "learning_rate": 3.8476562500000004e-05, + "loss": 0.1104, + "step": 2660 + }, + { + "epoch": 1.0132827324478177, + "grad_norm": 0.6559151411056519, + "learning_rate": 3.8427734375e-05, + "loss": 0.1089, + "step": 2670 + }, + { + "epoch": 1.01707779886148, + "grad_norm": 10.784985542297363, + "learning_rate": 3.837890625e-05, + "loss": 0.1408, + "step": 2680 + }, + { + "epoch": 1.0208728652751424, + "grad_norm": 1.7095699310302734, + "learning_rate": 3.8330078125e-05, + "loss": 0.1358, + "step": 2690 + }, + { + "epoch": 1.0246679316888045, + "grad_norm": 1.3584043979644775, + "learning_rate": 3.828125e-05, + "loss": 0.1248, + "step": 2700 + }, + { + "epoch": 1.0284629981024669, + "grad_norm": 5.567887783050537, + "learning_rate": 3.8232421875000004e-05, + "loss": 0.0992, + "step": 2710 + }, + { + "epoch": 1.032258064516129, + "grad_norm": 1.6698075532913208, + "learning_rate": 3.818359375e-05, + "loss": 0.1503, + "step": 2720 + }, + { + "epoch": 1.0360531309297913, + "grad_norm": 0.29519161581993103, + "learning_rate": 3.8134765625e-05, + "loss": 0.1247, + "step": 2730 + }, + { + "epoch": 1.0398481973434535, + "grad_norm": 2.3616697788238525, + "learning_rate": 3.80859375e-05, + "loss": 0.1459, + "step": 2740 + }, + { + "epoch": 1.0436432637571158, + "grad_norm": 1.219618320465088, + "learning_rate": 3.8037109375e-05, + "loss": 0.1036, + "step": 2750 + }, + { + "epoch": 1.047438330170778, + "grad_norm": 1.3592404127120972, + "learning_rate": 3.7988281250000004e-05, + "loss": 0.1399, + "step": 2760 + }, + { + "epoch": 1.0512333965844403, + "grad_norm": 1.2837351560592651, + "learning_rate": 3.7939453125e-05, + "loss": 0.1581, + "step": 2770 + }, + { + "epoch": 1.0550284629981024, + "grad_norm": 1.3627588748931885, + "learning_rate": 3.7890625e-05, + "loss": 0.1093, + "step": 2780 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 4.571230888366699, + "learning_rate": 3.7841796875e-05, + "loss": 0.1693, + "step": 2790 + }, + { + "epoch": 1.0626185958254268, + "grad_norm": 1.575040578842163, + "learning_rate": 3.7792968750000005e-05, + "loss": 0.1646, + "step": 2800 + }, + { + "epoch": 1.0664136622390892, + "grad_norm": 2.594174861907959, + "learning_rate": 3.7744140625000004e-05, + "loss": 0.0976, + "step": 2810 + }, + { + "epoch": 1.0702087286527515, + "grad_norm": 4.076402187347412, + "learning_rate": 3.76953125e-05, + "loss": 0.1301, + "step": 2820 + }, + { + "epoch": 1.0740037950664136, + "grad_norm": 2.7510082721710205, + "learning_rate": 3.7646484375e-05, + "loss": 0.1337, + "step": 2830 + }, + { + "epoch": 1.077798861480076, + "grad_norm": 0.8219005465507507, + "learning_rate": 3.759765625e-05, + "loss": 0.1122, + "step": 2840 + }, + { + "epoch": 1.081593927893738, + "grad_norm": 1.9153568744659424, + "learning_rate": 3.7548828125000005e-05, + "loss": 0.1428, + "step": 2850 + }, + { + "epoch": 1.0853889943074004, + "grad_norm": 2.93013858795166, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.1872, + "step": 2860 + }, + { + "epoch": 1.0891840607210626, + "grad_norm": 0.7126034498214722, + "learning_rate": 3.7451171875e-05, + "loss": 0.1106, + "step": 2870 + }, + { + "epoch": 1.092979127134725, + "grad_norm": 1.8968008756637573, + "learning_rate": 3.740234375e-05, + "loss": 0.1131, + "step": 2880 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 5.133113861083984, + "learning_rate": 3.7353515625e-05, + "loss": 0.0884, + "step": 2890 + }, + { + "epoch": 1.1005692599620494, + "grad_norm": 3.756060838699341, + "learning_rate": 3.7304687500000005e-05, + "loss": 0.1373, + "step": 2900 + }, + { + "epoch": 1.1043643263757117, + "grad_norm": 7.563070297241211, + "learning_rate": 3.7255859375e-05, + "loss": 0.1353, + "step": 2910 + }, + { + "epoch": 1.1081593927893738, + "grad_norm": 4.473198413848877, + "learning_rate": 3.720703125e-05, + "loss": 0.1639, + "step": 2920 + }, + { + "epoch": 1.1119544592030361, + "grad_norm": 2.689405679702759, + "learning_rate": 3.7158203125e-05, + "loss": 0.1117, + "step": 2930 + }, + { + "epoch": 1.1157495256166983, + "grad_norm": 0.2793045938014984, + "learning_rate": 3.7109375e-05, + "loss": 0.1073, + "step": 2940 + }, + { + "epoch": 1.1195445920303606, + "grad_norm": 1.4892089366912842, + "learning_rate": 3.7060546875000004e-05, + "loss": 0.1541, + "step": 2950 + }, + { + "epoch": 1.1233396584440227, + "grad_norm": 1.1303538084030151, + "learning_rate": 3.701171875e-05, + "loss": 0.0961, + "step": 2960 + }, + { + "epoch": 1.127134724857685, + "grad_norm": 0.6085264682769775, + "learning_rate": 3.6962890625e-05, + "loss": 0.111, + "step": 2970 + }, + { + "epoch": 1.1309297912713472, + "grad_norm": 0.44500744342803955, + "learning_rate": 3.69140625e-05, + "loss": 0.0939, + "step": 2980 + }, + { + "epoch": 1.1347248576850095, + "grad_norm": 1.8215651512145996, + "learning_rate": 3.6865234375e-05, + "loss": 0.1112, + "step": 2990 + }, + { + "epoch": 1.1385199240986716, + "grad_norm": 0.7494792938232422, + "learning_rate": 3.6816406250000004e-05, + "loss": 0.1407, + "step": 3000 + }, + { + "epoch": 1.142314990512334, + "grad_norm": 1.2958310842514038, + "learning_rate": 3.6767578125e-05, + "loss": 0.086, + "step": 3010 + }, + { + "epoch": 1.146110056925996, + "grad_norm": 1.223376989364624, + "learning_rate": 3.671875e-05, + "loss": 0.1152, + "step": 3020 + }, + { + "epoch": 1.1499051233396584, + "grad_norm": 5.232940196990967, + "learning_rate": 3.6669921875e-05, + "loss": 0.1308, + "step": 3030 + }, + { + "epoch": 1.1537001897533208, + "grad_norm": 1.4690934419631958, + "learning_rate": 3.662109375e-05, + "loss": 0.1275, + "step": 3040 + }, + { + "epoch": 1.157495256166983, + "grad_norm": 0.8882303833961487, + "learning_rate": 3.6572265625000004e-05, + "loss": 0.0709, + "step": 3050 + }, + { + "epoch": 1.1612903225806452, + "grad_norm": 7.125335216522217, + "learning_rate": 3.65234375e-05, + "loss": 0.0991, + "step": 3060 + }, + { + "epoch": 1.1650853889943074, + "grad_norm": 2.321225881576538, + "learning_rate": 3.6474609375e-05, + "loss": 0.1986, + "step": 3070 + }, + { + "epoch": 1.1688804554079697, + "grad_norm": 2.8146891593933105, + "learning_rate": 3.642578125e-05, + "loss": 0.1497, + "step": 3080 + }, + { + "epoch": 1.1726755218216318, + "grad_norm": 2.781428575515747, + "learning_rate": 3.6376953125e-05, + "loss": 0.1075, + "step": 3090 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 7.027383327484131, + "learning_rate": 3.6328125000000004e-05, + "loss": 0.0921, + "step": 3100 + }, + { + "epoch": 1.1802656546489563, + "grad_norm": 2.3189167976379395, + "learning_rate": 3.6279296875e-05, + "loss": 0.0784, + "step": 3110 + }, + { + "epoch": 1.1840607210626186, + "grad_norm": 3.060039758682251, + "learning_rate": 3.623046875e-05, + "loss": 0.1262, + "step": 3120 + }, + { + "epoch": 1.187855787476281, + "grad_norm": 6.099356174468994, + "learning_rate": 3.6181640625e-05, + "loss": 0.1506, + "step": 3130 + }, + { + "epoch": 1.191650853889943, + "grad_norm": 3.1299543380737305, + "learning_rate": 3.6132812500000005e-05, + "loss": 0.1431, + "step": 3140 + }, + { + "epoch": 1.1954459203036052, + "grad_norm": 1.5676418542861938, + "learning_rate": 3.6083984375000004e-05, + "loss": 0.1018, + "step": 3150 + }, + { + "epoch": 1.1992409867172675, + "grad_norm": 0.786465585231781, + "learning_rate": 3.603515625e-05, + "loss": 0.1471, + "step": 3160 + }, + { + "epoch": 1.2030360531309299, + "grad_norm": 0.6863810420036316, + "learning_rate": 3.5986328125e-05, + "loss": 0.1144, + "step": 3170 + }, + { + "epoch": 1.206831119544592, + "grad_norm": 6.13245964050293, + "learning_rate": 3.59375e-05, + "loss": 0.1378, + "step": 3180 + }, + { + "epoch": 1.2106261859582543, + "grad_norm": 0.9144377112388611, + "learning_rate": 3.5888671875000005e-05, + "loss": 0.1024, + "step": 3190 + }, + { + "epoch": 1.2144212523719164, + "grad_norm": 13.092443466186523, + "learning_rate": 3.583984375e-05, + "loss": 0.1241, + "step": 3200 + }, + { + "epoch": 1.2182163187855788, + "grad_norm": 5.453747272491455, + "learning_rate": 3.5791015625e-05, + "loss": 0.1307, + "step": 3210 + }, + { + "epoch": 1.222011385199241, + "grad_norm": 5.696516036987305, + "learning_rate": 3.57421875e-05, + "loss": 0.1661, + "step": 3220 + }, + { + "epoch": 1.2258064516129032, + "grad_norm": 1.4154207706451416, + "learning_rate": 3.5693359375e-05, + "loss": 0.1017, + "step": 3230 + }, + { + "epoch": 1.2296015180265654, + "grad_norm": 3.1260204315185547, + "learning_rate": 3.5644531250000005e-05, + "loss": 0.1224, + "step": 3240 + }, + { + "epoch": 1.2333965844402277, + "grad_norm": 1.4753592014312744, + "learning_rate": 3.5595703125e-05, + "loss": 0.1, + "step": 3250 + }, + { + "epoch": 1.23719165085389, + "grad_norm": 2.7512917518615723, + "learning_rate": 3.5546875e-05, + "loss": 0.152, + "step": 3260 + }, + { + "epoch": 1.2409867172675522, + "grad_norm": 0.1835506409406662, + "learning_rate": 3.5498046875e-05, + "loss": 0.0897, + "step": 3270 + }, + { + "epoch": 1.2447817836812145, + "grad_norm": 2.484245777130127, + "learning_rate": 3.544921875e-05, + "loss": 0.1284, + "step": 3280 + }, + { + "epoch": 1.2485768500948766, + "grad_norm": 2.778939962387085, + "learning_rate": 3.5400390625000004e-05, + "loss": 0.1225, + "step": 3290 + }, + { + "epoch": 1.252371916508539, + "grad_norm": 4.067395210266113, + "learning_rate": 3.53515625e-05, + "loss": 0.1687, + "step": 3300 + }, + { + "epoch": 1.256166982922201, + "grad_norm": 0.2922412157058716, + "learning_rate": 3.5302734375e-05, + "loss": 0.066, + "step": 3310 + }, + { + "epoch": 1.2599620493358634, + "grad_norm": 2.992678165435791, + "learning_rate": 3.525390625e-05, + "loss": 0.1016, + "step": 3320 + }, + { + "epoch": 1.2637571157495255, + "grad_norm": 0.5019288063049316, + "learning_rate": 3.5205078125e-05, + "loss": 0.0877, + "step": 3330 + }, + { + "epoch": 1.2675521821631879, + "grad_norm": 5.55689811706543, + "learning_rate": 3.5156250000000004e-05, + "loss": 0.1191, + "step": 3340 + }, + { + "epoch": 1.2713472485768502, + "grad_norm": 3.2791213989257812, + "learning_rate": 3.5107421875e-05, + "loss": 0.1086, + "step": 3350 + }, + { + "epoch": 1.2751423149905123, + "grad_norm": 7.413064956665039, + "learning_rate": 3.505859375e-05, + "loss": 0.1063, + "step": 3360 + }, + { + "epoch": 1.2789373814041745, + "grad_norm": 4.541271686553955, + "learning_rate": 3.5009765625e-05, + "loss": 0.0959, + "step": 3370 + }, + { + "epoch": 1.2827324478178368, + "grad_norm": 2.8879811763763428, + "learning_rate": 3.49609375e-05, + "loss": 0.1178, + "step": 3380 + }, + { + "epoch": 1.2865275142314991, + "grad_norm": 3.210865020751953, + "learning_rate": 3.4912109375000004e-05, + "loss": 0.1464, + "step": 3390 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 0.654231071472168, + "learning_rate": 3.486328125e-05, + "loss": 0.1404, + "step": 3400 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 2.9404890537261963, + "learning_rate": 3.4814453125e-05, + "loss": 0.1213, + "step": 3410 + }, + { + "epoch": 1.2979127134724857, + "grad_norm": 2.2991085052490234, + "learning_rate": 3.4765625e-05, + "loss": 0.1131, + "step": 3420 + }, + { + "epoch": 1.301707779886148, + "grad_norm": 0.30925440788269043, + "learning_rate": 3.4716796875e-05, + "loss": 0.1166, + "step": 3430 + }, + { + "epoch": 1.3055028462998102, + "grad_norm": 1.3804266452789307, + "learning_rate": 3.4667968750000004e-05, + "loss": 0.0634, + "step": 3440 + }, + { + "epoch": 1.3092979127134725, + "grad_norm": 3.1803112030029297, + "learning_rate": 3.4619140625e-05, + "loss": 0.1916, + "step": 3450 + }, + { + "epoch": 1.3130929791271346, + "grad_norm": 2.8847222328186035, + "learning_rate": 3.45703125e-05, + "loss": 0.1856, + "step": 3460 + }, + { + "epoch": 1.316888045540797, + "grad_norm": 7.0924973487854, + "learning_rate": 3.4521484375e-05, + "loss": 0.1292, + "step": 3470 + }, + { + "epoch": 1.3206831119544593, + "grad_norm": 4.695943355560303, + "learning_rate": 3.4472656250000005e-05, + "loss": 0.1518, + "step": 3480 + }, + { + "epoch": 1.3244781783681214, + "grad_norm": 4.995908260345459, + "learning_rate": 3.4423828125000003e-05, + "loss": 0.12, + "step": 3490 + }, + { + "epoch": 1.3282732447817835, + "grad_norm": 4.585287570953369, + "learning_rate": 3.4375e-05, + "loss": 0.0933, + "step": 3500 + }, + { + "epoch": 1.3320683111954459, + "grad_norm": 1.5841524600982666, + "learning_rate": 3.4326171875e-05, + "loss": 0.1172, + "step": 3510 + }, + { + "epoch": 1.3358633776091082, + "grad_norm": 3.6837852001190186, + "learning_rate": 3.427734375e-05, + "loss": 0.1164, + "step": 3520 + }, + { + "epoch": 1.3396584440227703, + "grad_norm": 2.470222234725952, + "learning_rate": 3.4228515625000005e-05, + "loss": 0.1258, + "step": 3530 + }, + { + "epoch": 1.3434535104364327, + "grad_norm": 1.8782237768173218, + "learning_rate": 3.41796875e-05, + "loss": 0.1078, + "step": 3540 + }, + { + "epoch": 1.3472485768500948, + "grad_norm": 0.29535171389579773, + "learning_rate": 3.4130859375e-05, + "loss": 0.1658, + "step": 3550 + }, + { + "epoch": 1.3510436432637571, + "grad_norm": 3.8535208702087402, + "learning_rate": 3.408203125e-05, + "loss": 0.1632, + "step": 3560 + }, + { + "epoch": 1.3548387096774195, + "grad_norm": 2.0340235233306885, + "learning_rate": 3.4033203125e-05, + "loss": 0.1498, + "step": 3570 + }, + { + "epoch": 1.3586337760910816, + "grad_norm": 3.015774726867676, + "learning_rate": 3.3984375000000004e-05, + "loss": 0.1099, + "step": 3580 + }, + { + "epoch": 1.3624288425047437, + "grad_norm": 5.396883487701416, + "learning_rate": 3.3935546875e-05, + "loss": 0.1308, + "step": 3590 + }, + { + "epoch": 1.366223908918406, + "grad_norm": 4.15665864944458, + "learning_rate": 3.388671875e-05, + "loss": 0.0893, + "step": 3600 + }, + { + "epoch": 1.3700189753320684, + "grad_norm": 2.0461652278900146, + "learning_rate": 3.3837890625e-05, + "loss": 0.1157, + "step": 3610 + }, + { + "epoch": 1.3738140417457305, + "grad_norm": 1.5953052043914795, + "learning_rate": 3.37890625e-05, + "loss": 0.1611, + "step": 3620 + }, + { + "epoch": 1.3776091081593929, + "grad_norm": 3.8149826526641846, + "learning_rate": 3.3740234375000004e-05, + "loss": 0.1582, + "step": 3630 + }, + { + "epoch": 1.381404174573055, + "grad_norm": 5.658437252044678, + "learning_rate": 3.369140625e-05, + "loss": 0.1481, + "step": 3640 + }, + { + "epoch": 1.3851992409867173, + "grad_norm": 0.47566506266593933, + "learning_rate": 3.3642578125e-05, + "loss": 0.1336, + "step": 3650 + }, + { + "epoch": 1.3889943074003794, + "grad_norm": 2.9851224422454834, + "learning_rate": 3.359375e-05, + "loss": 0.1274, + "step": 3660 + }, + { + "epoch": 1.3927893738140418, + "grad_norm": 2.3793752193450928, + "learning_rate": 3.3544921875e-05, + "loss": 0.1189, + "step": 3670 + }, + { + "epoch": 1.396584440227704, + "grad_norm": 0.35333120822906494, + "learning_rate": 3.3496093750000004e-05, + "loss": 0.1021, + "step": 3680 + }, + { + "epoch": 1.4003795066413662, + "grad_norm": 2.170039653778076, + "learning_rate": 3.3447265625e-05, + "loss": 0.1016, + "step": 3690 + }, + { + "epoch": 1.4041745730550286, + "grad_norm": 3.225989818572998, + "learning_rate": 3.33984375e-05, + "loss": 0.1559, + "step": 3700 + }, + { + "epoch": 1.4079696394686907, + "grad_norm": 5.81306266784668, + "learning_rate": 3.3349609375e-05, + "loss": 0.1378, + "step": 3710 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.839579701423645, + "learning_rate": 3.330078125e-05, + "loss": 0.0981, + "step": 3720 + }, + { + "epoch": 1.4155597722960152, + "grad_norm": 2.421964645385742, + "learning_rate": 3.3251953125000004e-05, + "loss": 0.1267, + "step": 3730 + }, + { + "epoch": 1.4193548387096775, + "grad_norm": 0.298155814409256, + "learning_rate": 3.3203125e-05, + "loss": 0.1619, + "step": 3740 + }, + { + "epoch": 1.4231499051233396, + "grad_norm": 5.643527030944824, + "learning_rate": 3.3154296875e-05, + "loss": 0.0844, + "step": 3750 + }, + { + "epoch": 1.426944971537002, + "grad_norm": 1.7513082027435303, + "learning_rate": 3.310546875e-05, + "loss": 0.133, + "step": 3760 + }, + { + "epoch": 1.430740037950664, + "grad_norm": 1.2837634086608887, + "learning_rate": 3.3056640625000005e-05, + "loss": 0.1241, + "step": 3770 + }, + { + "epoch": 1.4345351043643264, + "grad_norm": 0.7017351984977722, + "learning_rate": 3.3007812500000004e-05, + "loss": 0.1123, + "step": 3780 + }, + { + "epoch": 1.4383301707779887, + "grad_norm": 6.043475151062012, + "learning_rate": 3.2958984375e-05, + "loss": 0.1249, + "step": 3790 + }, + { + "epoch": 1.4421252371916509, + "grad_norm": 4.449422359466553, + "learning_rate": 3.291015625e-05, + "loss": 0.173, + "step": 3800 + }, + { + "epoch": 1.445920303605313, + "grad_norm": 1.7111449241638184, + "learning_rate": 3.2861328125e-05, + "loss": 0.1473, + "step": 3810 + }, + { + "epoch": 1.4497153700189753, + "grad_norm": 1.3379569053649902, + "learning_rate": 3.2812500000000005e-05, + "loss": 0.1119, + "step": 3820 + }, + { + "epoch": 1.4535104364326377, + "grad_norm": 7.154158115386963, + "learning_rate": 3.2763671875e-05, + "loss": 0.1273, + "step": 3830 + }, + { + "epoch": 1.4573055028462998, + "grad_norm": 1.2248731851577759, + "learning_rate": 3.271484375e-05, + "loss": 0.1081, + "step": 3840 + }, + { + "epoch": 1.4611005692599621, + "grad_norm": 1.219230055809021, + "learning_rate": 3.2666015625e-05, + "loss": 0.0945, + "step": 3850 + }, + { + "epoch": 1.4648956356736242, + "grad_norm": 4.3124189376831055, + "learning_rate": 3.26171875e-05, + "loss": 0.1039, + "step": 3860 + }, + { + "epoch": 1.4686907020872866, + "grad_norm": 2.915302038192749, + "learning_rate": 3.2568359375000005e-05, + "loss": 0.1236, + "step": 3870 + }, + { + "epoch": 1.4724857685009487, + "grad_norm": 0.3403218984603882, + "learning_rate": 3.251953125e-05, + "loss": 0.146, + "step": 3880 + }, + { + "epoch": 1.476280834914611, + "grad_norm": 1.74779212474823, + "learning_rate": 3.2470703125e-05, + "loss": 0.1096, + "step": 3890 + }, + { + "epoch": 1.4800759013282732, + "grad_norm": 2.724412202835083, + "learning_rate": 3.2421875e-05, + "loss": 0.1147, + "step": 3900 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 3.6029605865478516, + "learning_rate": 3.2373046875e-05, + "loss": 0.1293, + "step": 3910 + }, + { + "epoch": 1.4876660341555978, + "grad_norm": 1.7680699825286865, + "learning_rate": 3.2324218750000004e-05, + "loss": 0.0891, + "step": 3920 + }, + { + "epoch": 1.49146110056926, + "grad_norm": 0.7916316390037537, + "learning_rate": 3.2275390625e-05, + "loss": 0.1223, + "step": 3930 + }, + { + "epoch": 1.495256166982922, + "grad_norm": 0.9054811596870422, + "learning_rate": 3.22265625e-05, + "loss": 0.0934, + "step": 3940 + }, + { + "epoch": 1.4990512333965844, + "grad_norm": 0.14054611325263977, + "learning_rate": 3.2177734375e-05, + "loss": 0.0494, + "step": 3950 + }, + { + "epoch": 1.5028462998102468, + "grad_norm": 3.1943421363830566, + "learning_rate": 3.212890625e-05, + "loss": 0.1156, + "step": 3960 + }, + { + "epoch": 1.5066413662239089, + "grad_norm": 1.0965791940689087, + "learning_rate": 3.2080078125000004e-05, + "loss": 0.1016, + "step": 3970 + }, + { + "epoch": 1.510436432637571, + "grad_norm": 1.3087248802185059, + "learning_rate": 3.203125e-05, + "loss": 0.0764, + "step": 3980 + }, + { + "epoch": 1.5142314990512333, + "grad_norm": 2.760798692703247, + "learning_rate": 3.1982421875e-05, + "loss": 0.114, + "step": 3990 + }, + { + "epoch": 1.5180265654648957, + "grad_norm": 0.1450069397687912, + "learning_rate": 3.193359375e-05, + "loss": 0.1192, + "step": 4000 + }, + { + "epoch": 1.521821631878558, + "grad_norm": 4.504504680633545, + "learning_rate": 3.1884765625e-05, + "loss": 0.1046, + "step": 4010 + }, + { + "epoch": 1.5256166982922201, + "grad_norm": 0.7182434797286987, + "learning_rate": 3.1835937500000004e-05, + "loss": 0.0932, + "step": 4020 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 4.370609283447266, + "learning_rate": 3.1787109375e-05, + "loss": 0.144, + "step": 4030 + }, + { + "epoch": 1.5332068311195446, + "grad_norm": 3.8300323486328125, + "learning_rate": 3.173828125e-05, + "loss": 0.0982, + "step": 4040 + }, + { + "epoch": 1.537001897533207, + "grad_norm": 0.25771814584732056, + "learning_rate": 3.1689453125e-05, + "loss": 0.0691, + "step": 4050 + }, + { + "epoch": 1.540796963946869, + "grad_norm": 2.758225917816162, + "learning_rate": 3.1640625e-05, + "loss": 0.1308, + "step": 4060 + }, + { + "epoch": 1.5445920303605312, + "grad_norm": 2.7619638442993164, + "learning_rate": 3.1591796875000004e-05, + "loss": 0.094, + "step": 4070 + }, + { + "epoch": 1.5483870967741935, + "grad_norm": 0.9765902757644653, + "learning_rate": 3.154296875e-05, + "loss": 0.0811, + "step": 4080 + }, + { + "epoch": 1.5521821631878558, + "grad_norm": 4.361360549926758, + "learning_rate": 3.1494140625e-05, + "loss": 0.1742, + "step": 4090 + }, + { + "epoch": 1.5559772296015182, + "grad_norm": 2.249197244644165, + "learning_rate": 3.14453125e-05, + "loss": 0.0807, + "step": 4100 + }, + { + "epoch": 1.5597722960151803, + "grad_norm": 3.4518532752990723, + "learning_rate": 3.1396484375000005e-05, + "loss": 0.1422, + "step": 4110 + }, + { + "epoch": 1.5635673624288424, + "grad_norm": 0.6679037809371948, + "learning_rate": 3.1347656250000003e-05, + "loss": 0.1214, + "step": 4120 + }, + { + "epoch": 1.5673624288425048, + "grad_norm": 3.879596710205078, + "learning_rate": 3.1298828125e-05, + "loss": 0.1084, + "step": 4130 + }, + { + "epoch": 1.571157495256167, + "grad_norm": 5.232009410858154, + "learning_rate": 3.125e-05, + "loss": 0.1192, + "step": 4140 + }, + { + "epoch": 1.5749525616698292, + "grad_norm": 3.875843048095703, + "learning_rate": 3.1201171875e-05, + "loss": 0.1099, + "step": 4150 + }, + { + "epoch": 1.5787476280834913, + "grad_norm": 0.17772170901298523, + "learning_rate": 3.1152343750000005e-05, + "loss": 0.1001, + "step": 4160 + }, + { + "epoch": 1.5825426944971537, + "grad_norm": 0.6866888403892517, + "learning_rate": 3.1103515625e-05, + "loss": 0.1598, + "step": 4170 + }, + { + "epoch": 1.586337760910816, + "grad_norm": 2.2445452213287354, + "learning_rate": 3.10546875e-05, + "loss": 0.1532, + "step": 4180 + }, + { + "epoch": 1.5901328273244781, + "grad_norm": 1.2135056257247925, + "learning_rate": 3.1005859375e-05, + "loss": 0.1337, + "step": 4190 + }, + { + "epoch": 1.5939278937381403, + "grad_norm": 0.8548033833503723, + "learning_rate": 3.095703125e-05, + "loss": 0.1142, + "step": 4200 + }, + { + "epoch": 1.5977229601518026, + "grad_norm": 1.7404321432113647, + "learning_rate": 3.0908203125000004e-05, + "loss": 0.1195, + "step": 4210 + }, + { + "epoch": 1.601518026565465, + "grad_norm": 1.4047428369522095, + "learning_rate": 3.0859375e-05, + "loss": 0.1853, + "step": 4220 + }, + { + "epoch": 1.6053130929791273, + "grad_norm": 2.793487071990967, + "learning_rate": 3.0810546875e-05, + "loss": 0.1231, + "step": 4230 + }, + { + "epoch": 1.6091081593927894, + "grad_norm": 0.928959310054779, + "learning_rate": 3.076171875e-05, + "loss": 0.0891, + "step": 4240 + }, + { + "epoch": 1.6129032258064515, + "grad_norm": 1.1571967601776123, + "learning_rate": 3.0712890625e-05, + "loss": 0.1119, + "step": 4250 + }, + { + "epoch": 1.6166982922201139, + "grad_norm": 3.0740041732788086, + "learning_rate": 3.0664062500000004e-05, + "loss": 0.1518, + "step": 4260 + }, + { + "epoch": 1.6204933586337762, + "grad_norm": 5.726138114929199, + "learning_rate": 3.0615234375e-05, + "loss": 0.1121, + "step": 4270 + }, + { + "epoch": 1.6242884250474383, + "grad_norm": 3.900777816772461, + "learning_rate": 3.056640625e-05, + "loss": 0.1513, + "step": 4280 + }, + { + "epoch": 1.6280834914611004, + "grad_norm": 3.43808913230896, + "learning_rate": 3.0517578125e-05, + "loss": 0.1259, + "step": 4290 + }, + { + "epoch": 1.6318785578747628, + "grad_norm": 1.2054848670959473, + "learning_rate": 3.0468750000000002e-05, + "loss": 0.1446, + "step": 4300 + }, + { + "epoch": 1.635673624288425, + "grad_norm": 3.756579875946045, + "learning_rate": 3.0419921875e-05, + "loss": 0.1348, + "step": 4310 + }, + { + "epoch": 1.6394686907020875, + "grad_norm": 1.4033925533294678, + "learning_rate": 3.0371093750000003e-05, + "loss": 0.1053, + "step": 4320 + }, + { + "epoch": 1.6432637571157496, + "grad_norm": 1.6513621807098389, + "learning_rate": 3.0322265625e-05, + "loss": 0.1217, + "step": 4330 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 1.9821256399154663, + "learning_rate": 3.02734375e-05, + "loss": 0.0959, + "step": 4340 + }, + { + "epoch": 1.650853889943074, + "grad_norm": 7.50634241104126, + "learning_rate": 3.0224609375000002e-05, + "loss": 0.1487, + "step": 4350 + }, + { + "epoch": 1.6546489563567364, + "grad_norm": 1.1505802869796753, + "learning_rate": 3.017578125e-05, + "loss": 0.1246, + "step": 4360 + }, + { + "epoch": 1.6584440227703985, + "grad_norm": 1.774200677871704, + "learning_rate": 3.0126953125000002e-05, + "loss": 0.086, + "step": 4370 + }, + { + "epoch": 1.6622390891840606, + "grad_norm": 1.566748023033142, + "learning_rate": 3.0078125e-05, + "loss": 0.1088, + "step": 4380 + }, + { + "epoch": 1.666034155597723, + "grad_norm": 2.8167648315429688, + "learning_rate": 3.0029296875000003e-05, + "loss": 0.122, + "step": 4390 + }, + { + "epoch": 1.6698292220113853, + "grad_norm": 1.7637346982955933, + "learning_rate": 2.998046875e-05, + "loss": 0.1036, + "step": 4400 + }, + { + "epoch": 1.6736242884250474, + "grad_norm": 0.3347111642360687, + "learning_rate": 2.9931640625e-05, + "loss": 0.1259, + "step": 4410 + }, + { + "epoch": 1.6774193548387095, + "grad_norm": 4.920076370239258, + "learning_rate": 2.9882812500000002e-05, + "loss": 0.1594, + "step": 4420 + }, + { + "epoch": 1.6812144212523719, + "grad_norm": 3.4409444332122803, + "learning_rate": 2.9833984375e-05, + "loss": 0.1541, + "step": 4430 + }, + { + "epoch": 1.6850094876660342, + "grad_norm": 0.639980673789978, + "learning_rate": 2.9785156250000003e-05, + "loss": 0.0826, + "step": 4440 + }, + { + "epoch": 1.6888045540796965, + "grad_norm": 3.240345001220703, + "learning_rate": 2.9736328125e-05, + "loss": 0.1473, + "step": 4450 + }, + { + "epoch": 1.6925996204933587, + "grad_norm": 2.2682647705078125, + "learning_rate": 2.96875e-05, + "loss": 0.0959, + "step": 4460 + }, + { + "epoch": 1.6963946869070208, + "grad_norm": 2.3791496753692627, + "learning_rate": 2.9638671875000002e-05, + "loss": 0.0953, + "step": 4470 + }, + { + "epoch": 1.7001897533206831, + "grad_norm": 1.5654246807098389, + "learning_rate": 2.958984375e-05, + "loss": 0.113, + "step": 4480 + }, + { + "epoch": 1.7039848197343455, + "grad_norm": 5.17665958404541, + "learning_rate": 2.9541015625000003e-05, + "loss": 0.1164, + "step": 4490 + }, + { + "epoch": 1.7077798861480076, + "grad_norm": 18.226165771484375, + "learning_rate": 2.94921875e-05, + "loss": 0.1293, + "step": 4500 + }, + { + "epoch": 1.7115749525616697, + "grad_norm": 3.5760374069213867, + "learning_rate": 2.9443359375e-05, + "loss": 0.0931, + "step": 4510 + }, + { + "epoch": 1.715370018975332, + "grad_norm": 2.9964776039123535, + "learning_rate": 2.9394531250000002e-05, + "loss": 0.0932, + "step": 4520 + }, + { + "epoch": 1.7191650853889944, + "grad_norm": 10.505178451538086, + "learning_rate": 2.9345703125e-05, + "loss": 0.139, + "step": 4530 + }, + { + "epoch": 1.7229601518026565, + "grad_norm": 0.9944730997085571, + "learning_rate": 2.9296875000000002e-05, + "loss": 0.159, + "step": 4540 + }, + { + "epoch": 1.7267552182163188, + "grad_norm": 1.2323939800262451, + "learning_rate": 2.9248046875e-05, + "loss": 0.118, + "step": 4550 + }, + { + "epoch": 1.730550284629981, + "grad_norm": 0.8581392765045166, + "learning_rate": 2.9199218750000003e-05, + "loss": 0.1165, + "step": 4560 + }, + { + "epoch": 1.7343453510436433, + "grad_norm": 2.196648120880127, + "learning_rate": 2.9150390625e-05, + "loss": 0.0803, + "step": 4570 + }, + { + "epoch": 1.7381404174573056, + "grad_norm": 3.5112388134002686, + "learning_rate": 2.91015625e-05, + "loss": 0.1348, + "step": 4580 + }, + { + "epoch": 1.7419354838709677, + "grad_norm": 1.1738495826721191, + "learning_rate": 2.9052734375000002e-05, + "loss": 0.1114, + "step": 4590 + }, + { + "epoch": 1.7457305502846299, + "grad_norm": 1.6850240230560303, + "learning_rate": 2.900390625e-05, + "loss": 0.1457, + "step": 4600 + }, + { + "epoch": 1.7495256166982922, + "grad_norm": 1.4865467548370361, + "learning_rate": 2.8955078125000003e-05, + "loss": 0.1078, + "step": 4610 + }, + { + "epoch": 1.7533206831119545, + "grad_norm": 1.445610523223877, + "learning_rate": 2.890625e-05, + "loss": 0.0839, + "step": 4620 + }, + { + "epoch": 1.7571157495256167, + "grad_norm": 1.649983525276184, + "learning_rate": 2.8857421875e-05, + "loss": 0.1028, + "step": 4630 + }, + { + "epoch": 1.7609108159392788, + "grad_norm": 2.717585802078247, + "learning_rate": 2.8808593750000002e-05, + "loss": 0.1127, + "step": 4640 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 2.902244806289673, + "learning_rate": 2.8759765625e-05, + "loss": 0.0743, + "step": 4650 + }, + { + "epoch": 1.7685009487666035, + "grad_norm": 1.8880512714385986, + "learning_rate": 2.8710937500000002e-05, + "loss": 0.0875, + "step": 4660 + }, + { + "epoch": 1.7722960151802658, + "grad_norm": 1.119419813156128, + "learning_rate": 2.8662109375e-05, + "loss": 0.1028, + "step": 4670 + }, + { + "epoch": 1.776091081593928, + "grad_norm": 2.3372507095336914, + "learning_rate": 2.8613281250000003e-05, + "loss": 0.161, + "step": 4680 + }, + { + "epoch": 1.77988614800759, + "grad_norm": 0.6809380054473877, + "learning_rate": 2.8564453125e-05, + "loss": 0.091, + "step": 4690 + }, + { + "epoch": 1.7836812144212524, + "grad_norm": 4.871325969696045, + "learning_rate": 2.8515625e-05, + "loss": 0.1495, + "step": 4700 + }, + { + "epoch": 1.7874762808349147, + "grad_norm": 10.103543281555176, + "learning_rate": 2.8466796875000002e-05, + "loss": 0.0847, + "step": 4710 + }, + { + "epoch": 1.7912713472485768, + "grad_norm": 0.719699501991272, + "learning_rate": 2.841796875e-05, + "loss": 0.0991, + "step": 4720 + }, + { + "epoch": 1.795066413662239, + "grad_norm": 2.012406826019287, + "learning_rate": 2.8369140625000003e-05, + "loss": 0.069, + "step": 4730 + }, + { + "epoch": 1.7988614800759013, + "grad_norm": 2.038810968399048, + "learning_rate": 2.83203125e-05, + "loss": 0.0946, + "step": 4740 + }, + { + "epoch": 1.8026565464895636, + "grad_norm": 1.991003394126892, + "learning_rate": 2.8271484375e-05, + "loss": 0.1033, + "step": 4750 + }, + { + "epoch": 1.8064516129032258, + "grad_norm": 1.9379823207855225, + "learning_rate": 2.8222656250000002e-05, + "loss": 0.0738, + "step": 4760 + }, + { + "epoch": 1.810246679316888, + "grad_norm": 0.9378390312194824, + "learning_rate": 2.8173828125e-05, + "loss": 0.0907, + "step": 4770 + }, + { + "epoch": 1.8140417457305502, + "grad_norm": 2.5683369636535645, + "learning_rate": 2.8125000000000003e-05, + "loss": 0.1156, + "step": 4780 + }, + { + "epoch": 1.8178368121442126, + "grad_norm": 2.95536470413208, + "learning_rate": 2.8076171875e-05, + "loss": 0.0959, + "step": 4790 + }, + { + "epoch": 1.821631878557875, + "grad_norm": 11.215580940246582, + "learning_rate": 2.802734375e-05, + "loss": 0.0812, + "step": 4800 + }, + { + "epoch": 1.825426944971537, + "grad_norm": 0.4500042498111725, + "learning_rate": 2.7978515625000002e-05, + "loss": 0.1114, + "step": 4810 + }, + { + "epoch": 1.8292220113851991, + "grad_norm": 0.5829250812530518, + "learning_rate": 2.79296875e-05, + "loss": 0.1284, + "step": 4820 + }, + { + "epoch": 1.8330170777988615, + "grad_norm": 3.114776134490967, + "learning_rate": 2.7880859375000002e-05, + "loss": 0.1283, + "step": 4830 + }, + { + "epoch": 1.8368121442125238, + "grad_norm": 0.47552067041397095, + "learning_rate": 2.783203125e-05, + "loss": 0.0752, + "step": 4840 + }, + { + "epoch": 1.840607210626186, + "grad_norm": 4.794514179229736, + "learning_rate": 2.7783203125000003e-05, + "loss": 0.1012, + "step": 4850 + }, + { + "epoch": 1.844402277039848, + "grad_norm": 5.392133712768555, + "learning_rate": 2.7734375e-05, + "loss": 0.178, + "step": 4860 + }, + { + "epoch": 1.8481973434535104, + "grad_norm": 1.1505749225616455, + "learning_rate": 2.7685546875e-05, + "loss": 0.126, + "step": 4870 + }, + { + "epoch": 1.8519924098671727, + "grad_norm": 1.1924586296081543, + "learning_rate": 2.7636718750000002e-05, + "loss": 0.1109, + "step": 4880 + }, + { + "epoch": 1.855787476280835, + "grad_norm": 0.12782755494117737, + "learning_rate": 2.7587890625e-05, + "loss": 0.0732, + "step": 4890 + }, + { + "epoch": 1.8595825426944972, + "grad_norm": 1.1095064878463745, + "learning_rate": 2.7539062500000003e-05, + "loss": 0.0802, + "step": 4900 + }, + { + "epoch": 1.8633776091081593, + "grad_norm": 8.920310020446777, + "learning_rate": 2.7490234375e-05, + "loss": 0.0964, + "step": 4910 + }, + { + "epoch": 1.8671726755218216, + "grad_norm": 1.8678808212280273, + "learning_rate": 2.744140625e-05, + "loss": 0.1072, + "step": 4920 + }, + { + "epoch": 1.870967741935484, + "grad_norm": 1.8633017539978027, + "learning_rate": 2.7392578125000002e-05, + "loss": 0.0835, + "step": 4930 + }, + { + "epoch": 1.874762808349146, + "grad_norm": 1.7576115131378174, + "learning_rate": 2.734375e-05, + "loss": 0.1327, + "step": 4940 + }, + { + "epoch": 1.8785578747628082, + "grad_norm": 3.504157304763794, + "learning_rate": 2.7294921875000003e-05, + "loss": 0.1609, + "step": 4950 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 1.7668483257293701, + "learning_rate": 2.724609375e-05, + "loss": 0.1316, + "step": 4960 + }, + { + "epoch": 1.886148007590133, + "grad_norm": 0.659870982170105, + "learning_rate": 2.7197265625e-05, + "loss": 0.0913, + "step": 4970 + }, + { + "epoch": 1.889943074003795, + "grad_norm": 1.428725004196167, + "learning_rate": 2.7148437500000002e-05, + "loss": 0.118, + "step": 4980 + }, + { + "epoch": 1.8937381404174574, + "grad_norm": 1.8446964025497437, + "learning_rate": 2.7099609375e-05, + "loss": 0.1203, + "step": 4990 + }, + { + "epoch": 1.8975332068311195, + "grad_norm": 2.9335217475891113, + "learning_rate": 2.7050781250000002e-05, + "loss": 0.1301, + "step": 5000 + }, + { + "epoch": 1.9013282732447818, + "grad_norm": 0.8534810543060303, + "learning_rate": 2.7001953125e-05, + "loss": 0.0555, + "step": 5010 + }, + { + "epoch": 1.9051233396584442, + "grad_norm": 0.5556221604347229, + "learning_rate": 2.6953125000000003e-05, + "loss": 0.1036, + "step": 5020 + }, + { + "epoch": 1.9089184060721063, + "grad_norm": 1.7097387313842773, + "learning_rate": 2.6904296875e-05, + "loss": 0.0869, + "step": 5030 + }, + { + "epoch": 1.9127134724857684, + "grad_norm": 2.324669122695923, + "learning_rate": 2.685546875e-05, + "loss": 0.1233, + "step": 5040 + }, + { + "epoch": 1.9165085388994307, + "grad_norm": 2.4764981269836426, + "learning_rate": 2.6806640625000002e-05, + "loss": 0.1379, + "step": 5050 + }, + { + "epoch": 1.920303605313093, + "grad_norm": 4.731557846069336, + "learning_rate": 2.67578125e-05, + "loss": 0.189, + "step": 5060 + }, + { + "epoch": 1.9240986717267552, + "grad_norm": 0.4868462383747101, + "learning_rate": 2.6708984375000003e-05, + "loss": 0.0765, + "step": 5070 + }, + { + "epoch": 1.9278937381404173, + "grad_norm": 1.3497892618179321, + "learning_rate": 2.666015625e-05, + "loss": 0.1039, + "step": 5080 + }, + { + "epoch": 1.9316888045540797, + "grad_norm": 15.007429122924805, + "learning_rate": 2.6611328125e-05, + "loss": 0.0996, + "step": 5090 + }, + { + "epoch": 1.935483870967742, + "grad_norm": 8.113617897033691, + "learning_rate": 2.6562500000000002e-05, + "loss": 0.1316, + "step": 5100 + }, + { + "epoch": 1.9392789373814043, + "grad_norm": 0.4574742913246155, + "learning_rate": 2.6513671875e-05, + "loss": 0.1044, + "step": 5110 + }, + { + "epoch": 1.9430740037950665, + "grad_norm": 2.1475601196289062, + "learning_rate": 2.6464843750000002e-05, + "loss": 0.1236, + "step": 5120 + }, + { + "epoch": 1.9468690702087286, + "grad_norm": 2.370619058609009, + "learning_rate": 2.6416015625e-05, + "loss": 0.1358, + "step": 5130 + }, + { + "epoch": 1.950664136622391, + "grad_norm": 0.7283152937889099, + "learning_rate": 2.63671875e-05, + "loss": 0.1348, + "step": 5140 + }, + { + "epoch": 1.9544592030360532, + "grad_norm": 2.8883001804351807, + "learning_rate": 2.6318359375e-05, + "loss": 0.083, + "step": 5150 + }, + { + "epoch": 1.9582542694497154, + "grad_norm": 0.26794353127479553, + "learning_rate": 2.626953125e-05, + "loss": 0.1229, + "step": 5160 + }, + { + "epoch": 1.9620493358633775, + "grad_norm": 0.10836785286664963, + "learning_rate": 2.6220703125000002e-05, + "loss": 0.0731, + "step": 5170 + }, + { + "epoch": 1.9658444022770398, + "grad_norm": 1.5825821161270142, + "learning_rate": 2.6171875e-05, + "loss": 0.1394, + "step": 5180 + }, + { + "epoch": 1.9696394686907022, + "grad_norm": 2.9467551708221436, + "learning_rate": 2.6123046875000003e-05, + "loss": 0.0986, + "step": 5190 + }, + { + "epoch": 1.9734345351043643, + "grad_norm": 0.14293566346168518, + "learning_rate": 2.607421875e-05, + "loss": 0.0824, + "step": 5200 + }, + { + "epoch": 1.9772296015180264, + "grad_norm": 0.4912210702896118, + "learning_rate": 2.6025390625e-05, + "loss": 0.0863, + "step": 5210 + }, + { + "epoch": 1.9810246679316887, + "grad_norm": 0.2447841614484787, + "learning_rate": 2.5976562500000002e-05, + "loss": 0.0877, + "step": 5220 + }, + { + "epoch": 1.984819734345351, + "grad_norm": 0.13301405310630798, + "learning_rate": 2.5927734375e-05, + "loss": 0.104, + "step": 5230 + }, + { + "epoch": 1.9886148007590134, + "grad_norm": 3.25866961479187, + "learning_rate": 2.5878906250000003e-05, + "loss": 0.0806, + "step": 5240 + }, + { + "epoch": 1.9924098671726755, + "grad_norm": 3.9567527770996094, + "learning_rate": 2.5830078125e-05, + "loss": 0.1226, + "step": 5250 + }, + { + "epoch": 1.9962049335863377, + "grad_norm": 3.6540729999542236, + "learning_rate": 2.578125e-05, + "loss": 0.0628, + "step": 5260 + }, + { + "epoch": 2.0, + "grad_norm": 2.9958958625793457, + "learning_rate": 2.5732421875000002e-05, + "loss": 0.1229, + "step": 5270 + }, + { + "epoch": 2.0037950664136623, + "grad_norm": 4.634014129638672, + "learning_rate": 2.568359375e-05, + "loss": 0.1, + "step": 5280 + }, + { + "epoch": 2.0075901328273247, + "grad_norm": 1.0794429779052734, + "learning_rate": 2.5634765625000002e-05, + "loss": 0.1, + "step": 5290 + }, + { + "epoch": 2.0113851992409866, + "grad_norm": 2.6222951412200928, + "learning_rate": 2.55859375e-05, + "loss": 0.057, + "step": 5300 + }, + { + "epoch": 2.015180265654649, + "grad_norm": 1.499935507774353, + "learning_rate": 2.5537109375e-05, + "loss": 0.0766, + "step": 5310 + }, + { + "epoch": 2.0189753320683113, + "grad_norm": 2.614969491958618, + "learning_rate": 2.548828125e-05, + "loss": 0.1003, + "step": 5320 + }, + { + "epoch": 2.0227703984819736, + "grad_norm": 1.4524706602096558, + "learning_rate": 2.5439453125e-05, + "loss": 0.1681, + "step": 5330 + }, + { + "epoch": 2.0265654648956355, + "grad_norm": 1.5427693128585815, + "learning_rate": 2.5390625000000002e-05, + "loss": 0.0745, + "step": 5340 + }, + { + "epoch": 2.030360531309298, + "grad_norm": 0.6060462594032288, + "learning_rate": 2.5341796875e-05, + "loss": 0.0557, + "step": 5350 + }, + { + "epoch": 2.03415559772296, + "grad_norm": 2.1763222217559814, + "learning_rate": 2.5292968750000003e-05, + "loss": 0.0962, + "step": 5360 + }, + { + "epoch": 2.0379506641366225, + "grad_norm": 0.9857283234596252, + "learning_rate": 2.5244140625e-05, + "loss": 0.0646, + "step": 5370 + }, + { + "epoch": 2.041745730550285, + "grad_norm": 0.14561018347740173, + "learning_rate": 2.51953125e-05, + "loss": 0.0686, + "step": 5380 + }, + { + "epoch": 2.0455407969639468, + "grad_norm": 5.825016498565674, + "learning_rate": 2.5146484375000002e-05, + "loss": 0.1106, + "step": 5390 + }, + { + "epoch": 2.049335863377609, + "grad_norm": 0.4656510353088379, + "learning_rate": 2.509765625e-05, + "loss": 0.0793, + "step": 5400 + }, + { + "epoch": 2.0531309297912714, + "grad_norm": 5.336658954620361, + "learning_rate": 2.5048828125000003e-05, + "loss": 0.1136, + "step": 5410 + }, + { + "epoch": 2.0569259962049338, + "grad_norm": 1.3186858892440796, + "learning_rate": 2.5e-05, + "loss": 0.0908, + "step": 5420 + }, + { + "epoch": 2.0607210626185957, + "grad_norm": 2.3468871116638184, + "learning_rate": 2.4951171875e-05, + "loss": 0.1127, + "step": 5430 + }, + { + "epoch": 2.064516129032258, + "grad_norm": 1.6484739780426025, + "learning_rate": 2.4902343750000002e-05, + "loss": 0.0921, + "step": 5440 + }, + { + "epoch": 2.0683111954459203, + "grad_norm": 1.97286856174469, + "learning_rate": 2.4853515625e-05, + "loss": 0.064, + "step": 5450 + }, + { + "epoch": 2.0721062618595827, + "grad_norm": 0.7309706211090088, + "learning_rate": 2.4804687500000002e-05, + "loss": 0.1256, + "step": 5460 + }, + { + "epoch": 2.0759013282732446, + "grad_norm": 3.2271645069122314, + "learning_rate": 2.4755859375e-05, + "loss": 0.0889, + "step": 5470 + }, + { + "epoch": 2.079696394686907, + "grad_norm": 18.506216049194336, + "learning_rate": 2.470703125e-05, + "loss": 0.1328, + "step": 5480 + }, + { + "epoch": 2.0834914611005693, + "grad_norm": 1.2257277965545654, + "learning_rate": 2.4658203125e-05, + "loss": 0.0673, + "step": 5490 + }, + { + "epoch": 2.0872865275142316, + "grad_norm": 0.1906469613313675, + "learning_rate": 2.4609375e-05, + "loss": 0.0808, + "step": 5500 + }, + { + "epoch": 2.091081593927894, + "grad_norm": 0.9694260954856873, + "learning_rate": 2.4560546875000002e-05, + "loss": 0.0558, + "step": 5510 + }, + { + "epoch": 2.094876660341556, + "grad_norm": 5.630046844482422, + "learning_rate": 2.451171875e-05, + "loss": 0.1262, + "step": 5520 + }, + { + "epoch": 2.098671726755218, + "grad_norm": 0.13950304687023163, + "learning_rate": 2.4462890625000003e-05, + "loss": 0.0711, + "step": 5530 + }, + { + "epoch": 2.1024667931688805, + "grad_norm": 0.424904465675354, + "learning_rate": 2.44140625e-05, + "loss": 0.0841, + "step": 5540 + }, + { + "epoch": 2.106261859582543, + "grad_norm": 7.330411434173584, + "learning_rate": 2.4365234375e-05, + "loss": 0.1482, + "step": 5550 + }, + { + "epoch": 2.1100569259962048, + "grad_norm": 0.2741791009902954, + "learning_rate": 2.4316406250000002e-05, + "loss": 0.0945, + "step": 5560 + }, + { + "epoch": 2.113851992409867, + "grad_norm": 1.025099277496338, + "learning_rate": 2.4267578125e-05, + "loss": 0.0981, + "step": 5570 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 2.723508596420288, + "learning_rate": 2.4218750000000003e-05, + "loss": 0.067, + "step": 5580 + }, + { + "epoch": 2.121442125237192, + "grad_norm": 0.18666787445545197, + "learning_rate": 2.4169921875e-05, + "loss": 0.077, + "step": 5590 + }, + { + "epoch": 2.1252371916508537, + "grad_norm": 2.304980754852295, + "learning_rate": 2.412109375e-05, + "loss": 0.1016, + "step": 5600 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 1.6174981594085693, + "learning_rate": 2.4072265625000002e-05, + "loss": 0.0735, + "step": 5610 + }, + { + "epoch": 2.1328273244781784, + "grad_norm": 5.401015758514404, + "learning_rate": 2.40234375e-05, + "loss": 0.087, + "step": 5620 + }, + { + "epoch": 2.1366223908918407, + "grad_norm": 2.5387024879455566, + "learning_rate": 2.3974609375000002e-05, + "loss": 0.1006, + "step": 5630 + }, + { + "epoch": 2.140417457305503, + "grad_norm": 4.753091812133789, + "learning_rate": 2.392578125e-05, + "loss": 0.1013, + "step": 5640 + }, + { + "epoch": 2.144212523719165, + "grad_norm": 3.540262460708618, + "learning_rate": 2.3876953125e-05, + "loss": 0.0697, + "step": 5650 + }, + { + "epoch": 2.1480075901328273, + "grad_norm": 1.53217613697052, + "learning_rate": 2.3828125e-05, + "loss": 0.0812, + "step": 5660 + }, + { + "epoch": 2.1518026565464896, + "grad_norm": 2.652308940887451, + "learning_rate": 2.3779296875e-05, + "loss": 0.092, + "step": 5670 + }, + { + "epoch": 2.155597722960152, + "grad_norm": 2.7964372634887695, + "learning_rate": 2.3730468750000002e-05, + "loss": 0.0658, + "step": 5680 + }, + { + "epoch": 2.159392789373814, + "grad_norm": 0.11225280165672302, + "learning_rate": 2.3681640625e-05, + "loss": 0.0939, + "step": 5690 + }, + { + "epoch": 2.163187855787476, + "grad_norm": 1.5736573934555054, + "learning_rate": 2.3632812500000003e-05, + "loss": 0.0727, + "step": 5700 + }, + { + "epoch": 2.1669829222011385, + "grad_norm": 2.087057113647461, + "learning_rate": 2.3583984375e-05, + "loss": 0.0654, + "step": 5710 + }, + { + "epoch": 2.170777988614801, + "grad_norm": 1.598823070526123, + "learning_rate": 2.353515625e-05, + "loss": 0.0874, + "step": 5720 + }, + { + "epoch": 2.174573055028463, + "grad_norm": 1.7258918285369873, + "learning_rate": 2.3486328125000002e-05, + "loss": 0.0703, + "step": 5730 + }, + { + "epoch": 2.178368121442125, + "grad_norm": 12.662415504455566, + "learning_rate": 2.34375e-05, + "loss": 0.0998, + "step": 5740 + }, + { + "epoch": 2.1821631878557874, + "grad_norm": 5.9703803062438965, + "learning_rate": 2.3388671875000002e-05, + "loss": 0.1021, + "step": 5750 + }, + { + "epoch": 2.18595825426945, + "grad_norm": 1.9118971824645996, + "learning_rate": 2.333984375e-05, + "loss": 0.0574, + "step": 5760 + }, + { + "epoch": 2.189753320683112, + "grad_norm": 2.8925118446350098, + "learning_rate": 2.3291015625e-05, + "loss": 0.0804, + "step": 5770 + }, + { + "epoch": 2.193548387096774, + "grad_norm": 0.9911293387413025, + "learning_rate": 2.32421875e-05, + "loss": 0.0673, + "step": 5780 + }, + { + "epoch": 2.1973434535104364, + "grad_norm": 3.4294886589050293, + "learning_rate": 2.3193359375e-05, + "loss": 0.0729, + "step": 5790 + }, + { + "epoch": 2.2011385199240987, + "grad_norm": 5.382150650024414, + "learning_rate": 2.3144531250000002e-05, + "loss": 0.1117, + "step": 5800 + }, + { + "epoch": 2.204933586337761, + "grad_norm": 3.5237820148468018, + "learning_rate": 2.3095703125e-05, + "loss": 0.0674, + "step": 5810 + }, + { + "epoch": 2.2087286527514234, + "grad_norm": 5.6236772537231445, + "learning_rate": 2.3046875e-05, + "loss": 0.0279, + "step": 5820 + }, + { + "epoch": 2.2125237191650853, + "grad_norm": 1.1168630123138428, + "learning_rate": 2.2998046875e-05, + "loss": 0.0773, + "step": 5830 + }, + { + "epoch": 2.2163187855787476, + "grad_norm": 1.0353121757507324, + "learning_rate": 2.294921875e-05, + "loss": 0.062, + "step": 5840 + }, + { + "epoch": 2.22011385199241, + "grad_norm": 1.4820594787597656, + "learning_rate": 2.2900390625000002e-05, + "loss": 0.0778, + "step": 5850 + }, + { + "epoch": 2.2239089184060723, + "grad_norm": 8.295422554016113, + "learning_rate": 2.28515625e-05, + "loss": 0.1192, + "step": 5860 + }, + { + "epoch": 2.227703984819734, + "grad_norm": 1.5980597734451294, + "learning_rate": 2.2802734375000003e-05, + "loss": 0.0648, + "step": 5870 + }, + { + "epoch": 2.2314990512333965, + "grad_norm": 0.2760424315929413, + "learning_rate": 2.275390625e-05, + "loss": 0.0722, + "step": 5880 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.8219416737556458, + "learning_rate": 2.2705078125e-05, + "loss": 0.0935, + "step": 5890 + }, + { + "epoch": 2.239089184060721, + "grad_norm": 0.16338910162448883, + "learning_rate": 2.2656250000000002e-05, + "loss": 0.0876, + "step": 5900 + }, + { + "epoch": 2.242884250474383, + "grad_norm": 0.5857824683189392, + "learning_rate": 2.2607421875e-05, + "loss": 0.117, + "step": 5910 + }, + { + "epoch": 2.2466793168880455, + "grad_norm": 0.1616586148738861, + "learning_rate": 2.2558593750000002e-05, + "loss": 0.072, + "step": 5920 + }, + { + "epoch": 2.250474383301708, + "grad_norm": 0.26469337940216064, + "learning_rate": 2.2509765625e-05, + "loss": 0.0902, + "step": 5930 + }, + { + "epoch": 2.25426944971537, + "grad_norm": 3.576016426086426, + "learning_rate": 2.24609375e-05, + "loss": 0.1647, + "step": 5940 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 6.523315906524658, + "learning_rate": 2.2412109375e-05, + "loss": 0.0705, + "step": 5950 + }, + { + "epoch": 2.2618595825426944, + "grad_norm": 4.0901689529418945, + "learning_rate": 2.236328125e-05, + "loss": 0.0786, + "step": 5960 + }, + { + "epoch": 2.2656546489563567, + "grad_norm": 0.5081945061683655, + "learning_rate": 2.2314453125000002e-05, + "loss": 0.1158, + "step": 5970 + }, + { + "epoch": 2.269449715370019, + "grad_norm": 0.10847347974777222, + "learning_rate": 2.2265625e-05, + "loss": 0.0825, + "step": 5980 + }, + { + "epoch": 2.2732447817836814, + "grad_norm": 9.521303176879883, + "learning_rate": 2.2216796875e-05, + "loss": 0.0875, + "step": 5990 + }, + { + "epoch": 2.2770398481973433, + "grad_norm": 6.0424580574035645, + "learning_rate": 2.216796875e-05, + "loss": 0.0994, + "step": 6000 + }, + { + "epoch": 2.2808349146110056, + "grad_norm": 0.3634886145591736, + "learning_rate": 2.2119140625e-05, + "loss": 0.0813, + "step": 6010 + }, + { + "epoch": 2.284629981024668, + "grad_norm": 1.929626703262329, + "learning_rate": 2.2070312500000002e-05, + "loss": 0.0705, + "step": 6020 + }, + { + "epoch": 2.2884250474383303, + "grad_norm": 4.993653297424316, + "learning_rate": 2.2021484375e-05, + "loss": 0.0731, + "step": 6030 + }, + { + "epoch": 2.292220113851992, + "grad_norm": 0.4869803190231323, + "learning_rate": 2.1972656250000003e-05, + "loss": 0.1123, + "step": 6040 + }, + { + "epoch": 2.2960151802656545, + "grad_norm": 1.1776117086410522, + "learning_rate": 2.1923828125e-05, + "loss": 0.0643, + "step": 6050 + }, + { + "epoch": 2.299810246679317, + "grad_norm": 1.7794570922851562, + "learning_rate": 2.1875e-05, + "loss": 0.0852, + "step": 6060 + }, + { + "epoch": 2.3036053130929792, + "grad_norm": 2.7579660415649414, + "learning_rate": 2.1826171875000002e-05, + "loss": 0.0975, + "step": 6070 + }, + { + "epoch": 2.3074003795066416, + "grad_norm": 2.9852662086486816, + "learning_rate": 2.177734375e-05, + "loss": 0.0724, + "step": 6080 + }, + { + "epoch": 2.3111954459203035, + "grad_norm": 3.543381452560425, + "learning_rate": 2.1728515625000002e-05, + "loss": 0.1108, + "step": 6090 + }, + { + "epoch": 2.314990512333966, + "grad_norm": 6.476046085357666, + "learning_rate": 2.16796875e-05, + "loss": 0.1231, + "step": 6100 + }, + { + "epoch": 2.318785578747628, + "grad_norm": 3.2935097217559814, + "learning_rate": 2.1630859375e-05, + "loss": 0.1052, + "step": 6110 + }, + { + "epoch": 2.3225806451612905, + "grad_norm": 1.1247642040252686, + "learning_rate": 2.158203125e-05, + "loss": 0.0817, + "step": 6120 + }, + { + "epoch": 2.3263757115749524, + "grad_norm": 6.793920993804932, + "learning_rate": 2.1533203125e-05, + "loss": 0.0623, + "step": 6130 + }, + { + "epoch": 2.3301707779886147, + "grad_norm": 0.12885475158691406, + "learning_rate": 2.1484375000000002e-05, + "loss": 0.0942, + "step": 6140 + }, + { + "epoch": 2.333965844402277, + "grad_norm": 1.4963340759277344, + "learning_rate": 2.1435546875e-05, + "loss": 0.0549, + "step": 6150 + }, + { + "epoch": 2.3377609108159394, + "grad_norm": 1.460093379020691, + "learning_rate": 2.138671875e-05, + "loss": 0.094, + "step": 6160 + }, + { + "epoch": 2.3415559772296017, + "grad_norm": 4.440692901611328, + "learning_rate": 2.1337890625e-05, + "loss": 0.1673, + "step": 6170 + }, + { + "epoch": 2.3453510436432636, + "grad_norm": 2.9689061641693115, + "learning_rate": 2.12890625e-05, + "loss": 0.0772, + "step": 6180 + }, + { + "epoch": 2.349146110056926, + "grad_norm": 8.890856742858887, + "learning_rate": 2.1240234375000002e-05, + "loss": 0.0588, + "step": 6190 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.12126415222883224, + "learning_rate": 2.119140625e-05, + "loss": 0.0624, + "step": 6200 + }, + { + "epoch": 2.3567362428842507, + "grad_norm": 0.5167102217674255, + "learning_rate": 2.1142578125000003e-05, + "loss": 0.0732, + "step": 6210 + }, + { + "epoch": 2.3605313092979125, + "grad_norm": 0.18846435844898224, + "learning_rate": 2.109375e-05, + "loss": 0.1007, + "step": 6220 + }, + { + "epoch": 2.364326375711575, + "grad_norm": 1.9389616250991821, + "learning_rate": 2.1044921875e-05, + "loss": 0.0912, + "step": 6230 + }, + { + "epoch": 2.3681214421252372, + "grad_norm": 5.2946457862854, + "learning_rate": 2.0996093750000002e-05, + "loss": 0.057, + "step": 6240 + }, + { + "epoch": 2.3719165085388996, + "grad_norm": 0.13522082567214966, + "learning_rate": 2.0947265625e-05, + "loss": 0.0877, + "step": 6250 + }, + { + "epoch": 2.375711574952562, + "grad_norm": 0.43759119510650635, + "learning_rate": 2.0898437500000002e-05, + "loss": 0.0791, + "step": 6260 + }, + { + "epoch": 2.379506641366224, + "grad_norm": 4.369633197784424, + "learning_rate": 2.0849609375e-05, + "loss": 0.0793, + "step": 6270 + }, + { + "epoch": 2.383301707779886, + "grad_norm": 3.1445748805999756, + "learning_rate": 2.080078125e-05, + "loss": 0.0994, + "step": 6280 + }, + { + "epoch": 2.3870967741935485, + "grad_norm": 0.5459542274475098, + "learning_rate": 2.0751953125e-05, + "loss": 0.0493, + "step": 6290 + }, + { + "epoch": 2.3908918406072104, + "grad_norm": 0.8807210326194763, + "learning_rate": 2.0703125e-05, + "loss": 0.0669, + "step": 6300 + }, + { + "epoch": 2.3946869070208727, + "grad_norm": 2.931506872177124, + "learning_rate": 2.0654296875000002e-05, + "loss": 0.1014, + "step": 6310 + }, + { + "epoch": 2.398481973434535, + "grad_norm": 1.1972861289978027, + "learning_rate": 2.060546875e-05, + "loss": 0.0643, + "step": 6320 + }, + { + "epoch": 2.4022770398481974, + "grad_norm": 2.670483112335205, + "learning_rate": 2.0556640625000003e-05, + "loss": 0.0651, + "step": 6330 + }, + { + "epoch": 2.4060721062618597, + "grad_norm": 2.790907382965088, + "learning_rate": 2.05078125e-05, + "loss": 0.0979, + "step": 6340 + }, + { + "epoch": 2.4098671726755216, + "grad_norm": 1.7010408639907837, + "learning_rate": 2.0458984375e-05, + "loss": 0.0616, + "step": 6350 + }, + { + "epoch": 2.413662239089184, + "grad_norm": 2.3590617179870605, + "learning_rate": 2.0410156250000002e-05, + "loss": 0.0877, + "step": 6360 + }, + { + "epoch": 2.4174573055028463, + "grad_norm": 0.7550681829452515, + "learning_rate": 2.0361328125e-05, + "loss": 0.0351, + "step": 6370 + }, + { + "epoch": 2.4212523719165087, + "grad_norm": 2.2927632331848145, + "learning_rate": 2.0312500000000002e-05, + "loss": 0.102, + "step": 6380 + }, + { + "epoch": 2.4250474383301706, + "grad_norm": 8.239547729492188, + "learning_rate": 2.0263671875e-05, + "loss": 0.1315, + "step": 6390 + }, + { + "epoch": 2.428842504743833, + "grad_norm": 0.12305755913257599, + "learning_rate": 2.021484375e-05, + "loss": 0.0508, + "step": 6400 + }, + { + "epoch": 2.4326375711574952, + "grad_norm": 0.24204160273075104, + "learning_rate": 2.0166015625e-05, + "loss": 0.1154, + "step": 6410 + }, + { + "epoch": 2.4364326375711576, + "grad_norm": 1.9680283069610596, + "learning_rate": 2.01171875e-05, + "loss": 0.0576, + "step": 6420 + }, + { + "epoch": 2.44022770398482, + "grad_norm": 2.9172940254211426, + "learning_rate": 2.0068359375000002e-05, + "loss": 0.0457, + "step": 6430 + }, + { + "epoch": 2.444022770398482, + "grad_norm": 4.63267707824707, + "learning_rate": 2.001953125e-05, + "loss": 0.0544, + "step": 6440 + }, + { + "epoch": 2.447817836812144, + "grad_norm": 1.447266936302185, + "learning_rate": 1.9970703125e-05, + "loss": 0.0885, + "step": 6450 + }, + { + "epoch": 2.4516129032258065, + "grad_norm": 2.839066505432129, + "learning_rate": 1.9921875e-05, + "loss": 0.1266, + "step": 6460 + }, + { + "epoch": 2.455407969639469, + "grad_norm": 2.1036999225616455, + "learning_rate": 1.9873046875e-05, + "loss": 0.1107, + "step": 6470 + }, + { + "epoch": 2.4592030360531307, + "grad_norm": 2.6435329914093018, + "learning_rate": 1.9824218750000002e-05, + "loss": 0.0539, + "step": 6480 + }, + { + "epoch": 2.462998102466793, + "grad_norm": 0.2627769112586975, + "learning_rate": 1.9775390625e-05, + "loss": 0.0713, + "step": 6490 + }, + { + "epoch": 2.4667931688804554, + "grad_norm": 3.5408475399017334, + "learning_rate": 1.9726562500000003e-05, + "loss": 0.1061, + "step": 6500 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 2.456315279006958, + "learning_rate": 1.9677734375e-05, + "loss": 0.0782, + "step": 6510 + }, + { + "epoch": 2.47438330170778, + "grad_norm": 5.217021942138672, + "learning_rate": 1.962890625e-05, + "loss": 0.1009, + "step": 6520 + }, + { + "epoch": 2.478178368121442, + "grad_norm": 4.218019962310791, + "learning_rate": 1.9580078125000002e-05, + "loss": 0.0663, + "step": 6530 + }, + { + "epoch": 2.4819734345351043, + "grad_norm": 2.7066123485565186, + "learning_rate": 1.953125e-05, + "loss": 0.0891, + "step": 6540 + }, + { + "epoch": 2.4857685009487667, + "grad_norm": 0.1062941625714302, + "learning_rate": 1.9482421875000002e-05, + "loss": 0.1085, + "step": 6550 + }, + { + "epoch": 2.489563567362429, + "grad_norm": 5.984579086303711, + "learning_rate": 1.943359375e-05, + "loss": 0.092, + "step": 6560 + }, + { + "epoch": 2.493358633776091, + "grad_norm": 0.7308592796325684, + "learning_rate": 1.9384765625e-05, + "loss": 0.072, + "step": 6570 + }, + { + "epoch": 2.4971537001897532, + "grad_norm": 0.8086015582084656, + "learning_rate": 1.93359375e-05, + "loss": 0.1052, + "step": 6580 + }, + { + "epoch": 2.5009487666034156, + "grad_norm": 1.8991528749465942, + "learning_rate": 1.9287109375e-05, + "loss": 0.0737, + "step": 6590 + }, + { + "epoch": 2.504743833017078, + "grad_norm": 6.63985013961792, + "learning_rate": 1.9238281250000002e-05, + "loss": 0.1096, + "step": 6600 + }, + { + "epoch": 2.5085388994307403, + "grad_norm": 0.17855627834796906, + "learning_rate": 1.9189453125e-05, + "loss": 0.0624, + "step": 6610 + }, + { + "epoch": 2.512333965844402, + "grad_norm": 4.877336502075195, + "learning_rate": 1.9140625e-05, + "loss": 0.1211, + "step": 6620 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 0.27590852975845337, + "learning_rate": 1.9091796875e-05, + "loss": 0.0521, + "step": 6630 + }, + { + "epoch": 2.519924098671727, + "grad_norm": 0.45393088459968567, + "learning_rate": 1.904296875e-05, + "loss": 0.0707, + "step": 6640 + }, + { + "epoch": 2.5237191650853887, + "grad_norm": 2.1049611568450928, + "learning_rate": 1.8994140625000002e-05, + "loss": 0.1105, + "step": 6650 + }, + { + "epoch": 2.527514231499051, + "grad_norm": 1.805330753326416, + "learning_rate": 1.89453125e-05, + "loss": 0.068, + "step": 6660 + }, + { + "epoch": 2.5313092979127134, + "grad_norm": 1.1227184534072876, + "learning_rate": 1.8896484375000003e-05, + "loss": 0.0572, + "step": 6670 + }, + { + "epoch": 2.5351043643263758, + "grad_norm": 2.483306646347046, + "learning_rate": 1.884765625e-05, + "loss": 0.1095, + "step": 6680 + }, + { + "epoch": 2.538899430740038, + "grad_norm": 0.1452198177576065, + "learning_rate": 1.8798828125e-05, + "loss": 0.0401, + "step": 6690 + }, + { + "epoch": 2.5426944971537004, + "grad_norm": 0.14945687353610992, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.0796, + "step": 6700 + }, + { + "epoch": 2.5464895635673623, + "grad_norm": 1.3936477899551392, + "learning_rate": 1.8701171875e-05, + "loss": 0.0688, + "step": 6710 + }, + { + "epoch": 2.5502846299810247, + "grad_norm": 0.16819104552268982, + "learning_rate": 1.8652343750000002e-05, + "loss": 0.0454, + "step": 6720 + }, + { + "epoch": 2.554079696394687, + "grad_norm": 1.2239612340927124, + "learning_rate": 1.8603515625e-05, + "loss": 0.0588, + "step": 6730 + }, + { + "epoch": 2.557874762808349, + "grad_norm": 7.471010684967041, + "learning_rate": 1.85546875e-05, + "loss": 0.0528, + "step": 6740 + }, + { + "epoch": 2.5616698292220113, + "grad_norm": 4.900544166564941, + "learning_rate": 1.8505859375e-05, + "loss": 0.0858, + "step": 6750 + }, + { + "epoch": 2.5654648956356736, + "grad_norm": 3.8821702003479004, + "learning_rate": 1.845703125e-05, + "loss": 0.046, + "step": 6760 + }, + { + "epoch": 2.569259962049336, + "grad_norm": 0.17730577290058136, + "learning_rate": 1.8408203125000002e-05, + "loss": 0.0673, + "step": 6770 + }, + { + "epoch": 2.5730550284629983, + "grad_norm": 3.4757065773010254, + "learning_rate": 1.8359375e-05, + "loss": 0.094, + "step": 6780 + }, + { + "epoch": 2.5768500948766606, + "grad_norm": 3.2091782093048096, + "learning_rate": 1.8310546875e-05, + "loss": 0.08, + "step": 6790 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 5.548855304718018, + "learning_rate": 1.826171875e-05, + "loss": 0.0996, + "step": 6800 + }, + { + "epoch": 2.584440227703985, + "grad_norm": 0.17017248272895813, + "learning_rate": 1.8212890625e-05, + "loss": 0.0828, + "step": 6810 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 9.512433052062988, + "learning_rate": 1.8164062500000002e-05, + "loss": 0.0696, + "step": 6820 + }, + { + "epoch": 2.592030360531309, + "grad_norm": 0.9737806916236877, + "learning_rate": 1.8115234375e-05, + "loss": 0.0881, + "step": 6830 + }, + { + "epoch": 2.5958254269449714, + "grad_norm": 7.027744293212891, + "learning_rate": 1.8066406250000002e-05, + "loss": 0.06, + "step": 6840 + }, + { + "epoch": 2.5996204933586338, + "grad_norm": 2.162301778793335, + "learning_rate": 1.8017578125e-05, + "loss": 0.0833, + "step": 6850 + }, + { + "epoch": 2.603415559772296, + "grad_norm": 0.30585893988609314, + "learning_rate": 1.796875e-05, + "loss": 0.0794, + "step": 6860 + }, + { + "epoch": 2.6072106261859584, + "grad_norm": 0.22574108839035034, + "learning_rate": 1.7919921875e-05, + "loss": 0.0965, + "step": 6870 + }, + { + "epoch": 2.6110056925996203, + "grad_norm": 0.6627634763717651, + "learning_rate": 1.787109375e-05, + "loss": 0.0622, + "step": 6880 + }, + { + "epoch": 2.6148007590132827, + "grad_norm": 0.17045138776302338, + "learning_rate": 1.7822265625000002e-05, + "loss": 0.0471, + "step": 6890 + }, + { + "epoch": 2.618595825426945, + "grad_norm": 0.31901392340660095, + "learning_rate": 1.77734375e-05, + "loss": 0.0607, + "step": 6900 + }, + { + "epoch": 2.6223908918406074, + "grad_norm": 0.21171316504478455, + "learning_rate": 1.7724609375e-05, + "loss": 0.0789, + "step": 6910 + }, + { + "epoch": 2.6261859582542693, + "grad_norm": 0.8109591007232666, + "learning_rate": 1.767578125e-05, + "loss": 0.0973, + "step": 6920 + }, + { + "epoch": 2.6299810246679316, + "grad_norm": 2.583545446395874, + "learning_rate": 1.7626953125e-05, + "loss": 0.0512, + "step": 6930 + }, + { + "epoch": 2.633776091081594, + "grad_norm": 1.5937598943710327, + "learning_rate": 1.7578125000000002e-05, + "loss": 0.0861, + "step": 6940 + }, + { + "epoch": 2.6375711574952563, + "grad_norm": 1.3143688440322876, + "learning_rate": 1.7529296875e-05, + "loss": 0.098, + "step": 6950 + }, + { + "epoch": 2.6413662239089186, + "grad_norm": 2.390667676925659, + "learning_rate": 1.748046875e-05, + "loss": 0.0621, + "step": 6960 + }, + { + "epoch": 2.6451612903225805, + "grad_norm": 0.30924805998802185, + "learning_rate": 1.7431640625e-05, + "loss": 0.0807, + "step": 6970 + }, + { + "epoch": 2.648956356736243, + "grad_norm": 1.6821314096450806, + "learning_rate": 1.73828125e-05, + "loss": 0.0598, + "step": 6980 + }, + { + "epoch": 2.652751423149905, + "grad_norm": 1.8624871969223022, + "learning_rate": 1.7333984375000002e-05, + "loss": 0.0841, + "step": 6990 + }, + { + "epoch": 2.656546489563567, + "grad_norm": 1.0055333375930786, + "learning_rate": 1.728515625e-05, + "loss": 0.0853, + "step": 7000 + }, + { + "epoch": 2.6603415559772294, + "grad_norm": 0.11686267703771591, + "learning_rate": 1.7236328125000002e-05, + "loss": 0.0455, + "step": 7010 + }, + { + "epoch": 2.6641366223908918, + "grad_norm": 5.000795841217041, + "learning_rate": 1.71875e-05, + "loss": 0.1102, + "step": 7020 + }, + { + "epoch": 2.667931688804554, + "grad_norm": 5.362839221954346, + "learning_rate": 1.7138671875e-05, + "loss": 0.0864, + "step": 7030 + }, + { + "epoch": 2.6717267552182165, + "grad_norm": 4.031505584716797, + "learning_rate": 1.708984375e-05, + "loss": 0.0753, + "step": 7040 + }, + { + "epoch": 2.675521821631879, + "grad_norm": 3.553187608718872, + "learning_rate": 1.7041015625e-05, + "loss": 0.0802, + "step": 7050 + }, + { + "epoch": 2.6793168880455407, + "grad_norm": 2.1504125595092773, + "learning_rate": 1.6992187500000002e-05, + "loss": 0.0798, + "step": 7060 + }, + { + "epoch": 2.683111954459203, + "grad_norm": 0.17360809445381165, + "learning_rate": 1.6943359375e-05, + "loss": 0.1064, + "step": 7070 + }, + { + "epoch": 2.6869070208728654, + "grad_norm": 0.16311465203762054, + "learning_rate": 1.689453125e-05, + "loss": 0.1194, + "step": 7080 + }, + { + "epoch": 2.6907020872865273, + "grad_norm": 3.6088805198669434, + "learning_rate": 1.6845703125e-05, + "loss": 0.0586, + "step": 7090 + }, + { + "epoch": 2.6944971537001896, + "grad_norm": 5.143406867980957, + "learning_rate": 1.6796875e-05, + "loss": 0.0892, + "step": 7100 + }, + { + "epoch": 2.698292220113852, + "grad_norm": 27.002168655395508, + "learning_rate": 1.6748046875000002e-05, + "loss": 0.089, + "step": 7110 + }, + { + "epoch": 2.7020872865275143, + "grad_norm": 1.443231225013733, + "learning_rate": 1.669921875e-05, + "loss": 0.1328, + "step": 7120 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 7.007279396057129, + "learning_rate": 1.6650390625e-05, + "loss": 0.0652, + "step": 7130 + }, + { + "epoch": 2.709677419354839, + "grad_norm": 0.25469958782196045, + "learning_rate": 1.66015625e-05, + "loss": 0.045, + "step": 7140 + }, + { + "epoch": 2.713472485768501, + "grad_norm": 4.693950653076172, + "learning_rate": 1.6552734375e-05, + "loss": 0.1245, + "step": 7150 + }, + { + "epoch": 2.717267552182163, + "grad_norm": 0.3287486732006073, + "learning_rate": 1.6503906250000002e-05, + "loss": 0.068, + "step": 7160 + }, + { + "epoch": 2.7210626185958255, + "grad_norm": 9.82812786102295, + "learning_rate": 1.6455078125e-05, + "loss": 0.0909, + "step": 7170 + }, + { + "epoch": 2.7248576850094874, + "grad_norm": 14.501320838928223, + "learning_rate": 1.6406250000000002e-05, + "loss": 0.0972, + "step": 7180 + }, + { + "epoch": 2.72865275142315, + "grad_norm": 5.130281448364258, + "learning_rate": 1.6357421875e-05, + "loss": 0.1253, + "step": 7190 + }, + { + "epoch": 2.732447817836812, + "grad_norm": 3.5541763305664062, + "learning_rate": 1.630859375e-05, + "loss": 0.0822, + "step": 7200 + }, + { + "epoch": 2.7362428842504745, + "grad_norm": 0.9670690894126892, + "learning_rate": 1.6259765625e-05, + "loss": 0.0231, + "step": 7210 + }, + { + "epoch": 2.740037950664137, + "grad_norm": 0.676513135433197, + "learning_rate": 1.62109375e-05, + "loss": 0.0972, + "step": 7220 + }, + { + "epoch": 2.7438330170777987, + "grad_norm": 7.5943217277526855, + "learning_rate": 1.6162109375000002e-05, + "loss": 0.0989, + "step": 7230 + }, + { + "epoch": 2.747628083491461, + "grad_norm": 0.20399871468544006, + "learning_rate": 1.611328125e-05, + "loss": 0.1036, + "step": 7240 + }, + { + "epoch": 2.7514231499051234, + "grad_norm": 0.43629199266433716, + "learning_rate": 1.6064453125e-05, + "loss": 0.0311, + "step": 7250 + }, + { + "epoch": 2.7552182163187857, + "grad_norm": 1.144394040107727, + "learning_rate": 1.6015625e-05, + "loss": 0.0815, + "step": 7260 + }, + { + "epoch": 2.7590132827324476, + "grad_norm": 0.06812827289104462, + "learning_rate": 1.5966796875e-05, + "loss": 0.0539, + "step": 7270 + }, + { + "epoch": 2.76280834914611, + "grad_norm": 2.913031578063965, + "learning_rate": 1.5917968750000002e-05, + "loss": 0.0443, + "step": 7280 + }, + { + "epoch": 2.7666034155597723, + "grad_norm": 2.4026944637298584, + "learning_rate": 1.5869140625e-05, + "loss": 0.0957, + "step": 7290 + }, + { + "epoch": 2.7703984819734346, + "grad_norm": 3.89658784866333, + "learning_rate": 1.58203125e-05, + "loss": 0.1125, + "step": 7300 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 0.4522351920604706, + "learning_rate": 1.5771484375e-05, + "loss": 0.0889, + "step": 7310 + }, + { + "epoch": 2.777988614800759, + "grad_norm": 5.769268989562988, + "learning_rate": 1.572265625e-05, + "loss": 0.0631, + "step": 7320 + }, + { + "epoch": 2.781783681214421, + "grad_norm": 1.7276089191436768, + "learning_rate": 1.5673828125000002e-05, + "loss": 0.091, + "step": 7330 + }, + { + "epoch": 2.7855787476280836, + "grad_norm": 2.0759644508361816, + "learning_rate": 1.5625e-05, + "loss": 0.0655, + "step": 7340 + }, + { + "epoch": 2.789373814041746, + "grad_norm": 0.7582204937934875, + "learning_rate": 1.5576171875000002e-05, + "loss": 0.0541, + "step": 7350 + }, + { + "epoch": 2.793168880455408, + "grad_norm": 16.55638885498047, + "learning_rate": 1.552734375e-05, + "loss": 0.1178, + "step": 7360 + }, + { + "epoch": 2.79696394686907, + "grad_norm": 0.7026536464691162, + "learning_rate": 1.5478515625e-05, + "loss": 0.0459, + "step": 7370 + }, + { + "epoch": 2.8007590132827325, + "grad_norm": 4.089038372039795, + "learning_rate": 1.54296875e-05, + "loss": 0.0663, + "step": 7380 + }, + { + "epoch": 2.804554079696395, + "grad_norm": 3.8286547660827637, + "learning_rate": 1.5380859375e-05, + "loss": 0.1096, + "step": 7390 + }, + { + "epoch": 2.808349146110057, + "grad_norm": 2.5993642807006836, + "learning_rate": 1.5332031250000002e-05, + "loss": 0.0685, + "step": 7400 + }, + { + "epoch": 2.812144212523719, + "grad_norm": 1.0880334377288818, + "learning_rate": 1.5283203125e-05, + "loss": 0.0631, + "step": 7410 + }, + { + "epoch": 2.8159392789373814, + "grad_norm": 1.036834478378296, + "learning_rate": 1.5234375000000001e-05, + "loss": 0.086, + "step": 7420 + }, + { + "epoch": 2.8197343453510437, + "grad_norm": 5.436180114746094, + "learning_rate": 1.5185546875000001e-05, + "loss": 0.1121, + "step": 7430 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 3.7009427547454834, + "learning_rate": 1.513671875e-05, + "loss": 0.0764, + "step": 7440 + }, + { + "epoch": 2.827324478178368, + "grad_norm": 2.5197298526763916, + "learning_rate": 1.5087890625e-05, + "loss": 0.082, + "step": 7450 + }, + { + "epoch": 2.8311195445920303, + "grad_norm": 3.15004563331604, + "learning_rate": 1.50390625e-05, + "loss": 0.112, + "step": 7460 + }, + { + "epoch": 2.8349146110056926, + "grad_norm": 2.9666614532470703, + "learning_rate": 1.4990234375e-05, + "loss": 0.0872, + "step": 7470 + }, + { + "epoch": 2.838709677419355, + "grad_norm": 6.0326385498046875, + "learning_rate": 1.4941406250000001e-05, + "loss": 0.0817, + "step": 7480 + }, + { + "epoch": 2.8425047438330173, + "grad_norm": 1.699873685836792, + "learning_rate": 1.4892578125000001e-05, + "loss": 0.0816, + "step": 7490 + }, + { + "epoch": 2.846299810246679, + "grad_norm": 0.14119946956634521, + "learning_rate": 1.484375e-05, + "loss": 0.0725, + "step": 7500 + }, + { + "epoch": 2.8500948766603416, + "grad_norm": 6.737262725830078, + "learning_rate": 1.4794921875e-05, + "loss": 0.1205, + "step": 7510 + }, + { + "epoch": 2.853889943074004, + "grad_norm": 4.460575103759766, + "learning_rate": 1.474609375e-05, + "loss": 0.123, + "step": 7520 + }, + { + "epoch": 2.857685009487666, + "grad_norm": 0.09714975953102112, + "learning_rate": 1.4697265625000001e-05, + "loss": 0.0687, + "step": 7530 + }, + { + "epoch": 2.861480075901328, + "grad_norm": 3.972470760345459, + "learning_rate": 1.4648437500000001e-05, + "loss": 0.1089, + "step": 7540 + }, + { + "epoch": 2.8652751423149905, + "grad_norm": 2.0776712894439697, + "learning_rate": 1.4599609375000001e-05, + "loss": 0.1318, + "step": 7550 + }, + { + "epoch": 2.869070208728653, + "grad_norm": 0.21448436379432678, + "learning_rate": 1.455078125e-05, + "loss": 0.0639, + "step": 7560 + }, + { + "epoch": 2.872865275142315, + "grad_norm": 0.19727276265621185, + "learning_rate": 1.4501953125e-05, + "loss": 0.0464, + "step": 7570 + }, + { + "epoch": 2.8766603415559775, + "grad_norm": 2.9958267211914062, + "learning_rate": 1.4453125e-05, + "loss": 0.0715, + "step": 7580 + }, + { + "epoch": 2.8804554079696394, + "grad_norm": 1.823538064956665, + "learning_rate": 1.4404296875000001e-05, + "loss": 0.0781, + "step": 7590 + }, + { + "epoch": 2.8842504743833017, + "grad_norm": 2.5351407527923584, + "learning_rate": 1.4355468750000001e-05, + "loss": 0.0888, + "step": 7600 + }, + { + "epoch": 2.888045540796964, + "grad_norm": 4.274851322174072, + "learning_rate": 1.4306640625000002e-05, + "loss": 0.0228, + "step": 7610 + }, + { + "epoch": 2.891840607210626, + "grad_norm": 4.665604591369629, + "learning_rate": 1.42578125e-05, + "loss": 0.083, + "step": 7620 + }, + { + "epoch": 2.8956356736242883, + "grad_norm": 4.373048782348633, + "learning_rate": 1.4208984375e-05, + "loss": 0.0936, + "step": 7630 + }, + { + "epoch": 2.8994307400379506, + "grad_norm": 1.5743074417114258, + "learning_rate": 1.416015625e-05, + "loss": 0.0414, + "step": 7640 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 2.3043341636657715, + "learning_rate": 1.4111328125000001e-05, + "loss": 0.0739, + "step": 7650 + }, + { + "epoch": 2.9070208728652753, + "grad_norm": 2.980686902999878, + "learning_rate": 1.4062500000000001e-05, + "loss": 0.0755, + "step": 7660 + }, + { + "epoch": 2.9108159392789372, + "grad_norm": 0.5928072929382324, + "learning_rate": 1.4013671875e-05, + "loss": 0.116, + "step": 7670 + }, + { + "epoch": 2.9146110056925996, + "grad_norm": 0.14647921919822693, + "learning_rate": 1.396484375e-05, + "loss": 0.0367, + "step": 7680 + }, + { + "epoch": 2.918406072106262, + "grad_norm": 6.466022968292236, + "learning_rate": 1.3916015625e-05, + "loss": 0.0365, + "step": 7690 + }, + { + "epoch": 2.9222011385199242, + "grad_norm": 13.139077186584473, + "learning_rate": 1.38671875e-05, + "loss": 0.1295, + "step": 7700 + }, + { + "epoch": 2.925996204933586, + "grad_norm": 0.3945586383342743, + "learning_rate": 1.3818359375000001e-05, + "loss": 0.0559, + "step": 7710 + }, + { + "epoch": 2.9297912713472485, + "grad_norm": 0.04980861395597458, + "learning_rate": 1.3769531250000001e-05, + "loss": 0.0485, + "step": 7720 + }, + { + "epoch": 2.933586337760911, + "grad_norm": 2.388545513153076, + "learning_rate": 1.3720703125e-05, + "loss": 0.0542, + "step": 7730 + }, + { + "epoch": 2.937381404174573, + "grad_norm": 2.4082882404327393, + "learning_rate": 1.3671875e-05, + "loss": 0.0939, + "step": 7740 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 4.933741569519043, + "learning_rate": 1.3623046875e-05, + "loss": 0.1409, + "step": 7750 + }, + { + "epoch": 2.9449715370018974, + "grad_norm": 5.57550573348999, + "learning_rate": 1.3574218750000001e-05, + "loss": 0.0646, + "step": 7760 + }, + { + "epoch": 2.9487666034155597, + "grad_norm": 1.8403911590576172, + "learning_rate": 1.3525390625000001e-05, + "loss": 0.0694, + "step": 7770 + }, + { + "epoch": 2.952561669829222, + "grad_norm": 6.1294331550598145, + "learning_rate": 1.3476562500000001e-05, + "loss": 0.0476, + "step": 7780 + }, + { + "epoch": 2.956356736242884, + "grad_norm": 0.0652192234992981, + "learning_rate": 1.3427734375e-05, + "loss": 0.0634, + "step": 7790 + }, + { + "epoch": 2.9601518026565463, + "grad_norm": 2.2705845832824707, + "learning_rate": 1.337890625e-05, + "loss": 0.0577, + "step": 7800 + }, + { + "epoch": 2.9639468690702087, + "grad_norm": 0.12686532735824585, + "learning_rate": 1.3330078125e-05, + "loss": 0.0948, + "step": 7810 + }, + { + "epoch": 2.967741935483871, + "grad_norm": 3.2810075283050537, + "learning_rate": 1.3281250000000001e-05, + "loss": 0.0813, + "step": 7820 + }, + { + "epoch": 2.9715370018975333, + "grad_norm": 2.2181339263916016, + "learning_rate": 1.3232421875000001e-05, + "loss": 0.1022, + "step": 7830 + }, + { + "epoch": 2.9753320683111957, + "grad_norm": 1.6737946271896362, + "learning_rate": 1.318359375e-05, + "loss": 0.0557, + "step": 7840 + }, + { + "epoch": 2.9791271347248576, + "grad_norm": 7.780960559844971, + "learning_rate": 1.3134765625e-05, + "loss": 0.0978, + "step": 7850 + }, + { + "epoch": 2.98292220113852, + "grad_norm": 8.983189582824707, + "learning_rate": 1.30859375e-05, + "loss": 0.0601, + "step": 7860 + }, + { + "epoch": 2.9867172675521823, + "grad_norm": 4.744899272918701, + "learning_rate": 1.3037109375e-05, + "loss": 0.0418, + "step": 7870 + }, + { + "epoch": 2.990512333965844, + "grad_norm": 2.1875483989715576, + "learning_rate": 1.2988281250000001e-05, + "loss": 0.0746, + "step": 7880 + }, + { + "epoch": 2.9943074003795065, + "grad_norm": 1.506842017173767, + "learning_rate": 1.2939453125000001e-05, + "loss": 0.0868, + "step": 7890 + }, + { + "epoch": 2.998102466793169, + "grad_norm": 2.1302731037139893, + "learning_rate": 1.2890625e-05, + "loss": 0.0687, + "step": 7900 + }, + { + "epoch": 3.001897533206831, + "grad_norm": 2.632828950881958, + "learning_rate": 1.2841796875e-05, + "loss": 0.0705, + "step": 7910 + }, + { + "epoch": 3.0056925996204935, + "grad_norm": 0.15800461173057556, + "learning_rate": 1.279296875e-05, + "loss": 0.0522, + "step": 7920 + }, + { + "epoch": 3.0094876660341554, + "grad_norm": 0.13846412301063538, + "learning_rate": 1.2744140625e-05, + "loss": 0.0363, + "step": 7930 + }, + { + "epoch": 3.0132827324478177, + "grad_norm": 4.117944717407227, + "learning_rate": 1.2695312500000001e-05, + "loss": 0.0605, + "step": 7940 + }, + { + "epoch": 3.01707779886148, + "grad_norm": 1.4927798509597778, + "learning_rate": 1.2646484375000001e-05, + "loss": 0.0346, + "step": 7950 + }, + { + "epoch": 3.0208728652751424, + "grad_norm": 4.367966175079346, + "learning_rate": 1.259765625e-05, + "loss": 0.0458, + "step": 7960 + }, + { + "epoch": 3.0246679316888048, + "grad_norm": 2.0026087760925293, + "learning_rate": 1.2548828125e-05, + "loss": 0.0749, + "step": 7970 + }, + { + "epoch": 3.0284629981024667, + "grad_norm": 2.106546640396118, + "learning_rate": 1.25e-05, + "loss": 0.065, + "step": 7980 + }, + { + "epoch": 3.032258064516129, + "grad_norm": 4.122467994689941, + "learning_rate": 1.2451171875000001e-05, + "loss": 0.0475, + "step": 7990 + }, + { + "epoch": 3.0360531309297913, + "grad_norm": 0.08205808699131012, + "learning_rate": 1.2402343750000001e-05, + "loss": 0.0692, + "step": 8000 + }, + { + "epoch": 3.0398481973434537, + "grad_norm": 1.0389831066131592, + "learning_rate": 1.2353515625e-05, + "loss": 0.0514, + "step": 8010 + }, + { + "epoch": 3.0436432637571156, + "grad_norm": 0.1080293357372284, + "learning_rate": 1.23046875e-05, + "loss": 0.0385, + "step": 8020 + }, + { + "epoch": 3.047438330170778, + "grad_norm": 0.2515338361263275, + "learning_rate": 1.2255859375e-05, + "loss": 0.0835, + "step": 8030 + }, + { + "epoch": 3.0512333965844403, + "grad_norm": 1.1087881326675415, + "learning_rate": 1.220703125e-05, + "loss": 0.0559, + "step": 8040 + }, + { + "epoch": 3.0550284629981026, + "grad_norm": 1.1088217496871948, + "learning_rate": 1.2158203125000001e-05, + "loss": 0.075, + "step": 8050 + }, + { + "epoch": 3.0588235294117645, + "grad_norm": 3.310959577560425, + "learning_rate": 1.2109375000000001e-05, + "loss": 0.0596, + "step": 8060 + }, + { + "epoch": 3.062618595825427, + "grad_norm": 1.186274766921997, + "learning_rate": 1.2060546875e-05, + "loss": 0.0399, + "step": 8070 + }, + { + "epoch": 3.066413662239089, + "grad_norm": 3.054225444793701, + "learning_rate": 1.201171875e-05, + "loss": 0.0352, + "step": 8080 + }, + { + "epoch": 3.0702087286527515, + "grad_norm": 0.3610187768936157, + "learning_rate": 1.1962890625e-05, + "loss": 0.0519, + "step": 8090 + }, + { + "epoch": 3.074003795066414, + "grad_norm": 1.7858855724334717, + "learning_rate": 1.19140625e-05, + "loss": 0.0712, + "step": 8100 + }, + { + "epoch": 3.0777988614800758, + "grad_norm": 3.144697666168213, + "learning_rate": 1.1865234375000001e-05, + "loss": 0.0343, + "step": 8110 + }, + { + "epoch": 3.081593927893738, + "grad_norm": 1.743668556213379, + "learning_rate": 1.1816406250000001e-05, + "loss": 0.0611, + "step": 8120 + }, + { + "epoch": 3.0853889943074004, + "grad_norm": 0.6149533987045288, + "learning_rate": 1.1767578125e-05, + "loss": 0.0512, + "step": 8130 + }, + { + "epoch": 3.0891840607210628, + "grad_norm": 6.247795581817627, + "learning_rate": 1.171875e-05, + "loss": 0.0741, + "step": 8140 + }, + { + "epoch": 3.0929791271347247, + "grad_norm": 0.8566815853118896, + "learning_rate": 1.1669921875e-05, + "loss": 0.0699, + "step": 8150 + }, + { + "epoch": 3.096774193548387, + "grad_norm": 3.2794229984283447, + "learning_rate": 1.162109375e-05, + "loss": 0.0296, + "step": 8160 + }, + { + "epoch": 3.1005692599620494, + "grad_norm": 0.10005365312099457, + "learning_rate": 1.1572265625000001e-05, + "loss": 0.0645, + "step": 8170 + }, + { + "epoch": 3.1043643263757117, + "grad_norm": 2.8992691040039062, + "learning_rate": 1.15234375e-05, + "loss": 0.0456, + "step": 8180 + }, + { + "epoch": 3.108159392789374, + "grad_norm": 3.6778674125671387, + "learning_rate": 1.1474609375e-05, + "loss": 0.0351, + "step": 8190 + }, + { + "epoch": 3.111954459203036, + "grad_norm": 1.5398664474487305, + "learning_rate": 1.142578125e-05, + "loss": 0.042, + "step": 8200 + }, + { + "epoch": 3.1157495256166983, + "grad_norm": 0.05135444924235344, + "learning_rate": 1.1376953125e-05, + "loss": 0.0478, + "step": 8210 + }, + { + "epoch": 3.1195445920303606, + "grad_norm": 0.6804483532905579, + "learning_rate": 1.1328125000000001e-05, + "loss": 0.058, + "step": 8220 + }, + { + "epoch": 3.123339658444023, + "grad_norm": 0.10011663287878036, + "learning_rate": 1.1279296875000001e-05, + "loss": 0.0456, + "step": 8230 + }, + { + "epoch": 3.127134724857685, + "grad_norm": 0.466981440782547, + "learning_rate": 1.123046875e-05, + "loss": 0.0449, + "step": 8240 + }, + { + "epoch": 3.130929791271347, + "grad_norm": 2.163849353790283, + "learning_rate": 1.1181640625e-05, + "loss": 0.0595, + "step": 8250 + }, + { + "epoch": 3.1347248576850095, + "grad_norm": 1.1013680696487427, + "learning_rate": 1.11328125e-05, + "loss": 0.0708, + "step": 8260 + }, + { + "epoch": 3.138519924098672, + "grad_norm": 8.969820022583008, + "learning_rate": 1.1083984375e-05, + "loss": 0.064, + "step": 8270 + }, + { + "epoch": 3.1423149905123338, + "grad_norm": 1.1106621026992798, + "learning_rate": 1.1035156250000001e-05, + "loss": 0.1007, + "step": 8280 + }, + { + "epoch": 3.146110056925996, + "grad_norm": 0.1508377343416214, + "learning_rate": 1.0986328125000001e-05, + "loss": 0.0464, + "step": 8290 + }, + { + "epoch": 3.1499051233396584, + "grad_norm": 0.07330877333879471, + "learning_rate": 1.09375e-05, + "loss": 0.0797, + "step": 8300 + }, + { + "epoch": 3.153700189753321, + "grad_norm": 1.6159915924072266, + "learning_rate": 1.0888671875e-05, + "loss": 0.0527, + "step": 8310 + }, + { + "epoch": 3.157495256166983, + "grad_norm": 0.5196408629417419, + "learning_rate": 1.083984375e-05, + "loss": 0.0433, + "step": 8320 + }, + { + "epoch": 3.161290322580645, + "grad_norm": 2.486041307449341, + "learning_rate": 1.0791015625e-05, + "loss": 0.0651, + "step": 8330 + }, + { + "epoch": 3.1650853889943074, + "grad_norm": 1.0713788270950317, + "learning_rate": 1.0742187500000001e-05, + "loss": 0.0695, + "step": 8340 + }, + { + "epoch": 3.1688804554079697, + "grad_norm": 0.19154168665409088, + "learning_rate": 1.0693359375e-05, + "loss": 0.0364, + "step": 8350 + }, + { + "epoch": 3.172675521821632, + "grad_norm": 0.31223466992378235, + "learning_rate": 1.064453125e-05, + "loss": 0.0267, + "step": 8360 + }, + { + "epoch": 3.176470588235294, + "grad_norm": 0.7767817378044128, + "learning_rate": 1.0595703125e-05, + "loss": 0.0635, + "step": 8370 + }, + { + "epoch": 3.1802656546489563, + "grad_norm": 2.4257445335388184, + "learning_rate": 1.0546875e-05, + "loss": 0.0588, + "step": 8380 + }, + { + "epoch": 3.1840607210626186, + "grad_norm": 1.2349954843521118, + "learning_rate": 1.0498046875000001e-05, + "loss": 0.0557, + "step": 8390 + }, + { + "epoch": 3.187855787476281, + "grad_norm": 3.209284543991089, + "learning_rate": 1.0449218750000001e-05, + "loss": 0.0459, + "step": 8400 + }, + { + "epoch": 3.191650853889943, + "grad_norm": 0.16265904903411865, + "learning_rate": 1.0400390625e-05, + "loss": 0.0525, + "step": 8410 + }, + { + "epoch": 3.195445920303605, + "grad_norm": 0.6664568781852722, + "learning_rate": 1.03515625e-05, + "loss": 0.0727, + "step": 8420 + }, + { + "epoch": 3.1992409867172675, + "grad_norm": 0.9481377005577087, + "learning_rate": 1.0302734375e-05, + "loss": 0.0215, + "step": 8430 + }, + { + "epoch": 3.20303605313093, + "grad_norm": 5.600297451019287, + "learning_rate": 1.025390625e-05, + "loss": 0.0385, + "step": 8440 + }, + { + "epoch": 3.206831119544592, + "grad_norm": 0.15000663697719574, + "learning_rate": 1.0205078125000001e-05, + "loss": 0.0659, + "step": 8450 + }, + { + "epoch": 3.210626185958254, + "grad_norm": 0.6691407561302185, + "learning_rate": 1.0156250000000001e-05, + "loss": 0.0666, + "step": 8460 + }, + { + "epoch": 3.2144212523719164, + "grad_norm": 1.3882899284362793, + "learning_rate": 1.0107421875e-05, + "loss": 0.0815, + "step": 8470 + }, + { + "epoch": 3.218216318785579, + "grad_norm": 1.0314580202102661, + "learning_rate": 1.005859375e-05, + "loss": 0.0178, + "step": 8480 + }, + { + "epoch": 3.222011385199241, + "grad_norm": 3.9537134170532227, + "learning_rate": 1.0009765625e-05, + "loss": 0.0631, + "step": 8490 + }, + { + "epoch": 3.225806451612903, + "grad_norm": 5.446588039398193, + "learning_rate": 9.9609375e-06, + "loss": 0.0548, + "step": 8500 + }, + { + "epoch": 3.2296015180265654, + "grad_norm": 8.026607513427734, + "learning_rate": 9.912109375000001e-06, + "loss": 0.0353, + "step": 8510 + }, + { + "epoch": 3.2333965844402277, + "grad_norm": 0.1389143019914627, + "learning_rate": 9.863281250000001e-06, + "loss": 0.0419, + "step": 8520 + }, + { + "epoch": 3.23719165085389, + "grad_norm": 1.255216121673584, + "learning_rate": 9.814453125e-06, + "loss": 0.0697, + "step": 8530 + }, + { + "epoch": 3.2409867172675524, + "grad_norm": 4.600146770477295, + "learning_rate": 9.765625e-06, + "loss": 0.0879, + "step": 8540 + }, + { + "epoch": 3.2447817836812143, + "grad_norm": 0.09613824635744095, + "learning_rate": 9.716796875e-06, + "loss": 0.0122, + "step": 8550 + }, + { + "epoch": 3.2485768500948766, + "grad_norm": 1.0265446901321411, + "learning_rate": 9.66796875e-06, + "loss": 0.0227, + "step": 8560 + }, + { + "epoch": 3.252371916508539, + "grad_norm": 2.185931444168091, + "learning_rate": 9.619140625000001e-06, + "loss": 0.1162, + "step": 8570 + }, + { + "epoch": 3.2561669829222013, + "grad_norm": 0.1482323259115219, + "learning_rate": 9.5703125e-06, + "loss": 0.0581, + "step": 8580 + }, + { + "epoch": 3.259962049335863, + "grad_norm": 0.17460452020168304, + "learning_rate": 9.521484375e-06, + "loss": 0.0399, + "step": 8590 + }, + { + "epoch": 3.2637571157495255, + "grad_norm": 1.6274187564849854, + "learning_rate": 9.47265625e-06, + "loss": 0.0537, + "step": 8600 + }, + { + "epoch": 3.267552182163188, + "grad_norm": 8.227033615112305, + "learning_rate": 9.423828125e-06, + "loss": 0.0646, + "step": 8610 + }, + { + "epoch": 3.27134724857685, + "grad_norm": 0.08734069019556046, + "learning_rate": 9.375000000000001e-06, + "loss": 0.0675, + "step": 8620 + }, + { + "epoch": 3.2751423149905126, + "grad_norm": 0.5700662732124329, + "learning_rate": 9.326171875000001e-06, + "loss": 0.0744, + "step": 8630 + }, + { + "epoch": 3.2789373814041745, + "grad_norm": 2.089008092880249, + "learning_rate": 9.27734375e-06, + "loss": 0.0812, + "step": 8640 + }, + { + "epoch": 3.282732447817837, + "grad_norm": 0.11990799009799957, + "learning_rate": 9.228515625e-06, + "loss": 0.071, + "step": 8650 + }, + { + "epoch": 3.286527514231499, + "grad_norm": 0.5663464665412903, + "learning_rate": 9.1796875e-06, + "loss": 0.0279, + "step": 8660 + }, + { + "epoch": 3.2903225806451615, + "grad_norm": 0.8847103118896484, + "learning_rate": 9.130859375e-06, + "loss": 0.0473, + "step": 8670 + }, + { + "epoch": 3.2941176470588234, + "grad_norm": 0.08891147375106812, + "learning_rate": 9.082031250000001e-06, + "loss": 0.041, + "step": 8680 + }, + { + "epoch": 3.2979127134724857, + "grad_norm": 0.0875004231929779, + "learning_rate": 9.033203125000001e-06, + "loss": 0.0284, + "step": 8690 + }, + { + "epoch": 3.301707779886148, + "grad_norm": 0.353773832321167, + "learning_rate": 8.984375e-06, + "loss": 0.0451, + "step": 8700 + }, + { + "epoch": 3.3055028462998104, + "grad_norm": 0.03987530991435051, + "learning_rate": 8.935546875e-06, + "loss": 0.0803, + "step": 8710 + }, + { + "epoch": 3.3092979127134727, + "grad_norm": 2.087677001953125, + "learning_rate": 8.88671875e-06, + "loss": 0.0257, + "step": 8720 + }, + { + "epoch": 3.3130929791271346, + "grad_norm": 4.051992893218994, + "learning_rate": 8.837890625e-06, + "loss": 0.0345, + "step": 8730 + }, + { + "epoch": 3.316888045540797, + "grad_norm": 3.694368362426758, + "learning_rate": 8.789062500000001e-06, + "loss": 0.0824, + "step": 8740 + }, + { + "epoch": 3.3206831119544593, + "grad_norm": 0.09131748974323273, + "learning_rate": 8.740234375e-06, + "loss": 0.0295, + "step": 8750 + }, + { + "epoch": 3.324478178368121, + "grad_norm": 0.05908443033695221, + "learning_rate": 8.69140625e-06, + "loss": 0.0282, + "step": 8760 + }, + { + "epoch": 3.3282732447817835, + "grad_norm": 1.863980770111084, + "learning_rate": 8.642578125e-06, + "loss": 0.0442, + "step": 8770 + }, + { + "epoch": 3.332068311195446, + "grad_norm": 1.2207703590393066, + "learning_rate": 8.59375e-06, + "loss": 0.0316, + "step": 8780 + }, + { + "epoch": 3.3358633776091082, + "grad_norm": 2.562156915664673, + "learning_rate": 8.544921875e-06, + "loss": 0.0598, + "step": 8790 + }, + { + "epoch": 3.3396584440227706, + "grad_norm": 5.533409595489502, + "learning_rate": 8.496093750000001e-06, + "loss": 0.0432, + "step": 8800 + }, + { + "epoch": 3.3434535104364325, + "grad_norm": 0.47492659091949463, + "learning_rate": 8.447265625e-06, + "loss": 0.0528, + "step": 8810 + }, + { + "epoch": 3.347248576850095, + "grad_norm": 1.0108855962753296, + "learning_rate": 8.3984375e-06, + "loss": 0.0552, + "step": 8820 + }, + { + "epoch": 3.351043643263757, + "grad_norm": 1.780705451965332, + "learning_rate": 8.349609375e-06, + "loss": 0.0252, + "step": 8830 + }, + { + "epoch": 3.3548387096774195, + "grad_norm": 0.3152208924293518, + "learning_rate": 8.30078125e-06, + "loss": 0.0915, + "step": 8840 + }, + { + "epoch": 3.3586337760910814, + "grad_norm": 1.9720813035964966, + "learning_rate": 8.251953125000001e-06, + "loss": 0.0571, + "step": 8850 + }, + { + "epoch": 3.3624288425047437, + "grad_norm": 0.5636972784996033, + "learning_rate": 8.203125000000001e-06, + "loss": 0.0716, + "step": 8860 + }, + { + "epoch": 3.366223908918406, + "grad_norm": 9.523944854736328, + "learning_rate": 8.154296875e-06, + "loss": 0.0649, + "step": 8870 + }, + { + "epoch": 3.3700189753320684, + "grad_norm": 1.868201732635498, + "learning_rate": 8.10546875e-06, + "loss": 0.1055, + "step": 8880 + }, + { + "epoch": 3.3738140417457307, + "grad_norm": 4.064790725708008, + "learning_rate": 8.056640625e-06, + "loss": 0.0681, + "step": 8890 + }, + { + "epoch": 3.3776091081593926, + "grad_norm": 5.854636192321777, + "learning_rate": 8.0078125e-06, + "loss": 0.0755, + "step": 8900 + }, + { + "epoch": 3.381404174573055, + "grad_norm": 0.47955596446990967, + "learning_rate": 7.958984375000001e-06, + "loss": 0.0832, + "step": 8910 + }, + { + "epoch": 3.3851992409867173, + "grad_norm": 0.48627012968063354, + "learning_rate": 7.91015625e-06, + "loss": 0.0487, + "step": 8920 + }, + { + "epoch": 3.3889943074003797, + "grad_norm": 1.4986870288848877, + "learning_rate": 7.861328125e-06, + "loss": 0.0769, + "step": 8930 + }, + { + "epoch": 3.3927893738140416, + "grad_norm": 1.139615774154663, + "learning_rate": 7.8125e-06, + "loss": 0.0238, + "step": 8940 + }, + { + "epoch": 3.396584440227704, + "grad_norm": 0.17134952545166016, + "learning_rate": 7.763671875e-06, + "loss": 0.072, + "step": 8950 + }, + { + "epoch": 3.4003795066413662, + "grad_norm": 0.15060165524482727, + "learning_rate": 7.71484375e-06, + "loss": 0.0607, + "step": 8960 + }, + { + "epoch": 3.4041745730550286, + "grad_norm": 1.0973819494247437, + "learning_rate": 7.666015625000001e-06, + "loss": 0.0914, + "step": 8970 + }, + { + "epoch": 3.407969639468691, + "grad_norm": 4.7881951332092285, + "learning_rate": 7.6171875000000005e-06, + "loss": 0.0515, + "step": 8980 + }, + { + "epoch": 3.411764705882353, + "grad_norm": 2.9025986194610596, + "learning_rate": 7.568359375e-06, + "loss": 0.0576, + "step": 8990 + }, + { + "epoch": 3.415559772296015, + "grad_norm": 0.07781478762626648, + "learning_rate": 7.51953125e-06, + "loss": 0.0318, + "step": 9000 + }, + { + "epoch": 3.4193548387096775, + "grad_norm": 2.8141448497772217, + "learning_rate": 7.4707031250000005e-06, + "loss": 0.0598, + "step": 9010 + }, + { + "epoch": 3.42314990512334, + "grad_norm": 1.2371045351028442, + "learning_rate": 7.421875e-06, + "loss": 0.1014, + "step": 9020 + }, + { + "epoch": 3.4269449715370017, + "grad_norm": 0.11280115693807602, + "learning_rate": 7.373046875e-06, + "loss": 0.0571, + "step": 9030 + }, + { + "epoch": 3.430740037950664, + "grad_norm": 0.07071410119533539, + "learning_rate": 7.3242187500000006e-06, + "loss": 0.0289, + "step": 9040 + }, + { + "epoch": 3.4345351043643264, + "grad_norm": 0.07948953658342361, + "learning_rate": 7.275390625e-06, + "loss": 0.0328, + "step": 9050 + }, + { + "epoch": 3.4383301707779887, + "grad_norm": 6.166849613189697, + "learning_rate": 7.2265625e-06, + "loss": 0.0501, + "step": 9060 + }, + { + "epoch": 3.442125237191651, + "grad_norm": 0.3815774619579315, + "learning_rate": 7.177734375000001e-06, + "loss": 0.0449, + "step": 9070 + }, + { + "epoch": 3.445920303605313, + "grad_norm": 0.21274378895759583, + "learning_rate": 7.12890625e-06, + "loss": 0.0871, + "step": 9080 + }, + { + "epoch": 3.4497153700189753, + "grad_norm": 0.5041061043739319, + "learning_rate": 7.080078125e-06, + "loss": 0.0451, + "step": 9090 + }, + { + "epoch": 3.4535104364326377, + "grad_norm": 2.4566073417663574, + "learning_rate": 7.031250000000001e-06, + "loss": 0.0622, + "step": 9100 + }, + { + "epoch": 3.4573055028462996, + "grad_norm": 5.31998872756958, + "learning_rate": 6.982421875e-06, + "loss": 0.0545, + "step": 9110 + }, + { + "epoch": 3.461100569259962, + "grad_norm": 0.2531034052371979, + "learning_rate": 6.93359375e-06, + "loss": 0.0449, + "step": 9120 + }, + { + "epoch": 3.4648956356736242, + "grad_norm": 0.03640067204833031, + "learning_rate": 6.884765625000001e-06, + "loss": 0.0944, + "step": 9130 + }, + { + "epoch": 3.4686907020872866, + "grad_norm": 0.9717852473258972, + "learning_rate": 6.8359375e-06, + "loss": 0.0165, + "step": 9140 + }, + { + "epoch": 3.472485768500949, + "grad_norm": 1.4924548864364624, + "learning_rate": 6.7871093750000004e-06, + "loss": 0.069, + "step": 9150 + }, + { + "epoch": 3.476280834914611, + "grad_norm": 2.620271682739258, + "learning_rate": 6.738281250000001e-06, + "loss": 0.0967, + "step": 9160 + }, + { + "epoch": 3.480075901328273, + "grad_norm": 2.279548406600952, + "learning_rate": 6.689453125e-06, + "loss": 0.0257, + "step": 9170 + }, + { + "epoch": 3.4838709677419355, + "grad_norm": 0.08608423173427582, + "learning_rate": 6.6406250000000005e-06, + "loss": 0.0359, + "step": 9180 + }, + { + "epoch": 3.487666034155598, + "grad_norm": 5.201995849609375, + "learning_rate": 6.591796875e-06, + "loss": 0.0349, + "step": 9190 + }, + { + "epoch": 3.4914611005692597, + "grad_norm": 0.6848796606063843, + "learning_rate": 6.54296875e-06, + "loss": 0.0473, + "step": 9200 + }, + { + "epoch": 3.495256166982922, + "grad_norm": 1.0673704147338867, + "learning_rate": 6.4941406250000005e-06, + "loss": 0.0751, + "step": 9210 + }, + { + "epoch": 3.4990512333965844, + "grad_norm": 6.374655723571777, + "learning_rate": 6.4453125e-06, + "loss": 0.0672, + "step": 9220 + }, + { + "epoch": 3.5028462998102468, + "grad_norm": 3.0670387744903564, + "learning_rate": 6.396484375e-06, + "loss": 0.1047, + "step": 9230 + }, + { + "epoch": 3.506641366223909, + "grad_norm": 2.0058538913726807, + "learning_rate": 6.3476562500000006e-06, + "loss": 0.0571, + "step": 9240 + }, + { + "epoch": 3.510436432637571, + "grad_norm": 0.8808121681213379, + "learning_rate": 6.298828125e-06, + "loss": 0.0742, + "step": 9250 + }, + { + "epoch": 3.5142314990512333, + "grad_norm": 0.1013035699725151, + "learning_rate": 6.25e-06, + "loss": 0.0506, + "step": 9260 + }, + { + "epoch": 3.5180265654648957, + "grad_norm": 1.1379400491714478, + "learning_rate": 6.201171875000001e-06, + "loss": 0.0466, + "step": 9270 + }, + { + "epoch": 3.521821631878558, + "grad_norm": 0.44777366518974304, + "learning_rate": 6.15234375e-06, + "loss": 0.0425, + "step": 9280 + }, + { + "epoch": 3.52561669829222, + "grad_norm": 0.6099011301994324, + "learning_rate": 6.103515625e-06, + "loss": 0.0368, + "step": 9290 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 10.134333610534668, + "learning_rate": 6.054687500000001e-06, + "loss": 0.0459, + "step": 9300 + }, + { + "epoch": 3.5332068311195446, + "grad_norm": 10.301962852478027, + "learning_rate": 6.005859375e-06, + "loss": 0.0712, + "step": 9310 + }, + { + "epoch": 3.537001897533207, + "grad_norm": 2.240419864654541, + "learning_rate": 5.95703125e-06, + "loss": 0.0496, + "step": 9320 + }, + { + "epoch": 3.5407969639468693, + "grad_norm": 9.403803825378418, + "learning_rate": 5.908203125000001e-06, + "loss": 0.0551, + "step": 9330 + }, + { + "epoch": 3.544592030360531, + "grad_norm": 0.0765363797545433, + "learning_rate": 5.859375e-06, + "loss": 0.0382, + "step": 9340 + }, + { + "epoch": 3.5483870967741935, + "grad_norm": 0.6216185688972473, + "learning_rate": 5.810546875e-06, + "loss": 0.0723, + "step": 9350 + }, + { + "epoch": 3.552182163187856, + "grad_norm": 6.577167987823486, + "learning_rate": 5.76171875e-06, + "loss": 0.0626, + "step": 9360 + }, + { + "epoch": 3.555977229601518, + "grad_norm": 0.15332098305225372, + "learning_rate": 5.712890625e-06, + "loss": 0.0419, + "step": 9370 + }, + { + "epoch": 3.55977229601518, + "grad_norm": 3.2923789024353027, + "learning_rate": 5.6640625000000005e-06, + "loss": 0.0894, + "step": 9380 + }, + { + "epoch": 3.5635673624288424, + "grad_norm": 1.0206191539764404, + "learning_rate": 5.615234375e-06, + "loss": 0.0477, + "step": 9390 + }, + { + "epoch": 3.5673624288425048, + "grad_norm": 5.454959869384766, + "learning_rate": 5.56640625e-06, + "loss": 0.0315, + "step": 9400 + }, + { + "epoch": 3.571157495256167, + "grad_norm": 0.3191007673740387, + "learning_rate": 5.5175781250000005e-06, + "loss": 0.068, + "step": 9410 + }, + { + "epoch": 3.5749525616698294, + "grad_norm": 12.383304595947266, + "learning_rate": 5.46875e-06, + "loss": 0.0444, + "step": 9420 + }, + { + "epoch": 3.5787476280834913, + "grad_norm": 1.9023758172988892, + "learning_rate": 5.419921875e-06, + "loss": 0.0942, + "step": 9430 + }, + { + "epoch": 3.5825426944971537, + "grad_norm": 0.06706677377223969, + "learning_rate": 5.3710937500000005e-06, + "loss": 0.0512, + "step": 9440 + }, + { + "epoch": 3.586337760910816, + "grad_norm": 0.32390040159225464, + "learning_rate": 5.322265625e-06, + "loss": 0.0603, + "step": 9450 + }, + { + "epoch": 3.590132827324478, + "grad_norm": 1.5318775177001953, + "learning_rate": 5.2734375e-06, + "loss": 0.0491, + "step": 9460 + }, + { + "epoch": 3.5939278937381403, + "grad_norm": 0.5909900665283203, + "learning_rate": 5.2246093750000006e-06, + "loss": 0.0294, + "step": 9470 + }, + { + "epoch": 3.5977229601518026, + "grad_norm": 1.5226948261260986, + "learning_rate": 5.17578125e-06, + "loss": 0.0621, + "step": 9480 + }, + { + "epoch": 3.601518026565465, + "grad_norm": 0.24643893539905548, + "learning_rate": 5.126953125e-06, + "loss": 0.0293, + "step": 9490 + }, + { + "epoch": 3.6053130929791273, + "grad_norm": 7.143110752105713, + "learning_rate": 5.078125000000001e-06, + "loss": 0.0592, + "step": 9500 + }, + { + "epoch": 3.6091081593927896, + "grad_norm": 3.5135350227355957, + "learning_rate": 5.029296875e-06, + "loss": 0.0705, + "step": 9510 + }, + { + "epoch": 3.6129032258064515, + "grad_norm": 4.653140544891357, + "learning_rate": 4.98046875e-06, + "loss": 0.0624, + "step": 9520 + }, + { + "epoch": 3.616698292220114, + "grad_norm": 0.044525645673274994, + "learning_rate": 4.931640625000001e-06, + "loss": 0.0449, + "step": 9530 + }, + { + "epoch": 3.620493358633776, + "grad_norm": 7.338439464569092, + "learning_rate": 4.8828125e-06, + "loss": 0.0536, + "step": 9540 + }, + { + "epoch": 3.624288425047438, + "grad_norm": 0.4086396396160126, + "learning_rate": 4.833984375e-06, + "loss": 0.038, + "step": 9550 + }, + { + "epoch": 3.6280834914611004, + "grad_norm": 0.05038388445973396, + "learning_rate": 4.78515625e-06, + "loss": 0.0458, + "step": 9560 + }, + { + "epoch": 3.6318785578747628, + "grad_norm": 0.09961717575788498, + "learning_rate": 4.736328125e-06, + "loss": 0.0468, + "step": 9570 + }, + { + "epoch": 3.635673624288425, + "grad_norm": 0.27485185861587524, + "learning_rate": 4.6875000000000004e-06, + "loss": 0.0675, + "step": 9580 + }, + { + "epoch": 3.6394686907020875, + "grad_norm": 4.295794486999512, + "learning_rate": 4.638671875e-06, + "loss": 0.0519, + "step": 9590 + }, + { + "epoch": 3.64326375711575, + "grad_norm": 1.9907684326171875, + "learning_rate": 4.58984375e-06, + "loss": 0.0422, + "step": 9600 + }, + { + "epoch": 3.6470588235294117, + "grad_norm": 0.12039614468812943, + "learning_rate": 4.5410156250000005e-06, + "loss": 0.044, + "step": 9610 + }, + { + "epoch": 3.650853889943074, + "grad_norm": 0.4942443072795868, + "learning_rate": 4.4921875e-06, + "loss": 0.0828, + "step": 9620 + }, + { + "epoch": 3.6546489563567364, + "grad_norm": 0.8744149804115295, + "learning_rate": 4.443359375e-06, + "loss": 0.0514, + "step": 9630 + }, + { + "epoch": 3.6584440227703983, + "grad_norm": 1.8012325763702393, + "learning_rate": 4.3945312500000005e-06, + "loss": 0.0389, + "step": 9640 + }, + { + "epoch": 3.6622390891840606, + "grad_norm": 0.09957607835531235, + "learning_rate": 4.345703125e-06, + "loss": 0.0512, + "step": 9650 + }, + { + "epoch": 3.666034155597723, + "grad_norm": 0.0749269425868988, + "learning_rate": 4.296875e-06, + "loss": 0.0278, + "step": 9660 + }, + { + "epoch": 3.6698292220113853, + "grad_norm": 0.04859253391623497, + "learning_rate": 4.2480468750000006e-06, + "loss": 0.0813, + "step": 9670 + }, + { + "epoch": 3.6736242884250476, + "grad_norm": 3.236546277999878, + "learning_rate": 4.19921875e-06, + "loss": 0.0408, + "step": 9680 + }, + { + "epoch": 3.6774193548387095, + "grad_norm": 2.782500743865967, + "learning_rate": 4.150390625e-06, + "loss": 0.0365, + "step": 9690 + }, + { + "epoch": 3.681214421252372, + "grad_norm": 0.2516065835952759, + "learning_rate": 4.101562500000001e-06, + "loss": 0.0541, + "step": 9700 + }, + { + "epoch": 3.685009487666034, + "grad_norm": 0.0802445337176323, + "learning_rate": 4.052734375e-06, + "loss": 0.0296, + "step": 9710 + }, + { + "epoch": 3.6888045540796965, + "grad_norm": 0.7485657930374146, + "learning_rate": 4.00390625e-06, + "loss": 0.0194, + "step": 9720 + }, + { + "epoch": 3.6925996204933584, + "grad_norm": 0.05877687409520149, + "learning_rate": 3.955078125e-06, + "loss": 0.0547, + "step": 9730 + }, + { + "epoch": 3.6963946869070208, + "grad_norm": 3.6818785667419434, + "learning_rate": 3.90625e-06, + "loss": 0.0801, + "step": 9740 + }, + { + "epoch": 3.700189753320683, + "grad_norm": 0.22303463518619537, + "learning_rate": 3.857421875e-06, + "loss": 0.0326, + "step": 9750 + }, + { + "epoch": 3.7039848197343455, + "grad_norm": 0.16665808856487274, + "learning_rate": 3.8085937500000002e-06, + "loss": 0.0664, + "step": 9760 + }, + { + "epoch": 3.707779886148008, + "grad_norm": 0.2113623172044754, + "learning_rate": 3.759765625e-06, + "loss": 0.0495, + "step": 9770 + }, + { + "epoch": 3.7115749525616697, + "grad_norm": 1.9400161504745483, + "learning_rate": 3.7109375e-06, + "loss": 0.062, + "step": 9780 + }, + { + "epoch": 3.715370018975332, + "grad_norm": 2.147211790084839, + "learning_rate": 3.6621093750000003e-06, + "loss": 0.0408, + "step": 9790 + }, + { + "epoch": 3.7191650853889944, + "grad_norm": 0.17818136513233185, + "learning_rate": 3.61328125e-06, + "loss": 0.0376, + "step": 9800 + }, + { + "epoch": 3.7229601518026563, + "grad_norm": 0.2646294832229614, + "learning_rate": 3.564453125e-06, + "loss": 0.0488, + "step": 9810 + }, + { + "epoch": 3.7267552182163186, + "grad_norm": 0.07648167759180069, + "learning_rate": 3.5156250000000003e-06, + "loss": 0.0618, + "step": 9820 + }, + { + "epoch": 3.730550284629981, + "grad_norm": 4.988431930541992, + "learning_rate": 3.466796875e-06, + "loss": 0.0438, + "step": 9830 + }, + { + "epoch": 3.7343453510436433, + "grad_norm": 4.025431156158447, + "learning_rate": 3.41796875e-06, + "loss": 0.0663, + "step": 9840 + }, + { + "epoch": 3.7381404174573056, + "grad_norm": 0.7877894043922424, + "learning_rate": 3.3691406250000004e-06, + "loss": 0.0261, + "step": 9850 + }, + { + "epoch": 3.741935483870968, + "grad_norm": 1.7883660793304443, + "learning_rate": 3.3203125000000002e-06, + "loss": 0.0481, + "step": 9860 + }, + { + "epoch": 3.74573055028463, + "grad_norm": 2.136960029602051, + "learning_rate": 3.271484375e-06, + "loss": 0.052, + "step": 9870 + }, + { + "epoch": 3.749525616698292, + "grad_norm": 0.9067153930664062, + "learning_rate": 3.22265625e-06, + "loss": 0.0567, + "step": 9880 + }, + { + "epoch": 3.7533206831119545, + "grad_norm": 1.2437059879302979, + "learning_rate": 3.1738281250000003e-06, + "loss": 0.053, + "step": 9890 + }, + { + "epoch": 3.7571157495256164, + "grad_norm": 2.1223294734954834, + "learning_rate": 3.125e-06, + "loss": 0.0484, + "step": 9900 + }, + { + "epoch": 3.760910815939279, + "grad_norm": 8.40434455871582, + "learning_rate": 3.076171875e-06, + "loss": 0.0451, + "step": 9910 + }, + { + "epoch": 3.764705882352941, + "grad_norm": 2.565584421157837, + "learning_rate": 3.0273437500000003e-06, + "loss": 0.0589, + "step": 9920 + }, + { + "epoch": 3.7685009487666035, + "grad_norm": 5.559597492218018, + "learning_rate": 2.978515625e-06, + "loss": 0.0396, + "step": 9930 + }, + { + "epoch": 3.772296015180266, + "grad_norm": 0.5843867659568787, + "learning_rate": 2.9296875e-06, + "loss": 0.0682, + "step": 9940 + }, + { + "epoch": 3.776091081593928, + "grad_norm": 1.6344566345214844, + "learning_rate": 2.880859375e-06, + "loss": 0.0892, + "step": 9950 + }, + { + "epoch": 3.77988614800759, + "grad_norm": 5.6130051612854, + "learning_rate": 2.8320312500000002e-06, + "loss": 0.0439, + "step": 9960 + }, + { + "epoch": 3.7836812144212524, + "grad_norm": 3.700528144836426, + "learning_rate": 2.783203125e-06, + "loss": 0.0228, + "step": 9970 + }, + { + "epoch": 3.7874762808349147, + "grad_norm": 2.797687530517578, + "learning_rate": 2.734375e-06, + "loss": 0.0247, + "step": 9980 + }, + { + "epoch": 3.7912713472485766, + "grad_norm": 1.7192658185958862, + "learning_rate": 2.6855468750000003e-06, + "loss": 0.0792, + "step": 9990 + }, + { + "epoch": 3.795066413662239, + "grad_norm": 0.0573776513338089, + "learning_rate": 2.63671875e-06, + "loss": 0.0136, + "step": 10000 + }, + { + "epoch": 3.7988614800759013, + "grad_norm": 0.07321004569530487, + "learning_rate": 2.587890625e-06, + "loss": 0.0461, + "step": 10010 + }, + { + "epoch": 3.8026565464895636, + "grad_norm": 0.045114945620298386, + "learning_rate": 2.5390625000000003e-06, + "loss": 0.0658, + "step": 10020 + }, + { + "epoch": 3.806451612903226, + "grad_norm": 0.3899228870868683, + "learning_rate": 2.490234375e-06, + "loss": 0.0389, + "step": 10030 + }, + { + "epoch": 3.8102466793168883, + "grad_norm": 0.6319021582603455, + "learning_rate": 2.44140625e-06, + "loss": 0.0247, + "step": 10040 + }, + { + "epoch": 3.81404174573055, + "grad_norm": 1.4026541709899902, + "learning_rate": 2.392578125e-06, + "loss": 0.0161, + "step": 10050 + }, + { + "epoch": 3.8178368121442126, + "grad_norm": 4.106344699859619, + "learning_rate": 2.3437500000000002e-06, + "loss": 0.042, + "step": 10060 + }, + { + "epoch": 3.821631878557875, + "grad_norm": 0.5673054456710815, + "learning_rate": 2.294921875e-06, + "loss": 0.0589, + "step": 10070 + }, + { + "epoch": 3.825426944971537, + "grad_norm": 0.057744644582271576, + "learning_rate": 2.24609375e-06, + "loss": 0.0305, + "step": 10080 + }, + { + "epoch": 3.829222011385199, + "grad_norm": 3.3453450202941895, + "learning_rate": 2.1972656250000003e-06, + "loss": 0.0317, + "step": 10090 + }, + { + "epoch": 3.8330170777988615, + "grad_norm": 0.08820886164903641, + "learning_rate": 2.1484375e-06, + "loss": 0.0355, + "step": 10100 + }, + { + "epoch": 3.836812144212524, + "grad_norm": 1.522764801979065, + "learning_rate": 2.099609375e-06, + "loss": 0.0496, + "step": 10110 + }, + { + "epoch": 3.840607210626186, + "grad_norm": 0.9732184410095215, + "learning_rate": 2.0507812500000003e-06, + "loss": 0.0303, + "step": 10120 + }, + { + "epoch": 3.844402277039848, + "grad_norm": 0.1131846010684967, + "learning_rate": 2.001953125e-06, + "loss": 0.0359, + "step": 10130 + }, + { + "epoch": 3.8481973434535104, + "grad_norm": 3.4666688442230225, + "learning_rate": 1.953125e-06, + "loss": 0.0542, + "step": 10140 + }, + { + "epoch": 3.8519924098671727, + "grad_norm": 3.6389381885528564, + "learning_rate": 1.9042968750000001e-06, + "loss": 0.0328, + "step": 10150 + }, + { + "epoch": 3.855787476280835, + "grad_norm": 0.7695565819740295, + "learning_rate": 1.85546875e-06, + "loss": 0.0294, + "step": 10160 + }, + { + "epoch": 3.859582542694497, + "grad_norm": 5.1775593757629395, + "learning_rate": 1.806640625e-06, + "loss": 0.055, + "step": 10170 + }, + { + "epoch": 3.8633776091081593, + "grad_norm": 0.46061795949935913, + "learning_rate": 1.7578125000000002e-06, + "loss": 0.0376, + "step": 10180 + }, + { + "epoch": 3.8671726755218216, + "grad_norm": 0.16866852343082428, + "learning_rate": 1.708984375e-06, + "loss": 0.092, + "step": 10190 + }, + { + "epoch": 3.870967741935484, + "grad_norm": 2.495349168777466, + "learning_rate": 1.6601562500000001e-06, + "loss": 0.0205, + "step": 10200 + }, + { + "epoch": 3.8747628083491463, + "grad_norm": 4.127594470977783, + "learning_rate": 1.611328125e-06, + "loss": 0.0376, + "step": 10210 + }, + { + "epoch": 3.878557874762808, + "grad_norm": 0.0868837833404541, + "learning_rate": 1.5625e-06, + "loss": 0.0715, + "step": 10220 + }, + { + "epoch": 3.8823529411764706, + "grad_norm": 2.7866268157958984, + "learning_rate": 1.5136718750000002e-06, + "loss": 0.0623, + "step": 10230 + }, + { + "epoch": 3.886148007590133, + "grad_norm": 0.5652477741241455, + "learning_rate": 1.46484375e-06, + "loss": 0.0521, + "step": 10240 + }, + { + "epoch": 3.889943074003795, + "grad_norm": 0.13568060100078583, + "learning_rate": 1.4160156250000001e-06, + "loss": 0.0373, + "step": 10250 + }, + { + "epoch": 3.893738140417457, + "grad_norm": 7.213637828826904, + "learning_rate": 1.3671875e-06, + "loss": 0.1189, + "step": 10260 + }, + { + "epoch": 3.8975332068311195, + "grad_norm": 4.795431613922119, + "learning_rate": 1.318359375e-06, + "loss": 0.0368, + "step": 10270 + }, + { + "epoch": 3.901328273244782, + "grad_norm": 4.8751220703125, + "learning_rate": 1.2695312500000002e-06, + "loss": 0.0972, + "step": 10280 + }, + { + "epoch": 3.905123339658444, + "grad_norm": 0.5513148307800293, + "learning_rate": 1.220703125e-06, + "loss": 0.0287, + "step": 10290 + }, + { + "epoch": 3.9089184060721065, + "grad_norm": 0.16232678294181824, + "learning_rate": 1.1718750000000001e-06, + "loss": 0.0651, + "step": 10300 + }, + { + "epoch": 3.9127134724857684, + "grad_norm": 3.053624391555786, + "learning_rate": 1.123046875e-06, + "loss": 0.0358, + "step": 10310 + }, + { + "epoch": 3.9165085388994307, + "grad_norm": 0.1307297945022583, + "learning_rate": 1.07421875e-06, + "loss": 0.0171, + "step": 10320 + }, + { + "epoch": 3.920303605313093, + "grad_norm": 5.61918306350708, + "learning_rate": 1.0253906250000001e-06, + "loss": 0.0383, + "step": 10330 + }, + { + "epoch": 3.924098671726755, + "grad_norm": 4.017998695373535, + "learning_rate": 9.765625e-07, + "loss": 0.0547, + "step": 10340 + }, + { + "epoch": 3.9278937381404173, + "grad_norm": 8.339895248413086, + "learning_rate": 9.27734375e-07, + "loss": 0.059, + "step": 10350 + }, + { + "epoch": 3.9316888045540797, + "grad_norm": 0.5986772179603577, + "learning_rate": 8.789062500000001e-07, + "loss": 0.0773, + "step": 10360 + }, + { + "epoch": 3.935483870967742, + "grad_norm": 0.0516970194876194, + "learning_rate": 8.300781250000001e-07, + "loss": 0.0697, + "step": 10370 + }, + { + "epoch": 3.9392789373814043, + "grad_norm": 1.0691931247711182, + "learning_rate": 7.8125e-07, + "loss": 0.0382, + "step": 10380 + }, + { + "epoch": 3.9430740037950667, + "grad_norm": 1.0503530502319336, + "learning_rate": 7.32421875e-07, + "loss": 0.0781, + "step": 10390 + }, + { + "epoch": 3.9468690702087286, + "grad_norm": 4.003793239593506, + "learning_rate": 6.8359375e-07, + "loss": 0.1007, + "step": 10400 + }, + { + "epoch": 3.950664136622391, + "grad_norm": 0.04315977543592453, + "learning_rate": 6.347656250000001e-07, + "loss": 0.0553, + "step": 10410 + }, + { + "epoch": 3.9544592030360532, + "grad_norm": 4.378900051116943, + "learning_rate": 5.859375000000001e-07, + "loss": 0.0239, + "step": 10420 + }, + { + "epoch": 3.958254269449715, + "grad_norm": 0.17604303359985352, + "learning_rate": 5.37109375e-07, + "loss": 0.0338, + "step": 10430 + }, + { + "epoch": 3.9620493358633775, + "grad_norm": 0.040019456297159195, + "learning_rate": 4.8828125e-07, + "loss": 0.0088, + "step": 10440 + }, + { + "epoch": 3.96584440227704, + "grad_norm": 4.001920700073242, + "learning_rate": 4.3945312500000004e-07, + "loss": 0.0395, + "step": 10450 + }, + { + "epoch": 3.969639468690702, + "grad_norm": 4.805160999298096, + "learning_rate": 3.90625e-07, + "loss": 0.0713, + "step": 10460 + }, + { + "epoch": 3.9734345351043645, + "grad_norm": 0.0865137130022049, + "learning_rate": 3.41796875e-07, + "loss": 0.0394, + "step": 10470 + }, + { + "epoch": 3.9772296015180264, + "grad_norm": 2.695357322692871, + "learning_rate": 2.9296875000000003e-07, + "loss": 0.0582, + "step": 10480 + }, + { + "epoch": 3.9810246679316887, + "grad_norm": 0.9629122018814087, + "learning_rate": 2.44140625e-07, + "loss": 0.0201, + "step": 10490 + }, + { + "epoch": 3.984819734345351, + "grad_norm": 0.8045425415039062, + "learning_rate": 1.953125e-07, + "loss": 0.0653, + "step": 10500 + } + ], + "logging_steps": 10, + "max_steps": 10540, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2762272477794816.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}