diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4328222763528682, + "eval_steps": 500, + "global_step": 15000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009552466924583273, + "grad_norm": 2.0798542499542236, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.907, + "step": 10 + }, + { + "epoch": 0.0019104933849166546, + "grad_norm": 1.9614883661270142, + "learning_rate": 2.5333333333333334e-06, + "loss": 0.8934, + "step": 20 + }, + { + "epoch": 0.002865740077374982, + "grad_norm": 1.724998950958252, + "learning_rate": 3.866666666666667e-06, + "loss": 0.7885, + "step": 30 + }, + { + "epoch": 0.0038209867698333093, + "grad_norm": 1.2985706329345703, + "learning_rate": 5.2e-06, + "loss": 0.6411, + "step": 40 + }, + { + "epoch": 0.004776233462291637, + "grad_norm": 2.9078688621520996, + "learning_rate": 6.533333333333333e-06, + "loss": 0.5671, + "step": 50 + }, + { + "epoch": 0.005731480154749964, + "grad_norm": 0.6146636605262756, + "learning_rate": 7.866666666666667e-06, + "loss": 0.3757, + "step": 60 + }, + { + "epoch": 0.006686726847208292, + "grad_norm": 0.4550829827785492, + "learning_rate": 9.2e-06, + "loss": 0.3449, + "step": 70 + }, + { + "epoch": 0.0076419735396666185, + "grad_norm": 0.6388053894042969, + "learning_rate": 1.0533333333333335e-05, + "loss": 0.3064, + "step": 80 + }, + { + "epoch": 0.008597220232124947, + "grad_norm": 0.44831541180610657, + "learning_rate": 1.1866666666666668e-05, + "loss": 0.2696, + "step": 90 + }, + { + "epoch": 0.009552466924583274, + "grad_norm": 0.4611794948577881, + "learning_rate": 1.32e-05, + "loss": 0.2448, + "step": 100 + }, + { + "epoch": 0.0105077136170416, + "grad_norm": 0.1431284248828888, + "learning_rate": 1.4533333333333335e-05, + "loss": 0.2094, + "step": 110 + }, + { + "epoch": 0.011462960309499929, + "grad_norm": 0.11084526777267456, + "learning_rate": 1.586666666666667e-05, + "loss": 0.2167, + "step": 120 + }, + { + "epoch": 0.012418207001958255, + "grad_norm": 0.11287008225917816, + "learning_rate": 1.7199999999999998e-05, + "loss": 0.2187, + "step": 130 + }, + { + "epoch": 0.013373453694416584, + "grad_norm": 0.11731468141078949, + "learning_rate": 1.8533333333333334e-05, + "loss": 0.2231, + "step": 140 + }, + { + "epoch": 0.01432870038687491, + "grad_norm": 0.16812096536159515, + "learning_rate": 1.9866666666666667e-05, + "loss": 0.2246, + "step": 150 + }, + { + "epoch": 0.015283947079333237, + "grad_norm": 0.09585987776517868, + "learning_rate": 2.12e-05, + "loss": 0.1999, + "step": 160 + }, + { + "epoch": 0.016239193771791564, + "grad_norm": 0.06834663450717926, + "learning_rate": 2.2533333333333333e-05, + "loss": 0.2112, + "step": 170 + }, + { + "epoch": 0.017194440464249894, + "grad_norm": 0.0838400200009346, + "learning_rate": 2.3866666666666666e-05, + "loss": 0.214, + "step": 180 + }, + { + "epoch": 0.01814968715670822, + "grad_norm": 0.09908033907413483, + "learning_rate": 2.5200000000000003e-05, + "loss": 0.2187, + "step": 190 + }, + { + "epoch": 0.019104933849166547, + "grad_norm": 0.15255768597126007, + "learning_rate": 2.6533333333333332e-05, + "loss": 0.2212, + "step": 200 + }, + { + "epoch": 0.020060180541624874, + "grad_norm": 0.08462337404489517, + "learning_rate": 2.786666666666667e-05, + "loss": 0.1999, + "step": 210 + }, + { + "epoch": 0.0210154272340832, + "grad_norm": 0.06040903180837631, + "learning_rate": 2.9199999999999998e-05, + "loss": 0.2088, + "step": 220 + }, + { + "epoch": 0.02197067392654153, + "grad_norm": 0.06774196773767471, + "learning_rate": 3.0533333333333335e-05, + "loss": 0.2129, + "step": 230 + }, + { + "epoch": 0.022925920618999857, + "grad_norm": 0.11282925307750702, + "learning_rate": 3.1866666666666664e-05, + "loss": 0.2198, + "step": 240 + }, + { + "epoch": 0.023881167311458184, + "grad_norm": 0.1323988288640976, + "learning_rate": 3.32e-05, + "loss": 0.221, + "step": 250 + }, + { + "epoch": 0.02483641400391651, + "grad_norm": 0.10758616775274277, + "learning_rate": 3.453333333333334e-05, + "loss": 0.2001, + "step": 260 + }, + { + "epoch": 0.025791660696374837, + "grad_norm": 0.05729316174983978, + "learning_rate": 3.586666666666667e-05, + "loss": 0.2104, + "step": 270 + }, + { + "epoch": 0.026746907388833167, + "grad_norm": 0.06489019095897675, + "learning_rate": 3.72e-05, + "loss": 0.2127, + "step": 280 + }, + { + "epoch": 0.027702154081291494, + "grad_norm": 0.10550963878631592, + "learning_rate": 3.853333333333334e-05, + "loss": 0.2171, + "step": 290 + }, + { + "epoch": 0.02865740077374982, + "grad_norm": 0.14865626394748688, + "learning_rate": 3.986666666666667e-05, + "loss": 0.2196, + "step": 300 + }, + { + "epoch": 0.029612647466208147, + "grad_norm": 0.13421903550624847, + "learning_rate": 4.12e-05, + "loss": 0.2017, + "step": 310 + }, + { + "epoch": 0.030567894158666474, + "grad_norm": 0.07205755263566971, + "learning_rate": 4.2533333333333335e-05, + "loss": 0.209, + "step": 320 + }, + { + "epoch": 0.0315231408511248, + "grad_norm": 0.06970394402742386, + "learning_rate": 4.3866666666666665e-05, + "loss": 0.2132, + "step": 330 + }, + { + "epoch": 0.03247838754358313, + "grad_norm": 0.16807648539543152, + "learning_rate": 4.52e-05, + "loss": 0.2167, + "step": 340 + }, + { + "epoch": 0.03343363423604146, + "grad_norm": 0.1984771341085434, + "learning_rate": 4.653333333333334e-05, + "loss": 0.2166, + "step": 350 + }, + { + "epoch": 0.03438888092849979, + "grad_norm": 0.09778633713722229, + "learning_rate": 4.7866666666666674e-05, + "loss": 0.2041, + "step": 360 + }, + { + "epoch": 0.035344127620958114, + "grad_norm": 0.08991667628288269, + "learning_rate": 4.92e-05, + "loss": 0.2082, + "step": 370 + }, + { + "epoch": 0.03629937431341644, + "grad_norm": 0.10208883881568909, + "learning_rate": 5.053333333333333e-05, + "loss": 0.2081, + "step": 380 + }, + { + "epoch": 0.03725462100587477, + "grad_norm": 0.1255088746547699, + "learning_rate": 5.1866666666666676e-05, + "loss": 0.2118, + "step": 390 + }, + { + "epoch": 0.038209867698333094, + "grad_norm": 0.19001370668411255, + "learning_rate": 5.3200000000000006e-05, + "loss": 0.2146, + "step": 400 + }, + { + "epoch": 0.03916511439079142, + "grad_norm": 0.09977246075868607, + "learning_rate": 5.4533333333333335e-05, + "loss": 0.2018, + "step": 410 + }, + { + "epoch": 0.04012036108324975, + "grad_norm": 0.12999273836612701, + "learning_rate": 5.5866666666666665e-05, + "loss": 0.2063, + "step": 420 + }, + { + "epoch": 0.041075607775708074, + "grad_norm": 0.08959522098302841, + "learning_rate": 5.72e-05, + "loss": 0.2064, + "step": 430 + }, + { + "epoch": 0.0420308544681664, + "grad_norm": 0.14837408065795898, + "learning_rate": 5.853333333333334e-05, + "loss": 0.2104, + "step": 440 + }, + { + "epoch": 0.042986101160624735, + "grad_norm": 0.18937060236930847, + "learning_rate": 5.9866666666666674e-05, + "loss": 0.2132, + "step": 450 + }, + { + "epoch": 0.04394134785308306, + "grad_norm": 0.12442853301763535, + "learning_rate": 6.12e-05, + "loss": 0.2006, + "step": 460 + }, + { + "epoch": 0.04489659454554139, + "grad_norm": 0.11519322544336319, + "learning_rate": 6.253333333333333e-05, + "loss": 0.2039, + "step": 470 + }, + { + "epoch": 0.045851841237999715, + "grad_norm": 0.0926322489976883, + "learning_rate": 6.386666666666667e-05, + "loss": 0.2052, + "step": 480 + }, + { + "epoch": 0.04680708793045804, + "grad_norm": 0.16390164196491241, + "learning_rate": 6.52e-05, + "loss": 0.2091, + "step": 490 + }, + { + "epoch": 0.04776233462291637, + "grad_norm": 0.256491094827652, + "learning_rate": 6.653333333333334e-05, + "loss": 0.2122, + "step": 500 + }, + { + "epoch": 0.048717581315374694, + "grad_norm": 0.12082330137491226, + "learning_rate": 6.786666666666667e-05, + "loss": 0.2003, + "step": 510 + }, + { + "epoch": 0.04967282800783302, + "grad_norm": 0.08324269205331802, + "learning_rate": 6.92e-05, + "loss": 0.2042, + "step": 520 + }, + { + "epoch": 0.05062807470029135, + "grad_norm": 0.06913580000400543, + "learning_rate": 7.053333333333334e-05, + "loss": 0.2033, + "step": 530 + }, + { + "epoch": 0.051583321392749674, + "grad_norm": 0.11715491861104965, + "learning_rate": 7.186666666666667e-05, + "loss": 0.2081, + "step": 540 + }, + { + "epoch": 0.05253856808520801, + "grad_norm": 0.22186318039894104, + "learning_rate": 7.32e-05, + "loss": 0.2094, + "step": 550 + }, + { + "epoch": 0.053493814777666335, + "grad_norm": 0.11386065185070038, + "learning_rate": 7.453333333333333e-05, + "loss": 0.2031, + "step": 560 + }, + { + "epoch": 0.05444906147012466, + "grad_norm": 0.08842533081769943, + "learning_rate": 7.586666666666668e-05, + "loss": 0.2001, + "step": 570 + }, + { + "epoch": 0.05540430816258299, + "grad_norm": 0.09553701430559158, + "learning_rate": 7.72e-05, + "loss": 0.201, + "step": 580 + }, + { + "epoch": 0.056359554855041315, + "grad_norm": 0.12029985338449478, + "learning_rate": 7.853333333333334e-05, + "loss": 0.2066, + "step": 590 + }, + { + "epoch": 0.05731480154749964, + "grad_norm": 0.36245641112327576, + "learning_rate": 7.986666666666667e-05, + "loss": 0.212, + "step": 600 + }, + { + "epoch": 0.05827004823995797, + "grad_norm": 0.11322560906410217, + "learning_rate": 8.120000000000001e-05, + "loss": 0.203, + "step": 610 + }, + { + "epoch": 0.059225294932416295, + "grad_norm": 0.07776595652103424, + "learning_rate": 8.253333333333334e-05, + "loss": 0.2016, + "step": 620 + }, + { + "epoch": 0.06018054162487462, + "grad_norm": 0.09582793712615967, + "learning_rate": 8.386666666666667e-05, + "loss": 0.203, + "step": 630 + }, + { + "epoch": 0.06113578831733295, + "grad_norm": 0.13911056518554688, + "learning_rate": 8.52e-05, + "loss": 0.2053, + "step": 640 + }, + { + "epoch": 0.06209103500979128, + "grad_norm": 0.3633959889411926, + "learning_rate": 8.653333333333333e-05, + "loss": 0.2086, + "step": 650 + }, + { + "epoch": 0.0630462817022496, + "grad_norm": 0.10734552890062332, + "learning_rate": 8.786666666666667e-05, + "loss": 0.2065, + "step": 660 + }, + { + "epoch": 0.06400152839470793, + "grad_norm": 0.08110401779413223, + "learning_rate": 8.92e-05, + "loss": 0.201, + "step": 670 + }, + { + "epoch": 0.06495677508716625, + "grad_norm": 0.08999983966350555, + "learning_rate": 9.053333333333334e-05, + "loss": 0.202, + "step": 680 + }, + { + "epoch": 0.06591202177962459, + "grad_norm": 0.11316987872123718, + "learning_rate": 9.186666666666667e-05, + "loss": 0.2084, + "step": 690 + }, + { + "epoch": 0.06686726847208292, + "grad_norm": 0.3875355124473572, + "learning_rate": 9.320000000000002e-05, + "loss": 0.2092, + "step": 700 + }, + { + "epoch": 0.06782251516454124, + "grad_norm": 0.12044321745634079, + "learning_rate": 9.453333333333335e-05, + "loss": 0.2064, + "step": 710 + }, + { + "epoch": 0.06877776185699958, + "grad_norm": 0.07788579165935516, + "learning_rate": 9.586666666666667e-05, + "loss": 0.1998, + "step": 720 + }, + { + "epoch": 0.0697330085494579, + "grad_norm": 0.10780824720859528, + "learning_rate": 9.72e-05, + "loss": 0.2019, + "step": 730 + }, + { + "epoch": 0.07068825524191623, + "grad_norm": 0.11069575697183609, + "learning_rate": 9.853333333333333e-05, + "loss": 0.2056, + "step": 740 + }, + { + "epoch": 0.07164350193437455, + "grad_norm": 0.4447477459907532, + "learning_rate": 9.986666666666668e-05, + "loss": 0.2076, + "step": 750 + }, + { + "epoch": 0.07259874862683288, + "grad_norm": 0.2588540315628052, + "learning_rate": 0.00010120000000000001, + "loss": 0.2038, + "step": 760 + }, + { + "epoch": 0.0735539953192912, + "grad_norm": 0.08379202336072922, + "learning_rate": 0.00010253333333333335, + "loss": 0.201, + "step": 770 + }, + { + "epoch": 0.07450924201174954, + "grad_norm": 0.0976845845580101, + "learning_rate": 0.00010386666666666667, + "loss": 0.2018, + "step": 780 + }, + { + "epoch": 0.07546448870420785, + "grad_norm": 0.1460348218679428, + "learning_rate": 0.00010520000000000001, + "loss": 0.204, + "step": 790 + }, + { + "epoch": 0.07641973539666619, + "grad_norm": 0.4925660789012909, + "learning_rate": 0.00010653333333333333, + "loss": 0.2079, + "step": 800 + }, + { + "epoch": 0.07737498208912452, + "grad_norm": 0.1974959820508957, + "learning_rate": 0.00010786666666666667, + "loss": 0.207, + "step": 810 + }, + { + "epoch": 0.07833022878158284, + "grad_norm": 0.16117534041404724, + "learning_rate": 0.00010920000000000001, + "loss": 0.2022, + "step": 820 + }, + { + "epoch": 0.07928547547404118, + "grad_norm": 0.16919535398483276, + "learning_rate": 0.00011053333333333333, + "loss": 0.2027, + "step": 830 + }, + { + "epoch": 0.0802407221664995, + "grad_norm": 0.11847756057977676, + "learning_rate": 0.00011186666666666667, + "loss": 0.2068, + "step": 840 + }, + { + "epoch": 0.08119596885895783, + "grad_norm": 0.41180866956710815, + "learning_rate": 0.0001132, + "loss": 0.2074, + "step": 850 + }, + { + "epoch": 0.08215121555141615, + "grad_norm": 0.17310842871665955, + "learning_rate": 0.00011453333333333334, + "loss": 0.2067, + "step": 860 + }, + { + "epoch": 0.08310646224387448, + "grad_norm": 0.11892854422330856, + "learning_rate": 0.00011586666666666667, + "loss": 0.2006, + "step": 870 + }, + { + "epoch": 0.0840617089363328, + "grad_norm": 0.09759584814310074, + "learning_rate": 0.0001172, + "loss": 0.204, + "step": 880 + }, + { + "epoch": 0.08501695562879114, + "grad_norm": 0.09496277570724487, + "learning_rate": 0.00011853333333333335, + "loss": 0.2069, + "step": 890 + }, + { + "epoch": 0.08597220232124947, + "grad_norm": 0.34175682067871094, + "learning_rate": 0.00011986666666666669, + "loss": 0.2088, + "step": 900 + }, + { + "epoch": 0.08692744901370779, + "grad_norm": 0.17791183292865753, + "learning_rate": 0.0001212, + "loss": 0.2033, + "step": 910 + }, + { + "epoch": 0.08788269570616612, + "grad_norm": 0.11148897558450699, + "learning_rate": 0.00012253333333333335, + "loss": 0.2034, + "step": 920 + }, + { + "epoch": 0.08883794239862444, + "grad_norm": 0.08730180561542511, + "learning_rate": 0.00012386666666666665, + "loss": 0.2043, + "step": 930 + }, + { + "epoch": 0.08979318909108278, + "grad_norm": 0.10876961797475815, + "learning_rate": 0.0001252, + "loss": 0.2074, + "step": 940 + }, + { + "epoch": 0.0907484357835411, + "grad_norm": 0.4162116050720215, + "learning_rate": 0.00012653333333333334, + "loss": 0.2099, + "step": 950 + }, + { + "epoch": 0.09170368247599943, + "grad_norm": 0.349356085062027, + "learning_rate": 0.00012786666666666667, + "loss": 0.2073, + "step": 960 + }, + { + "epoch": 0.09265892916845775, + "grad_norm": 0.144659161567688, + "learning_rate": 0.00012920000000000002, + "loss": 0.2014, + "step": 970 + }, + { + "epoch": 0.09361417586091608, + "grad_norm": 0.10344243794679642, + "learning_rate": 0.00013053333333333333, + "loss": 0.2041, + "step": 980 + }, + { + "epoch": 0.0945694225533744, + "grad_norm": 0.10903052240610123, + "learning_rate": 0.00013186666666666668, + "loss": 0.2067, + "step": 990 + }, + { + "epoch": 0.09552466924583274, + "grad_norm": 0.4268416464328766, + "learning_rate": 0.0001332, + "loss": 0.2091, + "step": 1000 + }, + { + "epoch": 0.09647991593829107, + "grad_norm": 0.18322598934173584, + "learning_rate": 0.00013453333333333334, + "loss": 0.2075, + "step": 1010 + }, + { + "epoch": 0.09743516263074939, + "grad_norm": 0.1304457038640976, + "learning_rate": 0.00013586666666666667, + "loss": 0.1997, + "step": 1020 + }, + { + "epoch": 0.09839040932320772, + "grad_norm": 0.1123695969581604, + "learning_rate": 0.00013720000000000003, + "loss": 0.2017, + "step": 1030 + }, + { + "epoch": 0.09934565601566604, + "grad_norm": 0.15449099242687225, + "learning_rate": 0.00013853333333333333, + "loss": 0.2064, + "step": 1040 + }, + { + "epoch": 0.10030090270812438, + "grad_norm": 0.43143564462661743, + "learning_rate": 0.0001398666666666667, + "loss": 0.2089, + "step": 1050 + }, + { + "epoch": 0.1012561494005827, + "grad_norm": 0.2075752168893814, + "learning_rate": 0.0001412, + "loss": 0.2063, + "step": 1060 + }, + { + "epoch": 0.10221139609304103, + "grad_norm": 0.09274331480264664, + "learning_rate": 0.00014253333333333335, + "loss": 0.2009, + "step": 1070 + }, + { + "epoch": 0.10316664278549935, + "grad_norm": 0.10772950947284698, + "learning_rate": 0.00014386666666666668, + "loss": 0.2018, + "step": 1080 + }, + { + "epoch": 0.10412188947795768, + "grad_norm": 0.13142071664333344, + "learning_rate": 0.0001452, + "loss": 0.2055, + "step": 1090 + }, + { + "epoch": 0.10507713617041602, + "grad_norm": 0.45018109679222107, + "learning_rate": 0.00014653333333333334, + "loss": 0.2078, + "step": 1100 + }, + { + "epoch": 0.10603238286287434, + "grad_norm": 0.17738114297389984, + "learning_rate": 0.00014786666666666666, + "loss": 0.2159, + "step": 1110 + }, + { + "epoch": 0.10698762955533267, + "grad_norm": 0.18743546307086945, + "learning_rate": 0.0001492, + "loss": 0.2173, + "step": 1120 + }, + { + "epoch": 0.10794287624779099, + "grad_norm": 0.11755328625440598, + "learning_rate": 0.00015053333333333335, + "loss": 0.209, + "step": 1130 + }, + { + "epoch": 0.10889812294024932, + "grad_norm": 0.15332145988941193, + "learning_rate": 0.00015186666666666668, + "loss": 0.2047, + "step": 1140 + }, + { + "epoch": 0.10985336963270764, + "grad_norm": 0.5098714232444763, + "learning_rate": 0.0001532, + "loss": 0.2054, + "step": 1150 + }, + { + "epoch": 0.11080861632516598, + "grad_norm": 0.24695514142513275, + "learning_rate": 0.00015453333333333334, + "loss": 0.2137, + "step": 1160 + }, + { + "epoch": 0.1117638630176243, + "grad_norm": 0.10639436542987823, + "learning_rate": 0.00015586666666666667, + "loss": 0.2021, + "step": 1170 + }, + { + "epoch": 0.11271910971008263, + "grad_norm": 0.13042502105236053, + "learning_rate": 0.00015720000000000003, + "loss": 0.2059, + "step": 1180 + }, + { + "epoch": 0.11367435640254096, + "grad_norm": 0.11687403172254562, + "learning_rate": 0.00015853333333333333, + "loss": 0.2071, + "step": 1190 + }, + { + "epoch": 0.11462960309499928, + "grad_norm": 0.5390796065330505, + "learning_rate": 0.00015986666666666669, + "loss": 0.2096, + "step": 1200 + }, + { + "epoch": 0.11558484978745762, + "grad_norm": 0.20866340398788452, + "learning_rate": 0.00016120000000000002, + "loss": 0.2236, + "step": 1210 + }, + { + "epoch": 0.11654009647991594, + "grad_norm": 0.08614636212587357, + "learning_rate": 0.00016253333333333334, + "loss": 0.2031, + "step": 1220 + }, + { + "epoch": 0.11749534317237427, + "grad_norm": 0.11634163558483124, + "learning_rate": 0.00016386666666666667, + "loss": 0.2031, + "step": 1230 + }, + { + "epoch": 0.11845058986483259, + "grad_norm": 0.14586254954338074, + "learning_rate": 0.0001652, + "loss": 0.2069, + "step": 1240 + }, + { + "epoch": 0.11940583655729092, + "grad_norm": 0.5198449492454529, + "learning_rate": 0.00016653333333333333, + "loss": 0.2064, + "step": 1250 + }, + { + "epoch": 0.12036108324974924, + "grad_norm": 0.2319190949201584, + "learning_rate": 0.0001678666666666667, + "loss": 0.2549, + "step": 1260 + }, + { + "epoch": 0.12131632994220758, + "grad_norm": 0.14767807722091675, + "learning_rate": 0.0001692, + "loss": 0.2072, + "step": 1270 + }, + { + "epoch": 0.1222715766346659, + "grad_norm": 0.09973818808794022, + "learning_rate": 0.00017053333333333335, + "loss": 0.2068, + "step": 1280 + }, + { + "epoch": 0.12322682332712423, + "grad_norm": 0.12260327488183975, + "learning_rate": 0.00017186666666666665, + "loss": 0.2072, + "step": 1290 + }, + { + "epoch": 0.12418207001958256, + "grad_norm": 0.5598644018173218, + "learning_rate": 0.0001732, + "loss": 0.2071, + "step": 1300 + }, + { + "epoch": 0.1251373167120409, + "grad_norm": 0.21588961780071259, + "learning_rate": 0.00017453333333333334, + "loss": 0.2389, + "step": 1310 + }, + { + "epoch": 0.1260925634044992, + "grad_norm": 0.11560805141925812, + "learning_rate": 0.00017586666666666667, + "loss": 0.2039, + "step": 1320 + }, + { + "epoch": 0.12704781009695754, + "grad_norm": 0.09798284620046616, + "learning_rate": 0.0001772, + "loss": 0.2039, + "step": 1330 + }, + { + "epoch": 0.12800305678941587, + "grad_norm": 0.16167224943637848, + "learning_rate": 0.00017853333333333335, + "loss": 0.2049, + "step": 1340 + }, + { + "epoch": 0.1289583034818742, + "grad_norm": 0.32957813143730164, + "learning_rate": 0.00017986666666666668, + "loss": 0.2022, + "step": 1350 + }, + { + "epoch": 0.1299135501743325, + "grad_norm": 0.21633221209049225, + "learning_rate": 0.0001812, + "loss": 0.2676, + "step": 1360 + }, + { + "epoch": 0.13086879686679084, + "grad_norm": 0.12575742602348328, + "learning_rate": 0.00018253333333333334, + "loss": 0.206, + "step": 1370 + }, + { + "epoch": 0.13182404355924918, + "grad_norm": 0.15339286625385284, + "learning_rate": 0.00018386666666666667, + "loss": 0.2055, + "step": 1380 + }, + { + "epoch": 0.1327792902517075, + "grad_norm": 0.17081816494464874, + "learning_rate": 0.00018520000000000003, + "loss": 0.2057, + "step": 1390 + }, + { + "epoch": 0.13373453694416584, + "grad_norm": 0.2828335165977478, + "learning_rate": 0.00018653333333333333, + "loss": 0.204, + "step": 1400 + }, + { + "epoch": 0.13468978363662415, + "grad_norm": 0.14135104417800903, + "learning_rate": 0.0001878666666666667, + "loss": 0.2242, + "step": 1410 + }, + { + "epoch": 0.13564503032908248, + "grad_norm": 0.10726441442966461, + "learning_rate": 0.0001892, + "loss": 0.1995, + "step": 1420 + }, + { + "epoch": 0.13660027702154082, + "grad_norm": 0.0936957523226738, + "learning_rate": 0.00019053333333333335, + "loss": 0.2039, + "step": 1430 + }, + { + "epoch": 0.13755552371399915, + "grad_norm": 0.15068306028842926, + "learning_rate": 0.00019186666666666668, + "loss": 0.2048, + "step": 1440 + }, + { + "epoch": 0.13851077040645746, + "grad_norm": 0.13459832966327667, + "learning_rate": 0.0001932, + "loss": 0.1975, + "step": 1450 + }, + { + "epoch": 0.1394660170989158, + "grad_norm": 0.1478847712278366, + "learning_rate": 0.00019453333333333334, + "loss": 0.2233, + "step": 1460 + }, + { + "epoch": 0.14042126379137412, + "grad_norm": 0.10763124376535416, + "learning_rate": 0.00019586666666666667, + "loss": 0.2024, + "step": 1470 + }, + { + "epoch": 0.14137651048383246, + "grad_norm": 0.1476346254348755, + "learning_rate": 0.0001972, + "loss": 0.2021, + "step": 1480 + }, + { + "epoch": 0.1423317571762908, + "grad_norm": 0.1142350286245346, + "learning_rate": 0.00019853333333333335, + "loss": 0.2077, + "step": 1490 + }, + { + "epoch": 0.1432870038687491, + "grad_norm": 0.3819134831428528, + "learning_rate": 0.00019986666666666668, + "loss": 0.2046, + "step": 1500 + }, + { + "epoch": 0.14424225056120743, + "grad_norm": 0.20617090165615082, + "learning_rate": 0.00019986666666666668, + "loss": 0.2243, + "step": 1510 + }, + { + "epoch": 0.14519749725366576, + "grad_norm": 0.11755608022212982, + "learning_rate": 0.00019971851851851853, + "loss": 0.2015, + "step": 1520 + }, + { + "epoch": 0.1461527439461241, + "grad_norm": 0.10507424175739288, + "learning_rate": 0.00019957037037037037, + "loss": 0.2065, + "step": 1530 + }, + { + "epoch": 0.1471079906385824, + "grad_norm": 0.12585783004760742, + "learning_rate": 0.00019942222222222222, + "loss": 0.2051, + "step": 1540 + }, + { + "epoch": 0.14806323733104074, + "grad_norm": 0.22103334963321686, + "learning_rate": 0.00019927407407407407, + "loss": 0.2024, + "step": 1550 + }, + { + "epoch": 0.14901848402349907, + "grad_norm": 0.21955139935016632, + "learning_rate": 0.00019912592592592594, + "loss": 0.2186, + "step": 1560 + }, + { + "epoch": 0.1499737307159574, + "grad_norm": 0.14329008758068085, + "learning_rate": 0.0001989777777777778, + "loss": 0.2022, + "step": 1570 + }, + { + "epoch": 0.1509289774084157, + "grad_norm": 0.1393403261899948, + "learning_rate": 0.00019882962962962963, + "loss": 0.2037, + "step": 1580 + }, + { + "epoch": 0.15188422410087404, + "grad_norm": 0.1838270127773285, + "learning_rate": 0.00019868148148148148, + "loss": 0.2071, + "step": 1590 + }, + { + "epoch": 0.15283947079333238, + "grad_norm": 0.26425862312316895, + "learning_rate": 0.00019853333333333335, + "loss": 0.2049, + "step": 1600 + }, + { + "epoch": 0.1537947174857907, + "grad_norm": 0.15780730545520782, + "learning_rate": 0.00019838518518518517, + "loss": 0.2176, + "step": 1610 + }, + { + "epoch": 0.15474996417824904, + "grad_norm": 0.09308743476867676, + "learning_rate": 0.00019823703703703704, + "loss": 0.2027, + "step": 1620 + }, + { + "epoch": 0.15570521087070735, + "grad_norm": 0.11731009930372238, + "learning_rate": 0.0001980888888888889, + "loss": 0.2024, + "step": 1630 + }, + { + "epoch": 0.15666045756316568, + "grad_norm": 0.151070237159729, + "learning_rate": 0.00019794074074074076, + "loss": 0.2059, + "step": 1640 + }, + { + "epoch": 0.15761570425562402, + "grad_norm": 0.2339385598897934, + "learning_rate": 0.00019779259259259258, + "loss": 0.2011, + "step": 1650 + }, + { + "epoch": 0.15857095094808235, + "grad_norm": 0.1527925431728363, + "learning_rate": 0.00019764444444444446, + "loss": 0.2402, + "step": 1660 + }, + { + "epoch": 0.15952619764054066, + "grad_norm": 0.14847882091999054, + "learning_rate": 0.0001974962962962963, + "loss": 0.2072, + "step": 1670 + }, + { + "epoch": 0.160481444332999, + "grad_norm": 0.10180728882551193, + "learning_rate": 0.00019734814814814815, + "loss": 0.2029, + "step": 1680 + }, + { + "epoch": 0.16143669102545732, + "grad_norm": 0.14945369958877563, + "learning_rate": 0.0001972, + "loss": 0.2041, + "step": 1690 + }, + { + "epoch": 0.16239193771791566, + "grad_norm": 0.47041115164756775, + "learning_rate": 0.00019705185185185187, + "loss": 0.2113, + "step": 1700 + }, + { + "epoch": 0.163347184410374, + "grad_norm": 0.2964141070842743, + "learning_rate": 0.00019690370370370372, + "loss": 0.2239, + "step": 1710 + }, + { + "epoch": 0.1643024311028323, + "grad_norm": 0.12235242128372192, + "learning_rate": 0.00019675555555555556, + "loss": 0.2031, + "step": 1720 + }, + { + "epoch": 0.16525767779529063, + "grad_norm": 0.10584467649459839, + "learning_rate": 0.0001966074074074074, + "loss": 0.205, + "step": 1730 + }, + { + "epoch": 0.16621292448774896, + "grad_norm": 0.18592573702335358, + "learning_rate": 0.00019645925925925928, + "loss": 0.2047, + "step": 1740 + }, + { + "epoch": 0.1671681711802073, + "grad_norm": 0.3373814821243286, + "learning_rate": 0.0001963111111111111, + "loss": 0.2033, + "step": 1750 + }, + { + "epoch": 0.1681234178726656, + "grad_norm": 0.24131450057029724, + "learning_rate": 0.00019616296296296297, + "loss": 0.2274, + "step": 1760 + }, + { + "epoch": 0.16907866456512394, + "grad_norm": 0.13119126856327057, + "learning_rate": 0.00019601481481481482, + "loss": 0.2044, + "step": 1770 + }, + { + "epoch": 0.17003391125758227, + "grad_norm": 0.12517541646957397, + "learning_rate": 0.00019586666666666667, + "loss": 0.207, + "step": 1780 + }, + { + "epoch": 0.1709891579500406, + "grad_norm": 0.13666100800037384, + "learning_rate": 0.0001957185185185185, + "loss": 0.206, + "step": 1790 + }, + { + "epoch": 0.17194440464249894, + "grad_norm": 0.5242202281951904, + "learning_rate": 0.00019557037037037039, + "loss": 0.1996, + "step": 1800 + }, + { + "epoch": 0.17289965133495724, + "grad_norm": 0.436653196811676, + "learning_rate": 0.00019542222222222223, + "loss": 0.2492, + "step": 1810 + }, + { + "epoch": 0.17385489802741558, + "grad_norm": 0.14327634871006012, + "learning_rate": 0.00019527407407407408, + "loss": 0.2126, + "step": 1820 + }, + { + "epoch": 0.1748101447198739, + "grad_norm": 0.5541105270385742, + "learning_rate": 0.00019512592592592592, + "loss": 0.2044, + "step": 1830 + }, + { + "epoch": 0.17576539141233224, + "grad_norm": 0.17921100556850433, + "learning_rate": 0.0001949777777777778, + "loss": 0.2107, + "step": 1840 + }, + { + "epoch": 0.17672063810479055, + "grad_norm": 0.4111920893192291, + "learning_rate": 0.00019482962962962962, + "loss": 0.2078, + "step": 1850 + }, + { + "epoch": 0.17767588479724888, + "grad_norm": 0.32501187920570374, + "learning_rate": 0.0001946814814814815, + "loss": 0.2392, + "step": 1860 + }, + { + "epoch": 0.17863113148970722, + "grad_norm": 0.27491918206214905, + "learning_rate": 0.00019453333333333334, + "loss": 0.2017, + "step": 1870 + }, + { + "epoch": 0.17958637818216555, + "grad_norm": 0.10791027545928955, + "learning_rate": 0.0001943851851851852, + "loss": 0.2066, + "step": 1880 + }, + { + "epoch": 0.18054162487462388, + "grad_norm": 0.11400250345468521, + "learning_rate": 0.00019423703703703703, + "loss": 0.2079, + "step": 1890 + }, + { + "epoch": 0.1814968715670822, + "grad_norm": 0.23481184244155884, + "learning_rate": 0.0001940888888888889, + "loss": 0.1987, + "step": 1900 + }, + { + "epoch": 0.18245211825954052, + "grad_norm": 0.2478189617395401, + "learning_rate": 0.00019394074074074075, + "loss": 0.2507, + "step": 1910 + }, + { + "epoch": 0.18340736495199886, + "grad_norm": 0.12263601273298264, + "learning_rate": 0.0001937925925925926, + "loss": 0.2044, + "step": 1920 + }, + { + "epoch": 0.1843626116444572, + "grad_norm": 0.14222730696201324, + "learning_rate": 0.00019364444444444444, + "loss": 0.204, + "step": 1930 + }, + { + "epoch": 0.1853178583369155, + "grad_norm": 0.15957382321357727, + "learning_rate": 0.00019349629629629631, + "loss": 0.2077, + "step": 1940 + }, + { + "epoch": 0.18627310502937383, + "grad_norm": 0.5012878179550171, + "learning_rate": 0.00019334814814814816, + "loss": 0.2102, + "step": 1950 + }, + { + "epoch": 0.18722835172183216, + "grad_norm": 0.2779427170753479, + "learning_rate": 0.0001932, + "loss": 0.2239, + "step": 1960 + }, + { + "epoch": 0.1881835984142905, + "grad_norm": 0.14166271686553955, + "learning_rate": 0.00019305185185185185, + "loss": 0.2043, + "step": 1970 + }, + { + "epoch": 0.1891388451067488, + "grad_norm": 0.11754471063613892, + "learning_rate": 0.00019290370370370373, + "loss": 0.2039, + "step": 1980 + }, + { + "epoch": 0.19009409179920714, + "grad_norm": 0.15170662105083466, + "learning_rate": 0.00019275555555555555, + "loss": 0.2052, + "step": 1990 + }, + { + "epoch": 0.19104933849166547, + "grad_norm": 0.5039442181587219, + "learning_rate": 0.00019260740740740742, + "loss": 0.2082, + "step": 2000 + }, + { + "epoch": 0.1920045851841238, + "grad_norm": 0.2808721959590912, + "learning_rate": 0.00019245925925925927, + "loss": 0.225, + "step": 2010 + }, + { + "epoch": 0.19295983187658214, + "grad_norm": 0.10636847466230392, + "learning_rate": 0.00019231111111111114, + "loss": 0.2018, + "step": 2020 + }, + { + "epoch": 0.19391507856904044, + "grad_norm": 0.10885459184646606, + "learning_rate": 0.00019216296296296296, + "loss": 0.2042, + "step": 2030 + }, + { + "epoch": 0.19487032526149878, + "grad_norm": 0.1382536143064499, + "learning_rate": 0.00019201481481481483, + "loss": 0.2058, + "step": 2040 + }, + { + "epoch": 0.1958255719539571, + "grad_norm": 0.6492496728897095, + "learning_rate": 0.00019186666666666668, + "loss": 0.207, + "step": 2050 + }, + { + "epoch": 0.19678081864641545, + "grad_norm": 0.13635113835334778, + "learning_rate": 0.00019171851851851852, + "loss": 0.2132, + "step": 2060 + }, + { + "epoch": 0.19773606533887375, + "grad_norm": 0.1241525337100029, + "learning_rate": 0.00019157037037037037, + "loss": 0.2032, + "step": 2070 + }, + { + "epoch": 0.19869131203133208, + "grad_norm": 0.12168664485216141, + "learning_rate": 0.00019142222222222224, + "loss": 0.2066, + "step": 2080 + }, + { + "epoch": 0.19964655872379042, + "grad_norm": 0.12792471051216125, + "learning_rate": 0.00019127407407407406, + "loss": 0.2039, + "step": 2090 + }, + { + "epoch": 0.20060180541624875, + "grad_norm": 0.5138373970985413, + "learning_rate": 0.00019112592592592594, + "loss": 0.2095, + "step": 2100 + }, + { + "epoch": 0.20155705210870709, + "grad_norm": 0.2566812336444855, + "learning_rate": 0.00019097777777777778, + "loss": 0.2186, + "step": 2110 + }, + { + "epoch": 0.2025122988011654, + "grad_norm": 0.11528566479682922, + "learning_rate": 0.00019082962962962966, + "loss": 0.2034, + "step": 2120 + }, + { + "epoch": 0.20346754549362372, + "grad_norm": 0.14964407682418823, + "learning_rate": 0.00019068148148148147, + "loss": 0.2014, + "step": 2130 + }, + { + "epoch": 0.20442279218608206, + "grad_norm": 0.1438506543636322, + "learning_rate": 0.00019053333333333335, + "loss": 0.2036, + "step": 2140 + }, + { + "epoch": 0.2053780388785404, + "grad_norm": 0.5848654508590698, + "learning_rate": 0.0001903851851851852, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2063332855709987, + "grad_norm": 0.16795799136161804, + "learning_rate": 0.00019023703703703704, + "loss": 0.2149, + "step": 2160 + }, + { + "epoch": 0.20728853226345703, + "grad_norm": 0.26638638973236084, + "learning_rate": 0.0001900888888888889, + "loss": 0.2026, + "step": 2170 + }, + { + "epoch": 0.20824377895591537, + "grad_norm": 0.11838365346193314, + "learning_rate": 0.00018994074074074076, + "loss": 0.2025, + "step": 2180 + }, + { + "epoch": 0.2091990256483737, + "grad_norm": 0.15363581478595734, + "learning_rate": 0.0001897925925925926, + "loss": 0.2058, + "step": 2190 + }, + { + "epoch": 0.21015427234083203, + "grad_norm": 0.5906602740287781, + "learning_rate": 0.00018964444444444445, + "loss": 0.2077, + "step": 2200 + }, + { + "epoch": 0.21110951903329034, + "grad_norm": 0.31574729084968567, + "learning_rate": 0.0001894962962962963, + "loss": 0.2206, + "step": 2210 + }, + { + "epoch": 0.21206476572574867, + "grad_norm": 0.128960981965065, + "learning_rate": 0.00018934814814814817, + "loss": 0.2001, + "step": 2220 + }, + { + "epoch": 0.213020012418207, + "grad_norm": 0.10915983468294144, + "learning_rate": 0.0001892, + "loss": 0.2023, + "step": 2230 + }, + { + "epoch": 0.21397525911066534, + "grad_norm": 0.14831770956516266, + "learning_rate": 0.00018905185185185186, + "loss": 0.2032, + "step": 2240 + }, + { + "epoch": 0.21493050580312364, + "grad_norm": 0.5704192519187927, + "learning_rate": 0.0001889037037037037, + "loss": 0.2027, + "step": 2250 + }, + { + "epoch": 0.21588575249558198, + "grad_norm": 0.2867341935634613, + "learning_rate": 0.00018875555555555558, + "loss": 0.2227, + "step": 2260 + }, + { + "epoch": 0.2168409991880403, + "grad_norm": 0.199985072016716, + "learning_rate": 0.0001886074074074074, + "loss": 0.2029, + "step": 2270 + }, + { + "epoch": 0.21779624588049865, + "grad_norm": 0.09733956307172775, + "learning_rate": 0.00018845925925925928, + "loss": 0.204, + "step": 2280 + }, + { + "epoch": 0.21875149257295698, + "grad_norm": 0.11999070644378662, + "learning_rate": 0.00018831111111111112, + "loss": 0.2055, + "step": 2290 + }, + { + "epoch": 0.21970673926541529, + "grad_norm": 0.4675360918045044, + "learning_rate": 0.00018816296296296297, + "loss": 0.2052, + "step": 2300 + }, + { + "epoch": 0.22066198595787362, + "grad_norm": 0.29119136929512024, + "learning_rate": 0.00018801481481481482, + "loss": 0.2082, + "step": 2310 + }, + { + "epoch": 0.22161723265033195, + "grad_norm": 0.14248254895210266, + "learning_rate": 0.0001878666666666667, + "loss": 0.2044, + "step": 2320 + }, + { + "epoch": 0.22257247934279029, + "grad_norm": 0.12034345418214798, + "learning_rate": 0.00018771851851851853, + "loss": 0.203, + "step": 2330 + }, + { + "epoch": 0.2235277260352486, + "grad_norm": 0.15301008522510529, + "learning_rate": 0.00018757037037037038, + "loss": 0.2055, + "step": 2340 + }, + { + "epoch": 0.22448297272770693, + "grad_norm": 0.5262898206710815, + "learning_rate": 0.00018742222222222223, + "loss": 0.2058, + "step": 2350 + }, + { + "epoch": 0.22543821942016526, + "grad_norm": 0.3706663250923157, + "learning_rate": 0.0001872740740740741, + "loss": 0.2095, + "step": 2360 + }, + { + "epoch": 0.2263934661126236, + "grad_norm": 0.12153764069080353, + "learning_rate": 0.00018712592592592592, + "loss": 0.2026, + "step": 2370 + }, + { + "epoch": 0.22734871280508193, + "grad_norm": 0.133193701505661, + "learning_rate": 0.0001869777777777778, + "loss": 0.2016, + "step": 2380 + }, + { + "epoch": 0.22830395949754023, + "grad_norm": 0.1649506688117981, + "learning_rate": 0.00018682962962962964, + "loss": 0.2047, + "step": 2390 + }, + { + "epoch": 0.22925920618999857, + "grad_norm": 0.5738644003868103, + "learning_rate": 0.00018668148148148149, + "loss": 0.2036, + "step": 2400 + }, + { + "epoch": 0.2302144528824569, + "grad_norm": 6.20187520980835, + "learning_rate": 0.00018653333333333333, + "loss": 0.2537, + "step": 2410 + }, + { + "epoch": 0.23116969957491523, + "grad_norm": 0.09397050738334656, + "learning_rate": 0.0001863851851851852, + "loss": 0.2093, + "step": 2420 + }, + { + "epoch": 0.23212494626737354, + "grad_norm": 0.13936394453048706, + "learning_rate": 0.00018623703703703705, + "loss": 0.2016, + "step": 2430 + }, + { + "epoch": 0.23308019295983187, + "grad_norm": 0.16786976158618927, + "learning_rate": 0.0001860888888888889, + "loss": 0.2043, + "step": 2440 + }, + { + "epoch": 0.2340354396522902, + "grad_norm": 0.4057718813419342, + "learning_rate": 0.00018594074074074074, + "loss": 0.2074, + "step": 2450 + }, + { + "epoch": 0.23499068634474854, + "grad_norm": 0.22006480395793915, + "learning_rate": 0.00018579259259259262, + "loss": 0.2139, + "step": 2460 + }, + { + "epoch": 0.23594593303720685, + "grad_norm": 0.11016175895929337, + "learning_rate": 0.00018564444444444444, + "loss": 0.2029, + "step": 2470 + }, + { + "epoch": 0.23690117972966518, + "grad_norm": 0.11153749376535416, + "learning_rate": 0.0001854962962962963, + "loss": 0.202, + "step": 2480 + }, + { + "epoch": 0.2378564264221235, + "grad_norm": 0.1272597759962082, + "learning_rate": 0.00018534814814814816, + "loss": 0.2051, + "step": 2490 + }, + { + "epoch": 0.23881167311458185, + "grad_norm": 0.5681122541427612, + "learning_rate": 0.00018520000000000003, + "loss": 0.2063, + "step": 2500 + }, + { + "epoch": 0.23976691980704018, + "grad_norm": 0.23601225018501282, + "learning_rate": 0.00018505185185185185, + "loss": 0.2173, + "step": 2510 + }, + { + "epoch": 0.24072216649949849, + "grad_norm": 0.14005862176418304, + "learning_rate": 0.00018490370370370372, + "loss": 0.2022, + "step": 2520 + }, + { + "epoch": 0.24167741319195682, + "grad_norm": 0.12287179380655289, + "learning_rate": 0.00018475555555555557, + "loss": 0.2039, + "step": 2530 + }, + { + "epoch": 0.24263265988441515, + "grad_norm": 0.15546666085720062, + "learning_rate": 0.00018460740740740741, + "loss": 0.2045, + "step": 2540 + }, + { + "epoch": 0.2435879065768735, + "grad_norm": 0.5308989882469177, + "learning_rate": 0.00018445925925925926, + "loss": 0.2047, + "step": 2550 + }, + { + "epoch": 0.2445431532693318, + "grad_norm": 0.1517964005470276, + "learning_rate": 0.00018431111111111113, + "loss": 0.212, + "step": 2560 + }, + { + "epoch": 0.24549839996179013, + "grad_norm": 0.09176363050937653, + "learning_rate": 0.00018416296296296298, + "loss": 0.2003, + "step": 2570 + }, + { + "epoch": 0.24645364665424846, + "grad_norm": 0.11431318521499634, + "learning_rate": 0.00018401481481481483, + "loss": 0.2005, + "step": 2580 + }, + { + "epoch": 0.2474088933467068, + "grad_norm": 0.13079893589019775, + "learning_rate": 0.00018386666666666667, + "loss": 0.2037, + "step": 2590 + }, + { + "epoch": 0.24836414003916513, + "grad_norm": 0.47278907895088196, + "learning_rate": 0.00018371851851851855, + "loss": 0.2054, + "step": 2600 + }, + { + "epoch": 0.24931938673162343, + "grad_norm": 0.24692212045192719, + "learning_rate": 0.00018357037037037037, + "loss": 0.2116, + "step": 2610 + }, + { + "epoch": 0.2502746334240818, + "grad_norm": 0.1928204596042633, + "learning_rate": 0.00018342222222222224, + "loss": 0.2057, + "step": 2620 + }, + { + "epoch": 0.25122988011654007, + "grad_norm": 0.1608019471168518, + "learning_rate": 0.00018327407407407408, + "loss": 0.2027, + "step": 2630 + }, + { + "epoch": 0.2521851268089984, + "grad_norm": 0.09996878355741501, + "learning_rate": 0.00018312592592592596, + "loss": 0.2036, + "step": 2640 + }, + { + "epoch": 0.25314037350145674, + "grad_norm": 0.45600563287734985, + "learning_rate": 0.00018297777777777778, + "loss": 0.2054, + "step": 2650 + }, + { + "epoch": 0.2540956201939151, + "grad_norm": 0.3103402256965637, + "learning_rate": 0.00018282962962962965, + "loss": 0.2107, + "step": 2660 + }, + { + "epoch": 0.2550508668863734, + "grad_norm": 0.12778714299201965, + "learning_rate": 0.0001826814814814815, + "loss": 0.2055, + "step": 2670 + }, + { + "epoch": 0.25600611357883174, + "grad_norm": 0.112953320145607, + "learning_rate": 0.00018253333333333334, + "loss": 0.2024, + "step": 2680 + }, + { + "epoch": 0.2569613602712901, + "grad_norm": 0.10482873767614365, + "learning_rate": 0.0001823851851851852, + "loss": 0.204, + "step": 2690 + }, + { + "epoch": 0.2579166069637484, + "grad_norm": 0.606562077999115, + "learning_rate": 0.00018223703703703706, + "loss": 0.2061, + "step": 2700 + }, + { + "epoch": 0.25887185365620674, + "grad_norm": 0.3492737412452698, + "learning_rate": 0.00018208888888888888, + "loss": 0.2109, + "step": 2710 + }, + { + "epoch": 0.259827100348665, + "grad_norm": 0.12048076838254929, + "learning_rate": 0.00018194074074074076, + "loss": 0.2046, + "step": 2720 + }, + { + "epoch": 0.26078234704112335, + "grad_norm": 0.1138598620891571, + "learning_rate": 0.0001817925925925926, + "loss": 0.2021, + "step": 2730 + }, + { + "epoch": 0.2617375937335817, + "grad_norm": 0.12291901558637619, + "learning_rate": 0.00018164444444444447, + "loss": 0.2022, + "step": 2740 + }, + { + "epoch": 0.26269284042604, + "grad_norm": 0.43753868341445923, + "learning_rate": 0.0001814962962962963, + "loss": 0.2047, + "step": 2750 + }, + { + "epoch": 0.26364808711849835, + "grad_norm": 0.21602752804756165, + "learning_rate": 0.00018134814814814817, + "loss": 0.2078, + "step": 2760 + }, + { + "epoch": 0.2646033338109567, + "grad_norm": 0.13231903314590454, + "learning_rate": 0.0001812, + "loss": 0.2022, + "step": 2770 + }, + { + "epoch": 0.265558580503415, + "grad_norm": 0.11796011030673981, + "learning_rate": 0.00018105185185185186, + "loss": 0.2019, + "step": 2780 + }, + { + "epoch": 0.26651382719587335, + "grad_norm": 0.13643573224544525, + "learning_rate": 0.0001809037037037037, + "loss": 0.2027, + "step": 2790 + }, + { + "epoch": 0.2674690738883317, + "grad_norm": 0.44247397780418396, + "learning_rate": 0.00018075555555555558, + "loss": 0.2038, + "step": 2800 + }, + { + "epoch": 0.26842432058078997, + "grad_norm": 0.3825100064277649, + "learning_rate": 0.00018060740740740743, + "loss": 0.2069, + "step": 2810 + }, + { + "epoch": 0.2693795672732483, + "grad_norm": 0.195270374417305, + "learning_rate": 0.00018045925925925927, + "loss": 0.2043, + "step": 2820 + }, + { + "epoch": 0.27033481396570663, + "grad_norm": 0.12330356240272522, + "learning_rate": 0.00018031111111111112, + "loss": 0.202, + "step": 2830 + }, + { + "epoch": 0.27129006065816497, + "grad_norm": 0.1486448496580124, + "learning_rate": 0.000180162962962963, + "loss": 0.2046, + "step": 2840 + }, + { + "epoch": 0.2722453073506233, + "grad_norm": 0.4175782799720764, + "learning_rate": 0.0001800148148148148, + "loss": 0.203, + "step": 2850 + }, + { + "epoch": 0.27320055404308163, + "grad_norm": 0.15270055830478668, + "learning_rate": 0.00017986666666666668, + "loss": 0.2134, + "step": 2860 + }, + { + "epoch": 0.27415580073553997, + "grad_norm": 0.14655576646327972, + "learning_rate": 0.00017971851851851853, + "loss": 0.2006, + "step": 2870 + }, + { + "epoch": 0.2751110474279983, + "grad_norm": 0.1165398582816124, + "learning_rate": 0.00017957037037037038, + "loss": 0.2017, + "step": 2880 + }, + { + "epoch": 0.27606629412045663, + "grad_norm": 0.1337936520576477, + "learning_rate": 0.00017942222222222222, + "loss": 0.2036, + "step": 2890 + }, + { + "epoch": 0.2770215408129149, + "grad_norm": 0.38786137104034424, + "learning_rate": 0.0001792740740740741, + "loss": 0.2046, + "step": 2900 + }, + { + "epoch": 0.27797678750537325, + "grad_norm": 0.2964805066585541, + "learning_rate": 0.00017912592592592594, + "loss": 0.2087, + "step": 2910 + }, + { + "epoch": 0.2789320341978316, + "grad_norm": 0.1085568442940712, + "learning_rate": 0.0001789777777777778, + "loss": 0.2052, + "step": 2920 + }, + { + "epoch": 0.2798872808902899, + "grad_norm": 0.10278130322694778, + "learning_rate": 0.00017882962962962963, + "loss": 0.1993, + "step": 2930 + }, + { + "epoch": 0.28084252758274825, + "grad_norm": 0.13532647490501404, + "learning_rate": 0.00017868148148148148, + "loss": 0.2038, + "step": 2940 + }, + { + "epoch": 0.2817977742752066, + "grad_norm": 0.3095054626464844, + "learning_rate": 0.00017853333333333335, + "loss": 0.2065, + "step": 2950 + }, + { + "epoch": 0.2827530209676649, + "grad_norm": 0.2533681392669678, + "learning_rate": 0.0001783851851851852, + "loss": 0.2086, + "step": 2960 + }, + { + "epoch": 0.28370826766012325, + "grad_norm": 0.12914352118968964, + "learning_rate": 0.00017823703703703705, + "loss": 0.2062, + "step": 2970 + }, + { + "epoch": 0.2846635143525816, + "grad_norm": 0.12468916922807693, + "learning_rate": 0.0001780888888888889, + "loss": 0.2001, + "step": 2980 + }, + { + "epoch": 0.28561876104503986, + "grad_norm": 0.24026978015899658, + "learning_rate": 0.00017794074074074074, + "loss": 0.2039, + "step": 2990 + }, + { + "epoch": 0.2865740077374982, + "grad_norm": 0.35038548707962036, + "learning_rate": 0.0001777925925925926, + "loss": 0.2043, + "step": 3000 + }, + { + "epoch": 0.2875292544299565, + "grad_norm": 0.16599488258361816, + "learning_rate": 0.00017764444444444446, + "loss": 0.2043, + "step": 3010 + }, + { + "epoch": 0.28848450112241486, + "grad_norm": 0.17750556766986847, + "learning_rate": 0.0001774962962962963, + "loss": 0.202, + "step": 3020 + }, + { + "epoch": 0.2894397478148732, + "grad_norm": 0.10757072269916534, + "learning_rate": 0.00017734814814814815, + "loss": 0.2009, + "step": 3030 + }, + { + "epoch": 0.2903949945073315, + "grad_norm": 0.17457428574562073, + "learning_rate": 0.0001772, + "loss": 0.205, + "step": 3040 + }, + { + "epoch": 0.29135024119978986, + "grad_norm": 0.4000655710697174, + "learning_rate": 0.00017705185185185187, + "loss": 0.2048, + "step": 3050 + }, + { + "epoch": 0.2923054878922482, + "grad_norm": 0.3278186619281769, + "learning_rate": 0.00017690370370370372, + "loss": 0.2026, + "step": 3060 + }, + { + "epoch": 0.2932607345847065, + "grad_norm": 0.09013079106807709, + "learning_rate": 0.00017675555555555556, + "loss": 0.2039, + "step": 3070 + }, + { + "epoch": 0.2942159812771648, + "grad_norm": 0.11759201437234879, + "learning_rate": 0.0001766074074074074, + "loss": 0.2014, + "step": 3080 + }, + { + "epoch": 0.29517122796962314, + "grad_norm": 0.17286565899848938, + "learning_rate": 0.00017645925925925926, + "loss": 0.2049, + "step": 3090 + }, + { + "epoch": 0.2961264746620815, + "grad_norm": 0.3381953835487366, + "learning_rate": 0.00017631111111111113, + "loss": 0.2038, + "step": 3100 + }, + { + "epoch": 0.2970817213545398, + "grad_norm": 0.1709469109773636, + "learning_rate": 0.00017616296296296298, + "loss": 0.2018, + "step": 3110 + }, + { + "epoch": 0.29803696804699814, + "grad_norm": 0.1467283070087433, + "learning_rate": 0.00017601481481481482, + "loss": 0.1997, + "step": 3120 + }, + { + "epoch": 0.2989922147394565, + "grad_norm": 0.13545508682727814, + "learning_rate": 0.00017586666666666667, + "loss": 0.2009, + "step": 3130 + }, + { + "epoch": 0.2999474614319148, + "grad_norm": 0.19185511767864227, + "learning_rate": 0.00017571851851851851, + "loss": 0.2045, + "step": 3140 + }, + { + "epoch": 0.30090270812437314, + "grad_norm": 0.3416673541069031, + "learning_rate": 0.0001755703703703704, + "loss": 0.2063, + "step": 3150 + }, + { + "epoch": 0.3018579548168314, + "grad_norm": 0.14955377578735352, + "learning_rate": 0.00017542222222222223, + "loss": 0.2017, + "step": 3160 + }, + { + "epoch": 0.30281320150928975, + "grad_norm": 0.10988181084394455, + "learning_rate": 0.00017527407407407408, + "loss": 0.2016, + "step": 3170 + }, + { + "epoch": 0.3037684482017481, + "grad_norm": 0.17967171967029572, + "learning_rate": 0.00017512592592592593, + "loss": 0.2004, + "step": 3180 + }, + { + "epoch": 0.3047236948942064, + "grad_norm": 0.19823792576789856, + "learning_rate": 0.0001749777777777778, + "loss": 0.2023, + "step": 3190 + }, + { + "epoch": 0.30567894158666475, + "grad_norm": 0.49947112798690796, + "learning_rate": 0.00017482962962962962, + "loss": 0.205, + "step": 3200 + }, + { + "epoch": 0.3066341882791231, + "grad_norm": 0.272572785615921, + "learning_rate": 0.0001746814814814815, + "loss": 0.2051, + "step": 3210 + }, + { + "epoch": 0.3075894349715814, + "grad_norm": 0.1181032732129097, + "learning_rate": 0.00017453333333333334, + "loss": 0.2034, + "step": 3220 + }, + { + "epoch": 0.30854468166403975, + "grad_norm": 0.10932394117116928, + "learning_rate": 0.00017438518518518518, + "loss": 0.2014, + "step": 3230 + }, + { + "epoch": 0.3094999283564981, + "grad_norm": 0.1775374561548233, + "learning_rate": 0.00017423703703703703, + "loss": 0.2058, + "step": 3240 + }, + { + "epoch": 0.31045517504895637, + "grad_norm": 0.39363420009613037, + "learning_rate": 0.0001740888888888889, + "loss": 0.2061, + "step": 3250 + }, + { + "epoch": 0.3114104217414147, + "grad_norm": 0.2556678354740143, + "learning_rate": 0.00017394074074074075, + "loss": 0.2036, + "step": 3260 + }, + { + "epoch": 0.31236566843387303, + "grad_norm": 0.1335153728723526, + "learning_rate": 0.0001737925925925926, + "loss": 0.1994, + "step": 3270 + }, + { + "epoch": 0.31332091512633137, + "grad_norm": 0.1207786276936531, + "learning_rate": 0.00017364444444444444, + "loss": 0.2013, + "step": 3280 + }, + { + "epoch": 0.3142761618187897, + "grad_norm": 0.18867254257202148, + "learning_rate": 0.00017349629629629632, + "loss": 0.2022, + "step": 3290 + }, + { + "epoch": 0.31523140851124803, + "grad_norm": 0.4312264919281006, + "learning_rate": 0.00017334814814814814, + "loss": 0.2052, + "step": 3300 + }, + { + "epoch": 0.31618665520370637, + "grad_norm": 0.2023773193359375, + "learning_rate": 0.0001732, + "loss": 0.1996, + "step": 3310 + }, + { + "epoch": 0.3171419018961647, + "grad_norm": 0.13800661265850067, + "learning_rate": 0.00017305185185185185, + "loss": 0.2029, + "step": 3320 + }, + { + "epoch": 0.31809714858862304, + "grad_norm": 0.12555184960365295, + "learning_rate": 0.0001729037037037037, + "loss": 0.2014, + "step": 3330 + }, + { + "epoch": 0.3190523952810813, + "grad_norm": 0.1472926288843155, + "learning_rate": 0.00017275555555555555, + "loss": 0.2051, + "step": 3340 + }, + { + "epoch": 0.32000764197353965, + "grad_norm": 0.4507752060890198, + "learning_rate": 0.00017260740740740742, + "loss": 0.2041, + "step": 3350 + }, + { + "epoch": 0.320962888665998, + "grad_norm": 0.42336827516555786, + "learning_rate": 0.00017245925925925927, + "loss": 0.2079, + "step": 3360 + }, + { + "epoch": 0.3219181353584563, + "grad_norm": 0.09905827045440674, + "learning_rate": 0.0001723111111111111, + "loss": 0.2011, + "step": 3370 + }, + { + "epoch": 0.32287338205091465, + "grad_norm": 0.09226644784212112, + "learning_rate": 0.00017216296296296296, + "loss": 0.2, + "step": 3380 + }, + { + "epoch": 0.323828628743373, + "grad_norm": 0.11263467371463776, + "learning_rate": 0.00017201481481481483, + "loss": 0.2053, + "step": 3390 + }, + { + "epoch": 0.3247838754358313, + "grad_norm": 0.2874763607978821, + "learning_rate": 0.00017186666666666665, + "loss": 0.2063, + "step": 3400 + }, + { + "epoch": 0.32573912212828965, + "grad_norm": 0.13759906589984894, + "learning_rate": 0.00017171851851851853, + "loss": 0.2038, + "step": 3410 + }, + { + "epoch": 0.326694368820748, + "grad_norm": 0.10311861336231232, + "learning_rate": 0.00017157037037037037, + "loss": 0.1998, + "step": 3420 + }, + { + "epoch": 0.32764961551320626, + "grad_norm": 0.10236906260251999, + "learning_rate": 0.00017142222222222224, + "loss": 0.2007, + "step": 3430 + }, + { + "epoch": 0.3286048622056646, + "grad_norm": 0.14640459418296814, + "learning_rate": 0.00017127407407407406, + "loss": 0.2034, + "step": 3440 + }, + { + "epoch": 0.3295601088981229, + "grad_norm": 0.4186616837978363, + "learning_rate": 0.00017112592592592594, + "loss": 0.2063, + "step": 3450 + }, + { + "epoch": 0.33051535559058126, + "grad_norm": 0.210789293050766, + "learning_rate": 0.00017097777777777778, + "loss": 0.2043, + "step": 3460 + }, + { + "epoch": 0.3314706022830396, + "grad_norm": 0.09906265139579773, + "learning_rate": 0.00017082962962962963, + "loss": 0.199, + "step": 3470 + }, + { + "epoch": 0.33242584897549793, + "grad_norm": 0.10522822290658951, + "learning_rate": 0.00017068148148148148, + "loss": 0.2008, + "step": 3480 + }, + { + "epoch": 0.33338109566795626, + "grad_norm": 0.13106843829154968, + "learning_rate": 0.00017053333333333335, + "loss": 0.2067, + "step": 3490 + }, + { + "epoch": 0.3343363423604146, + "grad_norm": 0.3762379586696625, + "learning_rate": 0.0001703851851851852, + "loss": 0.2059, + "step": 3500 + }, + { + "epoch": 0.33529158905287293, + "grad_norm": 0.17082005739212036, + "learning_rate": 0.00017023703703703704, + "loss": 0.2124, + "step": 3510 + }, + { + "epoch": 0.3362468357453312, + "grad_norm": 0.0899599939584732, + "learning_rate": 0.0001700888888888889, + "loss": 0.2037, + "step": 3520 + }, + { + "epoch": 0.33720208243778954, + "grad_norm": 0.10781609266996384, + "learning_rate": 0.00016994074074074076, + "loss": 0.2021, + "step": 3530 + }, + { + "epoch": 0.3381573291302479, + "grad_norm": 0.1443158984184265, + "learning_rate": 0.00016979259259259258, + "loss": 0.2045, + "step": 3540 + }, + { + "epoch": 0.3391125758227062, + "grad_norm": 0.3108382523059845, + "learning_rate": 0.00016964444444444445, + "loss": 0.2066, + "step": 3550 + }, + { + "epoch": 0.34006782251516454, + "grad_norm": 0.11456870287656784, + "learning_rate": 0.0001694962962962963, + "loss": 0.2127, + "step": 3560 + }, + { + "epoch": 0.3410230692076229, + "grad_norm": 0.14917728304862976, + "learning_rate": 0.00016934814814814817, + "loss": 0.2035, + "step": 3570 + }, + { + "epoch": 0.3419783159000812, + "grad_norm": 0.12674477696418762, + "learning_rate": 0.0001692, + "loss": 0.2029, + "step": 3580 + }, + { + "epoch": 0.34293356259253954, + "grad_norm": 0.14471426606178284, + "learning_rate": 0.00016905185185185187, + "loss": 0.2047, + "step": 3590 + }, + { + "epoch": 0.3438888092849979, + "grad_norm": 0.34318575263023376, + "learning_rate": 0.0001689037037037037, + "loss": 0.207, + "step": 3600 + }, + { + "epoch": 0.34484405597745615, + "grad_norm": 0.27254998683929443, + "learning_rate": 0.00016875555555555556, + "loss": 0.2059, + "step": 3610 + }, + { + "epoch": 0.3457993026699145, + "grad_norm": 0.11677141487598419, + "learning_rate": 0.0001686074074074074, + "loss": 0.2052, + "step": 3620 + }, + { + "epoch": 0.3467545493623728, + "grad_norm": 0.10886923968791962, + "learning_rate": 0.00016845925925925928, + "loss": 0.2032, + "step": 3630 + }, + { + "epoch": 0.34770979605483116, + "grad_norm": 0.146949902176857, + "learning_rate": 0.0001683111111111111, + "loss": 0.2044, + "step": 3640 + }, + { + "epoch": 0.3486650427472895, + "grad_norm": 0.37153467535972595, + "learning_rate": 0.00016816296296296297, + "loss": 0.2055, + "step": 3650 + }, + { + "epoch": 0.3496202894397478, + "grad_norm": 0.25440579652786255, + "learning_rate": 0.00016801481481481482, + "loss": 0.209, + "step": 3660 + }, + { + "epoch": 0.35057553613220616, + "grad_norm": 0.0973072350025177, + "learning_rate": 0.0001678666666666667, + "loss": 0.2055, + "step": 3670 + }, + { + "epoch": 0.3515307828246645, + "grad_norm": 0.11460244655609131, + "learning_rate": 0.0001677185185185185, + "loss": 0.2023, + "step": 3680 + }, + { + "epoch": 0.3524860295171228, + "grad_norm": 0.16661228239536285, + "learning_rate": 0.00016757037037037038, + "loss": 0.2052, + "step": 3690 + }, + { + "epoch": 0.3534412762095811, + "grad_norm": 0.37548327445983887, + "learning_rate": 0.00016742222222222223, + "loss": 0.206, + "step": 3700 + }, + { + "epoch": 0.35439652290203943, + "grad_norm": 0.2567369341850281, + "learning_rate": 0.00016727407407407408, + "loss": 0.2062, + "step": 3710 + }, + { + "epoch": 0.35535176959449777, + "grad_norm": 0.1041766032576561, + "learning_rate": 0.00016712592592592592, + "loss": 0.2012, + "step": 3720 + }, + { + "epoch": 0.3563070162869561, + "grad_norm": 0.12674827873706818, + "learning_rate": 0.0001669777777777778, + "loss": 0.2037, + "step": 3730 + }, + { + "epoch": 0.35726226297941444, + "grad_norm": 0.1635134369134903, + "learning_rate": 0.00016682962962962964, + "loss": 0.2049, + "step": 3740 + }, + { + "epoch": 0.35821750967187277, + "grad_norm": 0.371850848197937, + "learning_rate": 0.0001666814814814815, + "loss": 0.2066, + "step": 3750 + }, + { + "epoch": 0.3591727563643311, + "grad_norm": 0.11050805449485779, + "learning_rate": 0.00016653333333333333, + "loss": 0.2049, + "step": 3760 + }, + { + "epoch": 0.36012800305678944, + "grad_norm": 0.12971824407577515, + "learning_rate": 0.0001663851851851852, + "loss": 0.2003, + "step": 3770 + }, + { + "epoch": 0.36108324974924777, + "grad_norm": 0.10030124336481094, + "learning_rate": 0.00016623703703703703, + "loss": 0.2025, + "step": 3780 + }, + { + "epoch": 0.36203849644170605, + "grad_norm": 0.15601076185703278, + "learning_rate": 0.0001660888888888889, + "loss": 0.2039, + "step": 3790 + }, + { + "epoch": 0.3629937431341644, + "grad_norm": 0.3648783266544342, + "learning_rate": 0.00016594074074074075, + "loss": 0.2058, + "step": 3800 + }, + { + "epoch": 0.3639489898266227, + "grad_norm": 0.23194488883018494, + "learning_rate": 0.00016579259259259262, + "loss": 0.2057, + "step": 3810 + }, + { + "epoch": 0.36490423651908105, + "grad_norm": 0.1057933121919632, + "learning_rate": 0.00016564444444444444, + "loss": 0.2067, + "step": 3820 + }, + { + "epoch": 0.3658594832115394, + "grad_norm": 0.11324844509363174, + "learning_rate": 0.0001654962962962963, + "loss": 0.2021, + "step": 3830 + }, + { + "epoch": 0.3668147299039977, + "grad_norm": 0.14048735797405243, + "learning_rate": 0.00016534814814814816, + "loss": 0.2055, + "step": 3840 + }, + { + "epoch": 0.36776997659645605, + "grad_norm": 0.36952847242355347, + "learning_rate": 0.0001652, + "loss": 0.2086, + "step": 3850 + }, + { + "epoch": 0.3687252232889144, + "grad_norm": 0.3616020083427429, + "learning_rate": 0.00016505185185185185, + "loss": 0.2036, + "step": 3860 + }, + { + "epoch": 0.3696804699813727, + "grad_norm": 0.12143600732088089, + "learning_rate": 0.00016490370370370372, + "loss": 0.2025, + "step": 3870 + }, + { + "epoch": 0.370635716673831, + "grad_norm": 0.13124673068523407, + "learning_rate": 0.00016475555555555557, + "loss": 0.2016, + "step": 3880 + }, + { + "epoch": 0.37159096336628933, + "grad_norm": 0.1724298894405365, + "learning_rate": 0.00016460740740740742, + "loss": 0.2038, + "step": 3890 + }, + { + "epoch": 0.37254621005874766, + "grad_norm": 0.32873499393463135, + "learning_rate": 0.00016445925925925926, + "loss": 0.205, + "step": 3900 + }, + { + "epoch": 0.373501456751206, + "grad_norm": 0.26445272564888, + "learning_rate": 0.00016431111111111114, + "loss": 0.2064, + "step": 3910 + }, + { + "epoch": 0.37445670344366433, + "grad_norm": 0.11803455650806427, + "learning_rate": 0.00016416296296296295, + "loss": 0.203, + "step": 3920 + }, + { + "epoch": 0.37541195013612266, + "grad_norm": 0.13132309913635254, + "learning_rate": 0.00016401481481481483, + "loss": 0.1999, + "step": 3930 + }, + { + "epoch": 0.376367196828581, + "grad_norm": 0.14737923443317413, + "learning_rate": 0.00016386666666666667, + "loss": 0.2034, + "step": 3940 + }, + { + "epoch": 0.37732244352103933, + "grad_norm": 0.357509583234787, + "learning_rate": 0.00016371851851851852, + "loss": 0.2051, + "step": 3950 + }, + { + "epoch": 0.3782776902134976, + "grad_norm": 0.17359203100204468, + "learning_rate": 0.00016357037037037037, + "loss": 0.203, + "step": 3960 + }, + { + "epoch": 0.37923293690595594, + "grad_norm": 0.1117023155093193, + "learning_rate": 0.00016342222222222224, + "loss": 0.2033, + "step": 3970 + }, + { + "epoch": 0.3801881835984143, + "grad_norm": 0.10505598783493042, + "learning_rate": 0.00016327407407407409, + "loss": 0.2018, + "step": 3980 + }, + { + "epoch": 0.3811434302908726, + "grad_norm": 0.17457084357738495, + "learning_rate": 0.00016312592592592593, + "loss": 0.2061, + "step": 3990 + }, + { + "epoch": 0.38209867698333094, + "grad_norm": 0.3120948076248169, + "learning_rate": 0.00016297777777777778, + "loss": 0.2062, + "step": 4000 + }, + { + "epoch": 0.3830539236757893, + "grad_norm": 0.37100452184677124, + "learning_rate": 0.00016282962962962965, + "loss": 0.2052, + "step": 4010 + }, + { + "epoch": 0.3840091703682476, + "grad_norm": 0.08743845671415329, + "learning_rate": 0.00016268148148148147, + "loss": 0.2018, + "step": 4020 + }, + { + "epoch": 0.38496441706070594, + "grad_norm": 0.1295013129711151, + "learning_rate": 0.00016253333333333334, + "loss": 0.2003, + "step": 4030 + }, + { + "epoch": 0.3859196637531643, + "grad_norm": 0.15588463842868805, + "learning_rate": 0.0001623851851851852, + "loss": 0.205, + "step": 4040 + }, + { + "epoch": 0.38687491044562256, + "grad_norm": 0.3880206346511841, + "learning_rate": 0.00016223703703703706, + "loss": 0.2031, + "step": 4050 + }, + { + "epoch": 0.3878301571380809, + "grad_norm": 0.13966549932956696, + "learning_rate": 0.00016208888888888888, + "loss": 0.2134, + "step": 4060 + }, + { + "epoch": 0.3887854038305392, + "grad_norm": 0.11811528354883194, + "learning_rate": 0.00016194074074074076, + "loss": 0.1983, + "step": 4070 + }, + { + "epoch": 0.38974065052299756, + "grad_norm": 0.14821895956993103, + "learning_rate": 0.0001617925925925926, + "loss": 0.202, + "step": 4080 + }, + { + "epoch": 0.3906958972154559, + "grad_norm": 0.12111028283834457, + "learning_rate": 0.00016164444444444445, + "loss": 0.2039, + "step": 4090 + }, + { + "epoch": 0.3916511439079142, + "grad_norm": 0.371774822473526, + "learning_rate": 0.0001614962962962963, + "loss": 0.2047, + "step": 4100 + }, + { + "epoch": 0.39260639060037256, + "grad_norm": 0.3905799090862274, + "learning_rate": 0.00016134814814814817, + "loss": 0.2191, + "step": 4110 + }, + { + "epoch": 0.3935616372928309, + "grad_norm": 0.13462865352630615, + "learning_rate": 0.00016120000000000002, + "loss": 0.2046, + "step": 4120 + }, + { + "epoch": 0.3945168839852892, + "grad_norm": 0.12144972383975983, + "learning_rate": 0.00016105185185185186, + "loss": 0.2023, + "step": 4130 + }, + { + "epoch": 0.3954721306777475, + "grad_norm": 0.14258955419063568, + "learning_rate": 0.0001609037037037037, + "loss": 0.2044, + "step": 4140 + }, + { + "epoch": 0.39642737737020584, + "grad_norm": 0.32331934571266174, + "learning_rate": 0.00016075555555555558, + "loss": 0.2035, + "step": 4150 + }, + { + "epoch": 0.39738262406266417, + "grad_norm": 0.2566055655479431, + "learning_rate": 0.0001606074074074074, + "loss": 0.2116, + "step": 4160 + }, + { + "epoch": 0.3983378707551225, + "grad_norm": 0.12487474083900452, + "learning_rate": 0.00016045925925925927, + "loss": 0.2053, + "step": 4170 + }, + { + "epoch": 0.39929311744758084, + "grad_norm": 0.137589693069458, + "learning_rate": 0.00016031111111111112, + "loss": 0.2021, + "step": 4180 + }, + { + "epoch": 0.40024836414003917, + "grad_norm": 0.12724049389362335, + "learning_rate": 0.000160162962962963, + "loss": 0.2045, + "step": 4190 + }, + { + "epoch": 0.4012036108324975, + "grad_norm": 0.3219785690307617, + "learning_rate": 0.0001600148148148148, + "loss": 0.2057, + "step": 4200 + }, + { + "epoch": 0.40215885752495584, + "grad_norm": 0.12036234885454178, + "learning_rate": 0.00015986666666666669, + "loss": 0.2069, + "step": 4210 + }, + { + "epoch": 0.40311410421741417, + "grad_norm": 0.10740282386541367, + "learning_rate": 0.00015971851851851853, + "loss": 0.2008, + "step": 4220 + }, + { + "epoch": 0.40406935090987245, + "grad_norm": 0.12133664637804031, + "learning_rate": 0.00015957037037037038, + "loss": 0.2014, + "step": 4230 + }, + { + "epoch": 0.4050245976023308, + "grad_norm": 0.17164857685565948, + "learning_rate": 0.00015942222222222222, + "loss": 0.2034, + "step": 4240 + }, + { + "epoch": 0.4059798442947891, + "grad_norm": 0.46345826983451843, + "learning_rate": 0.0001592740740740741, + "loss": 0.2045, + "step": 4250 + }, + { + "epoch": 0.40693509098724745, + "grad_norm": 0.20649810135364532, + "learning_rate": 0.00015912592592592592, + "loss": 0.2111, + "step": 4260 + }, + { + "epoch": 0.4078903376797058, + "grad_norm": 0.08920012414455414, + "learning_rate": 0.0001589777777777778, + "loss": 0.2024, + "step": 4270 + }, + { + "epoch": 0.4088455843721641, + "grad_norm": 0.11577285826206207, + "learning_rate": 0.00015882962962962964, + "loss": 0.1994, + "step": 4280 + }, + { + "epoch": 0.40980083106462245, + "grad_norm": 0.16047626733779907, + "learning_rate": 0.0001586814814814815, + "loss": 0.2037, + "step": 4290 + }, + { + "epoch": 0.4107560777570808, + "grad_norm": 0.2667694687843323, + "learning_rate": 0.00015853333333333333, + "loss": 0.2054, + "step": 4300 + }, + { + "epoch": 0.4117113244495391, + "grad_norm": 0.1745455265045166, + "learning_rate": 0.0001583851851851852, + "loss": 0.2097, + "step": 4310 + }, + { + "epoch": 0.4126665711419974, + "grad_norm": 0.10706843435764313, + "learning_rate": 0.00015823703703703705, + "loss": 0.2037, + "step": 4320 + }, + { + "epoch": 0.41362181783445573, + "grad_norm": 0.12602153420448303, + "learning_rate": 0.0001580888888888889, + "loss": 0.203, + "step": 4330 + }, + { + "epoch": 0.41457706452691406, + "grad_norm": 0.1355423778295517, + "learning_rate": 0.00015794074074074074, + "loss": 0.205, + "step": 4340 + }, + { + "epoch": 0.4155323112193724, + "grad_norm": 0.4957958459854126, + "learning_rate": 0.00015779259259259261, + "loss": 0.2059, + "step": 4350 + }, + { + "epoch": 0.41648755791183073, + "grad_norm": 0.17122064530849457, + "learning_rate": 0.00015764444444444446, + "loss": 0.207, + "step": 4360 + }, + { + "epoch": 0.41744280460428906, + "grad_norm": 0.09928039461374283, + "learning_rate": 0.0001574962962962963, + "loss": 0.1999, + "step": 4370 + }, + { + "epoch": 0.4183980512967474, + "grad_norm": 0.09767697006464005, + "learning_rate": 0.00015734814814814815, + "loss": 0.1991, + "step": 4380 + }, + { + "epoch": 0.41935329798920573, + "grad_norm": 0.1489480435848236, + "learning_rate": 0.00015720000000000003, + "loss": 0.2021, + "step": 4390 + }, + { + "epoch": 0.42030854468166406, + "grad_norm": 0.38458502292633057, + "learning_rate": 0.00015705185185185185, + "loss": 0.2065, + "step": 4400 + }, + { + "epoch": 0.42126379137412234, + "grad_norm": 0.2195194512605667, + "learning_rate": 0.00015690370370370372, + "loss": 0.2083, + "step": 4410 + }, + { + "epoch": 0.4222190380665807, + "grad_norm": 0.0922282487154007, + "learning_rate": 0.00015675555555555557, + "loss": 0.2035, + "step": 4420 + }, + { + "epoch": 0.423174284759039, + "grad_norm": 0.12092910706996918, + "learning_rate": 0.00015660740740740744, + "loss": 0.201, + "step": 4430 + }, + { + "epoch": 0.42412953145149734, + "grad_norm": 0.1265036016702652, + "learning_rate": 0.00015645925925925926, + "loss": 0.2041, + "step": 4440 + }, + { + "epoch": 0.4250847781439557, + "grad_norm": 0.36383864283561707, + "learning_rate": 0.00015631111111111113, + "loss": 0.2031, + "step": 4450 + }, + { + "epoch": 0.426040024836414, + "grad_norm": 0.2539485991001129, + "learning_rate": 0.00015616296296296298, + "loss": 0.2136, + "step": 4460 + }, + { + "epoch": 0.42699527152887234, + "grad_norm": 0.09574822336435318, + "learning_rate": 0.00015601481481481482, + "loss": 0.2051, + "step": 4470 + }, + { + "epoch": 0.4279505182213307, + "grad_norm": 0.12107813358306885, + "learning_rate": 0.00015586666666666667, + "loss": 0.2021, + "step": 4480 + }, + { + "epoch": 0.428905764913789, + "grad_norm": 0.14104434847831726, + "learning_rate": 0.00015571851851851854, + "loss": 0.2037, + "step": 4490 + }, + { + "epoch": 0.4298610116062473, + "grad_norm": 0.3190214931964874, + "learning_rate": 0.0001555703703703704, + "loss": 0.2039, + "step": 4500 + }, + { + "epoch": 0.4308162582987056, + "grad_norm": 0.15680533647537231, + "learning_rate": 0.00015542222222222224, + "loss": 0.2124, + "step": 4510 + }, + { + "epoch": 0.43177150499116396, + "grad_norm": 0.14274321496486664, + "learning_rate": 0.00015527407407407408, + "loss": 0.2027, + "step": 4520 + }, + { + "epoch": 0.4327267516836223, + "grad_norm": 0.12968724966049194, + "learning_rate": 0.00015512592592592593, + "loss": 0.2026, + "step": 4530 + }, + { + "epoch": 0.4336819983760806, + "grad_norm": 0.1476454883813858, + "learning_rate": 0.00015497777777777777, + "loss": 0.2044, + "step": 4540 + }, + { + "epoch": 0.43463724506853896, + "grad_norm": 0.3405204117298126, + "learning_rate": 0.00015482962962962965, + "loss": 0.2056, + "step": 4550 + }, + { + "epoch": 0.4355924917609973, + "grad_norm": 0.22096122801303864, + "learning_rate": 0.0001546814814814815, + "loss": 0.2055, + "step": 4560 + }, + { + "epoch": 0.4365477384534556, + "grad_norm": 0.13107611238956451, + "learning_rate": 0.00015453333333333334, + "loss": 0.2018, + "step": 4570 + }, + { + "epoch": 0.43750298514591396, + "grad_norm": 0.11204512417316437, + "learning_rate": 0.00015438518518518519, + "loss": 0.2, + "step": 4580 + }, + { + "epoch": 0.43845823183837224, + "grad_norm": 0.143476665019989, + "learning_rate": 0.00015423703703703706, + "loss": 0.2048, + "step": 4590 + }, + { + "epoch": 0.43941347853083057, + "grad_norm": 0.38437598943710327, + "learning_rate": 0.0001540888888888889, + "loss": 0.2055, + "step": 4600 + }, + { + "epoch": 0.4403687252232889, + "grad_norm": 0.2756338119506836, + "learning_rate": 0.00015394074074074075, + "loss": 0.2076, + "step": 4610 + }, + { + "epoch": 0.44132397191574724, + "grad_norm": 0.09002037346363068, + "learning_rate": 0.0001537925925925926, + "loss": 0.2036, + "step": 4620 + }, + { + "epoch": 0.44227921860820557, + "grad_norm": 0.114268459379673, + "learning_rate": 0.00015364444444444444, + "loss": 0.2011, + "step": 4630 + }, + { + "epoch": 0.4432344653006639, + "grad_norm": 0.12585988640785217, + "learning_rate": 0.0001534962962962963, + "loss": 0.2035, + "step": 4640 + }, + { + "epoch": 0.44418971199312224, + "grad_norm": 0.3231578767299652, + "learning_rate": 0.00015334814814814816, + "loss": 0.2051, + "step": 4650 + }, + { + "epoch": 0.44514495868558057, + "grad_norm": 0.27576273679733276, + "learning_rate": 0.0001532, + "loss": 0.2075, + "step": 4660 + }, + { + "epoch": 0.4461002053780389, + "grad_norm": 0.10367155820131302, + "learning_rate": 0.00015305185185185186, + "loss": 0.202, + "step": 4670 + }, + { + "epoch": 0.4470554520704972, + "grad_norm": 0.12141682952642441, + "learning_rate": 0.0001529037037037037, + "loss": 0.2026, + "step": 4680 + }, + { + "epoch": 0.4480106987629555, + "grad_norm": 0.1607910692691803, + "learning_rate": 0.00015275555555555558, + "loss": 0.2038, + "step": 4690 + }, + { + "epoch": 0.44896594545541385, + "grad_norm": 0.3699163794517517, + "learning_rate": 0.00015260740740740742, + "loss": 0.2055, + "step": 4700 + }, + { + "epoch": 0.4499211921478722, + "grad_norm": 0.12423942238092422, + "learning_rate": 0.00015245925925925927, + "loss": 0.2083, + "step": 4710 + }, + { + "epoch": 0.4508764388403305, + "grad_norm": 0.12863990664482117, + "learning_rate": 0.00015231111111111111, + "loss": 0.2006, + "step": 4720 + }, + { + "epoch": 0.45183168553278885, + "grad_norm": 0.11075339466333389, + "learning_rate": 0.00015216296296296296, + "loss": 0.201, + "step": 4730 + }, + { + "epoch": 0.4527869322252472, + "grad_norm": 0.16675962507724762, + "learning_rate": 0.00015201481481481483, + "loss": 0.2054, + "step": 4740 + }, + { + "epoch": 0.4537421789177055, + "grad_norm": 0.3847846984863281, + "learning_rate": 0.00015186666666666668, + "loss": 0.205, + "step": 4750 + }, + { + "epoch": 0.45469742561016385, + "grad_norm": 0.1789114773273468, + "learning_rate": 0.00015171851851851853, + "loss": 0.2002, + "step": 4760 + }, + { + "epoch": 0.45565267230262213, + "grad_norm": 0.09958454221487045, + "learning_rate": 0.00015157037037037037, + "loss": 0.2004, + "step": 4770 + }, + { + "epoch": 0.45660791899508046, + "grad_norm": 0.09986834973096848, + "learning_rate": 0.00015142222222222222, + "loss": 0.2008, + "step": 4780 + }, + { + "epoch": 0.4575631656875388, + "grad_norm": 0.14175297319889069, + "learning_rate": 0.00015127407407407407, + "loss": 0.2034, + "step": 4790 + }, + { + "epoch": 0.45851841237999713, + "grad_norm": 0.42737746238708496, + "learning_rate": 0.00015112592592592594, + "loss": 0.2075, + "step": 4800 + }, + { + "epoch": 0.45947365907245546, + "grad_norm": 0.3718159794807434, + "learning_rate": 0.00015097777777777779, + "loss": 0.2066, + "step": 4810 + }, + { + "epoch": 0.4604289057649138, + "grad_norm": 0.12669920921325684, + "learning_rate": 0.00015082962962962963, + "loss": 0.2045, + "step": 4820 + }, + { + "epoch": 0.46138415245737213, + "grad_norm": 0.1332644671201706, + "learning_rate": 0.00015068148148148148, + "loss": 0.2007, + "step": 4830 + }, + { + "epoch": 0.46233939914983047, + "grad_norm": 0.16694070398807526, + "learning_rate": 0.00015053333333333335, + "loss": 0.2033, + "step": 4840 + }, + { + "epoch": 0.46329464584228874, + "grad_norm": 0.3865353465080261, + "learning_rate": 0.0001503851851851852, + "loss": 0.2059, + "step": 4850 + }, + { + "epoch": 0.4642498925347471, + "grad_norm": 0.18828840553760529, + "learning_rate": 0.00015023703703703704, + "loss": 0.207, + "step": 4860 + }, + { + "epoch": 0.4652051392272054, + "grad_norm": 0.10073354095220566, + "learning_rate": 0.0001500888888888889, + "loss": 0.201, + "step": 4870 + }, + { + "epoch": 0.46616038591966374, + "grad_norm": 0.16433393955230713, + "learning_rate": 0.00014994074074074074, + "loss": 0.2012, + "step": 4880 + }, + { + "epoch": 0.4671156326121221, + "grad_norm": 0.12232095003128052, + "learning_rate": 0.00014979259259259258, + "loss": 0.2041, + "step": 4890 + }, + { + "epoch": 0.4680708793045804, + "grad_norm": 0.34756287932395935, + "learning_rate": 0.00014964444444444446, + "loss": 0.2057, + "step": 4900 + }, + { + "epoch": 0.46902612599703875, + "grad_norm": 0.22505418956279755, + "learning_rate": 0.0001494962962962963, + "loss": 0.2039, + "step": 4910 + }, + { + "epoch": 0.4699813726894971, + "grad_norm": 0.1396235227584839, + "learning_rate": 0.00014934814814814815, + "loss": 0.2052, + "step": 4920 + }, + { + "epoch": 0.4709366193819554, + "grad_norm": 0.0969369113445282, + "learning_rate": 0.0001492, + "loss": 0.2, + "step": 4930 + }, + { + "epoch": 0.4718918660744137, + "grad_norm": 0.15045002102851868, + "learning_rate": 0.00014905185185185187, + "loss": 0.2037, + "step": 4940 + }, + { + "epoch": 0.472847112766872, + "grad_norm": 0.365792840719223, + "learning_rate": 0.00014890370370370371, + "loss": 0.2063, + "step": 4950 + }, + { + "epoch": 0.47380235945933036, + "grad_norm": 0.17121556401252747, + "learning_rate": 0.00014875555555555556, + "loss": 0.1996, + "step": 4960 + }, + { + "epoch": 0.4747576061517887, + "grad_norm": 0.09126877784729004, + "learning_rate": 0.0001486074074074074, + "loss": 0.2057, + "step": 4970 + }, + { + "epoch": 0.475712852844247, + "grad_norm": 0.08809260278940201, + "learning_rate": 0.00014845925925925928, + "loss": 0.1994, + "step": 4980 + }, + { + "epoch": 0.47666809953670536, + "grad_norm": 0.1562274992465973, + "learning_rate": 0.0001483111111111111, + "loss": 0.204, + "step": 4990 + }, + { + "epoch": 0.4776233462291637, + "grad_norm": 0.2817908227443695, + "learning_rate": 0.00014816296296296297, + "loss": 0.208, + "step": 5000 + }, + { + "epoch": 0.478578592921622, + "grad_norm": 0.21794337034225464, + "learning_rate": 0.00014801481481481482, + "loss": 0.2046, + "step": 5010 + }, + { + "epoch": 0.47953383961408036, + "grad_norm": 0.08885704725980759, + "learning_rate": 0.00014786666666666666, + "loss": 0.2031, + "step": 5020 + }, + { + "epoch": 0.48048908630653864, + "grad_norm": 0.08695749193429947, + "learning_rate": 0.0001477185185185185, + "loss": 0.2023, + "step": 5030 + }, + { + "epoch": 0.48144433299899697, + "grad_norm": 0.11303768306970596, + "learning_rate": 0.00014757037037037038, + "loss": 0.2047, + "step": 5040 + }, + { + "epoch": 0.4823995796914553, + "grad_norm": 0.32796552777290344, + "learning_rate": 0.00014742222222222223, + "loss": 0.2056, + "step": 5050 + }, + { + "epoch": 0.48335482638391364, + "grad_norm": 0.17214402556419373, + "learning_rate": 0.00014727407407407408, + "loss": 0.202, + "step": 5060 + }, + { + "epoch": 0.48431007307637197, + "grad_norm": 0.131885826587677, + "learning_rate": 0.00014712592592592592, + "loss": 0.2005, + "step": 5070 + }, + { + "epoch": 0.4852653197688303, + "grad_norm": 0.09677991271018982, + "learning_rate": 0.0001469777777777778, + "loss": 0.2007, + "step": 5080 + }, + { + "epoch": 0.48622056646128864, + "grad_norm": 0.11090332269668579, + "learning_rate": 0.00014682962962962962, + "loss": 0.2052, + "step": 5090 + }, + { + "epoch": 0.487175813153747, + "grad_norm": 0.3222978711128235, + "learning_rate": 0.0001466814814814815, + "loss": 0.2057, + "step": 5100 + }, + { + "epoch": 0.4881310598462053, + "grad_norm": 0.15994799137115479, + "learning_rate": 0.00014653333333333334, + "loss": 0.2023, + "step": 5110 + }, + { + "epoch": 0.4890863065386636, + "grad_norm": 0.09031850844621658, + "learning_rate": 0.0001463851851851852, + "loss": 0.205, + "step": 5120 + }, + { + "epoch": 0.4900415532311219, + "grad_norm": 0.0860792025923729, + "learning_rate": 0.00014623703703703703, + "loss": 0.2018, + "step": 5130 + }, + { + "epoch": 0.49099679992358025, + "grad_norm": 0.14190521836280823, + "learning_rate": 0.0001460888888888889, + "loss": 0.2047, + "step": 5140 + }, + { + "epoch": 0.4919520466160386, + "grad_norm": 0.266658216714859, + "learning_rate": 0.00014594074074074075, + "loss": 0.2038, + "step": 5150 + }, + { + "epoch": 0.4929072933084969, + "grad_norm": 0.20443643629550934, + "learning_rate": 0.0001457925925925926, + "loss": 0.1998, + "step": 5160 + }, + { + "epoch": 0.49386254000095525, + "grad_norm": 0.1025327667593956, + "learning_rate": 0.00014564444444444444, + "loss": 0.2035, + "step": 5170 + }, + { + "epoch": 0.4948177866934136, + "grad_norm": 0.08821458369493484, + "learning_rate": 0.0001454962962962963, + "loss": 0.2006, + "step": 5180 + }, + { + "epoch": 0.4957730333858719, + "grad_norm": 0.10269002616405487, + "learning_rate": 0.00014534814814814813, + "loss": 0.2026, + "step": 5190 + }, + { + "epoch": 0.49672828007833025, + "grad_norm": 0.310215562582016, + "learning_rate": 0.0001452, + "loss": 0.2049, + "step": 5200 + }, + { + "epoch": 0.49768352677078853, + "grad_norm": 0.12952567636966705, + "learning_rate": 0.00014505185185185185, + "loss": 0.2056, + "step": 5210 + }, + { + "epoch": 0.49863877346324686, + "grad_norm": 0.07955840229988098, + "learning_rate": 0.00014490370370370373, + "loss": 0.1991, + "step": 5220 + }, + { + "epoch": 0.4995940201557052, + "grad_norm": 0.10103992372751236, + "learning_rate": 0.00014475555555555554, + "loss": 0.1995, + "step": 5230 + }, + { + "epoch": 0.5005492668481636, + "grad_norm": 0.1356901079416275, + "learning_rate": 0.00014460740740740742, + "loss": 0.2043, + "step": 5240 + }, + { + "epoch": 0.5015045135406219, + "grad_norm": 0.40337422490119934, + "learning_rate": 0.00014445925925925926, + "loss": 0.206, + "step": 5250 + }, + { + "epoch": 0.5024597602330801, + "grad_norm": 0.22015082836151123, + "learning_rate": 0.0001443111111111111, + "loss": 0.2009, + "step": 5260 + }, + { + "epoch": 0.5034150069255385, + "grad_norm": 0.09830496460199356, + "learning_rate": 0.00014416296296296296, + "loss": 0.202, + "step": 5270 + }, + { + "epoch": 0.5043702536179968, + "grad_norm": 0.12919305264949799, + "learning_rate": 0.00014401481481481483, + "loss": 0.2021, + "step": 5280 + }, + { + "epoch": 0.5053255003104552, + "grad_norm": 0.13084401190280914, + "learning_rate": 0.00014386666666666668, + "loss": 0.2034, + "step": 5290 + }, + { + "epoch": 0.5062807470029135, + "grad_norm": 0.33833256363868713, + "learning_rate": 0.00014371851851851852, + "loss": 0.2077, + "step": 5300 + }, + { + "epoch": 0.5072359936953719, + "grad_norm": 0.17477412521839142, + "learning_rate": 0.00014357037037037037, + "loss": 0.1996, + "step": 5310 + }, + { + "epoch": 0.5081912403878301, + "grad_norm": 0.09218191355466843, + "learning_rate": 0.00014342222222222224, + "loss": 0.2059, + "step": 5320 + }, + { + "epoch": 0.5091464870802885, + "grad_norm": 0.09006420522928238, + "learning_rate": 0.00014327407407407406, + "loss": 0.2019, + "step": 5330 + }, + { + "epoch": 0.5101017337727468, + "grad_norm": 0.12034288048744202, + "learning_rate": 0.00014312592592592593, + "loss": 0.2046, + "step": 5340 + }, + { + "epoch": 0.5110569804652051, + "grad_norm": 0.28528374433517456, + "learning_rate": 0.00014297777777777778, + "loss": 0.206, + "step": 5350 + }, + { + "epoch": 0.5120122271576635, + "grad_norm": 0.14377838373184204, + "learning_rate": 0.00014282962962962965, + "loss": 0.2025, + "step": 5360 + }, + { + "epoch": 0.5129674738501218, + "grad_norm": 0.1412467062473297, + "learning_rate": 0.00014268148148148147, + "loss": 0.2013, + "step": 5370 + }, + { + "epoch": 0.5139227205425801, + "grad_norm": 0.08147553354501724, + "learning_rate": 0.00014253333333333335, + "loss": 0.201, + "step": 5380 + }, + { + "epoch": 0.5148779672350384, + "grad_norm": 0.13719892501831055, + "learning_rate": 0.0001423851851851852, + "loss": 0.2034, + "step": 5390 + }, + { + "epoch": 0.5158332139274968, + "grad_norm": 0.3484647274017334, + "learning_rate": 0.00014223703703703704, + "loss": 0.203, + "step": 5400 + }, + { + "epoch": 0.5167884606199551, + "grad_norm": 0.1756839007139206, + "learning_rate": 0.00014208888888888889, + "loss": 0.2031, + "step": 5410 + }, + { + "epoch": 0.5177437073124135, + "grad_norm": 0.10281263291835785, + "learning_rate": 0.00014194074074074076, + "loss": 0.1998, + "step": 5420 + }, + { + "epoch": 0.5186989540048718, + "grad_norm": 0.08952672779560089, + "learning_rate": 0.0001417925925925926, + "loss": 0.1995, + "step": 5430 + }, + { + "epoch": 0.51965420069733, + "grad_norm": 0.12098149210214615, + "learning_rate": 0.00014164444444444445, + "loss": 0.2014, + "step": 5440 + }, + { + "epoch": 0.5206094473897884, + "grad_norm": 0.31019869446754456, + "learning_rate": 0.0001414962962962963, + "loss": 0.2026, + "step": 5450 + }, + { + "epoch": 0.5215646940822467, + "grad_norm": 0.13479962944984436, + "learning_rate": 0.00014134814814814817, + "loss": 0.2033, + "step": 5460 + }, + { + "epoch": 0.5225199407747051, + "grad_norm": 0.11460354179143906, + "learning_rate": 0.0001412, + "loss": 0.1978, + "step": 5470 + }, + { + "epoch": 0.5234751874671634, + "grad_norm": 0.10888929665088654, + "learning_rate": 0.00014105185185185186, + "loss": 0.1984, + "step": 5480 + }, + { + "epoch": 0.5244304341596218, + "grad_norm": 0.15640781819820404, + "learning_rate": 0.0001409037037037037, + "loss": 0.2018, + "step": 5490 + }, + { + "epoch": 0.52538568085208, + "grad_norm": 0.32829177379608154, + "learning_rate": 0.00014075555555555556, + "loss": 0.225, + "step": 5500 + }, + { + "epoch": 0.5263409275445384, + "grad_norm": 0.16984596848487854, + "learning_rate": 0.0001406074074074074, + "loss": 0.2056, + "step": 5510 + }, + { + "epoch": 0.5272961742369967, + "grad_norm": 0.13876718282699585, + "learning_rate": 0.00014045925925925928, + "loss": 0.1953, + "step": 5520 + }, + { + "epoch": 0.528251420929455, + "grad_norm": 0.10149407386779785, + "learning_rate": 0.00014031111111111112, + "loss": 0.1984, + "step": 5530 + }, + { + "epoch": 0.5292066676219134, + "grad_norm": 0.13450825214385986, + "learning_rate": 0.00014016296296296297, + "loss": 0.2034, + "step": 5540 + }, + { + "epoch": 0.5301619143143717, + "grad_norm": 0.38654589653015137, + "learning_rate": 0.00014001481481481481, + "loss": 0.2043, + "step": 5550 + }, + { + "epoch": 0.53111716100683, + "grad_norm": 0.12655580043792725, + "learning_rate": 0.0001398666666666667, + "loss": 0.2062, + "step": 5560 + }, + { + "epoch": 0.5320724076992883, + "grad_norm": 0.1314125508069992, + "learning_rate": 0.0001397185185185185, + "loss": 0.2015, + "step": 5570 + }, + { + "epoch": 0.5330276543917467, + "grad_norm": 0.09950239211320877, + "learning_rate": 0.00013957037037037038, + "loss": 0.2022, + "step": 5580 + }, + { + "epoch": 0.533982901084205, + "grad_norm": 0.12652038037776947, + "learning_rate": 0.00013942222222222223, + "loss": 0.2041, + "step": 5590 + }, + { + "epoch": 0.5349381477766634, + "grad_norm": 0.3637951910495758, + "learning_rate": 0.0001392740740740741, + "loss": 0.2052, + "step": 5600 + }, + { + "epoch": 0.5358933944691217, + "grad_norm": 0.38742905855178833, + "learning_rate": 0.00013912592592592592, + "loss": 0.2041, + "step": 5610 + }, + { + "epoch": 0.5368486411615799, + "grad_norm": 0.08842667192220688, + "learning_rate": 0.0001389777777777778, + "loss": 0.2045, + "step": 5620 + }, + { + "epoch": 0.5378038878540383, + "grad_norm": 0.08290154486894608, + "learning_rate": 0.00013882962962962964, + "loss": 0.2009, + "step": 5630 + }, + { + "epoch": 0.5387591345464966, + "grad_norm": 0.12457109987735748, + "learning_rate": 0.00013868148148148148, + "loss": 0.2037, + "step": 5640 + }, + { + "epoch": 0.539714381238955, + "grad_norm": 0.26266059279441833, + "learning_rate": 0.00013853333333333333, + "loss": 0.2048, + "step": 5650 + }, + { + "epoch": 0.5406696279314133, + "grad_norm": 0.19951313734054565, + "learning_rate": 0.0001383851851851852, + "loss": 0.2017, + "step": 5660 + }, + { + "epoch": 0.5416248746238717, + "grad_norm": 0.1073467880487442, + "learning_rate": 0.00013823703703703705, + "loss": 0.2007, + "step": 5670 + }, + { + "epoch": 0.5425801213163299, + "grad_norm": 0.08884900063276291, + "learning_rate": 0.0001380888888888889, + "loss": 0.1992, + "step": 5680 + }, + { + "epoch": 0.5435353680087883, + "grad_norm": 0.1196378618478775, + "learning_rate": 0.00013794074074074074, + "loss": 0.2018, + "step": 5690 + }, + { + "epoch": 0.5444906147012466, + "grad_norm": 0.2851475477218628, + "learning_rate": 0.00013779259259259262, + "loss": 0.2026, + "step": 5700 + }, + { + "epoch": 0.5454458613937049, + "grad_norm": 0.14857490360736847, + "learning_rate": 0.00013764444444444444, + "loss": 0.2037, + "step": 5710 + }, + { + "epoch": 0.5464011080861633, + "grad_norm": 0.102902352809906, + "learning_rate": 0.0001374962962962963, + "loss": 0.2007, + "step": 5720 + }, + { + "epoch": 0.5473563547786215, + "grad_norm": 0.10917162150144577, + "learning_rate": 0.00013734814814814815, + "loss": 0.2001, + "step": 5730 + }, + { + "epoch": 0.5483116014710799, + "grad_norm": 0.11089454591274261, + "learning_rate": 0.00013720000000000003, + "loss": 0.2025, + "step": 5740 + }, + { + "epoch": 0.5492668481635382, + "grad_norm": 0.3424724042415619, + "learning_rate": 0.00013705185185185185, + "loss": 0.204, + "step": 5750 + }, + { + "epoch": 0.5502220948559966, + "grad_norm": 0.15958619117736816, + "learning_rate": 0.00013690370370370372, + "loss": 0.1983, + "step": 5760 + }, + { + "epoch": 0.5511773415484549, + "grad_norm": 0.10369803756475449, + "learning_rate": 0.00013675555555555557, + "loss": 0.2033, + "step": 5770 + }, + { + "epoch": 0.5521325882409133, + "grad_norm": 0.08812570571899414, + "learning_rate": 0.0001366074074074074, + "loss": 0.1991, + "step": 5780 + }, + { + "epoch": 0.5530878349333715, + "grad_norm": 0.12774603068828583, + "learning_rate": 0.00013645925925925926, + "loss": 0.2034, + "step": 5790 + }, + { + "epoch": 0.5540430816258298, + "grad_norm": 0.32776811718940735, + "learning_rate": 0.00013631111111111113, + "loss": 0.2054, + "step": 5800 + }, + { + "epoch": 0.5549983283182882, + "grad_norm": 0.224983349442482, + "learning_rate": 0.00013616296296296295, + "loss": 0.1976, + "step": 5810 + }, + { + "epoch": 0.5559535750107465, + "grad_norm": 0.07955210655927658, + "learning_rate": 0.00013601481481481483, + "loss": 0.203, + "step": 5820 + }, + { + "epoch": 0.5569088217032049, + "grad_norm": 0.09523504227399826, + "learning_rate": 0.00013586666666666667, + "loss": 0.2003, + "step": 5830 + }, + { + "epoch": 0.5578640683956632, + "grad_norm": 0.11511880159378052, + "learning_rate": 0.00013571851851851854, + "loss": 0.202, + "step": 5840 + }, + { + "epoch": 0.5588193150881215, + "grad_norm": 0.39985740184783936, + "learning_rate": 0.00013557037037037036, + "loss": 0.2052, + "step": 5850 + }, + { + "epoch": 0.5597745617805798, + "grad_norm": 0.13187652826309204, + "learning_rate": 0.00013542222222222224, + "loss": 0.2024, + "step": 5860 + }, + { + "epoch": 0.5607298084730382, + "grad_norm": 0.09542467445135117, + "learning_rate": 0.00013527407407407408, + "loss": 0.2035, + "step": 5870 + }, + { + "epoch": 0.5616850551654965, + "grad_norm": 0.10654427856206894, + "learning_rate": 0.00013512592592592593, + "loss": 0.2, + "step": 5880 + }, + { + "epoch": 0.5626403018579548, + "grad_norm": 0.13208676874637604, + "learning_rate": 0.00013497777777777778, + "loss": 0.2019, + "step": 5890 + }, + { + "epoch": 0.5635955485504132, + "grad_norm": 0.32865169644355774, + "learning_rate": 0.00013482962962962965, + "loss": 0.2042, + "step": 5900 + }, + { + "epoch": 0.5645507952428714, + "grad_norm": 0.15548019111156464, + "learning_rate": 0.0001346814814814815, + "loss": 0.2016, + "step": 5910 + }, + { + "epoch": 0.5655060419353298, + "grad_norm": 0.08601599931716919, + "learning_rate": 0.00013453333333333334, + "loss": 0.1961, + "step": 5920 + }, + { + "epoch": 0.5664612886277881, + "grad_norm": 0.12143748253583908, + "learning_rate": 0.0001343851851851852, + "loss": 0.1991, + "step": 5930 + }, + { + "epoch": 0.5674165353202465, + "grad_norm": 0.16378195583820343, + "learning_rate": 0.00013423703703703706, + "loss": 0.2017, + "step": 5940 + }, + { + "epoch": 0.5683717820127048, + "grad_norm": 0.4630471467971802, + "learning_rate": 0.00013408888888888888, + "loss": 0.2049, + "step": 5950 + }, + { + "epoch": 0.5693270287051632, + "grad_norm": 0.3741738200187683, + "learning_rate": 0.00013394074074074075, + "loss": 0.2014, + "step": 5960 + }, + { + "epoch": 0.5702822753976214, + "grad_norm": 0.1016160398721695, + "learning_rate": 0.0001337925925925926, + "loss": 0.1998, + "step": 5970 + }, + { + "epoch": 0.5712375220900797, + "grad_norm": 0.09003070741891861, + "learning_rate": 0.00013364444444444447, + "loss": 0.1973, + "step": 5980 + }, + { + "epoch": 0.5721927687825381, + "grad_norm": 0.12196210771799088, + "learning_rate": 0.0001334962962962963, + "loss": 0.2013, + "step": 5990 + }, + { + "epoch": 0.5731480154749964, + "grad_norm": 0.3323560655117035, + "learning_rate": 0.00013334814814814817, + "loss": 0.2038, + "step": 6000 + }, + { + "epoch": 0.5741032621674548, + "grad_norm": 0.14074192941188812, + "learning_rate": 0.0001332, + "loss": 0.2042, + "step": 6010 + }, + { + "epoch": 0.575058508859913, + "grad_norm": 0.08944179117679596, + "learning_rate": 0.00013305185185185186, + "loss": 0.1973, + "step": 6020 + }, + { + "epoch": 0.5760137555523714, + "grad_norm": 0.09902766346931458, + "learning_rate": 0.0001329037037037037, + "loss": 0.1999, + "step": 6030 + }, + { + "epoch": 0.5769690022448297, + "grad_norm": 0.13410291075706482, + "learning_rate": 0.00013275555555555558, + "loss": 0.2037, + "step": 6040 + }, + { + "epoch": 0.5779242489372881, + "grad_norm": 0.27775460481643677, + "learning_rate": 0.00013260740740740742, + "loss": 0.2043, + "step": 6050 + }, + { + "epoch": 0.5788794956297464, + "grad_norm": 0.09671878814697266, + "learning_rate": 0.00013245925925925927, + "loss": 0.2022, + "step": 6060 + }, + { + "epoch": 0.5798347423222047, + "grad_norm": 0.09354133903980255, + "learning_rate": 0.00013231111111111112, + "loss": 0.1969, + "step": 6070 + }, + { + "epoch": 0.580789989014663, + "grad_norm": 0.10099305212497711, + "learning_rate": 0.000132162962962963, + "loss": 0.1995, + "step": 6080 + }, + { + "epoch": 0.5817452357071213, + "grad_norm": 0.13947731256484985, + "learning_rate": 0.0001320148148148148, + "loss": 0.2018, + "step": 6090 + }, + { + "epoch": 0.5827004823995797, + "grad_norm": 0.30881768465042114, + "learning_rate": 0.00013186666666666668, + "loss": 0.2043, + "step": 6100 + }, + { + "epoch": 0.583655729092038, + "grad_norm": 0.12652377784252167, + "learning_rate": 0.00013171851851851853, + "loss": 0.203, + "step": 6110 + }, + { + "epoch": 0.5846109757844964, + "grad_norm": 0.10004782676696777, + "learning_rate": 0.00013157037037037038, + "loss": 0.1979, + "step": 6120 + }, + { + "epoch": 0.5855662224769547, + "grad_norm": 0.09912140667438507, + "learning_rate": 0.00013142222222222222, + "loss": 0.1981, + "step": 6130 + }, + { + "epoch": 0.586521469169413, + "grad_norm": 0.14001400768756866, + "learning_rate": 0.0001312740740740741, + "loss": 0.2011, + "step": 6140 + }, + { + "epoch": 0.5874767158618713, + "grad_norm": 0.3409046232700348, + "learning_rate": 0.00013112592592592594, + "loss": 0.2032, + "step": 6150 + }, + { + "epoch": 0.5884319625543296, + "grad_norm": 0.15291565656661987, + "learning_rate": 0.0001309777777777778, + "loss": 0.2008, + "step": 6160 + }, + { + "epoch": 0.589387209246788, + "grad_norm": 0.09848395735025406, + "learning_rate": 0.00013082962962962963, + "loss": 0.2017, + "step": 6170 + }, + { + "epoch": 0.5903424559392463, + "grad_norm": 0.11449220776557922, + "learning_rate": 0.0001306814814814815, + "loss": 0.1994, + "step": 6180 + }, + { + "epoch": 0.5912977026317047, + "grad_norm": 0.15763157606124878, + "learning_rate": 0.00013053333333333333, + "loss": 0.2034, + "step": 6190 + }, + { + "epoch": 0.592252949324163, + "grad_norm": 0.34295904636383057, + "learning_rate": 0.0001303851851851852, + "loss": 0.204, + "step": 6200 + }, + { + "epoch": 0.5932081960166213, + "grad_norm": 0.168208509683609, + "learning_rate": 0.00013023703703703705, + "loss": 0.2024, + "step": 6210 + }, + { + "epoch": 0.5941634427090796, + "grad_norm": 0.08638432621955872, + "learning_rate": 0.0001300888888888889, + "loss": 0.2013, + "step": 6220 + }, + { + "epoch": 0.5951186894015379, + "grad_norm": 0.105532206594944, + "learning_rate": 0.00012994074074074074, + "loss": 0.199, + "step": 6230 + }, + { + "epoch": 0.5960739360939963, + "grad_norm": 0.1493140310049057, + "learning_rate": 0.0001297925925925926, + "loss": 0.2019, + "step": 6240 + }, + { + "epoch": 0.5970291827864546, + "grad_norm": 0.27100884914398193, + "learning_rate": 0.00012964444444444446, + "loss": 0.2039, + "step": 6250 + }, + { + "epoch": 0.597984429478913, + "grad_norm": 0.16155314445495605, + "learning_rate": 0.0001294962962962963, + "loss": 0.2014, + "step": 6260 + }, + { + "epoch": 0.5989396761713712, + "grad_norm": 0.13234031200408936, + "learning_rate": 0.00012934814814814815, + "loss": 0.2014, + "step": 6270 + }, + { + "epoch": 0.5998949228638296, + "grad_norm": 0.10277749598026276, + "learning_rate": 0.00012920000000000002, + "loss": 0.1998, + "step": 6280 + }, + { + "epoch": 0.6008501695562879, + "grad_norm": 0.1772095113992691, + "learning_rate": 0.00012905185185185187, + "loss": 0.2015, + "step": 6290 + }, + { + "epoch": 0.6018054162487463, + "grad_norm": 0.2691553831100464, + "learning_rate": 0.00012890370370370372, + "loss": 0.2036, + "step": 6300 + }, + { + "epoch": 0.6027606629412046, + "grad_norm": 0.1066615879535675, + "learning_rate": 0.00012875555555555556, + "loss": 0.2018, + "step": 6310 + }, + { + "epoch": 0.6037159096336628, + "grad_norm": 0.08042429387569427, + "learning_rate": 0.0001286074074074074, + "loss": 0.1989, + "step": 6320 + }, + { + "epoch": 0.6046711563261212, + "grad_norm": 0.10551764816045761, + "learning_rate": 0.00012845925925925925, + "loss": 0.1994, + "step": 6330 + }, + { + "epoch": 0.6056264030185795, + "grad_norm": 0.1348186433315277, + "learning_rate": 0.00012831111111111113, + "loss": 0.2035, + "step": 6340 + }, + { + "epoch": 0.6065816497110379, + "grad_norm": 0.3124079406261444, + "learning_rate": 0.00012816296296296297, + "loss": 0.2046, + "step": 6350 + }, + { + "epoch": 0.6075368964034962, + "grad_norm": 0.1197357103228569, + "learning_rate": 0.00012801481481481482, + "loss": 0.1994, + "step": 6360 + }, + { + "epoch": 0.6084921430959546, + "grad_norm": 0.08718045055866241, + "learning_rate": 0.00012786666666666667, + "loss": 0.1996, + "step": 6370 + }, + { + "epoch": 0.6094473897884128, + "grad_norm": 0.13004030287265778, + "learning_rate": 0.0001277185185185185, + "loss": 0.1988, + "step": 6380 + }, + { + "epoch": 0.6104026364808712, + "grad_norm": 0.1345457136631012, + "learning_rate": 0.00012757037037037039, + "loss": 0.2028, + "step": 6390 + }, + { + "epoch": 0.6113578831733295, + "grad_norm": 0.3091771900653839, + "learning_rate": 0.00012742222222222223, + "loss": 0.2078, + "step": 6400 + }, + { + "epoch": 0.6123131298657878, + "grad_norm": 0.15620669722557068, + "learning_rate": 0.00012727407407407408, + "loss": 0.2007, + "step": 6410 + }, + { + "epoch": 0.6132683765582462, + "grad_norm": 0.10864575207233429, + "learning_rate": 0.00012712592592592592, + "loss": 0.2002, + "step": 6420 + }, + { + "epoch": 0.6142236232507045, + "grad_norm": 0.09514521807432175, + "learning_rate": 0.00012697777777777777, + "loss": 0.2, + "step": 6430 + }, + { + "epoch": 0.6151788699431628, + "grad_norm": 0.12483695149421692, + "learning_rate": 0.00012682962962962964, + "loss": 0.202, + "step": 6440 + }, + { + "epoch": 0.6161341166356211, + "grad_norm": 0.22912171483039856, + "learning_rate": 0.0001266814814814815, + "loss": 0.2036, + "step": 6450 + }, + { + "epoch": 0.6170893633280795, + "grad_norm": 0.14288096129894257, + "learning_rate": 0.00012653333333333334, + "loss": 0.1948, + "step": 6460 + }, + { + "epoch": 0.6180446100205378, + "grad_norm": 0.11408117413520813, + "learning_rate": 0.00012638518518518518, + "loss": 0.2016, + "step": 6470 + }, + { + "epoch": 0.6189998567129962, + "grad_norm": 0.07267523556947708, + "learning_rate": 0.00012623703703703703, + "loss": 0.1998, + "step": 6480 + }, + { + "epoch": 0.6199551034054545, + "grad_norm": 0.11102595180273056, + "learning_rate": 0.0001260888888888889, + "loss": 0.2015, + "step": 6490 + }, + { + "epoch": 0.6209103500979127, + "grad_norm": 0.21290349960327148, + "learning_rate": 0.00012594074074074075, + "loss": 0.2053, + "step": 6500 + }, + { + "epoch": 0.6218655967903711, + "grad_norm": 0.12181617319583893, + "learning_rate": 0.0001257925925925926, + "loss": 0.1984, + "step": 6510 + }, + { + "epoch": 0.6228208434828294, + "grad_norm": 0.15453316271305084, + "learning_rate": 0.00012564444444444444, + "loss": 0.1981, + "step": 6520 + }, + { + "epoch": 0.6237760901752878, + "grad_norm": 0.12571464478969574, + "learning_rate": 0.00012549629629629631, + "loss": 0.1993, + "step": 6530 + }, + { + "epoch": 0.6247313368677461, + "grad_norm": 0.12125848233699799, + "learning_rate": 0.00012534814814814816, + "loss": 0.2018, + "step": 6540 + }, + { + "epoch": 0.6256865835602045, + "grad_norm": 0.23237983882427216, + "learning_rate": 0.0001252, + "loss": 0.2053, + "step": 6550 + }, + { + "epoch": 0.6266418302526627, + "grad_norm": 0.09626813232898712, + "learning_rate": 0.00012505185185185185, + "loss": 0.1961, + "step": 6560 + }, + { + "epoch": 0.6275970769451211, + "grad_norm": 0.08862323313951492, + "learning_rate": 0.0001249037037037037, + "loss": 0.1977, + "step": 6570 + }, + { + "epoch": 0.6285523236375794, + "grad_norm": 0.08696942031383514, + "learning_rate": 0.00012475555555555555, + "loss": 0.1996, + "step": 6580 + }, + { + "epoch": 0.6295075703300377, + "grad_norm": 0.14030681550502777, + "learning_rate": 0.00012460740740740742, + "loss": 0.2018, + "step": 6590 + }, + { + "epoch": 0.6304628170224961, + "grad_norm": 0.3134588301181793, + "learning_rate": 0.00012445925925925927, + "loss": 0.2036, + "step": 6600 + }, + { + "epoch": 0.6314180637149543, + "grad_norm": 0.14569984376430511, + "learning_rate": 0.0001243111111111111, + "loss": 0.1973, + "step": 6610 + }, + { + "epoch": 0.6323733104074127, + "grad_norm": 0.09531128406524658, + "learning_rate": 0.00012416296296296296, + "loss": 0.1997, + "step": 6620 + }, + { + "epoch": 0.633328557099871, + "grad_norm": 0.07848309725522995, + "learning_rate": 0.00012401481481481483, + "loss": 0.1996, + "step": 6630 + }, + { + "epoch": 0.6342838037923294, + "grad_norm": 0.1390487551689148, + "learning_rate": 0.00012386666666666665, + "loss": 0.2015, + "step": 6640 + }, + { + "epoch": 0.6352390504847877, + "grad_norm": 0.29697003960609436, + "learning_rate": 0.00012371851851851852, + "loss": 0.2038, + "step": 6650 + }, + { + "epoch": 0.6361942971772461, + "grad_norm": 0.16010154783725739, + "learning_rate": 0.00012357037037037037, + "loss": 0.1974, + "step": 6660 + }, + { + "epoch": 0.6371495438697043, + "grad_norm": 0.15159407258033752, + "learning_rate": 0.00012342222222222224, + "loss": 0.1984, + "step": 6670 + }, + { + "epoch": 0.6381047905621626, + "grad_norm": 0.1260843575000763, + "learning_rate": 0.00012327407407407406, + "loss": 0.1992, + "step": 6680 + }, + { + "epoch": 0.639060037254621, + "grad_norm": 0.1428649127483368, + "learning_rate": 0.00012312592592592594, + "loss": 0.2008, + "step": 6690 + }, + { + "epoch": 0.6400152839470793, + "grad_norm": 0.28894853591918945, + "learning_rate": 0.00012297777777777778, + "loss": 0.2038, + "step": 6700 + }, + { + "epoch": 0.6409705306395377, + "grad_norm": 0.10079352557659149, + "learning_rate": 0.00012282962962962963, + "loss": 0.196, + "step": 6710 + }, + { + "epoch": 0.641925777331996, + "grad_norm": 0.08398754894733429, + "learning_rate": 0.00012268148148148147, + "loss": 0.1973, + "step": 6720 + }, + { + "epoch": 0.6428810240244544, + "grad_norm": 0.10271371901035309, + "learning_rate": 0.00012253333333333335, + "loss": 0.1993, + "step": 6730 + }, + { + "epoch": 0.6438362707169126, + "grad_norm": 0.14714913070201874, + "learning_rate": 0.00012238518518518517, + "loss": 0.2017, + "step": 6740 + }, + { + "epoch": 0.644791517409371, + "grad_norm": 0.4217683970928192, + "learning_rate": 0.00012223703703703704, + "loss": 0.2046, + "step": 6750 + }, + { + "epoch": 0.6457467641018293, + "grad_norm": 0.11601690202951431, + "learning_rate": 0.0001220888888888889, + "loss": 0.1994, + "step": 6760 + }, + { + "epoch": 0.6467020107942876, + "grad_norm": 0.14154985547065735, + "learning_rate": 0.00012194074074074076, + "loss": 0.1992, + "step": 6770 + }, + { + "epoch": 0.647657257486746, + "grad_norm": 0.09532088786363602, + "learning_rate": 0.00012179259259259259, + "loss": 0.1996, + "step": 6780 + }, + { + "epoch": 0.6486125041792042, + "grad_norm": 0.13837966322898865, + "learning_rate": 0.00012164444444444445, + "loss": 0.2002, + "step": 6790 + }, + { + "epoch": 0.6495677508716626, + "grad_norm": 0.23131221532821655, + "learning_rate": 0.00012149629629629631, + "loss": 0.2038, + "step": 6800 + }, + { + "epoch": 0.6505229975641209, + "grad_norm": 0.24057185649871826, + "learning_rate": 0.00012134814814814815, + "loss": 0.1961, + "step": 6810 + }, + { + "epoch": 0.6514782442565793, + "grad_norm": 0.092747263610363, + "learning_rate": 0.0001212, + "loss": 0.1977, + "step": 6820 + }, + { + "epoch": 0.6524334909490376, + "grad_norm": 0.07853356748819351, + "learning_rate": 0.00012105185185185186, + "loss": 0.1983, + "step": 6830 + }, + { + "epoch": 0.653388737641496, + "grad_norm": 0.13685356080532074, + "learning_rate": 0.00012090370370370372, + "loss": 0.2029, + "step": 6840 + }, + { + "epoch": 0.6543439843339542, + "grad_norm": 0.3275240361690521, + "learning_rate": 0.00012075555555555556, + "loss": 0.2061, + "step": 6850 + }, + { + "epoch": 0.6552992310264125, + "grad_norm": 0.1146525964140892, + "learning_rate": 0.00012060740740740742, + "loss": 0.1988, + "step": 6860 + }, + { + "epoch": 0.6562544777188709, + "grad_norm": 0.08507192134857178, + "learning_rate": 0.00012045925925925928, + "loss": 0.1976, + "step": 6870 + }, + { + "epoch": 0.6572097244113292, + "grad_norm": 0.10274770855903625, + "learning_rate": 0.00012031111111111111, + "loss": 0.2, + "step": 6880 + }, + { + "epoch": 0.6581649711037876, + "grad_norm": 0.12793031334877014, + "learning_rate": 0.00012016296296296297, + "loss": 0.203, + "step": 6890 + }, + { + "epoch": 0.6591202177962459, + "grad_norm": 0.25618916749954224, + "learning_rate": 0.00012001481481481483, + "loss": 0.2042, + "step": 6900 + }, + { + "epoch": 0.6600754644887042, + "grad_norm": 0.11048085242509842, + "learning_rate": 0.00011986666666666669, + "loss": 0.1982, + "step": 6910 + }, + { + "epoch": 0.6610307111811625, + "grad_norm": 0.13066765666007996, + "learning_rate": 0.00011971851851851852, + "loss": 0.1985, + "step": 6920 + }, + { + "epoch": 0.6619859578736209, + "grad_norm": 0.09768009185791016, + "learning_rate": 0.00011957037037037038, + "loss": 0.1991, + "step": 6930 + }, + { + "epoch": 0.6629412045660792, + "grad_norm": 0.11774328351020813, + "learning_rate": 0.00011942222222222224, + "loss": 0.2029, + "step": 6940 + }, + { + "epoch": 0.6638964512585375, + "grad_norm": 0.2897031307220459, + "learning_rate": 0.00011927407407407407, + "loss": 0.2054, + "step": 6950 + }, + { + "epoch": 0.6648516979509959, + "grad_norm": 0.1458863914012909, + "learning_rate": 0.00011912592592592593, + "loss": 0.1971, + "step": 6960 + }, + { + "epoch": 0.6658069446434541, + "grad_norm": 0.10939256846904755, + "learning_rate": 0.0001189777777777778, + "loss": 0.1987, + "step": 6970 + }, + { + "epoch": 0.6667621913359125, + "grad_norm": 0.09529370814561844, + "learning_rate": 0.00011882962962962964, + "loss": 0.1975, + "step": 6980 + }, + { + "epoch": 0.6677174380283708, + "grad_norm": 0.1301857829093933, + "learning_rate": 0.00011868148148148149, + "loss": 0.2006, + "step": 6990 + }, + { + "epoch": 0.6686726847208292, + "grad_norm": 0.318764328956604, + "learning_rate": 0.00011853333333333335, + "loss": 0.2028, + "step": 7000 + }, + { + "epoch": 0.6696279314132875, + "grad_norm": 0.12811118364334106, + "learning_rate": 0.0001183851851851852, + "loss": 0.1977, + "step": 7010 + }, + { + "epoch": 0.6705831781057459, + "grad_norm": 0.09584329277276993, + "learning_rate": 0.00011823703703703704, + "loss": 0.1971, + "step": 7020 + }, + { + "epoch": 0.6715384247982041, + "grad_norm": 0.10128890722990036, + "learning_rate": 0.0001180888888888889, + "loss": 0.1976, + "step": 7030 + }, + { + "epoch": 0.6724936714906624, + "grad_norm": 0.11427022516727448, + "learning_rate": 0.00011794074074074076, + "loss": 0.2012, + "step": 7040 + }, + { + "epoch": 0.6734489181831208, + "grad_norm": 0.34024617075920105, + "learning_rate": 0.00011779259259259259, + "loss": 0.2044, + "step": 7050 + }, + { + "epoch": 0.6744041648755791, + "grad_norm": 0.09404119104146957, + "learning_rate": 0.00011764444444444445, + "loss": 0.1979, + "step": 7060 + }, + { + "epoch": 0.6753594115680375, + "grad_norm": 0.09541890770196915, + "learning_rate": 0.00011749629629629631, + "loss": 0.1984, + "step": 7070 + }, + { + "epoch": 0.6763146582604957, + "grad_norm": 0.1030392274260521, + "learning_rate": 0.00011734814814814816, + "loss": 0.1988, + "step": 7080 + }, + { + "epoch": 0.6772699049529541, + "grad_norm": 0.1543877124786377, + "learning_rate": 0.0001172, + "loss": 0.202, + "step": 7090 + }, + { + "epoch": 0.6782251516454124, + "grad_norm": 0.291363924741745, + "learning_rate": 0.00011705185185185186, + "loss": 0.2039, + "step": 7100 + }, + { + "epoch": 0.6791803983378708, + "grad_norm": 0.1557106226682663, + "learning_rate": 0.00011690370370370371, + "loss": 0.1975, + "step": 7110 + }, + { + "epoch": 0.6801356450303291, + "grad_norm": 0.13644851744174957, + "learning_rate": 0.00011675555555555556, + "loss": 0.1995, + "step": 7120 + }, + { + "epoch": 0.6810908917227874, + "grad_norm": 0.10906966030597687, + "learning_rate": 0.00011660740740740741, + "loss": 0.2003, + "step": 7130 + }, + { + "epoch": 0.6820461384152458, + "grad_norm": 0.1362205445766449, + "learning_rate": 0.00011645925925925927, + "loss": 0.2019, + "step": 7140 + }, + { + "epoch": 0.683001385107704, + "grad_norm": 0.22184917330741882, + "learning_rate": 0.00011631111111111112, + "loss": 0.2032, + "step": 7150 + }, + { + "epoch": 0.6839566318001624, + "grad_norm": 0.14768429100513458, + "learning_rate": 0.00011616296296296297, + "loss": 0.2001, + "step": 7160 + }, + { + "epoch": 0.6849118784926207, + "grad_norm": 0.10601145029067993, + "learning_rate": 0.00011601481481481483, + "loss": 0.2006, + "step": 7170 + }, + { + "epoch": 0.6858671251850791, + "grad_norm": 0.09267520159482956, + "learning_rate": 0.00011586666666666667, + "loss": 0.198, + "step": 7180 + }, + { + "epoch": 0.6868223718775374, + "grad_norm": 0.11036325246095657, + "learning_rate": 0.00011571851851851852, + "loss": 0.2021, + "step": 7190 + }, + { + "epoch": 0.6877776185699958, + "grad_norm": 0.22072440385818481, + "learning_rate": 0.00011557037037037038, + "loss": 0.2029, + "step": 7200 + }, + { + "epoch": 0.688732865262454, + "grad_norm": 0.1525413691997528, + "learning_rate": 0.00011542222222222223, + "loss": 0.1984, + "step": 7210 + }, + { + "epoch": 0.6896881119549123, + "grad_norm": 0.13061276078224182, + "learning_rate": 0.00011527407407407409, + "loss": 0.1998, + "step": 7220 + }, + { + "epoch": 0.6906433586473707, + "grad_norm": 0.07936930656433105, + "learning_rate": 0.00011512592592592593, + "loss": 0.199, + "step": 7230 + }, + { + "epoch": 0.691598605339829, + "grad_norm": 0.14293666183948517, + "learning_rate": 0.00011497777777777778, + "loss": 0.2016, + "step": 7240 + }, + { + "epoch": 0.6925538520322874, + "grad_norm": 0.2694840431213379, + "learning_rate": 0.00011482962962962964, + "loss": 0.2048, + "step": 7250 + }, + { + "epoch": 0.6935090987247456, + "grad_norm": 0.15118570625782013, + "learning_rate": 0.00011468148148148148, + "loss": 0.1995, + "step": 7260 + }, + { + "epoch": 0.694464345417204, + "grad_norm": 0.12141676247119904, + "learning_rate": 0.00011453333333333334, + "loss": 0.1979, + "step": 7270 + }, + { + "epoch": 0.6954195921096623, + "grad_norm": 0.07915287464857101, + "learning_rate": 0.00011438518518518519, + "loss": 0.198, + "step": 7280 + }, + { + "epoch": 0.6963748388021207, + "grad_norm": 0.1203601062297821, + "learning_rate": 0.00011423703703703705, + "loss": 0.2027, + "step": 7290 + }, + { + "epoch": 0.697330085494579, + "grad_norm": 0.22973081469535828, + "learning_rate": 0.0001140888888888889, + "loss": 0.2053, + "step": 7300 + }, + { + "epoch": 0.6982853321870373, + "grad_norm": 0.1236661821603775, + "learning_rate": 0.00011394074074074074, + "loss": 0.1994, + "step": 7310 + }, + { + "epoch": 0.6992405788794956, + "grad_norm": 0.10576550662517548, + "learning_rate": 0.0001137925925925926, + "loss": 0.1993, + "step": 7320 + }, + { + "epoch": 0.7001958255719539, + "grad_norm": 0.08561199903488159, + "learning_rate": 0.00011364444444444445, + "loss": 0.1984, + "step": 7330 + }, + { + "epoch": 0.7011510722644123, + "grad_norm": 0.1306900978088379, + "learning_rate": 0.0001134962962962963, + "loss": 0.2036, + "step": 7340 + }, + { + "epoch": 0.7021063189568706, + "grad_norm": 0.22850926220417023, + "learning_rate": 0.00011334814814814815, + "loss": 0.2043, + "step": 7350 + }, + { + "epoch": 0.703061565649329, + "grad_norm": 0.14131084084510803, + "learning_rate": 0.0001132, + "loss": 0.1969, + "step": 7360 + }, + { + "epoch": 0.7040168123417873, + "grad_norm": 0.12975451350212097, + "learning_rate": 0.00011305185185185185, + "loss": 0.2002, + "step": 7370 + }, + { + "epoch": 0.7049720590342456, + "grad_norm": 0.10148114711046219, + "learning_rate": 0.0001129037037037037, + "loss": 0.1998, + "step": 7380 + }, + { + "epoch": 0.7059273057267039, + "grad_norm": 0.10766816139221191, + "learning_rate": 0.00011275555555555557, + "loss": 0.2024, + "step": 7390 + }, + { + "epoch": 0.7068825524191622, + "grad_norm": 0.2067338526248932, + "learning_rate": 0.00011260740740740741, + "loss": 0.2044, + "step": 7400 + }, + { + "epoch": 0.7078377991116206, + "grad_norm": 0.17065021395683289, + "learning_rate": 0.00011245925925925926, + "loss": 0.1972, + "step": 7410 + }, + { + "epoch": 0.7087930458040789, + "grad_norm": 0.14703714847564697, + "learning_rate": 0.00011231111111111112, + "loss": 0.1993, + "step": 7420 + }, + { + "epoch": 0.7097482924965373, + "grad_norm": 0.0743475928902626, + "learning_rate": 0.00011216296296296296, + "loss": 0.1993, + "step": 7430 + }, + { + "epoch": 0.7107035391889955, + "grad_norm": 0.12671244144439697, + "learning_rate": 0.00011201481481481481, + "loss": 0.2015, + "step": 7440 + }, + { + "epoch": 0.7116587858814539, + "grad_norm": 0.20756767690181732, + "learning_rate": 0.00011186666666666667, + "loss": 0.2038, + "step": 7450 + }, + { + "epoch": 0.7126140325739122, + "grad_norm": 0.13475944101810455, + "learning_rate": 0.00011171851851851853, + "loss": 0.1946, + "step": 7460 + }, + { + "epoch": 0.7135692792663706, + "grad_norm": 0.14000599086284637, + "learning_rate": 0.00011157037037037036, + "loss": 0.1989, + "step": 7470 + }, + { + "epoch": 0.7145245259588289, + "grad_norm": 0.08982401341199875, + "learning_rate": 0.00011142222222222222, + "loss": 0.1998, + "step": 7480 + }, + { + "epoch": 0.7154797726512871, + "grad_norm": 0.13047632575035095, + "learning_rate": 0.00011127407407407408, + "loss": 0.202, + "step": 7490 + }, + { + "epoch": 0.7164350193437455, + "grad_norm": 0.2293279767036438, + "learning_rate": 0.00011112592592592592, + "loss": 0.2033, + "step": 7500 + }, + { + "epoch": 0.7173902660362038, + "grad_norm": 0.18973354995250702, + "learning_rate": 0.00011097777777777778, + "loss": 0.1974, + "step": 7510 + }, + { + "epoch": 0.7183455127286622, + "grad_norm": 0.16436341404914856, + "learning_rate": 0.00011082962962962964, + "loss": 0.1988, + "step": 7520 + }, + { + "epoch": 0.7193007594211205, + "grad_norm": 0.0743977501988411, + "learning_rate": 0.0001106814814814815, + "loss": 0.1978, + "step": 7530 + }, + { + "epoch": 0.7202560061135789, + "grad_norm": 0.09269341826438904, + "learning_rate": 0.00011053333333333333, + "loss": 0.2011, + "step": 7540 + }, + { + "epoch": 0.7212112528060372, + "grad_norm": 0.19990885257720947, + "learning_rate": 0.00011038518518518519, + "loss": 0.2035, + "step": 7550 + }, + { + "epoch": 0.7221664994984955, + "grad_norm": 0.1131897047162056, + "learning_rate": 0.00011023703703703705, + "loss": 0.1956, + "step": 7560 + }, + { + "epoch": 0.7231217461909538, + "grad_norm": 0.08888459950685501, + "learning_rate": 0.00011008888888888888, + "loss": 0.1967, + "step": 7570 + }, + { + "epoch": 0.7240769928834121, + "grad_norm": 0.08514232188463211, + "learning_rate": 0.00010994074074074074, + "loss": 0.1992, + "step": 7580 + }, + { + "epoch": 0.7250322395758705, + "grad_norm": 0.09677214175462723, + "learning_rate": 0.0001097925925925926, + "loss": 0.2015, + "step": 7590 + }, + { + "epoch": 0.7259874862683288, + "grad_norm": 0.20914286375045776, + "learning_rate": 0.00010964444444444446, + "loss": 0.2042, + "step": 7600 + }, + { + "epoch": 0.7269427329607872, + "grad_norm": 0.12801726162433624, + "learning_rate": 0.00010949629629629629, + "loss": 0.1967, + "step": 7610 + }, + { + "epoch": 0.7278979796532454, + "grad_norm": 0.10206489264965057, + "learning_rate": 0.00010934814814814815, + "loss": 0.1984, + "step": 7620 + }, + { + "epoch": 0.7288532263457038, + "grad_norm": 0.08488079160451889, + "learning_rate": 0.00010920000000000001, + "loss": 0.1983, + "step": 7630 + }, + { + "epoch": 0.7298084730381621, + "grad_norm": 0.15926392376422882, + "learning_rate": 0.00010905185185185184, + "loss": 0.2024, + "step": 7640 + }, + { + "epoch": 0.7307637197306205, + "grad_norm": 0.1908629834651947, + "learning_rate": 0.0001089037037037037, + "loss": 0.205, + "step": 7650 + }, + { + "epoch": 0.7317189664230788, + "grad_norm": 0.1444040983915329, + "learning_rate": 0.00010875555555555556, + "loss": 0.1973, + "step": 7660 + }, + { + "epoch": 0.732674213115537, + "grad_norm": 0.12023048847913742, + "learning_rate": 0.0001086074074074074, + "loss": 0.1969, + "step": 7670 + }, + { + "epoch": 0.7336294598079954, + "grad_norm": 0.09281352162361145, + "learning_rate": 0.00010845925925925926, + "loss": 0.1987, + "step": 7680 + }, + { + "epoch": 0.7345847065004537, + "grad_norm": 0.13516128063201904, + "learning_rate": 0.00010831111111111112, + "loss": 0.2037, + "step": 7690 + }, + { + "epoch": 0.7355399531929121, + "grad_norm": 0.2315383404493332, + "learning_rate": 0.00010816296296296298, + "loss": 0.203, + "step": 7700 + }, + { + "epoch": 0.7364951998853704, + "grad_norm": 0.1517767459154129, + "learning_rate": 0.00010801481481481481, + "loss": 0.199, + "step": 7710 + }, + { + "epoch": 0.7374504465778288, + "grad_norm": 0.10027414560317993, + "learning_rate": 0.00010786666666666667, + "loss": 0.1976, + "step": 7720 + }, + { + "epoch": 0.738405693270287, + "grad_norm": 0.07501152157783508, + "learning_rate": 0.00010771851851851853, + "loss": 0.1983, + "step": 7730 + }, + { + "epoch": 0.7393609399627454, + "grad_norm": 0.10912565141916275, + "learning_rate": 0.00010757037037037036, + "loss": 0.202, + "step": 7740 + }, + { + "epoch": 0.7403161866552037, + "grad_norm": 0.21067175269126892, + "learning_rate": 0.00010742222222222222, + "loss": 0.2043, + "step": 7750 + }, + { + "epoch": 0.741271433347662, + "grad_norm": 0.11937571316957474, + "learning_rate": 0.00010727407407407408, + "loss": 0.1966, + "step": 7760 + }, + { + "epoch": 0.7422266800401204, + "grad_norm": 0.1493697464466095, + "learning_rate": 0.00010712592592592594, + "loss": 0.1974, + "step": 7770 + }, + { + "epoch": 0.7431819267325787, + "grad_norm": 0.07944036275148392, + "learning_rate": 0.00010697777777777777, + "loss": 0.1983, + "step": 7780 + }, + { + "epoch": 0.744137173425037, + "grad_norm": 0.12938891351222992, + "learning_rate": 0.00010682962962962963, + "loss": 0.202, + "step": 7790 + }, + { + "epoch": 0.7450924201174953, + "grad_norm": 0.19698497653007507, + "learning_rate": 0.00010668148148148149, + "loss": 0.2043, + "step": 7800 + }, + { + "epoch": 0.7460476668099537, + "grad_norm": 0.12791553139686584, + "learning_rate": 0.00010653333333333333, + "loss": 0.1954, + "step": 7810 + }, + { + "epoch": 0.747002913502412, + "grad_norm": 0.1833954155445099, + "learning_rate": 0.00010638518518518519, + "loss": 0.1982, + "step": 7820 + }, + { + "epoch": 0.7479581601948704, + "grad_norm": 0.07448244094848633, + "learning_rate": 0.00010623703703703704, + "loss": 0.1995, + "step": 7830 + }, + { + "epoch": 0.7489134068873287, + "grad_norm": 0.18532446026802063, + "learning_rate": 0.0001060888888888889, + "loss": 0.2029, + "step": 7840 + }, + { + "epoch": 0.7498686535797869, + "grad_norm": 0.1823538988828659, + "learning_rate": 0.00010594074074074074, + "loss": 0.2045, + "step": 7850 + }, + { + "epoch": 0.7508239002722453, + "grad_norm": 0.17263145744800568, + "learning_rate": 0.0001057925925925926, + "loss": 0.1949, + "step": 7860 + }, + { + "epoch": 0.7517791469647036, + "grad_norm": 0.16599738597869873, + "learning_rate": 0.00010564444444444446, + "loss": 0.1974, + "step": 7870 + }, + { + "epoch": 0.752734393657162, + "grad_norm": 0.07930149137973785, + "learning_rate": 0.00010549629629629629, + "loss": 0.1969, + "step": 7880 + }, + { + "epoch": 0.7536896403496203, + "grad_norm": 0.16069641709327698, + "learning_rate": 0.00010534814814814815, + "loss": 0.2029, + "step": 7890 + }, + { + "epoch": 0.7546448870420787, + "grad_norm": 0.20293623208999634, + "learning_rate": 0.00010520000000000001, + "loss": 0.2044, + "step": 7900 + }, + { + "epoch": 0.7556001337345369, + "grad_norm": 0.18291781842708588, + "learning_rate": 0.00010505185185185187, + "loss": 0.195, + "step": 7910 + }, + { + "epoch": 0.7565553804269952, + "grad_norm": 0.13923436403274536, + "learning_rate": 0.0001049037037037037, + "loss": 0.1973, + "step": 7920 + }, + { + "epoch": 0.7575106271194536, + "grad_norm": 0.08182086795568466, + "learning_rate": 0.00010475555555555556, + "loss": 0.1986, + "step": 7930 + }, + { + "epoch": 0.7584658738119119, + "grad_norm": 0.11042799055576324, + "learning_rate": 0.00010460740740740742, + "loss": 0.2029, + "step": 7940 + }, + { + "epoch": 0.7594211205043703, + "grad_norm": 0.2199370265007019, + "learning_rate": 0.00010445925925925925, + "loss": 0.2032, + "step": 7950 + }, + { + "epoch": 0.7603763671968286, + "grad_norm": 0.19982871413230896, + "learning_rate": 0.00010431111111111111, + "loss": 0.1952, + "step": 7960 + }, + { + "epoch": 0.7613316138892869, + "grad_norm": 0.14492951333522797, + "learning_rate": 0.00010416296296296297, + "loss": 0.1963, + "step": 7970 + }, + { + "epoch": 0.7622868605817452, + "grad_norm": 0.09715760499238968, + "learning_rate": 0.0001040148148148148, + "loss": 0.1975, + "step": 7980 + }, + { + "epoch": 0.7632421072742036, + "grad_norm": 0.14056870341300964, + "learning_rate": 0.00010386666666666667, + "loss": 0.2027, + "step": 7990 + }, + { + "epoch": 0.7641973539666619, + "grad_norm": 0.23944570124149323, + "learning_rate": 0.00010371851851851853, + "loss": 0.2043, + "step": 8000 + }, + { + "epoch": 0.7651526006591202, + "grad_norm": 0.12410598993301392, + "learning_rate": 0.00010357037037037039, + "loss": 0.1944, + "step": 8010 + }, + { + "epoch": 0.7661078473515786, + "grad_norm": 0.14546865224838257, + "learning_rate": 0.00010342222222222222, + "loss": 0.1974, + "step": 8020 + }, + { + "epoch": 0.7670630940440368, + "grad_norm": 0.07528841495513916, + "learning_rate": 0.00010327407407407408, + "loss": 0.1991, + "step": 8030 + }, + { + "epoch": 0.7680183407364952, + "grad_norm": 0.11421654373407364, + "learning_rate": 0.00010312592592592594, + "loss": 0.201, + "step": 8040 + }, + { + "epoch": 0.7689735874289535, + "grad_norm": 0.2141459733247757, + "learning_rate": 0.00010297777777777777, + "loss": 0.2047, + "step": 8050 + }, + { + "epoch": 0.7699288341214119, + "grad_norm": 0.13949456810951233, + "learning_rate": 0.00010282962962962963, + "loss": 0.1943, + "step": 8060 + }, + { + "epoch": 0.7708840808138702, + "grad_norm": 0.14627870917320251, + "learning_rate": 0.00010268148148148149, + "loss": 0.1979, + "step": 8070 + }, + { + "epoch": 0.7718393275063286, + "grad_norm": 0.08122966438531876, + "learning_rate": 0.00010253333333333335, + "loss": 0.1985, + "step": 8080 + }, + { + "epoch": 0.7727945741987868, + "grad_norm": 0.11243822425603867, + "learning_rate": 0.00010238518518518518, + "loss": 0.2002, + "step": 8090 + }, + { + "epoch": 0.7737498208912451, + "grad_norm": 0.16983778774738312, + "learning_rate": 0.00010223703703703704, + "loss": 0.205, + "step": 8100 + }, + { + "epoch": 0.7747050675837035, + "grad_norm": 0.1632688045501709, + "learning_rate": 0.0001020888888888889, + "loss": 0.1929, + "step": 8110 + }, + { + "epoch": 0.7756603142761618, + "grad_norm": 0.16163240373134613, + "learning_rate": 0.00010194074074074073, + "loss": 0.1969, + "step": 8120 + }, + { + "epoch": 0.7766155609686202, + "grad_norm": 0.07510218024253845, + "learning_rate": 0.0001017925925925926, + "loss": 0.1986, + "step": 8130 + }, + { + "epoch": 0.7775708076610784, + "grad_norm": 0.10797309130430222, + "learning_rate": 0.00010164444444444445, + "loss": 0.2014, + "step": 8140 + }, + { + "epoch": 0.7785260543535368, + "grad_norm": 0.23113864660263062, + "learning_rate": 0.00010149629629629631, + "loss": 0.2046, + "step": 8150 + }, + { + "epoch": 0.7794813010459951, + "grad_norm": 0.2128959745168686, + "learning_rate": 0.00010134814814814815, + "loss": 0.1943, + "step": 8160 + }, + { + "epoch": 0.7804365477384535, + "grad_norm": 0.11465991288423538, + "learning_rate": 0.00010120000000000001, + "loss": 0.1974, + "step": 8170 + }, + { + "epoch": 0.7813917944309118, + "grad_norm": 0.08690842241048813, + "learning_rate": 0.00010105185185185187, + "loss": 0.1985, + "step": 8180 + }, + { + "epoch": 0.7823470411233701, + "grad_norm": 0.12978310883045197, + "learning_rate": 0.0001009037037037037, + "loss": 0.2004, + "step": 8190 + }, + { + "epoch": 0.7833022878158284, + "grad_norm": 0.21451863646507263, + "learning_rate": 0.00010075555555555556, + "loss": 0.2038, + "step": 8200 + }, + { + "epoch": 0.7842575345082867, + "grad_norm": 0.14841678738594055, + "learning_rate": 0.00010060740740740742, + "loss": 0.1945, + "step": 8210 + }, + { + "epoch": 0.7852127812007451, + "grad_norm": 0.1173843964934349, + "learning_rate": 0.00010045925925925928, + "loss": 0.1946, + "step": 8220 + }, + { + "epoch": 0.7861680278932034, + "grad_norm": 0.08300807327032089, + "learning_rate": 0.00010031111111111111, + "loss": 0.1971, + "step": 8230 + }, + { + "epoch": 0.7871232745856618, + "grad_norm": 0.1852853000164032, + "learning_rate": 0.00010016296296296297, + "loss": 0.2014, + "step": 8240 + }, + { + "epoch": 0.7880785212781201, + "grad_norm": 0.20180000364780426, + "learning_rate": 0.00010001481481481483, + "loss": 0.2053, + "step": 8250 + }, + { + "epoch": 0.7890337679705784, + "grad_norm": 0.1663861721754074, + "learning_rate": 9.986666666666668e-05, + "loss": 0.1941, + "step": 8260 + }, + { + "epoch": 0.7899890146630367, + "grad_norm": 0.14617478847503662, + "learning_rate": 9.971851851851852e-05, + "loss": 0.197, + "step": 8270 + }, + { + "epoch": 0.790944261355495, + "grad_norm": 0.06954821944236755, + "learning_rate": 9.957037037037038e-05, + "loss": 0.1993, + "step": 8280 + }, + { + "epoch": 0.7918995080479534, + "grad_norm": 0.1401512175798416, + "learning_rate": 9.942222222222223e-05, + "loss": 0.2021, + "step": 8290 + }, + { + "epoch": 0.7928547547404117, + "grad_norm": 0.2168128937482834, + "learning_rate": 9.927407407407408e-05, + "loss": 0.2049, + "step": 8300 + }, + { + "epoch": 0.7938100014328701, + "grad_norm": 0.14390647411346436, + "learning_rate": 9.912592592592594e-05, + "loss": 0.1922, + "step": 8310 + }, + { + "epoch": 0.7947652481253283, + "grad_norm": 0.1461930125951767, + "learning_rate": 9.897777777777778e-05, + "loss": 0.1975, + "step": 8320 + }, + { + "epoch": 0.7957204948177867, + "grad_norm": 0.06800387054681778, + "learning_rate": 9.882962962962964e-05, + "loss": 0.1974, + "step": 8330 + }, + { + "epoch": 0.796675741510245, + "grad_norm": 0.1342678666114807, + "learning_rate": 9.868148148148149e-05, + "loss": 0.2007, + "step": 8340 + }, + { + "epoch": 0.7976309882027034, + "grad_norm": 0.19177483022212982, + "learning_rate": 9.853333333333333e-05, + "loss": 0.2028, + "step": 8350 + }, + { + "epoch": 0.7985862348951617, + "grad_norm": 0.19509050250053406, + "learning_rate": 9.83851851851852e-05, + "loss": 0.1946, + "step": 8360 + }, + { + "epoch": 0.79954148158762, + "grad_norm": 0.1521720141172409, + "learning_rate": 9.823703703703704e-05, + "loss": 0.1962, + "step": 8370 + }, + { + "epoch": 0.8004967282800783, + "grad_norm": 0.07219533622264862, + "learning_rate": 9.80888888888889e-05, + "loss": 0.1983, + "step": 8380 + }, + { + "epoch": 0.8014519749725366, + "grad_norm": 0.15904979407787323, + "learning_rate": 9.794074074074075e-05, + "loss": 0.2, + "step": 8390 + }, + { + "epoch": 0.802407221664995, + "grad_norm": 0.20888783037662506, + "learning_rate": 9.77925925925926e-05, + "loss": 0.2023, + "step": 8400 + }, + { + "epoch": 0.8033624683574533, + "grad_norm": 0.15352968871593475, + "learning_rate": 9.764444444444445e-05, + "loss": 0.1941, + "step": 8410 + }, + { + "epoch": 0.8043177150499117, + "grad_norm": 0.11600544303655624, + "learning_rate": 9.74962962962963e-05, + "loss": 0.1947, + "step": 8420 + }, + { + "epoch": 0.80527296174237, + "grad_norm": 0.06507077813148499, + "learning_rate": 9.734814814814816e-05, + "loss": 0.197, + "step": 8430 + }, + { + "epoch": 0.8062282084348283, + "grad_norm": 0.14993168413639069, + "learning_rate": 9.72e-05, + "loss": 0.2017, + "step": 8440 + }, + { + "epoch": 0.8071834551272866, + "grad_norm": 0.2011600136756897, + "learning_rate": 9.705185185185186e-05, + "loss": 0.2041, + "step": 8450 + }, + { + "epoch": 0.8081387018197449, + "grad_norm": 0.12556695938110352, + "learning_rate": 9.690370370370371e-05, + "loss": 0.1939, + "step": 8460 + }, + { + "epoch": 0.8090939485122033, + "grad_norm": 0.12797193229198456, + "learning_rate": 9.675555555555556e-05, + "loss": 0.1959, + "step": 8470 + }, + { + "epoch": 0.8100491952046616, + "grad_norm": 0.0666249543428421, + "learning_rate": 9.660740740740742e-05, + "loss": 0.1977, + "step": 8480 + }, + { + "epoch": 0.81100444189712, + "grad_norm": 0.11509077996015549, + "learning_rate": 9.645925925925926e-05, + "loss": 0.2009, + "step": 8490 + }, + { + "epoch": 0.8119596885895782, + "grad_norm": 0.22886355221271515, + "learning_rate": 9.631111111111112e-05, + "loss": 0.2036, + "step": 8500 + }, + { + "epoch": 0.8129149352820366, + "grad_norm": 0.21546624600887299, + "learning_rate": 9.616296296296297e-05, + "loss": 0.1924, + "step": 8510 + }, + { + "epoch": 0.8138701819744949, + "grad_norm": 0.15088830888271332, + "learning_rate": 9.601481481481483e-05, + "loss": 0.1968, + "step": 8520 + }, + { + "epoch": 0.8148254286669533, + "grad_norm": 0.11225682497024536, + "learning_rate": 9.586666666666667e-05, + "loss": 0.1969, + "step": 8530 + }, + { + "epoch": 0.8157806753594116, + "grad_norm": 0.14504078030586243, + "learning_rate": 9.571851851851852e-05, + "loss": 0.2008, + "step": 8540 + }, + { + "epoch": 0.8167359220518698, + "grad_norm": 0.18268372118473053, + "learning_rate": 9.557037037037038e-05, + "loss": 0.2043, + "step": 8550 + }, + { + "epoch": 0.8176911687443282, + "grad_norm": 0.15672026574611664, + "learning_rate": 9.542222222222223e-05, + "loss": 0.1919, + "step": 8560 + }, + { + "epoch": 0.8186464154367865, + "grad_norm": 0.1838408261537552, + "learning_rate": 9.527407407407409e-05, + "loss": 0.1949, + "step": 8570 + }, + { + "epoch": 0.8196016621292449, + "grad_norm": 0.07827872782945633, + "learning_rate": 9.512592592592593e-05, + "loss": 0.1981, + "step": 8580 + }, + { + "epoch": 0.8205569088217032, + "grad_norm": 0.10423124581575394, + "learning_rate": 9.497777777777779e-05, + "loss": 0.2015, + "step": 8590 + }, + { + "epoch": 0.8215121555141616, + "grad_norm": 0.1930837333202362, + "learning_rate": 9.482962962962964e-05, + "loss": 0.2033, + "step": 8600 + }, + { + "epoch": 0.8224674022066198, + "grad_norm": 0.1966949999332428, + "learning_rate": 9.468148148148149e-05, + "loss": 0.1933, + "step": 8610 + }, + { + "epoch": 0.8234226488990782, + "grad_norm": 0.12053806334733963, + "learning_rate": 9.453333333333335e-05, + "loss": 0.1942, + "step": 8620 + }, + { + "epoch": 0.8243778955915365, + "grad_norm": 0.07416214793920517, + "learning_rate": 9.438518518518519e-05, + "loss": 0.1982, + "step": 8630 + }, + { + "epoch": 0.8253331422839948, + "grad_norm": 0.1334538608789444, + "learning_rate": 9.423703703703705e-05, + "loss": 0.2006, + "step": 8640 + }, + { + "epoch": 0.8262883889764532, + "grad_norm": 0.18779663741588593, + "learning_rate": 9.40888888888889e-05, + "loss": 0.2041, + "step": 8650 + }, + { + "epoch": 0.8272436356689115, + "grad_norm": 0.14109750092029572, + "learning_rate": 9.394074074074074e-05, + "loss": 0.1911, + "step": 8660 + }, + { + "epoch": 0.8281988823613698, + "grad_norm": 0.11741621792316437, + "learning_rate": 9.37925925925926e-05, + "loss": 0.1971, + "step": 8670 + }, + { + "epoch": 0.8291541290538281, + "grad_norm": 0.08277934044599533, + "learning_rate": 9.364444444444445e-05, + "loss": 0.1962, + "step": 8680 + }, + { + "epoch": 0.8301093757462865, + "grad_norm": 0.14392395317554474, + "learning_rate": 9.349629629629631e-05, + "loss": 0.2006, + "step": 8690 + }, + { + "epoch": 0.8310646224387448, + "grad_norm": 0.24216341972351074, + "learning_rate": 9.334814814814816e-05, + "loss": 0.204, + "step": 8700 + }, + { + "epoch": 0.8320198691312032, + "grad_norm": 0.15122485160827637, + "learning_rate": 9.320000000000002e-05, + "loss": 0.1921, + "step": 8710 + }, + { + "epoch": 0.8329751158236615, + "grad_norm": 0.13646939396858215, + "learning_rate": 9.305185185185186e-05, + "loss": 0.1971, + "step": 8720 + }, + { + "epoch": 0.8339303625161197, + "grad_norm": 0.07911708950996399, + "learning_rate": 9.290370370370371e-05, + "loss": 0.1982, + "step": 8730 + }, + { + "epoch": 0.8348856092085781, + "grad_norm": 0.10435432940721512, + "learning_rate": 9.275555555555557e-05, + "loss": 0.2005, + "step": 8740 + }, + { + "epoch": 0.8358408559010364, + "grad_norm": 0.23191022872924805, + "learning_rate": 9.260740740740741e-05, + "loss": 0.2042, + "step": 8750 + }, + { + "epoch": 0.8367961025934948, + "grad_norm": 0.16332274675369263, + "learning_rate": 9.245925925925927e-05, + "loss": 0.1928, + "step": 8760 + }, + { + "epoch": 0.8377513492859531, + "grad_norm": 0.16581833362579346, + "learning_rate": 9.231111111111112e-05, + "loss": 0.1964, + "step": 8770 + }, + { + "epoch": 0.8387065959784115, + "grad_norm": 0.06937286257743835, + "learning_rate": 9.216296296296297e-05, + "loss": 0.1963, + "step": 8780 + }, + { + "epoch": 0.8396618426708697, + "grad_norm": 0.12238670140504837, + "learning_rate": 9.201481481481483e-05, + "loss": 0.2002, + "step": 8790 + }, + { + "epoch": 0.8406170893633281, + "grad_norm": 0.1900535523891449, + "learning_rate": 9.186666666666667e-05, + "loss": 0.2038, + "step": 8800 + }, + { + "epoch": 0.8415723360557864, + "grad_norm": 0.20824721455574036, + "learning_rate": 9.171851851851853e-05, + "loss": 0.1949, + "step": 8810 + }, + { + "epoch": 0.8425275827482447, + "grad_norm": 0.127131387591362, + "learning_rate": 9.157037037037038e-05, + "loss": 0.1934, + "step": 8820 + }, + { + "epoch": 0.8434828294407031, + "grad_norm": 0.076890729367733, + "learning_rate": 9.142222222222222e-05, + "loss": 0.1955, + "step": 8830 + }, + { + "epoch": 0.8444380761331614, + "grad_norm": 0.14583854377269745, + "learning_rate": 9.127407407407408e-05, + "loss": 0.2006, + "step": 8840 + }, + { + "epoch": 0.8453933228256197, + "grad_norm": 0.2490548938512802, + "learning_rate": 9.112592592592593e-05, + "loss": 0.2054, + "step": 8850 + }, + { + "epoch": 0.846348569518078, + "grad_norm": 0.11510622501373291, + "learning_rate": 9.097777777777779e-05, + "loss": 0.1924, + "step": 8860 + }, + { + "epoch": 0.8473038162105364, + "grad_norm": 0.11410252749919891, + "learning_rate": 9.082962962962964e-05, + "loss": 0.1949, + "step": 8870 + }, + { + "epoch": 0.8482590629029947, + "grad_norm": 0.08570121228694916, + "learning_rate": 9.068148148148148e-05, + "loss": 0.1972, + "step": 8880 + }, + { + "epoch": 0.8492143095954531, + "grad_norm": 0.1635618507862091, + "learning_rate": 9.053333333333334e-05, + "loss": 0.2009, + "step": 8890 + }, + { + "epoch": 0.8501695562879114, + "grad_norm": 0.2034139484167099, + "learning_rate": 9.038518518518519e-05, + "loss": 0.2037, + "step": 8900 + }, + { + "epoch": 0.8511248029803696, + "grad_norm": 0.16847805678844452, + "learning_rate": 9.023703703703704e-05, + "loss": 0.1898, + "step": 8910 + }, + { + "epoch": 0.852080049672828, + "grad_norm": 0.16264882683753967, + "learning_rate": 9.00888888888889e-05, + "loss": 0.1967, + "step": 8920 + }, + { + "epoch": 0.8530352963652863, + "grad_norm": 0.07546920329332352, + "learning_rate": 8.994074074074074e-05, + "loss": 0.1982, + "step": 8930 + }, + { + "epoch": 0.8539905430577447, + "grad_norm": 0.12496549636125565, + "learning_rate": 8.97925925925926e-05, + "loss": 0.1992, + "step": 8940 + }, + { + "epoch": 0.854945789750203, + "grad_norm": 0.17706719040870667, + "learning_rate": 8.964444444444445e-05, + "loss": 0.204, + "step": 8950 + }, + { + "epoch": 0.8559010364426614, + "grad_norm": 0.17304418981075287, + "learning_rate": 8.94962962962963e-05, + "loss": 0.1928, + "step": 8960 + }, + { + "epoch": 0.8568562831351196, + "grad_norm": 0.22156664729118347, + "learning_rate": 8.934814814814815e-05, + "loss": 0.195, + "step": 8970 + }, + { + "epoch": 0.857811529827578, + "grad_norm": 0.07757692039012909, + "learning_rate": 8.92e-05, + "loss": 0.1975, + "step": 8980 + }, + { + "epoch": 0.8587667765200363, + "grad_norm": 0.10818079113960266, + "learning_rate": 8.905185185185186e-05, + "loss": 0.1993, + "step": 8990 + }, + { + "epoch": 0.8597220232124946, + "grad_norm": 0.19327661395072937, + "learning_rate": 8.89037037037037e-05, + "loss": 0.2033, + "step": 9000 + }, + { + "epoch": 0.860677269904953, + "grad_norm": 0.18037331104278564, + "learning_rate": 8.875555555555555e-05, + "loss": 0.1933, + "step": 9010 + }, + { + "epoch": 0.8616325165974112, + "grad_norm": 0.11041972786188126, + "learning_rate": 8.860740740740741e-05, + "loss": 0.1941, + "step": 9020 + }, + { + "epoch": 0.8625877632898696, + "grad_norm": 0.06798222661018372, + "learning_rate": 8.845925925925926e-05, + "loss": 0.1974, + "step": 9030 + }, + { + "epoch": 0.8635430099823279, + "grad_norm": 0.14087295532226562, + "learning_rate": 8.831111111111112e-05, + "loss": 0.2009, + "step": 9040 + }, + { + "epoch": 0.8644982566747863, + "grad_norm": 0.23657061159610748, + "learning_rate": 8.816296296296296e-05, + "loss": 0.2038, + "step": 9050 + }, + { + "epoch": 0.8654535033672446, + "grad_norm": 0.1508956402540207, + "learning_rate": 8.801481481481481e-05, + "loss": 0.1916, + "step": 9060 + }, + { + "epoch": 0.866408750059703, + "grad_norm": 0.07923205196857452, + "learning_rate": 8.786666666666667e-05, + "loss": 0.1969, + "step": 9070 + }, + { + "epoch": 0.8673639967521612, + "grad_norm": 0.08770725876092911, + "learning_rate": 8.771851851851852e-05, + "loss": 0.1988, + "step": 9080 + }, + { + "epoch": 0.8683192434446195, + "grad_norm": 0.12787607312202454, + "learning_rate": 8.757037037037036e-05, + "loss": 0.2013, + "step": 9090 + }, + { + "epoch": 0.8692744901370779, + "grad_norm": 0.23347334563732147, + "learning_rate": 8.742222222222222e-05, + "loss": 0.2038, + "step": 9100 + }, + { + "epoch": 0.8702297368295362, + "grad_norm": 0.20431476831436157, + "learning_rate": 8.727407407407407e-05, + "loss": 0.19, + "step": 9110 + }, + { + "epoch": 0.8711849835219946, + "grad_norm": 0.173463836312294, + "learning_rate": 8.712592592592593e-05, + "loss": 0.197, + "step": 9120 + }, + { + "epoch": 0.8721402302144529, + "grad_norm": 0.07533210515975952, + "learning_rate": 8.697777777777777e-05, + "loss": 0.1981, + "step": 9130 + }, + { + "epoch": 0.8730954769069112, + "grad_norm": 0.1275360882282257, + "learning_rate": 8.682962962962963e-05, + "loss": 0.2009, + "step": 9140 + }, + { + "epoch": 0.8740507235993695, + "grad_norm": 0.18006309866905212, + "learning_rate": 8.668148148148148e-05, + "loss": 0.204, + "step": 9150 + }, + { + "epoch": 0.8750059702918279, + "grad_norm": 0.17089661955833435, + "learning_rate": 8.653333333333333e-05, + "loss": 0.1897, + "step": 9160 + }, + { + "epoch": 0.8759612169842862, + "grad_norm": 0.19405335187911987, + "learning_rate": 8.638518518518519e-05, + "loss": 0.1961, + "step": 9170 + }, + { + "epoch": 0.8769164636767445, + "grad_norm": 0.07635375112295151, + "learning_rate": 8.623703703703703e-05, + "loss": 0.1976, + "step": 9180 + }, + { + "epoch": 0.8778717103692029, + "grad_norm": 0.1420837789773941, + "learning_rate": 8.608888888888889e-05, + "loss": 0.2007, + "step": 9190 + }, + { + "epoch": 0.8788269570616611, + "grad_norm": 0.2507460117340088, + "learning_rate": 8.594074074074074e-05, + "loss": 0.2019, + "step": 9200 + }, + { + "epoch": 0.8797822037541195, + "grad_norm": 0.1322818398475647, + "learning_rate": 8.57925925925926e-05, + "loss": 0.1921, + "step": 9210 + }, + { + "epoch": 0.8807374504465778, + "grad_norm": 0.15335558354854584, + "learning_rate": 8.564444444444445e-05, + "loss": 0.1948, + "step": 9220 + }, + { + "epoch": 0.8816926971390362, + "grad_norm": 0.0793461874127388, + "learning_rate": 8.549629629629629e-05, + "loss": 0.197, + "step": 9230 + }, + { + "epoch": 0.8826479438314945, + "grad_norm": 0.10587692260742188, + "learning_rate": 8.534814814814815e-05, + "loss": 0.2006, + "step": 9240 + }, + { + "epoch": 0.8836031905239529, + "grad_norm": 0.211539164185524, + "learning_rate": 8.52e-05, + "loss": 0.202, + "step": 9250 + }, + { + "epoch": 0.8845584372164111, + "grad_norm": 0.1647574007511139, + "learning_rate": 8.505185185185186e-05, + "loss": 0.1926, + "step": 9260 + }, + { + "epoch": 0.8855136839088694, + "grad_norm": 0.10489039868116379, + "learning_rate": 8.49037037037037e-05, + "loss": 0.1933, + "step": 9270 + }, + { + "epoch": 0.8864689306013278, + "grad_norm": 0.08796203136444092, + "learning_rate": 8.475555555555555e-05, + "loss": 0.1961, + "step": 9280 + }, + { + "epoch": 0.8874241772937861, + "grad_norm": 0.15730325877666473, + "learning_rate": 8.460740740740741e-05, + "loss": 0.2007, + "step": 9290 + }, + { + "epoch": 0.8883794239862445, + "grad_norm": 0.22997820377349854, + "learning_rate": 8.445925925925926e-05, + "loss": 0.203, + "step": 9300 + }, + { + "epoch": 0.8893346706787028, + "grad_norm": 0.0923224613070488, + "learning_rate": 8.431111111111112e-05, + "loss": 0.1927, + "step": 9310 + }, + { + "epoch": 0.8902899173711611, + "grad_norm": 0.09773056209087372, + "learning_rate": 8.416296296296296e-05, + "loss": 0.1962, + "step": 9320 + }, + { + "epoch": 0.8912451640636194, + "grad_norm": 0.07204195111989975, + "learning_rate": 8.401481481481482e-05, + "loss": 0.1969, + "step": 9330 + }, + { + "epoch": 0.8922004107560778, + "grad_norm": 0.1365681141614914, + "learning_rate": 8.386666666666667e-05, + "loss": 0.2012, + "step": 9340 + }, + { + "epoch": 0.8931556574485361, + "grad_norm": 0.1925460696220398, + "learning_rate": 8.371851851851851e-05, + "loss": 0.2034, + "step": 9350 + }, + { + "epoch": 0.8941109041409944, + "grad_norm": 0.13350558280944824, + "learning_rate": 8.357037037037037e-05, + "loss": 0.1927, + "step": 9360 + }, + { + "epoch": 0.8950661508334528, + "grad_norm": 0.1282387226819992, + "learning_rate": 8.342222222222222e-05, + "loss": 0.195, + "step": 9370 + }, + { + "epoch": 0.896021397525911, + "grad_norm": 0.08503387868404388, + "learning_rate": 8.327407407407408e-05, + "loss": 0.199, + "step": 9380 + }, + { + "epoch": 0.8969766442183694, + "grad_norm": 0.13588573038578033, + "learning_rate": 8.312592592592593e-05, + "loss": 0.2006, + "step": 9390 + }, + { + "epoch": 0.8979318909108277, + "grad_norm": 0.18095846474170685, + "learning_rate": 8.297777777777777e-05, + "loss": 0.2038, + "step": 9400 + }, + { + "epoch": 0.8988871376032861, + "grad_norm": 0.1470443606376648, + "learning_rate": 8.282962962962963e-05, + "loss": 0.1914, + "step": 9410 + }, + { + "epoch": 0.8998423842957444, + "grad_norm": 0.11700747907161713, + "learning_rate": 8.268148148148148e-05, + "loss": 0.1945, + "step": 9420 + }, + { + "epoch": 0.9007976309882028, + "grad_norm": 0.08257856965065002, + "learning_rate": 8.253333333333334e-05, + "loss": 0.1965, + "step": 9430 + }, + { + "epoch": 0.901752877680661, + "grad_norm": 0.11103440821170807, + "learning_rate": 8.238518518518518e-05, + "loss": 0.2012, + "step": 9440 + }, + { + "epoch": 0.9027081243731193, + "grad_norm": 0.19891560077667236, + "learning_rate": 8.223703703703704e-05, + "loss": 0.2051, + "step": 9450 + }, + { + "epoch": 0.9036633710655777, + "grad_norm": 0.15487581491470337, + "learning_rate": 8.208888888888889e-05, + "loss": 0.1919, + "step": 9460 + }, + { + "epoch": 0.904618617758036, + "grad_norm": 0.11258412897586823, + "learning_rate": 8.194074074074074e-05, + "loss": 0.1943, + "step": 9470 + }, + { + "epoch": 0.9055738644504944, + "grad_norm": 0.07216157019138336, + "learning_rate": 8.17925925925926e-05, + "loss": 0.1976, + "step": 9480 + }, + { + "epoch": 0.9065291111429526, + "grad_norm": 0.16788683831691742, + "learning_rate": 8.164444444444444e-05, + "loss": 0.2014, + "step": 9490 + }, + { + "epoch": 0.907484357835411, + "grad_norm": 0.1965964436531067, + "learning_rate": 8.14962962962963e-05, + "loss": 0.2047, + "step": 9500 + }, + { + "epoch": 0.9084396045278693, + "grad_norm": 0.2151501625776291, + "learning_rate": 8.134814814814815e-05, + "loss": 0.1917, + "step": 9510 + }, + { + "epoch": 0.9093948512203277, + "grad_norm": 0.09645062685012817, + "learning_rate": 8.120000000000001e-05, + "loss": 0.1931, + "step": 9520 + }, + { + "epoch": 0.910350097912786, + "grad_norm": 0.07982160151004791, + "learning_rate": 8.105185185185185e-05, + "loss": 0.1987, + "step": 9530 + }, + { + "epoch": 0.9113053446052443, + "grad_norm": 0.1163114458322525, + "learning_rate": 8.09037037037037e-05, + "loss": 0.2005, + "step": 9540 + }, + { + "epoch": 0.9122605912977026, + "grad_norm": 0.2425907999277115, + "learning_rate": 8.075555555555556e-05, + "loss": 0.2047, + "step": 9550 + }, + { + "epoch": 0.9132158379901609, + "grad_norm": 0.14334097504615784, + "learning_rate": 8.060740740740741e-05, + "loss": 0.191, + "step": 9560 + }, + { + "epoch": 0.9141710846826193, + "grad_norm": 0.10129152238368988, + "learning_rate": 8.045925925925927e-05, + "loss": 0.1947, + "step": 9570 + }, + { + "epoch": 0.9151263313750776, + "grad_norm": 0.08656469732522964, + "learning_rate": 8.031111111111111e-05, + "loss": 0.1975, + "step": 9580 + }, + { + "epoch": 0.916081578067536, + "grad_norm": 0.10407857596874237, + "learning_rate": 8.016296296296296e-05, + "loss": 0.1992, + "step": 9590 + }, + { + "epoch": 0.9170368247599943, + "grad_norm": 0.17440539598464966, + "learning_rate": 8.001481481481482e-05, + "loss": 0.203, + "step": 9600 + }, + { + "epoch": 0.9179920714524525, + "grad_norm": 0.15519316494464874, + "learning_rate": 7.986666666666667e-05, + "loss": 0.1895, + "step": 9610 + }, + { + "epoch": 0.9189473181449109, + "grad_norm": 0.1368798166513443, + "learning_rate": 7.971851851851853e-05, + "loss": 0.194, + "step": 9620 + }, + { + "epoch": 0.9199025648373692, + "grad_norm": 0.08210785686969757, + "learning_rate": 7.957037037037037e-05, + "loss": 0.1975, + "step": 9630 + }, + { + "epoch": 0.9208578115298276, + "grad_norm": 0.12684789299964905, + "learning_rate": 7.942222222222223e-05, + "loss": 0.1998, + "step": 9640 + }, + { + "epoch": 0.9218130582222859, + "grad_norm": 0.159325510263443, + "learning_rate": 7.927407407407408e-05, + "loss": 0.2051, + "step": 9650 + }, + { + "epoch": 0.9227683049147443, + "grad_norm": 0.21739524602890015, + "learning_rate": 7.912592592592592e-05, + "loss": 0.1886, + "step": 9660 + }, + { + "epoch": 0.9237235516072025, + "grad_norm": 0.12373743951320648, + "learning_rate": 7.897777777777778e-05, + "loss": 0.1939, + "step": 9670 + }, + { + "epoch": 0.9246787982996609, + "grad_norm": 0.11995956301689148, + "learning_rate": 7.882962962962963e-05, + "loss": 0.1963, + "step": 9680 + }, + { + "epoch": 0.9256340449921192, + "grad_norm": 0.1054321900010109, + "learning_rate": 7.868148148148149e-05, + "loss": 0.1995, + "step": 9690 + }, + { + "epoch": 0.9265892916845775, + "grad_norm": 0.18521544337272644, + "learning_rate": 7.853333333333334e-05, + "loss": 0.2033, + "step": 9700 + }, + { + "epoch": 0.9275445383770359, + "grad_norm": 0.14290209114551544, + "learning_rate": 7.838518518518518e-05, + "loss": 0.1912, + "step": 9710 + }, + { + "epoch": 0.9284997850694942, + "grad_norm": 0.12010928988456726, + "learning_rate": 7.823703703703704e-05, + "loss": 0.1938, + "step": 9720 + }, + { + "epoch": 0.9294550317619525, + "grad_norm": 0.08440925925970078, + "learning_rate": 7.808888888888889e-05, + "loss": 0.196, + "step": 9730 + }, + { + "epoch": 0.9304102784544108, + "grad_norm": 0.13756245374679565, + "learning_rate": 7.794074074074075e-05, + "loss": 0.2004, + "step": 9740 + }, + { + "epoch": 0.9313655251468692, + "grad_norm": 0.1666550189256668, + "learning_rate": 7.77925925925926e-05, + "loss": 0.2037, + "step": 9750 + }, + { + "epoch": 0.9323207718393275, + "grad_norm": 0.21596874296665192, + "learning_rate": 7.764444444444445e-05, + "loss": 0.1926, + "step": 9760 + }, + { + "epoch": 0.9332760185317859, + "grad_norm": 0.10867001116275787, + "learning_rate": 7.74962962962963e-05, + "loss": 0.1933, + "step": 9770 + }, + { + "epoch": 0.9342312652242442, + "grad_norm": 0.0639323890209198, + "learning_rate": 7.734814814814815e-05, + "loss": 0.1959, + "step": 9780 + }, + { + "epoch": 0.9351865119167024, + "grad_norm": 0.12160497158765793, + "learning_rate": 7.72e-05, + "loss": 0.2011, + "step": 9790 + }, + { + "epoch": 0.9361417586091608, + "grad_norm": 0.20208191871643066, + "learning_rate": 7.705185185185185e-05, + "loss": 0.2041, + "step": 9800 + }, + { + "epoch": 0.9370970053016191, + "grad_norm": 0.2048955261707306, + "learning_rate": 7.690370370370371e-05, + "loss": 0.1901, + "step": 9810 + }, + { + "epoch": 0.9380522519940775, + "grad_norm": 0.1331399381160736, + "learning_rate": 7.675555555555556e-05, + "loss": 0.1953, + "step": 9820 + }, + { + "epoch": 0.9390074986865358, + "grad_norm": 0.07257109880447388, + "learning_rate": 7.660740740740742e-05, + "loss": 0.1973, + "step": 9830 + }, + { + "epoch": 0.9399627453789942, + "grad_norm": 0.12627868354320526, + "learning_rate": 7.645925925925926e-05, + "loss": 0.1994, + "step": 9840 + }, + { + "epoch": 0.9409179920714524, + "grad_norm": 0.18185719847679138, + "learning_rate": 7.631111111111111e-05, + "loss": 0.2038, + "step": 9850 + }, + { + "epoch": 0.9418732387639108, + "grad_norm": 0.14436939358711243, + "learning_rate": 7.616296296296297e-05, + "loss": 0.1892, + "step": 9860 + }, + { + "epoch": 0.9428284854563691, + "grad_norm": 0.1035638079047203, + "learning_rate": 7.601481481481482e-05, + "loss": 0.1933, + "step": 9870 + }, + { + "epoch": 0.9437837321488274, + "grad_norm": 0.10040664672851562, + "learning_rate": 7.586666666666668e-05, + "loss": 0.1962, + "step": 9880 + }, + { + "epoch": 0.9447389788412858, + "grad_norm": 0.12080970406532288, + "learning_rate": 7.571851851851852e-05, + "loss": 0.2002, + "step": 9890 + }, + { + "epoch": 0.945694225533744, + "grad_norm": 0.16594280302524567, + "learning_rate": 7.557037037037037e-05, + "loss": 0.2032, + "step": 9900 + }, + { + "epoch": 0.9466494722262024, + "grad_norm": 0.20917296409606934, + "learning_rate": 7.542222222222223e-05, + "loss": 0.1906, + "step": 9910 + }, + { + "epoch": 0.9476047189186607, + "grad_norm": 0.12201665341854095, + "learning_rate": 7.527407407407408e-05, + "loss": 0.1947, + "step": 9920 + }, + { + "epoch": 0.9485599656111191, + "grad_norm": 0.12169451266527176, + "learning_rate": 7.512592592592593e-05, + "loss": 0.1989, + "step": 9930 + }, + { + "epoch": 0.9495152123035774, + "grad_norm": 0.09808619320392609, + "learning_rate": 7.497777777777778e-05, + "loss": 0.1994, + "step": 9940 + }, + { + "epoch": 0.9504704589960358, + "grad_norm": 0.1698949635028839, + "learning_rate": 7.482962962962964e-05, + "loss": 0.2037, + "step": 9950 + }, + { + "epoch": 0.951425705688494, + "grad_norm": 0.20752237737178802, + "learning_rate": 7.468148148148149e-05, + "loss": 0.1886, + "step": 9960 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.11208115518093109, + "learning_rate": 7.453333333333333e-05, + "loss": 0.1937, + "step": 9970 + }, + { + "epoch": 0.9533361990734107, + "grad_norm": 0.11728549003601074, + "learning_rate": 7.43851851851852e-05, + "loss": 0.1974, + "step": 9980 + }, + { + "epoch": 0.954291445765869, + "grad_norm": 0.10618801414966583, + "learning_rate": 7.423703703703704e-05, + "loss": 0.1995, + "step": 9990 + }, + { + "epoch": 0.9552466924583274, + "grad_norm": 0.1451137661933899, + "learning_rate": 7.40888888888889e-05, + "loss": 0.2033, + "step": 10000 + }, + { + "epoch": 0.9562019391507857, + "grad_norm": 0.24454337358474731, + "learning_rate": 7.394074074074075e-05, + "loss": 0.1911, + "step": 10010 + }, + { + "epoch": 0.957157185843244, + "grad_norm": 0.08611460030078888, + "learning_rate": 7.379259259259259e-05, + "loss": 0.1931, + "step": 10020 + }, + { + "epoch": 0.9581124325357023, + "grad_norm": 0.07674242556095123, + "learning_rate": 7.364444444444445e-05, + "loss": 0.1967, + "step": 10030 + }, + { + "epoch": 0.9590676792281607, + "grad_norm": 0.16499543190002441, + "learning_rate": 7.34962962962963e-05, + "loss": 0.1996, + "step": 10040 + }, + { + "epoch": 0.960022925920619, + "grad_norm": 0.19315092265605927, + "learning_rate": 7.334814814814816e-05, + "loss": 0.204, + "step": 10050 + }, + { + "epoch": 0.9609781726130773, + "grad_norm": 0.14484643936157227, + "learning_rate": 7.32e-05, + "loss": 0.1874, + "step": 10060 + }, + { + "epoch": 0.9619334193055357, + "grad_norm": 0.12405912578105927, + "learning_rate": 7.305185185185186e-05, + "loss": 0.195, + "step": 10070 + }, + { + "epoch": 0.9628886659979939, + "grad_norm": 0.06671059131622314, + "learning_rate": 7.290370370370371e-05, + "loss": 0.1973, + "step": 10080 + }, + { + "epoch": 0.9638439126904523, + "grad_norm": 0.09796929359436035, + "learning_rate": 7.275555555555556e-05, + "loss": 0.2002, + "step": 10090 + }, + { + "epoch": 0.9647991593829106, + "grad_norm": 0.18177813291549683, + "learning_rate": 7.260740740740742e-05, + "loss": 0.2044, + "step": 10100 + }, + { + "epoch": 0.965754406075369, + "grad_norm": 0.1589105874300003, + "learning_rate": 7.245925925925926e-05, + "loss": 0.1897, + "step": 10110 + }, + { + "epoch": 0.9667096527678273, + "grad_norm": 0.16179531812667847, + "learning_rate": 7.231111111111112e-05, + "loss": 0.1951, + "step": 10120 + }, + { + "epoch": 0.9676648994602857, + "grad_norm": 0.08069080859422684, + "learning_rate": 7.216296296296297e-05, + "loss": 0.1972, + "step": 10130 + }, + { + "epoch": 0.9686201461527439, + "grad_norm": 0.11243908107280731, + "learning_rate": 7.201481481481481e-05, + "loss": 0.1986, + "step": 10140 + }, + { + "epoch": 0.9695753928452022, + "grad_norm": 0.1934669017791748, + "learning_rate": 7.186666666666667e-05, + "loss": 0.2036, + "step": 10150 + }, + { + "epoch": 0.9705306395376606, + "grad_norm": 0.15914154052734375, + "learning_rate": 7.171851851851852e-05, + "loss": 0.1877, + "step": 10160 + }, + { + "epoch": 0.9714858862301189, + "grad_norm": 0.07927779853343964, + "learning_rate": 7.157037037037038e-05, + "loss": 0.1938, + "step": 10170 + }, + { + "epoch": 0.9724411329225773, + "grad_norm": 0.15848129987716675, + "learning_rate": 7.142222222222223e-05, + "loss": 0.1973, + "step": 10180 + }, + { + "epoch": 0.9733963796150356, + "grad_norm": 0.09003908932209015, + "learning_rate": 7.127407407407409e-05, + "loss": 0.2001, + "step": 10190 + }, + { + "epoch": 0.974351626307494, + "grad_norm": 0.16996480524539948, + "learning_rate": 7.112592592592593e-05, + "loss": 0.2019, + "step": 10200 + }, + { + "epoch": 0.9753068729999522, + "grad_norm": 0.18027909100055695, + "learning_rate": 7.097777777777778e-05, + "loss": 0.1914, + "step": 10210 + }, + { + "epoch": 0.9762621196924106, + "grad_norm": 0.09944116324186325, + "learning_rate": 7.082962962962964e-05, + "loss": 0.1929, + "step": 10220 + }, + { + "epoch": 0.9772173663848689, + "grad_norm": 0.11399278044700623, + "learning_rate": 7.068148148148148e-05, + "loss": 0.1965, + "step": 10230 + }, + { + "epoch": 0.9781726130773272, + "grad_norm": 0.11866579204797745, + "learning_rate": 7.053333333333334e-05, + "loss": 0.1999, + "step": 10240 + }, + { + "epoch": 0.9791278597697856, + "grad_norm": 0.20890718698501587, + "learning_rate": 7.038518518518519e-05, + "loss": 0.2044, + "step": 10250 + }, + { + "epoch": 0.9800831064622438, + "grad_norm": 0.16690291464328766, + "learning_rate": 7.023703703703705e-05, + "loss": 0.1896, + "step": 10260 + }, + { + "epoch": 0.9810383531547022, + "grad_norm": 0.12047307938337326, + "learning_rate": 7.00888888888889e-05, + "loss": 0.1929, + "step": 10270 + }, + { + "epoch": 0.9819935998471605, + "grad_norm": 0.07980793714523315, + "learning_rate": 6.994074074074074e-05, + "loss": 0.1968, + "step": 10280 + }, + { + "epoch": 0.9829488465396189, + "grad_norm": 0.14673179388046265, + "learning_rate": 6.97925925925926e-05, + "loss": 0.2006, + "step": 10290 + }, + { + "epoch": 0.9839040932320772, + "grad_norm": 0.1462879627943039, + "learning_rate": 6.964444444444445e-05, + "loss": 0.2029, + "step": 10300 + }, + { + "epoch": 0.9848593399245356, + "grad_norm": 0.16838905215263367, + "learning_rate": 6.949629629629631e-05, + "loss": 0.1887, + "step": 10310 + }, + { + "epoch": 0.9858145866169938, + "grad_norm": 0.14399904012680054, + "learning_rate": 6.934814814814816e-05, + "loss": 0.1941, + "step": 10320 + }, + { + "epoch": 0.9867698333094521, + "grad_norm": 0.06735172122716904, + "learning_rate": 6.92e-05, + "loss": 0.198, + "step": 10330 + }, + { + "epoch": 0.9877250800019105, + "grad_norm": 0.0967218354344368, + "learning_rate": 6.905185185185186e-05, + "loss": 0.2014, + "step": 10340 + }, + { + "epoch": 0.9886803266943688, + "grad_norm": 0.18309317529201508, + "learning_rate": 6.890370370370371e-05, + "loss": 0.2027, + "step": 10350 + }, + { + "epoch": 0.9896355733868272, + "grad_norm": 0.12692053616046906, + "learning_rate": 6.875555555555557e-05, + "loss": 0.1896, + "step": 10360 + }, + { + "epoch": 0.9905908200792855, + "grad_norm": 0.09795171767473221, + "learning_rate": 6.860740740740741e-05, + "loss": 0.193, + "step": 10370 + }, + { + "epoch": 0.9915460667717438, + "grad_norm": 0.11544561386108398, + "learning_rate": 6.845925925925927e-05, + "loss": 0.1973, + "step": 10380 + }, + { + "epoch": 0.9925013134642021, + "grad_norm": 0.1150268018245697, + "learning_rate": 6.831111111111112e-05, + "loss": 0.2015, + "step": 10390 + }, + { + "epoch": 0.9934565601566605, + "grad_norm": 0.160989910364151, + "learning_rate": 6.816296296296297e-05, + "loss": 0.2036, + "step": 10400 + }, + { + "epoch": 0.9944118068491188, + "grad_norm": 0.14440488815307617, + "learning_rate": 6.801481481481483e-05, + "loss": 0.1895, + "step": 10410 + }, + { + "epoch": 0.9953670535415771, + "grad_norm": 0.07250352948904037, + "learning_rate": 6.786666666666667e-05, + "loss": 0.1927, + "step": 10420 + }, + { + "epoch": 0.9963223002340355, + "grad_norm": 0.11280882358551025, + "learning_rate": 6.771851851851853e-05, + "loss": 0.1962, + "step": 10430 + }, + { + "epoch": 0.9972775469264937, + "grad_norm": 0.11649350076913834, + "learning_rate": 6.757037037037038e-05, + "loss": 0.2007, + "step": 10440 + }, + { + "epoch": 0.9982327936189521, + "grad_norm": 0.17705261707305908, + "learning_rate": 6.742222222222222e-05, + "loss": 0.204, + "step": 10450 + }, + { + "epoch": 0.9991880403114104, + "grad_norm": 0.07574629783630371, + "learning_rate": 6.727407407407408e-05, + "loss": 0.1946, + "step": 10460 + }, + { + "epoch": 1.0000955246692458, + "grad_norm": 0.16626384854316711, + "learning_rate": 6.712592592592593e-05, + "loss": 0.198, + "step": 10470 + }, + { + "epoch": 1.001050771361704, + "grad_norm": 0.09976531565189362, + "learning_rate": 6.697777777777779e-05, + "loss": 0.1857, + "step": 10480 + }, + { + "epoch": 1.0020060180541626, + "grad_norm": 0.06542108952999115, + "learning_rate": 6.682962962962964e-05, + "loss": 0.1922, + "step": 10490 + }, + { + "epoch": 1.0029612647466208, + "grad_norm": 0.09266208857297897, + "learning_rate": 6.668148148148148e-05, + "loss": 0.1982, + "step": 10500 + }, + { + "epoch": 1.0039165114390791, + "grad_norm": 0.11459262669086456, + "learning_rate": 6.653333333333334e-05, + "loss": 0.2009, + "step": 10510 + }, + { + "epoch": 1.0048717581315374, + "grad_norm": 0.1587928831577301, + "learning_rate": 6.638518518518519e-05, + "loss": 0.2008, + "step": 10520 + }, + { + "epoch": 1.005827004823996, + "grad_norm": 0.11515659838914871, + "learning_rate": 6.623703703703705e-05, + "loss": 0.1876, + "step": 10530 + }, + { + "epoch": 1.0067822515164542, + "grad_norm": 0.07409398257732391, + "learning_rate": 6.60888888888889e-05, + "loss": 0.1928, + "step": 10540 + }, + { + "epoch": 1.0077374982089125, + "grad_norm": 0.09634041041135788, + "learning_rate": 6.594074074074074e-05, + "loss": 0.1968, + "step": 10550 + }, + { + "epoch": 1.0086927449013707, + "grad_norm": 0.10461993515491486, + "learning_rate": 6.57925925925926e-05, + "loss": 0.2012, + "step": 10560 + }, + { + "epoch": 1.009647991593829, + "grad_norm": 0.188216432929039, + "learning_rate": 6.564444444444445e-05, + "loss": 0.2006, + "step": 10570 + }, + { + "epoch": 1.0106032382862875, + "grad_norm": 0.10396794974803925, + "learning_rate": 6.54962962962963e-05, + "loss": 0.188, + "step": 10580 + }, + { + "epoch": 1.0115584849787458, + "grad_norm": 0.08603309839963913, + "learning_rate": 6.534814814814815e-05, + "loss": 0.1926, + "step": 10590 + }, + { + "epoch": 1.012513731671204, + "grad_norm": 0.09479288011789322, + "learning_rate": 6.52e-05, + "loss": 0.1961, + "step": 10600 + }, + { + "epoch": 1.0134689783636623, + "grad_norm": 0.11584699153900146, + "learning_rate": 6.505185185185186e-05, + "loss": 0.1998, + "step": 10610 + }, + { + "epoch": 1.0144242250561208, + "grad_norm": 0.21939048171043396, + "learning_rate": 6.49037037037037e-05, + "loss": 0.201, + "step": 10620 + }, + { + "epoch": 1.0153794717485791, + "grad_norm": 0.11948587000370026, + "learning_rate": 6.475555555555555e-05, + "loss": 0.1877, + "step": 10630 + }, + { + "epoch": 1.0163347184410374, + "grad_norm": 0.087392158806324, + "learning_rate": 6.460740740740741e-05, + "loss": 0.1937, + "step": 10640 + }, + { + "epoch": 1.0172899651334957, + "grad_norm": 0.0761866346001625, + "learning_rate": 6.445925925925926e-05, + "loss": 0.1974, + "step": 10650 + }, + { + "epoch": 1.018245211825954, + "grad_norm": 0.1001187115907669, + "learning_rate": 6.431111111111112e-05, + "loss": 0.1999, + "step": 10660 + }, + { + "epoch": 1.0192004585184125, + "grad_norm": 0.20643304288387299, + "learning_rate": 6.416296296296296e-05, + "loss": 0.2001, + "step": 10670 + }, + { + "epoch": 1.0201557052108707, + "grad_norm": 0.12136881798505783, + "learning_rate": 6.401481481481481e-05, + "loss": 0.1889, + "step": 10680 + }, + { + "epoch": 1.021110951903329, + "grad_norm": 0.14471648633480072, + "learning_rate": 6.386666666666667e-05, + "loss": 0.1934, + "step": 10690 + }, + { + "epoch": 1.0220661985957873, + "grad_norm": 0.08957241475582123, + "learning_rate": 6.371851851851852e-05, + "loss": 0.1969, + "step": 10700 + }, + { + "epoch": 1.0230214452882458, + "grad_norm": 0.08908242732286453, + "learning_rate": 6.357037037037038e-05, + "loss": 0.2006, + "step": 10710 + }, + { + "epoch": 1.023976691980704, + "grad_norm": 0.17719383537769318, + "learning_rate": 6.342222222222222e-05, + "loss": 0.2005, + "step": 10720 + }, + { + "epoch": 1.0249319386731623, + "grad_norm": 0.13578200340270996, + "learning_rate": 6.327407407407407e-05, + "loss": 0.1896, + "step": 10730 + }, + { + "epoch": 1.0258871853656206, + "grad_norm": 0.09592164307832718, + "learning_rate": 6.312592592592593e-05, + "loss": 0.1925, + "step": 10740 + }, + { + "epoch": 1.026842432058079, + "grad_norm": 0.08957613259553909, + "learning_rate": 6.297777777777777e-05, + "loss": 0.1975, + "step": 10750 + }, + { + "epoch": 1.0277976787505374, + "grad_norm": 0.11708281934261322, + "learning_rate": 6.282962962962963e-05, + "loss": 0.2009, + "step": 10760 + }, + { + "epoch": 1.0287529254429957, + "grad_norm": 0.17692160606384277, + "learning_rate": 6.268148148148148e-05, + "loss": 0.2005, + "step": 10770 + }, + { + "epoch": 1.029708172135454, + "grad_norm": 0.1275843232870102, + "learning_rate": 6.253333333333333e-05, + "loss": 0.1888, + "step": 10780 + }, + { + "epoch": 1.0306634188279122, + "grad_norm": 0.07367505878210068, + "learning_rate": 6.238518518518519e-05, + "loss": 0.1918, + "step": 10790 + }, + { + "epoch": 1.0316186655203707, + "grad_norm": 0.09375837445259094, + "learning_rate": 6.223703703703703e-05, + "loss": 0.1969, + "step": 10800 + }, + { + "epoch": 1.032573912212829, + "grad_norm": 0.11127982288599014, + "learning_rate": 6.208888888888889e-05, + "loss": 0.1995, + "step": 10810 + }, + { + "epoch": 1.0335291589052873, + "grad_norm": 0.20982736349105835, + "learning_rate": 6.194074074074074e-05, + "loss": 0.2004, + "step": 10820 + }, + { + "epoch": 1.0344844055977456, + "grad_norm": 0.11941300332546234, + "learning_rate": 6.179259259259258e-05, + "loss": 0.1896, + "step": 10830 + }, + { + "epoch": 1.0354396522902038, + "grad_norm": 0.09594809263944626, + "learning_rate": 6.164444444444444e-05, + "loss": 0.1924, + "step": 10840 + }, + { + "epoch": 1.0363948989826623, + "grad_norm": 0.092034250497818, + "learning_rate": 6.149629629629629e-05, + "loss": 0.1956, + "step": 10850 + }, + { + "epoch": 1.0373501456751206, + "grad_norm": 0.10250823199748993, + "learning_rate": 6.134814814814815e-05, + "loss": 0.1999, + "step": 10860 + }, + { + "epoch": 1.038305392367579, + "grad_norm": 0.20155517756938934, + "learning_rate": 6.12e-05, + "loss": 0.2001, + "step": 10870 + }, + { + "epoch": 1.0392606390600372, + "grad_norm": 0.09367980062961578, + "learning_rate": 6.105185185185186e-05, + "loss": 0.1883, + "step": 10880 + }, + { + "epoch": 1.0402158857524957, + "grad_norm": 0.1017754077911377, + "learning_rate": 6.090370370370371e-05, + "loss": 0.1933, + "step": 10890 + }, + { + "epoch": 1.041171132444954, + "grad_norm": 0.07937656342983246, + "learning_rate": 6.0755555555555556e-05, + "loss": 0.1978, + "step": 10900 + }, + { + "epoch": 1.0421263791374122, + "grad_norm": 0.09843187034130096, + "learning_rate": 6.0607407407407416e-05, + "loss": 0.2007, + "step": 10910 + }, + { + "epoch": 1.0430816258298705, + "grad_norm": 0.17273889482021332, + "learning_rate": 6.045925925925926e-05, + "loss": 0.1988, + "step": 10920 + }, + { + "epoch": 1.0440368725223288, + "grad_norm": 0.10299399495124817, + "learning_rate": 6.0311111111111115e-05, + "loss": 0.1866, + "step": 10930 + }, + { + "epoch": 1.0449921192147873, + "grad_norm": 0.1454765498638153, + "learning_rate": 6.016296296296297e-05, + "loss": 0.1934, + "step": 10940 + }, + { + "epoch": 1.0459473659072456, + "grad_norm": 0.08828850090503693, + "learning_rate": 6.0014814814814814e-05, + "loss": 0.1962, + "step": 10950 + }, + { + "epoch": 1.0469026125997039, + "grad_norm": 0.08813127875328064, + "learning_rate": 5.9866666666666674e-05, + "loss": 0.199, + "step": 10960 + }, + { + "epoch": 1.0478578592921621, + "grad_norm": 0.17540910840034485, + "learning_rate": 5.971851851851852e-05, + "loss": 0.1988, + "step": 10970 + }, + { + "epoch": 1.0488131059846206, + "grad_norm": 0.11944608390331268, + "learning_rate": 5.957037037037037e-05, + "loss": 0.189, + "step": 10980 + }, + { + "epoch": 1.049768352677079, + "grad_norm": 0.07055499404668808, + "learning_rate": 5.9422222222222226e-05, + "loss": 0.1932, + "step": 10990 + }, + { + "epoch": 1.0507235993695372, + "grad_norm": 0.08947975933551788, + "learning_rate": 5.927407407407408e-05, + "loss": 0.1951, + "step": 11000 + }, + { + "epoch": 1.0516788460619955, + "grad_norm": 0.12585119903087616, + "learning_rate": 5.912592592592593e-05, + "loss": 0.201, + "step": 11010 + }, + { + "epoch": 1.0526340927544537, + "grad_norm": 0.18942105770111084, + "learning_rate": 5.897777777777778e-05, + "loss": 0.2003, + "step": 11020 + }, + { + "epoch": 1.0535893394469122, + "grad_norm": 0.11553331464529037, + "learning_rate": 5.882962962962963e-05, + "loss": 0.1869, + "step": 11030 + }, + { + "epoch": 1.0545445861393705, + "grad_norm": 0.06898508220911026, + "learning_rate": 5.8681481481481485e-05, + "loss": 0.1922, + "step": 11040 + }, + { + "epoch": 1.0554998328318288, + "grad_norm": 0.07421170175075531, + "learning_rate": 5.853333333333334e-05, + "loss": 0.1962, + "step": 11050 + }, + { + "epoch": 1.056455079524287, + "grad_norm": 0.1096966564655304, + "learning_rate": 5.8385185185185184e-05, + "loss": 0.2009, + "step": 11060 + }, + { + "epoch": 1.0574103262167456, + "grad_norm": 0.16953304409980774, + "learning_rate": 5.823703703703704e-05, + "loss": 0.2008, + "step": 11070 + }, + { + "epoch": 1.0583655729092039, + "grad_norm": 0.1059802696108818, + "learning_rate": 5.808888888888889e-05, + "loss": 0.186, + "step": 11080 + }, + { + "epoch": 1.0593208196016621, + "grad_norm": 0.0721934363245964, + "learning_rate": 5.794074074074074e-05, + "loss": 0.1926, + "step": 11090 + }, + { + "epoch": 1.0602760662941204, + "grad_norm": 0.08843927830457687, + "learning_rate": 5.7792592592592596e-05, + "loss": 0.1961, + "step": 11100 + }, + { + "epoch": 1.0612313129865787, + "grad_norm": 0.12775014340877533, + "learning_rate": 5.764444444444444e-05, + "loss": 0.2004, + "step": 11110 + }, + { + "epoch": 1.0621865596790372, + "grad_norm": 0.17205151915550232, + "learning_rate": 5.74962962962963e-05, + "loss": 0.2002, + "step": 11120 + }, + { + "epoch": 1.0631418063714955, + "grad_norm": 0.12345268577337265, + "learning_rate": 5.734814814814815e-05, + "loss": 0.1868, + "step": 11130 + }, + { + "epoch": 1.0640970530639537, + "grad_norm": 0.06966684758663177, + "learning_rate": 5.72e-05, + "loss": 0.1927, + "step": 11140 + }, + { + "epoch": 1.065052299756412, + "grad_norm": 0.10183680057525635, + "learning_rate": 5.7051851851851854e-05, + "loss": 0.1964, + "step": 11150 + }, + { + "epoch": 1.0660075464488705, + "grad_norm": 0.09469080716371536, + "learning_rate": 5.69037037037037e-05, + "loss": 0.2004, + "step": 11160 + }, + { + "epoch": 1.0669627931413288, + "grad_norm": 0.1744278520345688, + "learning_rate": 5.675555555555556e-05, + "loss": 0.1999, + "step": 11170 + }, + { + "epoch": 1.067918039833787, + "grad_norm": 0.12014975398778915, + "learning_rate": 5.6607407407407407e-05, + "loss": 0.186, + "step": 11180 + }, + { + "epoch": 1.0688732865262454, + "grad_norm": 0.06704577803611755, + "learning_rate": 5.6459259259259266e-05, + "loss": 0.1916, + "step": 11190 + }, + { + "epoch": 1.0698285332187036, + "grad_norm": 0.09180966764688492, + "learning_rate": 5.631111111111111e-05, + "loss": 0.1962, + "step": 11200 + }, + { + "epoch": 1.0707837799111621, + "grad_norm": 0.09561672806739807, + "learning_rate": 5.616296296296296e-05, + "loss": 0.2004, + "step": 11210 + }, + { + "epoch": 1.0717390266036204, + "grad_norm": 0.17557843029499054, + "learning_rate": 5.601481481481482e-05, + "loss": 0.1993, + "step": 11220 + }, + { + "epoch": 1.0726942732960787, + "grad_norm": 0.10663829743862152, + "learning_rate": 5.5866666666666665e-05, + "loss": 0.1863, + "step": 11230 + }, + { + "epoch": 1.073649519988537, + "grad_norm": 0.07170140743255615, + "learning_rate": 5.5718518518518525e-05, + "loss": 0.1926, + "step": 11240 + }, + { + "epoch": 1.0746047666809955, + "grad_norm": 0.07816746085882187, + "learning_rate": 5.557037037037037e-05, + "loss": 0.1952, + "step": 11250 + }, + { + "epoch": 1.0755600133734537, + "grad_norm": 0.10512028634548187, + "learning_rate": 5.542222222222222e-05, + "loss": 0.2002, + "step": 11260 + }, + { + "epoch": 1.076515260065912, + "grad_norm": 0.17822621762752533, + "learning_rate": 5.527407407407408e-05, + "loss": 0.1996, + "step": 11270 + }, + { + "epoch": 1.0774705067583703, + "grad_norm": 0.1166963279247284, + "learning_rate": 5.512592592592592e-05, + "loss": 0.1866, + "step": 11280 + }, + { + "epoch": 1.0784257534508286, + "grad_norm": 0.06219042092561722, + "learning_rate": 5.497777777777778e-05, + "loss": 0.1913, + "step": 11290 + }, + { + "epoch": 1.079381000143287, + "grad_norm": 0.07941684871912003, + "learning_rate": 5.482962962962963e-05, + "loss": 0.1951, + "step": 11300 + }, + { + "epoch": 1.0803362468357454, + "grad_norm": 0.07750697433948517, + "learning_rate": 5.468148148148149e-05, + "loss": 0.2003, + "step": 11310 + }, + { + "epoch": 1.0812914935282036, + "grad_norm": 0.15450026094913483, + "learning_rate": 5.4533333333333335e-05, + "loss": 0.1998, + "step": 11320 + }, + { + "epoch": 1.082246740220662, + "grad_norm": 0.12201431393623352, + "learning_rate": 5.438518518518518e-05, + "loss": 0.1865, + "step": 11330 + }, + { + "epoch": 1.0832019869131204, + "grad_norm": 0.06394163519144058, + "learning_rate": 5.423703703703704e-05, + "loss": 0.1908, + "step": 11340 + }, + { + "epoch": 1.0841572336055787, + "grad_norm": 0.08452702313661575, + "learning_rate": 5.408888888888889e-05, + "loss": 0.1959, + "step": 11350 + }, + { + "epoch": 1.085112480298037, + "grad_norm": 0.12962254881858826, + "learning_rate": 5.394074074074075e-05, + "loss": 0.2007, + "step": 11360 + }, + { + "epoch": 1.0860677269904953, + "grad_norm": 0.17652928829193115, + "learning_rate": 5.3792592592592594e-05, + "loss": 0.1986, + "step": 11370 + }, + { + "epoch": 1.0870229736829535, + "grad_norm": 0.11936990916728973, + "learning_rate": 5.364444444444444e-05, + "loss": 0.1855, + "step": 11380 + }, + { + "epoch": 1.087978220375412, + "grad_norm": 0.08107299357652664, + "learning_rate": 5.34962962962963e-05, + "loss": 0.1912, + "step": 11390 + }, + { + "epoch": 1.0889334670678703, + "grad_norm": 0.07732398808002472, + "learning_rate": 5.3348148148148146e-05, + "loss": 0.1965, + "step": 11400 + }, + { + "epoch": 1.0898887137603286, + "grad_norm": 0.10233469307422638, + "learning_rate": 5.3200000000000006e-05, + "loss": 0.2012, + "step": 11410 + }, + { + "epoch": 1.0908439604527869, + "grad_norm": 0.16339091956615448, + "learning_rate": 5.305185185185185e-05, + "loss": 0.2001, + "step": 11420 + }, + { + "epoch": 1.0917992071452454, + "grad_norm": 0.10522663593292236, + "learning_rate": 5.290370370370371e-05, + "loss": 0.1862, + "step": 11430 + }, + { + "epoch": 1.0927544538377036, + "grad_norm": 0.07971503585577011, + "learning_rate": 5.275555555555556e-05, + "loss": 0.1918, + "step": 11440 + }, + { + "epoch": 1.093709700530162, + "grad_norm": 0.08777160942554474, + "learning_rate": 5.2607407407407404e-05, + "loss": 0.1955, + "step": 11450 + }, + { + "epoch": 1.0946649472226202, + "grad_norm": 0.104684017598629, + "learning_rate": 5.2459259259259264e-05, + "loss": 0.1999, + "step": 11460 + }, + { + "epoch": 1.0956201939150785, + "grad_norm": 0.15735992789268494, + "learning_rate": 5.231111111111111e-05, + "loss": 0.1989, + "step": 11470 + }, + { + "epoch": 1.096575440607537, + "grad_norm": 0.11038859188556671, + "learning_rate": 5.216296296296297e-05, + "loss": 0.1866, + "step": 11480 + }, + { + "epoch": 1.0975306872999953, + "grad_norm": 0.06414885818958282, + "learning_rate": 5.2014814814814816e-05, + "loss": 0.1919, + "step": 11490 + }, + { + "epoch": 1.0984859339924535, + "grad_norm": 0.08361148089170456, + "learning_rate": 5.1866666666666676e-05, + "loss": 0.195, + "step": 11500 + }, + { + "epoch": 1.0994411806849118, + "grad_norm": 0.10123872011899948, + "learning_rate": 5.171851851851852e-05, + "loss": 0.1995, + "step": 11510 + }, + { + "epoch": 1.1003964273773703, + "grad_norm": 0.17141133546829224, + "learning_rate": 5.157037037037037e-05, + "loss": 0.2003, + "step": 11520 + }, + { + "epoch": 1.1013516740698286, + "grad_norm": 0.11368907988071442, + "learning_rate": 5.142222222222223e-05, + "loss": 0.1859, + "step": 11530 + }, + { + "epoch": 1.1023069207622869, + "grad_norm": 0.07556621730327606, + "learning_rate": 5.1274074074074075e-05, + "loss": 0.1908, + "step": 11540 + }, + { + "epoch": 1.1032621674547451, + "grad_norm": 0.06946977972984314, + "learning_rate": 5.1125925925925934e-05, + "loss": 0.195, + "step": 11550 + }, + { + "epoch": 1.1042174141472034, + "grad_norm": 0.09921626001596451, + "learning_rate": 5.097777777777778e-05, + "loss": 0.2001, + "step": 11560 + }, + { + "epoch": 1.105172660839662, + "grad_norm": 0.16775824129581451, + "learning_rate": 5.082962962962963e-05, + "loss": 0.201, + "step": 11570 + }, + { + "epoch": 1.1061279075321202, + "grad_norm": 0.10106077045202255, + "learning_rate": 5.068148148148149e-05, + "loss": 0.1848, + "step": 11580 + }, + { + "epoch": 1.1070831542245785, + "grad_norm": 0.06299445778131485, + "learning_rate": 5.053333333333333e-05, + "loss": 0.1904, + "step": 11590 + }, + { + "epoch": 1.1080384009170368, + "grad_norm": 0.08746166527271271, + "learning_rate": 5.038518518518519e-05, + "loss": 0.1949, + "step": 11600 + }, + { + "epoch": 1.1089936476094953, + "grad_norm": 0.0920008197426796, + "learning_rate": 5.023703703703704e-05, + "loss": 0.2014, + "step": 11610 + }, + { + "epoch": 1.1099488943019535, + "grad_norm": 0.1872783750295639, + "learning_rate": 5.00888888888889e-05, + "loss": 0.201, + "step": 11620 + }, + { + "epoch": 1.1109041409944118, + "grad_norm": 0.11330235749483109, + "learning_rate": 4.9940740740740745e-05, + "loss": 0.1851, + "step": 11630 + }, + { + "epoch": 1.11185938768687, + "grad_norm": 0.07352713495492935, + "learning_rate": 4.97925925925926e-05, + "loss": 0.1906, + "step": 11640 + }, + { + "epoch": 1.1128146343793284, + "grad_norm": 0.07548803091049194, + "learning_rate": 4.964444444444445e-05, + "loss": 0.1954, + "step": 11650 + }, + { + "epoch": 1.1137698810717869, + "grad_norm": 0.15241172909736633, + "learning_rate": 4.94962962962963e-05, + "loss": 0.2001, + "step": 11660 + }, + { + "epoch": 1.1147251277642451, + "grad_norm": 0.158297598361969, + "learning_rate": 4.934814814814815e-05, + "loss": 0.2002, + "step": 11670 + }, + { + "epoch": 1.1156803744567034, + "grad_norm": 0.1126081794500351, + "learning_rate": 4.92e-05, + "loss": 0.1847, + "step": 11680 + }, + { + "epoch": 1.1166356211491617, + "grad_norm": 0.059119775891304016, + "learning_rate": 4.9051851851851856e-05, + "loss": 0.1914, + "step": 11690 + }, + { + "epoch": 1.1175908678416202, + "grad_norm": 0.11391792446374893, + "learning_rate": 4.890370370370371e-05, + "loss": 0.1954, + "step": 11700 + }, + { + "epoch": 1.1185461145340785, + "grad_norm": 0.11543019860982895, + "learning_rate": 4.875555555555556e-05, + "loss": 0.1999, + "step": 11710 + }, + { + "epoch": 1.1195013612265368, + "grad_norm": 0.15582410991191864, + "learning_rate": 4.860740740740741e-05, + "loss": 0.1989, + "step": 11720 + }, + { + "epoch": 1.120456607918995, + "grad_norm": 0.1310344636440277, + "learning_rate": 4.845925925925926e-05, + "loss": 0.1852, + "step": 11730 + }, + { + "epoch": 1.1214118546114533, + "grad_norm": 0.06701342761516571, + "learning_rate": 4.8311111111111115e-05, + "loss": 0.1909, + "step": 11740 + }, + { + "epoch": 1.1223671013039118, + "grad_norm": 0.11257292330265045, + "learning_rate": 4.816296296296297e-05, + "loss": 0.194, + "step": 11750 + }, + { + "epoch": 1.12332234799637, + "grad_norm": 0.09040035307407379, + "learning_rate": 4.801481481481482e-05, + "loss": 0.2001, + "step": 11760 + }, + { + "epoch": 1.1242775946888284, + "grad_norm": 0.19392362236976624, + "learning_rate": 4.7866666666666674e-05, + "loss": 0.1997, + "step": 11770 + }, + { + "epoch": 1.1252328413812867, + "grad_norm": 0.0966700091958046, + "learning_rate": 4.771851851851853e-05, + "loss": 0.1868, + "step": 11780 + }, + { + "epoch": 1.1261880880737452, + "grad_norm": 0.06864186376333237, + "learning_rate": 4.757037037037037e-05, + "loss": 0.1909, + "step": 11790 + }, + { + "epoch": 1.1271433347662034, + "grad_norm": 0.0784897580742836, + "learning_rate": 4.7422222222222226e-05, + "loss": 0.1948, + "step": 11800 + }, + { + "epoch": 1.1280985814586617, + "grad_norm": 0.08286922425031662, + "learning_rate": 4.727407407407408e-05, + "loss": 0.1996, + "step": 11810 + }, + { + "epoch": 1.12905382815112, + "grad_norm": 0.22640322148799896, + "learning_rate": 4.712592592592593e-05, + "loss": 0.201, + "step": 11820 + }, + { + "epoch": 1.1300090748435783, + "grad_norm": 0.1183919832110405, + "learning_rate": 4.6977777777777785e-05, + "loss": 0.1846, + "step": 11830 + }, + { + "epoch": 1.1309643215360368, + "grad_norm": 0.08607591688632965, + "learning_rate": 4.682962962962963e-05, + "loss": 0.1906, + "step": 11840 + }, + { + "epoch": 1.131919568228495, + "grad_norm": 0.09465157240629196, + "learning_rate": 4.6681481481481484e-05, + "loss": 0.196, + "step": 11850 + }, + { + "epoch": 1.1328748149209533, + "grad_norm": 0.08989481627941132, + "learning_rate": 4.653333333333334e-05, + "loss": 0.1992, + "step": 11860 + }, + { + "epoch": 1.1338300616134116, + "grad_norm": 0.2044905126094818, + "learning_rate": 4.638518518518519e-05, + "loss": 0.199, + "step": 11870 + }, + { + "epoch": 1.13478530830587, + "grad_norm": 0.09982435405254364, + "learning_rate": 4.6237037037037037e-05, + "loss": 0.1859, + "step": 11880 + }, + { + "epoch": 1.1357405549983284, + "grad_norm": 0.08768076449632645, + "learning_rate": 4.608888888888889e-05, + "loss": 0.1911, + "step": 11890 + }, + { + "epoch": 1.1366958016907867, + "grad_norm": 0.08545703440904617, + "learning_rate": 4.594074074074074e-05, + "loss": 0.1962, + "step": 11900 + }, + { + "epoch": 1.137651048383245, + "grad_norm": 0.08855101466178894, + "learning_rate": 4.5792592592592596e-05, + "loss": 0.2002, + "step": 11910 + }, + { + "epoch": 1.1386062950757032, + "grad_norm": 0.165024995803833, + "learning_rate": 4.564444444444444e-05, + "loss": 0.199, + "step": 11920 + }, + { + "epoch": 1.1395615417681617, + "grad_norm": 0.10052850842475891, + "learning_rate": 4.5496296296296295e-05, + "loss": 0.1855, + "step": 11930 + }, + { + "epoch": 1.14051678846062, + "grad_norm": 0.06890951097011566, + "learning_rate": 4.534814814814815e-05, + "loss": 0.192, + "step": 11940 + }, + { + "epoch": 1.1414720351530783, + "grad_norm": 0.09082265198230743, + "learning_rate": 4.52e-05, + "loss": 0.1955, + "step": 11950 + }, + { + "epoch": 1.1424272818455365, + "grad_norm": 0.12514296174049377, + "learning_rate": 4.5051851851851854e-05, + "loss": 0.2, + "step": 11960 + }, + { + "epoch": 1.143382528537995, + "grad_norm": 0.15191242098808289, + "learning_rate": 4.49037037037037e-05, + "loss": 0.1994, + "step": 11970 + }, + { + "epoch": 1.1443377752304533, + "grad_norm": 0.1041502133011818, + "learning_rate": 4.475555555555555e-05, + "loss": 0.1855, + "step": 11980 + }, + { + "epoch": 1.1452930219229116, + "grad_norm": 0.07793102413415909, + "learning_rate": 4.4607407407407406e-05, + "loss": 0.1913, + "step": 11990 + }, + { + "epoch": 1.1462482686153699, + "grad_norm": 0.07437871396541595, + "learning_rate": 4.445925925925926e-05, + "loss": 0.1959, + "step": 12000 + }, + { + "epoch": 1.1472035153078282, + "grad_norm": 0.12875069677829742, + "learning_rate": 4.431111111111111e-05, + "loss": 0.1988, + "step": 12010 + }, + { + "epoch": 1.1481587620002867, + "grad_norm": 0.15850570797920227, + "learning_rate": 4.4162962962962965e-05, + "loss": 0.1995, + "step": 12020 + }, + { + "epoch": 1.149114008692745, + "grad_norm": 0.10010084509849548, + "learning_rate": 4.401481481481481e-05, + "loss": 0.1858, + "step": 12030 + }, + { + "epoch": 1.1500692553852032, + "grad_norm": 0.07205148786306381, + "learning_rate": 4.3866666666666665e-05, + "loss": 0.1919, + "step": 12040 + }, + { + "epoch": 1.1510245020776615, + "grad_norm": 0.09679614752531052, + "learning_rate": 4.371851851851852e-05, + "loss": 0.1963, + "step": 12050 + }, + { + "epoch": 1.15197974877012, + "grad_norm": 0.10506289452314377, + "learning_rate": 4.357037037037037e-05, + "loss": 0.2001, + "step": 12060 + }, + { + "epoch": 1.1529349954625783, + "grad_norm": 0.16931326687335968, + "learning_rate": 4.3422222222222224e-05, + "loss": 0.2002, + "step": 12070 + }, + { + "epoch": 1.1538902421550365, + "grad_norm": 0.09440700709819794, + "learning_rate": 4.327407407407408e-05, + "loss": 0.1848, + "step": 12080 + }, + { + "epoch": 1.1548454888474948, + "grad_norm": 0.054509177803993225, + "learning_rate": 4.312592592592593e-05, + "loss": 0.1915, + "step": 12090 + }, + { + "epoch": 1.155800735539953, + "grad_norm": 0.07895702868700027, + "learning_rate": 4.2977777777777776e-05, + "loss": 0.1951, + "step": 12100 + }, + { + "epoch": 1.1567559822324116, + "grad_norm": 0.12499396502971649, + "learning_rate": 4.282962962962963e-05, + "loss": 0.1999, + "step": 12110 + }, + { + "epoch": 1.1577112289248699, + "grad_norm": 0.14979098737239838, + "learning_rate": 4.268148148148148e-05, + "loss": 0.2, + "step": 12120 + }, + { + "epoch": 1.1586664756173282, + "grad_norm": 0.10792044550180435, + "learning_rate": 4.2533333333333335e-05, + "loss": 0.1844, + "step": 12130 + }, + { + "epoch": 1.1596217223097864, + "grad_norm": 0.0688839927315712, + "learning_rate": 4.238518518518519e-05, + "loss": 0.1914, + "step": 12140 + }, + { + "epoch": 1.160576969002245, + "grad_norm": 0.06082676351070404, + "learning_rate": 4.223703703703704e-05, + "loss": 0.1945, + "step": 12150 + }, + { + "epoch": 1.1615322156947032, + "grad_norm": 0.1477699875831604, + "learning_rate": 4.208888888888889e-05, + "loss": 0.1997, + "step": 12160 + }, + { + "epoch": 1.1624874623871615, + "grad_norm": 0.1498918980360031, + "learning_rate": 4.194074074074074e-05, + "loss": 0.2003, + "step": 12170 + }, + { + "epoch": 1.1634427090796198, + "grad_norm": 0.1018199622631073, + "learning_rate": 4.179259259259259e-05, + "loss": 0.184, + "step": 12180 + }, + { + "epoch": 1.164397955772078, + "grad_norm": 0.0773182362318039, + "learning_rate": 4.1644444444444446e-05, + "loss": 0.1912, + "step": 12190 + }, + { + "epoch": 1.1653532024645366, + "grad_norm": 0.06646312028169632, + "learning_rate": 4.14962962962963e-05, + "loss": 0.1945, + "step": 12200 + }, + { + "epoch": 1.1663084491569948, + "grad_norm": 0.09444624185562134, + "learning_rate": 4.134814814814815e-05, + "loss": 0.1996, + "step": 12210 + }, + { + "epoch": 1.167263695849453, + "grad_norm": 0.15636619925498962, + "learning_rate": 4.12e-05, + "loss": 0.199, + "step": 12220 + }, + { + "epoch": 1.1682189425419114, + "grad_norm": 0.09470692276954651, + "learning_rate": 4.105185185185185e-05, + "loss": 0.1845, + "step": 12230 + }, + { + "epoch": 1.1691741892343699, + "grad_norm": 0.06418988108634949, + "learning_rate": 4.0903703703703705e-05, + "loss": 0.1904, + "step": 12240 + }, + { + "epoch": 1.1701294359268282, + "grad_norm": 0.07210509479045868, + "learning_rate": 4.075555555555556e-05, + "loss": 0.1948, + "step": 12250 + }, + { + "epoch": 1.1710846826192864, + "grad_norm": 0.11306772381067276, + "learning_rate": 4.060740740740741e-05, + "loss": 0.2002, + "step": 12260 + }, + { + "epoch": 1.1720399293117447, + "grad_norm": 0.14943212270736694, + "learning_rate": 4.0459259259259264e-05, + "loss": 0.1997, + "step": 12270 + }, + { + "epoch": 1.172995176004203, + "grad_norm": 0.09277426451444626, + "learning_rate": 4.031111111111111e-05, + "loss": 0.1831, + "step": 12280 + }, + { + "epoch": 1.1739504226966615, + "grad_norm": 0.06567569822072983, + "learning_rate": 4.016296296296296e-05, + "loss": 0.1907, + "step": 12290 + }, + { + "epoch": 1.1749056693891198, + "grad_norm": 0.06958837062120438, + "learning_rate": 4.0014814814814816e-05, + "loss": 0.1943, + "step": 12300 + }, + { + "epoch": 1.175860916081578, + "grad_norm": 0.11151952296495438, + "learning_rate": 3.986666666666667e-05, + "loss": 0.1992, + "step": 12310 + }, + { + "epoch": 1.1768161627740363, + "grad_norm": 0.16728582978248596, + "learning_rate": 3.971851851851852e-05, + "loss": 0.1994, + "step": 12320 + }, + { + "epoch": 1.1777714094664948, + "grad_norm": 0.08983734250068665, + "learning_rate": 3.9570370370370375e-05, + "loss": 0.1835, + "step": 12330 + }, + { + "epoch": 1.1787266561589531, + "grad_norm": 0.060879047960042953, + "learning_rate": 3.942222222222222e-05, + "loss": 0.1905, + "step": 12340 + }, + { + "epoch": 1.1796819028514114, + "grad_norm": 0.07289200276136398, + "learning_rate": 3.9274074074074074e-05, + "loss": 0.1941, + "step": 12350 + }, + { + "epoch": 1.1806371495438697, + "grad_norm": 0.10434010624885559, + "learning_rate": 3.912592592592593e-05, + "loss": 0.1986, + "step": 12360 + }, + { + "epoch": 1.181592396236328, + "grad_norm": 0.1622079759836197, + "learning_rate": 3.897777777777778e-05, + "loss": 0.1997, + "step": 12370 + }, + { + "epoch": 1.1825476429287864, + "grad_norm": 0.08091533184051514, + "learning_rate": 3.882962962962963e-05, + "loss": 0.183, + "step": 12380 + }, + { + "epoch": 1.1835028896212447, + "grad_norm": 0.07402651757001877, + "learning_rate": 3.8681481481481486e-05, + "loss": 0.1899, + "step": 12390 + }, + { + "epoch": 1.184458136313703, + "grad_norm": 0.07148318737745285, + "learning_rate": 3.853333333333334e-05, + "loss": 0.1946, + "step": 12400 + }, + { + "epoch": 1.1854133830061613, + "grad_norm": 0.11965110898017883, + "learning_rate": 3.8385185185185186e-05, + "loss": 0.1987, + "step": 12410 + }, + { + "epoch": 1.1863686296986198, + "grad_norm": 0.1528114378452301, + "learning_rate": 3.823703703703704e-05, + "loss": 0.2001, + "step": 12420 + }, + { + "epoch": 1.187323876391078, + "grad_norm": 0.09836557507514954, + "learning_rate": 3.808888888888889e-05, + "loss": 0.1829, + "step": 12430 + }, + { + "epoch": 1.1882791230835363, + "grad_norm": 0.060973864048719406, + "learning_rate": 3.7940740740740745e-05, + "loss": 0.1897, + "step": 12440 + }, + { + "epoch": 1.1892343697759946, + "grad_norm": 0.08469880372285843, + "learning_rate": 3.77925925925926e-05, + "loss": 0.194, + "step": 12450 + }, + { + "epoch": 1.190189616468453, + "grad_norm": 0.10124852508306503, + "learning_rate": 3.764444444444445e-05, + "loss": 0.1988, + "step": 12460 + }, + { + "epoch": 1.1911448631609114, + "grad_norm": 0.15043075382709503, + "learning_rate": 3.74962962962963e-05, + "loss": 0.1997, + "step": 12470 + }, + { + "epoch": 1.1921001098533697, + "grad_norm": 0.09942149370908737, + "learning_rate": 3.734814814814815e-05, + "loss": 0.1831, + "step": 12480 + }, + { + "epoch": 1.193055356545828, + "grad_norm": 0.06965496391057968, + "learning_rate": 3.72e-05, + "loss": 0.19, + "step": 12490 + }, + { + "epoch": 1.1940106032382862, + "grad_norm": 0.06517937779426575, + "learning_rate": 3.7051851851851856e-05, + "loss": 0.1942, + "step": 12500 + }, + { + "epoch": 1.1949658499307447, + "grad_norm": 0.1225384771823883, + "learning_rate": 3.690370370370371e-05, + "loss": 0.1992, + "step": 12510 + }, + { + "epoch": 1.195921096623203, + "grad_norm": 0.15679340064525604, + "learning_rate": 3.675555555555556e-05, + "loss": 0.1994, + "step": 12520 + }, + { + "epoch": 1.1968763433156613, + "grad_norm": 0.09618114680051804, + "learning_rate": 3.660740740740741e-05, + "loss": 0.1825, + "step": 12530 + }, + { + "epoch": 1.1978315900081196, + "grad_norm": 0.05807056650519371, + "learning_rate": 3.645925925925926e-05, + "loss": 0.1901, + "step": 12540 + }, + { + "epoch": 1.1987868367005778, + "grad_norm": 0.0703585296869278, + "learning_rate": 3.6311111111111114e-05, + "loss": 0.1947, + "step": 12550 + }, + { + "epoch": 1.1997420833930363, + "grad_norm": 0.13773010671138763, + "learning_rate": 3.616296296296297e-05, + "loss": 0.1991, + "step": 12560 + }, + { + "epoch": 1.2006973300854946, + "grad_norm": 0.15620911121368408, + "learning_rate": 3.601481481481482e-05, + "loss": 0.1989, + "step": 12570 + }, + { + "epoch": 1.201652576777953, + "grad_norm": 0.08928284794092178, + "learning_rate": 3.586666666666667e-05, + "loss": 0.1831, + "step": 12580 + }, + { + "epoch": 1.2026078234704112, + "grad_norm": 0.06567124277353287, + "learning_rate": 3.571851851851852e-05, + "loss": 0.1912, + "step": 12590 + }, + { + "epoch": 1.2035630701628697, + "grad_norm": 0.06833696365356445, + "learning_rate": 3.557037037037037e-05, + "loss": 0.1942, + "step": 12600 + }, + { + "epoch": 1.204518316855328, + "grad_norm": 0.12125487625598907, + "learning_rate": 3.5422222222222226e-05, + "loss": 0.1991, + "step": 12610 + }, + { + "epoch": 1.2054735635477862, + "grad_norm": 0.1656818985939026, + "learning_rate": 3.527407407407408e-05, + "loss": 0.1985, + "step": 12620 + }, + { + "epoch": 1.2064288102402445, + "grad_norm": 0.10647820681333542, + "learning_rate": 3.512592592592593e-05, + "loss": 0.1831, + "step": 12630 + }, + { + "epoch": 1.2073840569327028, + "grad_norm": 0.06365940719842911, + "learning_rate": 3.4977777777777785e-05, + "loss": 0.1909, + "step": 12640 + }, + { + "epoch": 1.2083393036251613, + "grad_norm": 0.06544508039951324, + "learning_rate": 3.482962962962963e-05, + "loss": 0.1946, + "step": 12650 + }, + { + "epoch": 1.2092945503176196, + "grad_norm": 0.08825177699327469, + "learning_rate": 3.4681481481481484e-05, + "loss": 0.1983, + "step": 12660 + }, + { + "epoch": 1.2102497970100778, + "grad_norm": 0.15609724819660187, + "learning_rate": 3.453333333333334e-05, + "loss": 0.1986, + "step": 12670 + }, + { + "epoch": 1.2112050437025361, + "grad_norm": 0.09403195232152939, + "learning_rate": 3.438518518518519e-05, + "loss": 0.1822, + "step": 12680 + }, + { + "epoch": 1.2121602903949946, + "grad_norm": 0.055547330528497696, + "learning_rate": 3.423703703703704e-05, + "loss": 0.19, + "step": 12690 + }, + { + "epoch": 1.213115537087453, + "grad_norm": 0.09445223212242126, + "learning_rate": 3.408888888888889e-05, + "loss": 0.1942, + "step": 12700 + }, + { + "epoch": 1.2140707837799112, + "grad_norm": 0.10912507027387619, + "learning_rate": 3.394074074074074e-05, + "loss": 0.1982, + "step": 12710 + }, + { + "epoch": 1.2150260304723695, + "grad_norm": 0.15998390316963196, + "learning_rate": 3.3792592592592595e-05, + "loss": 0.1989, + "step": 12720 + }, + { + "epoch": 1.2159812771648277, + "grad_norm": 0.10185787081718445, + "learning_rate": 3.364444444444445e-05, + "loss": 0.1814, + "step": 12730 + }, + { + "epoch": 1.2169365238572862, + "grad_norm": 0.07232099026441574, + "learning_rate": 3.3496296296296295e-05, + "loss": 0.1895, + "step": 12740 + }, + { + "epoch": 1.2178917705497445, + "grad_norm": 0.07367640733718872, + "learning_rate": 3.334814814814815e-05, + "loss": 0.1937, + "step": 12750 + }, + { + "epoch": 1.2188470172422028, + "grad_norm": 0.09808887541294098, + "learning_rate": 3.32e-05, + "loss": 0.1977, + "step": 12760 + }, + { + "epoch": 1.219802263934661, + "grad_norm": 0.1700250655412674, + "learning_rate": 3.3051851851851854e-05, + "loss": 0.199, + "step": 12770 + }, + { + "epoch": 1.2207575106271196, + "grad_norm": 0.0908031091094017, + "learning_rate": 3.29037037037037e-05, + "loss": 0.1835, + "step": 12780 + }, + { + "epoch": 1.2217127573195778, + "grad_norm": 0.06603245437145233, + "learning_rate": 3.275555555555555e-05, + "loss": 0.1895, + "step": 12790 + }, + { + "epoch": 1.2226680040120361, + "grad_norm": 0.05788644403219223, + "learning_rate": 3.2607407407407406e-05, + "loss": 0.1934, + "step": 12800 + }, + { + "epoch": 1.2236232507044944, + "grad_norm": 0.10153844207525253, + "learning_rate": 3.245925925925926e-05, + "loss": 0.1989, + "step": 12810 + }, + { + "epoch": 1.2245784973969527, + "grad_norm": 0.15934138000011444, + "learning_rate": 3.231111111111111e-05, + "loss": 0.1984, + "step": 12820 + }, + { + "epoch": 1.2255337440894112, + "grad_norm": 0.08226645737886429, + "learning_rate": 3.2162962962962965e-05, + "loss": 0.1824, + "step": 12830 + }, + { + "epoch": 1.2264889907818695, + "grad_norm": 0.06215713173151016, + "learning_rate": 3.201481481481481e-05, + "loss": 0.1892, + "step": 12840 + }, + { + "epoch": 1.2274442374743277, + "grad_norm": 0.08781886845827103, + "learning_rate": 3.1866666666666664e-05, + "loss": 0.1936, + "step": 12850 + }, + { + "epoch": 1.228399484166786, + "grad_norm": 0.09996737539768219, + "learning_rate": 3.171851851851852e-05, + "loss": 0.1986, + "step": 12860 + }, + { + "epoch": 1.2293547308592445, + "grad_norm": 0.15390396118164062, + "learning_rate": 3.157037037037037e-05, + "loss": 0.1984, + "step": 12870 + }, + { + "epoch": 1.2303099775517028, + "grad_norm": 0.09596038609743118, + "learning_rate": 3.142222222222222e-05, + "loss": 0.1825, + "step": 12880 + }, + { + "epoch": 1.231265224244161, + "grad_norm": 0.06414441019296646, + "learning_rate": 3.1274074074074076e-05, + "loss": 0.1898, + "step": 12890 + }, + { + "epoch": 1.2322204709366193, + "grad_norm": 0.0710066705942154, + "learning_rate": 3.112592592592592e-05, + "loss": 0.1934, + "step": 12900 + }, + { + "epoch": 1.2331757176290776, + "grad_norm": 0.1029692143201828, + "learning_rate": 3.0977777777777776e-05, + "loss": 0.1981, + "step": 12910 + }, + { + "epoch": 1.2341309643215361, + "grad_norm": 0.15667466819286346, + "learning_rate": 3.082962962962963e-05, + "loss": 0.1997, + "step": 12920 + }, + { + "epoch": 1.2350862110139944, + "grad_norm": 0.09223178774118423, + "learning_rate": 3.068148148148148e-05, + "loss": 0.1822, + "step": 12930 + }, + { + "epoch": 1.2360414577064527, + "grad_norm": 0.06726747006177902, + "learning_rate": 3.0533333333333335e-05, + "loss": 0.1898, + "step": 12940 + }, + { + "epoch": 1.236996704398911, + "grad_norm": 0.07996299117803574, + "learning_rate": 3.0385185185185188e-05, + "loss": 0.1936, + "step": 12950 + }, + { + "epoch": 1.2379519510913695, + "grad_norm": 0.10410148650407791, + "learning_rate": 3.0237037037037037e-05, + "loss": 0.199, + "step": 12960 + }, + { + "epoch": 1.2389071977838277, + "grad_norm": 0.15508505702018738, + "learning_rate": 3.008888888888889e-05, + "loss": 0.1992, + "step": 12970 + }, + { + "epoch": 1.239862444476286, + "grad_norm": 0.10372573882341385, + "learning_rate": 2.994074074074074e-05, + "loss": 0.1822, + "step": 12980 + }, + { + "epoch": 1.2408176911687443, + "grad_norm": 0.061683837324380875, + "learning_rate": 2.9792592592592593e-05, + "loss": 0.1893, + "step": 12990 + }, + { + "epoch": 1.2417729378612026, + "grad_norm": 0.05991368368268013, + "learning_rate": 2.9644444444444446e-05, + "loss": 0.1934, + "step": 13000 + }, + { + "epoch": 1.242728184553661, + "grad_norm": 0.11899848282337189, + "learning_rate": 2.94962962962963e-05, + "loss": 0.1986, + "step": 13010 + }, + { + "epoch": 1.2436834312461194, + "grad_norm": 0.15275335311889648, + "learning_rate": 2.9348148148148145e-05, + "loss": 0.2002, + "step": 13020 + }, + { + "epoch": 1.2446386779385776, + "grad_norm": 0.08578670769929886, + "learning_rate": 2.9199999999999998e-05, + "loss": 0.1822, + "step": 13030 + }, + { + "epoch": 1.245593924631036, + "grad_norm": 0.06400281190872192, + "learning_rate": 2.905185185185185e-05, + "loss": 0.1895, + "step": 13040 + }, + { + "epoch": 1.2465491713234944, + "grad_norm": 0.07122600823640823, + "learning_rate": 2.8903703703703704e-05, + "loss": 0.1935, + "step": 13050 + }, + { + "epoch": 1.2475044180159527, + "grad_norm": 0.11315450817346573, + "learning_rate": 2.8755555555555557e-05, + "loss": 0.1979, + "step": 13060 + }, + { + "epoch": 1.248459664708411, + "grad_norm": 0.1562516689300537, + "learning_rate": 2.860740740740741e-05, + "loss": 0.2003, + "step": 13070 + }, + { + "epoch": 1.2494149114008692, + "grad_norm": 0.0909615084528923, + "learning_rate": 2.8459259259259263e-05, + "loss": 0.1828, + "step": 13080 + }, + { + "epoch": 1.2503701580933275, + "grad_norm": 0.056711986660957336, + "learning_rate": 2.831111111111111e-05, + "loss": 0.19, + "step": 13090 + }, + { + "epoch": 1.2513254047857858, + "grad_norm": 0.06463145464658737, + "learning_rate": 2.8162962962962963e-05, + "loss": 0.1934, + "step": 13100 + }, + { + "epoch": 1.2522806514782443, + "grad_norm": 0.09779531508684158, + "learning_rate": 2.8014814814814816e-05, + "loss": 0.1988, + "step": 13110 + }, + { + "epoch": 1.2532358981707026, + "grad_norm": 0.15403911471366882, + "learning_rate": 2.786666666666667e-05, + "loss": 0.1996, + "step": 13120 + }, + { + "epoch": 1.2541911448631609, + "grad_norm": 0.1008806899189949, + "learning_rate": 2.771851851851852e-05, + "loss": 0.1827, + "step": 13130 + }, + { + "epoch": 1.2551463915556194, + "grad_norm": 0.06381751596927643, + "learning_rate": 2.7570370370370375e-05, + "loss": 0.1892, + "step": 13140 + }, + { + "epoch": 1.2561016382480776, + "grad_norm": 0.06516945362091064, + "learning_rate": 2.742222222222222e-05, + "loss": 0.193, + "step": 13150 + }, + { + "epoch": 1.257056884940536, + "grad_norm": 0.10235823690891266, + "learning_rate": 2.7274074074074074e-05, + "loss": 0.1981, + "step": 13160 + }, + { + "epoch": 1.2580121316329942, + "grad_norm": 0.1551736742258072, + "learning_rate": 2.7125925925925927e-05, + "loss": 0.1989, + "step": 13170 + }, + { + "epoch": 1.2589673783254525, + "grad_norm": 0.09147223085165024, + "learning_rate": 2.697777777777778e-05, + "loss": 0.1816, + "step": 13180 + }, + { + "epoch": 1.2599226250179107, + "grad_norm": 0.07331864535808563, + "learning_rate": 2.6829629629629633e-05, + "loss": 0.1895, + "step": 13190 + }, + { + "epoch": 1.2608778717103692, + "grad_norm": 0.0740872323513031, + "learning_rate": 2.6681481481481486e-05, + "loss": 0.1934, + "step": 13200 + }, + { + "epoch": 1.2618331184028275, + "grad_norm": 0.1049848198890686, + "learning_rate": 2.6533333333333332e-05, + "loss": 0.1984, + "step": 13210 + }, + { + "epoch": 1.2627883650952858, + "grad_norm": 0.15695710480213165, + "learning_rate": 2.6385185185185185e-05, + "loss": 0.1997, + "step": 13220 + }, + { + "epoch": 1.2637436117877443, + "grad_norm": 0.09660939872264862, + "learning_rate": 2.623703703703704e-05, + "loss": 0.1815, + "step": 13230 + }, + { + "epoch": 1.2646988584802026, + "grad_norm": 0.0635804608464241, + "learning_rate": 2.608888888888889e-05, + "loss": 0.1891, + "step": 13240 + }, + { + "epoch": 1.2656541051726609, + "grad_norm": 0.08403673022985458, + "learning_rate": 2.5940740740740744e-05, + "loss": 0.1933, + "step": 13250 + }, + { + "epoch": 1.2666093518651191, + "grad_norm": 0.08146359026432037, + "learning_rate": 2.5792592592592597e-05, + "loss": 0.1987, + "step": 13260 + }, + { + "epoch": 1.2675645985575774, + "grad_norm": 0.14868749678134918, + "learning_rate": 2.5644444444444444e-05, + "loss": 0.1998, + "step": 13270 + }, + { + "epoch": 1.2685198452500357, + "grad_norm": 0.09479347616434097, + "learning_rate": 2.5496296296296297e-05, + "loss": 0.1817, + "step": 13280 + }, + { + "epoch": 1.2694750919424942, + "grad_norm": 0.059485744684934616, + "learning_rate": 2.534814814814815e-05, + "loss": 0.1893, + "step": 13290 + }, + { + "epoch": 1.2704303386349525, + "grad_norm": 0.07404431700706482, + "learning_rate": 2.5200000000000003e-05, + "loss": 0.1936, + "step": 13300 + }, + { + "epoch": 1.2713855853274107, + "grad_norm": 0.1138007789850235, + "learning_rate": 2.5051851851851856e-05, + "loss": 0.1984, + "step": 13310 + }, + { + "epoch": 1.2723408320198693, + "grad_norm": 0.15096613764762878, + "learning_rate": 2.4903703703703705e-05, + "loss": 0.1983, + "step": 13320 + }, + { + "epoch": 1.2732960787123275, + "grad_norm": 0.09422920644283295, + "learning_rate": 2.475555555555556e-05, + "loss": 0.1814, + "step": 13330 + }, + { + "epoch": 1.2742513254047858, + "grad_norm": 0.07392732799053192, + "learning_rate": 2.4607407407407408e-05, + "loss": 0.1891, + "step": 13340 + }, + { + "epoch": 1.275206572097244, + "grad_norm": 0.07089727371931076, + "learning_rate": 2.445925925925926e-05, + "loss": 0.1934, + "step": 13350 + }, + { + "epoch": 1.2761618187897024, + "grad_norm": 0.11914543807506561, + "learning_rate": 2.431111111111111e-05, + "loss": 0.1984, + "step": 13360 + }, + { + "epoch": 1.2771170654821606, + "grad_norm": 0.15955843031406403, + "learning_rate": 2.4162962962962964e-05, + "loss": 0.2002, + "step": 13370 + }, + { + "epoch": 1.2780723121746191, + "grad_norm": 0.09782761335372925, + "learning_rate": 2.4014814814814817e-05, + "loss": 0.1818, + "step": 13380 + }, + { + "epoch": 1.2790275588670774, + "grad_norm": 0.07677102088928223, + "learning_rate": 2.3866666666666666e-05, + "loss": 0.189, + "step": 13390 + }, + { + "epoch": 1.2799828055595357, + "grad_norm": 0.071205273270607, + "learning_rate": 2.371851851851852e-05, + "loss": 0.1924, + "step": 13400 + }, + { + "epoch": 1.2809380522519942, + "grad_norm": 0.09079116582870483, + "learning_rate": 2.357037037037037e-05, + "loss": 0.1978, + "step": 13410 + }, + { + "epoch": 1.2818932989444525, + "grad_norm": 0.17159642279148102, + "learning_rate": 2.3422222222222222e-05, + "loss": 0.198, + "step": 13420 + }, + { + "epoch": 1.2828485456369108, + "grad_norm": 0.09026908874511719, + "learning_rate": 2.3274074074074075e-05, + "loss": 0.1808, + "step": 13430 + }, + { + "epoch": 1.283803792329369, + "grad_norm": 0.0792994573712349, + "learning_rate": 2.3125925925925925e-05, + "loss": 0.1886, + "step": 13440 + }, + { + "epoch": 1.2847590390218273, + "grad_norm": 0.06254783272743225, + "learning_rate": 2.2977777777777778e-05, + "loss": 0.1928, + "step": 13450 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.11604833602905273, + "learning_rate": 2.282962962962963e-05, + "loss": 0.1983, + "step": 13460 + }, + { + "epoch": 1.286669532406744, + "grad_norm": 0.14751599729061127, + "learning_rate": 2.268148148148148e-05, + "loss": 0.1977, + "step": 13470 + }, + { + "epoch": 1.2876247790992024, + "grad_norm": 0.07771807163953781, + "learning_rate": 2.2533333333333333e-05, + "loss": 0.1817, + "step": 13480 + }, + { + "epoch": 1.2885800257916606, + "grad_norm": 0.06615254282951355, + "learning_rate": 2.2385185185185186e-05, + "loss": 0.1896, + "step": 13490 + }, + { + "epoch": 1.2895352724841191, + "grad_norm": 0.058883316814899445, + "learning_rate": 2.2237037037037036e-05, + "loss": 0.1932, + "step": 13500 + }, + { + "epoch": 1.2904905191765774, + "grad_norm": 0.10671798884868622, + "learning_rate": 2.208888888888889e-05, + "loss": 0.1987, + "step": 13510 + }, + { + "epoch": 1.2914457658690357, + "grad_norm": 0.14683255553245544, + "learning_rate": 2.1940740740740742e-05, + "loss": 0.1997, + "step": 13520 + }, + { + "epoch": 1.292401012561494, + "grad_norm": 0.09709542244672775, + "learning_rate": 2.1792592592592595e-05, + "loss": 0.1821, + "step": 13530 + }, + { + "epoch": 1.2933562592539523, + "grad_norm": 0.06401824951171875, + "learning_rate": 2.1644444444444445e-05, + "loss": 0.1891, + "step": 13540 + }, + { + "epoch": 1.2943115059464105, + "grad_norm": 0.07614504545927048, + "learning_rate": 2.1496296296296298e-05, + "loss": 0.1935, + "step": 13550 + }, + { + "epoch": 1.295266752638869, + "grad_norm": 0.10084769874811172, + "learning_rate": 2.134814814814815e-05, + "loss": 0.1986, + "step": 13560 + }, + { + "epoch": 1.2962219993313273, + "grad_norm": 0.1459190398454666, + "learning_rate": 2.12e-05, + "loss": 0.2, + "step": 13570 + }, + { + "epoch": 1.2971772460237856, + "grad_norm": 0.08136285096406937, + "learning_rate": 2.1051851851851853e-05, + "loss": 0.1815, + "step": 13580 + }, + { + "epoch": 1.298132492716244, + "grad_norm": 0.06184697896242142, + "learning_rate": 2.0903703703703706e-05, + "loss": 0.1892, + "step": 13590 + }, + { + "epoch": 1.2990877394087024, + "grad_norm": 0.06824415922164917, + "learning_rate": 2.0755555555555556e-05, + "loss": 0.1926, + "step": 13600 + }, + { + "epoch": 1.3000429861011606, + "grad_norm": 0.10083822160959244, + "learning_rate": 2.060740740740741e-05, + "loss": 0.1972, + "step": 13610 + }, + { + "epoch": 1.300998232793619, + "grad_norm": 0.15515734255313873, + "learning_rate": 2.0459259259259262e-05, + "loss": 0.1991, + "step": 13620 + }, + { + "epoch": 1.3019534794860772, + "grad_norm": 0.09369368106126785, + "learning_rate": 2.031111111111111e-05, + "loss": 0.1814, + "step": 13630 + }, + { + "epoch": 1.3029087261785355, + "grad_norm": 0.06133545935153961, + "learning_rate": 2.0162962962962965e-05, + "loss": 0.189, + "step": 13640 + }, + { + "epoch": 1.303863972870994, + "grad_norm": 0.06848432868719101, + "learning_rate": 2.0014814814814818e-05, + "loss": 0.1927, + "step": 13650 + }, + { + "epoch": 1.3048192195634523, + "grad_norm": 0.10023056715726852, + "learning_rate": 1.9866666666666667e-05, + "loss": 0.1973, + "step": 13660 + }, + { + "epoch": 1.3057744662559105, + "grad_norm": 0.16171331703662872, + "learning_rate": 1.971851851851852e-05, + "loss": 0.1983, + "step": 13670 + }, + { + "epoch": 1.306729712948369, + "grad_norm": 0.09014260023832321, + "learning_rate": 1.9570370370370373e-05, + "loss": 0.1823, + "step": 13680 + }, + { + "epoch": 1.3076849596408273, + "grad_norm": 0.0606960766017437, + "learning_rate": 1.9422222222222223e-05, + "loss": 0.1888, + "step": 13690 + }, + { + "epoch": 1.3086402063332856, + "grad_norm": 0.07925919443368912, + "learning_rate": 1.9274074074074076e-05, + "loss": 0.1935, + "step": 13700 + }, + { + "epoch": 1.3095954530257439, + "grad_norm": 0.10734547674655914, + "learning_rate": 1.912592592592593e-05, + "loss": 0.1985, + "step": 13710 + }, + { + "epoch": 1.3105506997182022, + "grad_norm": 0.15949596464633942, + "learning_rate": 1.897777777777778e-05, + "loss": 0.199, + "step": 13720 + }, + { + "epoch": 1.3115059464106604, + "grad_norm": 0.07958906143903732, + "learning_rate": 1.882962962962963e-05, + "loss": 0.1811, + "step": 13730 + }, + { + "epoch": 1.312461193103119, + "grad_norm": 0.06587712466716766, + "learning_rate": 1.8681481481481485e-05, + "loss": 0.1894, + "step": 13740 + }, + { + "epoch": 1.3134164397955772, + "grad_norm": 0.0697634294629097, + "learning_rate": 1.8533333333333334e-05, + "loss": 0.1931, + "step": 13750 + }, + { + "epoch": 1.3143716864880355, + "grad_norm": 0.08874509483575821, + "learning_rate": 1.8385185185185187e-05, + "loss": 0.1985, + "step": 13760 + }, + { + "epoch": 1.315326933180494, + "grad_norm": 0.1566840559244156, + "learning_rate": 1.8237037037037037e-05, + "loss": 0.1992, + "step": 13770 + }, + { + "epoch": 1.3162821798729523, + "grad_norm": 0.08778232336044312, + "learning_rate": 1.808888888888889e-05, + "loss": 0.1814, + "step": 13780 + }, + { + "epoch": 1.3172374265654105, + "grad_norm": 0.06845912337303162, + "learning_rate": 1.794074074074074e-05, + "loss": 0.189, + "step": 13790 + }, + { + "epoch": 1.3181926732578688, + "grad_norm": 0.07066404074430466, + "learning_rate": 1.7792592592592593e-05, + "loss": 0.1932, + "step": 13800 + }, + { + "epoch": 1.319147919950327, + "grad_norm": 0.10193174332380295, + "learning_rate": 1.7644444444444446e-05, + "loss": 0.1978, + "step": 13810 + }, + { + "epoch": 1.3201031666427854, + "grad_norm": 0.14859530329704285, + "learning_rate": 1.7496296296296295e-05, + "loss": 0.1987, + "step": 13820 + }, + { + "epoch": 1.3210584133352439, + "grad_norm": 0.07397311180830002, + "learning_rate": 1.734814814814815e-05, + "loss": 0.1816, + "step": 13830 + }, + { + "epoch": 1.3220136600277022, + "grad_norm": 0.07166632264852524, + "learning_rate": 1.7199999999999998e-05, + "loss": 0.1887, + "step": 13840 + }, + { + "epoch": 1.3229689067201604, + "grad_norm": 0.06838031858205795, + "learning_rate": 1.705185185185185e-05, + "loss": 0.1932, + "step": 13850 + }, + { + "epoch": 1.323924153412619, + "grad_norm": 0.08447981625795364, + "learning_rate": 1.6903703703703704e-05, + "loss": 0.1973, + "step": 13860 + }, + { + "epoch": 1.3248794001050772, + "grad_norm": 0.16111405193805695, + "learning_rate": 1.6755555555555557e-05, + "loss": 0.1982, + "step": 13870 + }, + { + "epoch": 1.3258346467975355, + "grad_norm": 0.11569291353225708, + "learning_rate": 1.6607407407407407e-05, + "loss": 0.1804, + "step": 13880 + }, + { + "epoch": 1.3267898934899938, + "grad_norm": 0.05849316716194153, + "learning_rate": 1.645925925925926e-05, + "loss": 0.1884, + "step": 13890 + }, + { + "epoch": 1.327745140182452, + "grad_norm": 0.07721465080976486, + "learning_rate": 1.6311111111111113e-05, + "loss": 0.193, + "step": 13900 + }, + { + "epoch": 1.3287003868749103, + "grad_norm": 0.10829592496156693, + "learning_rate": 1.6162962962962962e-05, + "loss": 0.1978, + "step": 13910 + }, + { + "epoch": 1.3296556335673688, + "grad_norm": 0.15014564990997314, + "learning_rate": 1.6014814814814815e-05, + "loss": 0.1988, + "step": 13920 + }, + { + "epoch": 1.330610880259827, + "grad_norm": 0.09521475434303284, + "learning_rate": 1.586666666666667e-05, + "loss": 0.1813, + "step": 13930 + }, + { + "epoch": 1.3315661269522854, + "grad_norm": 0.05856531485915184, + "learning_rate": 1.5718518518518518e-05, + "loss": 0.1887, + "step": 13940 + }, + { + "epoch": 1.3325213736447439, + "grad_norm": 0.07324463874101639, + "learning_rate": 1.557037037037037e-05, + "loss": 0.1926, + "step": 13950 + }, + { + "epoch": 1.3334766203372022, + "grad_norm": 0.09733791649341583, + "learning_rate": 1.5422222222222224e-05, + "loss": 0.1981, + "step": 13960 + }, + { + "epoch": 1.3344318670296604, + "grad_norm": 0.1468607783317566, + "learning_rate": 1.5274074074074074e-05, + "loss": 0.1988, + "step": 13970 + }, + { + "epoch": 1.3353871137221187, + "grad_norm": 0.07646271586418152, + "learning_rate": 1.5125925925925927e-05, + "loss": 0.1805, + "step": 13980 + }, + { + "epoch": 1.336342360414577, + "grad_norm": 0.06440643221139908, + "learning_rate": 1.497777777777778e-05, + "loss": 0.189, + "step": 13990 + }, + { + "epoch": 1.3372976071070353, + "grad_norm": 0.0725795105099678, + "learning_rate": 1.482962962962963e-05, + "loss": 0.1927, + "step": 14000 + }, + { + "epoch": 1.3382528537994938, + "grad_norm": 0.11663739383220673, + "learning_rate": 1.4681481481481482e-05, + "loss": 0.1981, + "step": 14010 + }, + { + "epoch": 1.339208100491952, + "grad_norm": 0.15067516267299652, + "learning_rate": 1.4533333333333335e-05, + "loss": 0.1973, + "step": 14020 + }, + { + "epoch": 1.3401633471844103, + "grad_norm": 0.0837399959564209, + "learning_rate": 1.4385185185185185e-05, + "loss": 0.1805, + "step": 14030 + }, + { + "epoch": 1.3411185938768688, + "grad_norm": 0.06571058183908463, + "learning_rate": 1.4237037037037038e-05, + "loss": 0.1884, + "step": 14040 + }, + { + "epoch": 1.342073840569327, + "grad_norm": 0.07058189064264297, + "learning_rate": 1.4088888888888891e-05, + "loss": 0.1924, + "step": 14050 + }, + { + "epoch": 1.3430290872617854, + "grad_norm": 0.09456133842468262, + "learning_rate": 1.394074074074074e-05, + "loss": 0.1976, + "step": 14060 + }, + { + "epoch": 1.3439843339542437, + "grad_norm": 0.15115611255168915, + "learning_rate": 1.3792592592592594e-05, + "loss": 0.1987, + "step": 14070 + }, + { + "epoch": 1.344939580646702, + "grad_norm": 0.07672551274299622, + "learning_rate": 1.3644444444444445e-05, + "loss": 0.1801, + "step": 14080 + }, + { + "epoch": 1.3458948273391602, + "grad_norm": 0.06534221768379211, + "learning_rate": 1.3496296296296296e-05, + "loss": 0.1883, + "step": 14090 + }, + { + "epoch": 1.3468500740316187, + "grad_norm": 0.06814494729042053, + "learning_rate": 1.334814814814815e-05, + "loss": 0.1929, + "step": 14100 + }, + { + "epoch": 1.347805320724077, + "grad_norm": 0.09241555631160736, + "learning_rate": 1.32e-05, + "loss": 0.1977, + "step": 14110 + }, + { + "epoch": 1.3487605674165353, + "grad_norm": 0.1599811464548111, + "learning_rate": 1.3051851851851852e-05, + "loss": 0.1982, + "step": 14120 + }, + { + "epoch": 1.3497158141089938, + "grad_norm": 0.1026700884103775, + "learning_rate": 1.2903703703703703e-05, + "loss": 0.1806, + "step": 14130 + }, + { + "epoch": 1.350671060801452, + "grad_norm": 0.06195655092597008, + "learning_rate": 1.2755555555555556e-05, + "loss": 0.1878, + "step": 14140 + }, + { + "epoch": 1.3516263074939103, + "grad_norm": 0.0745544582605362, + "learning_rate": 1.2607407407407406e-05, + "loss": 0.1929, + "step": 14150 + }, + { + "epoch": 1.3525815541863686, + "grad_norm": 0.10045164078474045, + "learning_rate": 1.2459259259259259e-05, + "loss": 0.1983, + "step": 14160 + }, + { + "epoch": 1.3535368008788269, + "grad_norm": 0.15764273703098297, + "learning_rate": 1.2311111111111112e-05, + "loss": 0.198, + "step": 14170 + }, + { + "epoch": 1.3544920475712852, + "grad_norm": 0.08975204080343246, + "learning_rate": 1.2162962962962963e-05, + "loss": 0.1809, + "step": 14180 + }, + { + "epoch": 1.3554472942637437, + "grad_norm": 0.06544684618711472, + "learning_rate": 1.2014814814814815e-05, + "loss": 0.1888, + "step": 14190 + }, + { + "epoch": 1.356402540956202, + "grad_norm": 0.07405107468366623, + "learning_rate": 1.1866666666666668e-05, + "loss": 0.1931, + "step": 14200 + }, + { + "epoch": 1.3573577876486602, + "grad_norm": 0.10317433625459671, + "learning_rate": 1.1718518518518519e-05, + "loss": 0.1977, + "step": 14210 + }, + { + "epoch": 1.3583130343411187, + "grad_norm": 0.14299127459526062, + "learning_rate": 1.157037037037037e-05, + "loss": 0.1988, + "step": 14220 + }, + { + "epoch": 1.359268281033577, + "grad_norm": 0.08238115906715393, + "learning_rate": 1.1422222222222223e-05, + "loss": 0.1796, + "step": 14230 + }, + { + "epoch": 1.3602235277260353, + "grad_norm": 0.07125524431467056, + "learning_rate": 1.1274074074074075e-05, + "loss": 0.1879, + "step": 14240 + }, + { + "epoch": 1.3611787744184936, + "grad_norm": 0.06334027647972107, + "learning_rate": 1.1125925925925928e-05, + "loss": 0.1932, + "step": 14250 + }, + { + "epoch": 1.3621340211109518, + "grad_norm": 0.1071334108710289, + "learning_rate": 1.0977777777777779e-05, + "loss": 0.1976, + "step": 14260 + }, + { + "epoch": 1.36308926780341, + "grad_norm": 0.14405353367328644, + "learning_rate": 1.082962962962963e-05, + "loss": 0.1973, + "step": 14270 + }, + { + "epoch": 1.3640445144958686, + "grad_norm": 0.08287125825881958, + "learning_rate": 1.0681481481481483e-05, + "loss": 0.1805, + "step": 14280 + }, + { + "epoch": 1.3649997611883269, + "grad_norm": 0.061365820467472076, + "learning_rate": 1.0533333333333335e-05, + "loss": 0.1885, + "step": 14290 + }, + { + "epoch": 1.3659550078807852, + "grad_norm": 0.06179488077759743, + "learning_rate": 1.0385185185185186e-05, + "loss": 0.193, + "step": 14300 + }, + { + "epoch": 1.3669102545732437, + "grad_norm": 0.10180076956748962, + "learning_rate": 1.0237037037037037e-05, + "loss": 0.1974, + "step": 14310 + }, + { + "epoch": 1.367865501265702, + "grad_norm": 0.1490979641675949, + "learning_rate": 1.0088888888888889e-05, + "loss": 0.1988, + "step": 14320 + }, + { + "epoch": 1.3688207479581602, + "grad_norm": 0.10851076245307922, + "learning_rate": 9.94074074074074e-06, + "loss": 0.1805, + "step": 14330 + }, + { + "epoch": 1.3697759946506185, + "grad_norm": 0.060650117695331573, + "learning_rate": 9.792592592592593e-06, + "loss": 0.189, + "step": 14340 + }, + { + "epoch": 1.3707312413430768, + "grad_norm": 0.06470511853694916, + "learning_rate": 9.644444444444444e-06, + "loss": 0.1936, + "step": 14350 + }, + { + "epoch": 1.371686488035535, + "grad_norm": 0.11010751873254776, + "learning_rate": 9.496296296296296e-06, + "loss": 0.1982, + "step": 14360 + }, + { + "epoch": 1.3726417347279936, + "grad_norm": 0.15772590041160583, + "learning_rate": 9.348148148148149e-06, + "loss": 0.1997, + "step": 14370 + }, + { + "epoch": 1.3735969814204518, + "grad_norm": 0.07963084429502487, + "learning_rate": 9.2e-06, + "loss": 0.1803, + "step": 14380 + }, + { + "epoch": 1.3745522281129101, + "grad_norm": 0.07517506927251816, + "learning_rate": 9.051851851851851e-06, + "loss": 0.1882, + "step": 14390 + }, + { + "epoch": 1.3755074748053686, + "grad_norm": 0.08012760430574417, + "learning_rate": 8.903703703703704e-06, + "loss": 0.193, + "step": 14400 + }, + { + "epoch": 1.376462721497827, + "grad_norm": 0.11921060085296631, + "learning_rate": 8.755555555555556e-06, + "loss": 0.1984, + "step": 14410 + }, + { + "epoch": 1.3774179681902852, + "grad_norm": 0.15329593420028687, + "learning_rate": 8.607407407407409e-06, + "loss": 0.1983, + "step": 14420 + }, + { + "epoch": 1.3783732148827434, + "grad_norm": 0.0893860012292862, + "learning_rate": 8.45925925925926e-06, + "loss": 0.1806, + "step": 14430 + }, + { + "epoch": 1.3793284615752017, + "grad_norm": 0.0695950984954834, + "learning_rate": 8.311111111111111e-06, + "loss": 0.1888, + "step": 14440 + }, + { + "epoch": 1.38028370826766, + "grad_norm": 0.07914608716964722, + "learning_rate": 8.162962962962964e-06, + "loss": 0.1937, + "step": 14450 + }, + { + "epoch": 1.3812389549601185, + "grad_norm": 0.10042670369148254, + "learning_rate": 8.014814814814816e-06, + "loss": 0.1983, + "step": 14460 + }, + { + "epoch": 1.3821942016525768, + "grad_norm": 0.1473369598388672, + "learning_rate": 7.866666666666667e-06, + "loss": 0.1988, + "step": 14470 + }, + { + "epoch": 1.383149448345035, + "grad_norm": 0.10038721561431885, + "learning_rate": 7.71851851851852e-06, + "loss": 0.18, + "step": 14480 + }, + { + "epoch": 1.3841046950374936, + "grad_norm": 0.06510590761899948, + "learning_rate": 7.5703703703703705e-06, + "loss": 0.1882, + "step": 14490 + }, + { + "epoch": 1.3850599417299518, + "grad_norm": 0.08909650892019272, + "learning_rate": 7.422222222222222e-06, + "loss": 0.1929, + "step": 14500 + }, + { + "epoch": 1.3860151884224101, + "grad_norm": 0.10360381007194519, + "learning_rate": 7.274074074074075e-06, + "loss": 0.1982, + "step": 14510 + }, + { + "epoch": 1.3869704351148684, + "grad_norm": 0.15662701427936554, + "learning_rate": 7.125925925925926e-06, + "loss": 0.1989, + "step": 14520 + }, + { + "epoch": 1.3879256818073267, + "grad_norm": 0.0809619128704071, + "learning_rate": 6.9777777777777775e-06, + "loss": 0.1801, + "step": 14530 + }, + { + "epoch": 1.388880928499785, + "grad_norm": 0.05729440972208977, + "learning_rate": 6.8296296296296305e-06, + "loss": 0.1885, + "step": 14540 + }, + { + "epoch": 1.3898361751922435, + "grad_norm": 0.0785176008939743, + "learning_rate": 6.681481481481482e-06, + "loss": 0.1933, + "step": 14550 + }, + { + "epoch": 1.3907914218847017, + "grad_norm": 0.11649096012115479, + "learning_rate": 6.533333333333333e-06, + "loss": 0.1975, + "step": 14560 + }, + { + "epoch": 1.39174666857716, + "grad_norm": 0.15687984228134155, + "learning_rate": 6.385185185185185e-06, + "loss": 0.198, + "step": 14570 + }, + { + "epoch": 1.3927019152696185, + "grad_norm": 0.07697634398937225, + "learning_rate": 6.237037037037037e-06, + "loss": 0.1808, + "step": 14580 + }, + { + "epoch": 1.3936571619620768, + "grad_norm": 0.06329817324876785, + "learning_rate": 6.088888888888889e-06, + "loss": 0.1889, + "step": 14590 + }, + { + "epoch": 1.394612408654535, + "grad_norm": 0.06335621327161789, + "learning_rate": 5.940740740740741e-06, + "loss": 0.1932, + "step": 14600 + }, + { + "epoch": 1.3955676553469933, + "grad_norm": 0.10063440352678299, + "learning_rate": 5.792592592592593e-06, + "loss": 0.198, + "step": 14610 + }, + { + "epoch": 1.3965229020394516, + "grad_norm": 0.15546005964279175, + "learning_rate": 5.6444444444444445e-06, + "loss": 0.1977, + "step": 14620 + }, + { + "epoch": 1.39747814873191, + "grad_norm": 0.08702757954597473, + "learning_rate": 5.496296296296297e-06, + "loss": 0.1794, + "step": 14630 + }, + { + "epoch": 1.3984333954243684, + "grad_norm": 0.0682302713394165, + "learning_rate": 5.348148148148149e-06, + "loss": 0.1883, + "step": 14640 + }, + { + "epoch": 1.3993886421168267, + "grad_norm": 0.07530588656663895, + "learning_rate": 5.2e-06, + "loss": 0.193, + "step": 14650 + }, + { + "epoch": 1.400343888809285, + "grad_norm": 0.10217908024787903, + "learning_rate": 5.051851851851852e-06, + "loss": 0.1979, + "step": 14660 + }, + { + "epoch": 1.4012991355017435, + "grad_norm": 0.15378542244434357, + "learning_rate": 4.903703703703704e-06, + "loss": 0.198, + "step": 14670 + }, + { + "epoch": 1.4022543821942017, + "grad_norm": 0.09957956522703171, + "learning_rate": 4.755555555555556e-06, + "loss": 0.1794, + "step": 14680 + }, + { + "epoch": 1.40320962888666, + "grad_norm": 0.06103844195604324, + "learning_rate": 4.607407407407407e-06, + "loss": 0.1881, + "step": 14690 + }, + { + "epoch": 1.4041648755791183, + "grad_norm": 0.09466850012540817, + "learning_rate": 4.459259259259259e-06, + "loss": 0.1931, + "step": 14700 + }, + { + "epoch": 1.4051201222715766, + "grad_norm": 0.09376849234104156, + "learning_rate": 4.3111111111111115e-06, + "loss": 0.1971, + "step": 14710 + }, + { + "epoch": 1.4060753689640348, + "grad_norm": 0.15457859635353088, + "learning_rate": 4.162962962962963e-06, + "loss": 0.1987, + "step": 14720 + }, + { + "epoch": 1.4070306156564933, + "grad_norm": 0.08470315486192703, + "learning_rate": 4.014814814814815e-06, + "loss": 0.179, + "step": 14730 + }, + { + "epoch": 1.4079858623489516, + "grad_norm": 0.06838913261890411, + "learning_rate": 3.866666666666667e-06, + "loss": 0.1881, + "step": 14740 + }, + { + "epoch": 1.40894110904141, + "grad_norm": 0.06744107604026794, + "learning_rate": 3.7185185185185185e-06, + "loss": 0.1923, + "step": 14750 + }, + { + "epoch": 1.4098963557338684, + "grad_norm": 0.10477790981531143, + "learning_rate": 3.5703703703703703e-06, + "loss": 0.1977, + "step": 14760 + }, + { + "epoch": 1.4108516024263267, + "grad_norm": 0.1559683233499527, + "learning_rate": 3.4222222222222224e-06, + "loss": 0.198, + "step": 14770 + }, + { + "epoch": 1.411806849118785, + "grad_norm": 0.0938514843583107, + "learning_rate": 3.2740740740740746e-06, + "loss": 0.179, + "step": 14780 + }, + { + "epoch": 1.4127620958112432, + "grad_norm": 0.06281778961420059, + "learning_rate": 3.125925925925926e-06, + "loss": 0.1881, + "step": 14790 + }, + { + "epoch": 1.4137173425037015, + "grad_norm": 0.06448351591825485, + "learning_rate": 2.977777777777778e-06, + "loss": 0.192, + "step": 14800 + }, + { + "epoch": 1.4146725891961598, + "grad_norm": 0.098030686378479, + "learning_rate": 2.8296296296296294e-06, + "loss": 0.1976, + "step": 14810 + }, + { + "epoch": 1.4156278358886183, + "grad_norm": 0.14580897986888885, + "learning_rate": 2.6814814814814816e-06, + "loss": 0.1982, + "step": 14820 + }, + { + "epoch": 1.4165830825810766, + "grad_norm": 0.08316420018672943, + "learning_rate": 2.5333333333333334e-06, + "loss": 0.1802, + "step": 14830 + }, + { + "epoch": 1.4175383292735348, + "grad_norm": 0.06413526833057404, + "learning_rate": 2.3851851851851855e-06, + "loss": 0.1882, + "step": 14840 + }, + { + "epoch": 1.4184935759659933, + "grad_norm": 0.07762613147497177, + "learning_rate": 2.2370370370370373e-06, + "loss": 0.1933, + "step": 14850 + }, + { + "epoch": 1.4194488226584516, + "grad_norm": 0.1181880459189415, + "learning_rate": 2.088888888888889e-06, + "loss": 0.1974, + "step": 14860 + }, + { + "epoch": 1.42040406935091, + "grad_norm": 0.16065983474254608, + "learning_rate": 1.9407407407407408e-06, + "loss": 0.1975, + "step": 14870 + }, + { + "epoch": 1.4213593160433682, + "grad_norm": 0.07450287789106369, + "learning_rate": 1.7925925925925925e-06, + "loss": 0.1794, + "step": 14880 + }, + { + "epoch": 1.4223145627358265, + "grad_norm": 0.058435749262571335, + "learning_rate": 1.6444444444444447e-06, + "loss": 0.1887, + "step": 14890 + }, + { + "epoch": 1.4232698094282847, + "grad_norm": 0.07401622831821442, + "learning_rate": 1.4962962962962962e-06, + "loss": 0.193, + "step": 14900 + }, + { + "epoch": 1.4242250561207432, + "grad_norm": 0.12826916575431824, + "learning_rate": 1.3481481481481482e-06, + "loss": 0.1977, + "step": 14910 + }, + { + "epoch": 1.4251803028132015, + "grad_norm": 0.15752126276493073, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.1978, + "step": 14920 + }, + { + "epoch": 1.4261355495056598, + "grad_norm": 0.0686534121632576, + "learning_rate": 1.051851851851852e-06, + "loss": 0.1794, + "step": 14930 + }, + { + "epoch": 1.4270907961981183, + "grad_norm": 0.06406836956739426, + "learning_rate": 9.037037037037039e-07, + "loss": 0.1886, + "step": 14940 + }, + { + "epoch": 1.4280460428905766, + "grad_norm": 0.06210838258266449, + "learning_rate": 7.555555555555556e-07, + "loss": 0.1934, + "step": 14950 + }, + { + "epoch": 1.4290012895830349, + "grad_norm": 0.08844652026891708, + "learning_rate": 6.074074074074074e-07, + "loss": 0.198, + "step": 14960 + }, + { + "epoch": 1.4299565362754931, + "grad_norm": 0.15475121140480042, + "learning_rate": 4.5925925925925927e-07, + "loss": 0.1971, + "step": 14970 + }, + { + "epoch": 1.4309117829679514, + "grad_norm": 0.08120597153902054, + "learning_rate": 3.111111111111111e-07, + "loss": 0.1793, + "step": 14980 + }, + { + "epoch": 1.4318670296604097, + "grad_norm": 0.07655435800552368, + "learning_rate": 1.6296296296296295e-07, + "loss": 0.1884, + "step": 14990 + }, + { + "epoch": 1.4328222763528682, + "grad_norm": 0.07046757638454437, + "learning_rate": 1.4814814814814816e-08, + "loss": 0.193, + "step": 15000 + } + ], + "logging_steps": 10, + "max_steps": 15000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.918384541218406e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}