diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,10534 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.4328222763528682, - "eval_steps": 500, - "global_step": 15000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0009552466924583273, - "grad_norm": 2.0798542499542236, - "learning_rate": 1.2000000000000002e-06, - "loss": 0.907, - "step": 10 - }, - { - "epoch": 0.0019104933849166546, - "grad_norm": 1.9614883661270142, - "learning_rate": 2.5333333333333334e-06, - "loss": 0.8934, - "step": 20 - }, - { - "epoch": 0.002865740077374982, - "grad_norm": 1.724998950958252, - "learning_rate": 3.866666666666667e-06, - "loss": 0.7885, - "step": 30 - }, - { - "epoch": 0.0038209867698333093, - "grad_norm": 1.2985706329345703, - "learning_rate": 5.2e-06, - "loss": 0.6411, - "step": 40 - }, - { - "epoch": 0.004776233462291637, - "grad_norm": 2.9078688621520996, - "learning_rate": 6.533333333333333e-06, - "loss": 0.5671, - "step": 50 - }, - { - "epoch": 0.005731480154749964, - "grad_norm": 0.6146636605262756, - "learning_rate": 7.866666666666667e-06, - "loss": 0.3757, - "step": 60 - }, - { - "epoch": 0.006686726847208292, - "grad_norm": 0.4550829827785492, - "learning_rate": 9.2e-06, - "loss": 0.3449, - "step": 70 - }, - { - "epoch": 0.0076419735396666185, - "grad_norm": 0.6388053894042969, - "learning_rate": 1.0533333333333335e-05, - "loss": 0.3064, - "step": 80 - }, - { - "epoch": 0.008597220232124947, - "grad_norm": 0.44831541180610657, - "learning_rate": 1.1866666666666668e-05, - "loss": 0.2696, - "step": 90 - }, - { - "epoch": 0.009552466924583274, - "grad_norm": 0.4611794948577881, - "learning_rate": 1.32e-05, - "loss": 0.2448, - "step": 100 - }, - { - "epoch": 0.0105077136170416, - "grad_norm": 0.1431284248828888, - "learning_rate": 1.4533333333333335e-05, - "loss": 0.2094, - "step": 110 - }, - { - "epoch": 0.011462960309499929, - "grad_norm": 0.11084526777267456, - "learning_rate": 1.586666666666667e-05, - "loss": 0.2167, - "step": 120 - }, - { - "epoch": 0.012418207001958255, - "grad_norm": 0.11287008225917816, - "learning_rate": 1.7199999999999998e-05, - "loss": 0.2187, - "step": 130 - }, - { - "epoch": 0.013373453694416584, - "grad_norm": 0.11731468141078949, - "learning_rate": 1.8533333333333334e-05, - "loss": 0.2231, - "step": 140 - }, - { - "epoch": 0.01432870038687491, - "grad_norm": 0.16812096536159515, - "learning_rate": 1.9866666666666667e-05, - "loss": 0.2246, - "step": 150 - }, - { - "epoch": 0.015283947079333237, - "grad_norm": 0.09585987776517868, - "learning_rate": 2.12e-05, - "loss": 0.1999, - "step": 160 - }, - { - "epoch": 0.016239193771791564, - "grad_norm": 0.06834663450717926, - "learning_rate": 2.2533333333333333e-05, - "loss": 0.2112, - "step": 170 - }, - { - "epoch": 0.017194440464249894, - "grad_norm": 0.0838400200009346, - "learning_rate": 2.3866666666666666e-05, - "loss": 0.214, - "step": 180 - }, - { - "epoch": 0.01814968715670822, - "grad_norm": 0.09908033907413483, - "learning_rate": 2.5200000000000003e-05, - "loss": 0.2187, - "step": 190 - }, - { - "epoch": 0.019104933849166547, - "grad_norm": 0.15255768597126007, - "learning_rate": 2.6533333333333332e-05, - "loss": 0.2212, - "step": 200 - }, - { - "epoch": 0.020060180541624874, - "grad_norm": 0.08462337404489517, - "learning_rate": 2.786666666666667e-05, - "loss": 0.1999, - "step": 210 - }, - { - "epoch": 0.0210154272340832, - "grad_norm": 0.06040903180837631, - "learning_rate": 2.9199999999999998e-05, - "loss": 0.2088, - "step": 220 - }, - { - "epoch": 0.02197067392654153, - "grad_norm": 0.06774196773767471, - "learning_rate": 3.0533333333333335e-05, - "loss": 0.2129, - "step": 230 - }, - { - "epoch": 0.022925920618999857, - "grad_norm": 0.11282925307750702, - "learning_rate": 3.1866666666666664e-05, - "loss": 0.2198, - "step": 240 - }, - { - "epoch": 0.023881167311458184, - "grad_norm": 0.1323988288640976, - "learning_rate": 3.32e-05, - "loss": 0.221, - "step": 250 - }, - { - "epoch": 0.02483641400391651, - "grad_norm": 0.10758616775274277, - "learning_rate": 3.453333333333334e-05, - "loss": 0.2001, - "step": 260 - }, - { - "epoch": 0.025791660696374837, - "grad_norm": 0.05729316174983978, - "learning_rate": 3.586666666666667e-05, - "loss": 0.2104, - "step": 270 - }, - { - "epoch": 0.026746907388833167, - "grad_norm": 0.06489019095897675, - "learning_rate": 3.72e-05, - "loss": 0.2127, - "step": 280 - }, - { - "epoch": 0.027702154081291494, - "grad_norm": 0.10550963878631592, - "learning_rate": 3.853333333333334e-05, - "loss": 0.2171, - "step": 290 - }, - { - "epoch": 0.02865740077374982, - "grad_norm": 0.14865626394748688, - "learning_rate": 3.986666666666667e-05, - "loss": 0.2196, - "step": 300 - }, - { - "epoch": 0.029612647466208147, - "grad_norm": 0.13421903550624847, - "learning_rate": 4.12e-05, - "loss": 0.2017, - "step": 310 - }, - { - "epoch": 0.030567894158666474, - "grad_norm": 0.07205755263566971, - "learning_rate": 4.2533333333333335e-05, - "loss": 0.209, - "step": 320 - }, - { - "epoch": 0.0315231408511248, - "grad_norm": 0.06970394402742386, - "learning_rate": 4.3866666666666665e-05, - "loss": 0.2132, - "step": 330 - }, - { - "epoch": 0.03247838754358313, - "grad_norm": 0.16807648539543152, - "learning_rate": 4.52e-05, - "loss": 0.2167, - "step": 340 - }, - { - "epoch": 0.03343363423604146, - "grad_norm": 0.1984771341085434, - "learning_rate": 4.653333333333334e-05, - "loss": 0.2166, - "step": 350 - }, - { - "epoch": 0.03438888092849979, - "grad_norm": 0.09778633713722229, - "learning_rate": 4.7866666666666674e-05, - "loss": 0.2041, - "step": 360 - }, - { - "epoch": 0.035344127620958114, - "grad_norm": 0.08991667628288269, - "learning_rate": 4.92e-05, - "loss": 0.2082, - "step": 370 - }, - { - "epoch": 0.03629937431341644, - "grad_norm": 0.10208883881568909, - "learning_rate": 5.053333333333333e-05, - "loss": 0.2081, - "step": 380 - }, - { - "epoch": 0.03725462100587477, - "grad_norm": 0.1255088746547699, - "learning_rate": 5.1866666666666676e-05, - "loss": 0.2118, - "step": 390 - }, - { - "epoch": 0.038209867698333094, - "grad_norm": 0.19001370668411255, - "learning_rate": 5.3200000000000006e-05, - "loss": 0.2146, - "step": 400 - }, - { - "epoch": 0.03916511439079142, - "grad_norm": 0.09977246075868607, - "learning_rate": 5.4533333333333335e-05, - "loss": 0.2018, - "step": 410 - }, - { - "epoch": 0.04012036108324975, - "grad_norm": 0.12999273836612701, - "learning_rate": 5.5866666666666665e-05, - "loss": 0.2063, - "step": 420 - }, - { - "epoch": 0.041075607775708074, - "grad_norm": 0.08959522098302841, - "learning_rate": 5.72e-05, - "loss": 0.2064, - "step": 430 - }, - { - "epoch": 0.0420308544681664, - "grad_norm": 0.14837408065795898, - "learning_rate": 5.853333333333334e-05, - "loss": 0.2104, - "step": 440 - }, - { - "epoch": 0.042986101160624735, - "grad_norm": 0.18937060236930847, - "learning_rate": 5.9866666666666674e-05, - "loss": 0.2132, - "step": 450 - }, - { - "epoch": 0.04394134785308306, - "grad_norm": 0.12442853301763535, - "learning_rate": 6.12e-05, - "loss": 0.2006, - "step": 460 - }, - { - "epoch": 0.04489659454554139, - "grad_norm": 0.11519322544336319, - "learning_rate": 6.253333333333333e-05, - "loss": 0.2039, - "step": 470 - }, - { - "epoch": 0.045851841237999715, - "grad_norm": 0.0926322489976883, - "learning_rate": 6.386666666666667e-05, - "loss": 0.2052, - "step": 480 - }, - { - "epoch": 0.04680708793045804, - "grad_norm": 0.16390164196491241, - "learning_rate": 6.52e-05, - "loss": 0.2091, - "step": 490 - }, - { - "epoch": 0.04776233462291637, - "grad_norm": 0.256491094827652, - "learning_rate": 6.653333333333334e-05, - "loss": 0.2122, - "step": 500 - }, - { - "epoch": 0.048717581315374694, - "grad_norm": 0.12082330137491226, - "learning_rate": 6.786666666666667e-05, - "loss": 0.2003, - "step": 510 - }, - { - "epoch": 0.04967282800783302, - "grad_norm": 0.08324269205331802, - "learning_rate": 6.92e-05, - "loss": 0.2042, - "step": 520 - }, - { - "epoch": 0.05062807470029135, - "grad_norm": 0.06913580000400543, - "learning_rate": 7.053333333333334e-05, - "loss": 0.2033, - "step": 530 - }, - { - "epoch": 0.051583321392749674, - "grad_norm": 0.11715491861104965, - "learning_rate": 7.186666666666667e-05, - "loss": 0.2081, - "step": 540 - }, - { - "epoch": 0.05253856808520801, - "grad_norm": 0.22186318039894104, - "learning_rate": 7.32e-05, - "loss": 0.2094, - "step": 550 - }, - { - "epoch": 0.053493814777666335, - "grad_norm": 0.11386065185070038, - "learning_rate": 7.453333333333333e-05, - "loss": 0.2031, - "step": 560 - }, - { - "epoch": 0.05444906147012466, - "grad_norm": 0.08842533081769943, - "learning_rate": 7.586666666666668e-05, - "loss": 0.2001, - "step": 570 - }, - { - "epoch": 0.05540430816258299, - "grad_norm": 0.09553701430559158, - "learning_rate": 7.72e-05, - "loss": 0.201, - "step": 580 - }, - { - "epoch": 0.056359554855041315, - "grad_norm": 0.12029985338449478, - "learning_rate": 7.853333333333334e-05, - "loss": 0.2066, - "step": 590 - }, - { - "epoch": 0.05731480154749964, - "grad_norm": 0.36245641112327576, - "learning_rate": 7.986666666666667e-05, - "loss": 0.212, - "step": 600 - }, - { - "epoch": 0.05827004823995797, - "grad_norm": 0.11322560906410217, - "learning_rate": 8.120000000000001e-05, - "loss": 0.203, - "step": 610 - }, - { - "epoch": 0.059225294932416295, - "grad_norm": 0.07776595652103424, - "learning_rate": 8.253333333333334e-05, - "loss": 0.2016, - "step": 620 - }, - { - "epoch": 0.06018054162487462, - "grad_norm": 0.09582793712615967, - "learning_rate": 8.386666666666667e-05, - "loss": 0.203, - "step": 630 - }, - { - "epoch": 0.06113578831733295, - "grad_norm": 0.13911056518554688, - "learning_rate": 8.52e-05, - "loss": 0.2053, - "step": 640 - }, - { - "epoch": 0.06209103500979128, - "grad_norm": 0.3633959889411926, - "learning_rate": 8.653333333333333e-05, - "loss": 0.2086, - "step": 650 - }, - { - "epoch": 0.0630462817022496, - "grad_norm": 0.10734552890062332, - "learning_rate": 8.786666666666667e-05, - "loss": 0.2065, - "step": 660 - }, - { - "epoch": 0.06400152839470793, - "grad_norm": 0.08110401779413223, - "learning_rate": 8.92e-05, - "loss": 0.201, - "step": 670 - }, - { - "epoch": 0.06495677508716625, - "grad_norm": 0.08999983966350555, - "learning_rate": 9.053333333333334e-05, - "loss": 0.202, - "step": 680 - }, - { - "epoch": 0.06591202177962459, - "grad_norm": 0.11316987872123718, - "learning_rate": 9.186666666666667e-05, - "loss": 0.2084, - "step": 690 - }, - { - "epoch": 0.06686726847208292, - "grad_norm": 0.3875355124473572, - "learning_rate": 9.320000000000002e-05, - "loss": 0.2092, - "step": 700 - }, - { - "epoch": 0.06782251516454124, - "grad_norm": 0.12044321745634079, - "learning_rate": 9.453333333333335e-05, - "loss": 0.2064, - "step": 710 - }, - { - "epoch": 0.06877776185699958, - "grad_norm": 0.07788579165935516, - "learning_rate": 9.586666666666667e-05, - "loss": 0.1998, - "step": 720 - }, - { - "epoch": 0.0697330085494579, - "grad_norm": 0.10780824720859528, - "learning_rate": 9.72e-05, - "loss": 0.2019, - "step": 730 - }, - { - "epoch": 0.07068825524191623, - "grad_norm": 0.11069575697183609, - "learning_rate": 9.853333333333333e-05, - "loss": 0.2056, - "step": 740 - }, - { - "epoch": 0.07164350193437455, - "grad_norm": 0.4447477459907532, - "learning_rate": 9.986666666666668e-05, - "loss": 0.2076, - "step": 750 - }, - { - "epoch": 0.07259874862683288, - "grad_norm": 0.2588540315628052, - "learning_rate": 0.00010120000000000001, - "loss": 0.2038, - "step": 760 - }, - { - "epoch": 0.0735539953192912, - "grad_norm": 0.08379202336072922, - "learning_rate": 0.00010253333333333335, - "loss": 0.201, - "step": 770 - }, - { - "epoch": 0.07450924201174954, - "grad_norm": 0.0976845845580101, - "learning_rate": 0.00010386666666666667, - "loss": 0.2018, - "step": 780 - }, - { - "epoch": 0.07546448870420785, - "grad_norm": 0.1460348218679428, - "learning_rate": 0.00010520000000000001, - "loss": 0.204, - "step": 790 - }, - { - "epoch": 0.07641973539666619, - "grad_norm": 0.4925660789012909, - "learning_rate": 0.00010653333333333333, - "loss": 0.2079, - "step": 800 - }, - { - "epoch": 0.07737498208912452, - "grad_norm": 0.1974959820508957, - "learning_rate": 0.00010786666666666667, - "loss": 0.207, - "step": 810 - }, - { - "epoch": 0.07833022878158284, - "grad_norm": 0.16117534041404724, - "learning_rate": 0.00010920000000000001, - "loss": 0.2022, - "step": 820 - }, - { - "epoch": 0.07928547547404118, - "grad_norm": 0.16919535398483276, - "learning_rate": 0.00011053333333333333, - "loss": 0.2027, - "step": 830 - }, - { - "epoch": 0.0802407221664995, - "grad_norm": 0.11847756057977676, - "learning_rate": 0.00011186666666666667, - "loss": 0.2068, - "step": 840 - }, - { - "epoch": 0.08119596885895783, - "grad_norm": 0.41180866956710815, - "learning_rate": 0.0001132, - "loss": 0.2074, - "step": 850 - }, - { - "epoch": 0.08215121555141615, - "grad_norm": 0.17310842871665955, - "learning_rate": 0.00011453333333333334, - "loss": 0.2067, - "step": 860 - }, - { - "epoch": 0.08310646224387448, - "grad_norm": 0.11892854422330856, - "learning_rate": 0.00011586666666666667, - "loss": 0.2006, - "step": 870 - }, - { - "epoch": 0.0840617089363328, - "grad_norm": 0.09759584814310074, - "learning_rate": 0.0001172, - "loss": 0.204, - "step": 880 - }, - { - "epoch": 0.08501695562879114, - "grad_norm": 0.09496277570724487, - "learning_rate": 0.00011853333333333335, - "loss": 0.2069, - "step": 890 - }, - { - "epoch": 0.08597220232124947, - "grad_norm": 0.34175682067871094, - "learning_rate": 0.00011986666666666669, - "loss": 0.2088, - "step": 900 - }, - { - "epoch": 0.08692744901370779, - "grad_norm": 0.17791183292865753, - "learning_rate": 0.0001212, - "loss": 0.2033, - "step": 910 - }, - { - "epoch": 0.08788269570616612, - "grad_norm": 0.11148897558450699, - "learning_rate": 0.00012253333333333335, - "loss": 0.2034, - "step": 920 - }, - { - "epoch": 0.08883794239862444, - "grad_norm": 0.08730180561542511, - "learning_rate": 0.00012386666666666665, - "loss": 0.2043, - "step": 930 - }, - { - "epoch": 0.08979318909108278, - "grad_norm": 0.10876961797475815, - "learning_rate": 0.0001252, - "loss": 0.2074, - "step": 940 - }, - { - "epoch": 0.0907484357835411, - "grad_norm": 0.4162116050720215, - "learning_rate": 0.00012653333333333334, - "loss": 0.2099, - "step": 950 - }, - { - "epoch": 0.09170368247599943, - "grad_norm": 0.349356085062027, - "learning_rate": 0.00012786666666666667, - "loss": 0.2073, - "step": 960 - }, - { - "epoch": 0.09265892916845775, - "grad_norm": 0.144659161567688, - "learning_rate": 0.00012920000000000002, - "loss": 0.2014, - "step": 970 - }, - { - "epoch": 0.09361417586091608, - "grad_norm": 0.10344243794679642, - "learning_rate": 0.00013053333333333333, - "loss": 0.2041, - "step": 980 - }, - { - "epoch": 0.0945694225533744, - "grad_norm": 0.10903052240610123, - "learning_rate": 0.00013186666666666668, - "loss": 0.2067, - "step": 990 - }, - { - "epoch": 0.09552466924583274, - "grad_norm": 0.4268416464328766, - "learning_rate": 0.0001332, - "loss": 0.2091, - "step": 1000 - }, - { - "epoch": 0.09647991593829107, - "grad_norm": 0.18322598934173584, - "learning_rate": 0.00013453333333333334, - "loss": 0.2075, - "step": 1010 - }, - { - "epoch": 0.09743516263074939, - "grad_norm": 0.1304457038640976, - "learning_rate": 0.00013586666666666667, - "loss": 0.1997, - "step": 1020 - }, - { - "epoch": 0.09839040932320772, - "grad_norm": 0.1123695969581604, - "learning_rate": 0.00013720000000000003, - "loss": 0.2017, - "step": 1030 - }, - { - "epoch": 0.09934565601566604, - "grad_norm": 0.15449099242687225, - "learning_rate": 0.00013853333333333333, - "loss": 0.2064, - "step": 1040 - }, - { - "epoch": 0.10030090270812438, - "grad_norm": 0.43143564462661743, - "learning_rate": 0.0001398666666666667, - "loss": 0.2089, - "step": 1050 - }, - { - "epoch": 0.1012561494005827, - "grad_norm": 0.2075752168893814, - "learning_rate": 0.0001412, - "loss": 0.2063, - "step": 1060 - }, - { - "epoch": 0.10221139609304103, - "grad_norm": 0.09274331480264664, - "learning_rate": 0.00014253333333333335, - "loss": 0.2009, - "step": 1070 - }, - { - "epoch": 0.10316664278549935, - "grad_norm": 0.10772950947284698, - "learning_rate": 0.00014386666666666668, - "loss": 0.2018, - "step": 1080 - }, - { - "epoch": 0.10412188947795768, - "grad_norm": 0.13142071664333344, - "learning_rate": 0.0001452, - "loss": 0.2055, - "step": 1090 - }, - { - "epoch": 0.10507713617041602, - "grad_norm": 0.45018109679222107, - "learning_rate": 0.00014653333333333334, - "loss": 0.2078, - "step": 1100 - }, - { - "epoch": 0.10603238286287434, - "grad_norm": 0.17738114297389984, - "learning_rate": 0.00014786666666666666, - "loss": 0.2159, - "step": 1110 - }, - { - "epoch": 0.10698762955533267, - "grad_norm": 0.18743546307086945, - "learning_rate": 0.0001492, - "loss": 0.2173, - "step": 1120 - }, - { - "epoch": 0.10794287624779099, - "grad_norm": 0.11755328625440598, - "learning_rate": 0.00015053333333333335, - "loss": 0.209, - "step": 1130 - }, - { - "epoch": 0.10889812294024932, - "grad_norm": 0.15332145988941193, - "learning_rate": 0.00015186666666666668, - "loss": 0.2047, - "step": 1140 - }, - { - "epoch": 0.10985336963270764, - "grad_norm": 0.5098714232444763, - "learning_rate": 0.0001532, - "loss": 0.2054, - "step": 1150 - }, - { - "epoch": 0.11080861632516598, - "grad_norm": 0.24695514142513275, - "learning_rate": 0.00015453333333333334, - "loss": 0.2137, - "step": 1160 - }, - { - "epoch": 0.1117638630176243, - "grad_norm": 0.10639436542987823, - "learning_rate": 0.00015586666666666667, - "loss": 0.2021, - "step": 1170 - }, - { - "epoch": 0.11271910971008263, - "grad_norm": 0.13042502105236053, - "learning_rate": 0.00015720000000000003, - "loss": 0.2059, - "step": 1180 - }, - { - "epoch": 0.11367435640254096, - "grad_norm": 0.11687403172254562, - "learning_rate": 0.00015853333333333333, - "loss": 0.2071, - "step": 1190 - }, - { - "epoch": 0.11462960309499928, - "grad_norm": 0.5390796065330505, - "learning_rate": 0.00015986666666666669, - "loss": 0.2096, - "step": 1200 - }, - { - "epoch": 0.11558484978745762, - "grad_norm": 0.20866340398788452, - "learning_rate": 0.00016120000000000002, - "loss": 0.2236, - "step": 1210 - }, - { - "epoch": 0.11654009647991594, - "grad_norm": 0.08614636212587357, - "learning_rate": 0.00016253333333333334, - "loss": 0.2031, - "step": 1220 - }, - { - "epoch": 0.11749534317237427, - "grad_norm": 0.11634163558483124, - "learning_rate": 0.00016386666666666667, - "loss": 0.2031, - "step": 1230 - }, - { - "epoch": 0.11845058986483259, - "grad_norm": 0.14586254954338074, - "learning_rate": 0.0001652, - "loss": 0.2069, - "step": 1240 - }, - { - "epoch": 0.11940583655729092, - "grad_norm": 0.5198449492454529, - "learning_rate": 0.00016653333333333333, - "loss": 0.2064, - "step": 1250 - }, - { - "epoch": 0.12036108324974924, - "grad_norm": 0.2319190949201584, - "learning_rate": 0.0001678666666666667, - "loss": 0.2549, - "step": 1260 - }, - { - "epoch": 0.12131632994220758, - "grad_norm": 0.14767807722091675, - "learning_rate": 0.0001692, - "loss": 0.2072, - "step": 1270 - }, - { - "epoch": 0.1222715766346659, - "grad_norm": 0.09973818808794022, - "learning_rate": 0.00017053333333333335, - "loss": 0.2068, - "step": 1280 - }, - { - "epoch": 0.12322682332712423, - "grad_norm": 0.12260327488183975, - "learning_rate": 0.00017186666666666665, - "loss": 0.2072, - "step": 1290 - }, - { - "epoch": 0.12418207001958256, - "grad_norm": 0.5598644018173218, - "learning_rate": 0.0001732, - "loss": 0.2071, - "step": 1300 - }, - { - "epoch": 0.1251373167120409, - "grad_norm": 0.21588961780071259, - "learning_rate": 0.00017453333333333334, - "loss": 0.2389, - "step": 1310 - }, - { - "epoch": 0.1260925634044992, - "grad_norm": 0.11560805141925812, - "learning_rate": 0.00017586666666666667, - "loss": 0.2039, - "step": 1320 - }, - { - "epoch": 0.12704781009695754, - "grad_norm": 0.09798284620046616, - "learning_rate": 0.0001772, - "loss": 0.2039, - "step": 1330 - }, - { - "epoch": 0.12800305678941587, - "grad_norm": 0.16167224943637848, - "learning_rate": 0.00017853333333333335, - "loss": 0.2049, - "step": 1340 - }, - { - "epoch": 0.1289583034818742, - "grad_norm": 0.32957813143730164, - "learning_rate": 0.00017986666666666668, - "loss": 0.2022, - "step": 1350 - }, - { - "epoch": 0.1299135501743325, - "grad_norm": 0.21633221209049225, - "learning_rate": 0.0001812, - "loss": 0.2676, - "step": 1360 - }, - { - "epoch": 0.13086879686679084, - "grad_norm": 0.12575742602348328, - "learning_rate": 0.00018253333333333334, - "loss": 0.206, - "step": 1370 - }, - { - "epoch": 0.13182404355924918, - "grad_norm": 0.15339286625385284, - "learning_rate": 0.00018386666666666667, - "loss": 0.2055, - "step": 1380 - }, - { - "epoch": 0.1327792902517075, - "grad_norm": 0.17081816494464874, - "learning_rate": 0.00018520000000000003, - "loss": 0.2057, - "step": 1390 - }, - { - "epoch": 0.13373453694416584, - "grad_norm": 0.2828335165977478, - "learning_rate": 0.00018653333333333333, - "loss": 0.204, - "step": 1400 - }, - { - "epoch": 0.13468978363662415, - "grad_norm": 0.14135104417800903, - "learning_rate": 0.0001878666666666667, - "loss": 0.2242, - "step": 1410 - }, - { - "epoch": 0.13564503032908248, - "grad_norm": 0.10726441442966461, - "learning_rate": 0.0001892, - "loss": 0.1995, - "step": 1420 - }, - { - "epoch": 0.13660027702154082, - "grad_norm": 0.0936957523226738, - "learning_rate": 0.00019053333333333335, - "loss": 0.2039, - "step": 1430 - }, - { - "epoch": 0.13755552371399915, - "grad_norm": 0.15068306028842926, - "learning_rate": 0.00019186666666666668, - "loss": 0.2048, - "step": 1440 - }, - { - "epoch": 0.13851077040645746, - "grad_norm": 0.13459832966327667, - "learning_rate": 0.0001932, - "loss": 0.1975, - "step": 1450 - }, - { - "epoch": 0.1394660170989158, - "grad_norm": 0.1478847712278366, - "learning_rate": 0.00019453333333333334, - "loss": 0.2233, - "step": 1460 - }, - { - "epoch": 0.14042126379137412, - "grad_norm": 0.10763124376535416, - "learning_rate": 0.00019586666666666667, - "loss": 0.2024, - "step": 1470 - }, - { - "epoch": 0.14137651048383246, - "grad_norm": 0.1476346254348755, - "learning_rate": 0.0001972, - "loss": 0.2021, - "step": 1480 - }, - { - "epoch": 0.1423317571762908, - "grad_norm": 0.1142350286245346, - "learning_rate": 0.00019853333333333335, - "loss": 0.2077, - "step": 1490 - }, - { - "epoch": 0.1432870038687491, - "grad_norm": 0.3819134831428528, - "learning_rate": 0.00019986666666666668, - "loss": 0.2046, - "step": 1500 - }, - { - "epoch": 0.14424225056120743, - "grad_norm": 0.20617090165615082, - "learning_rate": 0.00019986666666666668, - "loss": 0.2243, - "step": 1510 - }, - { - "epoch": 0.14519749725366576, - "grad_norm": 0.11755608022212982, - "learning_rate": 0.00019971851851851853, - "loss": 0.2015, - "step": 1520 - }, - { - "epoch": 0.1461527439461241, - "grad_norm": 0.10507424175739288, - "learning_rate": 0.00019957037037037037, - "loss": 0.2065, - "step": 1530 - }, - { - "epoch": 0.1471079906385824, - "grad_norm": 0.12585783004760742, - "learning_rate": 0.00019942222222222222, - "loss": 0.2051, - "step": 1540 - }, - { - "epoch": 0.14806323733104074, - "grad_norm": 0.22103334963321686, - "learning_rate": 0.00019927407407407407, - "loss": 0.2024, - "step": 1550 - }, - { - "epoch": 0.14901848402349907, - "grad_norm": 0.21955139935016632, - "learning_rate": 0.00019912592592592594, - "loss": 0.2186, - "step": 1560 - }, - { - "epoch": 0.1499737307159574, - "grad_norm": 0.14329008758068085, - "learning_rate": 0.0001989777777777778, - "loss": 0.2022, - "step": 1570 - }, - { - "epoch": 0.1509289774084157, - "grad_norm": 0.1393403261899948, - "learning_rate": 0.00019882962962962963, - "loss": 0.2037, - "step": 1580 - }, - { - "epoch": 0.15188422410087404, - "grad_norm": 0.1838270127773285, - "learning_rate": 0.00019868148148148148, - "loss": 0.2071, - "step": 1590 - }, - { - "epoch": 0.15283947079333238, - "grad_norm": 0.26425862312316895, - "learning_rate": 0.00019853333333333335, - "loss": 0.2049, - "step": 1600 - }, - { - "epoch": 0.1537947174857907, - "grad_norm": 0.15780730545520782, - "learning_rate": 0.00019838518518518517, - "loss": 0.2176, - "step": 1610 - }, - { - "epoch": 0.15474996417824904, - "grad_norm": 0.09308743476867676, - "learning_rate": 0.00019823703703703704, - "loss": 0.2027, - "step": 1620 - }, - { - "epoch": 0.15570521087070735, - "grad_norm": 0.11731009930372238, - "learning_rate": 0.0001980888888888889, - "loss": 0.2024, - "step": 1630 - }, - { - "epoch": 0.15666045756316568, - "grad_norm": 0.151070237159729, - "learning_rate": 0.00019794074074074076, - "loss": 0.2059, - "step": 1640 - }, - { - "epoch": 0.15761570425562402, - "grad_norm": 0.2339385598897934, - "learning_rate": 0.00019779259259259258, - "loss": 0.2011, - "step": 1650 - }, - { - "epoch": 0.15857095094808235, - "grad_norm": 0.1527925431728363, - "learning_rate": 0.00019764444444444446, - "loss": 0.2402, - "step": 1660 - }, - { - "epoch": 0.15952619764054066, - "grad_norm": 0.14847882091999054, - "learning_rate": 0.0001974962962962963, - "loss": 0.2072, - "step": 1670 - }, - { - "epoch": 0.160481444332999, - "grad_norm": 0.10180728882551193, - "learning_rate": 0.00019734814814814815, - "loss": 0.2029, - "step": 1680 - }, - { - "epoch": 0.16143669102545732, - "grad_norm": 0.14945369958877563, - "learning_rate": 0.0001972, - "loss": 0.2041, - "step": 1690 - }, - { - "epoch": 0.16239193771791566, - "grad_norm": 0.47041115164756775, - "learning_rate": 0.00019705185185185187, - "loss": 0.2113, - "step": 1700 - }, - { - "epoch": 0.163347184410374, - "grad_norm": 0.2964141070842743, - "learning_rate": 0.00019690370370370372, - "loss": 0.2239, - "step": 1710 - }, - { - "epoch": 0.1643024311028323, - "grad_norm": 0.12235242128372192, - "learning_rate": 0.00019675555555555556, - "loss": 0.2031, - "step": 1720 - }, - { - "epoch": 0.16525767779529063, - "grad_norm": 0.10584467649459839, - "learning_rate": 0.0001966074074074074, - "loss": 0.205, - "step": 1730 - }, - { - "epoch": 0.16621292448774896, - "grad_norm": 0.18592573702335358, - "learning_rate": 0.00019645925925925928, - "loss": 0.2047, - "step": 1740 - }, - { - "epoch": 0.1671681711802073, - "grad_norm": 0.3373814821243286, - "learning_rate": 0.0001963111111111111, - "loss": 0.2033, - "step": 1750 - }, - { - "epoch": 0.1681234178726656, - "grad_norm": 0.24131450057029724, - "learning_rate": 0.00019616296296296297, - "loss": 0.2274, - "step": 1760 - }, - { - "epoch": 0.16907866456512394, - "grad_norm": 0.13119126856327057, - "learning_rate": 0.00019601481481481482, - "loss": 0.2044, - "step": 1770 - }, - { - "epoch": 0.17003391125758227, - "grad_norm": 0.12517541646957397, - "learning_rate": 0.00019586666666666667, - "loss": 0.207, - "step": 1780 - }, - { - "epoch": 0.1709891579500406, - "grad_norm": 0.13666100800037384, - "learning_rate": 0.0001957185185185185, - "loss": 0.206, - "step": 1790 - }, - { - "epoch": 0.17194440464249894, - "grad_norm": 0.5242202281951904, - "learning_rate": 0.00019557037037037039, - "loss": 0.1996, - "step": 1800 - }, - { - "epoch": 0.17289965133495724, - "grad_norm": 0.436653196811676, - "learning_rate": 0.00019542222222222223, - "loss": 0.2492, - "step": 1810 - }, - { - "epoch": 0.17385489802741558, - "grad_norm": 0.14327634871006012, - "learning_rate": 0.00019527407407407408, - "loss": 0.2126, - "step": 1820 - }, - { - "epoch": 0.1748101447198739, - "grad_norm": 0.5541105270385742, - "learning_rate": 0.00019512592592592592, - "loss": 0.2044, - "step": 1830 - }, - { - "epoch": 0.17576539141233224, - "grad_norm": 0.17921100556850433, - "learning_rate": 0.0001949777777777778, - "loss": 0.2107, - "step": 1840 - }, - { - "epoch": 0.17672063810479055, - "grad_norm": 0.4111920893192291, - "learning_rate": 0.00019482962962962962, - "loss": 0.2078, - "step": 1850 - }, - { - "epoch": 0.17767588479724888, - "grad_norm": 0.32501187920570374, - "learning_rate": 0.0001946814814814815, - "loss": 0.2392, - "step": 1860 - }, - { - "epoch": 0.17863113148970722, - "grad_norm": 0.27491918206214905, - "learning_rate": 0.00019453333333333334, - "loss": 0.2017, - "step": 1870 - }, - { - "epoch": 0.17958637818216555, - "grad_norm": 0.10791027545928955, - "learning_rate": 0.0001943851851851852, - "loss": 0.2066, - "step": 1880 - }, - { - "epoch": 0.18054162487462388, - "grad_norm": 0.11400250345468521, - "learning_rate": 0.00019423703703703703, - "loss": 0.2079, - "step": 1890 - }, - { - "epoch": 0.1814968715670822, - "grad_norm": 0.23481184244155884, - "learning_rate": 0.0001940888888888889, - "loss": 0.1987, - "step": 1900 - }, - { - "epoch": 0.18245211825954052, - "grad_norm": 0.2478189617395401, - "learning_rate": 0.00019394074074074075, - "loss": 0.2507, - "step": 1910 - }, - { - "epoch": 0.18340736495199886, - "grad_norm": 0.12263601273298264, - "learning_rate": 0.0001937925925925926, - "loss": 0.2044, - "step": 1920 - }, - { - "epoch": 0.1843626116444572, - "grad_norm": 0.14222730696201324, - "learning_rate": 0.00019364444444444444, - "loss": 0.204, - "step": 1930 - }, - { - "epoch": 0.1853178583369155, - "grad_norm": 0.15957382321357727, - "learning_rate": 0.00019349629629629631, - "loss": 0.2077, - "step": 1940 - }, - { - "epoch": 0.18627310502937383, - "grad_norm": 0.5012878179550171, - "learning_rate": 0.00019334814814814816, - "loss": 0.2102, - "step": 1950 - }, - { - "epoch": 0.18722835172183216, - "grad_norm": 0.2779427170753479, - "learning_rate": 0.0001932, - "loss": 0.2239, - "step": 1960 - }, - { - "epoch": 0.1881835984142905, - "grad_norm": 0.14166271686553955, - "learning_rate": 0.00019305185185185185, - "loss": 0.2043, - "step": 1970 - }, - { - "epoch": 0.1891388451067488, - "grad_norm": 0.11754471063613892, - "learning_rate": 0.00019290370370370373, - "loss": 0.2039, - "step": 1980 - }, - { - "epoch": 0.19009409179920714, - "grad_norm": 0.15170662105083466, - "learning_rate": 0.00019275555555555555, - "loss": 0.2052, - "step": 1990 - }, - { - "epoch": 0.19104933849166547, - "grad_norm": 0.5039442181587219, - "learning_rate": 0.00019260740740740742, - "loss": 0.2082, - "step": 2000 - }, - { - "epoch": 0.1920045851841238, - "grad_norm": 0.2808721959590912, - "learning_rate": 0.00019245925925925927, - "loss": 0.225, - "step": 2010 - }, - { - "epoch": 0.19295983187658214, - "grad_norm": 0.10636847466230392, - "learning_rate": 0.00019231111111111114, - "loss": 0.2018, - "step": 2020 - }, - { - "epoch": 0.19391507856904044, - "grad_norm": 0.10885459184646606, - "learning_rate": 0.00019216296296296296, - "loss": 0.2042, - "step": 2030 - }, - { - "epoch": 0.19487032526149878, - "grad_norm": 0.1382536143064499, - "learning_rate": 0.00019201481481481483, - "loss": 0.2058, - "step": 2040 - }, - { - "epoch": 0.1958255719539571, - "grad_norm": 0.6492496728897095, - "learning_rate": 0.00019186666666666668, - "loss": 0.207, - "step": 2050 - }, - { - "epoch": 0.19678081864641545, - "grad_norm": 0.13635113835334778, - "learning_rate": 0.00019171851851851852, - "loss": 0.2132, - "step": 2060 - }, - { - "epoch": 0.19773606533887375, - "grad_norm": 0.1241525337100029, - "learning_rate": 0.00019157037037037037, - "loss": 0.2032, - "step": 2070 - }, - { - "epoch": 0.19869131203133208, - "grad_norm": 0.12168664485216141, - "learning_rate": 0.00019142222222222224, - "loss": 0.2066, - "step": 2080 - }, - { - "epoch": 0.19964655872379042, - "grad_norm": 0.12792471051216125, - "learning_rate": 0.00019127407407407406, - "loss": 0.2039, - "step": 2090 - }, - { - "epoch": 0.20060180541624875, - "grad_norm": 0.5138373970985413, - "learning_rate": 0.00019112592592592594, - "loss": 0.2095, - "step": 2100 - }, - { - "epoch": 0.20155705210870709, - "grad_norm": 0.2566812336444855, - "learning_rate": 0.00019097777777777778, - "loss": 0.2186, - "step": 2110 - }, - { - "epoch": 0.2025122988011654, - "grad_norm": 0.11528566479682922, - "learning_rate": 0.00019082962962962966, - "loss": 0.2034, - "step": 2120 - }, - { - "epoch": 0.20346754549362372, - "grad_norm": 0.14964407682418823, - "learning_rate": 0.00019068148148148147, - "loss": 0.2014, - "step": 2130 - }, - { - "epoch": 0.20442279218608206, - "grad_norm": 0.1438506543636322, - "learning_rate": 0.00019053333333333335, - "loss": 0.2036, - "step": 2140 - }, - { - "epoch": 0.2053780388785404, - "grad_norm": 0.5848654508590698, - "learning_rate": 0.0001903851851851852, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2063332855709987, - "grad_norm": 0.16795799136161804, - "learning_rate": 0.00019023703703703704, - "loss": 0.2149, - "step": 2160 - }, - { - "epoch": 0.20728853226345703, - "grad_norm": 0.26638638973236084, - "learning_rate": 0.0001900888888888889, - "loss": 0.2026, - "step": 2170 - }, - { - "epoch": 0.20824377895591537, - "grad_norm": 0.11838365346193314, - "learning_rate": 0.00018994074074074076, - "loss": 0.2025, - "step": 2180 - }, - { - "epoch": 0.2091990256483737, - "grad_norm": 0.15363581478595734, - "learning_rate": 0.0001897925925925926, - "loss": 0.2058, - "step": 2190 - }, - { - "epoch": 0.21015427234083203, - "grad_norm": 0.5906602740287781, - "learning_rate": 0.00018964444444444445, - "loss": 0.2077, - "step": 2200 - }, - { - "epoch": 0.21110951903329034, - "grad_norm": 0.31574729084968567, - "learning_rate": 0.0001894962962962963, - "loss": 0.2206, - "step": 2210 - }, - { - "epoch": 0.21206476572574867, - "grad_norm": 0.128960981965065, - "learning_rate": 0.00018934814814814817, - "loss": 0.2001, - "step": 2220 - }, - { - "epoch": 0.213020012418207, - "grad_norm": 0.10915983468294144, - "learning_rate": 0.0001892, - "loss": 0.2023, - "step": 2230 - }, - { - "epoch": 0.21397525911066534, - "grad_norm": 0.14831770956516266, - "learning_rate": 0.00018905185185185186, - "loss": 0.2032, - "step": 2240 - }, - { - "epoch": 0.21493050580312364, - "grad_norm": 0.5704192519187927, - "learning_rate": 0.0001889037037037037, - "loss": 0.2027, - "step": 2250 - }, - { - "epoch": 0.21588575249558198, - "grad_norm": 0.2867341935634613, - "learning_rate": 0.00018875555555555558, - "loss": 0.2227, - "step": 2260 - }, - { - "epoch": 0.2168409991880403, - "grad_norm": 0.199985072016716, - "learning_rate": 0.0001886074074074074, - "loss": 0.2029, - "step": 2270 - }, - { - "epoch": 0.21779624588049865, - "grad_norm": 0.09733956307172775, - "learning_rate": 0.00018845925925925928, - "loss": 0.204, - "step": 2280 - }, - { - "epoch": 0.21875149257295698, - "grad_norm": 0.11999070644378662, - "learning_rate": 0.00018831111111111112, - "loss": 0.2055, - "step": 2290 - }, - { - "epoch": 0.21970673926541529, - "grad_norm": 0.4675360918045044, - "learning_rate": 0.00018816296296296297, - "loss": 0.2052, - "step": 2300 - }, - { - "epoch": 0.22066198595787362, - "grad_norm": 0.29119136929512024, - "learning_rate": 0.00018801481481481482, - "loss": 0.2082, - "step": 2310 - }, - { - "epoch": 0.22161723265033195, - "grad_norm": 0.14248254895210266, - "learning_rate": 0.0001878666666666667, - "loss": 0.2044, - "step": 2320 - }, - { - "epoch": 0.22257247934279029, - "grad_norm": 0.12034345418214798, - "learning_rate": 0.00018771851851851853, - "loss": 0.203, - "step": 2330 - }, - { - "epoch": 0.2235277260352486, - "grad_norm": 0.15301008522510529, - "learning_rate": 0.00018757037037037038, - "loss": 0.2055, - "step": 2340 - }, - { - "epoch": 0.22448297272770693, - "grad_norm": 0.5262898206710815, - "learning_rate": 0.00018742222222222223, - "loss": 0.2058, - "step": 2350 - }, - { - "epoch": 0.22543821942016526, - "grad_norm": 0.3706663250923157, - "learning_rate": 0.0001872740740740741, - "loss": 0.2095, - "step": 2360 - }, - { - "epoch": 0.2263934661126236, - "grad_norm": 0.12153764069080353, - "learning_rate": 0.00018712592592592592, - "loss": 0.2026, - "step": 2370 - }, - { - "epoch": 0.22734871280508193, - "grad_norm": 0.133193701505661, - "learning_rate": 0.0001869777777777778, - "loss": 0.2016, - "step": 2380 - }, - { - "epoch": 0.22830395949754023, - "grad_norm": 0.1649506688117981, - "learning_rate": 0.00018682962962962964, - "loss": 0.2047, - "step": 2390 - }, - { - "epoch": 0.22925920618999857, - "grad_norm": 0.5738644003868103, - "learning_rate": 0.00018668148148148149, - "loss": 0.2036, - "step": 2400 - }, - { - "epoch": 0.2302144528824569, - "grad_norm": 6.20187520980835, - "learning_rate": 0.00018653333333333333, - "loss": 0.2537, - "step": 2410 - }, - { - "epoch": 0.23116969957491523, - "grad_norm": 0.09397050738334656, - "learning_rate": 0.0001863851851851852, - "loss": 0.2093, - "step": 2420 - }, - { - "epoch": 0.23212494626737354, - "grad_norm": 0.13936394453048706, - "learning_rate": 0.00018623703703703705, - "loss": 0.2016, - "step": 2430 - }, - { - "epoch": 0.23308019295983187, - "grad_norm": 0.16786976158618927, - "learning_rate": 0.0001860888888888889, - "loss": 0.2043, - "step": 2440 - }, - { - "epoch": 0.2340354396522902, - "grad_norm": 0.4057718813419342, - "learning_rate": 0.00018594074074074074, - "loss": 0.2074, - "step": 2450 - }, - { - "epoch": 0.23499068634474854, - "grad_norm": 0.22006480395793915, - "learning_rate": 0.00018579259259259262, - "loss": 0.2139, - "step": 2460 - }, - { - "epoch": 0.23594593303720685, - "grad_norm": 0.11016175895929337, - "learning_rate": 0.00018564444444444444, - "loss": 0.2029, - "step": 2470 - }, - { - "epoch": 0.23690117972966518, - "grad_norm": 0.11153749376535416, - "learning_rate": 0.0001854962962962963, - "loss": 0.202, - "step": 2480 - }, - { - "epoch": 0.2378564264221235, - "grad_norm": 0.1272597759962082, - "learning_rate": 0.00018534814814814816, - "loss": 0.2051, - "step": 2490 - }, - { - "epoch": 0.23881167311458185, - "grad_norm": 0.5681122541427612, - "learning_rate": 0.00018520000000000003, - "loss": 0.2063, - "step": 2500 - }, - { - "epoch": 0.23976691980704018, - "grad_norm": 0.23601225018501282, - "learning_rate": 0.00018505185185185185, - "loss": 0.2173, - "step": 2510 - }, - { - "epoch": 0.24072216649949849, - "grad_norm": 0.14005862176418304, - "learning_rate": 0.00018490370370370372, - "loss": 0.2022, - "step": 2520 - }, - { - "epoch": 0.24167741319195682, - "grad_norm": 0.12287179380655289, - "learning_rate": 0.00018475555555555557, - "loss": 0.2039, - "step": 2530 - }, - { - "epoch": 0.24263265988441515, - "grad_norm": 0.15546666085720062, - "learning_rate": 0.00018460740740740741, - "loss": 0.2045, - "step": 2540 - }, - { - "epoch": 0.2435879065768735, - "grad_norm": 0.5308989882469177, - "learning_rate": 0.00018445925925925926, - "loss": 0.2047, - "step": 2550 - }, - { - "epoch": 0.2445431532693318, - "grad_norm": 0.1517964005470276, - "learning_rate": 0.00018431111111111113, - "loss": 0.212, - "step": 2560 - }, - { - "epoch": 0.24549839996179013, - "grad_norm": 0.09176363050937653, - "learning_rate": 0.00018416296296296298, - "loss": 0.2003, - "step": 2570 - }, - { - "epoch": 0.24645364665424846, - "grad_norm": 0.11431318521499634, - "learning_rate": 0.00018401481481481483, - "loss": 0.2005, - "step": 2580 - }, - { - "epoch": 0.2474088933467068, - "grad_norm": 0.13079893589019775, - "learning_rate": 0.00018386666666666667, - "loss": 0.2037, - "step": 2590 - }, - { - "epoch": 0.24836414003916513, - "grad_norm": 0.47278907895088196, - "learning_rate": 0.00018371851851851855, - "loss": 0.2054, - "step": 2600 - }, - { - "epoch": 0.24931938673162343, - "grad_norm": 0.24692212045192719, - "learning_rate": 0.00018357037037037037, - "loss": 0.2116, - "step": 2610 - }, - { - "epoch": 0.2502746334240818, - "grad_norm": 0.1928204596042633, - "learning_rate": 0.00018342222222222224, - "loss": 0.2057, - "step": 2620 - }, - { - "epoch": 0.25122988011654007, - "grad_norm": 0.1608019471168518, - "learning_rate": 0.00018327407407407408, - "loss": 0.2027, - "step": 2630 - }, - { - "epoch": 0.2521851268089984, - "grad_norm": 0.09996878355741501, - "learning_rate": 0.00018312592592592596, - "loss": 0.2036, - "step": 2640 - }, - { - "epoch": 0.25314037350145674, - "grad_norm": 0.45600563287734985, - "learning_rate": 0.00018297777777777778, - "loss": 0.2054, - "step": 2650 - }, - { - "epoch": 0.2540956201939151, - "grad_norm": 0.3103402256965637, - "learning_rate": 0.00018282962962962965, - "loss": 0.2107, - "step": 2660 - }, - { - "epoch": 0.2550508668863734, - "grad_norm": 0.12778714299201965, - "learning_rate": 0.0001826814814814815, - "loss": 0.2055, - "step": 2670 - }, - { - "epoch": 0.25600611357883174, - "grad_norm": 0.112953320145607, - "learning_rate": 0.00018253333333333334, - "loss": 0.2024, - "step": 2680 - }, - { - "epoch": 0.2569613602712901, - "grad_norm": 0.10482873767614365, - "learning_rate": 0.0001823851851851852, - "loss": 0.204, - "step": 2690 - }, - { - "epoch": 0.2579166069637484, - "grad_norm": 0.606562077999115, - "learning_rate": 0.00018223703703703706, - "loss": 0.2061, - "step": 2700 - }, - { - "epoch": 0.25887185365620674, - "grad_norm": 0.3492737412452698, - "learning_rate": 0.00018208888888888888, - "loss": 0.2109, - "step": 2710 - }, - { - "epoch": 0.259827100348665, - "grad_norm": 0.12048076838254929, - "learning_rate": 0.00018194074074074076, - "loss": 0.2046, - "step": 2720 - }, - { - "epoch": 0.26078234704112335, - "grad_norm": 0.1138598620891571, - "learning_rate": 0.0001817925925925926, - "loss": 0.2021, - "step": 2730 - }, - { - "epoch": 0.2617375937335817, - "grad_norm": 0.12291901558637619, - "learning_rate": 0.00018164444444444447, - "loss": 0.2022, - "step": 2740 - }, - { - "epoch": 0.26269284042604, - "grad_norm": 0.43753868341445923, - "learning_rate": 0.0001814962962962963, - "loss": 0.2047, - "step": 2750 - }, - { - "epoch": 0.26364808711849835, - "grad_norm": 0.21602752804756165, - "learning_rate": 0.00018134814814814817, - "loss": 0.2078, - "step": 2760 - }, - { - "epoch": 0.2646033338109567, - "grad_norm": 0.13231903314590454, - "learning_rate": 0.0001812, - "loss": 0.2022, - "step": 2770 - }, - { - "epoch": 0.265558580503415, - "grad_norm": 0.11796011030673981, - "learning_rate": 0.00018105185185185186, - "loss": 0.2019, - "step": 2780 - }, - { - "epoch": 0.26651382719587335, - "grad_norm": 0.13643573224544525, - "learning_rate": 0.0001809037037037037, - "loss": 0.2027, - "step": 2790 - }, - { - "epoch": 0.2674690738883317, - "grad_norm": 0.44247397780418396, - "learning_rate": 0.00018075555555555558, - "loss": 0.2038, - "step": 2800 - }, - { - "epoch": 0.26842432058078997, - "grad_norm": 0.3825100064277649, - "learning_rate": 0.00018060740740740743, - "loss": 0.2069, - "step": 2810 - }, - { - "epoch": 0.2693795672732483, - "grad_norm": 0.195270374417305, - "learning_rate": 0.00018045925925925927, - "loss": 0.2043, - "step": 2820 - }, - { - "epoch": 0.27033481396570663, - "grad_norm": 0.12330356240272522, - "learning_rate": 0.00018031111111111112, - "loss": 0.202, - "step": 2830 - }, - { - "epoch": 0.27129006065816497, - "grad_norm": 0.1486448496580124, - "learning_rate": 0.000180162962962963, - "loss": 0.2046, - "step": 2840 - }, - { - "epoch": 0.2722453073506233, - "grad_norm": 0.4175782799720764, - "learning_rate": 0.0001800148148148148, - "loss": 0.203, - "step": 2850 - }, - { - "epoch": 0.27320055404308163, - "grad_norm": 0.15270055830478668, - "learning_rate": 0.00017986666666666668, - "loss": 0.2134, - "step": 2860 - }, - { - "epoch": 0.27415580073553997, - "grad_norm": 0.14655576646327972, - "learning_rate": 0.00017971851851851853, - "loss": 0.2006, - "step": 2870 - }, - { - "epoch": 0.2751110474279983, - "grad_norm": 0.1165398582816124, - "learning_rate": 0.00017957037037037038, - "loss": 0.2017, - "step": 2880 - }, - { - "epoch": 0.27606629412045663, - "grad_norm": 0.1337936520576477, - "learning_rate": 0.00017942222222222222, - "loss": 0.2036, - "step": 2890 - }, - { - "epoch": 0.2770215408129149, - "grad_norm": 0.38786137104034424, - "learning_rate": 0.0001792740740740741, - "loss": 0.2046, - "step": 2900 - }, - { - "epoch": 0.27797678750537325, - "grad_norm": 0.2964805066585541, - "learning_rate": 0.00017912592592592594, - "loss": 0.2087, - "step": 2910 - }, - { - "epoch": 0.2789320341978316, - "grad_norm": 0.1085568442940712, - "learning_rate": 0.0001789777777777778, - "loss": 0.2052, - "step": 2920 - }, - { - "epoch": 0.2798872808902899, - "grad_norm": 0.10278130322694778, - "learning_rate": 0.00017882962962962963, - "loss": 0.1993, - "step": 2930 - }, - { - "epoch": 0.28084252758274825, - "grad_norm": 0.13532647490501404, - "learning_rate": 0.00017868148148148148, - "loss": 0.2038, - "step": 2940 - }, - { - "epoch": 0.2817977742752066, - "grad_norm": 0.3095054626464844, - "learning_rate": 0.00017853333333333335, - "loss": 0.2065, - "step": 2950 - }, - { - "epoch": 0.2827530209676649, - "grad_norm": 0.2533681392669678, - "learning_rate": 0.0001783851851851852, - "loss": 0.2086, - "step": 2960 - }, - { - "epoch": 0.28370826766012325, - "grad_norm": 0.12914352118968964, - "learning_rate": 0.00017823703703703705, - "loss": 0.2062, - "step": 2970 - }, - { - "epoch": 0.2846635143525816, - "grad_norm": 0.12468916922807693, - "learning_rate": 0.0001780888888888889, - "loss": 0.2001, - "step": 2980 - }, - { - "epoch": 0.28561876104503986, - "grad_norm": 0.24026978015899658, - "learning_rate": 0.00017794074074074074, - "loss": 0.2039, - "step": 2990 - }, - { - "epoch": 0.2865740077374982, - "grad_norm": 0.35038548707962036, - "learning_rate": 0.0001777925925925926, - "loss": 0.2043, - "step": 3000 - }, - { - "epoch": 0.2875292544299565, - "grad_norm": 0.16599488258361816, - "learning_rate": 0.00017764444444444446, - "loss": 0.2043, - "step": 3010 - }, - { - "epoch": 0.28848450112241486, - "grad_norm": 0.17750556766986847, - "learning_rate": 0.0001774962962962963, - "loss": 0.202, - "step": 3020 - }, - { - "epoch": 0.2894397478148732, - "grad_norm": 0.10757072269916534, - "learning_rate": 0.00017734814814814815, - "loss": 0.2009, - "step": 3030 - }, - { - "epoch": 0.2903949945073315, - "grad_norm": 0.17457428574562073, - "learning_rate": 0.0001772, - "loss": 0.205, - "step": 3040 - }, - { - "epoch": 0.29135024119978986, - "grad_norm": 0.4000655710697174, - "learning_rate": 0.00017705185185185187, - "loss": 0.2048, - "step": 3050 - }, - { - "epoch": 0.2923054878922482, - "grad_norm": 0.3278186619281769, - "learning_rate": 0.00017690370370370372, - "loss": 0.2026, - "step": 3060 - }, - { - "epoch": 0.2932607345847065, - "grad_norm": 0.09013079106807709, - "learning_rate": 0.00017675555555555556, - "loss": 0.2039, - "step": 3070 - }, - { - "epoch": 0.2942159812771648, - "grad_norm": 0.11759201437234879, - "learning_rate": 0.0001766074074074074, - "loss": 0.2014, - "step": 3080 - }, - { - "epoch": 0.29517122796962314, - "grad_norm": 0.17286565899848938, - "learning_rate": 0.00017645925925925926, - "loss": 0.2049, - "step": 3090 - }, - { - "epoch": 0.2961264746620815, - "grad_norm": 0.3381953835487366, - "learning_rate": 0.00017631111111111113, - "loss": 0.2038, - "step": 3100 - }, - { - "epoch": 0.2970817213545398, - "grad_norm": 0.1709469109773636, - "learning_rate": 0.00017616296296296298, - "loss": 0.2018, - "step": 3110 - }, - { - "epoch": 0.29803696804699814, - "grad_norm": 0.1467283070087433, - "learning_rate": 0.00017601481481481482, - "loss": 0.1997, - "step": 3120 - }, - { - "epoch": 0.2989922147394565, - "grad_norm": 0.13545508682727814, - "learning_rate": 0.00017586666666666667, - "loss": 0.2009, - "step": 3130 - }, - { - "epoch": 0.2999474614319148, - "grad_norm": 0.19185511767864227, - "learning_rate": 0.00017571851851851851, - "loss": 0.2045, - "step": 3140 - }, - { - "epoch": 0.30090270812437314, - "grad_norm": 0.3416673541069031, - "learning_rate": 0.0001755703703703704, - "loss": 0.2063, - "step": 3150 - }, - { - "epoch": 0.3018579548168314, - "grad_norm": 0.14955377578735352, - "learning_rate": 0.00017542222222222223, - "loss": 0.2017, - "step": 3160 - }, - { - "epoch": 0.30281320150928975, - "grad_norm": 0.10988181084394455, - "learning_rate": 0.00017527407407407408, - "loss": 0.2016, - "step": 3170 - }, - { - "epoch": 0.3037684482017481, - "grad_norm": 0.17967171967029572, - "learning_rate": 0.00017512592592592593, - "loss": 0.2004, - "step": 3180 - }, - { - "epoch": 0.3047236948942064, - "grad_norm": 0.19823792576789856, - "learning_rate": 0.0001749777777777778, - "loss": 0.2023, - "step": 3190 - }, - { - "epoch": 0.30567894158666475, - "grad_norm": 0.49947112798690796, - "learning_rate": 0.00017482962962962962, - "loss": 0.205, - "step": 3200 - }, - { - "epoch": 0.3066341882791231, - "grad_norm": 0.272572785615921, - "learning_rate": 0.0001746814814814815, - "loss": 0.2051, - "step": 3210 - }, - { - "epoch": 0.3075894349715814, - "grad_norm": 0.1181032732129097, - "learning_rate": 0.00017453333333333334, - "loss": 0.2034, - "step": 3220 - }, - { - "epoch": 0.30854468166403975, - "grad_norm": 0.10932394117116928, - "learning_rate": 0.00017438518518518518, - "loss": 0.2014, - "step": 3230 - }, - { - "epoch": 0.3094999283564981, - "grad_norm": 0.1775374561548233, - "learning_rate": 0.00017423703703703703, - "loss": 0.2058, - "step": 3240 - }, - { - "epoch": 0.31045517504895637, - "grad_norm": 0.39363420009613037, - "learning_rate": 0.0001740888888888889, - "loss": 0.2061, - "step": 3250 - }, - { - "epoch": 0.3114104217414147, - "grad_norm": 0.2556678354740143, - "learning_rate": 0.00017394074074074075, - "loss": 0.2036, - "step": 3260 - }, - { - "epoch": 0.31236566843387303, - "grad_norm": 0.1335153728723526, - "learning_rate": 0.0001737925925925926, - "loss": 0.1994, - "step": 3270 - }, - { - "epoch": 0.31332091512633137, - "grad_norm": 0.1207786276936531, - "learning_rate": 0.00017364444444444444, - "loss": 0.2013, - "step": 3280 - }, - { - "epoch": 0.3142761618187897, - "grad_norm": 0.18867254257202148, - "learning_rate": 0.00017349629629629632, - "loss": 0.2022, - "step": 3290 - }, - { - "epoch": 0.31523140851124803, - "grad_norm": 0.4312264919281006, - "learning_rate": 0.00017334814814814814, - "loss": 0.2052, - "step": 3300 - }, - { - "epoch": 0.31618665520370637, - "grad_norm": 0.2023773193359375, - "learning_rate": 0.0001732, - "loss": 0.1996, - "step": 3310 - }, - { - "epoch": 0.3171419018961647, - "grad_norm": 0.13800661265850067, - "learning_rate": 0.00017305185185185185, - "loss": 0.2029, - "step": 3320 - }, - { - "epoch": 0.31809714858862304, - "grad_norm": 0.12555184960365295, - "learning_rate": 0.0001729037037037037, - "loss": 0.2014, - "step": 3330 - }, - { - "epoch": 0.3190523952810813, - "grad_norm": 0.1472926288843155, - "learning_rate": 0.00017275555555555555, - "loss": 0.2051, - "step": 3340 - }, - { - "epoch": 0.32000764197353965, - "grad_norm": 0.4507752060890198, - "learning_rate": 0.00017260740740740742, - "loss": 0.2041, - "step": 3350 - }, - { - "epoch": 0.320962888665998, - "grad_norm": 0.42336827516555786, - "learning_rate": 0.00017245925925925927, - "loss": 0.2079, - "step": 3360 - }, - { - "epoch": 0.3219181353584563, - "grad_norm": 0.09905827045440674, - "learning_rate": 0.0001723111111111111, - "loss": 0.2011, - "step": 3370 - }, - { - "epoch": 0.32287338205091465, - "grad_norm": 0.09226644784212112, - "learning_rate": 0.00017216296296296296, - "loss": 0.2, - "step": 3380 - }, - { - "epoch": 0.323828628743373, - "grad_norm": 0.11263467371463776, - "learning_rate": 0.00017201481481481483, - "loss": 0.2053, - "step": 3390 - }, - { - "epoch": 0.3247838754358313, - "grad_norm": 0.2874763607978821, - "learning_rate": 0.00017186666666666665, - "loss": 0.2063, - "step": 3400 - }, - { - "epoch": 0.32573912212828965, - "grad_norm": 0.13759906589984894, - "learning_rate": 0.00017171851851851853, - "loss": 0.2038, - "step": 3410 - }, - { - "epoch": 0.326694368820748, - "grad_norm": 0.10311861336231232, - "learning_rate": 0.00017157037037037037, - "loss": 0.1998, - "step": 3420 - }, - { - "epoch": 0.32764961551320626, - "grad_norm": 0.10236906260251999, - "learning_rate": 0.00017142222222222224, - "loss": 0.2007, - "step": 3430 - }, - { - "epoch": 0.3286048622056646, - "grad_norm": 0.14640459418296814, - "learning_rate": 0.00017127407407407406, - "loss": 0.2034, - "step": 3440 - }, - { - "epoch": 0.3295601088981229, - "grad_norm": 0.4186616837978363, - "learning_rate": 0.00017112592592592594, - "loss": 0.2063, - "step": 3450 - }, - { - "epoch": 0.33051535559058126, - "grad_norm": 0.210789293050766, - "learning_rate": 0.00017097777777777778, - "loss": 0.2043, - "step": 3460 - }, - { - "epoch": 0.3314706022830396, - "grad_norm": 0.09906265139579773, - "learning_rate": 0.00017082962962962963, - "loss": 0.199, - "step": 3470 - }, - { - "epoch": 0.33242584897549793, - "grad_norm": 0.10522822290658951, - "learning_rate": 0.00017068148148148148, - "loss": 0.2008, - "step": 3480 - }, - { - "epoch": 0.33338109566795626, - "grad_norm": 0.13106843829154968, - "learning_rate": 0.00017053333333333335, - "loss": 0.2067, - "step": 3490 - }, - { - "epoch": 0.3343363423604146, - "grad_norm": 0.3762379586696625, - "learning_rate": 0.0001703851851851852, - "loss": 0.2059, - "step": 3500 - }, - { - "epoch": 0.33529158905287293, - "grad_norm": 0.17082005739212036, - "learning_rate": 0.00017023703703703704, - "loss": 0.2124, - "step": 3510 - }, - { - "epoch": 0.3362468357453312, - "grad_norm": 0.0899599939584732, - "learning_rate": 0.0001700888888888889, - "loss": 0.2037, - "step": 3520 - }, - { - "epoch": 0.33720208243778954, - "grad_norm": 0.10781609266996384, - "learning_rate": 0.00016994074074074076, - "loss": 0.2021, - "step": 3530 - }, - { - "epoch": 0.3381573291302479, - "grad_norm": 0.1443158984184265, - "learning_rate": 0.00016979259259259258, - "loss": 0.2045, - "step": 3540 - }, - { - "epoch": 0.3391125758227062, - "grad_norm": 0.3108382523059845, - "learning_rate": 0.00016964444444444445, - "loss": 0.2066, - "step": 3550 - }, - { - "epoch": 0.34006782251516454, - "grad_norm": 0.11456870287656784, - "learning_rate": 0.0001694962962962963, - "loss": 0.2127, - "step": 3560 - }, - { - "epoch": 0.3410230692076229, - "grad_norm": 0.14917728304862976, - "learning_rate": 0.00016934814814814817, - "loss": 0.2035, - "step": 3570 - }, - { - "epoch": 0.3419783159000812, - "grad_norm": 0.12674477696418762, - "learning_rate": 0.0001692, - "loss": 0.2029, - "step": 3580 - }, - { - "epoch": 0.34293356259253954, - "grad_norm": 0.14471426606178284, - "learning_rate": 0.00016905185185185187, - "loss": 0.2047, - "step": 3590 - }, - { - "epoch": 0.3438888092849979, - "grad_norm": 0.34318575263023376, - "learning_rate": 0.0001689037037037037, - "loss": 0.207, - "step": 3600 - }, - { - "epoch": 0.34484405597745615, - "grad_norm": 0.27254998683929443, - "learning_rate": 0.00016875555555555556, - "loss": 0.2059, - "step": 3610 - }, - { - "epoch": 0.3457993026699145, - "grad_norm": 0.11677141487598419, - "learning_rate": 0.0001686074074074074, - "loss": 0.2052, - "step": 3620 - }, - { - "epoch": 0.3467545493623728, - "grad_norm": 0.10886923968791962, - "learning_rate": 0.00016845925925925928, - "loss": 0.2032, - "step": 3630 - }, - { - "epoch": 0.34770979605483116, - "grad_norm": 0.146949902176857, - "learning_rate": 0.0001683111111111111, - "loss": 0.2044, - "step": 3640 - }, - { - "epoch": 0.3486650427472895, - "grad_norm": 0.37153467535972595, - "learning_rate": 0.00016816296296296297, - "loss": 0.2055, - "step": 3650 - }, - { - "epoch": 0.3496202894397478, - "grad_norm": 0.25440579652786255, - "learning_rate": 0.00016801481481481482, - "loss": 0.209, - "step": 3660 - }, - { - "epoch": 0.35057553613220616, - "grad_norm": 0.0973072350025177, - "learning_rate": 0.0001678666666666667, - "loss": 0.2055, - "step": 3670 - }, - { - "epoch": 0.3515307828246645, - "grad_norm": 0.11460244655609131, - "learning_rate": 0.0001677185185185185, - "loss": 0.2023, - "step": 3680 - }, - { - "epoch": 0.3524860295171228, - "grad_norm": 0.16661228239536285, - "learning_rate": 0.00016757037037037038, - "loss": 0.2052, - "step": 3690 - }, - { - "epoch": 0.3534412762095811, - "grad_norm": 0.37548327445983887, - "learning_rate": 0.00016742222222222223, - "loss": 0.206, - "step": 3700 - }, - { - "epoch": 0.35439652290203943, - "grad_norm": 0.2567369341850281, - "learning_rate": 0.00016727407407407408, - "loss": 0.2062, - "step": 3710 - }, - { - "epoch": 0.35535176959449777, - "grad_norm": 0.1041766032576561, - "learning_rate": 0.00016712592592592592, - "loss": 0.2012, - "step": 3720 - }, - { - "epoch": 0.3563070162869561, - "grad_norm": 0.12674827873706818, - "learning_rate": 0.0001669777777777778, - "loss": 0.2037, - "step": 3730 - }, - { - "epoch": 0.35726226297941444, - "grad_norm": 0.1635134369134903, - "learning_rate": 0.00016682962962962964, - "loss": 0.2049, - "step": 3740 - }, - { - "epoch": 0.35821750967187277, - "grad_norm": 0.371850848197937, - "learning_rate": 0.0001666814814814815, - "loss": 0.2066, - "step": 3750 - }, - { - "epoch": 0.3591727563643311, - "grad_norm": 0.11050805449485779, - "learning_rate": 0.00016653333333333333, - "loss": 0.2049, - "step": 3760 - }, - { - "epoch": 0.36012800305678944, - "grad_norm": 0.12971824407577515, - "learning_rate": 0.0001663851851851852, - "loss": 0.2003, - "step": 3770 - }, - { - "epoch": 0.36108324974924777, - "grad_norm": 0.10030124336481094, - "learning_rate": 0.00016623703703703703, - "loss": 0.2025, - "step": 3780 - }, - { - "epoch": 0.36203849644170605, - "grad_norm": 0.15601076185703278, - "learning_rate": 0.0001660888888888889, - "loss": 0.2039, - "step": 3790 - }, - { - "epoch": 0.3629937431341644, - "grad_norm": 0.3648783266544342, - "learning_rate": 0.00016594074074074075, - "loss": 0.2058, - "step": 3800 - }, - { - "epoch": 0.3639489898266227, - "grad_norm": 0.23194488883018494, - "learning_rate": 0.00016579259259259262, - "loss": 0.2057, - "step": 3810 - }, - { - "epoch": 0.36490423651908105, - "grad_norm": 0.1057933121919632, - "learning_rate": 0.00016564444444444444, - "loss": 0.2067, - "step": 3820 - }, - { - "epoch": 0.3658594832115394, - "grad_norm": 0.11324844509363174, - "learning_rate": 0.0001654962962962963, - "loss": 0.2021, - "step": 3830 - }, - { - "epoch": 0.3668147299039977, - "grad_norm": 0.14048735797405243, - "learning_rate": 0.00016534814814814816, - "loss": 0.2055, - "step": 3840 - }, - { - "epoch": 0.36776997659645605, - "grad_norm": 0.36952847242355347, - "learning_rate": 0.0001652, - "loss": 0.2086, - "step": 3850 - }, - { - "epoch": 0.3687252232889144, - "grad_norm": 0.3616020083427429, - "learning_rate": 0.00016505185185185185, - "loss": 0.2036, - "step": 3860 - }, - { - "epoch": 0.3696804699813727, - "grad_norm": 0.12143600732088089, - "learning_rate": 0.00016490370370370372, - "loss": 0.2025, - "step": 3870 - }, - { - "epoch": 0.370635716673831, - "grad_norm": 0.13124673068523407, - "learning_rate": 0.00016475555555555557, - "loss": 0.2016, - "step": 3880 - }, - { - "epoch": 0.37159096336628933, - "grad_norm": 0.1724298894405365, - "learning_rate": 0.00016460740740740742, - "loss": 0.2038, - "step": 3890 - }, - { - "epoch": 0.37254621005874766, - "grad_norm": 0.32873499393463135, - "learning_rate": 0.00016445925925925926, - "loss": 0.205, - "step": 3900 - }, - { - "epoch": 0.373501456751206, - "grad_norm": 0.26445272564888, - "learning_rate": 0.00016431111111111114, - "loss": 0.2064, - "step": 3910 - }, - { - "epoch": 0.37445670344366433, - "grad_norm": 0.11803455650806427, - "learning_rate": 0.00016416296296296295, - "loss": 0.203, - "step": 3920 - }, - { - "epoch": 0.37541195013612266, - "grad_norm": 0.13132309913635254, - "learning_rate": 0.00016401481481481483, - "loss": 0.1999, - "step": 3930 - }, - { - "epoch": 0.376367196828581, - "grad_norm": 0.14737923443317413, - "learning_rate": 0.00016386666666666667, - "loss": 0.2034, - "step": 3940 - }, - { - "epoch": 0.37732244352103933, - "grad_norm": 0.357509583234787, - "learning_rate": 0.00016371851851851852, - "loss": 0.2051, - "step": 3950 - }, - { - "epoch": 0.3782776902134976, - "grad_norm": 0.17359203100204468, - "learning_rate": 0.00016357037037037037, - "loss": 0.203, - "step": 3960 - }, - { - "epoch": 0.37923293690595594, - "grad_norm": 0.1117023155093193, - "learning_rate": 0.00016342222222222224, - "loss": 0.2033, - "step": 3970 - }, - { - "epoch": 0.3801881835984143, - "grad_norm": 0.10505598783493042, - "learning_rate": 0.00016327407407407409, - "loss": 0.2018, - "step": 3980 - }, - { - "epoch": 0.3811434302908726, - "grad_norm": 0.17457084357738495, - "learning_rate": 0.00016312592592592593, - "loss": 0.2061, - "step": 3990 - }, - { - "epoch": 0.38209867698333094, - "grad_norm": 0.3120948076248169, - "learning_rate": 0.00016297777777777778, - "loss": 0.2062, - "step": 4000 - }, - { - "epoch": 0.3830539236757893, - "grad_norm": 0.37100452184677124, - "learning_rate": 0.00016282962962962965, - "loss": 0.2052, - "step": 4010 - }, - { - "epoch": 0.3840091703682476, - "grad_norm": 0.08743845671415329, - "learning_rate": 0.00016268148148148147, - "loss": 0.2018, - "step": 4020 - }, - { - "epoch": 0.38496441706070594, - "grad_norm": 0.1295013129711151, - "learning_rate": 0.00016253333333333334, - "loss": 0.2003, - "step": 4030 - }, - { - "epoch": 0.3859196637531643, - "grad_norm": 0.15588463842868805, - "learning_rate": 0.0001623851851851852, - "loss": 0.205, - "step": 4040 - }, - { - "epoch": 0.38687491044562256, - "grad_norm": 0.3880206346511841, - "learning_rate": 0.00016223703703703706, - "loss": 0.2031, - "step": 4050 - }, - { - "epoch": 0.3878301571380809, - "grad_norm": 0.13966549932956696, - "learning_rate": 0.00016208888888888888, - "loss": 0.2134, - "step": 4060 - }, - { - "epoch": 0.3887854038305392, - "grad_norm": 0.11811528354883194, - "learning_rate": 0.00016194074074074076, - "loss": 0.1983, - "step": 4070 - }, - { - "epoch": 0.38974065052299756, - "grad_norm": 0.14821895956993103, - "learning_rate": 0.0001617925925925926, - "loss": 0.202, - "step": 4080 - }, - { - "epoch": 0.3906958972154559, - "grad_norm": 0.12111028283834457, - "learning_rate": 0.00016164444444444445, - "loss": 0.2039, - "step": 4090 - }, - { - "epoch": 0.3916511439079142, - "grad_norm": 0.371774822473526, - "learning_rate": 0.0001614962962962963, - "loss": 0.2047, - "step": 4100 - }, - { - "epoch": 0.39260639060037256, - "grad_norm": 0.3905799090862274, - "learning_rate": 0.00016134814814814817, - "loss": 0.2191, - "step": 4110 - }, - { - "epoch": 0.3935616372928309, - "grad_norm": 0.13462865352630615, - "learning_rate": 0.00016120000000000002, - "loss": 0.2046, - "step": 4120 - }, - { - "epoch": 0.3945168839852892, - "grad_norm": 0.12144972383975983, - "learning_rate": 0.00016105185185185186, - "loss": 0.2023, - "step": 4130 - }, - { - "epoch": 0.3954721306777475, - "grad_norm": 0.14258955419063568, - "learning_rate": 0.0001609037037037037, - "loss": 0.2044, - "step": 4140 - }, - { - "epoch": 0.39642737737020584, - "grad_norm": 0.32331934571266174, - "learning_rate": 0.00016075555555555558, - "loss": 0.2035, - "step": 4150 - }, - { - "epoch": 0.39738262406266417, - "grad_norm": 0.2566055655479431, - "learning_rate": 0.0001606074074074074, - "loss": 0.2116, - "step": 4160 - }, - { - "epoch": 0.3983378707551225, - "grad_norm": 0.12487474083900452, - "learning_rate": 0.00016045925925925927, - "loss": 0.2053, - "step": 4170 - }, - { - "epoch": 0.39929311744758084, - "grad_norm": 0.137589693069458, - "learning_rate": 0.00016031111111111112, - "loss": 0.2021, - "step": 4180 - }, - { - "epoch": 0.40024836414003917, - "grad_norm": 0.12724049389362335, - "learning_rate": 0.000160162962962963, - "loss": 0.2045, - "step": 4190 - }, - { - "epoch": 0.4012036108324975, - "grad_norm": 0.3219785690307617, - "learning_rate": 0.0001600148148148148, - "loss": 0.2057, - "step": 4200 - }, - { - "epoch": 0.40215885752495584, - "grad_norm": 0.12036234885454178, - "learning_rate": 0.00015986666666666669, - "loss": 0.2069, - "step": 4210 - }, - { - "epoch": 0.40311410421741417, - "grad_norm": 0.10740282386541367, - "learning_rate": 0.00015971851851851853, - "loss": 0.2008, - "step": 4220 - }, - { - "epoch": 0.40406935090987245, - "grad_norm": 0.12133664637804031, - "learning_rate": 0.00015957037037037038, - "loss": 0.2014, - "step": 4230 - }, - { - "epoch": 0.4050245976023308, - "grad_norm": 0.17164857685565948, - "learning_rate": 0.00015942222222222222, - "loss": 0.2034, - "step": 4240 - }, - { - "epoch": 0.4059798442947891, - "grad_norm": 0.46345826983451843, - "learning_rate": 0.0001592740740740741, - "loss": 0.2045, - "step": 4250 - }, - { - "epoch": 0.40693509098724745, - "grad_norm": 0.20649810135364532, - "learning_rate": 0.00015912592592592592, - "loss": 0.2111, - "step": 4260 - }, - { - "epoch": 0.4078903376797058, - "grad_norm": 0.08920012414455414, - "learning_rate": 0.0001589777777777778, - "loss": 0.2024, - "step": 4270 - }, - { - "epoch": 0.4088455843721641, - "grad_norm": 0.11577285826206207, - "learning_rate": 0.00015882962962962964, - "loss": 0.1994, - "step": 4280 - }, - { - "epoch": 0.40980083106462245, - "grad_norm": 0.16047626733779907, - "learning_rate": 0.0001586814814814815, - "loss": 0.2037, - "step": 4290 - }, - { - "epoch": 0.4107560777570808, - "grad_norm": 0.2667694687843323, - "learning_rate": 0.00015853333333333333, - "loss": 0.2054, - "step": 4300 - }, - { - "epoch": 0.4117113244495391, - "grad_norm": 0.1745455265045166, - "learning_rate": 0.0001583851851851852, - "loss": 0.2097, - "step": 4310 - }, - { - "epoch": 0.4126665711419974, - "grad_norm": 0.10706843435764313, - "learning_rate": 0.00015823703703703705, - "loss": 0.2037, - "step": 4320 - }, - { - "epoch": 0.41362181783445573, - "grad_norm": 0.12602153420448303, - "learning_rate": 0.0001580888888888889, - "loss": 0.203, - "step": 4330 - }, - { - "epoch": 0.41457706452691406, - "grad_norm": 0.1355423778295517, - "learning_rate": 0.00015794074074074074, - "loss": 0.205, - "step": 4340 - }, - { - "epoch": 0.4155323112193724, - "grad_norm": 0.4957958459854126, - "learning_rate": 0.00015779259259259261, - "loss": 0.2059, - "step": 4350 - }, - { - "epoch": 0.41648755791183073, - "grad_norm": 0.17122064530849457, - "learning_rate": 0.00015764444444444446, - "loss": 0.207, - "step": 4360 - }, - { - "epoch": 0.41744280460428906, - "grad_norm": 0.09928039461374283, - "learning_rate": 0.0001574962962962963, - "loss": 0.1999, - "step": 4370 - }, - { - "epoch": 0.4183980512967474, - "grad_norm": 0.09767697006464005, - "learning_rate": 0.00015734814814814815, - "loss": 0.1991, - "step": 4380 - }, - { - "epoch": 0.41935329798920573, - "grad_norm": 0.1489480435848236, - "learning_rate": 0.00015720000000000003, - "loss": 0.2021, - "step": 4390 - }, - { - "epoch": 0.42030854468166406, - "grad_norm": 0.38458502292633057, - "learning_rate": 0.00015705185185185185, - "loss": 0.2065, - "step": 4400 - }, - { - "epoch": 0.42126379137412234, - "grad_norm": 0.2195194512605667, - "learning_rate": 0.00015690370370370372, - "loss": 0.2083, - "step": 4410 - }, - { - "epoch": 0.4222190380665807, - "grad_norm": 0.0922282487154007, - "learning_rate": 0.00015675555555555557, - "loss": 0.2035, - "step": 4420 - }, - { - "epoch": 0.423174284759039, - "grad_norm": 0.12092910706996918, - "learning_rate": 0.00015660740740740744, - "loss": 0.201, - "step": 4430 - }, - { - "epoch": 0.42412953145149734, - "grad_norm": 0.1265036016702652, - "learning_rate": 0.00015645925925925926, - "loss": 0.2041, - "step": 4440 - }, - { - "epoch": 0.4250847781439557, - "grad_norm": 0.36383864283561707, - "learning_rate": 0.00015631111111111113, - "loss": 0.2031, - "step": 4450 - }, - { - "epoch": 0.426040024836414, - "grad_norm": 0.2539485991001129, - "learning_rate": 0.00015616296296296298, - "loss": 0.2136, - "step": 4460 - }, - { - "epoch": 0.42699527152887234, - "grad_norm": 0.09574822336435318, - "learning_rate": 0.00015601481481481482, - "loss": 0.2051, - "step": 4470 - }, - { - "epoch": 0.4279505182213307, - "grad_norm": 0.12107813358306885, - "learning_rate": 0.00015586666666666667, - "loss": 0.2021, - "step": 4480 - }, - { - "epoch": 0.428905764913789, - "grad_norm": 0.14104434847831726, - "learning_rate": 0.00015571851851851854, - "loss": 0.2037, - "step": 4490 - }, - { - "epoch": 0.4298610116062473, - "grad_norm": 0.3190214931964874, - "learning_rate": 0.0001555703703703704, - "loss": 0.2039, - "step": 4500 - }, - { - "epoch": 0.4308162582987056, - "grad_norm": 0.15680533647537231, - "learning_rate": 0.00015542222222222224, - "loss": 0.2124, - "step": 4510 - }, - { - "epoch": 0.43177150499116396, - "grad_norm": 0.14274321496486664, - "learning_rate": 0.00015527407407407408, - "loss": 0.2027, - "step": 4520 - }, - { - "epoch": 0.4327267516836223, - "grad_norm": 0.12968724966049194, - "learning_rate": 0.00015512592592592593, - "loss": 0.2026, - "step": 4530 - }, - { - "epoch": 0.4336819983760806, - "grad_norm": 0.1476454883813858, - "learning_rate": 0.00015497777777777777, - "loss": 0.2044, - "step": 4540 - }, - { - "epoch": 0.43463724506853896, - "grad_norm": 0.3405204117298126, - "learning_rate": 0.00015482962962962965, - "loss": 0.2056, - "step": 4550 - }, - { - "epoch": 0.4355924917609973, - "grad_norm": 0.22096122801303864, - "learning_rate": 0.0001546814814814815, - "loss": 0.2055, - "step": 4560 - }, - { - "epoch": 0.4365477384534556, - "grad_norm": 0.13107611238956451, - "learning_rate": 0.00015453333333333334, - "loss": 0.2018, - "step": 4570 - }, - { - "epoch": 0.43750298514591396, - "grad_norm": 0.11204512417316437, - "learning_rate": 0.00015438518518518519, - "loss": 0.2, - "step": 4580 - }, - { - "epoch": 0.43845823183837224, - "grad_norm": 0.143476665019989, - "learning_rate": 0.00015423703703703706, - "loss": 0.2048, - "step": 4590 - }, - { - "epoch": 0.43941347853083057, - "grad_norm": 0.38437598943710327, - "learning_rate": 0.0001540888888888889, - "loss": 0.2055, - "step": 4600 - }, - { - "epoch": 0.4403687252232889, - "grad_norm": 0.2756338119506836, - "learning_rate": 0.00015394074074074075, - "loss": 0.2076, - "step": 4610 - }, - { - "epoch": 0.44132397191574724, - "grad_norm": 0.09002037346363068, - "learning_rate": 0.0001537925925925926, - "loss": 0.2036, - "step": 4620 - }, - { - "epoch": 0.44227921860820557, - "grad_norm": 0.114268459379673, - "learning_rate": 0.00015364444444444444, - "loss": 0.2011, - "step": 4630 - }, - { - "epoch": 0.4432344653006639, - "grad_norm": 0.12585988640785217, - "learning_rate": 0.0001534962962962963, - "loss": 0.2035, - "step": 4640 - }, - { - "epoch": 0.44418971199312224, - "grad_norm": 0.3231578767299652, - "learning_rate": 0.00015334814814814816, - "loss": 0.2051, - "step": 4650 - }, - { - "epoch": 0.44514495868558057, - "grad_norm": 0.27576273679733276, - "learning_rate": 0.0001532, - "loss": 0.2075, - "step": 4660 - }, - { - "epoch": 0.4461002053780389, - "grad_norm": 0.10367155820131302, - "learning_rate": 0.00015305185185185186, - "loss": 0.202, - "step": 4670 - }, - { - "epoch": 0.4470554520704972, - "grad_norm": 0.12141682952642441, - "learning_rate": 0.0001529037037037037, - "loss": 0.2026, - "step": 4680 - }, - { - "epoch": 0.4480106987629555, - "grad_norm": 0.1607910692691803, - "learning_rate": 0.00015275555555555558, - "loss": 0.2038, - "step": 4690 - }, - { - "epoch": 0.44896594545541385, - "grad_norm": 0.3699163794517517, - "learning_rate": 0.00015260740740740742, - "loss": 0.2055, - "step": 4700 - }, - { - "epoch": 0.4499211921478722, - "grad_norm": 0.12423942238092422, - "learning_rate": 0.00015245925925925927, - "loss": 0.2083, - "step": 4710 - }, - { - "epoch": 0.4508764388403305, - "grad_norm": 0.12863990664482117, - "learning_rate": 0.00015231111111111111, - "loss": 0.2006, - "step": 4720 - }, - { - "epoch": 0.45183168553278885, - "grad_norm": 0.11075339466333389, - "learning_rate": 0.00015216296296296296, - "loss": 0.201, - "step": 4730 - }, - { - "epoch": 0.4527869322252472, - "grad_norm": 0.16675962507724762, - "learning_rate": 0.00015201481481481483, - "loss": 0.2054, - "step": 4740 - }, - { - "epoch": 0.4537421789177055, - "grad_norm": 0.3847846984863281, - "learning_rate": 0.00015186666666666668, - "loss": 0.205, - "step": 4750 - }, - { - "epoch": 0.45469742561016385, - "grad_norm": 0.1789114773273468, - "learning_rate": 0.00015171851851851853, - "loss": 0.2002, - "step": 4760 - }, - { - "epoch": 0.45565267230262213, - "grad_norm": 0.09958454221487045, - "learning_rate": 0.00015157037037037037, - "loss": 0.2004, - "step": 4770 - }, - { - "epoch": 0.45660791899508046, - "grad_norm": 0.09986834973096848, - "learning_rate": 0.00015142222222222222, - "loss": 0.2008, - "step": 4780 - }, - { - "epoch": 0.4575631656875388, - "grad_norm": 0.14175297319889069, - "learning_rate": 0.00015127407407407407, - "loss": 0.2034, - "step": 4790 - }, - { - "epoch": 0.45851841237999713, - "grad_norm": 0.42737746238708496, - "learning_rate": 0.00015112592592592594, - "loss": 0.2075, - "step": 4800 - }, - { - "epoch": 0.45947365907245546, - "grad_norm": 0.3718159794807434, - "learning_rate": 0.00015097777777777779, - "loss": 0.2066, - "step": 4810 - }, - { - "epoch": 0.4604289057649138, - "grad_norm": 0.12669920921325684, - "learning_rate": 0.00015082962962962963, - "loss": 0.2045, - "step": 4820 - }, - { - "epoch": 0.46138415245737213, - "grad_norm": 0.1332644671201706, - "learning_rate": 0.00015068148148148148, - "loss": 0.2007, - "step": 4830 - }, - { - "epoch": 0.46233939914983047, - "grad_norm": 0.16694070398807526, - "learning_rate": 0.00015053333333333335, - "loss": 0.2033, - "step": 4840 - }, - { - "epoch": 0.46329464584228874, - "grad_norm": 0.3865353465080261, - "learning_rate": 0.0001503851851851852, - "loss": 0.2059, - "step": 4850 - }, - { - "epoch": 0.4642498925347471, - "grad_norm": 0.18828840553760529, - "learning_rate": 0.00015023703703703704, - "loss": 0.207, - "step": 4860 - }, - { - "epoch": 0.4652051392272054, - "grad_norm": 0.10073354095220566, - "learning_rate": 0.0001500888888888889, - "loss": 0.201, - "step": 4870 - }, - { - "epoch": 0.46616038591966374, - "grad_norm": 0.16433393955230713, - "learning_rate": 0.00014994074074074074, - "loss": 0.2012, - "step": 4880 - }, - { - "epoch": 0.4671156326121221, - "grad_norm": 0.12232095003128052, - "learning_rate": 0.00014979259259259258, - "loss": 0.2041, - "step": 4890 - }, - { - "epoch": 0.4680708793045804, - "grad_norm": 0.34756287932395935, - "learning_rate": 0.00014964444444444446, - "loss": 0.2057, - "step": 4900 - }, - { - "epoch": 0.46902612599703875, - "grad_norm": 0.22505418956279755, - "learning_rate": 0.0001494962962962963, - "loss": 0.2039, - "step": 4910 - }, - { - "epoch": 0.4699813726894971, - "grad_norm": 0.1396235227584839, - "learning_rate": 0.00014934814814814815, - "loss": 0.2052, - "step": 4920 - }, - { - "epoch": 0.4709366193819554, - "grad_norm": 0.0969369113445282, - "learning_rate": 0.0001492, - "loss": 0.2, - "step": 4930 - }, - { - "epoch": 0.4718918660744137, - "grad_norm": 0.15045002102851868, - "learning_rate": 0.00014905185185185187, - "loss": 0.2037, - "step": 4940 - }, - { - "epoch": 0.472847112766872, - "grad_norm": 0.365792840719223, - "learning_rate": 0.00014890370370370371, - "loss": 0.2063, - "step": 4950 - }, - { - "epoch": 0.47380235945933036, - "grad_norm": 0.17121556401252747, - "learning_rate": 0.00014875555555555556, - "loss": 0.1996, - "step": 4960 - }, - { - "epoch": 0.4747576061517887, - "grad_norm": 0.09126877784729004, - "learning_rate": 0.0001486074074074074, - "loss": 0.2057, - "step": 4970 - }, - { - "epoch": 0.475712852844247, - "grad_norm": 0.08809260278940201, - "learning_rate": 0.00014845925925925928, - "loss": 0.1994, - "step": 4980 - }, - { - "epoch": 0.47666809953670536, - "grad_norm": 0.1562274992465973, - "learning_rate": 0.0001483111111111111, - "loss": 0.204, - "step": 4990 - }, - { - "epoch": 0.4776233462291637, - "grad_norm": 0.2817908227443695, - "learning_rate": 0.00014816296296296297, - "loss": 0.208, - "step": 5000 - }, - { - "epoch": 0.478578592921622, - "grad_norm": 0.21794337034225464, - "learning_rate": 0.00014801481481481482, - "loss": 0.2046, - "step": 5010 - }, - { - "epoch": 0.47953383961408036, - "grad_norm": 0.08885704725980759, - "learning_rate": 0.00014786666666666666, - "loss": 0.2031, - "step": 5020 - }, - { - "epoch": 0.48048908630653864, - "grad_norm": 0.08695749193429947, - "learning_rate": 0.0001477185185185185, - "loss": 0.2023, - "step": 5030 - }, - { - "epoch": 0.48144433299899697, - "grad_norm": 0.11303768306970596, - "learning_rate": 0.00014757037037037038, - "loss": 0.2047, - "step": 5040 - }, - { - "epoch": 0.4823995796914553, - "grad_norm": 0.32796552777290344, - "learning_rate": 0.00014742222222222223, - "loss": 0.2056, - "step": 5050 - }, - { - "epoch": 0.48335482638391364, - "grad_norm": 0.17214402556419373, - "learning_rate": 0.00014727407407407408, - "loss": 0.202, - "step": 5060 - }, - { - "epoch": 0.48431007307637197, - "grad_norm": 0.131885826587677, - "learning_rate": 0.00014712592592592592, - "loss": 0.2005, - "step": 5070 - }, - { - "epoch": 0.4852653197688303, - "grad_norm": 0.09677991271018982, - "learning_rate": 0.0001469777777777778, - "loss": 0.2007, - "step": 5080 - }, - { - "epoch": 0.48622056646128864, - "grad_norm": 0.11090332269668579, - "learning_rate": 0.00014682962962962962, - "loss": 0.2052, - "step": 5090 - }, - { - "epoch": 0.487175813153747, - "grad_norm": 0.3222978711128235, - "learning_rate": 0.0001466814814814815, - "loss": 0.2057, - "step": 5100 - }, - { - "epoch": 0.4881310598462053, - "grad_norm": 0.15994799137115479, - "learning_rate": 0.00014653333333333334, - "loss": 0.2023, - "step": 5110 - }, - { - "epoch": 0.4890863065386636, - "grad_norm": 0.09031850844621658, - "learning_rate": 0.0001463851851851852, - "loss": 0.205, - "step": 5120 - }, - { - "epoch": 0.4900415532311219, - "grad_norm": 0.0860792025923729, - "learning_rate": 0.00014623703703703703, - "loss": 0.2018, - "step": 5130 - }, - { - "epoch": 0.49099679992358025, - "grad_norm": 0.14190521836280823, - "learning_rate": 0.0001460888888888889, - "loss": 0.2047, - "step": 5140 - }, - { - "epoch": 0.4919520466160386, - "grad_norm": 0.266658216714859, - "learning_rate": 0.00014594074074074075, - "loss": 0.2038, - "step": 5150 - }, - { - "epoch": 0.4929072933084969, - "grad_norm": 0.20443643629550934, - "learning_rate": 0.0001457925925925926, - "loss": 0.1998, - "step": 5160 - }, - { - "epoch": 0.49386254000095525, - "grad_norm": 0.1025327667593956, - "learning_rate": 0.00014564444444444444, - "loss": 0.2035, - "step": 5170 - }, - { - "epoch": 0.4948177866934136, - "grad_norm": 0.08821458369493484, - "learning_rate": 0.0001454962962962963, - "loss": 0.2006, - "step": 5180 - }, - { - "epoch": 0.4957730333858719, - "grad_norm": 0.10269002616405487, - "learning_rate": 0.00014534814814814813, - "loss": 0.2026, - "step": 5190 - }, - { - "epoch": 0.49672828007833025, - "grad_norm": 0.310215562582016, - "learning_rate": 0.0001452, - "loss": 0.2049, - "step": 5200 - }, - { - "epoch": 0.49768352677078853, - "grad_norm": 0.12952567636966705, - "learning_rate": 0.00014505185185185185, - "loss": 0.2056, - "step": 5210 - }, - { - "epoch": 0.49863877346324686, - "grad_norm": 0.07955840229988098, - "learning_rate": 0.00014490370370370373, - "loss": 0.1991, - "step": 5220 - }, - { - "epoch": 0.4995940201557052, - "grad_norm": 0.10103992372751236, - "learning_rate": 0.00014475555555555554, - "loss": 0.1995, - "step": 5230 - }, - { - "epoch": 0.5005492668481636, - "grad_norm": 0.1356901079416275, - "learning_rate": 0.00014460740740740742, - "loss": 0.2043, - "step": 5240 - }, - { - "epoch": 0.5015045135406219, - "grad_norm": 0.40337422490119934, - "learning_rate": 0.00014445925925925926, - "loss": 0.206, - "step": 5250 - }, - { - "epoch": 0.5024597602330801, - "grad_norm": 0.22015082836151123, - "learning_rate": 0.0001443111111111111, - "loss": 0.2009, - "step": 5260 - }, - { - "epoch": 0.5034150069255385, - "grad_norm": 0.09830496460199356, - "learning_rate": 0.00014416296296296296, - "loss": 0.202, - "step": 5270 - }, - { - "epoch": 0.5043702536179968, - "grad_norm": 0.12919305264949799, - "learning_rate": 0.00014401481481481483, - "loss": 0.2021, - "step": 5280 - }, - { - "epoch": 0.5053255003104552, - "grad_norm": 0.13084401190280914, - "learning_rate": 0.00014386666666666668, - "loss": 0.2034, - "step": 5290 - }, - { - "epoch": 0.5062807470029135, - "grad_norm": 0.33833256363868713, - "learning_rate": 0.00014371851851851852, - "loss": 0.2077, - "step": 5300 - }, - { - "epoch": 0.5072359936953719, - "grad_norm": 0.17477412521839142, - "learning_rate": 0.00014357037037037037, - "loss": 0.1996, - "step": 5310 - }, - { - "epoch": 0.5081912403878301, - "grad_norm": 0.09218191355466843, - "learning_rate": 0.00014342222222222224, - "loss": 0.2059, - "step": 5320 - }, - { - "epoch": 0.5091464870802885, - "grad_norm": 0.09006420522928238, - "learning_rate": 0.00014327407407407406, - "loss": 0.2019, - "step": 5330 - }, - { - "epoch": 0.5101017337727468, - "grad_norm": 0.12034288048744202, - "learning_rate": 0.00014312592592592593, - "loss": 0.2046, - "step": 5340 - }, - { - "epoch": 0.5110569804652051, - "grad_norm": 0.28528374433517456, - "learning_rate": 0.00014297777777777778, - "loss": 0.206, - "step": 5350 - }, - { - "epoch": 0.5120122271576635, - "grad_norm": 0.14377838373184204, - "learning_rate": 0.00014282962962962965, - "loss": 0.2025, - "step": 5360 - }, - { - "epoch": 0.5129674738501218, - "grad_norm": 0.1412467062473297, - "learning_rate": 0.00014268148148148147, - "loss": 0.2013, - "step": 5370 - }, - { - "epoch": 0.5139227205425801, - "grad_norm": 0.08147553354501724, - "learning_rate": 0.00014253333333333335, - "loss": 0.201, - "step": 5380 - }, - { - "epoch": 0.5148779672350384, - "grad_norm": 0.13719892501831055, - "learning_rate": 0.0001423851851851852, - "loss": 0.2034, - "step": 5390 - }, - { - "epoch": 0.5158332139274968, - "grad_norm": 0.3484647274017334, - "learning_rate": 0.00014223703703703704, - "loss": 0.203, - "step": 5400 - }, - { - "epoch": 0.5167884606199551, - "grad_norm": 0.1756839007139206, - "learning_rate": 0.00014208888888888889, - "loss": 0.2031, - "step": 5410 - }, - { - "epoch": 0.5177437073124135, - "grad_norm": 0.10281263291835785, - "learning_rate": 0.00014194074074074076, - "loss": 0.1998, - "step": 5420 - }, - { - "epoch": 0.5186989540048718, - "grad_norm": 0.08952672779560089, - "learning_rate": 0.0001417925925925926, - "loss": 0.1995, - "step": 5430 - }, - { - "epoch": 0.51965420069733, - "grad_norm": 0.12098149210214615, - "learning_rate": 0.00014164444444444445, - "loss": 0.2014, - "step": 5440 - }, - { - "epoch": 0.5206094473897884, - "grad_norm": 0.31019869446754456, - "learning_rate": 0.0001414962962962963, - "loss": 0.2026, - "step": 5450 - }, - { - "epoch": 0.5215646940822467, - "grad_norm": 0.13479962944984436, - "learning_rate": 0.00014134814814814817, - "loss": 0.2033, - "step": 5460 - }, - { - "epoch": 0.5225199407747051, - "grad_norm": 0.11460354179143906, - "learning_rate": 0.0001412, - "loss": 0.1978, - "step": 5470 - }, - { - "epoch": 0.5234751874671634, - "grad_norm": 0.10888929665088654, - "learning_rate": 0.00014105185185185186, - "loss": 0.1984, - "step": 5480 - }, - { - "epoch": 0.5244304341596218, - "grad_norm": 0.15640781819820404, - "learning_rate": 0.0001409037037037037, - "loss": 0.2018, - "step": 5490 - }, - { - "epoch": 0.52538568085208, - "grad_norm": 0.32829177379608154, - "learning_rate": 0.00014075555555555556, - "loss": 0.225, - "step": 5500 - }, - { - "epoch": 0.5263409275445384, - "grad_norm": 0.16984596848487854, - "learning_rate": 0.0001406074074074074, - "loss": 0.2056, - "step": 5510 - }, - { - "epoch": 0.5272961742369967, - "grad_norm": 0.13876718282699585, - "learning_rate": 0.00014045925925925928, - "loss": 0.1953, - "step": 5520 - }, - { - "epoch": 0.528251420929455, - "grad_norm": 0.10149407386779785, - "learning_rate": 0.00014031111111111112, - "loss": 0.1984, - "step": 5530 - }, - { - "epoch": 0.5292066676219134, - "grad_norm": 0.13450825214385986, - "learning_rate": 0.00014016296296296297, - "loss": 0.2034, - "step": 5540 - }, - { - "epoch": 0.5301619143143717, - "grad_norm": 0.38654589653015137, - "learning_rate": 0.00014001481481481481, - "loss": 0.2043, - "step": 5550 - }, - { - "epoch": 0.53111716100683, - "grad_norm": 0.12655580043792725, - "learning_rate": 0.0001398666666666667, - "loss": 0.2062, - "step": 5560 - }, - { - "epoch": 0.5320724076992883, - "grad_norm": 0.1314125508069992, - "learning_rate": 0.0001397185185185185, - "loss": 0.2015, - "step": 5570 - }, - { - "epoch": 0.5330276543917467, - "grad_norm": 0.09950239211320877, - "learning_rate": 0.00013957037037037038, - "loss": 0.2022, - "step": 5580 - }, - { - "epoch": 0.533982901084205, - "grad_norm": 0.12652038037776947, - "learning_rate": 0.00013942222222222223, - "loss": 0.2041, - "step": 5590 - }, - { - "epoch": 0.5349381477766634, - "grad_norm": 0.3637951910495758, - "learning_rate": 0.0001392740740740741, - "loss": 0.2052, - "step": 5600 - }, - { - "epoch": 0.5358933944691217, - "grad_norm": 0.38742905855178833, - "learning_rate": 0.00013912592592592592, - "loss": 0.2041, - "step": 5610 - }, - { - "epoch": 0.5368486411615799, - "grad_norm": 0.08842667192220688, - "learning_rate": 0.0001389777777777778, - "loss": 0.2045, - "step": 5620 - }, - { - "epoch": 0.5378038878540383, - "grad_norm": 0.08290154486894608, - "learning_rate": 0.00013882962962962964, - "loss": 0.2009, - "step": 5630 - }, - { - "epoch": 0.5387591345464966, - "grad_norm": 0.12457109987735748, - "learning_rate": 0.00013868148148148148, - "loss": 0.2037, - "step": 5640 - }, - { - "epoch": 0.539714381238955, - "grad_norm": 0.26266059279441833, - "learning_rate": 0.00013853333333333333, - "loss": 0.2048, - "step": 5650 - }, - { - "epoch": 0.5406696279314133, - "grad_norm": 0.19951313734054565, - "learning_rate": 0.0001383851851851852, - "loss": 0.2017, - "step": 5660 - }, - { - "epoch": 0.5416248746238717, - "grad_norm": 0.1073467880487442, - "learning_rate": 0.00013823703703703705, - "loss": 0.2007, - "step": 5670 - }, - { - "epoch": 0.5425801213163299, - "grad_norm": 0.08884900063276291, - "learning_rate": 0.0001380888888888889, - "loss": 0.1992, - "step": 5680 - }, - { - "epoch": 0.5435353680087883, - "grad_norm": 0.1196378618478775, - "learning_rate": 0.00013794074074074074, - "loss": 0.2018, - "step": 5690 - }, - { - "epoch": 0.5444906147012466, - "grad_norm": 0.2851475477218628, - "learning_rate": 0.00013779259259259262, - "loss": 0.2026, - "step": 5700 - }, - { - "epoch": 0.5454458613937049, - "grad_norm": 0.14857490360736847, - "learning_rate": 0.00013764444444444444, - "loss": 0.2037, - "step": 5710 - }, - { - "epoch": 0.5464011080861633, - "grad_norm": 0.102902352809906, - "learning_rate": 0.0001374962962962963, - "loss": 0.2007, - "step": 5720 - }, - { - "epoch": 0.5473563547786215, - "grad_norm": 0.10917162150144577, - "learning_rate": 0.00013734814814814815, - "loss": 0.2001, - "step": 5730 - }, - { - "epoch": 0.5483116014710799, - "grad_norm": 0.11089454591274261, - "learning_rate": 0.00013720000000000003, - "loss": 0.2025, - "step": 5740 - }, - { - "epoch": 0.5492668481635382, - "grad_norm": 0.3424724042415619, - "learning_rate": 0.00013705185185185185, - "loss": 0.204, - "step": 5750 - }, - { - "epoch": 0.5502220948559966, - "grad_norm": 0.15958619117736816, - "learning_rate": 0.00013690370370370372, - "loss": 0.1983, - "step": 5760 - }, - { - "epoch": 0.5511773415484549, - "grad_norm": 0.10369803756475449, - "learning_rate": 0.00013675555555555557, - "loss": 0.2033, - "step": 5770 - }, - { - "epoch": 0.5521325882409133, - "grad_norm": 0.08812570571899414, - "learning_rate": 0.0001366074074074074, - "loss": 0.1991, - "step": 5780 - }, - { - "epoch": 0.5530878349333715, - "grad_norm": 0.12774603068828583, - "learning_rate": 0.00013645925925925926, - "loss": 0.2034, - "step": 5790 - }, - { - "epoch": 0.5540430816258298, - "grad_norm": 0.32776811718940735, - "learning_rate": 0.00013631111111111113, - "loss": 0.2054, - "step": 5800 - }, - { - "epoch": 0.5549983283182882, - "grad_norm": 0.224983349442482, - "learning_rate": 0.00013616296296296295, - "loss": 0.1976, - "step": 5810 - }, - { - "epoch": 0.5559535750107465, - "grad_norm": 0.07955210655927658, - "learning_rate": 0.00013601481481481483, - "loss": 0.203, - "step": 5820 - }, - { - "epoch": 0.5569088217032049, - "grad_norm": 0.09523504227399826, - "learning_rate": 0.00013586666666666667, - "loss": 0.2003, - "step": 5830 - }, - { - "epoch": 0.5578640683956632, - "grad_norm": 0.11511880159378052, - "learning_rate": 0.00013571851851851854, - "loss": 0.202, - "step": 5840 - }, - { - "epoch": 0.5588193150881215, - "grad_norm": 0.39985740184783936, - "learning_rate": 0.00013557037037037036, - "loss": 0.2052, - "step": 5850 - }, - { - "epoch": 0.5597745617805798, - "grad_norm": 0.13187652826309204, - "learning_rate": 0.00013542222222222224, - "loss": 0.2024, - "step": 5860 - }, - { - "epoch": 0.5607298084730382, - "grad_norm": 0.09542467445135117, - "learning_rate": 0.00013527407407407408, - "loss": 0.2035, - "step": 5870 - }, - { - "epoch": 0.5616850551654965, - "grad_norm": 0.10654427856206894, - "learning_rate": 0.00013512592592592593, - "loss": 0.2, - "step": 5880 - }, - { - "epoch": 0.5626403018579548, - "grad_norm": 0.13208676874637604, - "learning_rate": 0.00013497777777777778, - "loss": 0.2019, - "step": 5890 - }, - { - "epoch": 0.5635955485504132, - "grad_norm": 0.32865169644355774, - "learning_rate": 0.00013482962962962965, - "loss": 0.2042, - "step": 5900 - }, - { - "epoch": 0.5645507952428714, - "grad_norm": 0.15548019111156464, - "learning_rate": 0.0001346814814814815, - "loss": 0.2016, - "step": 5910 - }, - { - "epoch": 0.5655060419353298, - "grad_norm": 0.08601599931716919, - "learning_rate": 0.00013453333333333334, - "loss": 0.1961, - "step": 5920 - }, - { - "epoch": 0.5664612886277881, - "grad_norm": 0.12143748253583908, - "learning_rate": 0.0001343851851851852, - "loss": 0.1991, - "step": 5930 - }, - { - "epoch": 0.5674165353202465, - "grad_norm": 0.16378195583820343, - "learning_rate": 0.00013423703703703706, - "loss": 0.2017, - "step": 5940 - }, - { - "epoch": 0.5683717820127048, - "grad_norm": 0.4630471467971802, - "learning_rate": 0.00013408888888888888, - "loss": 0.2049, - "step": 5950 - }, - { - "epoch": 0.5693270287051632, - "grad_norm": 0.3741738200187683, - "learning_rate": 0.00013394074074074075, - "loss": 0.2014, - "step": 5960 - }, - { - "epoch": 0.5702822753976214, - "grad_norm": 0.1016160398721695, - "learning_rate": 0.0001337925925925926, - "loss": 0.1998, - "step": 5970 - }, - { - "epoch": 0.5712375220900797, - "grad_norm": 0.09003070741891861, - "learning_rate": 0.00013364444444444447, - "loss": 0.1973, - "step": 5980 - }, - { - "epoch": 0.5721927687825381, - "grad_norm": 0.12196210771799088, - "learning_rate": 0.0001334962962962963, - "loss": 0.2013, - "step": 5990 - }, - { - "epoch": 0.5731480154749964, - "grad_norm": 0.3323560655117035, - "learning_rate": 0.00013334814814814817, - "loss": 0.2038, - "step": 6000 - }, - { - "epoch": 0.5741032621674548, - "grad_norm": 0.14074192941188812, - "learning_rate": 0.0001332, - "loss": 0.2042, - "step": 6010 - }, - { - "epoch": 0.575058508859913, - "grad_norm": 0.08944179117679596, - "learning_rate": 0.00013305185185185186, - "loss": 0.1973, - "step": 6020 - }, - { - "epoch": 0.5760137555523714, - "grad_norm": 0.09902766346931458, - "learning_rate": 0.0001329037037037037, - "loss": 0.1999, - "step": 6030 - }, - { - "epoch": 0.5769690022448297, - "grad_norm": 0.13410291075706482, - "learning_rate": 0.00013275555555555558, - "loss": 0.2037, - "step": 6040 - }, - { - "epoch": 0.5779242489372881, - "grad_norm": 0.27775460481643677, - "learning_rate": 0.00013260740740740742, - "loss": 0.2043, - "step": 6050 - }, - { - "epoch": 0.5788794956297464, - "grad_norm": 0.09671878814697266, - "learning_rate": 0.00013245925925925927, - "loss": 0.2022, - "step": 6060 - }, - { - "epoch": 0.5798347423222047, - "grad_norm": 0.09354133903980255, - "learning_rate": 0.00013231111111111112, - "loss": 0.1969, - "step": 6070 - }, - { - "epoch": 0.580789989014663, - "grad_norm": 0.10099305212497711, - "learning_rate": 0.000132162962962963, - "loss": 0.1995, - "step": 6080 - }, - { - "epoch": 0.5817452357071213, - "grad_norm": 0.13947731256484985, - "learning_rate": 0.0001320148148148148, - "loss": 0.2018, - "step": 6090 - }, - { - "epoch": 0.5827004823995797, - "grad_norm": 0.30881768465042114, - "learning_rate": 0.00013186666666666668, - "loss": 0.2043, - "step": 6100 - }, - { - "epoch": 0.583655729092038, - "grad_norm": 0.12652377784252167, - "learning_rate": 0.00013171851851851853, - "loss": 0.203, - "step": 6110 - }, - { - "epoch": 0.5846109757844964, - "grad_norm": 0.10004782676696777, - "learning_rate": 0.00013157037037037038, - "loss": 0.1979, - "step": 6120 - }, - { - "epoch": 0.5855662224769547, - "grad_norm": 0.09912140667438507, - "learning_rate": 0.00013142222222222222, - "loss": 0.1981, - "step": 6130 - }, - { - "epoch": 0.586521469169413, - "grad_norm": 0.14001400768756866, - "learning_rate": 0.0001312740740740741, - "loss": 0.2011, - "step": 6140 - }, - { - "epoch": 0.5874767158618713, - "grad_norm": 0.3409046232700348, - "learning_rate": 0.00013112592592592594, - "loss": 0.2032, - "step": 6150 - }, - { - "epoch": 0.5884319625543296, - "grad_norm": 0.15291565656661987, - "learning_rate": 0.0001309777777777778, - "loss": 0.2008, - "step": 6160 - }, - { - "epoch": 0.589387209246788, - "grad_norm": 0.09848395735025406, - "learning_rate": 0.00013082962962962963, - "loss": 0.2017, - "step": 6170 - }, - { - "epoch": 0.5903424559392463, - "grad_norm": 0.11449220776557922, - "learning_rate": 0.0001306814814814815, - "loss": 0.1994, - "step": 6180 - }, - { - "epoch": 0.5912977026317047, - "grad_norm": 0.15763157606124878, - "learning_rate": 0.00013053333333333333, - "loss": 0.2034, - "step": 6190 - }, - { - "epoch": 0.592252949324163, - "grad_norm": 0.34295904636383057, - "learning_rate": 0.0001303851851851852, - "loss": 0.204, - "step": 6200 - }, - { - "epoch": 0.5932081960166213, - "grad_norm": 0.168208509683609, - "learning_rate": 0.00013023703703703705, - "loss": 0.2024, - "step": 6210 - }, - { - "epoch": 0.5941634427090796, - "grad_norm": 0.08638432621955872, - "learning_rate": 0.0001300888888888889, - "loss": 0.2013, - "step": 6220 - }, - { - "epoch": 0.5951186894015379, - "grad_norm": 0.105532206594944, - "learning_rate": 0.00012994074074074074, - "loss": 0.199, - "step": 6230 - }, - { - "epoch": 0.5960739360939963, - "grad_norm": 0.1493140310049057, - "learning_rate": 0.0001297925925925926, - "loss": 0.2019, - "step": 6240 - }, - { - "epoch": 0.5970291827864546, - "grad_norm": 0.27100884914398193, - "learning_rate": 0.00012964444444444446, - "loss": 0.2039, - "step": 6250 - }, - { - "epoch": 0.597984429478913, - "grad_norm": 0.16155314445495605, - "learning_rate": 0.0001294962962962963, - "loss": 0.2014, - "step": 6260 - }, - { - "epoch": 0.5989396761713712, - "grad_norm": 0.13234031200408936, - "learning_rate": 0.00012934814814814815, - "loss": 0.2014, - "step": 6270 - }, - { - "epoch": 0.5998949228638296, - "grad_norm": 0.10277749598026276, - "learning_rate": 0.00012920000000000002, - "loss": 0.1998, - "step": 6280 - }, - { - "epoch": 0.6008501695562879, - "grad_norm": 0.1772095113992691, - "learning_rate": 0.00012905185185185187, - "loss": 0.2015, - "step": 6290 - }, - { - "epoch": 0.6018054162487463, - "grad_norm": 0.2691553831100464, - "learning_rate": 0.00012890370370370372, - "loss": 0.2036, - "step": 6300 - }, - { - "epoch": 0.6027606629412046, - "grad_norm": 0.1066615879535675, - "learning_rate": 0.00012875555555555556, - "loss": 0.2018, - "step": 6310 - }, - { - "epoch": 0.6037159096336628, - "grad_norm": 0.08042429387569427, - "learning_rate": 0.0001286074074074074, - "loss": 0.1989, - "step": 6320 - }, - { - "epoch": 0.6046711563261212, - "grad_norm": 0.10551764816045761, - "learning_rate": 0.00012845925925925925, - "loss": 0.1994, - "step": 6330 - }, - { - "epoch": 0.6056264030185795, - "grad_norm": 0.1348186433315277, - "learning_rate": 0.00012831111111111113, - "loss": 0.2035, - "step": 6340 - }, - { - "epoch": 0.6065816497110379, - "grad_norm": 0.3124079406261444, - "learning_rate": 0.00012816296296296297, - "loss": 0.2046, - "step": 6350 - }, - { - "epoch": 0.6075368964034962, - "grad_norm": 0.1197357103228569, - "learning_rate": 0.00012801481481481482, - "loss": 0.1994, - "step": 6360 - }, - { - "epoch": 0.6084921430959546, - "grad_norm": 0.08718045055866241, - "learning_rate": 0.00012786666666666667, - "loss": 0.1996, - "step": 6370 - }, - { - "epoch": 0.6094473897884128, - "grad_norm": 0.13004030287265778, - "learning_rate": 0.0001277185185185185, - "loss": 0.1988, - "step": 6380 - }, - { - "epoch": 0.6104026364808712, - "grad_norm": 0.1345457136631012, - "learning_rate": 0.00012757037037037039, - "loss": 0.2028, - "step": 6390 - }, - { - "epoch": 0.6113578831733295, - "grad_norm": 0.3091771900653839, - "learning_rate": 0.00012742222222222223, - "loss": 0.2078, - "step": 6400 - }, - { - "epoch": 0.6123131298657878, - "grad_norm": 0.15620669722557068, - "learning_rate": 0.00012727407407407408, - "loss": 0.2007, - "step": 6410 - }, - { - "epoch": 0.6132683765582462, - "grad_norm": 0.10864575207233429, - "learning_rate": 0.00012712592592592592, - "loss": 0.2002, - "step": 6420 - }, - { - "epoch": 0.6142236232507045, - "grad_norm": 0.09514521807432175, - "learning_rate": 0.00012697777777777777, - "loss": 0.2, - "step": 6430 - }, - { - "epoch": 0.6151788699431628, - "grad_norm": 0.12483695149421692, - "learning_rate": 0.00012682962962962964, - "loss": 0.202, - "step": 6440 - }, - { - "epoch": 0.6161341166356211, - "grad_norm": 0.22912171483039856, - "learning_rate": 0.0001266814814814815, - "loss": 0.2036, - "step": 6450 - }, - { - "epoch": 0.6170893633280795, - "grad_norm": 0.14288096129894257, - "learning_rate": 0.00012653333333333334, - "loss": 0.1948, - "step": 6460 - }, - { - "epoch": 0.6180446100205378, - "grad_norm": 0.11408117413520813, - "learning_rate": 0.00012638518518518518, - "loss": 0.2016, - "step": 6470 - }, - { - "epoch": 0.6189998567129962, - "grad_norm": 0.07267523556947708, - "learning_rate": 0.00012623703703703703, - "loss": 0.1998, - "step": 6480 - }, - { - "epoch": 0.6199551034054545, - "grad_norm": 0.11102595180273056, - "learning_rate": 0.0001260888888888889, - "loss": 0.2015, - "step": 6490 - }, - { - "epoch": 0.6209103500979127, - "grad_norm": 0.21290349960327148, - "learning_rate": 0.00012594074074074075, - "loss": 0.2053, - "step": 6500 - }, - { - "epoch": 0.6218655967903711, - "grad_norm": 0.12181617319583893, - "learning_rate": 0.0001257925925925926, - "loss": 0.1984, - "step": 6510 - }, - { - "epoch": 0.6228208434828294, - "grad_norm": 0.15453316271305084, - "learning_rate": 0.00012564444444444444, - "loss": 0.1981, - "step": 6520 - }, - { - "epoch": 0.6237760901752878, - "grad_norm": 0.12571464478969574, - "learning_rate": 0.00012549629629629631, - "loss": 0.1993, - "step": 6530 - }, - { - "epoch": 0.6247313368677461, - "grad_norm": 0.12125848233699799, - "learning_rate": 0.00012534814814814816, - "loss": 0.2018, - "step": 6540 - }, - { - "epoch": 0.6256865835602045, - "grad_norm": 0.23237983882427216, - "learning_rate": 0.0001252, - "loss": 0.2053, - "step": 6550 - }, - { - "epoch": 0.6266418302526627, - "grad_norm": 0.09626813232898712, - "learning_rate": 0.00012505185185185185, - "loss": 0.1961, - "step": 6560 - }, - { - "epoch": 0.6275970769451211, - "grad_norm": 0.08862323313951492, - "learning_rate": 0.0001249037037037037, - "loss": 0.1977, - "step": 6570 - }, - { - "epoch": 0.6285523236375794, - "grad_norm": 0.08696942031383514, - "learning_rate": 0.00012475555555555555, - "loss": 0.1996, - "step": 6580 - }, - { - "epoch": 0.6295075703300377, - "grad_norm": 0.14030681550502777, - "learning_rate": 0.00012460740740740742, - "loss": 0.2018, - "step": 6590 - }, - { - "epoch": 0.6304628170224961, - "grad_norm": 0.3134588301181793, - "learning_rate": 0.00012445925925925927, - "loss": 0.2036, - "step": 6600 - }, - { - "epoch": 0.6314180637149543, - "grad_norm": 0.14569984376430511, - "learning_rate": 0.0001243111111111111, - "loss": 0.1973, - "step": 6610 - }, - { - "epoch": 0.6323733104074127, - "grad_norm": 0.09531128406524658, - "learning_rate": 0.00012416296296296296, - "loss": 0.1997, - "step": 6620 - }, - { - "epoch": 0.633328557099871, - "grad_norm": 0.07848309725522995, - "learning_rate": 0.00012401481481481483, - "loss": 0.1996, - "step": 6630 - }, - { - "epoch": 0.6342838037923294, - "grad_norm": 0.1390487551689148, - "learning_rate": 0.00012386666666666665, - "loss": 0.2015, - "step": 6640 - }, - { - "epoch": 0.6352390504847877, - "grad_norm": 0.29697003960609436, - "learning_rate": 0.00012371851851851852, - "loss": 0.2038, - "step": 6650 - }, - { - "epoch": 0.6361942971772461, - "grad_norm": 0.16010154783725739, - "learning_rate": 0.00012357037037037037, - "loss": 0.1974, - "step": 6660 - }, - { - "epoch": 0.6371495438697043, - "grad_norm": 0.15159407258033752, - "learning_rate": 0.00012342222222222224, - "loss": 0.1984, - "step": 6670 - }, - { - "epoch": 0.6381047905621626, - "grad_norm": 0.1260843575000763, - "learning_rate": 0.00012327407407407406, - "loss": 0.1992, - "step": 6680 - }, - { - "epoch": 0.639060037254621, - "grad_norm": 0.1428649127483368, - "learning_rate": 0.00012312592592592594, - "loss": 0.2008, - "step": 6690 - }, - { - "epoch": 0.6400152839470793, - "grad_norm": 0.28894853591918945, - "learning_rate": 0.00012297777777777778, - "loss": 0.2038, - "step": 6700 - }, - { - "epoch": 0.6409705306395377, - "grad_norm": 0.10079352557659149, - "learning_rate": 0.00012282962962962963, - "loss": 0.196, - "step": 6710 - }, - { - "epoch": 0.641925777331996, - "grad_norm": 0.08398754894733429, - "learning_rate": 0.00012268148148148147, - "loss": 0.1973, - "step": 6720 - }, - { - "epoch": 0.6428810240244544, - "grad_norm": 0.10271371901035309, - "learning_rate": 0.00012253333333333335, - "loss": 0.1993, - "step": 6730 - }, - { - "epoch": 0.6438362707169126, - "grad_norm": 0.14714913070201874, - "learning_rate": 0.00012238518518518517, - "loss": 0.2017, - "step": 6740 - }, - { - "epoch": 0.644791517409371, - "grad_norm": 0.4217683970928192, - "learning_rate": 0.00012223703703703704, - "loss": 0.2046, - "step": 6750 - }, - { - "epoch": 0.6457467641018293, - "grad_norm": 0.11601690202951431, - "learning_rate": 0.0001220888888888889, - "loss": 0.1994, - "step": 6760 - }, - { - "epoch": 0.6467020107942876, - "grad_norm": 0.14154985547065735, - "learning_rate": 0.00012194074074074076, - "loss": 0.1992, - "step": 6770 - }, - { - "epoch": 0.647657257486746, - "grad_norm": 0.09532088786363602, - "learning_rate": 0.00012179259259259259, - "loss": 0.1996, - "step": 6780 - }, - { - "epoch": 0.6486125041792042, - "grad_norm": 0.13837966322898865, - "learning_rate": 0.00012164444444444445, - "loss": 0.2002, - "step": 6790 - }, - { - "epoch": 0.6495677508716626, - "grad_norm": 0.23131221532821655, - "learning_rate": 0.00012149629629629631, - "loss": 0.2038, - "step": 6800 - }, - { - "epoch": 0.6505229975641209, - "grad_norm": 0.24057185649871826, - "learning_rate": 0.00012134814814814815, - "loss": 0.1961, - "step": 6810 - }, - { - "epoch": 0.6514782442565793, - "grad_norm": 0.092747263610363, - "learning_rate": 0.0001212, - "loss": 0.1977, - "step": 6820 - }, - { - "epoch": 0.6524334909490376, - "grad_norm": 0.07853356748819351, - "learning_rate": 0.00012105185185185186, - "loss": 0.1983, - "step": 6830 - }, - { - "epoch": 0.653388737641496, - "grad_norm": 0.13685356080532074, - "learning_rate": 0.00012090370370370372, - "loss": 0.2029, - "step": 6840 - }, - { - "epoch": 0.6543439843339542, - "grad_norm": 0.3275240361690521, - "learning_rate": 0.00012075555555555556, - "loss": 0.2061, - "step": 6850 - }, - { - "epoch": 0.6552992310264125, - "grad_norm": 0.1146525964140892, - "learning_rate": 0.00012060740740740742, - "loss": 0.1988, - "step": 6860 - }, - { - "epoch": 0.6562544777188709, - "grad_norm": 0.08507192134857178, - "learning_rate": 0.00012045925925925928, - "loss": 0.1976, - "step": 6870 - }, - { - "epoch": 0.6572097244113292, - "grad_norm": 0.10274770855903625, - "learning_rate": 0.00012031111111111111, - "loss": 0.2, - "step": 6880 - }, - { - "epoch": 0.6581649711037876, - "grad_norm": 0.12793031334877014, - "learning_rate": 0.00012016296296296297, - "loss": 0.203, - "step": 6890 - }, - { - "epoch": 0.6591202177962459, - "grad_norm": 0.25618916749954224, - "learning_rate": 0.00012001481481481483, - "loss": 0.2042, - "step": 6900 - }, - { - "epoch": 0.6600754644887042, - "grad_norm": 0.11048085242509842, - "learning_rate": 0.00011986666666666669, - "loss": 0.1982, - "step": 6910 - }, - { - "epoch": 0.6610307111811625, - "grad_norm": 0.13066765666007996, - "learning_rate": 0.00011971851851851852, - "loss": 0.1985, - "step": 6920 - }, - { - "epoch": 0.6619859578736209, - "grad_norm": 0.09768009185791016, - "learning_rate": 0.00011957037037037038, - "loss": 0.1991, - "step": 6930 - }, - { - "epoch": 0.6629412045660792, - "grad_norm": 0.11774328351020813, - "learning_rate": 0.00011942222222222224, - "loss": 0.2029, - "step": 6940 - }, - { - "epoch": 0.6638964512585375, - "grad_norm": 0.2897031307220459, - "learning_rate": 0.00011927407407407407, - "loss": 0.2054, - "step": 6950 - }, - { - "epoch": 0.6648516979509959, - "grad_norm": 0.1458863914012909, - "learning_rate": 0.00011912592592592593, - "loss": 0.1971, - "step": 6960 - }, - { - "epoch": 0.6658069446434541, - "grad_norm": 0.10939256846904755, - "learning_rate": 0.0001189777777777778, - "loss": 0.1987, - "step": 6970 - }, - { - "epoch": 0.6667621913359125, - "grad_norm": 0.09529370814561844, - "learning_rate": 0.00011882962962962964, - "loss": 0.1975, - "step": 6980 - }, - { - "epoch": 0.6677174380283708, - "grad_norm": 0.1301857829093933, - "learning_rate": 0.00011868148148148149, - "loss": 0.2006, - "step": 6990 - }, - { - "epoch": 0.6686726847208292, - "grad_norm": 0.318764328956604, - "learning_rate": 0.00011853333333333335, - "loss": 0.2028, - "step": 7000 - }, - { - "epoch": 0.6696279314132875, - "grad_norm": 0.12811118364334106, - "learning_rate": 0.0001183851851851852, - "loss": 0.1977, - "step": 7010 - }, - { - "epoch": 0.6705831781057459, - "grad_norm": 0.09584329277276993, - "learning_rate": 0.00011823703703703704, - "loss": 0.1971, - "step": 7020 - }, - { - "epoch": 0.6715384247982041, - "grad_norm": 0.10128890722990036, - "learning_rate": 0.0001180888888888889, - "loss": 0.1976, - "step": 7030 - }, - { - "epoch": 0.6724936714906624, - "grad_norm": 0.11427022516727448, - "learning_rate": 0.00011794074074074076, - "loss": 0.2012, - "step": 7040 - }, - { - "epoch": 0.6734489181831208, - "grad_norm": 0.34024617075920105, - "learning_rate": 0.00011779259259259259, - "loss": 0.2044, - "step": 7050 - }, - { - "epoch": 0.6744041648755791, - "grad_norm": 0.09404119104146957, - "learning_rate": 0.00011764444444444445, - "loss": 0.1979, - "step": 7060 - }, - { - "epoch": 0.6753594115680375, - "grad_norm": 0.09541890770196915, - "learning_rate": 0.00011749629629629631, - "loss": 0.1984, - "step": 7070 - }, - { - "epoch": 0.6763146582604957, - "grad_norm": 0.1030392274260521, - "learning_rate": 0.00011734814814814816, - "loss": 0.1988, - "step": 7080 - }, - { - "epoch": 0.6772699049529541, - "grad_norm": 0.1543877124786377, - "learning_rate": 0.0001172, - "loss": 0.202, - "step": 7090 - }, - { - "epoch": 0.6782251516454124, - "grad_norm": 0.291363924741745, - "learning_rate": 0.00011705185185185186, - "loss": 0.2039, - "step": 7100 - }, - { - "epoch": 0.6791803983378708, - "grad_norm": 0.1557106226682663, - "learning_rate": 0.00011690370370370371, - "loss": 0.1975, - "step": 7110 - }, - { - "epoch": 0.6801356450303291, - "grad_norm": 0.13644851744174957, - "learning_rate": 0.00011675555555555556, - "loss": 0.1995, - "step": 7120 - }, - { - "epoch": 0.6810908917227874, - "grad_norm": 0.10906966030597687, - "learning_rate": 0.00011660740740740741, - "loss": 0.2003, - "step": 7130 - }, - { - "epoch": 0.6820461384152458, - "grad_norm": 0.1362205445766449, - "learning_rate": 0.00011645925925925927, - "loss": 0.2019, - "step": 7140 - }, - { - "epoch": 0.683001385107704, - "grad_norm": 0.22184917330741882, - "learning_rate": 0.00011631111111111112, - "loss": 0.2032, - "step": 7150 - }, - { - "epoch": 0.6839566318001624, - "grad_norm": 0.14768429100513458, - "learning_rate": 0.00011616296296296297, - "loss": 0.2001, - "step": 7160 - }, - { - "epoch": 0.6849118784926207, - "grad_norm": 0.10601145029067993, - "learning_rate": 0.00011601481481481483, - "loss": 0.2006, - "step": 7170 - }, - { - "epoch": 0.6858671251850791, - "grad_norm": 0.09267520159482956, - "learning_rate": 0.00011586666666666667, - "loss": 0.198, - "step": 7180 - }, - { - "epoch": 0.6868223718775374, - "grad_norm": 0.11036325246095657, - "learning_rate": 0.00011571851851851852, - "loss": 0.2021, - "step": 7190 - }, - { - "epoch": 0.6877776185699958, - "grad_norm": 0.22072440385818481, - "learning_rate": 0.00011557037037037038, - "loss": 0.2029, - "step": 7200 - }, - { - "epoch": 0.688732865262454, - "grad_norm": 0.1525413691997528, - "learning_rate": 0.00011542222222222223, - "loss": 0.1984, - "step": 7210 - }, - { - "epoch": 0.6896881119549123, - "grad_norm": 0.13061276078224182, - "learning_rate": 0.00011527407407407409, - "loss": 0.1998, - "step": 7220 - }, - { - "epoch": 0.6906433586473707, - "grad_norm": 0.07936930656433105, - "learning_rate": 0.00011512592592592593, - "loss": 0.199, - "step": 7230 - }, - { - "epoch": 0.691598605339829, - "grad_norm": 0.14293666183948517, - "learning_rate": 0.00011497777777777778, - "loss": 0.2016, - "step": 7240 - }, - { - "epoch": 0.6925538520322874, - "grad_norm": 0.2694840431213379, - "learning_rate": 0.00011482962962962964, - "loss": 0.2048, - "step": 7250 - }, - { - "epoch": 0.6935090987247456, - "grad_norm": 0.15118570625782013, - "learning_rate": 0.00011468148148148148, - "loss": 0.1995, - "step": 7260 - }, - { - "epoch": 0.694464345417204, - "grad_norm": 0.12141676247119904, - "learning_rate": 0.00011453333333333334, - "loss": 0.1979, - "step": 7270 - }, - { - "epoch": 0.6954195921096623, - "grad_norm": 0.07915287464857101, - "learning_rate": 0.00011438518518518519, - "loss": 0.198, - "step": 7280 - }, - { - "epoch": 0.6963748388021207, - "grad_norm": 0.1203601062297821, - "learning_rate": 0.00011423703703703705, - "loss": 0.2027, - "step": 7290 - }, - { - "epoch": 0.697330085494579, - "grad_norm": 0.22973081469535828, - "learning_rate": 0.0001140888888888889, - "loss": 0.2053, - "step": 7300 - }, - { - "epoch": 0.6982853321870373, - "grad_norm": 0.1236661821603775, - "learning_rate": 0.00011394074074074074, - "loss": 0.1994, - "step": 7310 - }, - { - "epoch": 0.6992405788794956, - "grad_norm": 0.10576550662517548, - "learning_rate": 0.0001137925925925926, - "loss": 0.1993, - "step": 7320 - }, - { - "epoch": 0.7001958255719539, - "grad_norm": 0.08561199903488159, - "learning_rate": 0.00011364444444444445, - "loss": 0.1984, - "step": 7330 - }, - { - "epoch": 0.7011510722644123, - "grad_norm": 0.1306900978088379, - "learning_rate": 0.0001134962962962963, - "loss": 0.2036, - "step": 7340 - }, - { - "epoch": 0.7021063189568706, - "grad_norm": 0.22850926220417023, - "learning_rate": 0.00011334814814814815, - "loss": 0.2043, - "step": 7350 - }, - { - "epoch": 0.703061565649329, - "grad_norm": 0.14131084084510803, - "learning_rate": 0.0001132, - "loss": 0.1969, - "step": 7360 - }, - { - "epoch": 0.7040168123417873, - "grad_norm": 0.12975451350212097, - "learning_rate": 0.00011305185185185185, - "loss": 0.2002, - "step": 7370 - }, - { - "epoch": 0.7049720590342456, - "grad_norm": 0.10148114711046219, - "learning_rate": 0.0001129037037037037, - "loss": 0.1998, - "step": 7380 - }, - { - "epoch": 0.7059273057267039, - "grad_norm": 0.10766816139221191, - "learning_rate": 0.00011275555555555557, - "loss": 0.2024, - "step": 7390 - }, - { - "epoch": 0.7068825524191622, - "grad_norm": 0.2067338526248932, - "learning_rate": 0.00011260740740740741, - "loss": 0.2044, - "step": 7400 - }, - { - "epoch": 0.7078377991116206, - "grad_norm": 0.17065021395683289, - "learning_rate": 0.00011245925925925926, - "loss": 0.1972, - "step": 7410 - }, - { - "epoch": 0.7087930458040789, - "grad_norm": 0.14703714847564697, - "learning_rate": 0.00011231111111111112, - "loss": 0.1993, - "step": 7420 - }, - { - "epoch": 0.7097482924965373, - "grad_norm": 0.0743475928902626, - "learning_rate": 0.00011216296296296296, - "loss": 0.1993, - "step": 7430 - }, - { - "epoch": 0.7107035391889955, - "grad_norm": 0.12671244144439697, - "learning_rate": 0.00011201481481481481, - "loss": 0.2015, - "step": 7440 - }, - { - "epoch": 0.7116587858814539, - "grad_norm": 0.20756767690181732, - "learning_rate": 0.00011186666666666667, - "loss": 0.2038, - "step": 7450 - }, - { - "epoch": 0.7126140325739122, - "grad_norm": 0.13475944101810455, - "learning_rate": 0.00011171851851851853, - "loss": 0.1946, - "step": 7460 - }, - { - "epoch": 0.7135692792663706, - "grad_norm": 0.14000599086284637, - "learning_rate": 0.00011157037037037036, - "loss": 0.1989, - "step": 7470 - }, - { - "epoch": 0.7145245259588289, - "grad_norm": 0.08982401341199875, - "learning_rate": 0.00011142222222222222, - "loss": 0.1998, - "step": 7480 - }, - { - "epoch": 0.7154797726512871, - "grad_norm": 0.13047632575035095, - "learning_rate": 0.00011127407407407408, - "loss": 0.202, - "step": 7490 - }, - { - "epoch": 0.7164350193437455, - "grad_norm": 0.2293279767036438, - "learning_rate": 0.00011112592592592592, - "loss": 0.2033, - "step": 7500 - }, - { - "epoch": 0.7173902660362038, - "grad_norm": 0.18973354995250702, - "learning_rate": 0.00011097777777777778, - "loss": 0.1974, - "step": 7510 - }, - { - "epoch": 0.7183455127286622, - "grad_norm": 0.16436341404914856, - "learning_rate": 0.00011082962962962964, - "loss": 0.1988, - "step": 7520 - }, - { - "epoch": 0.7193007594211205, - "grad_norm": 0.0743977501988411, - "learning_rate": 0.0001106814814814815, - "loss": 0.1978, - "step": 7530 - }, - { - "epoch": 0.7202560061135789, - "grad_norm": 0.09269341826438904, - "learning_rate": 0.00011053333333333333, - "loss": 0.2011, - "step": 7540 - }, - { - "epoch": 0.7212112528060372, - "grad_norm": 0.19990885257720947, - "learning_rate": 0.00011038518518518519, - "loss": 0.2035, - "step": 7550 - }, - { - "epoch": 0.7221664994984955, - "grad_norm": 0.1131897047162056, - "learning_rate": 0.00011023703703703705, - "loss": 0.1956, - "step": 7560 - }, - { - "epoch": 0.7231217461909538, - "grad_norm": 0.08888459950685501, - "learning_rate": 0.00011008888888888888, - "loss": 0.1967, - "step": 7570 - }, - { - "epoch": 0.7240769928834121, - "grad_norm": 0.08514232188463211, - "learning_rate": 0.00010994074074074074, - "loss": 0.1992, - "step": 7580 - }, - { - "epoch": 0.7250322395758705, - "grad_norm": 0.09677214175462723, - "learning_rate": 0.0001097925925925926, - "loss": 0.2015, - "step": 7590 - }, - { - "epoch": 0.7259874862683288, - "grad_norm": 0.20914286375045776, - "learning_rate": 0.00010964444444444446, - "loss": 0.2042, - "step": 7600 - }, - { - "epoch": 0.7269427329607872, - "grad_norm": 0.12801726162433624, - "learning_rate": 0.00010949629629629629, - "loss": 0.1967, - "step": 7610 - }, - { - "epoch": 0.7278979796532454, - "grad_norm": 0.10206489264965057, - "learning_rate": 0.00010934814814814815, - "loss": 0.1984, - "step": 7620 - }, - { - "epoch": 0.7288532263457038, - "grad_norm": 0.08488079160451889, - "learning_rate": 0.00010920000000000001, - "loss": 0.1983, - "step": 7630 - }, - { - "epoch": 0.7298084730381621, - "grad_norm": 0.15926392376422882, - "learning_rate": 0.00010905185185185184, - "loss": 0.2024, - "step": 7640 - }, - { - "epoch": 0.7307637197306205, - "grad_norm": 0.1908629834651947, - "learning_rate": 0.0001089037037037037, - "loss": 0.205, - "step": 7650 - }, - { - "epoch": 0.7317189664230788, - "grad_norm": 0.1444040983915329, - "learning_rate": 0.00010875555555555556, - "loss": 0.1973, - "step": 7660 - }, - { - "epoch": 0.732674213115537, - "grad_norm": 0.12023048847913742, - "learning_rate": 0.0001086074074074074, - "loss": 0.1969, - "step": 7670 - }, - { - "epoch": 0.7336294598079954, - "grad_norm": 0.09281352162361145, - "learning_rate": 0.00010845925925925926, - "loss": 0.1987, - "step": 7680 - }, - { - "epoch": 0.7345847065004537, - "grad_norm": 0.13516128063201904, - "learning_rate": 0.00010831111111111112, - "loss": 0.2037, - "step": 7690 - }, - { - "epoch": 0.7355399531929121, - "grad_norm": 0.2315383404493332, - "learning_rate": 0.00010816296296296298, - "loss": 0.203, - "step": 7700 - }, - { - "epoch": 0.7364951998853704, - "grad_norm": 0.1517767459154129, - "learning_rate": 0.00010801481481481481, - "loss": 0.199, - "step": 7710 - }, - { - "epoch": 0.7374504465778288, - "grad_norm": 0.10027414560317993, - "learning_rate": 0.00010786666666666667, - "loss": 0.1976, - "step": 7720 - }, - { - "epoch": 0.738405693270287, - "grad_norm": 0.07501152157783508, - "learning_rate": 0.00010771851851851853, - "loss": 0.1983, - "step": 7730 - }, - { - "epoch": 0.7393609399627454, - "grad_norm": 0.10912565141916275, - "learning_rate": 0.00010757037037037036, - "loss": 0.202, - "step": 7740 - }, - { - "epoch": 0.7403161866552037, - "grad_norm": 0.21067175269126892, - "learning_rate": 0.00010742222222222222, - "loss": 0.2043, - "step": 7750 - }, - { - "epoch": 0.741271433347662, - "grad_norm": 0.11937571316957474, - "learning_rate": 0.00010727407407407408, - "loss": 0.1966, - "step": 7760 - }, - { - "epoch": 0.7422266800401204, - "grad_norm": 0.1493697464466095, - "learning_rate": 0.00010712592592592594, - "loss": 0.1974, - "step": 7770 - }, - { - "epoch": 0.7431819267325787, - "grad_norm": 0.07944036275148392, - "learning_rate": 0.00010697777777777777, - "loss": 0.1983, - "step": 7780 - }, - { - "epoch": 0.744137173425037, - "grad_norm": 0.12938891351222992, - "learning_rate": 0.00010682962962962963, - "loss": 0.202, - "step": 7790 - }, - { - "epoch": 0.7450924201174953, - "grad_norm": 0.19698497653007507, - "learning_rate": 0.00010668148148148149, - "loss": 0.2043, - "step": 7800 - }, - { - "epoch": 0.7460476668099537, - "grad_norm": 0.12791553139686584, - "learning_rate": 0.00010653333333333333, - "loss": 0.1954, - "step": 7810 - }, - { - "epoch": 0.747002913502412, - "grad_norm": 0.1833954155445099, - "learning_rate": 0.00010638518518518519, - "loss": 0.1982, - "step": 7820 - }, - { - "epoch": 0.7479581601948704, - "grad_norm": 0.07448244094848633, - "learning_rate": 0.00010623703703703704, - "loss": 0.1995, - "step": 7830 - }, - { - "epoch": 0.7489134068873287, - "grad_norm": 0.18532446026802063, - "learning_rate": 0.0001060888888888889, - "loss": 0.2029, - "step": 7840 - }, - { - "epoch": 0.7498686535797869, - "grad_norm": 0.1823538988828659, - "learning_rate": 0.00010594074074074074, - "loss": 0.2045, - "step": 7850 - }, - { - "epoch": 0.7508239002722453, - "grad_norm": 0.17263145744800568, - "learning_rate": 0.0001057925925925926, - "loss": 0.1949, - "step": 7860 - }, - { - "epoch": 0.7517791469647036, - "grad_norm": 0.16599738597869873, - "learning_rate": 0.00010564444444444446, - "loss": 0.1974, - "step": 7870 - }, - { - "epoch": 0.752734393657162, - "grad_norm": 0.07930149137973785, - "learning_rate": 0.00010549629629629629, - "loss": 0.1969, - "step": 7880 - }, - { - "epoch": 0.7536896403496203, - "grad_norm": 0.16069641709327698, - "learning_rate": 0.00010534814814814815, - "loss": 0.2029, - "step": 7890 - }, - { - "epoch": 0.7546448870420787, - "grad_norm": 0.20293623208999634, - "learning_rate": 0.00010520000000000001, - "loss": 0.2044, - "step": 7900 - }, - { - "epoch": 0.7556001337345369, - "grad_norm": 0.18291781842708588, - "learning_rate": 0.00010505185185185187, - "loss": 0.195, - "step": 7910 - }, - { - "epoch": 0.7565553804269952, - "grad_norm": 0.13923436403274536, - "learning_rate": 0.0001049037037037037, - "loss": 0.1973, - "step": 7920 - }, - { - "epoch": 0.7575106271194536, - "grad_norm": 0.08182086795568466, - "learning_rate": 0.00010475555555555556, - "loss": 0.1986, - "step": 7930 - }, - { - "epoch": 0.7584658738119119, - "grad_norm": 0.11042799055576324, - "learning_rate": 0.00010460740740740742, - "loss": 0.2029, - "step": 7940 - }, - { - "epoch": 0.7594211205043703, - "grad_norm": 0.2199370265007019, - "learning_rate": 0.00010445925925925925, - "loss": 0.2032, - "step": 7950 - }, - { - "epoch": 0.7603763671968286, - "grad_norm": 0.19982871413230896, - "learning_rate": 0.00010431111111111111, - "loss": 0.1952, - "step": 7960 - }, - { - "epoch": 0.7613316138892869, - "grad_norm": 0.14492951333522797, - "learning_rate": 0.00010416296296296297, - "loss": 0.1963, - "step": 7970 - }, - { - "epoch": 0.7622868605817452, - "grad_norm": 0.09715760499238968, - "learning_rate": 0.0001040148148148148, - "loss": 0.1975, - "step": 7980 - }, - { - "epoch": 0.7632421072742036, - "grad_norm": 0.14056870341300964, - "learning_rate": 0.00010386666666666667, - "loss": 0.2027, - "step": 7990 - }, - { - "epoch": 0.7641973539666619, - "grad_norm": 0.23944570124149323, - "learning_rate": 0.00010371851851851853, - "loss": 0.2043, - "step": 8000 - }, - { - "epoch": 0.7651526006591202, - "grad_norm": 0.12410598993301392, - "learning_rate": 0.00010357037037037039, - "loss": 0.1944, - "step": 8010 - }, - { - "epoch": 0.7661078473515786, - "grad_norm": 0.14546865224838257, - "learning_rate": 0.00010342222222222222, - "loss": 0.1974, - "step": 8020 - }, - { - "epoch": 0.7670630940440368, - "grad_norm": 0.07528841495513916, - "learning_rate": 0.00010327407407407408, - "loss": 0.1991, - "step": 8030 - }, - { - "epoch": 0.7680183407364952, - "grad_norm": 0.11421654373407364, - "learning_rate": 0.00010312592592592594, - "loss": 0.201, - "step": 8040 - }, - { - "epoch": 0.7689735874289535, - "grad_norm": 0.2141459733247757, - "learning_rate": 0.00010297777777777777, - "loss": 0.2047, - "step": 8050 - }, - { - "epoch": 0.7699288341214119, - "grad_norm": 0.13949456810951233, - "learning_rate": 0.00010282962962962963, - "loss": 0.1943, - "step": 8060 - }, - { - "epoch": 0.7708840808138702, - "grad_norm": 0.14627870917320251, - "learning_rate": 0.00010268148148148149, - "loss": 0.1979, - "step": 8070 - }, - { - "epoch": 0.7718393275063286, - "grad_norm": 0.08122966438531876, - "learning_rate": 0.00010253333333333335, - "loss": 0.1985, - "step": 8080 - }, - { - "epoch": 0.7727945741987868, - "grad_norm": 0.11243822425603867, - "learning_rate": 0.00010238518518518518, - "loss": 0.2002, - "step": 8090 - }, - { - "epoch": 0.7737498208912451, - "grad_norm": 0.16983778774738312, - "learning_rate": 0.00010223703703703704, - "loss": 0.205, - "step": 8100 - }, - { - "epoch": 0.7747050675837035, - "grad_norm": 0.1632688045501709, - "learning_rate": 0.0001020888888888889, - "loss": 0.1929, - "step": 8110 - }, - { - "epoch": 0.7756603142761618, - "grad_norm": 0.16163240373134613, - "learning_rate": 0.00010194074074074073, - "loss": 0.1969, - "step": 8120 - }, - { - "epoch": 0.7766155609686202, - "grad_norm": 0.07510218024253845, - "learning_rate": 0.0001017925925925926, - "loss": 0.1986, - "step": 8130 - }, - { - "epoch": 0.7775708076610784, - "grad_norm": 0.10797309130430222, - "learning_rate": 0.00010164444444444445, - "loss": 0.2014, - "step": 8140 - }, - { - "epoch": 0.7785260543535368, - "grad_norm": 0.23113864660263062, - "learning_rate": 0.00010149629629629631, - "loss": 0.2046, - "step": 8150 - }, - { - "epoch": 0.7794813010459951, - "grad_norm": 0.2128959745168686, - "learning_rate": 0.00010134814814814815, - "loss": 0.1943, - "step": 8160 - }, - { - "epoch": 0.7804365477384535, - "grad_norm": 0.11465991288423538, - "learning_rate": 0.00010120000000000001, - "loss": 0.1974, - "step": 8170 - }, - { - "epoch": 0.7813917944309118, - "grad_norm": 0.08690842241048813, - "learning_rate": 0.00010105185185185187, - "loss": 0.1985, - "step": 8180 - }, - { - "epoch": 0.7823470411233701, - "grad_norm": 0.12978310883045197, - "learning_rate": 0.0001009037037037037, - "loss": 0.2004, - "step": 8190 - }, - { - "epoch": 0.7833022878158284, - "grad_norm": 0.21451863646507263, - "learning_rate": 0.00010075555555555556, - "loss": 0.2038, - "step": 8200 - }, - { - "epoch": 0.7842575345082867, - "grad_norm": 0.14841678738594055, - "learning_rate": 0.00010060740740740742, - "loss": 0.1945, - "step": 8210 - }, - { - "epoch": 0.7852127812007451, - "grad_norm": 0.1173843964934349, - "learning_rate": 0.00010045925925925928, - "loss": 0.1946, - "step": 8220 - }, - { - "epoch": 0.7861680278932034, - "grad_norm": 0.08300807327032089, - "learning_rate": 0.00010031111111111111, - "loss": 0.1971, - "step": 8230 - }, - { - "epoch": 0.7871232745856618, - "grad_norm": 0.1852853000164032, - "learning_rate": 0.00010016296296296297, - "loss": 0.2014, - "step": 8240 - }, - { - "epoch": 0.7880785212781201, - "grad_norm": 0.20180000364780426, - "learning_rate": 0.00010001481481481483, - "loss": 0.2053, - "step": 8250 - }, - { - "epoch": 0.7890337679705784, - "grad_norm": 0.1663861721754074, - "learning_rate": 9.986666666666668e-05, - "loss": 0.1941, - "step": 8260 - }, - { - "epoch": 0.7899890146630367, - "grad_norm": 0.14617478847503662, - "learning_rate": 9.971851851851852e-05, - "loss": 0.197, - "step": 8270 - }, - { - "epoch": 0.790944261355495, - "grad_norm": 0.06954821944236755, - "learning_rate": 9.957037037037038e-05, - "loss": 0.1993, - "step": 8280 - }, - { - "epoch": 0.7918995080479534, - "grad_norm": 0.1401512175798416, - "learning_rate": 9.942222222222223e-05, - "loss": 0.2021, - "step": 8290 - }, - { - "epoch": 0.7928547547404117, - "grad_norm": 0.2168128937482834, - "learning_rate": 9.927407407407408e-05, - "loss": 0.2049, - "step": 8300 - }, - { - "epoch": 0.7938100014328701, - "grad_norm": 0.14390647411346436, - "learning_rate": 9.912592592592594e-05, - "loss": 0.1922, - "step": 8310 - }, - { - "epoch": 0.7947652481253283, - "grad_norm": 0.1461930125951767, - "learning_rate": 9.897777777777778e-05, - "loss": 0.1975, - "step": 8320 - }, - { - "epoch": 0.7957204948177867, - "grad_norm": 0.06800387054681778, - "learning_rate": 9.882962962962964e-05, - "loss": 0.1974, - "step": 8330 - }, - { - "epoch": 0.796675741510245, - "grad_norm": 0.1342678666114807, - "learning_rate": 9.868148148148149e-05, - "loss": 0.2007, - "step": 8340 - }, - { - "epoch": 0.7976309882027034, - "grad_norm": 0.19177483022212982, - "learning_rate": 9.853333333333333e-05, - "loss": 0.2028, - "step": 8350 - }, - { - "epoch": 0.7985862348951617, - "grad_norm": 0.19509050250053406, - "learning_rate": 9.83851851851852e-05, - "loss": 0.1946, - "step": 8360 - }, - { - "epoch": 0.79954148158762, - "grad_norm": 0.1521720141172409, - "learning_rate": 9.823703703703704e-05, - "loss": 0.1962, - "step": 8370 - }, - { - "epoch": 0.8004967282800783, - "grad_norm": 0.07219533622264862, - "learning_rate": 9.80888888888889e-05, - "loss": 0.1983, - "step": 8380 - }, - { - "epoch": 0.8014519749725366, - "grad_norm": 0.15904979407787323, - "learning_rate": 9.794074074074075e-05, - "loss": 0.2, - "step": 8390 - }, - { - "epoch": 0.802407221664995, - "grad_norm": 0.20888783037662506, - "learning_rate": 9.77925925925926e-05, - "loss": 0.2023, - "step": 8400 - }, - { - "epoch": 0.8033624683574533, - "grad_norm": 0.15352968871593475, - "learning_rate": 9.764444444444445e-05, - "loss": 0.1941, - "step": 8410 - }, - { - "epoch": 0.8043177150499117, - "grad_norm": 0.11600544303655624, - "learning_rate": 9.74962962962963e-05, - "loss": 0.1947, - "step": 8420 - }, - { - "epoch": 0.80527296174237, - "grad_norm": 0.06507077813148499, - "learning_rate": 9.734814814814816e-05, - "loss": 0.197, - "step": 8430 - }, - { - "epoch": 0.8062282084348283, - "grad_norm": 0.14993168413639069, - "learning_rate": 9.72e-05, - "loss": 0.2017, - "step": 8440 - }, - { - "epoch": 0.8071834551272866, - "grad_norm": 0.2011600136756897, - "learning_rate": 9.705185185185186e-05, - "loss": 0.2041, - "step": 8450 - }, - { - "epoch": 0.8081387018197449, - "grad_norm": 0.12556695938110352, - "learning_rate": 9.690370370370371e-05, - "loss": 0.1939, - "step": 8460 - }, - { - "epoch": 0.8090939485122033, - "grad_norm": 0.12797193229198456, - "learning_rate": 9.675555555555556e-05, - "loss": 0.1959, - "step": 8470 - }, - { - "epoch": 0.8100491952046616, - "grad_norm": 0.0666249543428421, - "learning_rate": 9.660740740740742e-05, - "loss": 0.1977, - "step": 8480 - }, - { - "epoch": 0.81100444189712, - "grad_norm": 0.11509077996015549, - "learning_rate": 9.645925925925926e-05, - "loss": 0.2009, - "step": 8490 - }, - { - "epoch": 0.8119596885895782, - "grad_norm": 0.22886355221271515, - "learning_rate": 9.631111111111112e-05, - "loss": 0.2036, - "step": 8500 - }, - { - "epoch": 0.8129149352820366, - "grad_norm": 0.21546624600887299, - "learning_rate": 9.616296296296297e-05, - "loss": 0.1924, - "step": 8510 - }, - { - "epoch": 0.8138701819744949, - "grad_norm": 0.15088830888271332, - "learning_rate": 9.601481481481483e-05, - "loss": 0.1968, - "step": 8520 - }, - { - "epoch": 0.8148254286669533, - "grad_norm": 0.11225682497024536, - "learning_rate": 9.586666666666667e-05, - "loss": 0.1969, - "step": 8530 - }, - { - "epoch": 0.8157806753594116, - "grad_norm": 0.14504078030586243, - "learning_rate": 9.571851851851852e-05, - "loss": 0.2008, - "step": 8540 - }, - { - "epoch": 0.8167359220518698, - "grad_norm": 0.18268372118473053, - "learning_rate": 9.557037037037038e-05, - "loss": 0.2043, - "step": 8550 - }, - { - "epoch": 0.8176911687443282, - "grad_norm": 0.15672026574611664, - "learning_rate": 9.542222222222223e-05, - "loss": 0.1919, - "step": 8560 - }, - { - "epoch": 0.8186464154367865, - "grad_norm": 0.1838408261537552, - "learning_rate": 9.527407407407409e-05, - "loss": 0.1949, - "step": 8570 - }, - { - "epoch": 0.8196016621292449, - "grad_norm": 0.07827872782945633, - "learning_rate": 9.512592592592593e-05, - "loss": 0.1981, - "step": 8580 - }, - { - "epoch": 0.8205569088217032, - "grad_norm": 0.10423124581575394, - "learning_rate": 9.497777777777779e-05, - "loss": 0.2015, - "step": 8590 - }, - { - "epoch": 0.8215121555141616, - "grad_norm": 0.1930837333202362, - "learning_rate": 9.482962962962964e-05, - "loss": 0.2033, - "step": 8600 - }, - { - "epoch": 0.8224674022066198, - "grad_norm": 0.1966949999332428, - "learning_rate": 9.468148148148149e-05, - "loss": 0.1933, - "step": 8610 - }, - { - "epoch": 0.8234226488990782, - "grad_norm": 0.12053806334733963, - "learning_rate": 9.453333333333335e-05, - "loss": 0.1942, - "step": 8620 - }, - { - "epoch": 0.8243778955915365, - "grad_norm": 0.07416214793920517, - "learning_rate": 9.438518518518519e-05, - "loss": 0.1982, - "step": 8630 - }, - { - "epoch": 0.8253331422839948, - "grad_norm": 0.1334538608789444, - "learning_rate": 9.423703703703705e-05, - "loss": 0.2006, - "step": 8640 - }, - { - "epoch": 0.8262883889764532, - "grad_norm": 0.18779663741588593, - "learning_rate": 9.40888888888889e-05, - "loss": 0.2041, - "step": 8650 - }, - { - "epoch": 0.8272436356689115, - "grad_norm": 0.14109750092029572, - "learning_rate": 9.394074074074074e-05, - "loss": 0.1911, - "step": 8660 - }, - { - "epoch": 0.8281988823613698, - "grad_norm": 0.11741621792316437, - "learning_rate": 9.37925925925926e-05, - "loss": 0.1971, - "step": 8670 - }, - { - "epoch": 0.8291541290538281, - "grad_norm": 0.08277934044599533, - "learning_rate": 9.364444444444445e-05, - "loss": 0.1962, - "step": 8680 - }, - { - "epoch": 0.8301093757462865, - "grad_norm": 0.14392395317554474, - "learning_rate": 9.349629629629631e-05, - "loss": 0.2006, - "step": 8690 - }, - { - "epoch": 0.8310646224387448, - "grad_norm": 0.24216341972351074, - "learning_rate": 9.334814814814816e-05, - "loss": 0.204, - "step": 8700 - }, - { - "epoch": 0.8320198691312032, - "grad_norm": 0.15122485160827637, - "learning_rate": 9.320000000000002e-05, - "loss": 0.1921, - "step": 8710 - }, - { - "epoch": 0.8329751158236615, - "grad_norm": 0.13646939396858215, - "learning_rate": 9.305185185185186e-05, - "loss": 0.1971, - "step": 8720 - }, - { - "epoch": 0.8339303625161197, - "grad_norm": 0.07911708950996399, - "learning_rate": 9.290370370370371e-05, - "loss": 0.1982, - "step": 8730 - }, - { - "epoch": 0.8348856092085781, - "grad_norm": 0.10435432940721512, - "learning_rate": 9.275555555555557e-05, - "loss": 0.2005, - "step": 8740 - }, - { - "epoch": 0.8358408559010364, - "grad_norm": 0.23191022872924805, - "learning_rate": 9.260740740740741e-05, - "loss": 0.2042, - "step": 8750 - }, - { - "epoch": 0.8367961025934948, - "grad_norm": 0.16332274675369263, - "learning_rate": 9.245925925925927e-05, - "loss": 0.1928, - "step": 8760 - }, - { - "epoch": 0.8377513492859531, - "grad_norm": 0.16581833362579346, - "learning_rate": 9.231111111111112e-05, - "loss": 0.1964, - "step": 8770 - }, - { - "epoch": 0.8387065959784115, - "grad_norm": 0.06937286257743835, - "learning_rate": 9.216296296296297e-05, - "loss": 0.1963, - "step": 8780 - }, - { - "epoch": 0.8396618426708697, - "grad_norm": 0.12238670140504837, - "learning_rate": 9.201481481481483e-05, - "loss": 0.2002, - "step": 8790 - }, - { - "epoch": 0.8406170893633281, - "grad_norm": 0.1900535523891449, - "learning_rate": 9.186666666666667e-05, - "loss": 0.2038, - "step": 8800 - }, - { - "epoch": 0.8415723360557864, - "grad_norm": 0.20824721455574036, - "learning_rate": 9.171851851851853e-05, - "loss": 0.1949, - "step": 8810 - }, - { - "epoch": 0.8425275827482447, - "grad_norm": 0.127131387591362, - "learning_rate": 9.157037037037038e-05, - "loss": 0.1934, - "step": 8820 - }, - { - "epoch": 0.8434828294407031, - "grad_norm": 0.076890729367733, - "learning_rate": 9.142222222222222e-05, - "loss": 0.1955, - "step": 8830 - }, - { - "epoch": 0.8444380761331614, - "grad_norm": 0.14583854377269745, - "learning_rate": 9.127407407407408e-05, - "loss": 0.2006, - "step": 8840 - }, - { - "epoch": 0.8453933228256197, - "grad_norm": 0.2490548938512802, - "learning_rate": 9.112592592592593e-05, - "loss": 0.2054, - "step": 8850 - }, - { - "epoch": 0.846348569518078, - "grad_norm": 0.11510622501373291, - "learning_rate": 9.097777777777779e-05, - "loss": 0.1924, - "step": 8860 - }, - { - "epoch": 0.8473038162105364, - "grad_norm": 0.11410252749919891, - "learning_rate": 9.082962962962964e-05, - "loss": 0.1949, - "step": 8870 - }, - { - "epoch": 0.8482590629029947, - "grad_norm": 0.08570121228694916, - "learning_rate": 9.068148148148148e-05, - "loss": 0.1972, - "step": 8880 - }, - { - "epoch": 0.8492143095954531, - "grad_norm": 0.1635618507862091, - "learning_rate": 9.053333333333334e-05, - "loss": 0.2009, - "step": 8890 - }, - { - "epoch": 0.8501695562879114, - "grad_norm": 0.2034139484167099, - "learning_rate": 9.038518518518519e-05, - "loss": 0.2037, - "step": 8900 - }, - { - "epoch": 0.8511248029803696, - "grad_norm": 0.16847805678844452, - "learning_rate": 9.023703703703704e-05, - "loss": 0.1898, - "step": 8910 - }, - { - "epoch": 0.852080049672828, - "grad_norm": 0.16264882683753967, - "learning_rate": 9.00888888888889e-05, - "loss": 0.1967, - "step": 8920 - }, - { - "epoch": 0.8530352963652863, - "grad_norm": 0.07546920329332352, - "learning_rate": 8.994074074074074e-05, - "loss": 0.1982, - "step": 8930 - }, - { - "epoch": 0.8539905430577447, - "grad_norm": 0.12496549636125565, - "learning_rate": 8.97925925925926e-05, - "loss": 0.1992, - "step": 8940 - }, - { - "epoch": 0.854945789750203, - "grad_norm": 0.17706719040870667, - "learning_rate": 8.964444444444445e-05, - "loss": 0.204, - "step": 8950 - }, - { - "epoch": 0.8559010364426614, - "grad_norm": 0.17304418981075287, - "learning_rate": 8.94962962962963e-05, - "loss": 0.1928, - "step": 8960 - }, - { - "epoch": 0.8568562831351196, - "grad_norm": 0.22156664729118347, - "learning_rate": 8.934814814814815e-05, - "loss": 0.195, - "step": 8970 - }, - { - "epoch": 0.857811529827578, - "grad_norm": 0.07757692039012909, - "learning_rate": 8.92e-05, - "loss": 0.1975, - "step": 8980 - }, - { - "epoch": 0.8587667765200363, - "grad_norm": 0.10818079113960266, - "learning_rate": 8.905185185185186e-05, - "loss": 0.1993, - "step": 8990 - }, - { - "epoch": 0.8597220232124946, - "grad_norm": 0.19327661395072937, - "learning_rate": 8.89037037037037e-05, - "loss": 0.2033, - "step": 9000 - }, - { - "epoch": 0.860677269904953, - "grad_norm": 0.18037331104278564, - "learning_rate": 8.875555555555555e-05, - "loss": 0.1933, - "step": 9010 - }, - { - "epoch": 0.8616325165974112, - "grad_norm": 0.11041972786188126, - "learning_rate": 8.860740740740741e-05, - "loss": 0.1941, - "step": 9020 - }, - { - "epoch": 0.8625877632898696, - "grad_norm": 0.06798222661018372, - "learning_rate": 8.845925925925926e-05, - "loss": 0.1974, - "step": 9030 - }, - { - "epoch": 0.8635430099823279, - "grad_norm": 0.14087295532226562, - "learning_rate": 8.831111111111112e-05, - "loss": 0.2009, - "step": 9040 - }, - { - "epoch": 0.8644982566747863, - "grad_norm": 0.23657061159610748, - "learning_rate": 8.816296296296296e-05, - "loss": 0.2038, - "step": 9050 - }, - { - "epoch": 0.8654535033672446, - "grad_norm": 0.1508956402540207, - "learning_rate": 8.801481481481481e-05, - "loss": 0.1916, - "step": 9060 - }, - { - "epoch": 0.866408750059703, - "grad_norm": 0.07923205196857452, - "learning_rate": 8.786666666666667e-05, - "loss": 0.1969, - "step": 9070 - }, - { - "epoch": 0.8673639967521612, - "grad_norm": 0.08770725876092911, - "learning_rate": 8.771851851851852e-05, - "loss": 0.1988, - "step": 9080 - }, - { - "epoch": 0.8683192434446195, - "grad_norm": 0.12787607312202454, - "learning_rate": 8.757037037037036e-05, - "loss": 0.2013, - "step": 9090 - }, - { - "epoch": 0.8692744901370779, - "grad_norm": 0.23347334563732147, - "learning_rate": 8.742222222222222e-05, - "loss": 0.2038, - "step": 9100 - }, - { - "epoch": 0.8702297368295362, - "grad_norm": 0.20431476831436157, - "learning_rate": 8.727407407407407e-05, - "loss": 0.19, - "step": 9110 - }, - { - "epoch": 0.8711849835219946, - "grad_norm": 0.173463836312294, - "learning_rate": 8.712592592592593e-05, - "loss": 0.197, - "step": 9120 - }, - { - "epoch": 0.8721402302144529, - "grad_norm": 0.07533210515975952, - "learning_rate": 8.697777777777777e-05, - "loss": 0.1981, - "step": 9130 - }, - { - "epoch": 0.8730954769069112, - "grad_norm": 0.1275360882282257, - "learning_rate": 8.682962962962963e-05, - "loss": 0.2009, - "step": 9140 - }, - { - "epoch": 0.8740507235993695, - "grad_norm": 0.18006309866905212, - "learning_rate": 8.668148148148148e-05, - "loss": 0.204, - "step": 9150 - }, - { - "epoch": 0.8750059702918279, - "grad_norm": 0.17089661955833435, - "learning_rate": 8.653333333333333e-05, - "loss": 0.1897, - "step": 9160 - }, - { - "epoch": 0.8759612169842862, - "grad_norm": 0.19405335187911987, - "learning_rate": 8.638518518518519e-05, - "loss": 0.1961, - "step": 9170 - }, - { - "epoch": 0.8769164636767445, - "grad_norm": 0.07635375112295151, - "learning_rate": 8.623703703703703e-05, - "loss": 0.1976, - "step": 9180 - }, - { - "epoch": 0.8778717103692029, - "grad_norm": 0.1420837789773941, - "learning_rate": 8.608888888888889e-05, - "loss": 0.2007, - "step": 9190 - }, - { - "epoch": 0.8788269570616611, - "grad_norm": 0.2507460117340088, - "learning_rate": 8.594074074074074e-05, - "loss": 0.2019, - "step": 9200 - }, - { - "epoch": 0.8797822037541195, - "grad_norm": 0.1322818398475647, - "learning_rate": 8.57925925925926e-05, - "loss": 0.1921, - "step": 9210 - }, - { - "epoch": 0.8807374504465778, - "grad_norm": 0.15335558354854584, - "learning_rate": 8.564444444444445e-05, - "loss": 0.1948, - "step": 9220 - }, - { - "epoch": 0.8816926971390362, - "grad_norm": 0.0793461874127388, - "learning_rate": 8.549629629629629e-05, - "loss": 0.197, - "step": 9230 - }, - { - "epoch": 0.8826479438314945, - "grad_norm": 0.10587692260742188, - "learning_rate": 8.534814814814815e-05, - "loss": 0.2006, - "step": 9240 - }, - { - "epoch": 0.8836031905239529, - "grad_norm": 0.211539164185524, - "learning_rate": 8.52e-05, - "loss": 0.202, - "step": 9250 - }, - { - "epoch": 0.8845584372164111, - "grad_norm": 0.1647574007511139, - "learning_rate": 8.505185185185186e-05, - "loss": 0.1926, - "step": 9260 - }, - { - "epoch": 0.8855136839088694, - "grad_norm": 0.10489039868116379, - "learning_rate": 8.49037037037037e-05, - "loss": 0.1933, - "step": 9270 - }, - { - "epoch": 0.8864689306013278, - "grad_norm": 0.08796203136444092, - "learning_rate": 8.475555555555555e-05, - "loss": 0.1961, - "step": 9280 - }, - { - "epoch": 0.8874241772937861, - "grad_norm": 0.15730325877666473, - "learning_rate": 8.460740740740741e-05, - "loss": 0.2007, - "step": 9290 - }, - { - "epoch": 0.8883794239862445, - "grad_norm": 0.22997820377349854, - "learning_rate": 8.445925925925926e-05, - "loss": 0.203, - "step": 9300 - }, - { - "epoch": 0.8893346706787028, - "grad_norm": 0.0923224613070488, - "learning_rate": 8.431111111111112e-05, - "loss": 0.1927, - "step": 9310 - }, - { - "epoch": 0.8902899173711611, - "grad_norm": 0.09773056209087372, - "learning_rate": 8.416296296296296e-05, - "loss": 0.1962, - "step": 9320 - }, - { - "epoch": 0.8912451640636194, - "grad_norm": 0.07204195111989975, - "learning_rate": 8.401481481481482e-05, - "loss": 0.1969, - "step": 9330 - }, - { - "epoch": 0.8922004107560778, - "grad_norm": 0.1365681141614914, - "learning_rate": 8.386666666666667e-05, - "loss": 0.2012, - "step": 9340 - }, - { - "epoch": 0.8931556574485361, - "grad_norm": 0.1925460696220398, - "learning_rate": 8.371851851851851e-05, - "loss": 0.2034, - "step": 9350 - }, - { - "epoch": 0.8941109041409944, - "grad_norm": 0.13350558280944824, - "learning_rate": 8.357037037037037e-05, - "loss": 0.1927, - "step": 9360 - }, - { - "epoch": 0.8950661508334528, - "grad_norm": 0.1282387226819992, - "learning_rate": 8.342222222222222e-05, - "loss": 0.195, - "step": 9370 - }, - { - "epoch": 0.896021397525911, - "grad_norm": 0.08503387868404388, - "learning_rate": 8.327407407407408e-05, - "loss": 0.199, - "step": 9380 - }, - { - "epoch": 0.8969766442183694, - "grad_norm": 0.13588573038578033, - "learning_rate": 8.312592592592593e-05, - "loss": 0.2006, - "step": 9390 - }, - { - "epoch": 0.8979318909108277, - "grad_norm": 0.18095846474170685, - "learning_rate": 8.297777777777777e-05, - "loss": 0.2038, - "step": 9400 - }, - { - "epoch": 0.8988871376032861, - "grad_norm": 0.1470443606376648, - "learning_rate": 8.282962962962963e-05, - "loss": 0.1914, - "step": 9410 - }, - { - "epoch": 0.8998423842957444, - "grad_norm": 0.11700747907161713, - "learning_rate": 8.268148148148148e-05, - "loss": 0.1945, - "step": 9420 - }, - { - "epoch": 0.9007976309882028, - "grad_norm": 0.08257856965065002, - "learning_rate": 8.253333333333334e-05, - "loss": 0.1965, - "step": 9430 - }, - { - "epoch": 0.901752877680661, - "grad_norm": 0.11103440821170807, - "learning_rate": 8.238518518518518e-05, - "loss": 0.2012, - "step": 9440 - }, - { - "epoch": 0.9027081243731193, - "grad_norm": 0.19891560077667236, - "learning_rate": 8.223703703703704e-05, - "loss": 0.2051, - "step": 9450 - }, - { - "epoch": 0.9036633710655777, - "grad_norm": 0.15487581491470337, - "learning_rate": 8.208888888888889e-05, - "loss": 0.1919, - "step": 9460 - }, - { - "epoch": 0.904618617758036, - "grad_norm": 0.11258412897586823, - "learning_rate": 8.194074074074074e-05, - "loss": 0.1943, - "step": 9470 - }, - { - "epoch": 0.9055738644504944, - "grad_norm": 0.07216157019138336, - "learning_rate": 8.17925925925926e-05, - "loss": 0.1976, - "step": 9480 - }, - { - "epoch": 0.9065291111429526, - "grad_norm": 0.16788683831691742, - "learning_rate": 8.164444444444444e-05, - "loss": 0.2014, - "step": 9490 - }, - { - "epoch": 0.907484357835411, - "grad_norm": 0.1965964436531067, - "learning_rate": 8.14962962962963e-05, - "loss": 0.2047, - "step": 9500 - }, - { - "epoch": 0.9084396045278693, - "grad_norm": 0.2151501625776291, - "learning_rate": 8.134814814814815e-05, - "loss": 0.1917, - "step": 9510 - }, - { - "epoch": 0.9093948512203277, - "grad_norm": 0.09645062685012817, - "learning_rate": 8.120000000000001e-05, - "loss": 0.1931, - "step": 9520 - }, - { - "epoch": 0.910350097912786, - "grad_norm": 0.07982160151004791, - "learning_rate": 8.105185185185185e-05, - "loss": 0.1987, - "step": 9530 - }, - { - "epoch": 0.9113053446052443, - "grad_norm": 0.1163114458322525, - "learning_rate": 8.09037037037037e-05, - "loss": 0.2005, - "step": 9540 - }, - { - "epoch": 0.9122605912977026, - "grad_norm": 0.2425907999277115, - "learning_rate": 8.075555555555556e-05, - "loss": 0.2047, - "step": 9550 - }, - { - "epoch": 0.9132158379901609, - "grad_norm": 0.14334097504615784, - "learning_rate": 8.060740740740741e-05, - "loss": 0.191, - "step": 9560 - }, - { - "epoch": 0.9141710846826193, - "grad_norm": 0.10129152238368988, - "learning_rate": 8.045925925925927e-05, - "loss": 0.1947, - "step": 9570 - }, - { - "epoch": 0.9151263313750776, - "grad_norm": 0.08656469732522964, - "learning_rate": 8.031111111111111e-05, - "loss": 0.1975, - "step": 9580 - }, - { - "epoch": 0.916081578067536, - "grad_norm": 0.10407857596874237, - "learning_rate": 8.016296296296296e-05, - "loss": 0.1992, - "step": 9590 - }, - { - "epoch": 0.9170368247599943, - "grad_norm": 0.17440539598464966, - "learning_rate": 8.001481481481482e-05, - "loss": 0.203, - "step": 9600 - }, - { - "epoch": 0.9179920714524525, - "grad_norm": 0.15519316494464874, - "learning_rate": 7.986666666666667e-05, - "loss": 0.1895, - "step": 9610 - }, - { - "epoch": 0.9189473181449109, - "grad_norm": 0.1368798166513443, - "learning_rate": 7.971851851851853e-05, - "loss": 0.194, - "step": 9620 - }, - { - "epoch": 0.9199025648373692, - "grad_norm": 0.08210785686969757, - "learning_rate": 7.957037037037037e-05, - "loss": 0.1975, - "step": 9630 - }, - { - "epoch": 0.9208578115298276, - "grad_norm": 0.12684789299964905, - "learning_rate": 7.942222222222223e-05, - "loss": 0.1998, - "step": 9640 - }, - { - "epoch": 0.9218130582222859, - "grad_norm": 0.159325510263443, - "learning_rate": 7.927407407407408e-05, - "loss": 0.2051, - "step": 9650 - }, - { - "epoch": 0.9227683049147443, - "grad_norm": 0.21739524602890015, - "learning_rate": 7.912592592592592e-05, - "loss": 0.1886, - "step": 9660 - }, - { - "epoch": 0.9237235516072025, - "grad_norm": 0.12373743951320648, - "learning_rate": 7.897777777777778e-05, - "loss": 0.1939, - "step": 9670 - }, - { - "epoch": 0.9246787982996609, - "grad_norm": 0.11995956301689148, - "learning_rate": 7.882962962962963e-05, - "loss": 0.1963, - "step": 9680 - }, - { - "epoch": 0.9256340449921192, - "grad_norm": 0.1054321900010109, - "learning_rate": 7.868148148148149e-05, - "loss": 0.1995, - "step": 9690 - }, - { - "epoch": 0.9265892916845775, - "grad_norm": 0.18521544337272644, - "learning_rate": 7.853333333333334e-05, - "loss": 0.2033, - "step": 9700 - }, - { - "epoch": 0.9275445383770359, - "grad_norm": 0.14290209114551544, - "learning_rate": 7.838518518518518e-05, - "loss": 0.1912, - "step": 9710 - }, - { - "epoch": 0.9284997850694942, - "grad_norm": 0.12010928988456726, - "learning_rate": 7.823703703703704e-05, - "loss": 0.1938, - "step": 9720 - }, - { - "epoch": 0.9294550317619525, - "grad_norm": 0.08440925925970078, - "learning_rate": 7.808888888888889e-05, - "loss": 0.196, - "step": 9730 - }, - { - "epoch": 0.9304102784544108, - "grad_norm": 0.13756245374679565, - "learning_rate": 7.794074074074075e-05, - "loss": 0.2004, - "step": 9740 - }, - { - "epoch": 0.9313655251468692, - "grad_norm": 0.1666550189256668, - "learning_rate": 7.77925925925926e-05, - "loss": 0.2037, - "step": 9750 - }, - { - "epoch": 0.9323207718393275, - "grad_norm": 0.21596874296665192, - "learning_rate": 7.764444444444445e-05, - "loss": 0.1926, - "step": 9760 - }, - { - "epoch": 0.9332760185317859, - "grad_norm": 0.10867001116275787, - "learning_rate": 7.74962962962963e-05, - "loss": 0.1933, - "step": 9770 - }, - { - "epoch": 0.9342312652242442, - "grad_norm": 0.0639323890209198, - "learning_rate": 7.734814814814815e-05, - "loss": 0.1959, - "step": 9780 - }, - { - "epoch": 0.9351865119167024, - "grad_norm": 0.12160497158765793, - "learning_rate": 7.72e-05, - "loss": 0.2011, - "step": 9790 - }, - { - "epoch": 0.9361417586091608, - "grad_norm": 0.20208191871643066, - "learning_rate": 7.705185185185185e-05, - "loss": 0.2041, - "step": 9800 - }, - { - "epoch": 0.9370970053016191, - "grad_norm": 0.2048955261707306, - "learning_rate": 7.690370370370371e-05, - "loss": 0.1901, - "step": 9810 - }, - { - "epoch": 0.9380522519940775, - "grad_norm": 0.1331399381160736, - "learning_rate": 7.675555555555556e-05, - "loss": 0.1953, - "step": 9820 - }, - { - "epoch": 0.9390074986865358, - "grad_norm": 0.07257109880447388, - "learning_rate": 7.660740740740742e-05, - "loss": 0.1973, - "step": 9830 - }, - { - "epoch": 0.9399627453789942, - "grad_norm": 0.12627868354320526, - "learning_rate": 7.645925925925926e-05, - "loss": 0.1994, - "step": 9840 - }, - { - "epoch": 0.9409179920714524, - "grad_norm": 0.18185719847679138, - "learning_rate": 7.631111111111111e-05, - "loss": 0.2038, - "step": 9850 - }, - { - "epoch": 0.9418732387639108, - "grad_norm": 0.14436939358711243, - "learning_rate": 7.616296296296297e-05, - "loss": 0.1892, - "step": 9860 - }, - { - "epoch": 0.9428284854563691, - "grad_norm": 0.1035638079047203, - "learning_rate": 7.601481481481482e-05, - "loss": 0.1933, - "step": 9870 - }, - { - "epoch": 0.9437837321488274, - "grad_norm": 0.10040664672851562, - "learning_rate": 7.586666666666668e-05, - "loss": 0.1962, - "step": 9880 - }, - { - "epoch": 0.9447389788412858, - "grad_norm": 0.12080970406532288, - "learning_rate": 7.571851851851852e-05, - "loss": 0.2002, - "step": 9890 - }, - { - "epoch": 0.945694225533744, - "grad_norm": 0.16594280302524567, - "learning_rate": 7.557037037037037e-05, - "loss": 0.2032, - "step": 9900 - }, - { - "epoch": 0.9466494722262024, - "grad_norm": 0.20917296409606934, - "learning_rate": 7.542222222222223e-05, - "loss": 0.1906, - "step": 9910 - }, - { - "epoch": 0.9476047189186607, - "grad_norm": 0.12201665341854095, - "learning_rate": 7.527407407407408e-05, - "loss": 0.1947, - "step": 9920 - }, - { - "epoch": 0.9485599656111191, - "grad_norm": 0.12169451266527176, - "learning_rate": 7.512592592592593e-05, - "loss": 0.1989, - "step": 9930 - }, - { - "epoch": 0.9495152123035774, - "grad_norm": 0.09808619320392609, - "learning_rate": 7.497777777777778e-05, - "loss": 0.1994, - "step": 9940 - }, - { - "epoch": 0.9504704589960358, - "grad_norm": 0.1698949635028839, - "learning_rate": 7.482962962962964e-05, - "loss": 0.2037, - "step": 9950 - }, - { - "epoch": 0.951425705688494, - "grad_norm": 0.20752237737178802, - "learning_rate": 7.468148148148149e-05, - "loss": 0.1886, - "step": 9960 - }, - { - "epoch": 0.9523809523809523, - "grad_norm": 0.11208115518093109, - "learning_rate": 7.453333333333333e-05, - "loss": 0.1937, - "step": 9970 - }, - { - "epoch": 0.9533361990734107, - "grad_norm": 0.11728549003601074, - "learning_rate": 7.43851851851852e-05, - "loss": 0.1974, - "step": 9980 - }, - { - "epoch": 0.954291445765869, - "grad_norm": 0.10618801414966583, - "learning_rate": 7.423703703703704e-05, - "loss": 0.1995, - "step": 9990 - }, - { - "epoch": 0.9552466924583274, - "grad_norm": 0.1451137661933899, - "learning_rate": 7.40888888888889e-05, - "loss": 0.2033, - "step": 10000 - }, - { - "epoch": 0.9562019391507857, - "grad_norm": 0.24454337358474731, - "learning_rate": 7.394074074074075e-05, - "loss": 0.1911, - "step": 10010 - }, - { - "epoch": 0.957157185843244, - "grad_norm": 0.08611460030078888, - "learning_rate": 7.379259259259259e-05, - "loss": 0.1931, - "step": 10020 - }, - { - "epoch": 0.9581124325357023, - "grad_norm": 0.07674242556095123, - "learning_rate": 7.364444444444445e-05, - "loss": 0.1967, - "step": 10030 - }, - { - "epoch": 0.9590676792281607, - "grad_norm": 0.16499543190002441, - "learning_rate": 7.34962962962963e-05, - "loss": 0.1996, - "step": 10040 - }, - { - "epoch": 0.960022925920619, - "grad_norm": 0.19315092265605927, - "learning_rate": 7.334814814814816e-05, - "loss": 0.204, - "step": 10050 - }, - { - "epoch": 0.9609781726130773, - "grad_norm": 0.14484643936157227, - "learning_rate": 7.32e-05, - "loss": 0.1874, - "step": 10060 - }, - { - "epoch": 0.9619334193055357, - "grad_norm": 0.12405912578105927, - "learning_rate": 7.305185185185186e-05, - "loss": 0.195, - "step": 10070 - }, - { - "epoch": 0.9628886659979939, - "grad_norm": 0.06671059131622314, - "learning_rate": 7.290370370370371e-05, - "loss": 0.1973, - "step": 10080 - }, - { - "epoch": 0.9638439126904523, - "grad_norm": 0.09796929359436035, - "learning_rate": 7.275555555555556e-05, - "loss": 0.2002, - "step": 10090 - }, - { - "epoch": 0.9647991593829106, - "grad_norm": 0.18177813291549683, - "learning_rate": 7.260740740740742e-05, - "loss": 0.2044, - "step": 10100 - }, - { - "epoch": 0.965754406075369, - "grad_norm": 0.1589105874300003, - "learning_rate": 7.245925925925926e-05, - "loss": 0.1897, - "step": 10110 - }, - { - "epoch": 0.9667096527678273, - "grad_norm": 0.16179531812667847, - "learning_rate": 7.231111111111112e-05, - "loss": 0.1951, - "step": 10120 - }, - { - "epoch": 0.9676648994602857, - "grad_norm": 0.08069080859422684, - "learning_rate": 7.216296296296297e-05, - "loss": 0.1972, - "step": 10130 - }, - { - "epoch": 0.9686201461527439, - "grad_norm": 0.11243908107280731, - "learning_rate": 7.201481481481481e-05, - "loss": 0.1986, - "step": 10140 - }, - { - "epoch": 0.9695753928452022, - "grad_norm": 0.1934669017791748, - "learning_rate": 7.186666666666667e-05, - "loss": 0.2036, - "step": 10150 - }, - { - "epoch": 0.9705306395376606, - "grad_norm": 0.15914154052734375, - "learning_rate": 7.171851851851852e-05, - "loss": 0.1877, - "step": 10160 - }, - { - "epoch": 0.9714858862301189, - "grad_norm": 0.07927779853343964, - "learning_rate": 7.157037037037038e-05, - "loss": 0.1938, - "step": 10170 - }, - { - "epoch": 0.9724411329225773, - "grad_norm": 0.15848129987716675, - "learning_rate": 7.142222222222223e-05, - "loss": 0.1973, - "step": 10180 - }, - { - "epoch": 0.9733963796150356, - "grad_norm": 0.09003908932209015, - "learning_rate": 7.127407407407409e-05, - "loss": 0.2001, - "step": 10190 - }, - { - "epoch": 0.974351626307494, - "grad_norm": 0.16996480524539948, - "learning_rate": 7.112592592592593e-05, - "loss": 0.2019, - "step": 10200 - }, - { - "epoch": 0.9753068729999522, - "grad_norm": 0.18027909100055695, - "learning_rate": 7.097777777777778e-05, - "loss": 0.1914, - "step": 10210 - }, - { - "epoch": 0.9762621196924106, - "grad_norm": 0.09944116324186325, - "learning_rate": 7.082962962962964e-05, - "loss": 0.1929, - "step": 10220 - }, - { - "epoch": 0.9772173663848689, - "grad_norm": 0.11399278044700623, - "learning_rate": 7.068148148148148e-05, - "loss": 0.1965, - "step": 10230 - }, - { - "epoch": 0.9781726130773272, - "grad_norm": 0.11866579204797745, - "learning_rate": 7.053333333333334e-05, - "loss": 0.1999, - "step": 10240 - }, - { - "epoch": 0.9791278597697856, - "grad_norm": 0.20890718698501587, - "learning_rate": 7.038518518518519e-05, - "loss": 0.2044, - "step": 10250 - }, - { - "epoch": 0.9800831064622438, - "grad_norm": 0.16690291464328766, - "learning_rate": 7.023703703703705e-05, - "loss": 0.1896, - "step": 10260 - }, - { - "epoch": 0.9810383531547022, - "grad_norm": 0.12047307938337326, - "learning_rate": 7.00888888888889e-05, - "loss": 0.1929, - "step": 10270 - }, - { - "epoch": 0.9819935998471605, - "grad_norm": 0.07980793714523315, - "learning_rate": 6.994074074074074e-05, - "loss": 0.1968, - "step": 10280 - }, - { - "epoch": 0.9829488465396189, - "grad_norm": 0.14673179388046265, - "learning_rate": 6.97925925925926e-05, - "loss": 0.2006, - "step": 10290 - }, - { - "epoch": 0.9839040932320772, - "grad_norm": 0.1462879627943039, - "learning_rate": 6.964444444444445e-05, - "loss": 0.2029, - "step": 10300 - }, - { - "epoch": 0.9848593399245356, - "grad_norm": 0.16838905215263367, - "learning_rate": 6.949629629629631e-05, - "loss": 0.1887, - "step": 10310 - }, - { - "epoch": 0.9858145866169938, - "grad_norm": 0.14399904012680054, - "learning_rate": 6.934814814814816e-05, - "loss": 0.1941, - "step": 10320 - }, - { - "epoch": 0.9867698333094521, - "grad_norm": 0.06735172122716904, - "learning_rate": 6.92e-05, - "loss": 0.198, - "step": 10330 - }, - { - "epoch": 0.9877250800019105, - "grad_norm": 0.0967218354344368, - "learning_rate": 6.905185185185186e-05, - "loss": 0.2014, - "step": 10340 - }, - { - "epoch": 0.9886803266943688, - "grad_norm": 0.18309317529201508, - "learning_rate": 6.890370370370371e-05, - "loss": 0.2027, - "step": 10350 - }, - { - "epoch": 0.9896355733868272, - "grad_norm": 0.12692053616046906, - "learning_rate": 6.875555555555557e-05, - "loss": 0.1896, - "step": 10360 - }, - { - "epoch": 0.9905908200792855, - "grad_norm": 0.09795171767473221, - "learning_rate": 6.860740740740741e-05, - "loss": 0.193, - "step": 10370 - }, - { - "epoch": 0.9915460667717438, - "grad_norm": 0.11544561386108398, - "learning_rate": 6.845925925925927e-05, - "loss": 0.1973, - "step": 10380 - }, - { - "epoch": 0.9925013134642021, - "grad_norm": 0.1150268018245697, - "learning_rate": 6.831111111111112e-05, - "loss": 0.2015, - "step": 10390 - }, - { - "epoch": 0.9934565601566605, - "grad_norm": 0.160989910364151, - "learning_rate": 6.816296296296297e-05, - "loss": 0.2036, - "step": 10400 - }, - { - "epoch": 0.9944118068491188, - "grad_norm": 0.14440488815307617, - "learning_rate": 6.801481481481483e-05, - "loss": 0.1895, - "step": 10410 - }, - { - "epoch": 0.9953670535415771, - "grad_norm": 0.07250352948904037, - "learning_rate": 6.786666666666667e-05, - "loss": 0.1927, - "step": 10420 - }, - { - "epoch": 0.9963223002340355, - "grad_norm": 0.11280882358551025, - "learning_rate": 6.771851851851853e-05, - "loss": 0.1962, - "step": 10430 - }, - { - "epoch": 0.9972775469264937, - "grad_norm": 0.11649350076913834, - "learning_rate": 6.757037037037038e-05, - "loss": 0.2007, - "step": 10440 - }, - { - "epoch": 0.9982327936189521, - "grad_norm": 0.17705261707305908, - "learning_rate": 6.742222222222222e-05, - "loss": 0.204, - "step": 10450 - }, - { - "epoch": 0.9991880403114104, - "grad_norm": 0.07574629783630371, - "learning_rate": 6.727407407407408e-05, - "loss": 0.1946, - "step": 10460 - }, - { - "epoch": 1.0000955246692458, - "grad_norm": 0.16626384854316711, - "learning_rate": 6.712592592592593e-05, - "loss": 0.198, - "step": 10470 - }, - { - "epoch": 1.001050771361704, - "grad_norm": 0.09976531565189362, - "learning_rate": 6.697777777777779e-05, - "loss": 0.1857, - "step": 10480 - }, - { - "epoch": 1.0020060180541626, - "grad_norm": 0.06542108952999115, - "learning_rate": 6.682962962962964e-05, - "loss": 0.1922, - "step": 10490 - }, - { - "epoch": 1.0029612647466208, - "grad_norm": 0.09266208857297897, - "learning_rate": 6.668148148148148e-05, - "loss": 0.1982, - "step": 10500 - }, - { - "epoch": 1.0039165114390791, - "grad_norm": 0.11459262669086456, - "learning_rate": 6.653333333333334e-05, - "loss": 0.2009, - "step": 10510 - }, - { - "epoch": 1.0048717581315374, - "grad_norm": 0.1587928831577301, - "learning_rate": 6.638518518518519e-05, - "loss": 0.2008, - "step": 10520 - }, - { - "epoch": 1.005827004823996, - "grad_norm": 0.11515659838914871, - "learning_rate": 6.623703703703705e-05, - "loss": 0.1876, - "step": 10530 - }, - { - "epoch": 1.0067822515164542, - "grad_norm": 0.07409398257732391, - "learning_rate": 6.60888888888889e-05, - "loss": 0.1928, - "step": 10540 - }, - { - "epoch": 1.0077374982089125, - "grad_norm": 0.09634041041135788, - "learning_rate": 6.594074074074074e-05, - "loss": 0.1968, - "step": 10550 - }, - { - "epoch": 1.0086927449013707, - "grad_norm": 0.10461993515491486, - "learning_rate": 6.57925925925926e-05, - "loss": 0.2012, - "step": 10560 - }, - { - "epoch": 1.009647991593829, - "grad_norm": 0.188216432929039, - "learning_rate": 6.564444444444445e-05, - "loss": 0.2006, - "step": 10570 - }, - { - "epoch": 1.0106032382862875, - "grad_norm": 0.10396794974803925, - "learning_rate": 6.54962962962963e-05, - "loss": 0.188, - "step": 10580 - }, - { - "epoch": 1.0115584849787458, - "grad_norm": 0.08603309839963913, - "learning_rate": 6.534814814814815e-05, - "loss": 0.1926, - "step": 10590 - }, - { - "epoch": 1.012513731671204, - "grad_norm": 0.09479288011789322, - "learning_rate": 6.52e-05, - "loss": 0.1961, - "step": 10600 - }, - { - "epoch": 1.0134689783636623, - "grad_norm": 0.11584699153900146, - "learning_rate": 6.505185185185186e-05, - "loss": 0.1998, - "step": 10610 - }, - { - "epoch": 1.0144242250561208, - "grad_norm": 0.21939048171043396, - "learning_rate": 6.49037037037037e-05, - "loss": 0.201, - "step": 10620 - }, - { - "epoch": 1.0153794717485791, - "grad_norm": 0.11948587000370026, - "learning_rate": 6.475555555555555e-05, - "loss": 0.1877, - "step": 10630 - }, - { - "epoch": 1.0163347184410374, - "grad_norm": 0.087392158806324, - "learning_rate": 6.460740740740741e-05, - "loss": 0.1937, - "step": 10640 - }, - { - "epoch": 1.0172899651334957, - "grad_norm": 0.0761866346001625, - "learning_rate": 6.445925925925926e-05, - "loss": 0.1974, - "step": 10650 - }, - { - "epoch": 1.018245211825954, - "grad_norm": 0.1001187115907669, - "learning_rate": 6.431111111111112e-05, - "loss": 0.1999, - "step": 10660 - }, - { - "epoch": 1.0192004585184125, - "grad_norm": 0.20643304288387299, - "learning_rate": 6.416296296296296e-05, - "loss": 0.2001, - "step": 10670 - }, - { - "epoch": 1.0201557052108707, - "grad_norm": 0.12136881798505783, - "learning_rate": 6.401481481481481e-05, - "loss": 0.1889, - "step": 10680 - }, - { - "epoch": 1.021110951903329, - "grad_norm": 0.14471648633480072, - "learning_rate": 6.386666666666667e-05, - "loss": 0.1934, - "step": 10690 - }, - { - "epoch": 1.0220661985957873, - "grad_norm": 0.08957241475582123, - "learning_rate": 6.371851851851852e-05, - "loss": 0.1969, - "step": 10700 - }, - { - "epoch": 1.0230214452882458, - "grad_norm": 0.08908242732286453, - "learning_rate": 6.357037037037038e-05, - "loss": 0.2006, - "step": 10710 - }, - { - "epoch": 1.023976691980704, - "grad_norm": 0.17719383537769318, - "learning_rate": 6.342222222222222e-05, - "loss": 0.2005, - "step": 10720 - }, - { - "epoch": 1.0249319386731623, - "grad_norm": 0.13578200340270996, - "learning_rate": 6.327407407407407e-05, - "loss": 0.1896, - "step": 10730 - }, - { - "epoch": 1.0258871853656206, - "grad_norm": 0.09592164307832718, - "learning_rate": 6.312592592592593e-05, - "loss": 0.1925, - "step": 10740 - }, - { - "epoch": 1.026842432058079, - "grad_norm": 0.08957613259553909, - "learning_rate": 6.297777777777777e-05, - "loss": 0.1975, - "step": 10750 - }, - { - "epoch": 1.0277976787505374, - "grad_norm": 0.11708281934261322, - "learning_rate": 6.282962962962963e-05, - "loss": 0.2009, - "step": 10760 - }, - { - "epoch": 1.0287529254429957, - "grad_norm": 0.17692160606384277, - "learning_rate": 6.268148148148148e-05, - "loss": 0.2005, - "step": 10770 - }, - { - "epoch": 1.029708172135454, - "grad_norm": 0.1275843232870102, - "learning_rate": 6.253333333333333e-05, - "loss": 0.1888, - "step": 10780 - }, - { - "epoch": 1.0306634188279122, - "grad_norm": 0.07367505878210068, - "learning_rate": 6.238518518518519e-05, - "loss": 0.1918, - "step": 10790 - }, - { - "epoch": 1.0316186655203707, - "grad_norm": 0.09375837445259094, - "learning_rate": 6.223703703703703e-05, - "loss": 0.1969, - "step": 10800 - }, - { - "epoch": 1.032573912212829, - "grad_norm": 0.11127982288599014, - "learning_rate": 6.208888888888889e-05, - "loss": 0.1995, - "step": 10810 - }, - { - "epoch": 1.0335291589052873, - "grad_norm": 0.20982736349105835, - "learning_rate": 6.194074074074074e-05, - "loss": 0.2004, - "step": 10820 - }, - { - "epoch": 1.0344844055977456, - "grad_norm": 0.11941300332546234, - "learning_rate": 6.179259259259258e-05, - "loss": 0.1896, - "step": 10830 - }, - { - "epoch": 1.0354396522902038, - "grad_norm": 0.09594809263944626, - "learning_rate": 6.164444444444444e-05, - "loss": 0.1924, - "step": 10840 - }, - { - "epoch": 1.0363948989826623, - "grad_norm": 0.092034250497818, - "learning_rate": 6.149629629629629e-05, - "loss": 0.1956, - "step": 10850 - }, - { - "epoch": 1.0373501456751206, - "grad_norm": 0.10250823199748993, - "learning_rate": 6.134814814814815e-05, - "loss": 0.1999, - "step": 10860 - }, - { - "epoch": 1.038305392367579, - "grad_norm": 0.20155517756938934, - "learning_rate": 6.12e-05, - "loss": 0.2001, - "step": 10870 - }, - { - "epoch": 1.0392606390600372, - "grad_norm": 0.09367980062961578, - "learning_rate": 6.105185185185186e-05, - "loss": 0.1883, - "step": 10880 - }, - { - "epoch": 1.0402158857524957, - "grad_norm": 0.1017754077911377, - "learning_rate": 6.090370370370371e-05, - "loss": 0.1933, - "step": 10890 - }, - { - "epoch": 1.041171132444954, - "grad_norm": 0.07937656342983246, - "learning_rate": 6.0755555555555556e-05, - "loss": 0.1978, - "step": 10900 - }, - { - "epoch": 1.0421263791374122, - "grad_norm": 0.09843187034130096, - "learning_rate": 6.0607407407407416e-05, - "loss": 0.2007, - "step": 10910 - }, - { - "epoch": 1.0430816258298705, - "grad_norm": 0.17273889482021332, - "learning_rate": 6.045925925925926e-05, - "loss": 0.1988, - "step": 10920 - }, - { - "epoch": 1.0440368725223288, - "grad_norm": 0.10299399495124817, - "learning_rate": 6.0311111111111115e-05, - "loss": 0.1866, - "step": 10930 - }, - { - "epoch": 1.0449921192147873, - "grad_norm": 0.1454765498638153, - "learning_rate": 6.016296296296297e-05, - "loss": 0.1934, - "step": 10940 - }, - { - "epoch": 1.0459473659072456, - "grad_norm": 0.08828850090503693, - "learning_rate": 6.0014814814814814e-05, - "loss": 0.1962, - "step": 10950 - }, - { - "epoch": 1.0469026125997039, - "grad_norm": 0.08813127875328064, - "learning_rate": 5.9866666666666674e-05, - "loss": 0.199, - "step": 10960 - }, - { - "epoch": 1.0478578592921621, - "grad_norm": 0.17540910840034485, - "learning_rate": 5.971851851851852e-05, - "loss": 0.1988, - "step": 10970 - }, - { - "epoch": 1.0488131059846206, - "grad_norm": 0.11944608390331268, - "learning_rate": 5.957037037037037e-05, - "loss": 0.189, - "step": 10980 - }, - { - "epoch": 1.049768352677079, - "grad_norm": 0.07055499404668808, - "learning_rate": 5.9422222222222226e-05, - "loss": 0.1932, - "step": 10990 - }, - { - "epoch": 1.0507235993695372, - "grad_norm": 0.08947975933551788, - "learning_rate": 5.927407407407408e-05, - "loss": 0.1951, - "step": 11000 - }, - { - "epoch": 1.0516788460619955, - "grad_norm": 0.12585119903087616, - "learning_rate": 5.912592592592593e-05, - "loss": 0.201, - "step": 11010 - }, - { - "epoch": 1.0526340927544537, - "grad_norm": 0.18942105770111084, - "learning_rate": 5.897777777777778e-05, - "loss": 0.2003, - "step": 11020 - }, - { - "epoch": 1.0535893394469122, - "grad_norm": 0.11553331464529037, - "learning_rate": 5.882962962962963e-05, - "loss": 0.1869, - "step": 11030 - }, - { - "epoch": 1.0545445861393705, - "grad_norm": 0.06898508220911026, - "learning_rate": 5.8681481481481485e-05, - "loss": 0.1922, - "step": 11040 - }, - { - "epoch": 1.0554998328318288, - "grad_norm": 0.07421170175075531, - "learning_rate": 5.853333333333334e-05, - "loss": 0.1962, - "step": 11050 - }, - { - "epoch": 1.056455079524287, - "grad_norm": 0.1096966564655304, - "learning_rate": 5.8385185185185184e-05, - "loss": 0.2009, - "step": 11060 - }, - { - "epoch": 1.0574103262167456, - "grad_norm": 0.16953304409980774, - "learning_rate": 5.823703703703704e-05, - "loss": 0.2008, - "step": 11070 - }, - { - "epoch": 1.0583655729092039, - "grad_norm": 0.1059802696108818, - "learning_rate": 5.808888888888889e-05, - "loss": 0.186, - "step": 11080 - }, - { - "epoch": 1.0593208196016621, - "grad_norm": 0.0721934363245964, - "learning_rate": 5.794074074074074e-05, - "loss": 0.1926, - "step": 11090 - }, - { - "epoch": 1.0602760662941204, - "grad_norm": 0.08843927830457687, - "learning_rate": 5.7792592592592596e-05, - "loss": 0.1961, - "step": 11100 - }, - { - "epoch": 1.0612313129865787, - "grad_norm": 0.12775014340877533, - "learning_rate": 5.764444444444444e-05, - "loss": 0.2004, - "step": 11110 - }, - { - "epoch": 1.0621865596790372, - "grad_norm": 0.17205151915550232, - "learning_rate": 5.74962962962963e-05, - "loss": 0.2002, - "step": 11120 - }, - { - "epoch": 1.0631418063714955, - "grad_norm": 0.12345268577337265, - "learning_rate": 5.734814814814815e-05, - "loss": 0.1868, - "step": 11130 - }, - { - "epoch": 1.0640970530639537, - "grad_norm": 0.06966684758663177, - "learning_rate": 5.72e-05, - "loss": 0.1927, - "step": 11140 - }, - { - "epoch": 1.065052299756412, - "grad_norm": 0.10183680057525635, - "learning_rate": 5.7051851851851854e-05, - "loss": 0.1964, - "step": 11150 - }, - { - "epoch": 1.0660075464488705, - "grad_norm": 0.09469080716371536, - "learning_rate": 5.69037037037037e-05, - "loss": 0.2004, - "step": 11160 - }, - { - "epoch": 1.0669627931413288, - "grad_norm": 0.1744278520345688, - "learning_rate": 5.675555555555556e-05, - "loss": 0.1999, - "step": 11170 - }, - { - "epoch": 1.067918039833787, - "grad_norm": 0.12014975398778915, - "learning_rate": 5.6607407407407407e-05, - "loss": 0.186, - "step": 11180 - }, - { - "epoch": 1.0688732865262454, - "grad_norm": 0.06704577803611755, - "learning_rate": 5.6459259259259266e-05, - "loss": 0.1916, - "step": 11190 - }, - { - "epoch": 1.0698285332187036, - "grad_norm": 0.09180966764688492, - "learning_rate": 5.631111111111111e-05, - "loss": 0.1962, - "step": 11200 - }, - { - "epoch": 1.0707837799111621, - "grad_norm": 0.09561672806739807, - "learning_rate": 5.616296296296296e-05, - "loss": 0.2004, - "step": 11210 - }, - { - "epoch": 1.0717390266036204, - "grad_norm": 0.17557843029499054, - "learning_rate": 5.601481481481482e-05, - "loss": 0.1993, - "step": 11220 - }, - { - "epoch": 1.0726942732960787, - "grad_norm": 0.10663829743862152, - "learning_rate": 5.5866666666666665e-05, - "loss": 0.1863, - "step": 11230 - }, - { - "epoch": 1.073649519988537, - "grad_norm": 0.07170140743255615, - "learning_rate": 5.5718518518518525e-05, - "loss": 0.1926, - "step": 11240 - }, - { - "epoch": 1.0746047666809955, - "grad_norm": 0.07816746085882187, - "learning_rate": 5.557037037037037e-05, - "loss": 0.1952, - "step": 11250 - }, - { - "epoch": 1.0755600133734537, - "grad_norm": 0.10512028634548187, - "learning_rate": 5.542222222222222e-05, - "loss": 0.2002, - "step": 11260 - }, - { - "epoch": 1.076515260065912, - "grad_norm": 0.17822621762752533, - "learning_rate": 5.527407407407408e-05, - "loss": 0.1996, - "step": 11270 - }, - { - "epoch": 1.0774705067583703, - "grad_norm": 0.1166963279247284, - "learning_rate": 5.512592592592592e-05, - "loss": 0.1866, - "step": 11280 - }, - { - "epoch": 1.0784257534508286, - "grad_norm": 0.06219042092561722, - "learning_rate": 5.497777777777778e-05, - "loss": 0.1913, - "step": 11290 - }, - { - "epoch": 1.079381000143287, - "grad_norm": 0.07941684871912003, - "learning_rate": 5.482962962962963e-05, - "loss": 0.1951, - "step": 11300 - }, - { - "epoch": 1.0803362468357454, - "grad_norm": 0.07750697433948517, - "learning_rate": 5.468148148148149e-05, - "loss": 0.2003, - "step": 11310 - }, - { - "epoch": 1.0812914935282036, - "grad_norm": 0.15450026094913483, - "learning_rate": 5.4533333333333335e-05, - "loss": 0.1998, - "step": 11320 - }, - { - "epoch": 1.082246740220662, - "grad_norm": 0.12201431393623352, - "learning_rate": 5.438518518518518e-05, - "loss": 0.1865, - "step": 11330 - }, - { - "epoch": 1.0832019869131204, - "grad_norm": 0.06394163519144058, - "learning_rate": 5.423703703703704e-05, - "loss": 0.1908, - "step": 11340 - }, - { - "epoch": 1.0841572336055787, - "grad_norm": 0.08452702313661575, - "learning_rate": 5.408888888888889e-05, - "loss": 0.1959, - "step": 11350 - }, - { - "epoch": 1.085112480298037, - "grad_norm": 0.12962254881858826, - "learning_rate": 5.394074074074075e-05, - "loss": 0.2007, - "step": 11360 - }, - { - "epoch": 1.0860677269904953, - "grad_norm": 0.17652928829193115, - "learning_rate": 5.3792592592592594e-05, - "loss": 0.1986, - "step": 11370 - }, - { - "epoch": 1.0870229736829535, - "grad_norm": 0.11936990916728973, - "learning_rate": 5.364444444444444e-05, - "loss": 0.1855, - "step": 11380 - }, - { - "epoch": 1.087978220375412, - "grad_norm": 0.08107299357652664, - "learning_rate": 5.34962962962963e-05, - "loss": 0.1912, - "step": 11390 - }, - { - "epoch": 1.0889334670678703, - "grad_norm": 0.07732398808002472, - "learning_rate": 5.3348148148148146e-05, - "loss": 0.1965, - "step": 11400 - }, - { - "epoch": 1.0898887137603286, - "grad_norm": 0.10233469307422638, - "learning_rate": 5.3200000000000006e-05, - "loss": 0.2012, - "step": 11410 - }, - { - "epoch": 1.0908439604527869, - "grad_norm": 0.16339091956615448, - "learning_rate": 5.305185185185185e-05, - "loss": 0.2001, - "step": 11420 - }, - { - "epoch": 1.0917992071452454, - "grad_norm": 0.10522663593292236, - "learning_rate": 5.290370370370371e-05, - "loss": 0.1862, - "step": 11430 - }, - { - "epoch": 1.0927544538377036, - "grad_norm": 0.07971503585577011, - "learning_rate": 5.275555555555556e-05, - "loss": 0.1918, - "step": 11440 - }, - { - "epoch": 1.093709700530162, - "grad_norm": 0.08777160942554474, - "learning_rate": 5.2607407407407404e-05, - "loss": 0.1955, - "step": 11450 - }, - { - "epoch": 1.0946649472226202, - "grad_norm": 0.104684017598629, - "learning_rate": 5.2459259259259264e-05, - "loss": 0.1999, - "step": 11460 - }, - { - "epoch": 1.0956201939150785, - "grad_norm": 0.15735992789268494, - "learning_rate": 5.231111111111111e-05, - "loss": 0.1989, - "step": 11470 - }, - { - "epoch": 1.096575440607537, - "grad_norm": 0.11038859188556671, - "learning_rate": 5.216296296296297e-05, - "loss": 0.1866, - "step": 11480 - }, - { - "epoch": 1.0975306872999953, - "grad_norm": 0.06414885818958282, - "learning_rate": 5.2014814814814816e-05, - "loss": 0.1919, - "step": 11490 - }, - { - "epoch": 1.0984859339924535, - "grad_norm": 0.08361148089170456, - "learning_rate": 5.1866666666666676e-05, - "loss": 0.195, - "step": 11500 - }, - { - "epoch": 1.0994411806849118, - "grad_norm": 0.10123872011899948, - "learning_rate": 5.171851851851852e-05, - "loss": 0.1995, - "step": 11510 - }, - { - "epoch": 1.1003964273773703, - "grad_norm": 0.17141133546829224, - "learning_rate": 5.157037037037037e-05, - "loss": 0.2003, - "step": 11520 - }, - { - "epoch": 1.1013516740698286, - "grad_norm": 0.11368907988071442, - "learning_rate": 5.142222222222223e-05, - "loss": 0.1859, - "step": 11530 - }, - { - "epoch": 1.1023069207622869, - "grad_norm": 0.07556621730327606, - "learning_rate": 5.1274074074074075e-05, - "loss": 0.1908, - "step": 11540 - }, - { - "epoch": 1.1032621674547451, - "grad_norm": 0.06946977972984314, - "learning_rate": 5.1125925925925934e-05, - "loss": 0.195, - "step": 11550 - }, - { - "epoch": 1.1042174141472034, - "grad_norm": 0.09921626001596451, - "learning_rate": 5.097777777777778e-05, - "loss": 0.2001, - "step": 11560 - }, - { - "epoch": 1.105172660839662, - "grad_norm": 0.16775824129581451, - "learning_rate": 5.082962962962963e-05, - "loss": 0.201, - "step": 11570 - }, - { - "epoch": 1.1061279075321202, - "grad_norm": 0.10106077045202255, - "learning_rate": 5.068148148148149e-05, - "loss": 0.1848, - "step": 11580 - }, - { - "epoch": 1.1070831542245785, - "grad_norm": 0.06299445778131485, - "learning_rate": 5.053333333333333e-05, - "loss": 0.1904, - "step": 11590 - }, - { - "epoch": 1.1080384009170368, - "grad_norm": 0.08746166527271271, - "learning_rate": 5.038518518518519e-05, - "loss": 0.1949, - "step": 11600 - }, - { - "epoch": 1.1089936476094953, - "grad_norm": 0.0920008197426796, - "learning_rate": 5.023703703703704e-05, - "loss": 0.2014, - "step": 11610 - }, - { - "epoch": 1.1099488943019535, - "grad_norm": 0.1872783750295639, - "learning_rate": 5.00888888888889e-05, - "loss": 0.201, - "step": 11620 - }, - { - "epoch": 1.1109041409944118, - "grad_norm": 0.11330235749483109, - "learning_rate": 4.9940740740740745e-05, - "loss": 0.1851, - "step": 11630 - }, - { - "epoch": 1.11185938768687, - "grad_norm": 0.07352713495492935, - "learning_rate": 4.97925925925926e-05, - "loss": 0.1906, - "step": 11640 - }, - { - "epoch": 1.1128146343793284, - "grad_norm": 0.07548803091049194, - "learning_rate": 4.964444444444445e-05, - "loss": 0.1954, - "step": 11650 - }, - { - "epoch": 1.1137698810717869, - "grad_norm": 0.15241172909736633, - "learning_rate": 4.94962962962963e-05, - "loss": 0.2001, - "step": 11660 - }, - { - "epoch": 1.1147251277642451, - "grad_norm": 0.158297598361969, - "learning_rate": 4.934814814814815e-05, - "loss": 0.2002, - "step": 11670 - }, - { - "epoch": 1.1156803744567034, - "grad_norm": 0.1126081794500351, - "learning_rate": 4.92e-05, - "loss": 0.1847, - "step": 11680 - }, - { - "epoch": 1.1166356211491617, - "grad_norm": 0.059119775891304016, - "learning_rate": 4.9051851851851856e-05, - "loss": 0.1914, - "step": 11690 - }, - { - "epoch": 1.1175908678416202, - "grad_norm": 0.11391792446374893, - "learning_rate": 4.890370370370371e-05, - "loss": 0.1954, - "step": 11700 - }, - { - "epoch": 1.1185461145340785, - "grad_norm": 0.11543019860982895, - "learning_rate": 4.875555555555556e-05, - "loss": 0.1999, - "step": 11710 - }, - { - "epoch": 1.1195013612265368, - "grad_norm": 0.15582410991191864, - "learning_rate": 4.860740740740741e-05, - "loss": 0.1989, - "step": 11720 - }, - { - "epoch": 1.120456607918995, - "grad_norm": 0.1310344636440277, - "learning_rate": 4.845925925925926e-05, - "loss": 0.1852, - "step": 11730 - }, - { - "epoch": 1.1214118546114533, - "grad_norm": 0.06701342761516571, - "learning_rate": 4.8311111111111115e-05, - "loss": 0.1909, - "step": 11740 - }, - { - "epoch": 1.1223671013039118, - "grad_norm": 0.11257292330265045, - "learning_rate": 4.816296296296297e-05, - "loss": 0.194, - "step": 11750 - }, - { - "epoch": 1.12332234799637, - "grad_norm": 0.09040035307407379, - "learning_rate": 4.801481481481482e-05, - "loss": 0.2001, - "step": 11760 - }, - { - "epoch": 1.1242775946888284, - "grad_norm": 0.19392362236976624, - "learning_rate": 4.7866666666666674e-05, - "loss": 0.1997, - "step": 11770 - }, - { - "epoch": 1.1252328413812867, - "grad_norm": 0.0966700091958046, - "learning_rate": 4.771851851851853e-05, - "loss": 0.1868, - "step": 11780 - }, - { - "epoch": 1.1261880880737452, - "grad_norm": 0.06864186376333237, - "learning_rate": 4.757037037037037e-05, - "loss": 0.1909, - "step": 11790 - }, - { - "epoch": 1.1271433347662034, - "grad_norm": 0.0784897580742836, - "learning_rate": 4.7422222222222226e-05, - "loss": 0.1948, - "step": 11800 - }, - { - "epoch": 1.1280985814586617, - "grad_norm": 0.08286922425031662, - "learning_rate": 4.727407407407408e-05, - "loss": 0.1996, - "step": 11810 - }, - { - "epoch": 1.12905382815112, - "grad_norm": 0.22640322148799896, - "learning_rate": 4.712592592592593e-05, - "loss": 0.201, - "step": 11820 - }, - { - "epoch": 1.1300090748435783, - "grad_norm": 0.1183919832110405, - "learning_rate": 4.6977777777777785e-05, - "loss": 0.1846, - "step": 11830 - }, - { - "epoch": 1.1309643215360368, - "grad_norm": 0.08607591688632965, - "learning_rate": 4.682962962962963e-05, - "loss": 0.1906, - "step": 11840 - }, - { - "epoch": 1.131919568228495, - "grad_norm": 0.09465157240629196, - "learning_rate": 4.6681481481481484e-05, - "loss": 0.196, - "step": 11850 - }, - { - "epoch": 1.1328748149209533, - "grad_norm": 0.08989481627941132, - "learning_rate": 4.653333333333334e-05, - "loss": 0.1992, - "step": 11860 - }, - { - "epoch": 1.1338300616134116, - "grad_norm": 0.2044905126094818, - "learning_rate": 4.638518518518519e-05, - "loss": 0.199, - "step": 11870 - }, - { - "epoch": 1.13478530830587, - "grad_norm": 0.09982435405254364, - "learning_rate": 4.6237037037037037e-05, - "loss": 0.1859, - "step": 11880 - }, - { - "epoch": 1.1357405549983284, - "grad_norm": 0.08768076449632645, - "learning_rate": 4.608888888888889e-05, - "loss": 0.1911, - "step": 11890 - }, - { - "epoch": 1.1366958016907867, - "grad_norm": 0.08545703440904617, - "learning_rate": 4.594074074074074e-05, - "loss": 0.1962, - "step": 11900 - }, - { - "epoch": 1.137651048383245, - "grad_norm": 0.08855101466178894, - "learning_rate": 4.5792592592592596e-05, - "loss": 0.2002, - "step": 11910 - }, - { - "epoch": 1.1386062950757032, - "grad_norm": 0.165024995803833, - "learning_rate": 4.564444444444444e-05, - "loss": 0.199, - "step": 11920 - }, - { - "epoch": 1.1395615417681617, - "grad_norm": 0.10052850842475891, - "learning_rate": 4.5496296296296295e-05, - "loss": 0.1855, - "step": 11930 - }, - { - "epoch": 1.14051678846062, - "grad_norm": 0.06890951097011566, - "learning_rate": 4.534814814814815e-05, - "loss": 0.192, - "step": 11940 - }, - { - "epoch": 1.1414720351530783, - "grad_norm": 0.09082265198230743, - "learning_rate": 4.52e-05, - "loss": 0.1955, - "step": 11950 - }, - { - "epoch": 1.1424272818455365, - "grad_norm": 0.12514296174049377, - "learning_rate": 4.5051851851851854e-05, - "loss": 0.2, - "step": 11960 - }, - { - "epoch": 1.143382528537995, - "grad_norm": 0.15191242098808289, - "learning_rate": 4.49037037037037e-05, - "loss": 0.1994, - "step": 11970 - }, - { - "epoch": 1.1443377752304533, - "grad_norm": 0.1041502133011818, - "learning_rate": 4.475555555555555e-05, - "loss": 0.1855, - "step": 11980 - }, - { - "epoch": 1.1452930219229116, - "grad_norm": 0.07793102413415909, - "learning_rate": 4.4607407407407406e-05, - "loss": 0.1913, - "step": 11990 - }, - { - "epoch": 1.1462482686153699, - "grad_norm": 0.07437871396541595, - "learning_rate": 4.445925925925926e-05, - "loss": 0.1959, - "step": 12000 - }, - { - "epoch": 1.1472035153078282, - "grad_norm": 0.12875069677829742, - "learning_rate": 4.431111111111111e-05, - "loss": 0.1988, - "step": 12010 - }, - { - "epoch": 1.1481587620002867, - "grad_norm": 0.15850570797920227, - "learning_rate": 4.4162962962962965e-05, - "loss": 0.1995, - "step": 12020 - }, - { - "epoch": 1.149114008692745, - "grad_norm": 0.10010084509849548, - "learning_rate": 4.401481481481481e-05, - "loss": 0.1858, - "step": 12030 - }, - { - "epoch": 1.1500692553852032, - "grad_norm": 0.07205148786306381, - "learning_rate": 4.3866666666666665e-05, - "loss": 0.1919, - "step": 12040 - }, - { - "epoch": 1.1510245020776615, - "grad_norm": 0.09679614752531052, - "learning_rate": 4.371851851851852e-05, - "loss": 0.1963, - "step": 12050 - }, - { - "epoch": 1.15197974877012, - "grad_norm": 0.10506289452314377, - "learning_rate": 4.357037037037037e-05, - "loss": 0.2001, - "step": 12060 - }, - { - "epoch": 1.1529349954625783, - "grad_norm": 0.16931326687335968, - "learning_rate": 4.3422222222222224e-05, - "loss": 0.2002, - "step": 12070 - }, - { - "epoch": 1.1538902421550365, - "grad_norm": 0.09440700709819794, - "learning_rate": 4.327407407407408e-05, - "loss": 0.1848, - "step": 12080 - }, - { - "epoch": 1.1548454888474948, - "grad_norm": 0.054509177803993225, - "learning_rate": 4.312592592592593e-05, - "loss": 0.1915, - "step": 12090 - }, - { - "epoch": 1.155800735539953, - "grad_norm": 0.07895702868700027, - "learning_rate": 4.2977777777777776e-05, - "loss": 0.1951, - "step": 12100 - }, - { - "epoch": 1.1567559822324116, - "grad_norm": 0.12499396502971649, - "learning_rate": 4.282962962962963e-05, - "loss": 0.1999, - "step": 12110 - }, - { - "epoch": 1.1577112289248699, - "grad_norm": 0.14979098737239838, - "learning_rate": 4.268148148148148e-05, - "loss": 0.2, - "step": 12120 - }, - { - "epoch": 1.1586664756173282, - "grad_norm": 0.10792044550180435, - "learning_rate": 4.2533333333333335e-05, - "loss": 0.1844, - "step": 12130 - }, - { - "epoch": 1.1596217223097864, - "grad_norm": 0.0688839927315712, - "learning_rate": 4.238518518518519e-05, - "loss": 0.1914, - "step": 12140 - }, - { - "epoch": 1.160576969002245, - "grad_norm": 0.06082676351070404, - "learning_rate": 4.223703703703704e-05, - "loss": 0.1945, - "step": 12150 - }, - { - "epoch": 1.1615322156947032, - "grad_norm": 0.1477699875831604, - "learning_rate": 4.208888888888889e-05, - "loss": 0.1997, - "step": 12160 - }, - { - "epoch": 1.1624874623871615, - "grad_norm": 0.1498918980360031, - "learning_rate": 4.194074074074074e-05, - "loss": 0.2003, - "step": 12170 - }, - { - "epoch": 1.1634427090796198, - "grad_norm": 0.1018199622631073, - "learning_rate": 4.179259259259259e-05, - "loss": 0.184, - "step": 12180 - }, - { - "epoch": 1.164397955772078, - "grad_norm": 0.0773182362318039, - "learning_rate": 4.1644444444444446e-05, - "loss": 0.1912, - "step": 12190 - }, - { - "epoch": 1.1653532024645366, - "grad_norm": 0.06646312028169632, - "learning_rate": 4.14962962962963e-05, - "loss": 0.1945, - "step": 12200 - }, - { - "epoch": 1.1663084491569948, - "grad_norm": 0.09444624185562134, - "learning_rate": 4.134814814814815e-05, - "loss": 0.1996, - "step": 12210 - }, - { - "epoch": 1.167263695849453, - "grad_norm": 0.15636619925498962, - "learning_rate": 4.12e-05, - "loss": 0.199, - "step": 12220 - }, - { - "epoch": 1.1682189425419114, - "grad_norm": 0.09470692276954651, - "learning_rate": 4.105185185185185e-05, - "loss": 0.1845, - "step": 12230 - }, - { - "epoch": 1.1691741892343699, - "grad_norm": 0.06418988108634949, - "learning_rate": 4.0903703703703705e-05, - "loss": 0.1904, - "step": 12240 - }, - { - "epoch": 1.1701294359268282, - "grad_norm": 0.07210509479045868, - "learning_rate": 4.075555555555556e-05, - "loss": 0.1948, - "step": 12250 - }, - { - "epoch": 1.1710846826192864, - "grad_norm": 0.11306772381067276, - "learning_rate": 4.060740740740741e-05, - "loss": 0.2002, - "step": 12260 - }, - { - "epoch": 1.1720399293117447, - "grad_norm": 0.14943212270736694, - "learning_rate": 4.0459259259259264e-05, - "loss": 0.1997, - "step": 12270 - }, - { - "epoch": 1.172995176004203, - "grad_norm": 0.09277426451444626, - "learning_rate": 4.031111111111111e-05, - "loss": 0.1831, - "step": 12280 - }, - { - "epoch": 1.1739504226966615, - "grad_norm": 0.06567569822072983, - "learning_rate": 4.016296296296296e-05, - "loss": 0.1907, - "step": 12290 - }, - { - "epoch": 1.1749056693891198, - "grad_norm": 0.06958837062120438, - "learning_rate": 4.0014814814814816e-05, - "loss": 0.1943, - "step": 12300 - }, - { - "epoch": 1.175860916081578, - "grad_norm": 0.11151952296495438, - "learning_rate": 3.986666666666667e-05, - "loss": 0.1992, - "step": 12310 - }, - { - "epoch": 1.1768161627740363, - "grad_norm": 0.16728582978248596, - "learning_rate": 3.971851851851852e-05, - "loss": 0.1994, - "step": 12320 - }, - { - "epoch": 1.1777714094664948, - "grad_norm": 0.08983734250068665, - "learning_rate": 3.9570370370370375e-05, - "loss": 0.1835, - "step": 12330 - }, - { - "epoch": 1.1787266561589531, - "grad_norm": 0.060879047960042953, - "learning_rate": 3.942222222222222e-05, - "loss": 0.1905, - "step": 12340 - }, - { - "epoch": 1.1796819028514114, - "grad_norm": 0.07289200276136398, - "learning_rate": 3.9274074074074074e-05, - "loss": 0.1941, - "step": 12350 - }, - { - "epoch": 1.1806371495438697, - "grad_norm": 0.10434010624885559, - "learning_rate": 3.912592592592593e-05, - "loss": 0.1986, - "step": 12360 - }, - { - "epoch": 1.181592396236328, - "grad_norm": 0.1622079759836197, - "learning_rate": 3.897777777777778e-05, - "loss": 0.1997, - "step": 12370 - }, - { - "epoch": 1.1825476429287864, - "grad_norm": 0.08091533184051514, - "learning_rate": 3.882962962962963e-05, - "loss": 0.183, - "step": 12380 - }, - { - "epoch": 1.1835028896212447, - "grad_norm": 0.07402651757001877, - "learning_rate": 3.8681481481481486e-05, - "loss": 0.1899, - "step": 12390 - }, - { - "epoch": 1.184458136313703, - "grad_norm": 0.07148318737745285, - "learning_rate": 3.853333333333334e-05, - "loss": 0.1946, - "step": 12400 - }, - { - "epoch": 1.1854133830061613, - "grad_norm": 0.11965110898017883, - "learning_rate": 3.8385185185185186e-05, - "loss": 0.1987, - "step": 12410 - }, - { - "epoch": 1.1863686296986198, - "grad_norm": 0.1528114378452301, - "learning_rate": 3.823703703703704e-05, - "loss": 0.2001, - "step": 12420 - }, - { - "epoch": 1.187323876391078, - "grad_norm": 0.09836557507514954, - "learning_rate": 3.808888888888889e-05, - "loss": 0.1829, - "step": 12430 - }, - { - "epoch": 1.1882791230835363, - "grad_norm": 0.060973864048719406, - "learning_rate": 3.7940740740740745e-05, - "loss": 0.1897, - "step": 12440 - }, - { - "epoch": 1.1892343697759946, - "grad_norm": 0.08469880372285843, - "learning_rate": 3.77925925925926e-05, - "loss": 0.194, - "step": 12450 - }, - { - "epoch": 1.190189616468453, - "grad_norm": 0.10124852508306503, - "learning_rate": 3.764444444444445e-05, - "loss": 0.1988, - "step": 12460 - }, - { - "epoch": 1.1911448631609114, - "grad_norm": 0.15043075382709503, - "learning_rate": 3.74962962962963e-05, - "loss": 0.1997, - "step": 12470 - }, - { - "epoch": 1.1921001098533697, - "grad_norm": 0.09942149370908737, - "learning_rate": 3.734814814814815e-05, - "loss": 0.1831, - "step": 12480 - }, - { - "epoch": 1.193055356545828, - "grad_norm": 0.06965496391057968, - "learning_rate": 3.72e-05, - "loss": 0.19, - "step": 12490 - }, - { - "epoch": 1.1940106032382862, - "grad_norm": 0.06517937779426575, - "learning_rate": 3.7051851851851856e-05, - "loss": 0.1942, - "step": 12500 - }, - { - "epoch": 1.1949658499307447, - "grad_norm": 0.1225384771823883, - "learning_rate": 3.690370370370371e-05, - "loss": 0.1992, - "step": 12510 - }, - { - "epoch": 1.195921096623203, - "grad_norm": 0.15679340064525604, - "learning_rate": 3.675555555555556e-05, - "loss": 0.1994, - "step": 12520 - }, - { - "epoch": 1.1968763433156613, - "grad_norm": 0.09618114680051804, - "learning_rate": 3.660740740740741e-05, - "loss": 0.1825, - "step": 12530 - }, - { - "epoch": 1.1978315900081196, - "grad_norm": 0.05807056650519371, - "learning_rate": 3.645925925925926e-05, - "loss": 0.1901, - "step": 12540 - }, - { - "epoch": 1.1987868367005778, - "grad_norm": 0.0703585296869278, - "learning_rate": 3.6311111111111114e-05, - "loss": 0.1947, - "step": 12550 - }, - { - "epoch": 1.1997420833930363, - "grad_norm": 0.13773010671138763, - "learning_rate": 3.616296296296297e-05, - "loss": 0.1991, - "step": 12560 - }, - { - "epoch": 1.2006973300854946, - "grad_norm": 0.15620911121368408, - "learning_rate": 3.601481481481482e-05, - "loss": 0.1989, - "step": 12570 - }, - { - "epoch": 1.201652576777953, - "grad_norm": 0.08928284794092178, - "learning_rate": 3.586666666666667e-05, - "loss": 0.1831, - "step": 12580 - }, - { - "epoch": 1.2026078234704112, - "grad_norm": 0.06567124277353287, - "learning_rate": 3.571851851851852e-05, - "loss": 0.1912, - "step": 12590 - }, - { - "epoch": 1.2035630701628697, - "grad_norm": 0.06833696365356445, - "learning_rate": 3.557037037037037e-05, - "loss": 0.1942, - "step": 12600 - }, - { - "epoch": 1.204518316855328, - "grad_norm": 0.12125487625598907, - "learning_rate": 3.5422222222222226e-05, - "loss": 0.1991, - "step": 12610 - }, - { - "epoch": 1.2054735635477862, - "grad_norm": 0.1656818985939026, - "learning_rate": 3.527407407407408e-05, - "loss": 0.1985, - "step": 12620 - }, - { - "epoch": 1.2064288102402445, - "grad_norm": 0.10647820681333542, - "learning_rate": 3.512592592592593e-05, - "loss": 0.1831, - "step": 12630 - }, - { - "epoch": 1.2073840569327028, - "grad_norm": 0.06365940719842911, - "learning_rate": 3.4977777777777785e-05, - "loss": 0.1909, - "step": 12640 - }, - { - "epoch": 1.2083393036251613, - "grad_norm": 0.06544508039951324, - "learning_rate": 3.482962962962963e-05, - "loss": 0.1946, - "step": 12650 - }, - { - "epoch": 1.2092945503176196, - "grad_norm": 0.08825177699327469, - "learning_rate": 3.4681481481481484e-05, - "loss": 0.1983, - "step": 12660 - }, - { - "epoch": 1.2102497970100778, - "grad_norm": 0.15609724819660187, - "learning_rate": 3.453333333333334e-05, - "loss": 0.1986, - "step": 12670 - }, - { - "epoch": 1.2112050437025361, - "grad_norm": 0.09403195232152939, - "learning_rate": 3.438518518518519e-05, - "loss": 0.1822, - "step": 12680 - }, - { - "epoch": 1.2121602903949946, - "grad_norm": 0.055547330528497696, - "learning_rate": 3.423703703703704e-05, - "loss": 0.19, - "step": 12690 - }, - { - "epoch": 1.213115537087453, - "grad_norm": 0.09445223212242126, - "learning_rate": 3.408888888888889e-05, - "loss": 0.1942, - "step": 12700 - }, - { - "epoch": 1.2140707837799112, - "grad_norm": 0.10912507027387619, - "learning_rate": 3.394074074074074e-05, - "loss": 0.1982, - "step": 12710 - }, - { - "epoch": 1.2150260304723695, - "grad_norm": 0.15998390316963196, - "learning_rate": 3.3792592592592595e-05, - "loss": 0.1989, - "step": 12720 - }, - { - "epoch": 1.2159812771648277, - "grad_norm": 0.10185787081718445, - "learning_rate": 3.364444444444445e-05, - "loss": 0.1814, - "step": 12730 - }, - { - "epoch": 1.2169365238572862, - "grad_norm": 0.07232099026441574, - "learning_rate": 3.3496296296296295e-05, - "loss": 0.1895, - "step": 12740 - }, - { - "epoch": 1.2178917705497445, - "grad_norm": 0.07367640733718872, - "learning_rate": 3.334814814814815e-05, - "loss": 0.1937, - "step": 12750 - }, - { - "epoch": 1.2188470172422028, - "grad_norm": 0.09808887541294098, - "learning_rate": 3.32e-05, - "loss": 0.1977, - "step": 12760 - }, - { - "epoch": 1.219802263934661, - "grad_norm": 0.1700250655412674, - "learning_rate": 3.3051851851851854e-05, - "loss": 0.199, - "step": 12770 - }, - { - "epoch": 1.2207575106271196, - "grad_norm": 0.0908031091094017, - "learning_rate": 3.29037037037037e-05, - "loss": 0.1835, - "step": 12780 - }, - { - "epoch": 1.2217127573195778, - "grad_norm": 0.06603245437145233, - "learning_rate": 3.275555555555555e-05, - "loss": 0.1895, - "step": 12790 - }, - { - "epoch": 1.2226680040120361, - "grad_norm": 0.05788644403219223, - "learning_rate": 3.2607407407407406e-05, - "loss": 0.1934, - "step": 12800 - }, - { - "epoch": 1.2236232507044944, - "grad_norm": 0.10153844207525253, - "learning_rate": 3.245925925925926e-05, - "loss": 0.1989, - "step": 12810 - }, - { - "epoch": 1.2245784973969527, - "grad_norm": 0.15934138000011444, - "learning_rate": 3.231111111111111e-05, - "loss": 0.1984, - "step": 12820 - }, - { - "epoch": 1.2255337440894112, - "grad_norm": 0.08226645737886429, - "learning_rate": 3.2162962962962965e-05, - "loss": 0.1824, - "step": 12830 - }, - { - "epoch": 1.2264889907818695, - "grad_norm": 0.06215713173151016, - "learning_rate": 3.201481481481481e-05, - "loss": 0.1892, - "step": 12840 - }, - { - "epoch": 1.2274442374743277, - "grad_norm": 0.08781886845827103, - "learning_rate": 3.1866666666666664e-05, - "loss": 0.1936, - "step": 12850 - }, - { - "epoch": 1.228399484166786, - "grad_norm": 0.09996737539768219, - "learning_rate": 3.171851851851852e-05, - "loss": 0.1986, - "step": 12860 - }, - { - "epoch": 1.2293547308592445, - "grad_norm": 0.15390396118164062, - "learning_rate": 3.157037037037037e-05, - "loss": 0.1984, - "step": 12870 - }, - { - "epoch": 1.2303099775517028, - "grad_norm": 0.09596038609743118, - "learning_rate": 3.142222222222222e-05, - "loss": 0.1825, - "step": 12880 - }, - { - "epoch": 1.231265224244161, - "grad_norm": 0.06414441019296646, - "learning_rate": 3.1274074074074076e-05, - "loss": 0.1898, - "step": 12890 - }, - { - "epoch": 1.2322204709366193, - "grad_norm": 0.0710066705942154, - "learning_rate": 3.112592592592592e-05, - "loss": 0.1934, - "step": 12900 - }, - { - "epoch": 1.2331757176290776, - "grad_norm": 0.1029692143201828, - "learning_rate": 3.0977777777777776e-05, - "loss": 0.1981, - "step": 12910 - }, - { - "epoch": 1.2341309643215361, - "grad_norm": 0.15667466819286346, - "learning_rate": 3.082962962962963e-05, - "loss": 0.1997, - "step": 12920 - }, - { - "epoch": 1.2350862110139944, - "grad_norm": 0.09223178774118423, - "learning_rate": 3.068148148148148e-05, - "loss": 0.1822, - "step": 12930 - }, - { - "epoch": 1.2360414577064527, - "grad_norm": 0.06726747006177902, - "learning_rate": 3.0533333333333335e-05, - "loss": 0.1898, - "step": 12940 - }, - { - "epoch": 1.236996704398911, - "grad_norm": 0.07996299117803574, - "learning_rate": 3.0385185185185188e-05, - "loss": 0.1936, - "step": 12950 - }, - { - "epoch": 1.2379519510913695, - "grad_norm": 0.10410148650407791, - "learning_rate": 3.0237037037037037e-05, - "loss": 0.199, - "step": 12960 - }, - { - "epoch": 1.2389071977838277, - "grad_norm": 0.15508505702018738, - "learning_rate": 3.008888888888889e-05, - "loss": 0.1992, - "step": 12970 - }, - { - "epoch": 1.239862444476286, - "grad_norm": 0.10372573882341385, - "learning_rate": 2.994074074074074e-05, - "loss": 0.1822, - "step": 12980 - }, - { - "epoch": 1.2408176911687443, - "grad_norm": 0.061683837324380875, - "learning_rate": 2.9792592592592593e-05, - "loss": 0.1893, - "step": 12990 - }, - { - "epoch": 1.2417729378612026, - "grad_norm": 0.05991368368268013, - "learning_rate": 2.9644444444444446e-05, - "loss": 0.1934, - "step": 13000 - }, - { - "epoch": 1.242728184553661, - "grad_norm": 0.11899848282337189, - "learning_rate": 2.94962962962963e-05, - "loss": 0.1986, - "step": 13010 - }, - { - "epoch": 1.2436834312461194, - "grad_norm": 0.15275335311889648, - "learning_rate": 2.9348148148148145e-05, - "loss": 0.2002, - "step": 13020 - }, - { - "epoch": 1.2446386779385776, - "grad_norm": 0.08578670769929886, - "learning_rate": 2.9199999999999998e-05, - "loss": 0.1822, - "step": 13030 - }, - { - "epoch": 1.245593924631036, - "grad_norm": 0.06400281190872192, - "learning_rate": 2.905185185185185e-05, - "loss": 0.1895, - "step": 13040 - }, - { - "epoch": 1.2465491713234944, - "grad_norm": 0.07122600823640823, - "learning_rate": 2.8903703703703704e-05, - "loss": 0.1935, - "step": 13050 - }, - { - "epoch": 1.2475044180159527, - "grad_norm": 0.11315450817346573, - "learning_rate": 2.8755555555555557e-05, - "loss": 0.1979, - "step": 13060 - }, - { - "epoch": 1.248459664708411, - "grad_norm": 0.1562516689300537, - "learning_rate": 2.860740740740741e-05, - "loss": 0.2003, - "step": 13070 - }, - { - "epoch": 1.2494149114008692, - "grad_norm": 0.0909615084528923, - "learning_rate": 2.8459259259259263e-05, - "loss": 0.1828, - "step": 13080 - }, - { - "epoch": 1.2503701580933275, - "grad_norm": 0.056711986660957336, - "learning_rate": 2.831111111111111e-05, - "loss": 0.19, - "step": 13090 - }, - { - "epoch": 1.2513254047857858, - "grad_norm": 0.06463145464658737, - "learning_rate": 2.8162962962962963e-05, - "loss": 0.1934, - "step": 13100 - }, - { - "epoch": 1.2522806514782443, - "grad_norm": 0.09779531508684158, - "learning_rate": 2.8014814814814816e-05, - "loss": 0.1988, - "step": 13110 - }, - { - "epoch": 1.2532358981707026, - "grad_norm": 0.15403911471366882, - "learning_rate": 2.786666666666667e-05, - "loss": 0.1996, - "step": 13120 - }, - { - "epoch": 1.2541911448631609, - "grad_norm": 0.1008806899189949, - "learning_rate": 2.771851851851852e-05, - "loss": 0.1827, - "step": 13130 - }, - { - "epoch": 1.2551463915556194, - "grad_norm": 0.06381751596927643, - "learning_rate": 2.7570370370370375e-05, - "loss": 0.1892, - "step": 13140 - }, - { - "epoch": 1.2561016382480776, - "grad_norm": 0.06516945362091064, - "learning_rate": 2.742222222222222e-05, - "loss": 0.193, - "step": 13150 - }, - { - "epoch": 1.257056884940536, - "grad_norm": 0.10235823690891266, - "learning_rate": 2.7274074074074074e-05, - "loss": 0.1981, - "step": 13160 - }, - { - "epoch": 1.2580121316329942, - "grad_norm": 0.1551736742258072, - "learning_rate": 2.7125925925925927e-05, - "loss": 0.1989, - "step": 13170 - }, - { - "epoch": 1.2589673783254525, - "grad_norm": 0.09147223085165024, - "learning_rate": 2.697777777777778e-05, - "loss": 0.1816, - "step": 13180 - }, - { - "epoch": 1.2599226250179107, - "grad_norm": 0.07331864535808563, - "learning_rate": 2.6829629629629633e-05, - "loss": 0.1895, - "step": 13190 - }, - { - "epoch": 1.2608778717103692, - "grad_norm": 0.0740872323513031, - "learning_rate": 2.6681481481481486e-05, - "loss": 0.1934, - "step": 13200 - }, - { - "epoch": 1.2618331184028275, - "grad_norm": 0.1049848198890686, - "learning_rate": 2.6533333333333332e-05, - "loss": 0.1984, - "step": 13210 - }, - { - "epoch": 1.2627883650952858, - "grad_norm": 0.15695710480213165, - "learning_rate": 2.6385185185185185e-05, - "loss": 0.1997, - "step": 13220 - }, - { - "epoch": 1.2637436117877443, - "grad_norm": 0.09660939872264862, - "learning_rate": 2.623703703703704e-05, - "loss": 0.1815, - "step": 13230 - }, - { - "epoch": 1.2646988584802026, - "grad_norm": 0.0635804608464241, - "learning_rate": 2.608888888888889e-05, - "loss": 0.1891, - "step": 13240 - }, - { - "epoch": 1.2656541051726609, - "grad_norm": 0.08403673022985458, - "learning_rate": 2.5940740740740744e-05, - "loss": 0.1933, - "step": 13250 - }, - { - "epoch": 1.2666093518651191, - "grad_norm": 0.08146359026432037, - "learning_rate": 2.5792592592592597e-05, - "loss": 0.1987, - "step": 13260 - }, - { - "epoch": 1.2675645985575774, - "grad_norm": 0.14868749678134918, - "learning_rate": 2.5644444444444444e-05, - "loss": 0.1998, - "step": 13270 - }, - { - "epoch": 1.2685198452500357, - "grad_norm": 0.09479347616434097, - "learning_rate": 2.5496296296296297e-05, - "loss": 0.1817, - "step": 13280 - }, - { - "epoch": 1.2694750919424942, - "grad_norm": 0.059485744684934616, - "learning_rate": 2.534814814814815e-05, - "loss": 0.1893, - "step": 13290 - }, - { - "epoch": 1.2704303386349525, - "grad_norm": 0.07404431700706482, - "learning_rate": 2.5200000000000003e-05, - "loss": 0.1936, - "step": 13300 - }, - { - "epoch": 1.2713855853274107, - "grad_norm": 0.1138007789850235, - "learning_rate": 2.5051851851851856e-05, - "loss": 0.1984, - "step": 13310 - }, - { - "epoch": 1.2723408320198693, - "grad_norm": 0.15096613764762878, - "learning_rate": 2.4903703703703705e-05, - "loss": 0.1983, - "step": 13320 - }, - { - "epoch": 1.2732960787123275, - "grad_norm": 0.09422920644283295, - "learning_rate": 2.475555555555556e-05, - "loss": 0.1814, - "step": 13330 - }, - { - "epoch": 1.2742513254047858, - "grad_norm": 0.07392732799053192, - "learning_rate": 2.4607407407407408e-05, - "loss": 0.1891, - "step": 13340 - }, - { - "epoch": 1.275206572097244, - "grad_norm": 0.07089727371931076, - "learning_rate": 2.445925925925926e-05, - "loss": 0.1934, - "step": 13350 - }, - { - "epoch": 1.2761618187897024, - "grad_norm": 0.11914543807506561, - "learning_rate": 2.431111111111111e-05, - "loss": 0.1984, - "step": 13360 - }, - { - "epoch": 1.2771170654821606, - "grad_norm": 0.15955843031406403, - "learning_rate": 2.4162962962962964e-05, - "loss": 0.2002, - "step": 13370 - }, - { - "epoch": 1.2780723121746191, - "grad_norm": 0.09782761335372925, - "learning_rate": 2.4014814814814817e-05, - "loss": 0.1818, - "step": 13380 - }, - { - "epoch": 1.2790275588670774, - "grad_norm": 0.07677102088928223, - "learning_rate": 2.3866666666666666e-05, - "loss": 0.189, - "step": 13390 - }, - { - "epoch": 1.2799828055595357, - "grad_norm": 0.071205273270607, - "learning_rate": 2.371851851851852e-05, - "loss": 0.1924, - "step": 13400 - }, - { - "epoch": 1.2809380522519942, - "grad_norm": 0.09079116582870483, - "learning_rate": 2.357037037037037e-05, - "loss": 0.1978, - "step": 13410 - }, - { - "epoch": 1.2818932989444525, - "grad_norm": 0.17159642279148102, - "learning_rate": 2.3422222222222222e-05, - "loss": 0.198, - "step": 13420 - }, - { - "epoch": 1.2828485456369108, - "grad_norm": 0.09026908874511719, - "learning_rate": 2.3274074074074075e-05, - "loss": 0.1808, - "step": 13430 - }, - { - "epoch": 1.283803792329369, - "grad_norm": 0.0792994573712349, - "learning_rate": 2.3125925925925925e-05, - "loss": 0.1886, - "step": 13440 - }, - { - "epoch": 1.2847590390218273, - "grad_norm": 0.06254783272743225, - "learning_rate": 2.2977777777777778e-05, - "loss": 0.1928, - "step": 13450 - }, - { - "epoch": 1.2857142857142856, - "grad_norm": 0.11604833602905273, - "learning_rate": 2.282962962962963e-05, - "loss": 0.1983, - "step": 13460 - }, - { - "epoch": 1.286669532406744, - "grad_norm": 0.14751599729061127, - "learning_rate": 2.268148148148148e-05, - "loss": 0.1977, - "step": 13470 - }, - { - "epoch": 1.2876247790992024, - "grad_norm": 0.07771807163953781, - "learning_rate": 2.2533333333333333e-05, - "loss": 0.1817, - "step": 13480 - }, - { - "epoch": 1.2885800257916606, - "grad_norm": 0.06615254282951355, - "learning_rate": 2.2385185185185186e-05, - "loss": 0.1896, - "step": 13490 - }, - { - "epoch": 1.2895352724841191, - "grad_norm": 0.058883316814899445, - "learning_rate": 2.2237037037037036e-05, - "loss": 0.1932, - "step": 13500 - }, - { - "epoch": 1.2904905191765774, - "grad_norm": 0.10671798884868622, - "learning_rate": 2.208888888888889e-05, - "loss": 0.1987, - "step": 13510 - }, - { - "epoch": 1.2914457658690357, - "grad_norm": 0.14683255553245544, - "learning_rate": 2.1940740740740742e-05, - "loss": 0.1997, - "step": 13520 - }, - { - "epoch": 1.292401012561494, - "grad_norm": 0.09709542244672775, - "learning_rate": 2.1792592592592595e-05, - "loss": 0.1821, - "step": 13530 - }, - { - "epoch": 1.2933562592539523, - "grad_norm": 0.06401824951171875, - "learning_rate": 2.1644444444444445e-05, - "loss": 0.1891, - "step": 13540 - }, - { - "epoch": 1.2943115059464105, - "grad_norm": 0.07614504545927048, - "learning_rate": 2.1496296296296298e-05, - "loss": 0.1935, - "step": 13550 - }, - { - "epoch": 1.295266752638869, - "grad_norm": 0.10084769874811172, - "learning_rate": 2.134814814814815e-05, - "loss": 0.1986, - "step": 13560 - }, - { - "epoch": 1.2962219993313273, - "grad_norm": 0.1459190398454666, - "learning_rate": 2.12e-05, - "loss": 0.2, - "step": 13570 - }, - { - "epoch": 1.2971772460237856, - "grad_norm": 0.08136285096406937, - "learning_rate": 2.1051851851851853e-05, - "loss": 0.1815, - "step": 13580 - }, - { - "epoch": 1.298132492716244, - "grad_norm": 0.06184697896242142, - "learning_rate": 2.0903703703703706e-05, - "loss": 0.1892, - "step": 13590 - }, - { - "epoch": 1.2990877394087024, - "grad_norm": 0.06824415922164917, - "learning_rate": 2.0755555555555556e-05, - "loss": 0.1926, - "step": 13600 - }, - { - "epoch": 1.3000429861011606, - "grad_norm": 0.10083822160959244, - "learning_rate": 2.060740740740741e-05, - "loss": 0.1972, - "step": 13610 - }, - { - "epoch": 1.300998232793619, - "grad_norm": 0.15515734255313873, - "learning_rate": 2.0459259259259262e-05, - "loss": 0.1991, - "step": 13620 - }, - { - "epoch": 1.3019534794860772, - "grad_norm": 0.09369368106126785, - "learning_rate": 2.031111111111111e-05, - "loss": 0.1814, - "step": 13630 - }, - { - "epoch": 1.3029087261785355, - "grad_norm": 0.06133545935153961, - "learning_rate": 2.0162962962962965e-05, - "loss": 0.189, - "step": 13640 - }, - { - "epoch": 1.303863972870994, - "grad_norm": 0.06848432868719101, - "learning_rate": 2.0014814814814818e-05, - "loss": 0.1927, - "step": 13650 - }, - { - "epoch": 1.3048192195634523, - "grad_norm": 0.10023056715726852, - "learning_rate": 1.9866666666666667e-05, - "loss": 0.1973, - "step": 13660 - }, - { - "epoch": 1.3057744662559105, - "grad_norm": 0.16171331703662872, - "learning_rate": 1.971851851851852e-05, - "loss": 0.1983, - "step": 13670 - }, - { - "epoch": 1.306729712948369, - "grad_norm": 0.09014260023832321, - "learning_rate": 1.9570370370370373e-05, - "loss": 0.1823, - "step": 13680 - }, - { - "epoch": 1.3076849596408273, - "grad_norm": 0.0606960766017437, - "learning_rate": 1.9422222222222223e-05, - "loss": 0.1888, - "step": 13690 - }, - { - "epoch": 1.3086402063332856, - "grad_norm": 0.07925919443368912, - "learning_rate": 1.9274074074074076e-05, - "loss": 0.1935, - "step": 13700 - }, - { - "epoch": 1.3095954530257439, - "grad_norm": 0.10734547674655914, - "learning_rate": 1.912592592592593e-05, - "loss": 0.1985, - "step": 13710 - }, - { - "epoch": 1.3105506997182022, - "grad_norm": 0.15949596464633942, - "learning_rate": 1.897777777777778e-05, - "loss": 0.199, - "step": 13720 - }, - { - "epoch": 1.3115059464106604, - "grad_norm": 0.07958906143903732, - "learning_rate": 1.882962962962963e-05, - "loss": 0.1811, - "step": 13730 - }, - { - "epoch": 1.312461193103119, - "grad_norm": 0.06587712466716766, - "learning_rate": 1.8681481481481485e-05, - "loss": 0.1894, - "step": 13740 - }, - { - "epoch": 1.3134164397955772, - "grad_norm": 0.0697634294629097, - "learning_rate": 1.8533333333333334e-05, - "loss": 0.1931, - "step": 13750 - }, - { - "epoch": 1.3143716864880355, - "grad_norm": 0.08874509483575821, - "learning_rate": 1.8385185185185187e-05, - "loss": 0.1985, - "step": 13760 - }, - { - "epoch": 1.315326933180494, - "grad_norm": 0.1566840559244156, - "learning_rate": 1.8237037037037037e-05, - "loss": 0.1992, - "step": 13770 - }, - { - "epoch": 1.3162821798729523, - "grad_norm": 0.08778232336044312, - "learning_rate": 1.808888888888889e-05, - "loss": 0.1814, - "step": 13780 - }, - { - "epoch": 1.3172374265654105, - "grad_norm": 0.06845912337303162, - "learning_rate": 1.794074074074074e-05, - "loss": 0.189, - "step": 13790 - }, - { - "epoch": 1.3181926732578688, - "grad_norm": 0.07066404074430466, - "learning_rate": 1.7792592592592593e-05, - "loss": 0.1932, - "step": 13800 - }, - { - "epoch": 1.319147919950327, - "grad_norm": 0.10193174332380295, - "learning_rate": 1.7644444444444446e-05, - "loss": 0.1978, - "step": 13810 - }, - { - "epoch": 1.3201031666427854, - "grad_norm": 0.14859530329704285, - "learning_rate": 1.7496296296296295e-05, - "loss": 0.1987, - "step": 13820 - }, - { - "epoch": 1.3210584133352439, - "grad_norm": 0.07397311180830002, - "learning_rate": 1.734814814814815e-05, - "loss": 0.1816, - "step": 13830 - }, - { - "epoch": 1.3220136600277022, - "grad_norm": 0.07166632264852524, - "learning_rate": 1.7199999999999998e-05, - "loss": 0.1887, - "step": 13840 - }, - { - "epoch": 1.3229689067201604, - "grad_norm": 0.06838031858205795, - "learning_rate": 1.705185185185185e-05, - "loss": 0.1932, - "step": 13850 - }, - { - "epoch": 1.323924153412619, - "grad_norm": 0.08447981625795364, - "learning_rate": 1.6903703703703704e-05, - "loss": 0.1973, - "step": 13860 - }, - { - "epoch": 1.3248794001050772, - "grad_norm": 0.16111405193805695, - "learning_rate": 1.6755555555555557e-05, - "loss": 0.1982, - "step": 13870 - }, - { - "epoch": 1.3258346467975355, - "grad_norm": 0.11569291353225708, - "learning_rate": 1.6607407407407407e-05, - "loss": 0.1804, - "step": 13880 - }, - { - "epoch": 1.3267898934899938, - "grad_norm": 0.05849316716194153, - "learning_rate": 1.645925925925926e-05, - "loss": 0.1884, - "step": 13890 - }, - { - "epoch": 1.327745140182452, - "grad_norm": 0.07721465080976486, - "learning_rate": 1.6311111111111113e-05, - "loss": 0.193, - "step": 13900 - }, - { - "epoch": 1.3287003868749103, - "grad_norm": 0.10829592496156693, - "learning_rate": 1.6162962962962962e-05, - "loss": 0.1978, - "step": 13910 - }, - { - "epoch": 1.3296556335673688, - "grad_norm": 0.15014564990997314, - "learning_rate": 1.6014814814814815e-05, - "loss": 0.1988, - "step": 13920 - }, - { - "epoch": 1.330610880259827, - "grad_norm": 0.09521475434303284, - "learning_rate": 1.586666666666667e-05, - "loss": 0.1813, - "step": 13930 - }, - { - "epoch": 1.3315661269522854, - "grad_norm": 0.05856531485915184, - "learning_rate": 1.5718518518518518e-05, - "loss": 0.1887, - "step": 13940 - }, - { - "epoch": 1.3325213736447439, - "grad_norm": 0.07324463874101639, - "learning_rate": 1.557037037037037e-05, - "loss": 0.1926, - "step": 13950 - }, - { - "epoch": 1.3334766203372022, - "grad_norm": 0.09733791649341583, - "learning_rate": 1.5422222222222224e-05, - "loss": 0.1981, - "step": 13960 - }, - { - "epoch": 1.3344318670296604, - "grad_norm": 0.1468607783317566, - "learning_rate": 1.5274074074074074e-05, - "loss": 0.1988, - "step": 13970 - }, - { - "epoch": 1.3353871137221187, - "grad_norm": 0.07646271586418152, - "learning_rate": 1.5125925925925927e-05, - "loss": 0.1805, - "step": 13980 - }, - { - "epoch": 1.336342360414577, - "grad_norm": 0.06440643221139908, - "learning_rate": 1.497777777777778e-05, - "loss": 0.189, - "step": 13990 - }, - { - "epoch": 1.3372976071070353, - "grad_norm": 0.0725795105099678, - "learning_rate": 1.482962962962963e-05, - "loss": 0.1927, - "step": 14000 - }, - { - "epoch": 1.3382528537994938, - "grad_norm": 0.11663739383220673, - "learning_rate": 1.4681481481481482e-05, - "loss": 0.1981, - "step": 14010 - }, - { - "epoch": 1.339208100491952, - "grad_norm": 0.15067516267299652, - "learning_rate": 1.4533333333333335e-05, - "loss": 0.1973, - "step": 14020 - }, - { - "epoch": 1.3401633471844103, - "grad_norm": 0.0837399959564209, - "learning_rate": 1.4385185185185185e-05, - "loss": 0.1805, - "step": 14030 - }, - { - "epoch": 1.3411185938768688, - "grad_norm": 0.06571058183908463, - "learning_rate": 1.4237037037037038e-05, - "loss": 0.1884, - "step": 14040 - }, - { - "epoch": 1.342073840569327, - "grad_norm": 0.07058189064264297, - "learning_rate": 1.4088888888888891e-05, - "loss": 0.1924, - "step": 14050 - }, - { - "epoch": 1.3430290872617854, - "grad_norm": 0.09456133842468262, - "learning_rate": 1.394074074074074e-05, - "loss": 0.1976, - "step": 14060 - }, - { - "epoch": 1.3439843339542437, - "grad_norm": 0.15115611255168915, - "learning_rate": 1.3792592592592594e-05, - "loss": 0.1987, - "step": 14070 - }, - { - "epoch": 1.344939580646702, - "grad_norm": 0.07672551274299622, - "learning_rate": 1.3644444444444445e-05, - "loss": 0.1801, - "step": 14080 - }, - { - "epoch": 1.3458948273391602, - "grad_norm": 0.06534221768379211, - "learning_rate": 1.3496296296296296e-05, - "loss": 0.1883, - "step": 14090 - }, - { - "epoch": 1.3468500740316187, - "grad_norm": 0.06814494729042053, - "learning_rate": 1.334814814814815e-05, - "loss": 0.1929, - "step": 14100 - }, - { - "epoch": 1.347805320724077, - "grad_norm": 0.09241555631160736, - "learning_rate": 1.32e-05, - "loss": 0.1977, - "step": 14110 - }, - { - "epoch": 1.3487605674165353, - "grad_norm": 0.1599811464548111, - "learning_rate": 1.3051851851851852e-05, - "loss": 0.1982, - "step": 14120 - }, - { - "epoch": 1.3497158141089938, - "grad_norm": 0.1026700884103775, - "learning_rate": 1.2903703703703703e-05, - "loss": 0.1806, - "step": 14130 - }, - { - "epoch": 1.350671060801452, - "grad_norm": 0.06195655092597008, - "learning_rate": 1.2755555555555556e-05, - "loss": 0.1878, - "step": 14140 - }, - { - "epoch": 1.3516263074939103, - "grad_norm": 0.0745544582605362, - "learning_rate": 1.2607407407407406e-05, - "loss": 0.1929, - "step": 14150 - }, - { - "epoch": 1.3525815541863686, - "grad_norm": 0.10045164078474045, - "learning_rate": 1.2459259259259259e-05, - "loss": 0.1983, - "step": 14160 - }, - { - "epoch": 1.3535368008788269, - "grad_norm": 0.15764273703098297, - "learning_rate": 1.2311111111111112e-05, - "loss": 0.198, - "step": 14170 - }, - { - "epoch": 1.3544920475712852, - "grad_norm": 0.08975204080343246, - "learning_rate": 1.2162962962962963e-05, - "loss": 0.1809, - "step": 14180 - }, - { - "epoch": 1.3554472942637437, - "grad_norm": 0.06544684618711472, - "learning_rate": 1.2014814814814815e-05, - "loss": 0.1888, - "step": 14190 - }, - { - "epoch": 1.356402540956202, - "grad_norm": 0.07405107468366623, - "learning_rate": 1.1866666666666668e-05, - "loss": 0.1931, - "step": 14200 - }, - { - "epoch": 1.3573577876486602, - "grad_norm": 0.10317433625459671, - "learning_rate": 1.1718518518518519e-05, - "loss": 0.1977, - "step": 14210 - }, - { - "epoch": 1.3583130343411187, - "grad_norm": 0.14299127459526062, - "learning_rate": 1.157037037037037e-05, - "loss": 0.1988, - "step": 14220 - }, - { - "epoch": 1.359268281033577, - "grad_norm": 0.08238115906715393, - "learning_rate": 1.1422222222222223e-05, - "loss": 0.1796, - "step": 14230 - }, - { - "epoch": 1.3602235277260353, - "grad_norm": 0.07125524431467056, - "learning_rate": 1.1274074074074075e-05, - "loss": 0.1879, - "step": 14240 - }, - { - "epoch": 1.3611787744184936, - "grad_norm": 0.06334027647972107, - "learning_rate": 1.1125925925925928e-05, - "loss": 0.1932, - "step": 14250 - }, - { - "epoch": 1.3621340211109518, - "grad_norm": 0.1071334108710289, - "learning_rate": 1.0977777777777779e-05, - "loss": 0.1976, - "step": 14260 - }, - { - "epoch": 1.36308926780341, - "grad_norm": 0.14405353367328644, - "learning_rate": 1.082962962962963e-05, - "loss": 0.1973, - "step": 14270 - }, - { - "epoch": 1.3640445144958686, - "grad_norm": 0.08287125825881958, - "learning_rate": 1.0681481481481483e-05, - "loss": 0.1805, - "step": 14280 - }, - { - "epoch": 1.3649997611883269, - "grad_norm": 0.061365820467472076, - "learning_rate": 1.0533333333333335e-05, - "loss": 0.1885, - "step": 14290 - }, - { - "epoch": 1.3659550078807852, - "grad_norm": 0.06179488077759743, - "learning_rate": 1.0385185185185186e-05, - "loss": 0.193, - "step": 14300 - }, - { - "epoch": 1.3669102545732437, - "grad_norm": 0.10180076956748962, - "learning_rate": 1.0237037037037037e-05, - "loss": 0.1974, - "step": 14310 - }, - { - "epoch": 1.367865501265702, - "grad_norm": 0.1490979641675949, - "learning_rate": 1.0088888888888889e-05, - "loss": 0.1988, - "step": 14320 - }, - { - "epoch": 1.3688207479581602, - "grad_norm": 0.10851076245307922, - "learning_rate": 9.94074074074074e-06, - "loss": 0.1805, - "step": 14330 - }, - { - "epoch": 1.3697759946506185, - "grad_norm": 0.060650117695331573, - "learning_rate": 9.792592592592593e-06, - "loss": 0.189, - "step": 14340 - }, - { - "epoch": 1.3707312413430768, - "grad_norm": 0.06470511853694916, - "learning_rate": 9.644444444444444e-06, - "loss": 0.1936, - "step": 14350 - }, - { - "epoch": 1.371686488035535, - "grad_norm": 0.11010751873254776, - "learning_rate": 9.496296296296296e-06, - "loss": 0.1982, - "step": 14360 - }, - { - "epoch": 1.3726417347279936, - "grad_norm": 0.15772590041160583, - "learning_rate": 9.348148148148149e-06, - "loss": 0.1997, - "step": 14370 - }, - { - "epoch": 1.3735969814204518, - "grad_norm": 0.07963084429502487, - "learning_rate": 9.2e-06, - "loss": 0.1803, - "step": 14380 - }, - { - "epoch": 1.3745522281129101, - "grad_norm": 0.07517506927251816, - "learning_rate": 9.051851851851851e-06, - "loss": 0.1882, - "step": 14390 - }, - { - "epoch": 1.3755074748053686, - "grad_norm": 0.08012760430574417, - "learning_rate": 8.903703703703704e-06, - "loss": 0.193, - "step": 14400 - }, - { - "epoch": 1.376462721497827, - "grad_norm": 0.11921060085296631, - "learning_rate": 8.755555555555556e-06, - "loss": 0.1984, - "step": 14410 - }, - { - "epoch": 1.3774179681902852, - "grad_norm": 0.15329593420028687, - "learning_rate": 8.607407407407409e-06, - "loss": 0.1983, - "step": 14420 - }, - { - "epoch": 1.3783732148827434, - "grad_norm": 0.0893860012292862, - "learning_rate": 8.45925925925926e-06, - "loss": 0.1806, - "step": 14430 - }, - { - "epoch": 1.3793284615752017, - "grad_norm": 0.0695950984954834, - "learning_rate": 8.311111111111111e-06, - "loss": 0.1888, - "step": 14440 - }, - { - "epoch": 1.38028370826766, - "grad_norm": 0.07914608716964722, - "learning_rate": 8.162962962962964e-06, - "loss": 0.1937, - "step": 14450 - }, - { - "epoch": 1.3812389549601185, - "grad_norm": 0.10042670369148254, - "learning_rate": 8.014814814814816e-06, - "loss": 0.1983, - "step": 14460 - }, - { - "epoch": 1.3821942016525768, - "grad_norm": 0.1473369598388672, - "learning_rate": 7.866666666666667e-06, - "loss": 0.1988, - "step": 14470 - }, - { - "epoch": 1.383149448345035, - "grad_norm": 0.10038721561431885, - "learning_rate": 7.71851851851852e-06, - "loss": 0.18, - "step": 14480 - }, - { - "epoch": 1.3841046950374936, - "grad_norm": 0.06510590761899948, - "learning_rate": 7.5703703703703705e-06, - "loss": 0.1882, - "step": 14490 - }, - { - "epoch": 1.3850599417299518, - "grad_norm": 0.08909650892019272, - "learning_rate": 7.422222222222222e-06, - "loss": 0.1929, - "step": 14500 - }, - { - "epoch": 1.3860151884224101, - "grad_norm": 0.10360381007194519, - "learning_rate": 7.274074074074075e-06, - "loss": 0.1982, - "step": 14510 - }, - { - "epoch": 1.3869704351148684, - "grad_norm": 0.15662701427936554, - "learning_rate": 7.125925925925926e-06, - "loss": 0.1989, - "step": 14520 - }, - { - "epoch": 1.3879256818073267, - "grad_norm": 0.0809619128704071, - "learning_rate": 6.9777777777777775e-06, - "loss": 0.1801, - "step": 14530 - }, - { - "epoch": 1.388880928499785, - "grad_norm": 0.05729440972208977, - "learning_rate": 6.8296296296296305e-06, - "loss": 0.1885, - "step": 14540 - }, - { - "epoch": 1.3898361751922435, - "grad_norm": 0.0785176008939743, - "learning_rate": 6.681481481481482e-06, - "loss": 0.1933, - "step": 14550 - }, - { - "epoch": 1.3907914218847017, - "grad_norm": 0.11649096012115479, - "learning_rate": 6.533333333333333e-06, - "loss": 0.1975, - "step": 14560 - }, - { - "epoch": 1.39174666857716, - "grad_norm": 0.15687984228134155, - "learning_rate": 6.385185185185185e-06, - "loss": 0.198, - "step": 14570 - }, - { - "epoch": 1.3927019152696185, - "grad_norm": 0.07697634398937225, - "learning_rate": 6.237037037037037e-06, - "loss": 0.1808, - "step": 14580 - }, - { - "epoch": 1.3936571619620768, - "grad_norm": 0.06329817324876785, - "learning_rate": 6.088888888888889e-06, - "loss": 0.1889, - "step": 14590 - }, - { - "epoch": 1.394612408654535, - "grad_norm": 0.06335621327161789, - "learning_rate": 5.940740740740741e-06, - "loss": 0.1932, - "step": 14600 - }, - { - "epoch": 1.3955676553469933, - "grad_norm": 0.10063440352678299, - "learning_rate": 5.792592592592593e-06, - "loss": 0.198, - "step": 14610 - }, - { - "epoch": 1.3965229020394516, - "grad_norm": 0.15546005964279175, - "learning_rate": 5.6444444444444445e-06, - "loss": 0.1977, - "step": 14620 - }, - { - "epoch": 1.39747814873191, - "grad_norm": 0.08702757954597473, - "learning_rate": 5.496296296296297e-06, - "loss": 0.1794, - "step": 14630 - }, - { - "epoch": 1.3984333954243684, - "grad_norm": 0.0682302713394165, - "learning_rate": 5.348148148148149e-06, - "loss": 0.1883, - "step": 14640 - }, - { - "epoch": 1.3993886421168267, - "grad_norm": 0.07530588656663895, - "learning_rate": 5.2e-06, - "loss": 0.193, - "step": 14650 - }, - { - "epoch": 1.400343888809285, - "grad_norm": 0.10217908024787903, - "learning_rate": 5.051851851851852e-06, - "loss": 0.1979, - "step": 14660 - }, - { - "epoch": 1.4012991355017435, - "grad_norm": 0.15378542244434357, - "learning_rate": 4.903703703703704e-06, - "loss": 0.198, - "step": 14670 - }, - { - "epoch": 1.4022543821942017, - "grad_norm": 0.09957956522703171, - "learning_rate": 4.755555555555556e-06, - "loss": 0.1794, - "step": 14680 - }, - { - "epoch": 1.40320962888666, - "grad_norm": 0.06103844195604324, - "learning_rate": 4.607407407407407e-06, - "loss": 0.1881, - "step": 14690 - }, - { - "epoch": 1.4041648755791183, - "grad_norm": 0.09466850012540817, - "learning_rate": 4.459259259259259e-06, - "loss": 0.1931, - "step": 14700 - }, - { - "epoch": 1.4051201222715766, - "grad_norm": 0.09376849234104156, - "learning_rate": 4.3111111111111115e-06, - "loss": 0.1971, - "step": 14710 - }, - { - "epoch": 1.4060753689640348, - "grad_norm": 0.15457859635353088, - "learning_rate": 4.162962962962963e-06, - "loss": 0.1987, - "step": 14720 - }, - { - "epoch": 1.4070306156564933, - "grad_norm": 0.08470315486192703, - "learning_rate": 4.014814814814815e-06, - "loss": 0.179, - "step": 14730 - }, - { - "epoch": 1.4079858623489516, - "grad_norm": 0.06838913261890411, - "learning_rate": 3.866666666666667e-06, - "loss": 0.1881, - "step": 14740 - }, - { - "epoch": 1.40894110904141, - "grad_norm": 0.06744107604026794, - "learning_rate": 3.7185185185185185e-06, - "loss": 0.1923, - "step": 14750 - }, - { - "epoch": 1.4098963557338684, - "grad_norm": 0.10477790981531143, - "learning_rate": 3.5703703703703703e-06, - "loss": 0.1977, - "step": 14760 - }, - { - "epoch": 1.4108516024263267, - "grad_norm": 0.1559683233499527, - "learning_rate": 3.4222222222222224e-06, - "loss": 0.198, - "step": 14770 - }, - { - "epoch": 1.411806849118785, - "grad_norm": 0.0938514843583107, - "learning_rate": 3.2740740740740746e-06, - "loss": 0.179, - "step": 14780 - }, - { - "epoch": 1.4127620958112432, - "grad_norm": 0.06281778961420059, - "learning_rate": 3.125925925925926e-06, - "loss": 0.1881, - "step": 14790 - }, - { - "epoch": 1.4137173425037015, - "grad_norm": 0.06448351591825485, - "learning_rate": 2.977777777777778e-06, - "loss": 0.192, - "step": 14800 - }, - { - "epoch": 1.4146725891961598, - "grad_norm": 0.098030686378479, - "learning_rate": 2.8296296296296294e-06, - "loss": 0.1976, - "step": 14810 - }, - { - "epoch": 1.4156278358886183, - "grad_norm": 0.14580897986888885, - "learning_rate": 2.6814814814814816e-06, - "loss": 0.1982, - "step": 14820 - }, - { - "epoch": 1.4165830825810766, - "grad_norm": 0.08316420018672943, - "learning_rate": 2.5333333333333334e-06, - "loss": 0.1802, - "step": 14830 - }, - { - "epoch": 1.4175383292735348, - "grad_norm": 0.06413526833057404, - "learning_rate": 2.3851851851851855e-06, - "loss": 0.1882, - "step": 14840 - }, - { - "epoch": 1.4184935759659933, - "grad_norm": 0.07762613147497177, - "learning_rate": 2.2370370370370373e-06, - "loss": 0.1933, - "step": 14850 - }, - { - "epoch": 1.4194488226584516, - "grad_norm": 0.1181880459189415, - "learning_rate": 2.088888888888889e-06, - "loss": 0.1974, - "step": 14860 - }, - { - "epoch": 1.42040406935091, - "grad_norm": 0.16065983474254608, - "learning_rate": 1.9407407407407408e-06, - "loss": 0.1975, - "step": 14870 - }, - { - "epoch": 1.4213593160433682, - "grad_norm": 0.07450287789106369, - "learning_rate": 1.7925925925925925e-06, - "loss": 0.1794, - "step": 14880 - }, - { - "epoch": 1.4223145627358265, - "grad_norm": 0.058435749262571335, - "learning_rate": 1.6444444444444447e-06, - "loss": 0.1887, - "step": 14890 - }, - { - "epoch": 1.4232698094282847, - "grad_norm": 0.07401622831821442, - "learning_rate": 1.4962962962962962e-06, - "loss": 0.193, - "step": 14900 - }, - { - "epoch": 1.4242250561207432, - "grad_norm": 0.12826916575431824, - "learning_rate": 1.3481481481481482e-06, - "loss": 0.1977, - "step": 14910 - }, - { - "epoch": 1.4251803028132015, - "grad_norm": 0.15752126276493073, - "learning_rate": 1.2000000000000002e-06, - "loss": 0.1978, - "step": 14920 - }, - { - "epoch": 1.4261355495056598, - "grad_norm": 0.0686534121632576, - "learning_rate": 1.051851851851852e-06, - "loss": 0.1794, - "step": 14930 - }, - { - "epoch": 1.4270907961981183, - "grad_norm": 0.06406836956739426, - "learning_rate": 9.037037037037039e-07, - "loss": 0.1886, - "step": 14940 - }, - { - "epoch": 1.4280460428905766, - "grad_norm": 0.06210838258266449, - "learning_rate": 7.555555555555556e-07, - "loss": 0.1934, - "step": 14950 - }, - { - "epoch": 1.4290012895830349, - "grad_norm": 0.08844652026891708, - "learning_rate": 6.074074074074074e-07, - "loss": 0.198, - "step": 14960 - }, - { - "epoch": 1.4299565362754931, - "grad_norm": 0.15475121140480042, - "learning_rate": 4.5925925925925927e-07, - "loss": 0.1971, - "step": 14970 - }, - { - "epoch": 1.4309117829679514, - "grad_norm": 0.08120597153902054, - "learning_rate": 3.111111111111111e-07, - "loss": 0.1793, - "step": 14980 - }, - { - "epoch": 1.4318670296604097, - "grad_norm": 0.07655435800552368, - "learning_rate": 1.6296296296296295e-07, - "loss": 0.1884, - "step": 14990 - }, - { - "epoch": 1.4328222763528682, - "grad_norm": 0.07046757638454437, - "learning_rate": 1.4814814814814816e-08, - "loss": 0.193, - "step": 15000 - } - ], - "logging_steps": 10, - "max_steps": 15000, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 4.918384541218406e+18, - "train_batch_size": 16, - "trial_name": null, - "trial_params": null -}