diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8853552192869967, + "eval_steps": 500, + "global_step": 22500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003968253968253968, + "grad_norm": 2.719743251800537, + "learning_rate": 0.0019973544973544977, + "loss": 2.0582, + "step": 10 + }, + { + "epoch": 0.007936507936507936, + "grad_norm": 3.9530892372131348, + "learning_rate": 0.001994708994708995, + "loss": 1.5106, + "step": 20 + }, + { + "epoch": 0.011904761904761904, + "grad_norm": 2.873342514038086, + "learning_rate": 0.001992063492063492, + "loss": 1.3441, + "step": 30 + }, + { + "epoch": 0.015873015873015872, + "grad_norm": 2.5402843952178955, + "learning_rate": 0.001989417989417989, + "loss": 1.3346, + "step": 40 + }, + { + "epoch": 0.01984126984126984, + "grad_norm": 8.011046409606934, + "learning_rate": 0.001986772486772487, + "loss": 1.4858, + "step": 50 + }, + { + "epoch": 0.023809523809523808, + "grad_norm": 1.8670936822891235, + "learning_rate": 0.001984126984126984, + "loss": 1.3795, + "step": 60 + }, + { + "epoch": 0.027777777777777776, + "grad_norm": 4.0268025398254395, + "learning_rate": 0.0019814814814814816, + "loss": 1.2093, + "step": 70 + }, + { + "epoch": 0.031746031746031744, + "grad_norm": 2.250659227371216, + "learning_rate": 0.001978835978835979, + "loss": 1.4175, + "step": 80 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 2.6111180782318115, + "learning_rate": 0.0019761904761904764, + "loss": 1.2632, + "step": 90 + }, + { + "epoch": 0.03968253968253968, + "grad_norm": 2.8487629890441895, + "learning_rate": 0.0019735449735449736, + "loss": 1.3824, + "step": 100 + }, + { + "epoch": 0.04365079365079365, + "grad_norm": 2.2919723987579346, + "learning_rate": 0.001970899470899471, + "loss": 1.1783, + "step": 110 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 2.1400911808013916, + "learning_rate": 0.001968253968253968, + "loss": 1.2046, + "step": 120 + }, + { + "epoch": 0.051587301587301584, + "grad_norm": 2.4251630306243896, + "learning_rate": 0.0019656084656084656, + "loss": 0.9373, + "step": 130 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 4.872255325317383, + "learning_rate": 0.0019629629629629632, + "loss": 1.1487, + "step": 140 + }, + { + "epoch": 0.05952380952380952, + "grad_norm": 1.1926871538162231, + "learning_rate": 0.0019603174603174604, + "loss": 0.9628, + "step": 150 + }, + { + "epoch": 0.06349206349206349, + "grad_norm": 1.6297978162765503, + "learning_rate": 0.0019576719576719576, + "loss": 1.0797, + "step": 160 + }, + { + "epoch": 0.06746031746031746, + "grad_norm": 1.9080616235733032, + "learning_rate": 0.001955026455026455, + "loss": 1.1868, + "step": 170 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 1.5988057851791382, + "learning_rate": 0.0019523809523809524, + "loss": 1.2171, + "step": 180 + }, + { + "epoch": 0.07539682539682539, + "grad_norm": 3.464204788208008, + "learning_rate": 0.0019497354497354498, + "loss": 1.0786, + "step": 190 + }, + { + "epoch": 0.07936507936507936, + "grad_norm": 1.0056853294372559, + "learning_rate": 0.001947089947089947, + "loss": 1.0321, + "step": 200 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.7744622230529785, + "learning_rate": 0.0019444444444444444, + "loss": 1.0652, + "step": 210 + }, + { + "epoch": 0.0873015873015873, + "grad_norm": 1.8306702375411987, + "learning_rate": 0.0019417989417989418, + "loss": 1.2596, + "step": 220 + }, + { + "epoch": 0.09126984126984126, + "grad_norm": 4.866456985473633, + "learning_rate": 0.0019391534391534392, + "loss": 0.9659, + "step": 230 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 2.4852681159973145, + "learning_rate": 0.0019365079365079366, + "loss": 0.9604, + "step": 240 + }, + { + "epoch": 0.0992063492063492, + "grad_norm": 1.818182349205017, + "learning_rate": 0.001933862433862434, + "loss": 0.84, + "step": 250 + }, + { + "epoch": 0.10317460317460317, + "grad_norm": 2.9667937755584717, + "learning_rate": 0.0019312169312169312, + "loss": 1.0415, + "step": 260 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 2.0339770317077637, + "learning_rate": 0.0019285714285714286, + "loss": 1.1436, + "step": 270 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 1.2213151454925537, + "learning_rate": 0.0019259259259259258, + "loss": 0.9627, + "step": 280 + }, + { + "epoch": 0.11507936507936507, + "grad_norm": 0.9745686054229736, + "learning_rate": 0.0019232804232804234, + "loss": 1.1314, + "step": 290 + }, + { + "epoch": 0.11904761904761904, + "grad_norm": 0.9821905493736267, + "learning_rate": 0.0019206349206349208, + "loss": 0.8616, + "step": 300 + }, + { + "epoch": 0.12301587301587301, + "grad_norm": 1.1417244672775269, + "learning_rate": 0.001917989417989418, + "loss": 0.8589, + "step": 310 + }, + { + "epoch": 0.12698412698412698, + "grad_norm": 1.10502028465271, + "learning_rate": 0.0019153439153439154, + "loss": 0.8038, + "step": 320 + }, + { + "epoch": 0.13095238095238096, + "grad_norm": 3.0337259769439697, + "learning_rate": 0.0019126984126984128, + "loss": 1.1473, + "step": 330 + }, + { + "epoch": 0.1349206349206349, + "grad_norm": 1.5644335746765137, + "learning_rate": 0.00191005291005291, + "loss": 1.377, + "step": 340 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 1.4690322875976562, + "learning_rate": 0.0019074074074074076, + "loss": 0.9918, + "step": 350 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.3624849319458008, + "learning_rate": 0.0019047619047619048, + "loss": 1.0086, + "step": 360 + }, + { + "epoch": 0.14682539682539683, + "grad_norm": 1.9925272464752197, + "learning_rate": 0.0019021164021164022, + "loss": 0.8111, + "step": 370 + }, + { + "epoch": 0.15079365079365079, + "grad_norm": 1.8325337171554565, + "learning_rate": 0.0018994708994708996, + "loss": 0.9993, + "step": 380 + }, + { + "epoch": 0.15476190476190477, + "grad_norm": 1.2556744813919067, + "learning_rate": 0.0018968253968253967, + "loss": 0.9214, + "step": 390 + }, + { + "epoch": 0.15873015873015872, + "grad_norm": 2.262421131134033, + "learning_rate": 0.0018941798941798941, + "loss": 0.9461, + "step": 400 + }, + { + "epoch": 0.1626984126984127, + "grad_norm": 1.9751100540161133, + "learning_rate": 0.0018915343915343918, + "loss": 1.0798, + "step": 410 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 6.7523040771484375, + "learning_rate": 0.001888888888888889, + "loss": 0.9272, + "step": 420 + }, + { + "epoch": 0.17063492063492064, + "grad_norm": 3.15874981880188, + "learning_rate": 0.0018862433862433864, + "loss": 0.9037, + "step": 430 + }, + { + "epoch": 0.1746031746031746, + "grad_norm": 1.0322136878967285, + "learning_rate": 0.0018835978835978835, + "loss": 0.9704, + "step": 440 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 1.655899167060852, + "learning_rate": 0.001880952380952381, + "loss": 0.9093, + "step": 450 + }, + { + "epoch": 0.18253968253968253, + "grad_norm": 0.8752370476722717, + "learning_rate": 0.0018783068783068783, + "loss": 0.9002, + "step": 460 + }, + { + "epoch": 0.1865079365079365, + "grad_norm": 1.024077296257019, + "learning_rate": 0.0018756613756613755, + "loss": 0.9406, + "step": 470 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 1.2974797487258911, + "learning_rate": 0.0018730158730158731, + "loss": 1.0027, + "step": 480 + }, + { + "epoch": 0.19444444444444445, + "grad_norm": 1.0525192022323608, + "learning_rate": 0.0018703703703703705, + "loss": 0.8173, + "step": 490 + }, + { + "epoch": 0.1984126984126984, + "grad_norm": 0.8886928558349609, + "learning_rate": 0.0018677248677248677, + "loss": 0.8162, + "step": 500 + }, + { + "epoch": 0.20238095238095238, + "grad_norm": 2.219409704208374, + "learning_rate": 0.0018650793650793651, + "loss": 0.9328, + "step": 510 + }, + { + "epoch": 0.20634920634920634, + "grad_norm": 1.2269400358200073, + "learning_rate": 0.0018624338624338623, + "loss": 0.7617, + "step": 520 + }, + { + "epoch": 0.21031746031746032, + "grad_norm": 1.3941247463226318, + "learning_rate": 0.0018597883597883597, + "loss": 0.733, + "step": 530 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 1.3957165479660034, + "learning_rate": 0.0018571428571428573, + "loss": 0.8621, + "step": 540 + }, + { + "epoch": 0.21825396825396826, + "grad_norm": 1.3213554620742798, + "learning_rate": 0.0018544973544973545, + "loss": 0.9746, + "step": 550 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 1.3588542938232422, + "learning_rate": 0.001851851851851852, + "loss": 0.7973, + "step": 560 + }, + { + "epoch": 0.2261904761904762, + "grad_norm": 1.7744730710983276, + "learning_rate": 0.0018492063492063493, + "loss": 0.887, + "step": 570 + }, + { + "epoch": 0.23015873015873015, + "grad_norm": 0.7673001289367676, + "learning_rate": 0.0018465608465608465, + "loss": 0.7976, + "step": 580 + }, + { + "epoch": 0.23412698412698413, + "grad_norm": 1.4514744281768799, + "learning_rate": 0.001843915343915344, + "loss": 0.7594, + "step": 590 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 1.408557653427124, + "learning_rate": 0.0018412698412698413, + "loss": 0.8519, + "step": 600 + }, + { + "epoch": 0.24206349206349206, + "grad_norm": 1.758348822593689, + "learning_rate": 0.0018386243386243387, + "loss": 1.0071, + "step": 610 + }, + { + "epoch": 0.24603174603174602, + "grad_norm": 1.6447445154190063, + "learning_rate": 0.0018359788359788361, + "loss": 0.7517, + "step": 620 + }, + { + "epoch": 0.25, + "grad_norm": 1.568068027496338, + "learning_rate": 0.0018333333333333333, + "loss": 0.9271, + "step": 630 + }, + { + "epoch": 0.25396825396825395, + "grad_norm": 1.2021923065185547, + "learning_rate": 0.0018306878306878307, + "loss": 1.1121, + "step": 640 + }, + { + "epoch": 0.25793650793650796, + "grad_norm": 2.1598119735717773, + "learning_rate": 0.001828042328042328, + "loss": 0.8373, + "step": 650 + }, + { + "epoch": 0.2619047619047619, + "grad_norm": 1.0078835487365723, + "learning_rate": 0.0018253968253968253, + "loss": 0.8333, + "step": 660 + }, + { + "epoch": 0.26587301587301587, + "grad_norm": 0.9753168225288391, + "learning_rate": 0.001822751322751323, + "loss": 1.0549, + "step": 670 + }, + { + "epoch": 0.2698412698412698, + "grad_norm": 1.491974949836731, + "learning_rate": 0.00182010582010582, + "loss": 1.0431, + "step": 680 + }, + { + "epoch": 0.27380952380952384, + "grad_norm": 1.1669495105743408, + "learning_rate": 0.0018174603174603175, + "loss": 0.9501, + "step": 690 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.8744311332702637, + "learning_rate": 0.001814814814814815, + "loss": 0.7574, + "step": 700 + }, + { + "epoch": 0.28174603174603174, + "grad_norm": 0.619263768196106, + "learning_rate": 0.001812169312169312, + "loss": 0.8899, + "step": 710 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.276594638824463, + "learning_rate": 0.0018095238095238095, + "loss": 0.8611, + "step": 720 + }, + { + "epoch": 0.2896825396825397, + "grad_norm": 1.1073200702667236, + "learning_rate": 0.001806878306878307, + "loss": 0.7205, + "step": 730 + }, + { + "epoch": 0.29365079365079366, + "grad_norm": 1.8631259202957153, + "learning_rate": 0.0018042328042328043, + "loss": 0.9344, + "step": 740 + }, + { + "epoch": 0.2976190476190476, + "grad_norm": 1.406410813331604, + "learning_rate": 0.0018015873015873017, + "loss": 0.8569, + "step": 750 + }, + { + "epoch": 0.30158730158730157, + "grad_norm": 1.26906156539917, + "learning_rate": 0.0017989417989417989, + "loss": 0.7488, + "step": 760 + }, + { + "epoch": 0.3055555555555556, + "grad_norm": 1.0014851093292236, + "learning_rate": 0.0017962962962962963, + "loss": 0.6643, + "step": 770 + }, + { + "epoch": 0.30952380952380953, + "grad_norm": 1.0010994672775269, + "learning_rate": 0.0017936507936507937, + "loss": 0.8164, + "step": 780 + }, + { + "epoch": 0.3134920634920635, + "grad_norm": 1.0928398370742798, + "learning_rate": 0.001791005291005291, + "loss": 0.8814, + "step": 790 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 1.6183459758758545, + "learning_rate": 0.0017883597883597885, + "loss": 1.0908, + "step": 800 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 0.7748919129371643, + "learning_rate": 0.0017857142857142859, + "loss": 0.7623, + "step": 810 + }, + { + "epoch": 0.3253968253968254, + "grad_norm": 1.22903311252594, + "learning_rate": 0.001783068783068783, + "loss": 0.8888, + "step": 820 + }, + { + "epoch": 0.32936507936507936, + "grad_norm": 1.9972559213638306, + "learning_rate": 0.0017804232804232805, + "loss": 0.8963, + "step": 830 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.2421702146530151, + "learning_rate": 0.0017777777777777776, + "loss": 0.7896, + "step": 840 + }, + { + "epoch": 0.3373015873015873, + "grad_norm": 0.676760196685791, + "learning_rate": 0.001775132275132275, + "loss": 0.7917, + "step": 850 + }, + { + "epoch": 0.3412698412698413, + "grad_norm": 2.124894857406616, + "learning_rate": 0.0017724867724867727, + "loss": 0.8721, + "step": 860 + }, + { + "epoch": 0.34523809523809523, + "grad_norm": 1.416979432106018, + "learning_rate": 0.0017698412698412699, + "loss": 0.7461, + "step": 870 + }, + { + "epoch": 0.3492063492063492, + "grad_norm": 0.9547367691993713, + "learning_rate": 0.0017671957671957673, + "loss": 0.6878, + "step": 880 + }, + { + "epoch": 0.3531746031746032, + "grad_norm": 0.814999520778656, + "learning_rate": 0.0017645502645502647, + "loss": 0.7173, + "step": 890 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 1.4335911273956299, + "learning_rate": 0.0017619047619047618, + "loss": 0.9165, + "step": 900 + }, + { + "epoch": 0.3611111111111111, + "grad_norm": 2.418215274810791, + "learning_rate": 0.0017592592592592592, + "loss": 0.8312, + "step": 910 + }, + { + "epoch": 0.36507936507936506, + "grad_norm": 0.6499120593070984, + "learning_rate": 0.0017566137566137566, + "loss": 0.6524, + "step": 920 + }, + { + "epoch": 0.36904761904761907, + "grad_norm": 1.4244420528411865, + "learning_rate": 0.001753968253968254, + "loss": 0.7283, + "step": 930 + }, + { + "epoch": 0.373015873015873, + "grad_norm": 0.9467722177505493, + "learning_rate": 0.0017513227513227514, + "loss": 0.7102, + "step": 940 + }, + { + "epoch": 0.376984126984127, + "grad_norm": 0.9126266241073608, + "learning_rate": 0.0017486772486772486, + "loss": 0.8736, + "step": 950 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.739183783531189, + "learning_rate": 0.001746031746031746, + "loss": 0.7108, + "step": 960 + }, + { + "epoch": 0.38492063492063494, + "grad_norm": 0.7012743949890137, + "learning_rate": 0.0017433862433862434, + "loss": 0.708, + "step": 970 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.4281548261642456, + "learning_rate": 0.0017407407407407408, + "loss": 0.7982, + "step": 980 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 1.850917935371399, + "learning_rate": 0.0017380952380952382, + "loss": 0.7927, + "step": 990 + }, + { + "epoch": 0.3968253968253968, + "grad_norm": 1.2646055221557617, + "learning_rate": 0.0017354497354497354, + "loss": 0.6407, + "step": 1000 + }, + { + "epoch": 0.4007936507936508, + "grad_norm": 2.1877217292785645, + "learning_rate": 0.0017328042328042328, + "loss": 0.636, + "step": 1010 + }, + { + "epoch": 0.40476190476190477, + "grad_norm": 1.4416710138320923, + "learning_rate": 0.0017301587301587302, + "loss": 0.6643, + "step": 1020 + }, + { + "epoch": 0.4087301587301587, + "grad_norm": 0.9752436876296997, + "learning_rate": 0.0017275132275132274, + "loss": 0.7655, + "step": 1030 + }, + { + "epoch": 0.4126984126984127, + "grad_norm": 0.6438788175582886, + "learning_rate": 0.001724867724867725, + "loss": 0.8659, + "step": 1040 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.9634172320365906, + "learning_rate": 0.0017222222222222224, + "loss": 0.8015, + "step": 1050 + }, + { + "epoch": 0.42063492063492064, + "grad_norm": 1.4185785055160522, + "learning_rate": 0.0017195767195767196, + "loss": 0.991, + "step": 1060 + }, + { + "epoch": 0.4246031746031746, + "grad_norm": 1.0508280992507935, + "learning_rate": 0.001716931216931217, + "loss": 0.9047, + "step": 1070 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.1847171783447266, + "learning_rate": 0.0017142857142857142, + "loss": 0.7529, + "step": 1080 + }, + { + "epoch": 0.43253968253968256, + "grad_norm": 0.8445650935173035, + "learning_rate": 0.0017116402116402116, + "loss": 0.6472, + "step": 1090 + }, + { + "epoch": 0.4365079365079365, + "grad_norm": 0.6549813151359558, + "learning_rate": 0.001708994708994709, + "loss": 0.753, + "step": 1100 + }, + { + "epoch": 0.44047619047619047, + "grad_norm": 1.5086162090301514, + "learning_rate": 0.0017063492063492064, + "loss": 0.6774, + "step": 1110 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.5609638690948486, + "learning_rate": 0.0017037037037037038, + "loss": 0.6732, + "step": 1120 + }, + { + "epoch": 0.44841269841269843, + "grad_norm": 1.2099113464355469, + "learning_rate": 0.0017010582010582012, + "loss": 0.7132, + "step": 1130 + }, + { + "epoch": 0.4523809523809524, + "grad_norm": 1.5899118185043335, + "learning_rate": 0.0016984126984126984, + "loss": 1.1286, + "step": 1140 + }, + { + "epoch": 0.45634920634920634, + "grad_norm": 1.4785903692245483, + "learning_rate": 0.0016957671957671958, + "loss": 0.8924, + "step": 1150 + }, + { + "epoch": 0.4603174603174603, + "grad_norm": 1.7442249059677124, + "learning_rate": 0.001693121693121693, + "loss": 0.7868, + "step": 1160 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 1.7168885469436646, + "learning_rate": 0.0016904761904761906, + "loss": 0.7758, + "step": 1170 + }, + { + "epoch": 0.46825396825396826, + "grad_norm": 0.735222339630127, + "learning_rate": 0.001687830687830688, + "loss": 0.9121, + "step": 1180 + }, + { + "epoch": 0.4722222222222222, + "grad_norm": 1.0101484060287476, + "learning_rate": 0.0016851851851851852, + "loss": 0.6875, + "step": 1190 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.8721634149551392, + "learning_rate": 0.0016825396825396826, + "loss": 0.7557, + "step": 1200 + }, + { + "epoch": 0.4801587301587302, + "grad_norm": 1.2771320343017578, + "learning_rate": 0.00167989417989418, + "loss": 0.6477, + "step": 1210 + }, + { + "epoch": 0.48412698412698413, + "grad_norm": 0.573085606098175, + "learning_rate": 0.0016772486772486772, + "loss": 0.7522, + "step": 1220 + }, + { + "epoch": 0.4880952380952381, + "grad_norm": 0.6810621023178101, + "learning_rate": 0.0016746031746031748, + "loss": 0.9133, + "step": 1230 + }, + { + "epoch": 0.49206349206349204, + "grad_norm": 0.5593830347061157, + "learning_rate": 0.001671957671957672, + "loss": 0.6225, + "step": 1240 + }, + { + "epoch": 0.49603174603174605, + "grad_norm": 1.1917506456375122, + "learning_rate": 0.0016693121693121694, + "loss": 0.713, + "step": 1250 + }, + { + "epoch": 0.5, + "grad_norm": 2.748424530029297, + "learning_rate": 0.0016666666666666668, + "loss": 0.6865, + "step": 1260 + }, + { + "epoch": 0.503968253968254, + "grad_norm": 1.4518764019012451, + "learning_rate": 0.001664021164021164, + "loss": 0.6681, + "step": 1270 + }, + { + "epoch": 0.5079365079365079, + "grad_norm": 0.7594536542892456, + "learning_rate": 0.0016613756613756614, + "loss": 0.542, + "step": 1280 + }, + { + "epoch": 0.5119047619047619, + "grad_norm": 0.6531535387039185, + "learning_rate": 0.0016587301587301588, + "loss": 0.6516, + "step": 1290 + }, + { + "epoch": 0.5158730158730159, + "grad_norm": 1.2486604452133179, + "learning_rate": 0.0016560846560846562, + "loss": 0.5894, + "step": 1300 + }, + { + "epoch": 0.5198412698412699, + "grad_norm": 1.1929885149002075, + "learning_rate": 0.0016534391534391536, + "loss": 0.8147, + "step": 1310 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 1.1954102516174316, + "learning_rate": 0.0016507936507936507, + "loss": 0.7889, + "step": 1320 + }, + { + "epoch": 0.5277777777777778, + "grad_norm": 1.271843671798706, + "learning_rate": 0.0016481481481481482, + "loss": 0.5804, + "step": 1330 + }, + { + "epoch": 0.5317460317460317, + "grad_norm": 1.0248411893844604, + "learning_rate": 0.0016455026455026456, + "loss": 0.6674, + "step": 1340 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 0.9981194734573364, + "learning_rate": 0.0016428571428571427, + "loss": 0.7461, + "step": 1350 + }, + { + "epoch": 0.5396825396825397, + "grad_norm": 1.431178331375122, + "learning_rate": 0.0016402116402116404, + "loss": 0.8174, + "step": 1360 + }, + { + "epoch": 0.5436507936507936, + "grad_norm": 1.7068381309509277, + "learning_rate": 0.0016375661375661378, + "loss": 0.7248, + "step": 1370 + }, + { + "epoch": 0.5476190476190477, + "grad_norm": 1.1310241222381592, + "learning_rate": 0.001634920634920635, + "loss": 0.5469, + "step": 1380 + }, + { + "epoch": 0.5515873015873016, + "grad_norm": 0.8217313289642334, + "learning_rate": 0.0016322751322751323, + "loss": 0.689, + "step": 1390 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.0212846994400024, + "learning_rate": 0.0016296296296296295, + "loss": 0.8884, + "step": 1400 + }, + { + "epoch": 0.5595238095238095, + "grad_norm": 0.6781401634216309, + "learning_rate": 0.001626984126984127, + "loss": 0.6765, + "step": 1410 + }, + { + "epoch": 0.5634920634920635, + "grad_norm": 1.3569077253341675, + "learning_rate": 0.0016243386243386245, + "loss": 0.7301, + "step": 1420 + }, + { + "epoch": 0.5674603174603174, + "grad_norm": 1.1712183952331543, + "learning_rate": 0.0016216931216931217, + "loss": 0.7642, + "step": 1430 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.8609166145324707, + "learning_rate": 0.0016190476190476191, + "loss": 0.6985, + "step": 1440 + }, + { + "epoch": 0.5753968253968254, + "grad_norm": 1.7427066564559937, + "learning_rate": 0.0016164021164021165, + "loss": 0.6473, + "step": 1450 + }, + { + "epoch": 0.5793650793650794, + "grad_norm": 0.6781764030456543, + "learning_rate": 0.0016137566137566137, + "loss": 0.6664, + "step": 1460 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 1.3013015985488892, + "learning_rate": 0.0016111111111111111, + "loss": 0.68, + "step": 1470 + }, + { + "epoch": 0.5873015873015873, + "grad_norm": 0.5826123356819153, + "learning_rate": 0.0016084656084656083, + "loss": 0.6387, + "step": 1480 + }, + { + "epoch": 0.5912698412698413, + "grad_norm": 0.5697736144065857, + "learning_rate": 0.001605820105820106, + "loss": 0.6077, + "step": 1490 + }, + { + "epoch": 0.5952380952380952, + "grad_norm": 1.1636980772018433, + "learning_rate": 0.0016031746031746033, + "loss": 0.7455, + "step": 1500 + }, + { + "epoch": 0.5992063492063492, + "grad_norm": 1.3436973094940186, + "learning_rate": 0.0016005291005291005, + "loss": 0.7223, + "step": 1510 + }, + { + "epoch": 0.6031746031746031, + "grad_norm": 0.5332604050636292, + "learning_rate": 0.001597883597883598, + "loss": 0.7019, + "step": 1520 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 1.6034159660339355, + "learning_rate": 0.0015952380952380953, + "loss": 0.7367, + "step": 1530 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 1.0178996324539185, + "learning_rate": 0.0015925925925925925, + "loss": 0.8806, + "step": 1540 + }, + { + "epoch": 0.6150793650793651, + "grad_norm": 2.342480182647705, + "learning_rate": 0.0015899470899470901, + "loss": 0.5731, + "step": 1550 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 3.211264133453369, + "learning_rate": 0.0015873015873015873, + "loss": 0.754, + "step": 1560 + }, + { + "epoch": 0.623015873015873, + "grad_norm": 1.243814468383789, + "learning_rate": 0.0015846560846560847, + "loss": 0.8197, + "step": 1570 + }, + { + "epoch": 0.626984126984127, + "grad_norm": 0.645529568195343, + "learning_rate": 0.001582010582010582, + "loss": 0.8089, + "step": 1580 + }, + { + "epoch": 0.6309523809523809, + "grad_norm": 0.8136078715324402, + "learning_rate": 0.0015793650793650793, + "loss": 0.7806, + "step": 1590 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.820977509021759, + "learning_rate": 0.0015767195767195767, + "loss": 0.5663, + "step": 1600 + }, + { + "epoch": 0.6388888888888888, + "grad_norm": 0.8995200991630554, + "learning_rate": 0.0015740740740740743, + "loss": 0.5782, + "step": 1610 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.7738690376281738, + "learning_rate": 0.0015714285714285715, + "loss": 0.7698, + "step": 1620 + }, + { + "epoch": 0.6468253968253969, + "grad_norm": 0.6192114949226379, + "learning_rate": 0.001568783068783069, + "loss": 0.8364, + "step": 1630 + }, + { + "epoch": 0.6507936507936508, + "grad_norm": 1.5578278303146362, + "learning_rate": 0.001566137566137566, + "loss": 0.8536, + "step": 1640 + }, + { + "epoch": 0.6547619047619048, + "grad_norm": 1.2771915197372437, + "learning_rate": 0.0015634920634920635, + "loss": 0.5684, + "step": 1650 + }, + { + "epoch": 0.6587301587301587, + "grad_norm": 0.8459761738777161, + "learning_rate": 0.0015608465608465609, + "loss": 0.6247, + "step": 1660 + }, + { + "epoch": 0.6626984126984127, + "grad_norm": 1.2737908363342285, + "learning_rate": 0.0015582010582010583, + "loss": 0.6484, + "step": 1670 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.2066800594329834, + "learning_rate": 0.0015555555555555557, + "loss": 0.669, + "step": 1680 + }, + { + "epoch": 0.6706349206349206, + "grad_norm": 1.2899553775787354, + "learning_rate": 0.001552910052910053, + "loss": 0.6382, + "step": 1690 + }, + { + "epoch": 0.6746031746031746, + "grad_norm": 1.2886145114898682, + "learning_rate": 0.0015502645502645503, + "loss": 0.6506, + "step": 1700 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 1.1416516304016113, + "learning_rate": 0.0015476190476190477, + "loss": 0.6058, + "step": 1710 + }, + { + "epoch": 0.6825396825396826, + "grad_norm": 1.1607820987701416, + "learning_rate": 0.0015449735449735449, + "loss": 0.7332, + "step": 1720 + }, + { + "epoch": 0.6865079365079365, + "grad_norm": 1.350909948348999, + "learning_rate": 0.0015423280423280423, + "loss": 0.9355, + "step": 1730 + }, + { + "epoch": 0.6904761904761905, + "grad_norm": 0.5112187266349792, + "learning_rate": 0.0015396825396825399, + "loss": 0.7205, + "step": 1740 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.9334474205970764, + "learning_rate": 0.001537037037037037, + "loss": 0.6192, + "step": 1750 + }, + { + "epoch": 0.6984126984126984, + "grad_norm": 0.635986864566803, + "learning_rate": 0.0015343915343915345, + "loss": 0.6414, + "step": 1760 + }, + { + "epoch": 0.7023809523809523, + "grad_norm": 1.4325546026229858, + "learning_rate": 0.0015317460317460319, + "loss": 0.7799, + "step": 1770 + }, + { + "epoch": 0.7063492063492064, + "grad_norm": 0.814120352268219, + "learning_rate": 0.001529100529100529, + "loss": 0.6072, + "step": 1780 + }, + { + "epoch": 0.7103174603174603, + "grad_norm": 0.9185436367988586, + "learning_rate": 0.0015264550264550265, + "loss": 0.728, + "step": 1790 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.8025707602500916, + "learning_rate": 0.0015238095238095239, + "loss": 0.7527, + "step": 1800 + }, + { + "epoch": 0.7182539682539683, + "grad_norm": 0.816798985004425, + "learning_rate": 0.0015211640211640213, + "loss": 0.5058, + "step": 1810 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.8499689698219299, + "learning_rate": 0.0015185185185185187, + "loss": 0.5843, + "step": 1820 + }, + { + "epoch": 0.7261904761904762, + "grad_norm": 1.3355066776275635, + "learning_rate": 0.0015158730158730158, + "loss": 0.7095, + "step": 1830 + }, + { + "epoch": 0.7301587301587301, + "grad_norm": 1.4383025169372559, + "learning_rate": 0.0015132275132275132, + "loss": 0.7277, + "step": 1840 + }, + { + "epoch": 0.7341269841269841, + "grad_norm": 1.1233898401260376, + "learning_rate": 0.0015105820105820106, + "loss": 0.5746, + "step": 1850 + }, + { + "epoch": 0.7380952380952381, + "grad_norm": 0.6341880559921265, + "learning_rate": 0.001507936507936508, + "loss": 0.7063, + "step": 1860 + }, + { + "epoch": 0.7420634920634921, + "grad_norm": 0.8784427642822266, + "learning_rate": 0.0015052910052910054, + "loss": 0.5603, + "step": 1870 + }, + { + "epoch": 0.746031746031746, + "grad_norm": 1.2914808988571167, + "learning_rate": 0.0015026455026455026, + "loss": 0.6714, + "step": 1880 + }, + { + "epoch": 0.75, + "grad_norm": 0.7286548018455505, + "learning_rate": 0.0015, + "loss": 0.6926, + "step": 1890 + }, + { + "epoch": 0.753968253968254, + "grad_norm": 0.6523261070251465, + "learning_rate": 0.0014973544973544974, + "loss": 0.6169, + "step": 1900 + }, + { + "epoch": 0.7579365079365079, + "grad_norm": 0.971722424030304, + "learning_rate": 0.0014947089947089946, + "loss": 0.8645, + "step": 1910 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.7515843510627747, + "learning_rate": 0.001492063492063492, + "loss": 0.5931, + "step": 1920 + }, + { + "epoch": 0.7658730158730159, + "grad_norm": 0.8675608038902283, + "learning_rate": 0.0014894179894179894, + "loss": 0.5703, + "step": 1930 + }, + { + "epoch": 0.7698412698412699, + "grad_norm": 1.131606101989746, + "learning_rate": 0.0014867724867724868, + "loss": 0.6542, + "step": 1940 + }, + { + "epoch": 0.7738095238095238, + "grad_norm": 1.4298430681228638, + "learning_rate": 0.0014841269841269842, + "loss": 1.127, + "step": 1950 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.9001014828681946, + "learning_rate": 0.0014814814814814814, + "loss": 0.6976, + "step": 1960 + }, + { + "epoch": 0.7817460317460317, + "grad_norm": 0.9711846113204956, + "learning_rate": 0.0014788359788359788, + "loss": 0.6721, + "step": 1970 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.6609967350959778, + "learning_rate": 0.0014761904761904762, + "loss": 0.6133, + "step": 1980 + }, + { + "epoch": 0.7896825396825397, + "grad_norm": 1.0555015802383423, + "learning_rate": 0.0014735449735449736, + "loss": 0.7108, + "step": 1990 + }, + { + "epoch": 0.7936507936507936, + "grad_norm": 1.7377722263336182, + "learning_rate": 0.001470899470899471, + "loss": 0.5734, + "step": 2000 + }, + { + "epoch": 0.7976190476190477, + "grad_norm": 0.48703524470329285, + "learning_rate": 0.0014682539682539682, + "loss": 0.509, + "step": 2010 + }, + { + "epoch": 0.8015873015873016, + "grad_norm": 0.7599615454673767, + "learning_rate": 0.0014656084656084656, + "loss": 0.6224, + "step": 2020 + }, + { + "epoch": 0.8055555555555556, + "grad_norm": 1.351830005645752, + "learning_rate": 0.001462962962962963, + "loss": 0.6759, + "step": 2030 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.7260966897010803, + "learning_rate": 0.0014603174603174602, + "loss": 0.7076, + "step": 2040 + }, + { + "epoch": 0.8134920634920635, + "grad_norm": 1.2171436548233032, + "learning_rate": 0.0014576719576719578, + "loss": 0.6794, + "step": 2050 + }, + { + "epoch": 0.8174603174603174, + "grad_norm": 0.6401930451393127, + "learning_rate": 0.0014550264550264552, + "loss": 0.5448, + "step": 2060 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 1.0115227699279785, + "learning_rate": 0.0014523809523809524, + "loss": 0.7069, + "step": 2070 + }, + { + "epoch": 0.8253968253968254, + "grad_norm": 1.0564064979553223, + "learning_rate": 0.0014497354497354498, + "loss": 0.5207, + "step": 2080 + }, + { + "epoch": 0.8293650793650794, + "grad_norm": 1.908964991569519, + "learning_rate": 0.001447089947089947, + "loss": 0.7147, + "step": 2090 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 1.2274842262268066, + "learning_rate": 0.0014444444444444444, + "loss": 0.7366, + "step": 2100 + }, + { + "epoch": 0.8373015873015873, + "grad_norm": 0.8221492767333984, + "learning_rate": 0.0014417989417989418, + "loss": 0.5421, + "step": 2110 + }, + { + "epoch": 0.8412698412698413, + "grad_norm": 1.3362950086593628, + "learning_rate": 0.0014391534391534392, + "loss": 0.7575, + "step": 2120 + }, + { + "epoch": 0.8452380952380952, + "grad_norm": 0.8134861588478088, + "learning_rate": 0.0014365079365079366, + "loss": 0.6713, + "step": 2130 + }, + { + "epoch": 0.8492063492063492, + "grad_norm": 0.650597095489502, + "learning_rate": 0.001433862433862434, + "loss": 0.8714, + "step": 2140 + }, + { + "epoch": 0.8531746031746031, + "grad_norm": 1.5303138494491577, + "learning_rate": 0.0014312169312169312, + "loss": 0.6425, + "step": 2150 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.0913094282150269, + "learning_rate": 0.0014285714285714286, + "loss": 0.8642, + "step": 2160 + }, + { + "epoch": 0.8611111111111112, + "grad_norm": 0.6576964259147644, + "learning_rate": 0.0014259259259259258, + "loss": 0.5981, + "step": 2170 + }, + { + "epoch": 0.8650793650793651, + "grad_norm": 1.4192836284637451, + "learning_rate": 0.0014232804232804234, + "loss": 0.8404, + "step": 2180 + }, + { + "epoch": 0.8690476190476191, + "grad_norm": 1.345991611480713, + "learning_rate": 0.0014206349206349208, + "loss": 0.6921, + "step": 2190 + }, + { + "epoch": 0.873015873015873, + "grad_norm": 1.310991644859314, + "learning_rate": 0.001417989417989418, + "loss": 0.7689, + "step": 2200 + }, + { + "epoch": 0.876984126984127, + "grad_norm": 1.0328586101531982, + "learning_rate": 0.0014153439153439154, + "loss": 0.5498, + "step": 2210 + }, + { + "epoch": 0.8809523809523809, + "grad_norm": 1.0331602096557617, + "learning_rate": 0.0014126984126984128, + "loss": 0.7736, + "step": 2220 + }, + { + "epoch": 0.8849206349206349, + "grad_norm": 0.9896045327186584, + "learning_rate": 0.00141005291005291, + "loss": 0.6907, + "step": 2230 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7972356677055359, + "learning_rate": 0.0014074074074074076, + "loss": 0.6476, + "step": 2240 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 1.479012131690979, + "learning_rate": 0.0014047619047619047, + "loss": 0.6944, + "step": 2250 + }, + { + "epoch": 0.8968253968253969, + "grad_norm": 0.8372600674629211, + "learning_rate": 0.0014021164021164022, + "loss": 0.7234, + "step": 2260 + }, + { + "epoch": 0.9007936507936508, + "grad_norm": 0.9432483911514282, + "learning_rate": 0.0013994708994708996, + "loss": 0.5945, + "step": 2270 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 1.3562203645706177, + "learning_rate": 0.0013968253968253967, + "loss": 0.6736, + "step": 2280 + }, + { + "epoch": 0.9087301587301587, + "grad_norm": 1.511753797531128, + "learning_rate": 0.0013941798941798941, + "loss": 0.7797, + "step": 2290 + }, + { + "epoch": 0.9126984126984127, + "grad_norm": 1.4588041305541992, + "learning_rate": 0.0013915343915343918, + "loss": 0.6369, + "step": 2300 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 1.3627748489379883, + "learning_rate": 0.001388888888888889, + "loss": 0.5948, + "step": 2310 + }, + { + "epoch": 0.9206349206349206, + "grad_norm": 0.9026773571968079, + "learning_rate": 0.0013862433862433863, + "loss": 0.8748, + "step": 2320 + }, + { + "epoch": 0.9246031746031746, + "grad_norm": 2.1526966094970703, + "learning_rate": 0.0013835978835978835, + "loss": 0.7363, + "step": 2330 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.6556802988052368, + "learning_rate": 0.001380952380952381, + "loss": 0.6449, + "step": 2340 + }, + { + "epoch": 0.9325396825396826, + "grad_norm": 1.622631549835205, + "learning_rate": 0.0013783068783068783, + "loss": 0.6811, + "step": 2350 + }, + { + "epoch": 0.9365079365079365, + "grad_norm": 1.133255124092102, + "learning_rate": 0.0013756613756613755, + "loss": 0.7778, + "step": 2360 + }, + { + "epoch": 0.9404761904761905, + "grad_norm": 1.2756290435791016, + "learning_rate": 0.0013730158730158731, + "loss": 0.6244, + "step": 2370 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.6911134719848633, + "learning_rate": 0.0013703703703703705, + "loss": 0.6354, + "step": 2380 + }, + { + "epoch": 0.9484126984126984, + "grad_norm": 1.2925828695297241, + "learning_rate": 0.0013677248677248677, + "loss": 0.8165, + "step": 2390 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.8971231579780579, + "learning_rate": 0.0013650793650793651, + "loss": 0.6201, + "step": 2400 + }, + { + "epoch": 0.9563492063492064, + "grad_norm": 0.7667912244796753, + "learning_rate": 0.0013624338624338623, + "loss": 0.5925, + "step": 2410 + }, + { + "epoch": 0.9603174603174603, + "grad_norm": 2.7550241947174072, + "learning_rate": 0.0013597883597883597, + "loss": 0.6813, + "step": 2420 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 0.8356139659881592, + "learning_rate": 0.0013571428571428573, + "loss": 0.526, + "step": 2430 + }, + { + "epoch": 0.9682539682539683, + "grad_norm": 1.1391632556915283, + "learning_rate": 0.0013544973544973545, + "loss": 0.63, + "step": 2440 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.9076061248779297, + "learning_rate": 0.001351851851851852, + "loss": 0.6514, + "step": 2450 + }, + { + "epoch": 0.9761904761904762, + "grad_norm": 0.8281214237213135, + "learning_rate": 0.0013492063492063493, + "loss": 0.6106, + "step": 2460 + }, + { + "epoch": 0.9801587301587301, + "grad_norm": 0.7302814722061157, + "learning_rate": 0.0013465608465608465, + "loss": 0.5481, + "step": 2470 + }, + { + "epoch": 0.9841269841269841, + "grad_norm": 0.7019608020782471, + "learning_rate": 0.001343915343915344, + "loss": 0.6074, + "step": 2480 + }, + { + "epoch": 0.9880952380952381, + "grad_norm": 0.6560447812080383, + "learning_rate": 0.0013412698412698413, + "loss": 0.7778, + "step": 2490 + }, + { + "epoch": 0.9920634920634921, + "grad_norm": 0.9499639868736267, + "learning_rate": 0.0013386243386243387, + "loss": 0.6651, + "step": 2500 + }, + { + "epoch": 0.996031746031746, + "grad_norm": 0.9400144815444946, + "learning_rate": 0.001335978835978836, + "loss": 0.5866, + "step": 2510 + }, + { + "epoch": 1.0, + "grad_norm": 1.7043092250823975, + "learning_rate": 0.0013333333333333333, + "loss": 0.7614, + "step": 2520 + }, + { + "epoch": 1.003968253968254, + "grad_norm": 1.06705641746521, + "learning_rate": 0.0013306878306878307, + "loss": 0.4675, + "step": 2530 + }, + { + "epoch": 1.007936507936508, + "grad_norm": 1.0158171653747559, + "learning_rate": 0.001328042328042328, + "loss": 0.6015, + "step": 2540 + }, + { + "epoch": 1.0119047619047619, + "grad_norm": 0.9164927005767822, + "learning_rate": 0.0013253968253968253, + "loss": 0.5214, + "step": 2550 + }, + { + "epoch": 1.0158730158730158, + "grad_norm": 0.9178239703178406, + "learning_rate": 0.001322751322751323, + "loss": 0.5135, + "step": 2560 + }, + { + "epoch": 1.0198412698412698, + "grad_norm": 1.3159326314926147, + "learning_rate": 0.00132010582010582, + "loss": 0.6184, + "step": 2570 + }, + { + "epoch": 1.0238095238095237, + "grad_norm": 1.290663719177246, + "learning_rate": 0.0013174603174603175, + "loss": 0.5791, + "step": 2580 + }, + { + "epoch": 1.0277777777777777, + "grad_norm": 0.8518033027648926, + "learning_rate": 0.0013148148148148149, + "loss": 0.3943, + "step": 2590 + }, + { + "epoch": 1.0317460317460316, + "grad_norm": 0.523811399936676, + "learning_rate": 0.001312169312169312, + "loss": 0.4826, + "step": 2600 + }, + { + "epoch": 1.0357142857142858, + "grad_norm": 2.362725257873535, + "learning_rate": 0.0013095238095238095, + "loss": 0.6, + "step": 2610 + }, + { + "epoch": 1.0396825396825398, + "grad_norm": 0.7334272861480713, + "learning_rate": 0.001306878306878307, + "loss": 0.5216, + "step": 2620 + }, + { + "epoch": 1.0436507936507937, + "grad_norm": 1.4758929014205933, + "learning_rate": 0.0013042328042328043, + "loss": 0.5011, + "step": 2630 + }, + { + "epoch": 1.0476190476190477, + "grad_norm": 1.296991229057312, + "learning_rate": 0.0013015873015873017, + "loss": 0.5896, + "step": 2640 + }, + { + "epoch": 1.0515873015873016, + "grad_norm": 0.6447119116783142, + "learning_rate": 0.0012989417989417989, + "loss": 0.4612, + "step": 2650 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 1.2804654836654663, + "learning_rate": 0.0012962962962962963, + "loss": 0.5531, + "step": 2660 + }, + { + "epoch": 1.0595238095238095, + "grad_norm": 0.6714935898780823, + "learning_rate": 0.0012936507936507937, + "loss": 0.4928, + "step": 2670 + }, + { + "epoch": 1.0634920634920635, + "grad_norm": 2.083782434463501, + "learning_rate": 0.001291005291005291, + "loss": 0.3696, + "step": 2680 + }, + { + "epoch": 1.0674603174603174, + "grad_norm": 1.4924397468566895, + "learning_rate": 0.0012883597883597885, + "loss": 0.4776, + "step": 2690 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.8140655159950256, + "learning_rate": 0.0012857142857142859, + "loss": 0.4731, + "step": 2700 + }, + { + "epoch": 1.0753968253968254, + "grad_norm": 0.47565603256225586, + "learning_rate": 0.001283068783068783, + "loss": 0.6289, + "step": 2710 + }, + { + "epoch": 1.0793650793650793, + "grad_norm": 1.3005656003952026, + "learning_rate": 0.0012804232804232805, + "loss": 0.5688, + "step": 2720 + }, + { + "epoch": 1.0833333333333333, + "grad_norm": 1.2472827434539795, + "learning_rate": 0.0012777777777777776, + "loss": 0.6273, + "step": 2730 + }, + { + "epoch": 1.0873015873015872, + "grad_norm": 1.0685155391693115, + "learning_rate": 0.001275132275132275, + "loss": 0.5222, + "step": 2740 + }, + { + "epoch": 1.0912698412698412, + "grad_norm": 1.2605559825897217, + "learning_rate": 0.0012724867724867727, + "loss": 0.4724, + "step": 2750 + }, + { + "epoch": 1.0952380952380953, + "grad_norm": 0.9913002848625183, + "learning_rate": 0.0012698412698412698, + "loss": 0.5158, + "step": 2760 + }, + { + "epoch": 1.0992063492063493, + "grad_norm": 0.5711252093315125, + "learning_rate": 0.0012671957671957672, + "loss": 0.4382, + "step": 2770 + }, + { + "epoch": 1.1031746031746033, + "grad_norm": 1.4559530019760132, + "learning_rate": 0.0012645502645502646, + "loss": 0.7059, + "step": 2780 + }, + { + "epoch": 1.1071428571428572, + "grad_norm": 0.9595462083816528, + "learning_rate": 0.0012619047619047618, + "loss": 0.5346, + "step": 2790 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.7950549721717834, + "learning_rate": 0.0012592592592592592, + "loss": 0.4881, + "step": 2800 + }, + { + "epoch": 1.1150793650793651, + "grad_norm": 1.297609567642212, + "learning_rate": 0.0012566137566137566, + "loss": 0.4305, + "step": 2810 + }, + { + "epoch": 1.119047619047619, + "grad_norm": 0.741604745388031, + "learning_rate": 0.001253968253968254, + "loss": 0.6249, + "step": 2820 + }, + { + "epoch": 1.123015873015873, + "grad_norm": 1.4942420721054077, + "learning_rate": 0.0012513227513227514, + "loss": 0.4769, + "step": 2830 + }, + { + "epoch": 1.126984126984127, + "grad_norm": 1.299843192100525, + "learning_rate": 0.0012486772486772486, + "loss": 0.5311, + "step": 2840 + }, + { + "epoch": 1.130952380952381, + "grad_norm": 0.5215968489646912, + "learning_rate": 0.001246031746031746, + "loss": 0.4032, + "step": 2850 + }, + { + "epoch": 1.1349206349206349, + "grad_norm": 0.9502798914909363, + "learning_rate": 0.0012433862433862434, + "loss": 0.4089, + "step": 2860 + }, + { + "epoch": 1.1388888888888888, + "grad_norm": 0.5403910279273987, + "learning_rate": 0.0012407407407407408, + "loss": 0.6482, + "step": 2870 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.0824073553085327, + "learning_rate": 0.0012380952380952382, + "loss": 0.6369, + "step": 2880 + }, + { + "epoch": 1.1468253968253967, + "grad_norm": 0.7724151015281677, + "learning_rate": 0.0012354497354497354, + "loss": 0.7722, + "step": 2890 + }, + { + "epoch": 1.1507936507936507, + "grad_norm": 1.6870607137680054, + "learning_rate": 0.0012328042328042328, + "loss": 0.5139, + "step": 2900 + }, + { + "epoch": 1.1547619047619047, + "grad_norm": 1.8609074354171753, + "learning_rate": 0.0012301587301587302, + "loss": 0.5745, + "step": 2910 + }, + { + "epoch": 1.1587301587301586, + "grad_norm": 0.664623498916626, + "learning_rate": 0.0012275132275132274, + "loss": 0.6334, + "step": 2920 + }, + { + "epoch": 1.1626984126984128, + "grad_norm": 0.836618959903717, + "learning_rate": 0.001224867724867725, + "loss": 0.5948, + "step": 2930 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.8063789010047913, + "learning_rate": 0.0012222222222222224, + "loss": 0.5868, + "step": 2940 + }, + { + "epoch": 1.1706349206349207, + "grad_norm": 1.02044677734375, + "learning_rate": 0.0012195767195767196, + "loss": 0.5168, + "step": 2950 + }, + { + "epoch": 1.1746031746031746, + "grad_norm": 0.7230445742607117, + "learning_rate": 0.001216931216931217, + "loss": 0.4973, + "step": 2960 + }, + { + "epoch": 1.1785714285714286, + "grad_norm": 1.4907546043395996, + "learning_rate": 0.0012142857142857142, + "loss": 0.5717, + "step": 2970 + }, + { + "epoch": 1.1825396825396826, + "grad_norm": 0.5981312394142151, + "learning_rate": 0.0012116402116402116, + "loss": 0.5287, + "step": 2980 + }, + { + "epoch": 1.1865079365079365, + "grad_norm": 1.6976572275161743, + "learning_rate": 0.001208994708994709, + "loss": 0.4958, + "step": 2990 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 1.2186094522476196, + "learning_rate": 0.0012063492063492064, + "loss": 0.5103, + "step": 3000 + }, + { + "epoch": 1.1944444444444444, + "grad_norm": 2.3313498497009277, + "learning_rate": 0.0012037037037037038, + "loss": 0.3989, + "step": 3010 + }, + { + "epoch": 1.1984126984126984, + "grad_norm": 0.8640299439430237, + "learning_rate": 0.0012010582010582012, + "loss": 0.5046, + "step": 3020 + }, + { + "epoch": 1.2023809523809523, + "grad_norm": 0.7302188277244568, + "learning_rate": 0.0011984126984126984, + "loss": 0.4876, + "step": 3030 + }, + { + "epoch": 1.2063492063492063, + "grad_norm": 0.6321560740470886, + "learning_rate": 0.0011957671957671958, + "loss": 0.5512, + "step": 3040 + }, + { + "epoch": 1.2103174603174602, + "grad_norm": 1.4281076192855835, + "learning_rate": 0.001193121693121693, + "loss": 0.6275, + "step": 3050 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 1.3028194904327393, + "learning_rate": 0.0011904761904761906, + "loss": 0.4403, + "step": 3060 + }, + { + "epoch": 1.2182539682539684, + "grad_norm": 1.7041105031967163, + "learning_rate": 0.001187830687830688, + "loss": 0.6193, + "step": 3070 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.6587647199630737, + "learning_rate": 0.0011851851851851852, + "loss": 0.4893, + "step": 3080 + }, + { + "epoch": 1.2261904761904763, + "grad_norm": 1.2939643859863281, + "learning_rate": 0.0011825396825396826, + "loss": 0.6198, + "step": 3090 + }, + { + "epoch": 1.2301587301587302, + "grad_norm": 0.5572563409805298, + "learning_rate": 0.00117989417989418, + "loss": 0.5438, + "step": 3100 + }, + { + "epoch": 1.2341269841269842, + "grad_norm": 0.7885312438011169, + "learning_rate": 0.0011772486772486772, + "loss": 0.4951, + "step": 3110 + }, + { + "epoch": 1.2380952380952381, + "grad_norm": 0.7055696249008179, + "learning_rate": 0.0011746031746031748, + "loss": 0.6549, + "step": 3120 + }, + { + "epoch": 1.242063492063492, + "grad_norm": 0.9367688894271851, + "learning_rate": 0.001171957671957672, + "loss": 0.3874, + "step": 3130 + }, + { + "epoch": 1.246031746031746, + "grad_norm": 1.2354093790054321, + "learning_rate": 0.0011693121693121694, + "loss": 0.5545, + "step": 3140 + }, + { + "epoch": 1.25, + "grad_norm": 1.2741392850875854, + "learning_rate": 0.0011666666666666668, + "loss": 0.5277, + "step": 3150 + }, + { + "epoch": 1.253968253968254, + "grad_norm": 0.9361393451690674, + "learning_rate": 0.001164021164021164, + "loss": 0.5458, + "step": 3160 + }, + { + "epoch": 1.257936507936508, + "grad_norm": 1.4866970777511597, + "learning_rate": 0.0011613756613756613, + "loss": 0.5137, + "step": 3170 + }, + { + "epoch": 1.2619047619047619, + "grad_norm": 0.6895744800567627, + "learning_rate": 0.0011587301587301588, + "loss": 0.4681, + "step": 3180 + }, + { + "epoch": 1.2658730158730158, + "grad_norm": 1.1036232709884644, + "learning_rate": 0.0011560846560846562, + "loss": 0.5561, + "step": 3190 + }, + { + "epoch": 1.2698412698412698, + "grad_norm": 0.5537109375, + "learning_rate": 0.0011534391534391536, + "loss": 0.4245, + "step": 3200 + }, + { + "epoch": 1.2738095238095237, + "grad_norm": 1.1008318662643433, + "learning_rate": 0.0011507936507936507, + "loss": 0.5559, + "step": 3210 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 1.5348010063171387, + "learning_rate": 0.0011481481481481481, + "loss": 0.6614, + "step": 3220 + }, + { + "epoch": 1.2817460317460316, + "grad_norm": 0.7859022617340088, + "learning_rate": 0.0011455026455026455, + "loss": 0.5489, + "step": 3230 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.6240460872650146, + "learning_rate": 0.0011428571428571427, + "loss": 0.6612, + "step": 3240 + }, + { + "epoch": 1.2896825396825398, + "grad_norm": 1.0166038274765015, + "learning_rate": 0.0011402116402116403, + "loss": 0.4639, + "step": 3250 + }, + { + "epoch": 1.2936507936507937, + "grad_norm": 2.2691445350646973, + "learning_rate": 0.0011375661375661377, + "loss": 0.5383, + "step": 3260 + }, + { + "epoch": 1.2976190476190477, + "grad_norm": 0.6648916602134705, + "learning_rate": 0.001134920634920635, + "loss": 0.6158, + "step": 3270 + }, + { + "epoch": 1.3015873015873016, + "grad_norm": 0.6790510416030884, + "learning_rate": 0.0011322751322751323, + "loss": 0.5251, + "step": 3280 + }, + { + "epoch": 1.3055555555555556, + "grad_norm": 0.5222778916358948, + "learning_rate": 0.0011296296296296295, + "loss": 0.4507, + "step": 3290 + }, + { + "epoch": 1.3095238095238095, + "grad_norm": 1.31193208694458, + "learning_rate": 0.001126984126984127, + "loss": 0.6471, + "step": 3300 + }, + { + "epoch": 1.3134920634920635, + "grad_norm": 0.7240389585494995, + "learning_rate": 0.0011243386243386245, + "loss": 0.492, + "step": 3310 + }, + { + "epoch": 1.3174603174603174, + "grad_norm": 1.4572322368621826, + "learning_rate": 0.0011216931216931217, + "loss": 0.636, + "step": 3320 + }, + { + "epoch": 1.3214285714285714, + "grad_norm": 0.7390062212944031, + "learning_rate": 0.0011190476190476191, + "loss": 0.5134, + "step": 3330 + }, + { + "epoch": 1.3253968253968254, + "grad_norm": 0.9129742383956909, + "learning_rate": 0.0011164021164021165, + "loss": 0.5521, + "step": 3340 + }, + { + "epoch": 1.3293650793650793, + "grad_norm": 0.9507137537002563, + "learning_rate": 0.0011137566137566137, + "loss": 0.5191, + "step": 3350 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.7048954367637634, + "learning_rate": 0.0011111111111111111, + "loss": 0.4399, + "step": 3360 + }, + { + "epoch": 1.3373015873015874, + "grad_norm": 1.2110259532928467, + "learning_rate": 0.0011084656084656083, + "loss": 0.5302, + "step": 3370 + }, + { + "epoch": 1.3412698412698414, + "grad_norm": 1.2376341819763184, + "learning_rate": 0.001105820105820106, + "loss": 0.535, + "step": 3380 + }, + { + "epoch": 1.3452380952380953, + "grad_norm": 1.2114317417144775, + "learning_rate": 0.0011031746031746033, + "loss": 0.4426, + "step": 3390 + }, + { + "epoch": 1.3492063492063493, + "grad_norm": 1.3357186317443848, + "learning_rate": 0.0011005291005291005, + "loss": 0.5123, + "step": 3400 + }, + { + "epoch": 1.3531746031746033, + "grad_norm": 1.4146705865859985, + "learning_rate": 0.001097883597883598, + "loss": 0.6128, + "step": 3410 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.6163337230682373, + "learning_rate": 0.0010952380952380953, + "loss": 0.537, + "step": 3420 + }, + { + "epoch": 1.3611111111111112, + "grad_norm": 1.9845856428146362, + "learning_rate": 0.0010925925925925925, + "loss": 0.5789, + "step": 3430 + }, + { + "epoch": 1.3650793650793651, + "grad_norm": 0.7714751958847046, + "learning_rate": 0.00108994708994709, + "loss": 0.4769, + "step": 3440 + }, + { + "epoch": 1.369047619047619, + "grad_norm": 1.3484938144683838, + "learning_rate": 0.0010873015873015873, + "loss": 0.5798, + "step": 3450 + }, + { + "epoch": 1.373015873015873, + "grad_norm": 0.9264288544654846, + "learning_rate": 0.0010846560846560847, + "loss": 0.4747, + "step": 3460 + }, + { + "epoch": 1.376984126984127, + "grad_norm": 0.6862549185752869, + "learning_rate": 0.001082010582010582, + "loss": 0.4168, + "step": 3470 + }, + { + "epoch": 1.380952380952381, + "grad_norm": 0.9308891296386719, + "learning_rate": 0.0010793650793650793, + "loss": 0.4471, + "step": 3480 + }, + { + "epoch": 1.3849206349206349, + "grad_norm": 0.7059733867645264, + "learning_rate": 0.0010767195767195767, + "loss": 0.4222, + "step": 3490 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 1.4370836019515991, + "learning_rate": 0.0010740740740740743, + "loss": 0.4517, + "step": 3500 + }, + { + "epoch": 1.3928571428571428, + "grad_norm": 1.125847578048706, + "learning_rate": 0.0010714285714285715, + "loss": 0.6114, + "step": 3510 + }, + { + "epoch": 1.3968253968253967, + "grad_norm": 0.4711201786994934, + "learning_rate": 0.0010687830687830689, + "loss": 0.5844, + "step": 3520 + }, + { + "epoch": 1.4007936507936507, + "grad_norm": 1.1563987731933594, + "learning_rate": 0.001066137566137566, + "loss": 0.6552, + "step": 3530 + }, + { + "epoch": 1.4047619047619047, + "grad_norm": 0.5372576117515564, + "learning_rate": 0.0010634920634920635, + "loss": 0.4234, + "step": 3540 + }, + { + "epoch": 1.4087301587301586, + "grad_norm": 0.683944821357727, + "learning_rate": 0.0010608465608465609, + "loss": 0.5938, + "step": 3550 + }, + { + "epoch": 1.4126984126984126, + "grad_norm": 0.6815638542175293, + "learning_rate": 0.0010582010582010583, + "loss": 0.4907, + "step": 3560 + }, + { + "epoch": 1.4166666666666667, + "grad_norm": 1.6569042205810547, + "learning_rate": 0.0010555555555555557, + "loss": 0.5975, + "step": 3570 + }, + { + "epoch": 1.4206349206349207, + "grad_norm": 1.1780049800872803, + "learning_rate": 0.001052910052910053, + "loss": 0.4291, + "step": 3580 + }, + { + "epoch": 1.4246031746031746, + "grad_norm": 0.6545954346656799, + "learning_rate": 0.0010502645502645503, + "loss": 0.4288, + "step": 3590 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.9560195207595825, + "learning_rate": 0.0010476190476190477, + "loss": 0.4316, + "step": 3600 + }, + { + "epoch": 1.4325396825396826, + "grad_norm": 0.9896324276924133, + "learning_rate": 0.0010449735449735448, + "loss": 0.5869, + "step": 3610 + }, + { + "epoch": 1.4365079365079365, + "grad_norm": 1.3985390663146973, + "learning_rate": 0.0010423280423280422, + "loss": 0.4652, + "step": 3620 + }, + { + "epoch": 1.4404761904761905, + "grad_norm": 1.3849400281906128, + "learning_rate": 0.0010396825396825399, + "loss": 0.6423, + "step": 3630 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.9819910526275635, + "learning_rate": 0.001037037037037037, + "loss": 0.5072, + "step": 3640 + }, + { + "epoch": 1.4484126984126984, + "grad_norm": 0.9809389710426331, + "learning_rate": 0.0010343915343915345, + "loss": 0.4127, + "step": 3650 + }, + { + "epoch": 1.4523809523809523, + "grad_norm": 1.3953092098236084, + "learning_rate": 0.0010317460317460319, + "loss": 0.5313, + "step": 3660 + }, + { + "epoch": 1.4563492063492063, + "grad_norm": 1.1159425973892212, + "learning_rate": 0.001029100529100529, + "loss": 0.4939, + "step": 3670 + }, + { + "epoch": 1.4603174603174602, + "grad_norm": 0.5379135608673096, + "learning_rate": 0.0010264550264550264, + "loss": 0.4107, + "step": 3680 + }, + { + "epoch": 1.4642857142857144, + "grad_norm": 1.1204336881637573, + "learning_rate": 0.0010238095238095238, + "loss": 0.4742, + "step": 3690 + }, + { + "epoch": 1.4682539682539684, + "grad_norm": 0.8563843369483948, + "learning_rate": 0.0010211640211640212, + "loss": 0.5383, + "step": 3700 + }, + { + "epoch": 1.4722222222222223, + "grad_norm": 0.7000299096107483, + "learning_rate": 0.0010185185185185186, + "loss": 0.3803, + "step": 3710 + }, + { + "epoch": 1.4761904761904763, + "grad_norm": 1.4893783330917358, + "learning_rate": 0.0010158730158730158, + "loss": 0.6016, + "step": 3720 + }, + { + "epoch": 1.4801587301587302, + "grad_norm": 0.5601296424865723, + "learning_rate": 0.0010132275132275132, + "loss": 0.3314, + "step": 3730 + }, + { + "epoch": 1.4841269841269842, + "grad_norm": 0.5450819730758667, + "learning_rate": 0.0010105820105820106, + "loss": 0.5151, + "step": 3740 + }, + { + "epoch": 1.4880952380952381, + "grad_norm": 0.6305513381958008, + "learning_rate": 0.001007936507936508, + "loss": 0.5042, + "step": 3750 + }, + { + "epoch": 1.492063492063492, + "grad_norm": 1.2684389352798462, + "learning_rate": 0.0010052910052910054, + "loss": 0.5481, + "step": 3760 + }, + { + "epoch": 1.496031746031746, + "grad_norm": 1.5612815618515015, + "learning_rate": 0.0010026455026455026, + "loss": 0.4791, + "step": 3770 + }, + { + "epoch": 1.5, + "grad_norm": 1.206734538078308, + "learning_rate": 0.001, + "loss": 0.6305, + "step": 3780 + }, + { + "epoch": 1.503968253968254, + "grad_norm": 1.069503664970398, + "learning_rate": 0.0009973544973544974, + "loss": 0.5058, + "step": 3790 + }, + { + "epoch": 1.507936507936508, + "grad_norm": 0.3658556044101715, + "learning_rate": 0.0009947089947089946, + "loss": 0.3957, + "step": 3800 + }, + { + "epoch": 1.5119047619047619, + "grad_norm": 1.0885382890701294, + "learning_rate": 0.000992063492063492, + "loss": 0.4997, + "step": 3810 + }, + { + "epoch": 1.5158730158730158, + "grad_norm": 0.7469413876533508, + "learning_rate": 0.0009894179894179894, + "loss": 0.5236, + "step": 3820 + }, + { + "epoch": 1.5198412698412698, + "grad_norm": 0.7196665406227112, + "learning_rate": 0.0009867724867724868, + "loss": 0.5083, + "step": 3830 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 0.6840754151344299, + "learning_rate": 0.000984126984126984, + "loss": 0.3656, + "step": 3840 + }, + { + "epoch": 1.5277777777777777, + "grad_norm": 1.3978683948516846, + "learning_rate": 0.0009814814814814816, + "loss": 0.4371, + "step": 3850 + }, + { + "epoch": 1.5317460317460316, + "grad_norm": 0.5583405494689941, + "learning_rate": 0.0009788359788359788, + "loss": 0.5274, + "step": 3860 + }, + { + "epoch": 1.5357142857142856, + "grad_norm": 1.8063452243804932, + "learning_rate": 0.0009761904761904762, + "loss": 0.6141, + "step": 3870 + }, + { + "epoch": 1.5396825396825395, + "grad_norm": 0.9723803400993347, + "learning_rate": 0.0009735449735449735, + "loss": 0.5883, + "step": 3880 + }, + { + "epoch": 1.5436507936507935, + "grad_norm": 0.30504110455513, + "learning_rate": 0.0009708994708994709, + "loss": 0.5268, + "step": 3890 + }, + { + "epoch": 1.5476190476190477, + "grad_norm": 0.6150854229927063, + "learning_rate": 0.0009682539682539683, + "loss": 0.5819, + "step": 3900 + }, + { + "epoch": 1.5515873015873016, + "grad_norm": 1.4445383548736572, + "learning_rate": 0.0009656084656084656, + "loss": 0.4811, + "step": 3910 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.662739634513855, + "learning_rate": 0.0009629629629629629, + "loss": 0.3249, + "step": 3920 + }, + { + "epoch": 1.5595238095238095, + "grad_norm": 0.6104711294174194, + "learning_rate": 0.0009603174603174604, + "loss": 0.3257, + "step": 3930 + }, + { + "epoch": 1.5634920634920635, + "grad_norm": 0.6666992902755737, + "learning_rate": 0.0009576719576719577, + "loss": 0.4348, + "step": 3940 + }, + { + "epoch": 1.5674603174603174, + "grad_norm": 1.3601847887039185, + "learning_rate": 0.000955026455026455, + "loss": 0.6486, + "step": 3950 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 1.4528306722640991, + "learning_rate": 0.0009523809523809524, + "loss": 0.5363, + "step": 3960 + }, + { + "epoch": 1.5753968253968254, + "grad_norm": 0.8328957557678223, + "learning_rate": 0.0009497354497354498, + "loss": 0.4086, + "step": 3970 + }, + { + "epoch": 1.5793650793650795, + "grad_norm": 0.6136783361434937, + "learning_rate": 0.0009470899470899471, + "loss": 0.5066, + "step": 3980 + }, + { + "epoch": 1.5833333333333335, + "grad_norm": 1.767379641532898, + "learning_rate": 0.0009444444444444445, + "loss": 0.4943, + "step": 3990 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 0.50275719165802, + "learning_rate": 0.0009417989417989418, + "loss": 0.5513, + "step": 4000 + }, + { + "epoch": 1.5912698412698414, + "grad_norm": 0.9671531915664673, + "learning_rate": 0.0009391534391534392, + "loss": 0.4399, + "step": 4010 + }, + { + "epoch": 1.5952380952380953, + "grad_norm": 1.084027647972107, + "learning_rate": 0.0009365079365079366, + "loss": 0.4851, + "step": 4020 + }, + { + "epoch": 1.5992063492063493, + "grad_norm": 0.944523274898529, + "learning_rate": 0.0009338624338624339, + "loss": 0.4931, + "step": 4030 + }, + { + "epoch": 1.6031746031746033, + "grad_norm": 0.7656432390213013, + "learning_rate": 0.0009312169312169312, + "loss": 0.4205, + "step": 4040 + }, + { + "epoch": 1.6071428571428572, + "grad_norm": 1.1295371055603027, + "learning_rate": 0.0009285714285714287, + "loss": 0.5147, + "step": 4050 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 1.0330742597579956, + "learning_rate": 0.000925925925925926, + "loss": 0.533, + "step": 4060 + }, + { + "epoch": 1.6150793650793651, + "grad_norm": 0.4578189253807068, + "learning_rate": 0.0009232804232804233, + "loss": 0.5074, + "step": 4070 + }, + { + "epoch": 1.619047619047619, + "grad_norm": 0.9493432641029358, + "learning_rate": 0.0009206349206349207, + "loss": 0.4964, + "step": 4080 + }, + { + "epoch": 1.623015873015873, + "grad_norm": 1.229602336883545, + "learning_rate": 0.0009179894179894181, + "loss": 0.4398, + "step": 4090 + }, + { + "epoch": 1.626984126984127, + "grad_norm": 1.182271957397461, + "learning_rate": 0.0009153439153439154, + "loss": 0.6409, + "step": 4100 + }, + { + "epoch": 1.630952380952381, + "grad_norm": 1.6596875190734863, + "learning_rate": 0.0009126984126984126, + "loss": 0.3213, + "step": 4110 + }, + { + "epoch": 1.6349206349206349, + "grad_norm": 0.9412317276000977, + "learning_rate": 0.00091005291005291, + "loss": 0.3736, + "step": 4120 + }, + { + "epoch": 1.6388888888888888, + "grad_norm": 1.4627306461334229, + "learning_rate": 0.0009074074074074074, + "loss": 0.5455, + "step": 4130 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.4782025218009949, + "learning_rate": 0.0009047619047619047, + "loss": 0.4291, + "step": 4140 + }, + { + "epoch": 1.6468253968253967, + "grad_norm": 0.907647430896759, + "learning_rate": 0.0009021164021164021, + "loss": 0.425, + "step": 4150 + }, + { + "epoch": 1.6507936507936507, + "grad_norm": 0.8232408761978149, + "learning_rate": 0.0008994708994708994, + "loss": 0.5669, + "step": 4160 + }, + { + "epoch": 1.6547619047619047, + "grad_norm": 0.5824115872383118, + "learning_rate": 0.0008968253968253968, + "loss": 0.4001, + "step": 4170 + }, + { + "epoch": 1.6587301587301586, + "grad_norm": 0.7836323976516724, + "learning_rate": 0.0008941798941798942, + "loss": 0.5163, + "step": 4180 + }, + { + "epoch": 1.6626984126984126, + "grad_norm": 0.9716808795928955, + "learning_rate": 0.0008915343915343915, + "loss": 0.4334, + "step": 4190 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.46734583377838135, + "learning_rate": 0.0008888888888888888, + "loss": 0.5188, + "step": 4200 + }, + { + "epoch": 1.6706349206349205, + "grad_norm": 1.1709452867507935, + "learning_rate": 0.0008862433862433863, + "loss": 0.4686, + "step": 4210 + }, + { + "epoch": 1.6746031746031746, + "grad_norm": 0.9173301458358765, + "learning_rate": 0.0008835978835978836, + "loss": 0.5697, + "step": 4220 + }, + { + "epoch": 1.6785714285714286, + "grad_norm": 1.190338134765625, + "learning_rate": 0.0008809523809523809, + "loss": 0.4324, + "step": 4230 + }, + { + "epoch": 1.6825396825396826, + "grad_norm": 1.278975248336792, + "learning_rate": 0.0008783068783068783, + "loss": 0.4726, + "step": 4240 + }, + { + "epoch": 1.6865079365079365, + "grad_norm": 0.8761826157569885, + "learning_rate": 0.0008756613756613757, + "loss": 0.5543, + "step": 4250 + }, + { + "epoch": 1.6904761904761905, + "grad_norm": 0.8508326411247253, + "learning_rate": 0.000873015873015873, + "loss": 0.4135, + "step": 4260 + }, + { + "epoch": 1.6944444444444444, + "grad_norm": 0.8877843618392944, + "learning_rate": 0.0008703703703703704, + "loss": 0.3276, + "step": 4270 + }, + { + "epoch": 1.6984126984126984, + "grad_norm": 2.439880609512329, + "learning_rate": 0.0008677248677248677, + "loss": 0.3676, + "step": 4280 + }, + { + "epoch": 1.7023809523809523, + "grad_norm": 1.454038143157959, + "learning_rate": 0.0008650793650793651, + "loss": 0.6022, + "step": 4290 + }, + { + "epoch": 1.7063492063492065, + "grad_norm": 0.6033250093460083, + "learning_rate": 0.0008624338624338625, + "loss": 0.4068, + "step": 4300 + }, + { + "epoch": 1.7103174603174605, + "grad_norm": 0.7904770374298096, + "learning_rate": 0.0008597883597883598, + "loss": 0.4654, + "step": 4310 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.0783374309539795, + "learning_rate": 0.0008571428571428571, + "loss": 0.326, + "step": 4320 + }, + { + "epoch": 1.7182539682539684, + "grad_norm": 0.923893928527832, + "learning_rate": 0.0008544973544973545, + "loss": 0.4963, + "step": 4330 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.36724144220352173, + "learning_rate": 0.0008518518518518519, + "loss": 0.3676, + "step": 4340 + }, + { + "epoch": 1.7261904761904763, + "grad_norm": 1.1232455968856812, + "learning_rate": 0.0008492063492063492, + "loss": 0.4462, + "step": 4350 + }, + { + "epoch": 1.7301587301587302, + "grad_norm": 1.3588309288024902, + "learning_rate": 0.0008465608465608465, + "loss": 0.4954, + "step": 4360 + }, + { + "epoch": 1.7341269841269842, + "grad_norm": 1.016571283340454, + "learning_rate": 0.000843915343915344, + "loss": 0.5502, + "step": 4370 + }, + { + "epoch": 1.7380952380952381, + "grad_norm": 0.9546862244606018, + "learning_rate": 0.0008412698412698413, + "loss": 0.4983, + "step": 4380 + }, + { + "epoch": 1.742063492063492, + "grad_norm": 1.2846907377243042, + "learning_rate": 0.0008386243386243386, + "loss": 0.6217, + "step": 4390 + }, + { + "epoch": 1.746031746031746, + "grad_norm": 1.4156478643417358, + "learning_rate": 0.000835978835978836, + "loss": 0.5286, + "step": 4400 + }, + { + "epoch": 1.75, + "grad_norm": 0.692659318447113, + "learning_rate": 0.0008333333333333334, + "loss": 0.4571, + "step": 4410 + }, + { + "epoch": 1.753968253968254, + "grad_norm": 0.5487807989120483, + "learning_rate": 0.0008306878306878307, + "loss": 0.4216, + "step": 4420 + }, + { + "epoch": 1.757936507936508, + "grad_norm": 0.4136459529399872, + "learning_rate": 0.0008280423280423281, + "loss": 0.3706, + "step": 4430 + }, + { + "epoch": 1.7619047619047619, + "grad_norm": 2.730607748031616, + "learning_rate": 0.0008253968253968254, + "loss": 0.4341, + "step": 4440 + }, + { + "epoch": 1.7658730158730158, + "grad_norm": 1.0752816200256348, + "learning_rate": 0.0008227513227513228, + "loss": 0.6466, + "step": 4450 + }, + { + "epoch": 1.7698412698412698, + "grad_norm": 0.9848162531852722, + "learning_rate": 0.0008201058201058202, + "loss": 0.5904, + "step": 4460 + }, + { + "epoch": 1.7738095238095237, + "grad_norm": 1.4132823944091797, + "learning_rate": 0.0008174603174603175, + "loss": 0.4528, + "step": 4470 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.8410534858703613, + "learning_rate": 0.0008148148148148148, + "loss": 0.431, + "step": 4480 + }, + { + "epoch": 1.7817460317460316, + "grad_norm": 0.7188355922698975, + "learning_rate": 0.0008121693121693123, + "loss": 0.5955, + "step": 4490 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.8639283776283264, + "learning_rate": 0.0008095238095238096, + "loss": 0.4491, + "step": 4500 + }, + { + "epoch": 1.7896825396825395, + "grad_norm": 1.0643069744110107, + "learning_rate": 0.0008068783068783069, + "loss": 0.5299, + "step": 4510 + }, + { + "epoch": 1.7936507936507935, + "grad_norm": 1.1698801517486572, + "learning_rate": 0.0008042328042328042, + "loss": 0.5063, + "step": 4520 + }, + { + "epoch": 1.7976190476190477, + "grad_norm": 1.222699522972107, + "learning_rate": 0.0008015873015873017, + "loss": 0.4743, + "step": 4530 + }, + { + "epoch": 1.8015873015873016, + "grad_norm": 0.8404491543769836, + "learning_rate": 0.000798941798941799, + "loss": 0.4811, + "step": 4540 + }, + { + "epoch": 1.8055555555555556, + "grad_norm": 0.7801256775856018, + "learning_rate": 0.0007962962962962962, + "loss": 0.5615, + "step": 4550 + }, + { + "epoch": 1.8095238095238095, + "grad_norm": 0.735230565071106, + "learning_rate": 0.0007936507936507937, + "loss": 0.4375, + "step": 4560 + }, + { + "epoch": 1.8134920634920635, + "grad_norm": 0.8510635495185852, + "learning_rate": 0.000791005291005291, + "loss": 0.4999, + "step": 4570 + }, + { + "epoch": 1.8174603174603174, + "grad_norm": 1.2653560638427734, + "learning_rate": 0.0007883597883597883, + "loss": 0.4563, + "step": 4580 + }, + { + "epoch": 1.8214285714285714, + "grad_norm": 0.475337952375412, + "learning_rate": 0.0007857142857142857, + "loss": 0.3737, + "step": 4590 + }, + { + "epoch": 1.8253968253968254, + "grad_norm": 0.6187211871147156, + "learning_rate": 0.000783068783068783, + "loss": 0.5287, + "step": 4600 + }, + { + "epoch": 1.8293650793650795, + "grad_norm": 1.4211279153823853, + "learning_rate": 0.0007804232804232804, + "loss": 0.522, + "step": 4610 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 1.4588719606399536, + "learning_rate": 0.0007777777777777778, + "loss": 0.6195, + "step": 4620 + }, + { + "epoch": 1.8373015873015874, + "grad_norm": 0.5156915783882141, + "learning_rate": 0.0007751322751322751, + "loss": 0.5105, + "step": 4630 + }, + { + "epoch": 1.8412698412698414, + "grad_norm": 0.9501180648803711, + "learning_rate": 0.0007724867724867724, + "loss": 0.4081, + "step": 4640 + }, + { + "epoch": 1.8452380952380953, + "grad_norm": 0.45203983783721924, + "learning_rate": 0.0007698412698412699, + "loss": 0.4493, + "step": 4650 + }, + { + "epoch": 1.8492063492063493, + "grad_norm": 0.4670614004135132, + "learning_rate": 0.0007671957671957672, + "loss": 0.3351, + "step": 4660 + }, + { + "epoch": 1.8531746031746033, + "grad_norm": 0.9876275062561035, + "learning_rate": 0.0007645502645502645, + "loss": 0.5741, + "step": 4670 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.8845266103744507, + "learning_rate": 0.0007619047619047619, + "loss": 0.4142, + "step": 4680 + }, + { + "epoch": 1.8611111111111112, + "grad_norm": 0.7441647052764893, + "learning_rate": 0.0007592592592592593, + "loss": 0.4072, + "step": 4690 + }, + { + "epoch": 1.8650793650793651, + "grad_norm": 0.9643361568450928, + "learning_rate": 0.0007566137566137566, + "loss": 0.5352, + "step": 4700 + }, + { + "epoch": 1.869047619047619, + "grad_norm": 0.8456591367721558, + "learning_rate": 0.000753968253968254, + "loss": 0.5337, + "step": 4710 + }, + { + "epoch": 1.873015873015873, + "grad_norm": 1.6536881923675537, + "learning_rate": 0.0007513227513227513, + "loss": 0.4462, + "step": 4720 + }, + { + "epoch": 1.876984126984127, + "grad_norm": 0.6966465711593628, + "learning_rate": 0.0007486772486772487, + "loss": 0.7229, + "step": 4730 + }, + { + "epoch": 1.880952380952381, + "grad_norm": 0.9560131430625916, + "learning_rate": 0.000746031746031746, + "loss": 0.4636, + "step": 4740 + }, + { + "epoch": 1.8849206349206349, + "grad_norm": 0.6783252358436584, + "learning_rate": 0.0007433862433862434, + "loss": 0.5508, + "step": 4750 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5133827328681946, + "learning_rate": 0.0007407407407407407, + "loss": 0.3607, + "step": 4760 + }, + { + "epoch": 1.8928571428571428, + "grad_norm": 0.6404028534889221, + "learning_rate": 0.0007380952380952381, + "loss": 0.5246, + "step": 4770 + }, + { + "epoch": 1.8968253968253967, + "grad_norm": 0.7048952579498291, + "learning_rate": 0.0007354497354497355, + "loss": 0.4684, + "step": 4780 + }, + { + "epoch": 1.9007936507936507, + "grad_norm": 0.5569032430648804, + "learning_rate": 0.0007328042328042328, + "loss": 0.3872, + "step": 4790 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.5231502652168274, + "learning_rate": 0.0007301587301587301, + "loss": 0.3945, + "step": 4800 + }, + { + "epoch": 1.9087301587301586, + "grad_norm": 1.2239683866500854, + "learning_rate": 0.0007275132275132276, + "loss": 0.5389, + "step": 4810 + }, + { + "epoch": 1.9126984126984126, + "grad_norm": 1.5195467472076416, + "learning_rate": 0.0007248677248677249, + "loss": 0.5958, + "step": 4820 + }, + { + "epoch": 1.9166666666666665, + "grad_norm": 0.8116055727005005, + "learning_rate": 0.0007222222222222222, + "loss": 0.3983, + "step": 4830 + }, + { + "epoch": 1.9206349206349205, + "grad_norm": 1.067101240158081, + "learning_rate": 0.0007195767195767196, + "loss": 0.4035, + "step": 4840 + }, + { + "epoch": 1.9246031746031746, + "grad_norm": 0.9318575263023376, + "learning_rate": 0.000716931216931217, + "loss": 0.5596, + "step": 4850 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 1.1583409309387207, + "learning_rate": 0.0007142857142857143, + "loss": 0.3916, + "step": 4860 + }, + { + "epoch": 1.9325396825396826, + "grad_norm": 0.813510000705719, + "learning_rate": 0.0007116402116402117, + "loss": 0.5113, + "step": 4870 + }, + { + "epoch": 1.9365079365079365, + "grad_norm": 1.5386079549789429, + "learning_rate": 0.000708994708994709, + "loss": 0.5188, + "step": 4880 + }, + { + "epoch": 1.9404761904761905, + "grad_norm": 1.1007848978042603, + "learning_rate": 0.0007063492063492064, + "loss": 0.5993, + "step": 4890 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 2.2866406440734863, + "learning_rate": 0.0007037037037037038, + "loss": 0.6263, + "step": 4900 + }, + { + "epoch": 1.9484126984126984, + "grad_norm": 1.5539257526397705, + "learning_rate": 0.0007010582010582011, + "loss": 0.4029, + "step": 4910 + }, + { + "epoch": 1.9523809523809523, + "grad_norm": 0.9776302576065063, + "learning_rate": 0.0006984126984126984, + "loss": 0.45, + "step": 4920 + }, + { + "epoch": 1.9563492063492065, + "grad_norm": 0.7598035335540771, + "learning_rate": 0.0006957671957671959, + "loss": 0.4909, + "step": 4930 + }, + { + "epoch": 1.9603174603174605, + "grad_norm": 1.1056677103042603, + "learning_rate": 0.0006931216931216932, + "loss": 0.4979, + "step": 4940 + }, + { + "epoch": 1.9642857142857144, + "grad_norm": 0.796816349029541, + "learning_rate": 0.0006904761904761905, + "loss": 0.4591, + "step": 4950 + }, + { + "epoch": 1.9682539682539684, + "grad_norm": 0.5265449285507202, + "learning_rate": 0.0006878306878306878, + "loss": 0.3988, + "step": 4960 + }, + { + "epoch": 1.9722222222222223, + "grad_norm": 0.3230462074279785, + "learning_rate": 0.0006851851851851853, + "loss": 0.4432, + "step": 4970 + }, + { + "epoch": 1.9761904761904763, + "grad_norm": 1.2444729804992676, + "learning_rate": 0.0006825396825396826, + "loss": 0.3928, + "step": 4980 + }, + { + "epoch": 1.9801587301587302, + "grad_norm": 0.7676456570625305, + "learning_rate": 0.0006798941798941799, + "loss": 0.4148, + "step": 4990 + }, + { + "epoch": 1.9841269841269842, + "grad_norm": 1.05657160282135, + "learning_rate": 0.0006772486772486773, + "loss": 0.5529, + "step": 5000 + }, + { + "epoch": 1.9880952380952381, + "grad_norm": 0.4994324743747711, + "learning_rate": 0.0006746031746031747, + "loss": 0.3886, + "step": 5010 + }, + { + "epoch": 1.992063492063492, + "grad_norm": 1.1352735757827759, + "learning_rate": 0.000671957671957672, + "loss": 0.5021, + "step": 5020 + }, + { + "epoch": 1.996031746031746, + "grad_norm": 1.0702826976776123, + "learning_rate": 0.0006693121693121694, + "loss": 0.7276, + "step": 5030 + }, + { + "epoch": 2.0, + "grad_norm": 0.9455626010894775, + "learning_rate": 0.0006666666666666666, + "loss": 0.5663, + "step": 5040 + }, + { + "epoch": 2.003968253968254, + "grad_norm": 0.6044638156890869, + "learning_rate": 0.000664021164021164, + "loss": 0.3748, + "step": 5050 + }, + { + "epoch": 2.007936507936508, + "grad_norm": 0.8124226927757263, + "learning_rate": 0.0006613756613756614, + "loss": 0.3476, + "step": 5060 + }, + { + "epoch": 2.011904761904762, + "grad_norm": 0.5022208094596863, + "learning_rate": 0.0006587301587301587, + "loss": 0.313, + "step": 5070 + }, + { + "epoch": 2.015873015873016, + "grad_norm": 0.5413989424705505, + "learning_rate": 0.000656084656084656, + "loss": 0.3359, + "step": 5080 + }, + { + "epoch": 2.0198412698412698, + "grad_norm": 0.8055435419082642, + "learning_rate": 0.0006534391534391535, + "loss": 0.3807, + "step": 5090 + }, + { + "epoch": 2.0238095238095237, + "grad_norm": 1.5344974994659424, + "learning_rate": 0.0006507936507936508, + "loss": 0.3913, + "step": 5100 + }, + { + "epoch": 2.0277777777777777, + "grad_norm": 0.6911923289299011, + "learning_rate": 0.0006481481481481481, + "loss": 0.419, + "step": 5110 + }, + { + "epoch": 2.0317460317460316, + "grad_norm": 0.9209279417991638, + "learning_rate": 0.0006455026455026455, + "loss": 0.4246, + "step": 5120 + }, + { + "epoch": 2.0357142857142856, + "grad_norm": 0.7789056897163391, + "learning_rate": 0.0006428571428571429, + "loss": 0.3487, + "step": 5130 + }, + { + "epoch": 2.0396825396825395, + "grad_norm": 1.2143142223358154, + "learning_rate": 0.0006402116402116402, + "loss": 0.3457, + "step": 5140 + }, + { + "epoch": 2.0436507936507935, + "grad_norm": 1.2130590677261353, + "learning_rate": 0.0006375661375661375, + "loss": 0.4553, + "step": 5150 + }, + { + "epoch": 2.0476190476190474, + "grad_norm": 1.1139146089553833, + "learning_rate": 0.0006349206349206349, + "loss": 0.3192, + "step": 5160 + }, + { + "epoch": 2.0515873015873014, + "grad_norm": 0.9016938805580139, + "learning_rate": 0.0006322751322751323, + "loss": 0.284, + "step": 5170 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 1.2442255020141602, + "learning_rate": 0.0006296296296296296, + "loss": 0.3583, + "step": 5180 + }, + { + "epoch": 2.0595238095238093, + "grad_norm": 1.756134271621704, + "learning_rate": 0.000626984126984127, + "loss": 0.4804, + "step": 5190 + }, + { + "epoch": 2.0634920634920633, + "grad_norm": 0.9567892551422119, + "learning_rate": 0.0006243386243386243, + "loss": 0.3952, + "step": 5200 + }, + { + "epoch": 2.0674603174603177, + "grad_norm": 0.391501784324646, + "learning_rate": 0.0006216931216931217, + "loss": 0.3147, + "step": 5210 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.6419145464897156, + "learning_rate": 0.0006190476190476191, + "loss": 0.2739, + "step": 5220 + }, + { + "epoch": 2.0753968253968256, + "grad_norm": 0.8622870445251465, + "learning_rate": 0.0006164021164021164, + "loss": 0.3093, + "step": 5230 + }, + { + "epoch": 2.0793650793650795, + "grad_norm": 0.5181304812431335, + "learning_rate": 0.0006137566137566137, + "loss": 0.3315, + "step": 5240 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.9292448163032532, + "learning_rate": 0.0006111111111111112, + "loss": 0.3058, + "step": 5250 + }, + { + "epoch": 2.0873015873015874, + "grad_norm": 0.8386250734329224, + "learning_rate": 0.0006084656084656085, + "loss": 0.3641, + "step": 5260 + }, + { + "epoch": 2.0912698412698414, + "grad_norm": 0.7679039239883423, + "learning_rate": 0.0006058201058201058, + "loss": 0.4084, + "step": 5270 + }, + { + "epoch": 2.0952380952380953, + "grad_norm": 0.8268955945968628, + "learning_rate": 0.0006031746031746032, + "loss": 0.2579, + "step": 5280 + }, + { + "epoch": 2.0992063492063493, + "grad_norm": 0.9601532220840454, + "learning_rate": 0.0006005291005291006, + "loss": 0.3987, + "step": 5290 + }, + { + "epoch": 2.1031746031746033, + "grad_norm": 0.5090093612670898, + "learning_rate": 0.0005978835978835979, + "loss": 0.2707, + "step": 5300 + }, + { + "epoch": 2.107142857142857, + "grad_norm": 1.2176988124847412, + "learning_rate": 0.0005952380952380953, + "loss": 0.3985, + "step": 5310 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.5723254680633545, + "learning_rate": 0.0005925925925925926, + "loss": 0.2727, + "step": 5320 + }, + { + "epoch": 2.115079365079365, + "grad_norm": 0.6939801573753357, + "learning_rate": 0.00058994708994709, + "loss": 0.3126, + "step": 5330 + }, + { + "epoch": 2.119047619047619, + "grad_norm": 0.9729529619216919, + "learning_rate": 0.0005873015873015874, + "loss": 0.3971, + "step": 5340 + }, + { + "epoch": 2.123015873015873, + "grad_norm": 0.7717307806015015, + "learning_rate": 0.0005846560846560847, + "loss": 0.3643, + "step": 5350 + }, + { + "epoch": 2.126984126984127, + "grad_norm": 0.8346803784370422, + "learning_rate": 0.000582010582010582, + "loss": 0.4888, + "step": 5360 + }, + { + "epoch": 2.130952380952381, + "grad_norm": 0.6180922389030457, + "learning_rate": 0.0005793650793650794, + "loss": 0.36, + "step": 5370 + }, + { + "epoch": 2.134920634920635, + "grad_norm": 0.6804755926132202, + "learning_rate": 0.0005767195767195768, + "loss": 0.3333, + "step": 5380 + }, + { + "epoch": 2.138888888888889, + "grad_norm": 1.1156859397888184, + "learning_rate": 0.0005740740740740741, + "loss": 0.3147, + "step": 5390 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 1.1516271829605103, + "learning_rate": 0.0005714285714285714, + "loss": 0.3723, + "step": 5400 + }, + { + "epoch": 2.1468253968253967, + "grad_norm": 0.7301619648933411, + "learning_rate": 0.0005687830687830689, + "loss": 0.2972, + "step": 5410 + }, + { + "epoch": 2.1507936507936507, + "grad_norm": 0.46621087193489075, + "learning_rate": 0.0005661375661375662, + "loss": 0.4814, + "step": 5420 + }, + { + "epoch": 2.1547619047619047, + "grad_norm": 1.4534790515899658, + "learning_rate": 0.0005634920634920635, + "loss": 0.3715, + "step": 5430 + }, + { + "epoch": 2.1587301587301586, + "grad_norm": 1.0283761024475098, + "learning_rate": 0.0005608465608465609, + "loss": 0.3096, + "step": 5440 + }, + { + "epoch": 2.1626984126984126, + "grad_norm": 1.517444372177124, + "learning_rate": 0.0005582010582010583, + "loss": 0.4282, + "step": 5450 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 1.1968739032745361, + "learning_rate": 0.0005555555555555556, + "loss": 0.5653, + "step": 5460 + }, + { + "epoch": 2.1706349206349205, + "grad_norm": 0.8281181454658508, + "learning_rate": 0.000552910052910053, + "loss": 0.3734, + "step": 5470 + }, + { + "epoch": 2.1746031746031744, + "grad_norm": 0.825985312461853, + "learning_rate": 0.0005502645502645502, + "loss": 0.2995, + "step": 5480 + }, + { + "epoch": 2.1785714285714284, + "grad_norm": 1.011702060699463, + "learning_rate": 0.0005476190476190477, + "loss": 0.3567, + "step": 5490 + }, + { + "epoch": 2.1825396825396823, + "grad_norm": 1.0061122179031372, + "learning_rate": 0.000544973544973545, + "loss": 0.3029, + "step": 5500 + }, + { + "epoch": 2.1865079365079367, + "grad_norm": 0.7219818234443665, + "learning_rate": 0.0005423280423280423, + "loss": 0.2957, + "step": 5510 + }, + { + "epoch": 2.1904761904761907, + "grad_norm": 0.8629070520401001, + "learning_rate": 0.0005396825396825396, + "loss": 0.4025, + "step": 5520 + }, + { + "epoch": 2.1944444444444446, + "grad_norm": 0.846489429473877, + "learning_rate": 0.0005370370370370371, + "loss": 0.2563, + "step": 5530 + }, + { + "epoch": 2.1984126984126986, + "grad_norm": 1.013261079788208, + "learning_rate": 0.0005343915343915344, + "loss": 0.3241, + "step": 5540 + }, + { + "epoch": 2.2023809523809526, + "grad_norm": 1.1884324550628662, + "learning_rate": 0.0005317460317460317, + "loss": 0.3218, + "step": 5550 + }, + { + "epoch": 2.2063492063492065, + "grad_norm": 0.6852754950523376, + "learning_rate": 0.0005291005291005291, + "loss": 0.307, + "step": 5560 + }, + { + "epoch": 2.2103174603174605, + "grad_norm": 0.8409839272499084, + "learning_rate": 0.0005264550264550265, + "loss": 0.2683, + "step": 5570 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.6928064823150635, + "learning_rate": 0.0005238095238095238, + "loss": 0.3296, + "step": 5580 + }, + { + "epoch": 2.2182539682539684, + "grad_norm": 0.48399004340171814, + "learning_rate": 0.0005211640211640211, + "loss": 0.2453, + "step": 5590 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.6243159174919128, + "learning_rate": 0.0005185185185185185, + "loss": 0.4726, + "step": 5600 + }, + { + "epoch": 2.2261904761904763, + "grad_norm": 0.8214319944381714, + "learning_rate": 0.0005158730158730159, + "loss": 0.3647, + "step": 5610 + }, + { + "epoch": 2.2301587301587302, + "grad_norm": 1.350664496421814, + "learning_rate": 0.0005132275132275132, + "loss": 0.3515, + "step": 5620 + }, + { + "epoch": 2.234126984126984, + "grad_norm": 0.598360002040863, + "learning_rate": 0.0005105820105820106, + "loss": 0.4379, + "step": 5630 + }, + { + "epoch": 2.238095238095238, + "grad_norm": 0.744739830493927, + "learning_rate": 0.0005079365079365079, + "loss": 0.3241, + "step": 5640 + }, + { + "epoch": 2.242063492063492, + "grad_norm": 9.888148307800293, + "learning_rate": 0.0005052910052910053, + "loss": 0.2648, + "step": 5650 + }, + { + "epoch": 2.246031746031746, + "grad_norm": 0.5198895931243896, + "learning_rate": 0.0005026455026455027, + "loss": 0.3836, + "step": 5660 + }, + { + "epoch": 2.25, + "grad_norm": 0.9944855570793152, + "learning_rate": 0.0005, + "loss": 0.3527, + "step": 5670 + }, + { + "epoch": 2.253968253968254, + "grad_norm": 0.8176829218864441, + "learning_rate": 0.0004973544973544973, + "loss": 0.3, + "step": 5680 + }, + { + "epoch": 2.257936507936508, + "grad_norm": 0.37834370136260986, + "learning_rate": 0.0004947089947089947, + "loss": 0.2629, + "step": 5690 + }, + { + "epoch": 2.261904761904762, + "grad_norm": 1.0115917921066284, + "learning_rate": 0.000492063492063492, + "loss": 0.3813, + "step": 5700 + }, + { + "epoch": 2.265873015873016, + "grad_norm": 1.2166203260421753, + "learning_rate": 0.0004894179894179894, + "loss": 0.2698, + "step": 5710 + }, + { + "epoch": 2.2698412698412698, + "grad_norm": 0.4840317964553833, + "learning_rate": 0.00048677248677248675, + "loss": 0.2739, + "step": 5720 + }, + { + "epoch": 2.2738095238095237, + "grad_norm": 0.528724193572998, + "learning_rate": 0.00048412698412698415, + "loss": 0.3434, + "step": 5730 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.6342616081237793, + "learning_rate": 0.00048148148148148144, + "loss": 0.2495, + "step": 5740 + }, + { + "epoch": 2.2817460317460316, + "grad_norm": 0.5333026647567749, + "learning_rate": 0.00047883597883597884, + "loss": 0.2839, + "step": 5750 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.7824140787124634, + "learning_rate": 0.0004761904761904762, + "loss": 0.3332, + "step": 5760 + }, + { + "epoch": 2.2896825396825395, + "grad_norm": 0.9291231632232666, + "learning_rate": 0.00047354497354497354, + "loss": 0.4443, + "step": 5770 + }, + { + "epoch": 2.2936507936507935, + "grad_norm": 0.8615803122520447, + "learning_rate": 0.0004708994708994709, + "loss": 0.338, + "step": 5780 + }, + { + "epoch": 2.2976190476190474, + "grad_norm": 0.5790500640869141, + "learning_rate": 0.0004682539682539683, + "loss": 0.3032, + "step": 5790 + }, + { + "epoch": 2.3015873015873014, + "grad_norm": 0.5711954832077026, + "learning_rate": 0.0004656084656084656, + "loss": 0.2674, + "step": 5800 + }, + { + "epoch": 2.3055555555555554, + "grad_norm": 0.6912782192230225, + "learning_rate": 0.000462962962962963, + "loss": 0.3159, + "step": 5810 + }, + { + "epoch": 2.3095238095238093, + "grad_norm": 1.0069470405578613, + "learning_rate": 0.00046031746031746033, + "loss": 0.2548, + "step": 5820 + }, + { + "epoch": 2.3134920634920633, + "grad_norm": 0.7111985087394714, + "learning_rate": 0.0004576719576719577, + "loss": 0.4336, + "step": 5830 + }, + { + "epoch": 2.317460317460317, + "grad_norm": 0.7876987457275391, + "learning_rate": 0.000455026455026455, + "loss": 0.3417, + "step": 5840 + }, + { + "epoch": 2.3214285714285716, + "grad_norm": 1.222811222076416, + "learning_rate": 0.00045238095238095237, + "loss": 0.2989, + "step": 5850 + }, + { + "epoch": 2.3253968253968256, + "grad_norm": 0.6214492321014404, + "learning_rate": 0.0004497354497354497, + "loss": 0.3119, + "step": 5860 + }, + { + "epoch": 2.3293650793650795, + "grad_norm": 1.190848708152771, + "learning_rate": 0.0004470899470899471, + "loss": 0.3549, + "step": 5870 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.6466199159622192, + "learning_rate": 0.0004444444444444444, + "loss": 0.3483, + "step": 5880 + }, + { + "epoch": 2.3373015873015874, + "grad_norm": 1.195802927017212, + "learning_rate": 0.0004417989417989418, + "loss": 0.2867, + "step": 5890 + }, + { + "epoch": 2.3412698412698414, + "grad_norm": 0.705406665802002, + "learning_rate": 0.00043915343915343916, + "loss": 0.3974, + "step": 5900 + }, + { + "epoch": 2.3452380952380953, + "grad_norm": 1.0674729347229004, + "learning_rate": 0.0004365079365079365, + "loss": 0.3524, + "step": 5910 + }, + { + "epoch": 2.3492063492063493, + "grad_norm": 1.3207943439483643, + "learning_rate": 0.00043386243386243385, + "loss": 0.309, + "step": 5920 + }, + { + "epoch": 2.3531746031746033, + "grad_norm": 0.6910490393638611, + "learning_rate": 0.00043121693121693126, + "loss": 0.2557, + "step": 5930 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.7160533666610718, + "learning_rate": 0.00042857142857142855, + "loss": 0.3447, + "step": 5940 + }, + { + "epoch": 2.361111111111111, + "grad_norm": 1.117875576019287, + "learning_rate": 0.00042592592592592595, + "loss": 0.3796, + "step": 5950 + }, + { + "epoch": 2.365079365079365, + "grad_norm": 0.7119603753089905, + "learning_rate": 0.00042328042328042324, + "loss": 0.3458, + "step": 5960 + }, + { + "epoch": 2.369047619047619, + "grad_norm": 0.6892464756965637, + "learning_rate": 0.00042063492063492065, + "loss": 0.3888, + "step": 5970 + }, + { + "epoch": 2.373015873015873, + "grad_norm": 0.8669295310974121, + "learning_rate": 0.000417989417989418, + "loss": 0.4486, + "step": 5980 + }, + { + "epoch": 2.376984126984127, + "grad_norm": 0.3692854642868042, + "learning_rate": 0.00041534391534391534, + "loss": 0.2848, + "step": 5990 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.8515878915786743, + "learning_rate": 0.0004126984126984127, + "loss": 0.3851, + "step": 6000 + }, + { + "epoch": 2.384920634920635, + "grad_norm": 0.8710914850234985, + "learning_rate": 0.0004100529100529101, + "loss": 0.3629, + "step": 6010 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 1.1649229526519775, + "learning_rate": 0.0004074074074074074, + "loss": 0.3405, + "step": 6020 + }, + { + "epoch": 2.392857142857143, + "grad_norm": 0.536342442035675, + "learning_rate": 0.0004047619047619048, + "loss": 0.3541, + "step": 6030 + }, + { + "epoch": 2.3968253968253967, + "grad_norm": 0.6506990790367126, + "learning_rate": 0.0004021164021164021, + "loss": 0.3217, + "step": 6040 + }, + { + "epoch": 2.4007936507936507, + "grad_norm": 0.39036527276039124, + "learning_rate": 0.0003994708994708995, + "loss": 0.3307, + "step": 6050 + }, + { + "epoch": 2.4047619047619047, + "grad_norm": 0.5971412658691406, + "learning_rate": 0.0003968253968253968, + "loss": 0.3523, + "step": 6060 + }, + { + "epoch": 2.4087301587301586, + "grad_norm": 1.4851547479629517, + "learning_rate": 0.00039417989417989417, + "loss": 0.315, + "step": 6070 + }, + { + "epoch": 2.4126984126984126, + "grad_norm": 0.7956401705741882, + "learning_rate": 0.0003915343915343915, + "loss": 0.3218, + "step": 6080 + }, + { + "epoch": 2.4166666666666665, + "grad_norm": 0.7292457818984985, + "learning_rate": 0.0003888888888888889, + "loss": 0.3398, + "step": 6090 + }, + { + "epoch": 2.4206349206349205, + "grad_norm": 1.0612292289733887, + "learning_rate": 0.0003862433862433862, + "loss": 0.3405, + "step": 6100 + }, + { + "epoch": 2.4246031746031744, + "grad_norm": 0.7647016644477844, + "learning_rate": 0.0003835978835978836, + "loss": 0.4365, + "step": 6110 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.6238649487495422, + "learning_rate": 0.00038095238095238096, + "loss": 0.3462, + "step": 6120 + }, + { + "epoch": 2.432539682539683, + "grad_norm": 0.7567634582519531, + "learning_rate": 0.0003783068783068783, + "loss": 0.2732, + "step": 6130 + }, + { + "epoch": 2.4365079365079367, + "grad_norm": 0.589939534664154, + "learning_rate": 0.00037566137566137566, + "loss": 0.4249, + "step": 6140 + }, + { + "epoch": 2.4404761904761907, + "grad_norm": 0.9400720596313477, + "learning_rate": 0.000373015873015873, + "loss": 0.4112, + "step": 6150 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.7339090704917908, + "learning_rate": 0.00037037037037037035, + "loss": 0.3596, + "step": 6160 + }, + { + "epoch": 2.4484126984126986, + "grad_norm": 1.508101463317871, + "learning_rate": 0.00036772486772486775, + "loss": 0.2354, + "step": 6170 + }, + { + "epoch": 2.4523809523809526, + "grad_norm": 1.042312741279602, + "learning_rate": 0.00036507936507936505, + "loss": 0.4263, + "step": 6180 + }, + { + "epoch": 2.4563492063492065, + "grad_norm": 1.1017494201660156, + "learning_rate": 0.00036243386243386245, + "loss": 0.4159, + "step": 6190 + }, + { + "epoch": 2.4603174603174605, + "grad_norm": 0.7952788472175598, + "learning_rate": 0.0003597883597883598, + "loss": 0.2949, + "step": 6200 + }, + { + "epoch": 2.4642857142857144, + "grad_norm": 0.652211606502533, + "learning_rate": 0.00035714285714285714, + "loss": 0.3523, + "step": 6210 + }, + { + "epoch": 2.4682539682539684, + "grad_norm": 1.0506590604782104, + "learning_rate": 0.0003544973544973545, + "loss": 0.4683, + "step": 6220 + }, + { + "epoch": 2.4722222222222223, + "grad_norm": 0.7924396991729736, + "learning_rate": 0.0003518518518518519, + "loss": 0.3162, + "step": 6230 + }, + { + "epoch": 2.4761904761904763, + "grad_norm": 0.5057342052459717, + "learning_rate": 0.0003492063492063492, + "loss": 0.2741, + "step": 6240 + }, + { + "epoch": 2.4801587301587302, + "grad_norm": 1.0041768550872803, + "learning_rate": 0.0003465608465608466, + "loss": 0.3528, + "step": 6250 + }, + { + "epoch": 2.484126984126984, + "grad_norm": 1.06671941280365, + "learning_rate": 0.0003439153439153439, + "loss": 0.3605, + "step": 6260 + }, + { + "epoch": 2.488095238095238, + "grad_norm": 0.41186466813087463, + "learning_rate": 0.0003412698412698413, + "loss": 0.2841, + "step": 6270 + }, + { + "epoch": 2.492063492063492, + "grad_norm": 0.3925606906414032, + "learning_rate": 0.00033862433862433863, + "loss": 0.3427, + "step": 6280 + }, + { + "epoch": 2.496031746031746, + "grad_norm": 1.4012260437011719, + "learning_rate": 0.000335978835978836, + "loss": 0.4803, + "step": 6290 + }, + { + "epoch": 2.5, + "grad_norm": 0.5710623264312744, + "learning_rate": 0.0003333333333333333, + "loss": 0.3037, + "step": 6300 + }, + { + "epoch": 2.503968253968254, + "grad_norm": 0.9036715030670166, + "learning_rate": 0.0003306878306878307, + "loss": 0.3943, + "step": 6310 + }, + { + "epoch": 2.507936507936508, + "grad_norm": 0.6256608366966248, + "learning_rate": 0.000328042328042328, + "loss": 0.3014, + "step": 6320 + }, + { + "epoch": 2.511904761904762, + "grad_norm": 0.8218435645103455, + "learning_rate": 0.0003253968253968254, + "loss": 0.4707, + "step": 6330 + }, + { + "epoch": 2.515873015873016, + "grad_norm": 0.6735277771949768, + "learning_rate": 0.00032275132275132277, + "loss": 0.2808, + "step": 6340 + }, + { + "epoch": 2.5198412698412698, + "grad_norm": 0.6450037360191345, + "learning_rate": 0.0003201058201058201, + "loss": 0.3727, + "step": 6350 + }, + { + "epoch": 2.5238095238095237, + "grad_norm": 1.246138334274292, + "learning_rate": 0.00031746031746031746, + "loss": 0.3881, + "step": 6360 + }, + { + "epoch": 2.5277777777777777, + "grad_norm": 0.5396189093589783, + "learning_rate": 0.0003148148148148148, + "loss": 0.2661, + "step": 6370 + }, + { + "epoch": 2.5317460317460316, + "grad_norm": 1.2827895879745483, + "learning_rate": 0.00031216931216931215, + "loss": 0.2871, + "step": 6380 + }, + { + "epoch": 2.5357142857142856, + "grad_norm": 0.7319866418838501, + "learning_rate": 0.00030952380952380956, + "loss": 0.3458, + "step": 6390 + }, + { + "epoch": 2.5396825396825395, + "grad_norm": 0.5848907828330994, + "learning_rate": 0.00030687830687830685, + "loss": 0.3542, + "step": 6400 + }, + { + "epoch": 2.5436507936507935, + "grad_norm": 1.022750735282898, + "learning_rate": 0.00030423280423280425, + "loss": 0.3295, + "step": 6410 + }, + { + "epoch": 2.5476190476190474, + "grad_norm": 0.6221028566360474, + "learning_rate": 0.0003015873015873016, + "loss": 0.3135, + "step": 6420 + }, + { + "epoch": 2.5515873015873014, + "grad_norm": 0.7685695886611938, + "learning_rate": 0.00029894179894179895, + "loss": 0.3074, + "step": 6430 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 1.1064790487289429, + "learning_rate": 0.0002962962962962963, + "loss": 0.2694, + "step": 6440 + }, + { + "epoch": 2.5595238095238093, + "grad_norm": 1.2743747234344482, + "learning_rate": 0.0002936507936507937, + "loss": 0.3203, + "step": 6450 + }, + { + "epoch": 2.5634920634920633, + "grad_norm": 0.8724698424339294, + "learning_rate": 0.000291005291005291, + "loss": 0.3696, + "step": 6460 + }, + { + "epoch": 2.567460317460317, + "grad_norm": 0.5731073617935181, + "learning_rate": 0.0002883597883597884, + "loss": 0.4232, + "step": 6470 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 1.1916602849960327, + "learning_rate": 0.0002857142857142857, + "loss": 0.3249, + "step": 6480 + }, + { + "epoch": 2.575396825396825, + "grad_norm": 0.6559428572654724, + "learning_rate": 0.0002830687830687831, + "loss": 0.3469, + "step": 6490 + }, + { + "epoch": 2.5793650793650795, + "grad_norm": 0.7409236431121826, + "learning_rate": 0.00028042328042328043, + "loss": 0.2983, + "step": 6500 + }, + { + "epoch": 2.5833333333333335, + "grad_norm": 0.9593034982681274, + "learning_rate": 0.0002777777777777778, + "loss": 0.2303, + "step": 6510 + }, + { + "epoch": 2.5873015873015874, + "grad_norm": 1.2059444189071655, + "learning_rate": 0.0002751322751322751, + "loss": 0.2786, + "step": 6520 + }, + { + "epoch": 2.5912698412698414, + "grad_norm": 0.47993749380111694, + "learning_rate": 0.0002724867724867725, + "loss": 0.2946, + "step": 6530 + }, + { + "epoch": 2.5952380952380953, + "grad_norm": 1.1158372163772583, + "learning_rate": 0.0002698412698412698, + "loss": 0.3429, + "step": 6540 + }, + { + "epoch": 2.5992063492063493, + "grad_norm": 0.6710345149040222, + "learning_rate": 0.0002671957671957672, + "loss": 0.4059, + "step": 6550 + }, + { + "epoch": 2.6031746031746033, + "grad_norm": 0.6601153016090393, + "learning_rate": 0.00026455026455026457, + "loss": 0.4536, + "step": 6560 + }, + { + "epoch": 2.607142857142857, + "grad_norm": 1.258527398109436, + "learning_rate": 0.0002619047619047619, + "loss": 0.3204, + "step": 6570 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.6397349834442139, + "learning_rate": 0.00025925925925925926, + "loss": 0.34, + "step": 6580 + }, + { + "epoch": 2.615079365079365, + "grad_norm": 0.6242520213127136, + "learning_rate": 0.0002566137566137566, + "loss": 0.2976, + "step": 6590 + }, + { + "epoch": 2.619047619047619, + "grad_norm": 1.0702687501907349, + "learning_rate": 0.00025396825396825396, + "loss": 0.302, + "step": 6600 + }, + { + "epoch": 2.623015873015873, + "grad_norm": 0.38248881697654724, + "learning_rate": 0.00025132275132275136, + "loss": 0.2235, + "step": 6610 + }, + { + "epoch": 2.626984126984127, + "grad_norm": 0.67015141248703, + "learning_rate": 0.00024867724867724865, + "loss": 0.3465, + "step": 6620 + }, + { + "epoch": 2.630952380952381, + "grad_norm": 0.9071609377861023, + "learning_rate": 0.000246031746031746, + "loss": 0.3543, + "step": 6630 + }, + { + "epoch": 2.634920634920635, + "grad_norm": 1.1956970691680908, + "learning_rate": 0.00024338624338624337, + "loss": 0.3436, + "step": 6640 + }, + { + "epoch": 2.638888888888889, + "grad_norm": 0.6545996069908142, + "learning_rate": 0.00024074074074074072, + "loss": 0.2752, + "step": 6650 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 0.8755260705947876, + "learning_rate": 0.0002380952380952381, + "loss": 0.3287, + "step": 6660 + }, + { + "epoch": 2.6468253968253967, + "grad_norm": 0.5090301036834717, + "learning_rate": 0.00023544973544973544, + "loss": 0.2705, + "step": 6670 + }, + { + "epoch": 2.6507936507936507, + "grad_norm": 0.776059091091156, + "learning_rate": 0.0002328042328042328, + "loss": 0.315, + "step": 6680 + }, + { + "epoch": 2.6547619047619047, + "grad_norm": 0.752827525138855, + "learning_rate": 0.00023015873015873016, + "loss": 0.4351, + "step": 6690 + }, + { + "epoch": 2.6587301587301586, + "grad_norm": 0.5817317366600037, + "learning_rate": 0.0002275132275132275, + "loss": 0.3015, + "step": 6700 + }, + { + "epoch": 2.6626984126984126, + "grad_norm": 0.7703492641448975, + "learning_rate": 0.00022486772486772486, + "loss": 0.3444, + "step": 6710 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.1149251461029053, + "learning_rate": 0.0002222222222222222, + "loss": 0.3366, + "step": 6720 + }, + { + "epoch": 2.6706349206349205, + "grad_norm": 0.5407519340515137, + "learning_rate": 0.00021957671957671958, + "loss": 0.3062, + "step": 6730 + }, + { + "epoch": 2.674603174603175, + "grad_norm": 0.999150276184082, + "learning_rate": 0.00021693121693121693, + "loss": 0.3877, + "step": 6740 + }, + { + "epoch": 2.678571428571429, + "grad_norm": 1.0281010866165161, + "learning_rate": 0.00021428571428571427, + "loss": 0.3898, + "step": 6750 + }, + { + "epoch": 2.682539682539683, + "grad_norm": 0.5821579098701477, + "learning_rate": 0.00021164021164021162, + "loss": 0.4434, + "step": 6760 + }, + { + "epoch": 2.6865079365079367, + "grad_norm": 0.7311249375343323, + "learning_rate": 0.000208994708994709, + "loss": 0.2707, + "step": 6770 + }, + { + "epoch": 2.6904761904761907, + "grad_norm": 1.1552441120147705, + "learning_rate": 0.00020634920634920634, + "loss": 0.3594, + "step": 6780 + }, + { + "epoch": 2.6944444444444446, + "grad_norm": 0.5154855847358704, + "learning_rate": 0.0002037037037037037, + "loss": 0.2184, + "step": 6790 + }, + { + "epoch": 2.6984126984126986, + "grad_norm": 1.3952319622039795, + "learning_rate": 0.00020105820105820104, + "loss": 0.3249, + "step": 6800 + }, + { + "epoch": 2.7023809523809526, + "grad_norm": 1.300567626953125, + "learning_rate": 0.0001984126984126984, + "loss": 0.3479, + "step": 6810 + }, + { + "epoch": 2.7063492063492065, + "grad_norm": 0.8334280848503113, + "learning_rate": 0.00019576719576719576, + "loss": 0.2659, + "step": 6820 + }, + { + "epoch": 2.7103174603174605, + "grad_norm": 0.6446446776390076, + "learning_rate": 0.0001931216931216931, + "loss": 0.2593, + "step": 6830 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.5977747440338135, + "learning_rate": 0.00019047619047619048, + "loss": 0.3015, + "step": 6840 + }, + { + "epoch": 2.7182539682539684, + "grad_norm": 0.6966296434402466, + "learning_rate": 0.00018783068783068783, + "loss": 0.2615, + "step": 6850 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 1.3358402252197266, + "learning_rate": 0.00018518518518518518, + "loss": 0.3439, + "step": 6860 + }, + { + "epoch": 2.7261904761904763, + "grad_norm": 1.1806023120880127, + "learning_rate": 0.00018253968253968252, + "loss": 0.2893, + "step": 6870 + }, + { + "epoch": 2.7301587301587302, + "grad_norm": 0.6638475656509399, + "learning_rate": 0.0001798941798941799, + "loss": 0.236, + "step": 6880 + }, + { + "epoch": 2.734126984126984, + "grad_norm": 0.9930281639099121, + "learning_rate": 0.00017724867724867724, + "loss": 0.3489, + "step": 6890 + }, + { + "epoch": 2.738095238095238, + "grad_norm": 0.7173562049865723, + "learning_rate": 0.0001746031746031746, + "loss": 0.2933, + "step": 6900 + }, + { + "epoch": 2.742063492063492, + "grad_norm": 1.0661985874176025, + "learning_rate": 0.00017195767195767194, + "loss": 0.2888, + "step": 6910 + }, + { + "epoch": 2.746031746031746, + "grad_norm": 0.6391360759735107, + "learning_rate": 0.00016931216931216931, + "loss": 0.1776, + "step": 6920 + }, + { + "epoch": 2.75, + "grad_norm": 0.8126150965690613, + "learning_rate": 0.00016666666666666666, + "loss": 0.3875, + "step": 6930 + }, + { + "epoch": 2.753968253968254, + "grad_norm": 0.5906522274017334, + "learning_rate": 0.000164021164021164, + "loss": 0.2361, + "step": 6940 + }, + { + "epoch": 2.757936507936508, + "grad_norm": 0.7056891918182373, + "learning_rate": 0.00016137566137566138, + "loss": 0.356, + "step": 6950 + }, + { + "epoch": 2.761904761904762, + "grad_norm": 0.8128471970558167, + "learning_rate": 0.00015873015873015873, + "loss": 0.2629, + "step": 6960 + }, + { + "epoch": 2.765873015873016, + "grad_norm": 0.76819908618927, + "learning_rate": 0.00015608465608465608, + "loss": 0.2926, + "step": 6970 + }, + { + "epoch": 2.7698412698412698, + "grad_norm": 0.8994793891906738, + "learning_rate": 0.00015343915343915342, + "loss": 0.4022, + "step": 6980 + }, + { + "epoch": 2.7738095238095237, + "grad_norm": 0.6637232303619385, + "learning_rate": 0.0001507936507936508, + "loss": 0.393, + "step": 6990 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 1.1956053972244263, + "learning_rate": 0.00014814814814814815, + "loss": 0.3125, + "step": 7000 + }, + { + "epoch": 2.7817460317460316, + "grad_norm": 0.8361969590187073, + "learning_rate": 0.0001455026455026455, + "loss": 0.337, + "step": 7010 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.6757733225822449, + "learning_rate": 0.00014285714285714284, + "loss": 0.266, + "step": 7020 + }, + { + "epoch": 2.7896825396825395, + "grad_norm": 0.808002769947052, + "learning_rate": 0.00014021164021164022, + "loss": 0.3207, + "step": 7030 + }, + { + "epoch": 2.7936507936507935, + "grad_norm": 0.670218288898468, + "learning_rate": 0.00013756613756613756, + "loss": 0.3628, + "step": 7040 + }, + { + "epoch": 2.7976190476190474, + "grad_norm": 0.9069200158119202, + "learning_rate": 0.0001349206349206349, + "loss": 0.4583, + "step": 7050 + }, + { + "epoch": 2.8015873015873014, + "grad_norm": 0.7543951869010925, + "learning_rate": 0.00013227513227513228, + "loss": 0.3408, + "step": 7060 + }, + { + "epoch": 2.8055555555555554, + "grad_norm": 0.6418523788452148, + "learning_rate": 0.00012962962962962963, + "loss": 0.2587, + "step": 7070 + }, + { + "epoch": 2.8095238095238093, + "grad_norm": 0.4243696928024292, + "learning_rate": 0.00012698412698412698, + "loss": 0.2978, + "step": 7080 + }, + { + "epoch": 2.8134920634920633, + "grad_norm": 0.8575748801231384, + "learning_rate": 0.00012433862433862433, + "loss": 0.3119, + "step": 7090 + }, + { + "epoch": 2.817460317460317, + "grad_norm": 0.8136184215545654, + "learning_rate": 0.00012169312169312169, + "loss": 0.245, + "step": 7100 + }, + { + "epoch": 2.821428571428571, + "grad_norm": 1.1264744997024536, + "learning_rate": 0.00011904761904761905, + "loss": 0.3884, + "step": 7110 + }, + { + "epoch": 2.825396825396825, + "grad_norm": 0.6529180407524109, + "learning_rate": 0.0001164021164021164, + "loss": 0.2985, + "step": 7120 + }, + { + "epoch": 2.8293650793650795, + "grad_norm": 1.2286404371261597, + "learning_rate": 0.00011375661375661376, + "loss": 0.3886, + "step": 7130 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.46890988945961, + "learning_rate": 0.0001111111111111111, + "loss": 0.3925, + "step": 7140 + }, + { + "epoch": 2.8373015873015874, + "grad_norm": 0.8656564354896545, + "learning_rate": 0.00010846560846560846, + "loss": 0.2998, + "step": 7150 + }, + { + "epoch": 2.8412698412698414, + "grad_norm": 0.6795648336410522, + "learning_rate": 0.00010582010582010581, + "loss": 0.2654, + "step": 7160 + }, + { + "epoch": 2.8452380952380953, + "grad_norm": 0.9066348075866699, + "learning_rate": 0.00010317460317460317, + "loss": 0.4258, + "step": 7170 + }, + { + "epoch": 2.8492063492063493, + "grad_norm": 1.2527462244033813, + "learning_rate": 0.00010052910052910052, + "loss": 0.3391, + "step": 7180 + }, + { + "epoch": 2.8531746031746033, + "grad_norm": 0.767871081829071, + "learning_rate": 9.788359788359788e-05, + "loss": 0.2898, + "step": 7190 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.3189416825771332, + "learning_rate": 9.523809523809524e-05, + "loss": 0.2147, + "step": 7200 + }, + { + "epoch": 2.861111111111111, + "grad_norm": 0.7316614985466003, + "learning_rate": 9.259259259259259e-05, + "loss": 0.2906, + "step": 7210 + }, + { + "epoch": 2.865079365079365, + "grad_norm": 0.6827272772789001, + "learning_rate": 8.994708994708995e-05, + "loss": 0.2558, + "step": 7220 + }, + { + "epoch": 2.869047619047619, + "grad_norm": 0.40644726157188416, + "learning_rate": 8.73015873015873e-05, + "loss": 0.2516, + "step": 7230 + }, + { + "epoch": 2.873015873015873, + "grad_norm": 0.9451491236686707, + "learning_rate": 8.465608465608466e-05, + "loss": 0.3458, + "step": 7240 + }, + { + "epoch": 2.876984126984127, + "grad_norm": 0.5476970672607422, + "learning_rate": 8.2010582010582e-05, + "loss": 0.3106, + "step": 7250 + }, + { + "epoch": 2.880952380952381, + "grad_norm": 0.8001719117164612, + "learning_rate": 7.936507936507937e-05, + "loss": 0.3429, + "step": 7260 + }, + { + "epoch": 2.884920634920635, + "grad_norm": 0.5511224269866943, + "learning_rate": 7.671957671957671e-05, + "loss": 0.3601, + "step": 7270 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.6623083353042603, + "learning_rate": 7.407407407407407e-05, + "loss": 0.3627, + "step": 7280 + }, + { + "epoch": 2.892857142857143, + "grad_norm": 0.5939965844154358, + "learning_rate": 7.142857142857142e-05, + "loss": 0.3334, + "step": 7290 + }, + { + "epoch": 2.8968253968253967, + "grad_norm": 0.699934184551239, + "learning_rate": 6.878306878306878e-05, + "loss": 0.3701, + "step": 7300 + }, + { + "epoch": 2.9007936507936507, + "grad_norm": 1.0135327577590942, + "learning_rate": 6.613756613756614e-05, + "loss": 0.284, + "step": 7310 + }, + { + "epoch": 2.9047619047619047, + "grad_norm": 0.7858631014823914, + "learning_rate": 6.349206349206349e-05, + "loss": 0.3369, + "step": 7320 + }, + { + "epoch": 2.9087301587301586, + "grad_norm": 0.8691958785057068, + "learning_rate": 6.084656084656084e-05, + "loss": 0.3139, + "step": 7330 + }, + { + "epoch": 2.9126984126984126, + "grad_norm": 0.6003396511077881, + "learning_rate": 5.82010582010582e-05, + "loss": 0.246, + "step": 7340 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 0.5547357797622681, + "learning_rate": 5.555555555555555e-05, + "loss": 0.2271, + "step": 7350 + }, + { + "epoch": 2.9206349206349205, + "grad_norm": 1.1599771976470947, + "learning_rate": 5.2910052910052905e-05, + "loss": 0.3734, + "step": 7360 + }, + { + "epoch": 2.924603174603175, + "grad_norm": 0.7528437972068787, + "learning_rate": 5.026455026455026e-05, + "loss": 0.3274, + "step": 7370 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 1.2826586961746216, + "learning_rate": 4.761904761904762e-05, + "loss": 0.3819, + "step": 7380 + }, + { + "epoch": 2.932539682539683, + "grad_norm": 1.2130956649780273, + "learning_rate": 4.4973544973544974e-05, + "loss": 0.2897, + "step": 7390 + }, + { + "epoch": 2.9365079365079367, + "grad_norm": 0.4103514552116394, + "learning_rate": 4.232804232804233e-05, + "loss": 0.245, + "step": 7400 + }, + { + "epoch": 2.9404761904761907, + "grad_norm": 1.2988202571868896, + "learning_rate": 3.968253968253968e-05, + "loss": 0.3479, + "step": 7410 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.4846343696117401, + "learning_rate": 3.7037037037037037e-05, + "loss": 0.2929, + "step": 7420 + }, + { + "epoch": 2.9484126984126986, + "grad_norm": 1.0788893699645996, + "learning_rate": 3.439153439153439e-05, + "loss": 0.3317, + "step": 7430 + }, + { + "epoch": 2.9523809523809526, + "grad_norm": 0.5080360174179077, + "learning_rate": 3.1746031746031745e-05, + "loss": 0.3498, + "step": 7440 + }, + { + "epoch": 2.9563492063492065, + "grad_norm": 0.8950350284576416, + "learning_rate": 2.91005291005291e-05, + "loss": 0.3807, + "step": 7450 + }, + { + "epoch": 2.9603174603174605, + "grad_norm": 0.5955391526222229, + "learning_rate": 2.6455026455026453e-05, + "loss": 0.3587, + "step": 7460 + }, + { + "epoch": 2.9642857142857144, + "grad_norm": 0.8612658977508545, + "learning_rate": 2.380952380952381e-05, + "loss": 0.3857, + "step": 7470 + }, + { + "epoch": 2.9682539682539684, + "grad_norm": 0.4796072542667389, + "learning_rate": 2.1164021164021164e-05, + "loss": 0.2429, + "step": 7480 + }, + { + "epoch": 2.9722222222222223, + "grad_norm": 0.6475656032562256, + "learning_rate": 1.8518518518518518e-05, + "loss": 0.2567, + "step": 7490 + }, + { + "epoch": 2.9761904761904763, + "grad_norm": 1.2244335412979126, + "learning_rate": 1.5873015873015872e-05, + "loss": 0.3446, + "step": 7500 + }, + { + "epoch": 0.9630674531931265, + "grad_norm": 2.1661019325256348, + "learning_rate": 0.0013579550312045822, + "loss": 0.8551, + "step": 7510 + }, + { + "epoch": 0.9643498332905873, + "grad_norm": 2.811357021331787, + "learning_rate": 0.0013571001111396083, + "loss": 1.0575, + "step": 7520 + }, + { + "epoch": 0.9656322133880482, + "grad_norm": 1.8580657243728638, + "learning_rate": 0.0013562451910746345, + "loss": 0.7093, + "step": 7530 + }, + { + "epoch": 0.9669145934855091, + "grad_norm": 1.7952332496643066, + "learning_rate": 0.0013553902710096606, + "loss": 0.8066, + "step": 7540 + }, + { + "epoch": 0.96819697358297, + "grad_norm": 1.4091452360153198, + "learning_rate": 0.0013545353509446867, + "loss": 0.9003, + "step": 7550 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 0.9127289652824402, + "learning_rate": 0.0013536804308797129, + "loss": 0.607, + "step": 7560 + }, + { + "epoch": 0.9707617337778918, + "grad_norm": 1.1701823472976685, + "learning_rate": 0.001352825510814739, + "loss": 0.8753, + "step": 7570 + }, + { + "epoch": 0.9720441138753526, + "grad_norm": 1.0958774089813232, + "learning_rate": 0.001351970590749765, + "loss": 0.8458, + "step": 7580 + }, + { + "epoch": 0.9733264939728136, + "grad_norm": 1.0484057664871216, + "learning_rate": 0.0013511156706847909, + "loss": 0.8565, + "step": 7590 + }, + { + "epoch": 0.9746088740702744, + "grad_norm": 1.4138461351394653, + "learning_rate": 0.001350260750619817, + "loss": 0.6054, + "step": 7600 + }, + { + "epoch": 0.9758912541677354, + "grad_norm": 1.9181944131851196, + "learning_rate": 0.0013494058305548431, + "loss": 1.0207, + "step": 7610 + }, + { + "epoch": 0.9771736342651962, + "grad_norm": 1.6007705926895142, + "learning_rate": 0.0013485509104898693, + "loss": 0.6417, + "step": 7620 + }, + { + "epoch": 0.9784560143626571, + "grad_norm": 1.3225061893463135, + "learning_rate": 0.0013476959904248954, + "loss": 0.9063, + "step": 7630 + }, + { + "epoch": 0.979738394460118, + "grad_norm": 1.6732155084609985, + "learning_rate": 0.0013468410703599213, + "loss": 1.058, + "step": 7640 + }, + { + "epoch": 0.9810207745575789, + "grad_norm": 1.4079992771148682, + "learning_rate": 0.0013459861502949475, + "loss": 0.9633, + "step": 7650 + }, + { + "epoch": 0.9823031546550397, + "grad_norm": 1.1940516233444214, + "learning_rate": 0.0013451312302299736, + "loss": 0.7728, + "step": 7660 + }, + { + "epoch": 0.9835855347525007, + "grad_norm": 1.1965214014053345, + "learning_rate": 0.0013442763101649995, + "loss": 0.7916, + "step": 7670 + }, + { + "epoch": 0.9848679148499615, + "grad_norm": 1.6329299211502075, + "learning_rate": 0.0013434213901000257, + "loss": 0.702, + "step": 7680 + }, + { + "epoch": 0.9861502949474225, + "grad_norm": 0.9614496827125549, + "learning_rate": 0.0013425664700350518, + "loss": 1.1379, + "step": 7690 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 0.7053365707397461, + "learning_rate": 0.001341711549970078, + "loss": 0.6611, + "step": 7700 + }, + { + "epoch": 0.9887150551423441, + "grad_norm": 1.1425424814224243, + "learning_rate": 0.0013408566299051039, + "loss": 0.8594, + "step": 7710 + }, + { + "epoch": 0.9899974352398051, + "grad_norm": 1.6565475463867188, + "learning_rate": 0.00134000170984013, + "loss": 0.713, + "step": 7720 + }, + { + "epoch": 0.991279815337266, + "grad_norm": 0.6158244609832764, + "learning_rate": 0.0013391467897751561, + "loss": 0.6972, + "step": 7730 + }, + { + "epoch": 0.9925621954347269, + "grad_norm": 1.166113018989563, + "learning_rate": 0.001338291869710182, + "loss": 0.7004, + "step": 7740 + }, + { + "epoch": 0.9938445755321877, + "grad_norm": 1.3203206062316895, + "learning_rate": 0.0013374369496452082, + "loss": 0.8673, + "step": 7750 + }, + { + "epoch": 0.9951269556296486, + "grad_norm": 1.4865373373031616, + "learning_rate": 0.0013365820295802343, + "loss": 0.9206, + "step": 7760 + }, + { + "epoch": 0.9964093357271095, + "grad_norm": 1.5147637128829956, + "learning_rate": 0.0013357271095152602, + "loss": 0.6281, + "step": 7770 + }, + { + "epoch": 0.9976917158245704, + "grad_norm": 2.0644052028656006, + "learning_rate": 0.0013348721894502864, + "loss": 0.8273, + "step": 7780 + }, + { + "epoch": 0.9989740959220312, + "grad_norm": 1.2069566249847412, + "learning_rate": 0.0013340172693853125, + "loss": 0.8031, + "step": 7790 + }, + { + "epoch": 1.000256476019492, + "grad_norm": 0.8759682178497314, + "learning_rate": 0.0013331623493203386, + "loss": 0.7899, + "step": 7800 + }, + { + "epoch": 1.0015388561169531, + "grad_norm": 2.084101438522339, + "learning_rate": 0.0013323074292553648, + "loss": 0.9902, + "step": 7810 + }, + { + "epoch": 1.002821236214414, + "grad_norm": 1.871626377105713, + "learning_rate": 0.0013314525091903907, + "loss": 0.7242, + "step": 7820 + }, + { + "epoch": 1.0041036163118748, + "grad_norm": 1.0887974500656128, + "learning_rate": 0.0013305975891254166, + "loss": 1.0633, + "step": 7830 + }, + { + "epoch": 1.0053859964093357, + "grad_norm": 2.206550359725952, + "learning_rate": 0.0013297426690604428, + "loss": 0.8527, + "step": 7840 + }, + { + "epoch": 1.0066683765067965, + "grad_norm": 1.9103882312774658, + "learning_rate": 0.001328887748995469, + "loss": 0.9871, + "step": 7850 + }, + { + "epoch": 1.0079507566042576, + "grad_norm": 1.2385672330856323, + "learning_rate": 0.001328032828930495, + "loss": 0.7928, + "step": 7860 + }, + { + "epoch": 1.0092331367017184, + "grad_norm": 1.632601261138916, + "learning_rate": 0.0013271779088655212, + "loss": 0.8596, + "step": 7870 + }, + { + "epoch": 1.0105155167991793, + "grad_norm": 0.8926940560340881, + "learning_rate": 0.0013263229888005473, + "loss": 0.576, + "step": 7880 + }, + { + "epoch": 1.0117978968966401, + "grad_norm": 1.632102370262146, + "learning_rate": 0.0013254680687355734, + "loss": 0.717, + "step": 7890 + }, + { + "epoch": 1.013080276994101, + "grad_norm": 1.671094298362732, + "learning_rate": 0.0013246131486705991, + "loss": 0.7507, + "step": 7900 + }, + { + "epoch": 1.014362657091562, + "grad_norm": 1.1313315629959106, + "learning_rate": 0.0013237582286056253, + "loss": 0.7725, + "step": 7910 + }, + { + "epoch": 1.0156450371890229, + "grad_norm": 0.7332689762115479, + "learning_rate": 0.0013229033085406514, + "loss": 0.6037, + "step": 7920 + }, + { + "epoch": 1.0169274172864837, + "grad_norm": 1.1624342203140259, + "learning_rate": 0.0013220483884756776, + "loss": 0.8739, + "step": 7930 + }, + { + "epoch": 1.0182097973839446, + "grad_norm": 1.1926045417785645, + "learning_rate": 0.0013211934684107037, + "loss": 0.6407, + "step": 7940 + }, + { + "epoch": 1.0194921774814054, + "grad_norm": 0.8199732303619385, + "learning_rate": 0.0013203385483457298, + "loss": 0.7909, + "step": 7950 + }, + { + "epoch": 1.0207745575788665, + "grad_norm": 1.0138903856277466, + "learning_rate": 0.0013194836282807557, + "loss": 0.6335, + "step": 7960 + }, + { + "epoch": 1.0220569376763273, + "grad_norm": 1.0864981412887573, + "learning_rate": 0.0013186287082157819, + "loss": 0.6715, + "step": 7970 + }, + { + "epoch": 1.0233393177737882, + "grad_norm": 1.3601443767547607, + "learning_rate": 0.0013177737881508078, + "loss": 0.7332, + "step": 7980 + }, + { + "epoch": 1.024621697871249, + "grad_norm": 1.9008809328079224, + "learning_rate": 0.001316918868085834, + "loss": 0.8091, + "step": 7990 + }, + { + "epoch": 1.0259040779687099, + "grad_norm": 0.7857163548469543, + "learning_rate": 0.00131606394802086, + "loss": 0.7366, + "step": 8000 + }, + { + "epoch": 1.0271864580661707, + "grad_norm": 1.2369189262390137, + "learning_rate": 0.0013152090279558862, + "loss": 0.6633, + "step": 8010 + }, + { + "epoch": 1.0284688381636318, + "grad_norm": 0.9395790696144104, + "learning_rate": 0.0013143541078909123, + "loss": 0.6482, + "step": 8020 + }, + { + "epoch": 1.0297512182610926, + "grad_norm": 0.7295545935630798, + "learning_rate": 0.0013134991878259383, + "loss": 0.5748, + "step": 8030 + }, + { + "epoch": 1.0310335983585535, + "grad_norm": 0.824608564376831, + "learning_rate": 0.0013126442677609644, + "loss": 0.5573, + "step": 8040 + }, + { + "epoch": 1.0323159784560143, + "grad_norm": 0.4935958981513977, + "learning_rate": 0.0013117893476959903, + "loss": 0.7653, + "step": 8050 + }, + { + "epoch": 1.0335983585534751, + "grad_norm": 1.4792139530181885, + "learning_rate": 0.0013109344276310165, + "loss": 0.6865, + "step": 8060 + }, + { + "epoch": 1.0348807386509362, + "grad_norm": 1.3146531581878662, + "learning_rate": 0.0013100795075660426, + "loss": 1.062, + "step": 8070 + }, + { + "epoch": 1.036163118748397, + "grad_norm": 0.7179372906684875, + "learning_rate": 0.0013092245875010687, + "loss": 0.6821, + "step": 8080 + }, + { + "epoch": 1.037445498845858, + "grad_norm": 0.9141503572463989, + "learning_rate": 0.0013083696674360947, + "loss": 0.7213, + "step": 8090 + }, + { + "epoch": 1.0387278789433187, + "grad_norm": 1.0443553924560547, + "learning_rate": 0.0013075147473711208, + "loss": 0.7408, + "step": 8100 + }, + { + "epoch": 1.0400102590407796, + "grad_norm": 1.3247910737991333, + "learning_rate": 0.001306659827306147, + "loss": 0.7333, + "step": 8110 + }, + { + "epoch": 1.0412926391382407, + "grad_norm": 1.7711740732192993, + "learning_rate": 0.001305804907241173, + "loss": 0.5489, + "step": 8120 + }, + { + "epoch": 1.0425750192357015, + "grad_norm": 0.6265628337860107, + "learning_rate": 0.001304949987176199, + "loss": 0.6677, + "step": 8130 + }, + { + "epoch": 1.0438573993331624, + "grad_norm": 0.7401356101036072, + "learning_rate": 0.0013040950671112251, + "loss": 0.7797, + "step": 8140 + }, + { + "epoch": 1.0451397794306232, + "grad_norm": 1.509304165840149, + "learning_rate": 0.0013032401470462513, + "loss": 0.5999, + "step": 8150 + }, + { + "epoch": 1.046422159528084, + "grad_norm": 0.9695005416870117, + "learning_rate": 0.0013023852269812772, + "loss": 0.7253, + "step": 8160 + }, + { + "epoch": 1.047704539625545, + "grad_norm": 0.6109398603439331, + "learning_rate": 0.0013015303069163033, + "loss": 0.7742, + "step": 8170 + }, + { + "epoch": 1.048986919723006, + "grad_norm": 1.064182996749878, + "learning_rate": 0.0013006753868513295, + "loss": 0.9212, + "step": 8180 + }, + { + "epoch": 1.0502692998204668, + "grad_norm": 0.6978124976158142, + "learning_rate": 0.0012998204667863556, + "loss": 0.4896, + "step": 8190 + }, + { + "epoch": 1.0515516799179276, + "grad_norm": 1.6062722206115723, + "learning_rate": 0.0012989655467213817, + "loss": 0.7899, + "step": 8200 + }, + { + "epoch": 1.0528340600153885, + "grad_norm": 1.7857177257537842, + "learning_rate": 0.0012981106266564076, + "loss": 0.6839, + "step": 8210 + }, + { + "epoch": 1.0541164401128496, + "grad_norm": 0.8134258985519409, + "learning_rate": 0.0012972557065914336, + "loss": 0.7301, + "step": 8220 + }, + { + "epoch": 1.0553988202103104, + "grad_norm": 0.9167507886886597, + "learning_rate": 0.0012964007865264597, + "loss": 0.5785, + "step": 8230 + }, + { + "epoch": 1.0566812003077712, + "grad_norm": 1.9447743892669678, + "learning_rate": 0.0012955458664614858, + "loss": 0.8407, + "step": 8240 + }, + { + "epoch": 1.057963580405232, + "grad_norm": 1.4368077516555786, + "learning_rate": 0.001294690946396512, + "loss": 0.6482, + "step": 8250 + }, + { + "epoch": 1.059245960502693, + "grad_norm": 1.1576851606369019, + "learning_rate": 0.0012938360263315381, + "loss": 0.7483, + "step": 8260 + }, + { + "epoch": 1.060528340600154, + "grad_norm": 0.8467000126838684, + "learning_rate": 0.0012929811062665642, + "loss": 0.5062, + "step": 8270 + }, + { + "epoch": 1.0618107206976148, + "grad_norm": 1.393489956855774, + "learning_rate": 0.0012921261862015904, + "loss": 0.6136, + "step": 8280 + }, + { + "epoch": 1.0630931007950757, + "grad_norm": 1.543832540512085, + "learning_rate": 0.001291271266136616, + "loss": 0.6688, + "step": 8290 + }, + { + "epoch": 1.0643754808925365, + "grad_norm": 1.3143365383148193, + "learning_rate": 0.0012904163460716422, + "loss": 0.7964, + "step": 8300 + }, + { + "epoch": 1.0656578609899974, + "grad_norm": 0.8674246072769165, + "learning_rate": 0.0012895614260066684, + "loss": 0.6107, + "step": 8310 + }, + { + "epoch": 1.0669402410874582, + "grad_norm": 1.773195743560791, + "learning_rate": 0.0012887065059416945, + "loss": 0.5933, + "step": 8320 + }, + { + "epoch": 1.0682226211849193, + "grad_norm": 1.678631067276001, + "learning_rate": 0.0012878515858767206, + "loss": 0.6461, + "step": 8330 + }, + { + "epoch": 1.0695050012823801, + "grad_norm": 1.5302932262420654, + "learning_rate": 0.0012869966658117468, + "loss": 0.6384, + "step": 8340 + }, + { + "epoch": 1.070787381379841, + "grad_norm": 2.0298891067504883, + "learning_rate": 0.0012861417457467727, + "loss": 0.7581, + "step": 8350 + }, + { + "epoch": 1.0720697614773018, + "grad_norm": 0.6674436926841736, + "learning_rate": 0.0012852868256817988, + "loss": 0.5885, + "step": 8360 + }, + { + "epoch": 1.0733521415747627, + "grad_norm": 2.016268491744995, + "learning_rate": 0.0012844319056168247, + "loss": 0.7515, + "step": 8370 + }, + { + "epoch": 1.0746345216722237, + "grad_norm": 0.7742639780044556, + "learning_rate": 0.0012835769855518509, + "loss": 0.8347, + "step": 8380 + }, + { + "epoch": 1.0759169017696846, + "grad_norm": 0.7427125573158264, + "learning_rate": 0.001282722065486877, + "loss": 0.6399, + "step": 8390 + }, + { + "epoch": 1.0771992818671454, + "grad_norm": 1.2341949939727783, + "learning_rate": 0.0012818671454219032, + "loss": 0.7604, + "step": 8400 + }, + { + "epoch": 1.0784816619646063, + "grad_norm": 1.360198974609375, + "learning_rate": 0.0012810122253569293, + "loss": 0.6709, + "step": 8410 + }, + { + "epoch": 1.079764042062067, + "grad_norm": 0.9719991087913513, + "learning_rate": 0.0012801573052919552, + "loss": 0.6708, + "step": 8420 + }, + { + "epoch": 1.0810464221595282, + "grad_norm": 1.0374557971954346, + "learning_rate": 0.0012793023852269813, + "loss": 0.7977, + "step": 8430 + }, + { + "epoch": 1.082328802256989, + "grad_norm": 1.6081194877624512, + "learning_rate": 0.0012784474651620073, + "loss": 0.7365, + "step": 8440 + }, + { + "epoch": 1.0836111823544499, + "grad_norm": 1.6259393692016602, + "learning_rate": 0.0012775925450970334, + "loss": 0.9605, + "step": 8450 + }, + { + "epoch": 1.0848935624519107, + "grad_norm": 1.2967445850372314, + "learning_rate": 0.0012767376250320595, + "loss": 0.6966, + "step": 8460 + }, + { + "epoch": 1.0861759425493716, + "grad_norm": 1.928068995475769, + "learning_rate": 0.0012758827049670857, + "loss": 0.7251, + "step": 8470 + }, + { + "epoch": 1.0874583226468326, + "grad_norm": 0.8280344605445862, + "learning_rate": 0.0012750277849021116, + "loss": 0.6462, + "step": 8480 + }, + { + "epoch": 1.0887407027442935, + "grad_norm": 1.297142744064331, + "learning_rate": 0.0012741728648371377, + "loss": 0.6173, + "step": 8490 + }, + { + "epoch": 1.0900230828417543, + "grad_norm": 1.008922815322876, + "learning_rate": 0.0012733179447721639, + "loss": 0.6061, + "step": 8500 + }, + { + "epoch": 1.0913054629392152, + "grad_norm": 1.0894274711608887, + "learning_rate": 0.00127246302470719, + "loss": 0.553, + "step": 8510 + }, + { + "epoch": 1.092587843036676, + "grad_norm": 0.8227475881576538, + "learning_rate": 0.001271608104642216, + "loss": 0.8644, + "step": 8520 + }, + { + "epoch": 1.0938702231341368, + "grad_norm": 1.1827560663223267, + "learning_rate": 0.001270753184577242, + "loss": 0.6479, + "step": 8530 + }, + { + "epoch": 1.095152603231598, + "grad_norm": 0.7578190565109253, + "learning_rate": 0.001269898264512268, + "loss": 0.6047, + "step": 8540 + }, + { + "epoch": 1.0964349833290588, + "grad_norm": 1.0699750185012817, + "learning_rate": 0.0012690433444472941, + "loss": 0.6204, + "step": 8550 + }, + { + "epoch": 1.0977173634265196, + "grad_norm": 0.6562129259109497, + "learning_rate": 0.0012681884243823203, + "loss": 0.7301, + "step": 8560 + }, + { + "epoch": 1.0989997435239804, + "grad_norm": 1.25229811668396, + "learning_rate": 0.0012673335043173464, + "loss": 0.7684, + "step": 8570 + }, + { + "epoch": 1.1002821236214415, + "grad_norm": 2.2153637409210205, + "learning_rate": 0.0012664785842523725, + "loss": 0.8262, + "step": 8580 + }, + { + "epoch": 1.1015645037189024, + "grad_norm": 1.5694715976715088, + "learning_rate": 0.0012656236641873987, + "loss": 0.793, + "step": 8590 + }, + { + "epoch": 1.1028468838163632, + "grad_norm": 1.0267013311386108, + "learning_rate": 0.0012647687441224246, + "loss": 0.7762, + "step": 8600 + }, + { + "epoch": 1.104129263913824, + "grad_norm": 1.9953442811965942, + "learning_rate": 0.0012639138240574505, + "loss": 0.8645, + "step": 8610 + }, + { + "epoch": 1.1054116440112849, + "grad_norm": 1.799996018409729, + "learning_rate": 0.0012630589039924766, + "loss": 0.8559, + "step": 8620 + }, + { + "epoch": 1.1066940241087457, + "grad_norm": 0.7598561644554138, + "learning_rate": 0.0012622039839275028, + "loss": 0.5443, + "step": 8630 + }, + { + "epoch": 1.1079764042062068, + "grad_norm": 1.0869311094284058, + "learning_rate": 0.001261349063862529, + "loss": 0.6119, + "step": 8640 + }, + { + "epoch": 1.1092587843036676, + "grad_norm": 3.7719571590423584, + "learning_rate": 0.001260494143797555, + "loss": 0.6665, + "step": 8650 + }, + { + "epoch": 1.1105411644011285, + "grad_norm": 0.8712837100028992, + "learning_rate": 0.0012596392237325812, + "loss": 0.6976, + "step": 8660 + }, + { + "epoch": 1.1118235444985893, + "grad_norm": 0.7456098794937134, + "learning_rate": 0.001258784303667607, + "loss": 0.531, + "step": 8670 + }, + { + "epoch": 1.1131059245960502, + "grad_norm": 0.8376585245132446, + "learning_rate": 0.001257929383602633, + "loss": 0.6767, + "step": 8680 + }, + { + "epoch": 1.1143883046935112, + "grad_norm": 0.8283999562263489, + "learning_rate": 0.0012570744635376592, + "loss": 0.7066, + "step": 8690 + }, + { + "epoch": 1.115670684790972, + "grad_norm": 1.3523050546646118, + "learning_rate": 0.0012562195434726853, + "loss": 0.8031, + "step": 8700 + }, + { + "epoch": 1.116953064888433, + "grad_norm": 0.8163665533065796, + "learning_rate": 0.0012553646234077114, + "loss": 0.6172, + "step": 8710 + }, + { + "epoch": 1.1182354449858938, + "grad_norm": 0.7833021879196167, + "learning_rate": 0.0012545097033427376, + "loss": 0.6963, + "step": 8720 + }, + { + "epoch": 1.1195178250833546, + "grad_norm": 1.102979302406311, + "learning_rate": 0.0012536547832777637, + "loss": 0.5429, + "step": 8730 + }, + { + "epoch": 1.1208002051808157, + "grad_norm": 0.6765029430389404, + "learning_rate": 0.0012527998632127896, + "loss": 0.7414, + "step": 8740 + }, + { + "epoch": 1.1220825852782765, + "grad_norm": 0.9990458488464355, + "learning_rate": 0.0012519449431478155, + "loss": 0.7734, + "step": 8750 + }, + { + "epoch": 1.1233649653757374, + "grad_norm": 0.533682107925415, + "learning_rate": 0.0012510900230828417, + "loss": 0.538, + "step": 8760 + }, + { + "epoch": 1.1246473454731982, + "grad_norm": 0.8333368301391602, + "learning_rate": 0.0012502351030178678, + "loss": 0.6792, + "step": 8770 + }, + { + "epoch": 1.125929725570659, + "grad_norm": 1.0533137321472168, + "learning_rate": 0.001249380182952894, + "loss": 0.8741, + "step": 8780 + }, + { + "epoch": 1.1272121056681201, + "grad_norm": 0.6982734203338623, + "learning_rate": 0.00124852526288792, + "loss": 0.487, + "step": 8790 + }, + { + "epoch": 1.128494485765581, + "grad_norm": 1.0995063781738281, + "learning_rate": 0.001247670342822946, + "loss": 0.6424, + "step": 8800 + }, + { + "epoch": 1.1297768658630418, + "grad_norm": 1.018072485923767, + "learning_rate": 0.0012468154227579722, + "loss": 0.5269, + "step": 8810 + }, + { + "epoch": 1.1310592459605027, + "grad_norm": 1.1999799013137817, + "learning_rate": 0.0012459605026929983, + "loss": 0.6517, + "step": 8820 + }, + { + "epoch": 1.1323416260579635, + "grad_norm": 1.4877429008483887, + "learning_rate": 0.0012451055826280242, + "loss": 0.7614, + "step": 8830 + }, + { + "epoch": 1.1336240061554244, + "grad_norm": 1.8721554279327393, + "learning_rate": 0.0012442506625630503, + "loss": 0.7041, + "step": 8840 + }, + { + "epoch": 1.1349063862528854, + "grad_norm": 0.9679120779037476, + "learning_rate": 0.0012433957424980765, + "loss": 0.6513, + "step": 8850 + }, + { + "epoch": 1.1361887663503463, + "grad_norm": 1.2489194869995117, + "learning_rate": 0.0012425408224331026, + "loss": 0.7101, + "step": 8860 + }, + { + "epoch": 1.1374711464478071, + "grad_norm": 1.4882025718688965, + "learning_rate": 0.0012416859023681285, + "loss": 0.8231, + "step": 8870 + }, + { + "epoch": 1.138753526545268, + "grad_norm": 1.0859804153442383, + "learning_rate": 0.0012408309823031547, + "loss": 0.6414, + "step": 8880 + }, + { + "epoch": 1.140035906642729, + "grad_norm": 1.027529239654541, + "learning_rate": 0.0012399760622381808, + "loss": 0.5815, + "step": 8890 + }, + { + "epoch": 1.1413182867401899, + "grad_norm": 0.7189488410949707, + "learning_rate": 0.001239121142173207, + "loss": 0.6481, + "step": 8900 + }, + { + "epoch": 1.1426006668376507, + "grad_norm": 0.4419424831867218, + "learning_rate": 0.0012382662221082329, + "loss": 0.6162, + "step": 8910 + }, + { + "epoch": 1.1438830469351116, + "grad_norm": 1.4418524503707886, + "learning_rate": 0.001237411302043259, + "loss": 0.7172, + "step": 8920 + }, + { + "epoch": 1.1451654270325724, + "grad_norm": 1.9095782041549683, + "learning_rate": 0.001236556381978285, + "loss": 0.691, + "step": 8930 + }, + { + "epoch": 1.1464478071300332, + "grad_norm": 1.3859055042266846, + "learning_rate": 0.001235701461913311, + "loss": 0.7949, + "step": 8940 + }, + { + "epoch": 1.1477301872274943, + "grad_norm": 1.1789556741714478, + "learning_rate": 0.0012348465418483372, + "loss": 0.6125, + "step": 8950 + }, + { + "epoch": 1.1490125673249552, + "grad_norm": 1.1849820613861084, + "learning_rate": 0.0012339916217833633, + "loss": 0.7081, + "step": 8960 + }, + { + "epoch": 1.150294947422416, + "grad_norm": 1.8381001949310303, + "learning_rate": 0.0012331367017183895, + "loss": 0.6178, + "step": 8970 + }, + { + "epoch": 1.1515773275198768, + "grad_norm": 1.472754955291748, + "learning_rate": 0.0012322817816534156, + "loss": 0.6599, + "step": 8980 + }, + { + "epoch": 1.1528597076173377, + "grad_norm": 1.1755315065383911, + "learning_rate": 0.0012314268615884413, + "loss": 0.6519, + "step": 8990 + }, + { + "epoch": 1.1541420877147988, + "grad_norm": 1.3931992053985596, + "learning_rate": 0.0012305719415234674, + "loss": 0.7395, + "step": 9000 + }, + { + "epoch": 1.1554244678122596, + "grad_norm": 1.171525001525879, + "learning_rate": 0.0012297170214584936, + "loss": 0.6334, + "step": 9010 + }, + { + "epoch": 1.1567068479097204, + "grad_norm": 0.9669147729873657, + "learning_rate": 0.0012288621013935197, + "loss": 0.5607, + "step": 9020 + }, + { + "epoch": 1.1579892280071813, + "grad_norm": 1.3448598384857178, + "learning_rate": 0.0012280071813285459, + "loss": 0.6036, + "step": 9030 + }, + { + "epoch": 1.1592716081046421, + "grad_norm": 0.9272229671478271, + "learning_rate": 0.001227152261263572, + "loss": 0.6345, + "step": 9040 + }, + { + "epoch": 1.160553988202103, + "grad_norm": 1.4232205152511597, + "learning_rate": 0.0012262973411985981, + "loss": 0.5926, + "step": 9050 + }, + { + "epoch": 1.161836368299564, + "grad_norm": 1.1732438802719116, + "learning_rate": 0.001225442421133624, + "loss": 0.6894, + "step": 9060 + }, + { + "epoch": 1.163118748397025, + "grad_norm": 1.3374831676483154, + "learning_rate": 0.00122458750106865, + "loss": 0.5419, + "step": 9070 + }, + { + "epoch": 1.1644011284944857, + "grad_norm": 1.0163809061050415, + "learning_rate": 0.001223732581003676, + "loss": 0.537, + "step": 9080 + }, + { + "epoch": 1.1656835085919466, + "grad_norm": 1.285212755203247, + "learning_rate": 0.0012228776609387022, + "loss": 0.6088, + "step": 9090 + }, + { + "epoch": 1.1669658886894076, + "grad_norm": 0.41504955291748047, + "learning_rate": 0.0012220227408737284, + "loss": 0.5944, + "step": 9100 + }, + { + "epoch": 1.1682482687868685, + "grad_norm": 1.1668952703475952, + "learning_rate": 0.0012211678208087545, + "loss": 0.6529, + "step": 9110 + }, + { + "epoch": 1.1695306488843293, + "grad_norm": 2.5007708072662354, + "learning_rate": 0.0012203129007437804, + "loss": 0.7975, + "step": 9120 + }, + { + "epoch": 1.1708130289817902, + "grad_norm": 0.4132268726825714, + "learning_rate": 0.0012194579806788066, + "loss": 0.5383, + "step": 9130 + }, + { + "epoch": 1.172095409079251, + "grad_norm": 0.9651444554328918, + "learning_rate": 0.0012186030606138325, + "loss": 0.6613, + "step": 9140 + }, + { + "epoch": 1.1733777891767119, + "grad_norm": 1.2722069025039673, + "learning_rate": 0.0012177481405488586, + "loss": 0.8106, + "step": 9150 + }, + { + "epoch": 1.174660169274173, + "grad_norm": 1.5842227935791016, + "learning_rate": 0.0012168932204838848, + "loss": 0.5899, + "step": 9160 + }, + { + "epoch": 1.1759425493716338, + "grad_norm": 0.7606542110443115, + "learning_rate": 0.001216038300418911, + "loss": 0.6511, + "step": 9170 + }, + { + "epoch": 1.1772249294690946, + "grad_norm": 0.9012206196784973, + "learning_rate": 0.001215183380353937, + "loss": 0.5919, + "step": 9180 + }, + { + "epoch": 1.1785073095665555, + "grad_norm": 1.250051736831665, + "learning_rate": 0.001214328460288963, + "loss": 0.6909, + "step": 9190 + }, + { + "epoch": 1.1797896896640165, + "grad_norm": 1.4063526391983032, + "learning_rate": 0.001213473540223989, + "loss": 0.5535, + "step": 9200 + }, + { + "epoch": 1.1810720697614774, + "grad_norm": 0.7005236148834229, + "learning_rate": 0.0012126186201590152, + "loss": 0.5309, + "step": 9210 + }, + { + "epoch": 1.1823544498589382, + "grad_norm": 1.317863941192627, + "learning_rate": 0.0012117637000940411, + "loss": 0.8211, + "step": 9220 + }, + { + "epoch": 1.183636829956399, + "grad_norm": 1.379496693611145, + "learning_rate": 0.0012109087800290673, + "loss": 0.8195, + "step": 9230 + }, + { + "epoch": 1.18491921005386, + "grad_norm": 0.9941421747207642, + "learning_rate": 0.0012100538599640934, + "loss": 0.7493, + "step": 9240 + }, + { + "epoch": 1.1862015901513208, + "grad_norm": 1.5360379219055176, + "learning_rate": 0.0012091989398991193, + "loss": 0.7467, + "step": 9250 + }, + { + "epoch": 1.1874839702487818, + "grad_norm": 0.7074049711227417, + "learning_rate": 0.0012083440198341455, + "loss": 0.755, + "step": 9260 + }, + { + "epoch": 1.1887663503462427, + "grad_norm": 1.1832996606826782, + "learning_rate": 0.0012074890997691716, + "loss": 0.6105, + "step": 9270 + }, + { + "epoch": 1.1900487304437035, + "grad_norm": 0.9239598512649536, + "learning_rate": 0.0012066341797041978, + "loss": 0.6783, + "step": 9280 + }, + { + "epoch": 1.1913311105411644, + "grad_norm": 1.3701421022415161, + "learning_rate": 0.0012057792596392239, + "loss": 0.6904, + "step": 9290 + }, + { + "epoch": 1.1926134906386252, + "grad_norm": 1.2199441194534302, + "learning_rate": 0.0012049243395742498, + "loss": 0.5398, + "step": 9300 + }, + { + "epoch": 1.1938958707360863, + "grad_norm": 1.273148536682129, + "learning_rate": 0.001204069419509276, + "loss": 0.5927, + "step": 9310 + }, + { + "epoch": 1.1951782508335471, + "grad_norm": 1.4068207740783691, + "learning_rate": 0.0012032144994443019, + "loss": 0.7114, + "step": 9320 + }, + { + "epoch": 1.196460630931008, + "grad_norm": 0.7752937078475952, + "learning_rate": 0.001202359579379328, + "loss": 0.7165, + "step": 9330 + }, + { + "epoch": 1.1977430110284688, + "grad_norm": 0.880491316318512, + "learning_rate": 0.0012015046593143541, + "loss": 0.6698, + "step": 9340 + }, + { + "epoch": 1.1990253911259297, + "grad_norm": 0.8572263121604919, + "learning_rate": 0.0012006497392493803, + "loss": 0.5999, + "step": 9350 + }, + { + "epoch": 1.2003077712233905, + "grad_norm": 1.0356217622756958, + "learning_rate": 0.0011997948191844064, + "loss": 0.6217, + "step": 9360 + }, + { + "epoch": 1.2015901513208516, + "grad_norm": 0.6338940262794495, + "learning_rate": 0.0011989398991194325, + "loss": 0.5119, + "step": 9370 + }, + { + "epoch": 1.2028725314183124, + "grad_norm": 0.7291190028190613, + "learning_rate": 0.0011980849790544582, + "loss": 0.5494, + "step": 9380 + }, + { + "epoch": 1.2041549115157733, + "grad_norm": 1.3608429431915283, + "learning_rate": 0.0011972300589894844, + "loss": 0.5987, + "step": 9390 + }, + { + "epoch": 1.205437291613234, + "grad_norm": 0.8818786144256592, + "learning_rate": 0.0011963751389245105, + "loss": 0.6058, + "step": 9400 + }, + { + "epoch": 1.2067196717106952, + "grad_norm": 0.4697217345237732, + "learning_rate": 0.0011955202188595367, + "loss": 0.6277, + "step": 9410 + }, + { + "epoch": 1.208002051808156, + "grad_norm": 1.4859899282455444, + "learning_rate": 0.0011946652987945628, + "loss": 0.5474, + "step": 9420 + }, + { + "epoch": 1.2092844319056169, + "grad_norm": 1.107643723487854, + "learning_rate": 0.001193810378729589, + "loss": 0.6741, + "step": 9430 + }, + { + "epoch": 1.2105668120030777, + "grad_norm": 1.3313883543014526, + "learning_rate": 0.001192955458664615, + "loss": 0.6354, + "step": 9440 + }, + { + "epoch": 1.2118491921005385, + "grad_norm": 1.3976408243179321, + "learning_rate": 0.0011921005385996408, + "loss": 0.5456, + "step": 9450 + }, + { + "epoch": 1.2131315721979994, + "grad_norm": 0.9394209384918213, + "learning_rate": 0.001191245618534667, + "loss": 0.5251, + "step": 9460 + }, + { + "epoch": 1.2144139522954605, + "grad_norm": 1.3019652366638184, + "learning_rate": 0.001190390698469693, + "loss": 0.8192, + "step": 9470 + }, + { + "epoch": 1.2156963323929213, + "grad_norm": 1.342137098312378, + "learning_rate": 0.0011895357784047192, + "loss": 0.4957, + "step": 9480 + }, + { + "epoch": 1.2169787124903821, + "grad_norm": 0.8409485220909119, + "learning_rate": 0.0011886808583397453, + "loss": 0.6699, + "step": 9490 + }, + { + "epoch": 1.218261092587843, + "grad_norm": 1.7443925142288208, + "learning_rate": 0.0011878259382747715, + "loss": 0.7271, + "step": 9500 + }, + { + "epoch": 1.2195434726853038, + "grad_norm": 1.7577857971191406, + "learning_rate": 0.0011869710182097974, + "loss": 0.5655, + "step": 9510 + }, + { + "epoch": 1.220825852782765, + "grad_norm": 1.430893063545227, + "learning_rate": 0.0011861160981448235, + "loss": 0.6315, + "step": 9520 + }, + { + "epoch": 1.2221082328802257, + "grad_norm": 0.5352253913879395, + "learning_rate": 0.0011852611780798494, + "loss": 0.6559, + "step": 9530 + }, + { + "epoch": 1.2233906129776866, + "grad_norm": 0.7444478869438171, + "learning_rate": 0.0011844062580148756, + "loss": 0.5961, + "step": 9540 + }, + { + "epoch": 1.2246729930751474, + "grad_norm": 1.430808186531067, + "learning_rate": 0.0011835513379499017, + "loss": 0.6427, + "step": 9550 + }, + { + "epoch": 1.2259553731726083, + "grad_norm": 1.0020971298217773, + "learning_rate": 0.0011826964178849278, + "loss": 0.6509, + "step": 9560 + }, + { + "epoch": 1.2272377532700693, + "grad_norm": 0.9940693974494934, + "learning_rate": 0.001181841497819954, + "loss": 0.5086, + "step": 9570 + }, + { + "epoch": 1.2285201333675302, + "grad_norm": 0.8661133050918579, + "learning_rate": 0.00118098657775498, + "loss": 0.6015, + "step": 9580 + }, + { + "epoch": 1.229802513464991, + "grad_norm": 1.14053475856781, + "learning_rate": 0.001180131657690006, + "loss": 0.544, + "step": 9590 + }, + { + "epoch": 1.2310848935624519, + "grad_norm": 0.6881473660469055, + "learning_rate": 0.0011792767376250322, + "loss": 0.5568, + "step": 9600 + }, + { + "epoch": 1.2323672736599127, + "grad_norm": 0.9339885115623474, + "learning_rate": 0.001178421817560058, + "loss": 0.7278, + "step": 9610 + }, + { + "epoch": 1.2336496537573738, + "grad_norm": 0.9663743376731873, + "learning_rate": 0.0011775668974950842, + "loss": 0.5526, + "step": 9620 + }, + { + "epoch": 1.2349320338548346, + "grad_norm": 0.5652614235877991, + "learning_rate": 0.0011767119774301104, + "loss": 0.6143, + "step": 9630 + }, + { + "epoch": 1.2362144139522955, + "grad_norm": 1.0602763891220093, + "learning_rate": 0.0011758570573651363, + "loss": 0.5072, + "step": 9640 + }, + { + "epoch": 1.2374967940497563, + "grad_norm": 1.2798588275909424, + "learning_rate": 0.0011750021373001624, + "loss": 0.4941, + "step": 9650 + }, + { + "epoch": 1.2387791741472172, + "grad_norm": 0.8834647536277771, + "learning_rate": 0.0011741472172351886, + "loss": 0.7828, + "step": 9660 + }, + { + "epoch": 1.240061554244678, + "grad_norm": 0.47825196385383606, + "learning_rate": 0.0011732922971702147, + "loss": 0.5121, + "step": 9670 + }, + { + "epoch": 1.241343934342139, + "grad_norm": 1.1528728008270264, + "learning_rate": 0.0011724373771052408, + "loss": 0.6023, + "step": 9680 + }, + { + "epoch": 1.2426263144396, + "grad_norm": 0.7429089546203613, + "learning_rate": 0.0011715824570402667, + "loss": 0.6159, + "step": 9690 + }, + { + "epoch": 1.2439086945370608, + "grad_norm": 0.700433075428009, + "learning_rate": 0.0011707275369752927, + "loss": 0.5488, + "step": 9700 + }, + { + "epoch": 1.2451910746345216, + "grad_norm": 0.9546358585357666, + "learning_rate": 0.0011698726169103188, + "loss": 0.6106, + "step": 9710 + }, + { + "epoch": 1.2464734547319827, + "grad_norm": 0.6889375448226929, + "learning_rate": 0.001169017696845345, + "loss": 0.672, + "step": 9720 + }, + { + "epoch": 1.2477558348294435, + "grad_norm": 0.6451250314712524, + "learning_rate": 0.001168162776780371, + "loss": 0.7776, + "step": 9730 + }, + { + "epoch": 1.2490382149269044, + "grad_norm": 0.6140780448913574, + "learning_rate": 0.0011673078567153972, + "loss": 0.6053, + "step": 9740 + }, + { + "epoch": 1.2503205950243652, + "grad_norm": 0.8168278932571411, + "learning_rate": 0.0011664529366504234, + "loss": 0.6648, + "step": 9750 + }, + { + "epoch": 1.251602975121826, + "grad_norm": 0.7731073498725891, + "learning_rate": 0.0011655980165854495, + "loss": 0.7018, + "step": 9760 + }, + { + "epoch": 1.252885355219287, + "grad_norm": 1.4403185844421387, + "learning_rate": 0.0011647430965204752, + "loss": 0.7392, + "step": 9770 + }, + { + "epoch": 1.254167735316748, + "grad_norm": 2.162862777709961, + "learning_rate": 0.0011638881764555013, + "loss": 0.7369, + "step": 9780 + }, + { + "epoch": 1.2554501154142088, + "grad_norm": 1.393710970878601, + "learning_rate": 0.0011630332563905275, + "loss": 0.704, + "step": 9790 + }, + { + "epoch": 1.2567324955116697, + "grad_norm": 0.6522937417030334, + "learning_rate": 0.0011621783363255536, + "loss": 0.522, + "step": 9800 + }, + { + "epoch": 1.2580148756091305, + "grad_norm": 0.6343621611595154, + "learning_rate": 0.0011613234162605797, + "loss": 0.6998, + "step": 9810 + }, + { + "epoch": 1.2592972557065916, + "grad_norm": 1.105334758758545, + "learning_rate": 0.0011604684961956059, + "loss": 0.582, + "step": 9820 + }, + { + "epoch": 1.2605796358040524, + "grad_norm": 1.2634021043777466, + "learning_rate": 0.0011596135761306318, + "loss": 0.5776, + "step": 9830 + }, + { + "epoch": 1.2618620159015133, + "grad_norm": 1.232373595237732, + "learning_rate": 0.0011587586560656577, + "loss": 0.6949, + "step": 9840 + }, + { + "epoch": 1.263144395998974, + "grad_norm": 1.2917943000793457, + "learning_rate": 0.0011579037360006838, + "loss": 0.5866, + "step": 9850 + }, + { + "epoch": 1.264426776096435, + "grad_norm": 1.0393379926681519, + "learning_rate": 0.00115704881593571, + "loss": 0.6554, + "step": 9860 + }, + { + "epoch": 1.2657091561938958, + "grad_norm": 0.9786701202392578, + "learning_rate": 0.0011561938958707361, + "loss": 0.4743, + "step": 9870 + }, + { + "epoch": 1.2669915362913566, + "grad_norm": 0.6891704201698303, + "learning_rate": 0.0011553389758057623, + "loss": 0.6969, + "step": 9880 + }, + { + "epoch": 1.2682739163888177, + "grad_norm": 1.0330877304077148, + "learning_rate": 0.0011544840557407884, + "loss": 0.5176, + "step": 9890 + }, + { + "epoch": 1.2695562964862785, + "grad_norm": 1.5313884019851685, + "learning_rate": 0.0011536291356758143, + "loss": 0.8418, + "step": 9900 + }, + { + "epoch": 1.2708386765837394, + "grad_norm": 1.8381309509277344, + "learning_rate": 0.0011527742156108405, + "loss": 0.5932, + "step": 9910 + }, + { + "epoch": 1.2721210566812002, + "grad_norm": 0.8131228685379028, + "learning_rate": 0.0011519192955458664, + "loss": 0.5982, + "step": 9920 + }, + { + "epoch": 1.2734034367786613, + "grad_norm": 0.9269918203353882, + "learning_rate": 0.0011510643754808925, + "loss": 0.588, + "step": 9930 + }, + { + "epoch": 1.2746858168761221, + "grad_norm": 1.5636909008026123, + "learning_rate": 0.0011502094554159186, + "loss": 0.638, + "step": 9940 + }, + { + "epoch": 1.275968196973583, + "grad_norm": 0.8227086067199707, + "learning_rate": 0.0011493545353509448, + "loss": 0.6211, + "step": 9950 + }, + { + "epoch": 1.2772505770710438, + "grad_norm": 0.5944573283195496, + "learning_rate": 0.0011484996152859707, + "loss": 0.65, + "step": 9960 + }, + { + "epoch": 1.2785329571685047, + "grad_norm": 1.4585204124450684, + "learning_rate": 0.0011476446952209968, + "loss": 0.465, + "step": 9970 + }, + { + "epoch": 1.2798153372659655, + "grad_norm": 0.8570374250411987, + "learning_rate": 0.001146789775156023, + "loss": 0.5458, + "step": 9980 + }, + { + "epoch": 1.2810977173634266, + "grad_norm": 1.3235441446304321, + "learning_rate": 0.0011459348550910491, + "loss": 0.5912, + "step": 9990 + }, + { + "epoch": 1.2823800974608874, + "grad_norm": 1.786232352256775, + "learning_rate": 0.001145079935026075, + "loss": 0.7141, + "step": 10000 + }, + { + "epoch": 1.2836624775583483, + "grad_norm": 1.6500744819641113, + "learning_rate": 0.0011442250149611012, + "loss": 0.7004, + "step": 10010 + }, + { + "epoch": 1.2849448576558091, + "grad_norm": 0.9735982418060303, + "learning_rate": 0.0011433700948961273, + "loss": 0.6243, + "step": 10020 + }, + { + "epoch": 1.2862272377532702, + "grad_norm": 1.611070990562439, + "learning_rate": 0.0011425151748311532, + "loss": 0.7653, + "step": 10030 + }, + { + "epoch": 1.287509617850731, + "grad_norm": 0.9978891611099243, + "learning_rate": 0.0011416602547661794, + "loss": 0.5817, + "step": 10040 + }, + { + "epoch": 1.2887919979481919, + "grad_norm": 1.2319824695587158, + "learning_rate": 0.0011408053347012055, + "loss": 0.5303, + "step": 10050 + }, + { + "epoch": 1.2900743780456527, + "grad_norm": 0.8889154195785522, + "learning_rate": 0.0011399504146362316, + "loss": 0.4797, + "step": 10060 + }, + { + "epoch": 1.2913567581431136, + "grad_norm": 0.8058596253395081, + "learning_rate": 0.0011390954945712578, + "loss": 0.6967, + "step": 10070 + }, + { + "epoch": 1.2926391382405744, + "grad_norm": 1.0289708375930786, + "learning_rate": 0.0011382405745062837, + "loss": 0.6387, + "step": 10080 + }, + { + "epoch": 1.2939215183380353, + "grad_norm": 0.7614682912826538, + "learning_rate": 0.0011373856544413096, + "loss": 0.5186, + "step": 10090 + }, + { + "epoch": 1.2952038984354963, + "grad_norm": 1.5079838037490845, + "learning_rate": 0.0011365307343763357, + "loss": 0.7524, + "step": 10100 + }, + { + "epoch": 1.2964862785329572, + "grad_norm": 1.0859569311141968, + "learning_rate": 0.0011356758143113619, + "loss": 0.669, + "step": 10110 + }, + { + "epoch": 1.297768658630418, + "grad_norm": 1.2234021425247192, + "learning_rate": 0.001134820894246388, + "loss": 0.5603, + "step": 10120 + }, + { + "epoch": 1.299051038727879, + "grad_norm": 0.7844352126121521, + "learning_rate": 0.0011339659741814142, + "loss": 0.5283, + "step": 10130 + }, + { + "epoch": 1.30033341882534, + "grad_norm": 0.7370574474334717, + "learning_rate": 0.0011331110541164403, + "loss": 0.5756, + "step": 10140 + }, + { + "epoch": 1.3016157989228008, + "grad_norm": 0.7193623185157776, + "learning_rate": 0.001132256134051466, + "loss": 0.4713, + "step": 10150 + }, + { + "epoch": 1.3028981790202616, + "grad_norm": 0.8968930244445801, + "learning_rate": 0.0011314012139864921, + "loss": 0.502, + "step": 10160 + }, + { + "epoch": 1.3041805591177225, + "grad_norm": 0.7797239422798157, + "learning_rate": 0.0011305462939215183, + "loss": 0.5768, + "step": 10170 + }, + { + "epoch": 1.3054629392151833, + "grad_norm": 1.528817892074585, + "learning_rate": 0.0011296913738565444, + "loss": 0.7379, + "step": 10180 + }, + { + "epoch": 1.3067453193126441, + "grad_norm": 0.5706043839454651, + "learning_rate": 0.0011288364537915705, + "loss": 0.4838, + "step": 10190 + }, + { + "epoch": 1.3080276994101052, + "grad_norm": 0.8248624205589294, + "learning_rate": 0.0011279815337265967, + "loss": 0.7465, + "step": 10200 + }, + { + "epoch": 1.309310079507566, + "grad_norm": 0.7821047306060791, + "learning_rate": 0.0011271266136616228, + "loss": 0.6183, + "step": 10210 + }, + { + "epoch": 1.310592459605027, + "grad_norm": 0.7619379162788391, + "learning_rate": 0.0011262716935966487, + "loss": 0.6079, + "step": 10220 + }, + { + "epoch": 1.3118748397024877, + "grad_norm": 0.5874025225639343, + "learning_rate": 0.0011254167735316747, + "loss": 0.5995, + "step": 10230 + }, + { + "epoch": 1.3131572197999488, + "grad_norm": 1.180526852607727, + "learning_rate": 0.0011245618534667008, + "loss": 0.567, + "step": 10240 + }, + { + "epoch": 1.3144395998974097, + "grad_norm": 1.1229068040847778, + "learning_rate": 0.001123706933401727, + "loss": 0.3913, + "step": 10250 + }, + { + "epoch": 1.3157219799948705, + "grad_norm": 0.6968095898628235, + "learning_rate": 0.001122852013336753, + "loss": 0.5576, + "step": 10260 + }, + { + "epoch": 1.3170043600923313, + "grad_norm": 1.0002440214157104, + "learning_rate": 0.0011219970932717792, + "loss": 0.7124, + "step": 10270 + }, + { + "epoch": 1.3182867401897922, + "grad_norm": 1.1690402030944824, + "learning_rate": 0.0011211421732068051, + "loss": 0.4824, + "step": 10280 + }, + { + "epoch": 1.319569120287253, + "grad_norm": 1.384547472000122, + "learning_rate": 0.0011202872531418313, + "loss": 0.6647, + "step": 10290 + }, + { + "epoch": 1.320851500384714, + "grad_norm": 1.943840503692627, + "learning_rate": 0.0011194323330768574, + "loss": 0.5974, + "step": 10300 + }, + { + "epoch": 1.322133880482175, + "grad_norm": 0.721809983253479, + "learning_rate": 0.0011185774130118833, + "loss": 0.601, + "step": 10310 + }, + { + "epoch": 1.3234162605796358, + "grad_norm": 0.594584584236145, + "learning_rate": 0.0011177224929469094, + "loss": 0.4628, + "step": 10320 + }, + { + "epoch": 1.3246986406770966, + "grad_norm": 1.1963189840316772, + "learning_rate": 0.0011168675728819356, + "loss": 0.5409, + "step": 10330 + }, + { + "epoch": 1.3259810207745577, + "grad_norm": 0.9252663254737854, + "learning_rate": 0.0011160126528169617, + "loss": 0.4973, + "step": 10340 + }, + { + "epoch": 1.3272634008720186, + "grad_norm": 0.7232112288475037, + "learning_rate": 0.0011151577327519876, + "loss": 0.6411, + "step": 10350 + }, + { + "epoch": 1.3285457809694794, + "grad_norm": 1.3147138357162476, + "learning_rate": 0.0011143028126870138, + "loss": 0.5462, + "step": 10360 + }, + { + "epoch": 1.3298281610669402, + "grad_norm": 1.6422502994537354, + "learning_rate": 0.00111344789262204, + "loss": 0.5364, + "step": 10370 + }, + { + "epoch": 1.331110541164401, + "grad_norm": 0.6929153203964233, + "learning_rate": 0.001112592972557066, + "loss": 0.4947, + "step": 10380 + }, + { + "epoch": 1.332392921261862, + "grad_norm": 1.4353240728378296, + "learning_rate": 0.001111738052492092, + "loss": 0.52, + "step": 10390 + }, + { + "epoch": 1.3336753013593228, + "grad_norm": 1.516634225845337, + "learning_rate": 0.0011108831324271181, + "loss": 0.6082, + "step": 10400 + }, + { + "epoch": 1.3349576814567838, + "grad_norm": 1.1383343935012817, + "learning_rate": 0.001110028212362144, + "loss": 0.663, + "step": 10410 + }, + { + "epoch": 1.3362400615542447, + "grad_norm": 1.2249925136566162, + "learning_rate": 0.0011091732922971702, + "loss": 0.6528, + "step": 10420 + }, + { + "epoch": 1.3375224416517055, + "grad_norm": 0.720862865447998, + "learning_rate": 0.0011083183722321963, + "loss": 0.5235, + "step": 10430 + }, + { + "epoch": 1.3388048217491664, + "grad_norm": 0.6571218967437744, + "learning_rate": 0.0011074634521672224, + "loss": 0.7425, + "step": 10440 + }, + { + "epoch": 1.3400872018466274, + "grad_norm": 1.3739579916000366, + "learning_rate": 0.0011066085321022486, + "loss": 0.7658, + "step": 10450 + }, + { + "epoch": 1.3413695819440883, + "grad_norm": 1.992790937423706, + "learning_rate": 0.0011057536120372747, + "loss": 0.6204, + "step": 10460 + }, + { + "epoch": 1.3426519620415491, + "grad_norm": 0.7727292776107788, + "learning_rate": 0.0011048986919723006, + "loss": 0.5973, + "step": 10470 + }, + { + "epoch": 1.34393434213901, + "grad_norm": 0.9260819554328918, + "learning_rate": 0.0011040437719073266, + "loss": 0.6831, + "step": 10480 + }, + { + "epoch": 1.3452167222364708, + "grad_norm": 0.4422336220741272, + "learning_rate": 0.0011031888518423527, + "loss": 0.5292, + "step": 10490 + }, + { + "epoch": 1.3464991023339317, + "grad_norm": 0.5913951992988586, + "learning_rate": 0.0011023339317773788, + "loss": 0.5232, + "step": 10500 + }, + { + "epoch": 1.3477814824313927, + "grad_norm": 1.8508780002593994, + "learning_rate": 0.001101479011712405, + "loss": 0.6073, + "step": 10510 + }, + { + "epoch": 1.3490638625288536, + "grad_norm": 1.7794585227966309, + "learning_rate": 0.001100624091647431, + "loss": 0.5784, + "step": 10520 + }, + { + "epoch": 1.3503462426263144, + "grad_norm": 1.4535781145095825, + "learning_rate": 0.0010997691715824572, + "loss": 0.6948, + "step": 10530 + }, + { + "epoch": 1.3516286227237753, + "grad_norm": 0.6549120545387268, + "learning_rate": 0.001098914251517483, + "loss": 0.6816, + "step": 10540 + }, + { + "epoch": 1.3529110028212363, + "grad_norm": 2.080423355102539, + "learning_rate": 0.001098059331452509, + "loss": 0.5362, + "step": 10550 + }, + { + "epoch": 1.3541933829186972, + "grad_norm": 0.6796220541000366, + "learning_rate": 0.0010972044113875352, + "loss": 0.5462, + "step": 10560 + }, + { + "epoch": 1.355475763016158, + "grad_norm": 0.9593464732170105, + "learning_rate": 0.0010963494913225613, + "loss": 0.7782, + "step": 10570 + }, + { + "epoch": 1.3567581431136189, + "grad_norm": 0.9870818853378296, + "learning_rate": 0.0010954945712575875, + "loss": 0.5401, + "step": 10580 + }, + { + "epoch": 1.3580405232110797, + "grad_norm": 1.1072885990142822, + "learning_rate": 0.0010946396511926136, + "loss": 0.5625, + "step": 10590 + }, + { + "epoch": 1.3593229033085406, + "grad_norm": 0.8635666370391846, + "learning_rate": 0.0010937847311276398, + "loss": 0.6213, + "step": 10600 + }, + { + "epoch": 1.3606052834060016, + "grad_norm": 0.6433390378952026, + "learning_rate": 0.0010929298110626657, + "loss": 0.5149, + "step": 10610 + }, + { + "epoch": 1.3618876635034625, + "grad_norm": 0.9244104623794556, + "learning_rate": 0.0010920748909976916, + "loss": 0.8149, + "step": 10620 + }, + { + "epoch": 1.3631700436009233, + "grad_norm": 1.0596814155578613, + "learning_rate": 0.0010912199709327177, + "loss": 0.6769, + "step": 10630 + }, + { + "epoch": 1.3644524236983842, + "grad_norm": 1.2836452722549438, + "learning_rate": 0.0010903650508677439, + "loss": 0.7406, + "step": 10640 + }, + { + "epoch": 1.3657348037958452, + "grad_norm": 1.069035291671753, + "learning_rate": 0.00108951013080277, + "loss": 0.5905, + "step": 10650 + }, + { + "epoch": 1.367017183893306, + "grad_norm": 0.6436813473701477, + "learning_rate": 0.0010886552107377961, + "loss": 0.6066, + "step": 10660 + }, + { + "epoch": 1.368299563990767, + "grad_norm": 1.8772107362747192, + "learning_rate": 0.001087800290672822, + "loss": 0.7121, + "step": 10670 + }, + { + "epoch": 1.3695819440882278, + "grad_norm": 0.6196737289428711, + "learning_rate": 0.0010869453706078482, + "loss": 0.6006, + "step": 10680 + }, + { + "epoch": 1.3708643241856886, + "grad_norm": 1.3433279991149902, + "learning_rate": 0.0010860904505428743, + "loss": 0.6368, + "step": 10690 + }, + { + "epoch": 1.3721467042831494, + "grad_norm": 0.9667194485664368, + "learning_rate": 0.0010852355304779003, + "loss": 0.5574, + "step": 10700 + }, + { + "epoch": 1.3734290843806103, + "grad_norm": 1.4600547552108765, + "learning_rate": 0.0010843806104129264, + "loss": 0.5626, + "step": 10710 + }, + { + "epoch": 1.3747114644780714, + "grad_norm": 0.7120881676673889, + "learning_rate": 0.0010835256903479525, + "loss": 0.6258, + "step": 10720 + }, + { + "epoch": 1.3759938445755322, + "grad_norm": 1.2124048471450806, + "learning_rate": 0.0010826707702829787, + "loss": 0.7491, + "step": 10730 + }, + { + "epoch": 1.377276224672993, + "grad_norm": 0.9732292294502258, + "learning_rate": 0.0010818158502180046, + "loss": 0.5647, + "step": 10740 + }, + { + "epoch": 1.3785586047704539, + "grad_norm": 0.7741032838821411, + "learning_rate": 0.0010809609301530307, + "loss": 0.5397, + "step": 10750 + }, + { + "epoch": 1.379840984867915, + "grad_norm": 1.0396802425384521, + "learning_rate": 0.0010801060100880569, + "loss": 0.7297, + "step": 10760 + }, + { + "epoch": 1.3811233649653758, + "grad_norm": 1.2885736227035522, + "learning_rate": 0.001079251090023083, + "loss": 0.532, + "step": 10770 + }, + { + "epoch": 1.3824057450628366, + "grad_norm": 0.7599356174468994, + "learning_rate": 0.001078396169958109, + "loss": 0.7877, + "step": 10780 + }, + { + "epoch": 1.3836881251602975, + "grad_norm": 1.040028691291809, + "learning_rate": 0.001077541249893135, + "loss": 0.5618, + "step": 10790 + }, + { + "epoch": 1.3849705052577583, + "grad_norm": 0.859203577041626, + "learning_rate": 0.001076686329828161, + "loss": 0.5895, + "step": 10800 + }, + { + "epoch": 1.3862528853552192, + "grad_norm": 0.6244560480117798, + "learning_rate": 0.001075831409763187, + "loss": 0.4826, + "step": 10810 + }, + { + "epoch": 1.3875352654526802, + "grad_norm": 0.6640686392784119, + "learning_rate": 0.0010749764896982132, + "loss": 0.6616, + "step": 10820 + }, + { + "epoch": 1.388817645550141, + "grad_norm": 1.2225605249404907, + "learning_rate": 0.0010741215696332394, + "loss": 0.6392, + "step": 10830 + }, + { + "epoch": 1.390100025647602, + "grad_norm": 0.7027501463890076, + "learning_rate": 0.0010732666495682655, + "loss": 0.5071, + "step": 10840 + }, + { + "epoch": 1.3913824057450628, + "grad_norm": 0.8924635052680969, + "learning_rate": 0.0010724117295032914, + "loss": 0.5538, + "step": 10850 + }, + { + "epoch": 1.3926647858425238, + "grad_norm": 1.6392470598220825, + "learning_rate": 0.0010715568094383174, + "loss": 0.7056, + "step": 10860 + }, + { + "epoch": 1.3939471659399847, + "grad_norm": 0.6672780513763428, + "learning_rate": 0.0010707018893733435, + "loss": 0.6533, + "step": 10870 + }, + { + "epoch": 1.3952295460374455, + "grad_norm": 0.9473418593406677, + "learning_rate": 0.0010698469693083696, + "loss": 0.6053, + "step": 10880 + }, + { + "epoch": 1.3965119261349064, + "grad_norm": 1.2938871383666992, + "learning_rate": 0.0010689920492433958, + "loss": 0.6268, + "step": 10890 + }, + { + "epoch": 1.3977943062323672, + "grad_norm": 1.0239317417144775, + "learning_rate": 0.001068137129178422, + "loss": 0.7315, + "step": 10900 + }, + { + "epoch": 1.399076686329828, + "grad_norm": 1.4379597902297974, + "learning_rate": 0.001067282209113448, + "loss": 0.6352, + "step": 10910 + }, + { + "epoch": 1.400359066427289, + "grad_norm": 1.5610178709030151, + "learning_rate": 0.0010664272890484742, + "loss": 0.6641, + "step": 10920 + }, + { + "epoch": 1.40164144652475, + "grad_norm": 0.7390224933624268, + "learning_rate": 0.0010655723689834999, + "loss": 0.5248, + "step": 10930 + }, + { + "epoch": 1.4029238266222108, + "grad_norm": 0.9852975606918335, + "learning_rate": 0.001064717448918526, + "loss": 0.4792, + "step": 10940 + }, + { + "epoch": 1.4042062067196717, + "grad_norm": 1.171047568321228, + "learning_rate": 0.0010638625288535522, + "loss": 0.7306, + "step": 10950 + }, + { + "epoch": 1.4054885868171327, + "grad_norm": 0.7043918371200562, + "learning_rate": 0.0010630076087885783, + "loss": 0.4516, + "step": 10960 + }, + { + "epoch": 1.4067709669145936, + "grad_norm": 2.092144250869751, + "learning_rate": 0.0010621526887236044, + "loss": 0.6593, + "step": 10970 + }, + { + "epoch": 1.4080533470120544, + "grad_norm": 0.4322734475135803, + "learning_rate": 0.0010612977686586306, + "loss": 0.6476, + "step": 10980 + }, + { + "epoch": 1.4093357271095153, + "grad_norm": 1.1757038831710815, + "learning_rate": 0.0010604428485936565, + "loss": 0.4645, + "step": 10990 + }, + { + "epoch": 1.4106181072069761, + "grad_norm": 2.142357587814331, + "learning_rate": 0.0010595879285286826, + "loss": 0.5823, + "step": 11000 + }, + { + "epoch": 1.411900487304437, + "grad_norm": 0.8038185834884644, + "learning_rate": 0.0010587330084637085, + "loss": 0.6456, + "step": 11010 + }, + { + "epoch": 1.4131828674018978, + "grad_norm": 0.9236948490142822, + "learning_rate": 0.0010578780883987347, + "loss": 0.4065, + "step": 11020 + }, + { + "epoch": 1.4144652474993589, + "grad_norm": 1.382051706314087, + "learning_rate": 0.0010570231683337608, + "loss": 0.5759, + "step": 11030 + }, + { + "epoch": 1.4157476275968197, + "grad_norm": 1.405614972114563, + "learning_rate": 0.001056168248268787, + "loss": 0.6169, + "step": 11040 + }, + { + "epoch": 1.4170300076942806, + "grad_norm": 0.9285224080085754, + "learning_rate": 0.001055313328203813, + "loss": 0.5831, + "step": 11050 + }, + { + "epoch": 1.4183123877917414, + "grad_norm": 0.7825279235839844, + "learning_rate": 0.001054458408138839, + "loss": 0.6827, + "step": 11060 + }, + { + "epoch": 1.4195947678892025, + "grad_norm": 2.2078566551208496, + "learning_rate": 0.0010536034880738651, + "loss": 0.6374, + "step": 11070 + }, + { + "epoch": 1.4208771479866633, + "grad_norm": 0.5845392942428589, + "learning_rate": 0.0010527485680088913, + "loss": 0.4996, + "step": 11080 + }, + { + "epoch": 1.4221595280841242, + "grad_norm": 1.3388561010360718, + "learning_rate": 0.0010518936479439172, + "loss": 0.5793, + "step": 11090 + }, + { + "epoch": 1.423441908181585, + "grad_norm": 0.7074248790740967, + "learning_rate": 0.0010510387278789433, + "loss": 0.5553, + "step": 11100 + }, + { + "epoch": 1.4247242882790458, + "grad_norm": 0.9576848149299622, + "learning_rate": 0.0010501838078139695, + "loss": 0.7863, + "step": 11110 + }, + { + "epoch": 1.4260066683765067, + "grad_norm": 1.0783685445785522, + "learning_rate": 0.0010493288877489954, + "loss": 0.5752, + "step": 11120 + }, + { + "epoch": 1.4272890484739678, + "grad_norm": 1.143621802330017, + "learning_rate": 0.0010484739676840215, + "loss": 0.5223, + "step": 11130 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.0758188962936401, + "learning_rate": 0.0010476190476190477, + "loss": 0.4724, + "step": 11140 + }, + { + "epoch": 1.4298538086688894, + "grad_norm": 0.7286604642868042, + "learning_rate": 0.0010467641275540738, + "loss": 0.493, + "step": 11150 + }, + { + "epoch": 1.4311361887663503, + "grad_norm": 0.7054126858711243, + "learning_rate": 0.0010459092074891, + "loss": 0.6056, + "step": 11160 + }, + { + "epoch": 1.4324185688638114, + "grad_norm": 0.7918633222579956, + "learning_rate": 0.0010450542874241259, + "loss": 0.5409, + "step": 11170 + }, + { + "epoch": 1.4337009489612722, + "grad_norm": 1.105986475944519, + "learning_rate": 0.001044199367359152, + "loss": 0.5131, + "step": 11180 + }, + { + "epoch": 1.434983329058733, + "grad_norm": 1.0960558652877808, + "learning_rate": 0.001043344447294178, + "loss": 0.6069, + "step": 11190 + }, + { + "epoch": 1.436265709156194, + "grad_norm": 0.9920992851257324, + "learning_rate": 0.001042489527229204, + "loss": 0.6847, + "step": 11200 + }, + { + "epoch": 1.4375480892536547, + "grad_norm": 1.1634384393692017, + "learning_rate": 0.0010416346071642302, + "loss": 0.5016, + "step": 11210 + }, + { + "epoch": 1.4388304693511156, + "grad_norm": 0.9485201239585876, + "learning_rate": 0.0010407796870992563, + "loss": 0.3805, + "step": 11220 + }, + { + "epoch": 1.4401128494485764, + "grad_norm": 0.9039535522460938, + "learning_rate": 0.0010399247670342825, + "loss": 0.4757, + "step": 11230 + }, + { + "epoch": 1.4413952295460375, + "grad_norm": 1.6443145275115967, + "learning_rate": 0.0010390698469693084, + "loss": 0.5901, + "step": 11240 + }, + { + "epoch": 1.4426776096434983, + "grad_norm": 0.9684635400772095, + "learning_rate": 0.0010382149269043343, + "loss": 0.5933, + "step": 11250 + }, + { + "epoch": 1.4439599897409592, + "grad_norm": 0.9142680764198303, + "learning_rate": 0.0010373600068393604, + "loss": 0.5636, + "step": 11260 + }, + { + "epoch": 1.44524236983842, + "grad_norm": 1.5742777585983276, + "learning_rate": 0.0010365050867743866, + "loss": 0.513, + "step": 11270 + }, + { + "epoch": 1.446524749935881, + "grad_norm": 1.9768625497817993, + "learning_rate": 0.0010356501667094127, + "loss": 0.7878, + "step": 11280 + }, + { + "epoch": 1.447807130033342, + "grad_norm": 0.7365511655807495, + "learning_rate": 0.0010347952466444388, + "loss": 0.488, + "step": 11290 + }, + { + "epoch": 1.4490895101308028, + "grad_norm": 0.7075834274291992, + "learning_rate": 0.001033940326579465, + "loss": 0.7751, + "step": 11300 + }, + { + "epoch": 1.4503718902282636, + "grad_norm": 0.7053199410438538, + "learning_rate": 0.0010330854065144911, + "loss": 0.5618, + "step": 11310 + }, + { + "epoch": 1.4516542703257245, + "grad_norm": 0.7528406977653503, + "learning_rate": 0.0010322304864495168, + "loss": 0.4849, + "step": 11320 + }, + { + "epoch": 1.4529366504231853, + "grad_norm": 0.96307772397995, + "learning_rate": 0.001031375566384543, + "loss": 0.5907, + "step": 11330 + }, + { + "epoch": 1.4542190305206464, + "grad_norm": 0.5311592817306519, + "learning_rate": 0.001030520646319569, + "loss": 0.5292, + "step": 11340 + }, + { + "epoch": 1.4555014106181072, + "grad_norm": 1.3051635026931763, + "learning_rate": 0.0010296657262545952, + "loss": 0.5106, + "step": 11350 + }, + { + "epoch": 1.456783790715568, + "grad_norm": 1.5041792392730713, + "learning_rate": 0.0010288108061896214, + "loss": 0.5676, + "step": 11360 + }, + { + "epoch": 1.458066170813029, + "grad_norm": 1.6428672075271606, + "learning_rate": 0.0010279558861246475, + "loss": 0.6384, + "step": 11370 + }, + { + "epoch": 1.45934855091049, + "grad_norm": 0.9617105722427368, + "learning_rate": 0.0010271009660596734, + "loss": 0.4024, + "step": 11380 + }, + { + "epoch": 1.4606309310079508, + "grad_norm": 1.0485490560531616, + "learning_rate": 0.0010262460459946996, + "loss": 0.5619, + "step": 11390 + }, + { + "epoch": 1.4619133111054117, + "grad_norm": 1.21506667137146, + "learning_rate": 0.0010253911259297255, + "loss": 0.4969, + "step": 11400 + }, + { + "epoch": 1.4631956912028725, + "grad_norm": 1.1984257698059082, + "learning_rate": 0.0010245362058647516, + "loss": 0.4939, + "step": 11410 + }, + { + "epoch": 1.4644780713003334, + "grad_norm": 0.8625141382217407, + "learning_rate": 0.0010236812857997778, + "loss": 0.543, + "step": 11420 + }, + { + "epoch": 1.4657604513977942, + "grad_norm": 1.3549401760101318, + "learning_rate": 0.0010228263657348039, + "loss": 0.5781, + "step": 11430 + }, + { + "epoch": 1.4670428314952553, + "grad_norm": 1.5862414836883545, + "learning_rate": 0.0010219714456698298, + "loss": 0.5567, + "step": 11440 + }, + { + "epoch": 1.4683252115927161, + "grad_norm": 0.9037706255912781, + "learning_rate": 0.001021116525604856, + "loss": 0.519, + "step": 11450 + }, + { + "epoch": 1.469607591690177, + "grad_norm": 0.9766443967819214, + "learning_rate": 0.001020261605539882, + "loss": 0.5847, + "step": 11460 + }, + { + "epoch": 1.4708899717876378, + "grad_norm": 1.0838594436645508, + "learning_rate": 0.0010194066854749082, + "loss": 0.5357, + "step": 11470 + }, + { + "epoch": 1.4721723518850989, + "grad_norm": 1.4483375549316406, + "learning_rate": 0.0010185517654099341, + "loss": 0.5475, + "step": 11480 + }, + { + "epoch": 1.4734547319825597, + "grad_norm": 0.8537694215774536, + "learning_rate": 0.0010176968453449603, + "loss": 0.6359, + "step": 11490 + }, + { + "epoch": 1.4747371120800206, + "grad_norm": 1.2811238765716553, + "learning_rate": 0.0010168419252799864, + "loss": 0.4657, + "step": 11500 + }, + { + "epoch": 1.4760194921774814, + "grad_norm": 1.3045051097869873, + "learning_rate": 0.0010159870052150123, + "loss": 0.4271, + "step": 11510 + }, + { + "epoch": 1.4773018722749423, + "grad_norm": 0.577139139175415, + "learning_rate": 0.0010151320851500385, + "loss": 0.5664, + "step": 11520 + }, + { + "epoch": 1.478584252372403, + "grad_norm": 0.9219268560409546, + "learning_rate": 0.0010142771650850646, + "loss": 0.5258, + "step": 11530 + }, + { + "epoch": 1.479866632469864, + "grad_norm": 0.6909111142158508, + "learning_rate": 0.0010134222450200907, + "loss": 0.4782, + "step": 11540 + }, + { + "epoch": 1.481149012567325, + "grad_norm": 1.5484191179275513, + "learning_rate": 0.0010125673249551167, + "loss": 0.6158, + "step": 11550 + }, + { + "epoch": 1.4824313926647859, + "grad_norm": 1.1076061725616455, + "learning_rate": 0.0010117124048901428, + "loss": 0.687, + "step": 11560 + }, + { + "epoch": 1.4837137727622467, + "grad_norm": 0.5345991253852844, + "learning_rate": 0.0010108574848251687, + "loss": 0.4507, + "step": 11570 + }, + { + "epoch": 1.4849961528597075, + "grad_norm": 1.8849554061889648, + "learning_rate": 0.0010100025647601949, + "loss": 0.7036, + "step": 11580 + }, + { + "epoch": 1.4862785329571686, + "grad_norm": 1.3229577541351318, + "learning_rate": 0.001009147644695221, + "loss": 0.6121, + "step": 11590 + }, + { + "epoch": 1.4875609130546295, + "grad_norm": 0.5365115404129028, + "learning_rate": 0.0010082927246302471, + "loss": 0.5233, + "step": 11600 + }, + { + "epoch": 1.4888432931520903, + "grad_norm": 1.0795190334320068, + "learning_rate": 0.0010074378045652733, + "loss": 0.464, + "step": 11610 + }, + { + "epoch": 1.4901256732495511, + "grad_norm": 1.2084637880325317, + "learning_rate": 0.0010065828845002994, + "loss": 0.5965, + "step": 11620 + }, + { + "epoch": 1.491408053347012, + "grad_norm": 1.16603684425354, + "learning_rate": 0.0010057279644353253, + "loss": 0.5806, + "step": 11630 + }, + { + "epoch": 1.4926904334444728, + "grad_norm": 1.2406288385391235, + "learning_rate": 0.0010048730443703512, + "loss": 0.5364, + "step": 11640 + }, + { + "epoch": 1.493972813541934, + "grad_norm": 1.1917636394500732, + "learning_rate": 0.0010040181243053774, + "loss": 0.7354, + "step": 11650 + }, + { + "epoch": 1.4952551936393947, + "grad_norm": 0.8824617862701416, + "learning_rate": 0.0010031632042404035, + "loss": 0.5803, + "step": 11660 + }, + { + "epoch": 1.4965375737368556, + "grad_norm": 1.87214195728302, + "learning_rate": 0.0010023082841754296, + "loss": 0.6652, + "step": 11670 + }, + { + "epoch": 1.4978199538343164, + "grad_norm": 1.6992979049682617, + "learning_rate": 0.0010014533641104558, + "loss": 0.6646, + "step": 11680 + }, + { + "epoch": 1.4991023339317775, + "grad_norm": 0.8672876954078674, + "learning_rate": 0.001000598444045482, + "loss": 0.6627, + "step": 11690 + }, + { + "epoch": 1.5003847140292383, + "grad_norm": 0.7643082141876221, + "learning_rate": 0.0009997435239805078, + "loss": 0.4474, + "step": 11700 + }, + { + "epoch": 1.5016670941266992, + "grad_norm": 0.6023688912391663, + "learning_rate": 0.000998888603915534, + "loss": 0.5694, + "step": 11710 + }, + { + "epoch": 1.50294947422416, + "grad_norm": 0.637225866317749, + "learning_rate": 0.0009980336838505601, + "loss": 0.5371, + "step": 11720 + }, + { + "epoch": 1.5042318543216209, + "grad_norm": 1.3987553119659424, + "learning_rate": 0.000997178763785586, + "loss": 0.5906, + "step": 11730 + }, + { + "epoch": 1.5055142344190817, + "grad_norm": 1.1652394533157349, + "learning_rate": 0.0009963238437206122, + "loss": 0.5353, + "step": 11740 + }, + { + "epoch": 1.5067966145165426, + "grad_norm": 0.8522770404815674, + "learning_rate": 0.0009954689236556383, + "loss": 0.5415, + "step": 11750 + }, + { + "epoch": 1.5080789946140036, + "grad_norm": 0.6423736810684204, + "learning_rate": 0.0009946140035906644, + "loss": 0.4791, + "step": 11760 + }, + { + "epoch": 1.5093613747114645, + "grad_norm": 1.1400604248046875, + "learning_rate": 0.0009937590835256904, + "loss": 0.7027, + "step": 11770 + }, + { + "epoch": 1.5106437548089253, + "grad_norm": 1.3665844202041626, + "learning_rate": 0.0009929041634607165, + "loss": 0.5289, + "step": 11780 + }, + { + "epoch": 1.5119261349063864, + "grad_norm": 0.5529056191444397, + "learning_rate": 0.0009920492433957426, + "loss": 0.4608, + "step": 11790 + }, + { + "epoch": 1.5132085150038472, + "grad_norm": 0.7571165561676025, + "learning_rate": 0.0009911943233307686, + "loss": 0.5328, + "step": 11800 + }, + { + "epoch": 1.514490895101308, + "grad_norm": 1.0876634120941162, + "learning_rate": 0.0009903394032657947, + "loss": 0.6374, + "step": 11810 + }, + { + "epoch": 1.515773275198769, + "grad_norm": 0.9497352242469788, + "learning_rate": 0.0009894844832008208, + "loss": 0.5999, + "step": 11820 + }, + { + "epoch": 1.5170556552962298, + "grad_norm": 1.0220736265182495, + "learning_rate": 0.0009886295631358467, + "loss": 0.5733, + "step": 11830 + }, + { + "epoch": 1.5183380353936906, + "grad_norm": 0.8994792699813843, + "learning_rate": 0.0009877746430708729, + "loss": 0.5991, + "step": 11840 + }, + { + "epoch": 1.5196204154911515, + "grad_norm": 0.9045878648757935, + "learning_rate": 0.000986919723005899, + "loss": 0.4144, + "step": 11850 + }, + { + "epoch": 1.5209027955886123, + "grad_norm": 0.8704327344894409, + "learning_rate": 0.000986064802940925, + "loss": 0.4937, + "step": 11860 + }, + { + "epoch": 1.5221851756860734, + "grad_norm": 2.1458346843719482, + "learning_rate": 0.000985209882875951, + "loss": 0.5259, + "step": 11870 + }, + { + "epoch": 1.5234675557835342, + "grad_norm": 0.5980050563812256, + "learning_rate": 0.0009843549628109772, + "loss": 0.5159, + "step": 11880 + }, + { + "epoch": 1.5247499358809953, + "grad_norm": 1.8885890245437622, + "learning_rate": 0.0009835000427460034, + "loss": 0.6933, + "step": 11890 + }, + { + "epoch": 1.5260323159784561, + "grad_norm": 1.1115361452102661, + "learning_rate": 0.0009826451226810293, + "loss": 0.48, + "step": 11900 + }, + { + "epoch": 1.527314696075917, + "grad_norm": 0.6800194978713989, + "learning_rate": 0.0009817902026160554, + "loss": 0.5799, + "step": 11910 + }, + { + "epoch": 1.5285970761733778, + "grad_norm": 0.7408868074417114, + "learning_rate": 0.0009809352825510815, + "loss": 0.4767, + "step": 11920 + }, + { + "epoch": 1.5298794562708387, + "grad_norm": 0.7443408966064453, + "learning_rate": 0.0009800803624861075, + "loss": 0.7237, + "step": 11930 + }, + { + "epoch": 1.5311618363682995, + "grad_norm": 0.7522343993186951, + "learning_rate": 0.0009792254424211336, + "loss": 0.3609, + "step": 11940 + }, + { + "epoch": 1.5324442164657603, + "grad_norm": 1.1541762351989746, + "learning_rate": 0.0009783705223561597, + "loss": 0.6063, + "step": 11950 + }, + { + "epoch": 1.5337265965632212, + "grad_norm": 0.7068589329719543, + "learning_rate": 0.0009775156022911857, + "loss": 0.4613, + "step": 11960 + }, + { + "epoch": 1.5350089766606823, + "grad_norm": 0.4808710515499115, + "learning_rate": 0.0009766606822262118, + "loss": 0.538, + "step": 11970 + }, + { + "epoch": 1.536291356758143, + "grad_norm": 0.7787117958068848, + "learning_rate": 0.0009758057621612379, + "loss": 0.5906, + "step": 11980 + }, + { + "epoch": 1.5375737368556042, + "grad_norm": 0.8264428973197937, + "learning_rate": 0.0009749508420962641, + "loss": 0.5553, + "step": 11990 + }, + { + "epoch": 1.538856116953065, + "grad_norm": 1.344869613647461, + "learning_rate": 0.0009740959220312901, + "loss": 0.4615, + "step": 12000 + }, + { + "epoch": 1.5401384970505259, + "grad_norm": 0.622340977191925, + "learning_rate": 0.0009732410019663161, + "loss": 0.5527, + "step": 12010 + }, + { + "epoch": 1.5414208771479867, + "grad_norm": 0.8521725535392761, + "learning_rate": 0.0009723860819013423, + "loss": 0.7133, + "step": 12020 + }, + { + "epoch": 1.5427032572454475, + "grad_norm": 0.7346990704536438, + "learning_rate": 0.0009715311618363684, + "loss": 0.4141, + "step": 12030 + }, + { + "epoch": 1.5439856373429084, + "grad_norm": 0.42431166768074036, + "learning_rate": 0.0009706762417713943, + "loss": 0.524, + "step": 12040 + }, + { + "epoch": 1.5452680174403692, + "grad_norm": 0.6801153421401978, + "learning_rate": 0.0009698213217064205, + "loss": 0.5191, + "step": 12050 + }, + { + "epoch": 1.54655039753783, + "grad_norm": 1.2390146255493164, + "learning_rate": 0.0009689664016414466, + "loss": 0.6671, + "step": 12060 + }, + { + "epoch": 1.5478327776352911, + "grad_norm": 1.2838438749313354, + "learning_rate": 0.0009681114815764726, + "loss": 0.5188, + "step": 12070 + }, + { + "epoch": 1.549115157732752, + "grad_norm": 1.216489315032959, + "learning_rate": 0.0009672565615114986, + "loss": 0.6267, + "step": 12080 + }, + { + "epoch": 1.5503975378302128, + "grad_norm": 1.3445849418640137, + "learning_rate": 0.0009664016414465248, + "loss": 0.5555, + "step": 12090 + }, + { + "epoch": 1.551679917927674, + "grad_norm": 0.9177038669586182, + "learning_rate": 0.0009655467213815508, + "loss": 0.5171, + "step": 12100 + }, + { + "epoch": 1.5529622980251347, + "grad_norm": 0.784768283367157, + "learning_rate": 0.000964691801316577, + "loss": 0.5645, + "step": 12110 + }, + { + "epoch": 1.5542446781225956, + "grad_norm": 1.3219481706619263, + "learning_rate": 0.000963836881251603, + "loss": 0.5687, + "step": 12120 + }, + { + "epoch": 1.5555270582200564, + "grad_norm": 1.0199742317199707, + "learning_rate": 0.0009629819611866291, + "loss": 0.4982, + "step": 12130 + }, + { + "epoch": 1.5568094383175173, + "grad_norm": 0.6068373322486877, + "learning_rate": 0.0009621270411216551, + "loss": 0.4458, + "step": 12140 + }, + { + "epoch": 1.5580918184149781, + "grad_norm": 0.7337871193885803, + "learning_rate": 0.0009612721210566813, + "loss": 0.5002, + "step": 12150 + }, + { + "epoch": 1.559374198512439, + "grad_norm": 0.9381522536277771, + "learning_rate": 0.0009604172009917073, + "loss": 0.4884, + "step": 12160 + }, + { + "epoch": 1.5606565786098998, + "grad_norm": 1.3527436256408691, + "learning_rate": 0.0009595622809267333, + "loss": 0.6701, + "step": 12170 + }, + { + "epoch": 1.5619389587073609, + "grad_norm": 0.9414606094360352, + "learning_rate": 0.0009587073608617595, + "loss": 0.6152, + "step": 12180 + }, + { + "epoch": 1.5632213388048217, + "grad_norm": 0.8174257874488831, + "learning_rate": 0.0009578524407967856, + "loss": 0.5274, + "step": 12190 + }, + { + "epoch": 1.5645037189022828, + "grad_norm": 1.5723670721054077, + "learning_rate": 0.0009569975207318115, + "loss": 0.6343, + "step": 12200 + }, + { + "epoch": 1.5657860989997436, + "grad_norm": 1.176369309425354, + "learning_rate": 0.0009561426006668377, + "loss": 0.5032, + "step": 12210 + }, + { + "epoch": 1.5670684790972045, + "grad_norm": 0.9606861472129822, + "learning_rate": 0.0009552876806018638, + "loss": 0.5375, + "step": 12220 + }, + { + "epoch": 1.5683508591946653, + "grad_norm": 0.65707927942276, + "learning_rate": 0.0009544327605368897, + "loss": 0.5424, + "step": 12230 + }, + { + "epoch": 1.5696332392921262, + "grad_norm": 1.4007467031478882, + "learning_rate": 0.0009535778404719159, + "loss": 0.6445, + "step": 12240 + }, + { + "epoch": 1.570915619389587, + "grad_norm": 1.0171335935592651, + "learning_rate": 0.000952722920406942, + "loss": 0.6483, + "step": 12250 + }, + { + "epoch": 1.5721979994870479, + "grad_norm": 1.3080893754959106, + "learning_rate": 0.0009518680003419681, + "loss": 0.6063, + "step": 12260 + }, + { + "epoch": 1.5734803795845087, + "grad_norm": 1.2589539289474487, + "learning_rate": 0.000951013080276994, + "loss": 0.5336, + "step": 12270 + }, + { + "epoch": 1.5747627596819698, + "grad_norm": 1.1046485900878906, + "learning_rate": 0.0009501581602120202, + "loss": 0.5861, + "step": 12280 + }, + { + "epoch": 1.5760451397794306, + "grad_norm": 0.5925372242927551, + "learning_rate": 0.0009493032401470463, + "loss": 0.5389, + "step": 12290 + }, + { + "epoch": 1.5773275198768915, + "grad_norm": 0.9463218450546265, + "learning_rate": 0.0009484483200820723, + "loss": 0.4316, + "step": 12300 + }, + { + "epoch": 1.5786098999743525, + "grad_norm": 1.0939500331878662, + "learning_rate": 0.0009475934000170984, + "loss": 0.5682, + "step": 12310 + }, + { + "epoch": 1.5798922800718134, + "grad_norm": 1.314713478088379, + "learning_rate": 0.0009467384799521245, + "loss": 0.7091, + "step": 12320 + }, + { + "epoch": 1.5811746601692742, + "grad_norm": 1.5223866701126099, + "learning_rate": 0.0009458835598871505, + "loss": 0.6481, + "step": 12330 + }, + { + "epoch": 1.582457040266735, + "grad_norm": 0.6593065857887268, + "learning_rate": 0.0009450286398221767, + "loss": 0.5051, + "step": 12340 + }, + { + "epoch": 1.583739420364196, + "grad_norm": 0.6195816993713379, + "learning_rate": 0.0009441737197572027, + "loss": 0.5446, + "step": 12350 + }, + { + "epoch": 1.5850218004616567, + "grad_norm": 0.5355708599090576, + "learning_rate": 0.0009433187996922287, + "loss": 0.592, + "step": 12360 + }, + { + "epoch": 1.5863041805591176, + "grad_norm": 0.5407887697219849, + "learning_rate": 0.0009424638796272549, + "loss": 0.4079, + "step": 12370 + }, + { + "epoch": 1.5875865606565787, + "grad_norm": 0.8656293153762817, + "learning_rate": 0.000941608959562281, + "loss": 0.6232, + "step": 12380 + }, + { + "epoch": 1.5888689407540395, + "grad_norm": 1.0088326930999756, + "learning_rate": 0.0009407540394973069, + "loss": 0.5646, + "step": 12390 + }, + { + "epoch": 1.5901513208515003, + "grad_norm": 1.7522426843643188, + "learning_rate": 0.0009398991194323331, + "loss": 0.6204, + "step": 12400 + }, + { + "epoch": 1.5914337009489614, + "grad_norm": 1.3139151334762573, + "learning_rate": 0.0009390441993673592, + "loss": 0.5363, + "step": 12410 + }, + { + "epoch": 1.5927160810464223, + "grad_norm": 1.4125381708145142, + "learning_rate": 0.0009381892793023853, + "loss": 0.5504, + "step": 12420 + }, + { + "epoch": 1.593998461143883, + "grad_norm": 1.0490381717681885, + "learning_rate": 0.0009373343592374113, + "loss": 0.5976, + "step": 12430 + }, + { + "epoch": 1.595280841241344, + "grad_norm": 0.8534132242202759, + "learning_rate": 0.0009364794391724374, + "loss": 0.5671, + "step": 12440 + }, + { + "epoch": 1.5965632213388048, + "grad_norm": 0.882533848285675, + "learning_rate": 0.0009356245191074635, + "loss": 0.5291, + "step": 12450 + }, + { + "epoch": 1.5978456014362656, + "grad_norm": 0.9412997364997864, + "learning_rate": 0.0009347695990424896, + "loss": 0.5206, + "step": 12460 + }, + { + "epoch": 1.5991279815337265, + "grad_norm": 0.8173903822898865, + "learning_rate": 0.0009339146789775156, + "loss": 0.6253, + "step": 12470 + }, + { + "epoch": 1.6004103616311873, + "grad_norm": 1.2541340589523315, + "learning_rate": 0.0009330597589125417, + "loss": 0.6774, + "step": 12480 + }, + { + "epoch": 1.6016927417286484, + "grad_norm": 0.8679422736167908, + "learning_rate": 0.0009322048388475678, + "loss": 0.4629, + "step": 12490 + }, + { + "epoch": 1.6029751218261092, + "grad_norm": 0.8610258102416992, + "learning_rate": 0.0009313499187825939, + "loss": 0.575, + "step": 12500 + }, + { + "epoch": 1.6042575019235703, + "grad_norm": 0.6528512835502625, + "learning_rate": 0.0009304949987176199, + "loss": 0.4773, + "step": 12510 + }, + { + "epoch": 1.6055398820210312, + "grad_norm": 0.7231767773628235, + "learning_rate": 0.0009296400786526459, + "loss": 0.643, + "step": 12520 + }, + { + "epoch": 1.606822262118492, + "grad_norm": 1.6632248163223267, + "learning_rate": 0.0009287851585876721, + "loss": 0.7517, + "step": 12530 + }, + { + "epoch": 1.6081046422159528, + "grad_norm": 0.7457917332649231, + "learning_rate": 0.0009279302385226982, + "loss": 0.3932, + "step": 12540 + }, + { + "epoch": 1.6093870223134137, + "grad_norm": 0.8939314484596252, + "learning_rate": 0.0009270753184577242, + "loss": 0.6226, + "step": 12550 + }, + { + "epoch": 1.6106694024108745, + "grad_norm": 0.4794626235961914, + "learning_rate": 0.0009262203983927503, + "loss": 0.5804, + "step": 12560 + }, + { + "epoch": 1.6119517825083354, + "grad_norm": 0.7062269449234009, + "learning_rate": 0.0009253654783277764, + "loss": 0.4038, + "step": 12570 + }, + { + "epoch": 1.6132341626057962, + "grad_norm": 0.6264899373054504, + "learning_rate": 0.0009245105582628024, + "loss": 0.7183, + "step": 12580 + }, + { + "epoch": 1.6145165427032573, + "grad_norm": 0.5829173922538757, + "learning_rate": 0.0009236556381978285, + "loss": 0.61, + "step": 12590 + }, + { + "epoch": 1.6157989228007181, + "grad_norm": 0.9000409841537476, + "learning_rate": 0.0009228007181328546, + "loss": 0.6004, + "step": 12600 + }, + { + "epoch": 1.617081302898179, + "grad_norm": 0.5521026849746704, + "learning_rate": 0.0009219457980678807, + "loss": 0.5799, + "step": 12610 + }, + { + "epoch": 1.61836368299564, + "grad_norm": 0.9383637309074402, + "learning_rate": 0.0009210908780029067, + "loss": 0.4902, + "step": 12620 + }, + { + "epoch": 1.6196460630931009, + "grad_norm": 1.1581825017929077, + "learning_rate": 0.0009202359579379328, + "loss": 0.4182, + "step": 12630 + }, + { + "epoch": 1.6209284431905617, + "grad_norm": 2.7146356105804443, + "learning_rate": 0.0009193810378729589, + "loss": 0.6339, + "step": 12640 + }, + { + "epoch": 1.6222108232880226, + "grad_norm": 0.607458233833313, + "learning_rate": 0.000918526117807985, + "loss": 0.7317, + "step": 12650 + }, + { + "epoch": 1.6234932033854834, + "grad_norm": 0.8015214800834656, + "learning_rate": 0.000917671197743011, + "loss": 0.5006, + "step": 12660 + }, + { + "epoch": 1.6247755834829443, + "grad_norm": 0.9352098703384399, + "learning_rate": 0.0009168162776780371, + "loss": 0.5692, + "step": 12670 + }, + { + "epoch": 1.626057963580405, + "grad_norm": 1.3403977155685425, + "learning_rate": 0.0009159613576130632, + "loss": 0.3569, + "step": 12680 + }, + { + "epoch": 1.6273403436778662, + "grad_norm": 0.9648029804229736, + "learning_rate": 0.0009151064375480893, + "loss": 0.6767, + "step": 12690 + }, + { + "epoch": 1.628622723775327, + "grad_norm": 0.7948251962661743, + "learning_rate": 0.0009142515174831153, + "loss": 0.5911, + "step": 12700 + }, + { + "epoch": 1.6299051038727879, + "grad_norm": 0.9088913798332214, + "learning_rate": 0.0009133965974181415, + "loss": 0.7068, + "step": 12710 + }, + { + "epoch": 1.631187483970249, + "grad_norm": 0.6906175017356873, + "learning_rate": 0.0009125416773531675, + "loss": 0.7111, + "step": 12720 + }, + { + "epoch": 1.6324698640677098, + "grad_norm": 1.3398417234420776, + "learning_rate": 0.0009116867572881936, + "loss": 0.7023, + "step": 12730 + }, + { + "epoch": 1.6337522441651706, + "grad_norm": 1.101651906967163, + "learning_rate": 0.0009108318372232196, + "loss": 0.6276, + "step": 12740 + }, + { + "epoch": 1.6350346242626315, + "grad_norm": 1.373275637626648, + "learning_rate": 0.0009099769171582457, + "loss": 0.5836, + "step": 12750 + }, + { + "epoch": 1.6363170043600923, + "grad_norm": 1.2274094820022583, + "learning_rate": 0.0009091219970932718, + "loss": 0.7078, + "step": 12760 + }, + { + "epoch": 1.6375993844575532, + "grad_norm": 1.5747358798980713, + "learning_rate": 0.000908267077028298, + "loss": 0.5668, + "step": 12770 + }, + { + "epoch": 1.638881764555014, + "grad_norm": 0.8394151329994202, + "learning_rate": 0.0009074121569633239, + "loss": 0.4556, + "step": 12780 + }, + { + "epoch": 1.6401641446524748, + "grad_norm": 0.7515396475791931, + "learning_rate": 0.00090655723689835, + "loss": 0.5956, + "step": 12790 + }, + { + "epoch": 1.641446524749936, + "grad_norm": 1.009969711303711, + "learning_rate": 0.0009057023168333761, + "loss": 0.4052, + "step": 12800 + }, + { + "epoch": 1.6427289048473968, + "grad_norm": 1.0753581523895264, + "learning_rate": 0.0009048473967684022, + "loss": 0.4942, + "step": 12810 + }, + { + "epoch": 1.6440112849448578, + "grad_norm": 0.8667649030685425, + "learning_rate": 0.0009039924767034282, + "loss": 0.5448, + "step": 12820 + }, + { + "epoch": 1.6452936650423187, + "grad_norm": 0.7869744300842285, + "learning_rate": 0.0009031375566384543, + "loss": 0.4488, + "step": 12830 + }, + { + "epoch": 1.6465760451397795, + "grad_norm": 0.9994969367980957, + "learning_rate": 0.0009022826365734805, + "loss": 0.4666, + "step": 12840 + }, + { + "epoch": 1.6478584252372404, + "grad_norm": 1.0947333574295044, + "learning_rate": 0.0009014277165085065, + "loss": 0.5769, + "step": 12850 + }, + { + "epoch": 1.6491408053347012, + "grad_norm": 0.44217410683631897, + "learning_rate": 0.0009005727964435325, + "loss": 0.4351, + "step": 12860 + }, + { + "epoch": 1.650423185432162, + "grad_norm": 1.5107439756393433, + "learning_rate": 0.0008997178763785587, + "loss": 0.4896, + "step": 12870 + }, + { + "epoch": 1.6517055655296229, + "grad_norm": 1.3806378841400146, + "learning_rate": 0.0008988629563135847, + "loss": 0.4819, + "step": 12880 + }, + { + "epoch": 1.6529879456270837, + "grad_norm": 1.0684335231781006, + "learning_rate": 0.0008980080362486108, + "loss": 0.5076, + "step": 12890 + }, + { + "epoch": 1.6542703257245448, + "grad_norm": 0.6249770522117615, + "learning_rate": 0.0008971531161836369, + "loss": 0.5086, + "step": 12900 + }, + { + "epoch": 1.6555527058220056, + "grad_norm": 0.8188676238059998, + "learning_rate": 0.0008962981961186629, + "loss": 0.4622, + "step": 12910 + }, + { + "epoch": 1.6568350859194665, + "grad_norm": 0.6623940467834473, + "learning_rate": 0.000895443276053689, + "loss": 0.5601, + "step": 12920 + }, + { + "epoch": 1.6581174660169276, + "grad_norm": 0.7788714170455933, + "learning_rate": 0.000894588355988715, + "loss": 0.4996, + "step": 12930 + }, + { + "epoch": 1.6593998461143884, + "grad_norm": 1.1400748491287231, + "learning_rate": 0.0008937334359237411, + "loss": 0.6324, + "step": 12940 + }, + { + "epoch": 1.6606822262118492, + "grad_norm": 0.873874306678772, + "learning_rate": 0.0008928785158587672, + "loss": 0.4832, + "step": 12950 + }, + { + "epoch": 1.66196460630931, + "grad_norm": 1.9320780038833618, + "learning_rate": 0.0008920235957937934, + "loss": 0.5149, + "step": 12960 + }, + { + "epoch": 1.663246986406771, + "grad_norm": 0.7874430418014526, + "learning_rate": 0.0008911686757288193, + "loss": 0.5972, + "step": 12970 + }, + { + "epoch": 1.6645293665042318, + "grad_norm": 0.5509324073791504, + "learning_rate": 0.0008903137556638454, + "loss": 0.6655, + "step": 12980 + }, + { + "epoch": 1.6658117466016926, + "grad_norm": 1.294395089149475, + "learning_rate": 0.0008894588355988715, + "loss": 0.5809, + "step": 12990 + }, + { + "epoch": 1.6670941266991535, + "grad_norm": 1.0513406991958618, + "learning_rate": 0.0008886039155338977, + "loss": 0.5312, + "step": 13000 + }, + { + "epoch": 1.6683765067966145, + "grad_norm": 0.7653344869613647, + "learning_rate": 0.0008877489954689236, + "loss": 0.5772, + "step": 13010 + }, + { + "epoch": 1.6696588868940754, + "grad_norm": 0.8619967103004456, + "learning_rate": 0.0008868940754039497, + "loss": 0.5977, + "step": 13020 + }, + { + "epoch": 1.6709412669915364, + "grad_norm": 0.5212082862854004, + "learning_rate": 0.0008860391553389759, + "loss": 0.514, + "step": 13030 + }, + { + "epoch": 1.6722236470889973, + "grad_norm": 1.150930404663086, + "learning_rate": 0.0008851842352740019, + "loss": 0.4911, + "step": 13040 + }, + { + "epoch": 1.6735060271864581, + "grad_norm": 1.1434667110443115, + "learning_rate": 0.0008843293152090279, + "loss": 0.6873, + "step": 13050 + }, + { + "epoch": 1.674788407283919, + "grad_norm": 0.8454691171646118, + "learning_rate": 0.0008834743951440541, + "loss": 0.5696, + "step": 13060 + }, + { + "epoch": 1.6760707873813798, + "grad_norm": 0.9413100481033325, + "learning_rate": 0.0008826194750790801, + "loss": 0.5203, + "step": 13070 + }, + { + "epoch": 1.6773531674788407, + "grad_norm": 1.1929399967193604, + "learning_rate": 0.0008817645550141062, + "loss": 0.6685, + "step": 13080 + }, + { + "epoch": 1.6786355475763015, + "grad_norm": 0.7188916206359863, + "learning_rate": 0.0008809096349491323, + "loss": 0.5879, + "step": 13090 + }, + { + "epoch": 1.6799179276737624, + "grad_norm": 1.4014157056808472, + "learning_rate": 0.0008800547148841583, + "loss": 0.7174, + "step": 13100 + }, + { + "epoch": 1.6812003077712234, + "grad_norm": 1.280308485031128, + "learning_rate": 0.0008791997948191844, + "loss": 0.6408, + "step": 13110 + }, + { + "epoch": 1.6824826878686843, + "grad_norm": 0.7468828558921814, + "learning_rate": 0.0008783448747542106, + "loss": 0.5482, + "step": 13120 + }, + { + "epoch": 1.6837650679661451, + "grad_norm": 1.2528510093688965, + "learning_rate": 0.0008774899546892366, + "loss": 0.5617, + "step": 13130 + }, + { + "epoch": 1.6850474480636062, + "grad_norm": 0.8010075092315674, + "learning_rate": 0.0008766350346242626, + "loss": 0.497, + "step": 13140 + }, + { + "epoch": 1.686329828161067, + "grad_norm": 0.6345623731613159, + "learning_rate": 0.0008757801145592888, + "loss": 0.5155, + "step": 13150 + }, + { + "epoch": 1.6876122082585279, + "grad_norm": 0.8836259245872498, + "learning_rate": 0.0008749251944943149, + "loss": 0.5199, + "step": 13160 + }, + { + "epoch": 1.6888945883559887, + "grad_norm": 1.7731555700302124, + "learning_rate": 0.0008740702744293408, + "loss": 0.7202, + "step": 13170 + }, + { + "epoch": 1.6901769684534496, + "grad_norm": 0.9856127500534058, + "learning_rate": 0.0008732153543643669, + "loss": 0.5554, + "step": 13180 + }, + { + "epoch": 1.6914593485509104, + "grad_norm": 0.678236722946167, + "learning_rate": 0.0008723604342993931, + "loss": 0.5101, + "step": 13190 + }, + { + "epoch": 1.6927417286483712, + "grad_norm": 1.093641996383667, + "learning_rate": 0.0008715055142344191, + "loss": 0.4779, + "step": 13200 + }, + { + "epoch": 1.6940241087458323, + "grad_norm": 0.8706515431404114, + "learning_rate": 0.0008706505941694451, + "loss": 0.5454, + "step": 13210 + }, + { + "epoch": 1.6953064888432932, + "grad_norm": 1.9918123483657837, + "learning_rate": 0.0008697956741044713, + "loss": 0.5804, + "step": 13220 + }, + { + "epoch": 1.696588868940754, + "grad_norm": 0.68386310338974, + "learning_rate": 0.0008689407540394973, + "loss": 0.5132, + "step": 13230 + }, + { + "epoch": 1.697871249038215, + "grad_norm": 0.9491919279098511, + "learning_rate": 0.0008680858339745234, + "loss": 0.5333, + "step": 13240 + }, + { + "epoch": 1.699153629135676, + "grad_norm": 1.1484148502349854, + "learning_rate": 0.0008672309139095495, + "loss": 0.5035, + "step": 13250 + }, + { + "epoch": 1.7004360092331368, + "grad_norm": 0.6695376038551331, + "learning_rate": 0.0008663759938445755, + "loss": 0.4074, + "step": 13260 + }, + { + "epoch": 1.7017183893305976, + "grad_norm": 1.1360992193222046, + "learning_rate": 0.0008655210737796016, + "loss": 0.4778, + "step": 13270 + }, + { + "epoch": 1.7030007694280584, + "grad_norm": 0.9604383111000061, + "learning_rate": 0.0008646661537146277, + "loss": 0.5601, + "step": 13280 + }, + { + "epoch": 1.7042831495255193, + "grad_norm": 1.3364579677581787, + "learning_rate": 0.0008638112336496538, + "loss": 0.4396, + "step": 13290 + }, + { + "epoch": 1.7055655296229801, + "grad_norm": 0.6195886731147766, + "learning_rate": 0.0008629563135846798, + "loss": 0.5143, + "step": 13300 + }, + { + "epoch": 1.706847909720441, + "grad_norm": 0.9771988987922668, + "learning_rate": 0.000862101393519706, + "loss": 0.504, + "step": 13310 + }, + { + "epoch": 1.708130289817902, + "grad_norm": 1.2056382894515991, + "learning_rate": 0.000861246473454732, + "loss": 0.5157, + "step": 13320 + }, + { + "epoch": 1.709412669915363, + "grad_norm": 1.0065749883651733, + "learning_rate": 0.000860391553389758, + "loss": 0.5178, + "step": 13330 + }, + { + "epoch": 1.710695050012824, + "grad_norm": 0.5957716703414917, + "learning_rate": 0.0008595366333247842, + "loss": 0.4264, + "step": 13340 + }, + { + "epoch": 1.7119774301102848, + "grad_norm": 0.7101840376853943, + "learning_rate": 0.0008586817132598103, + "loss": 0.4966, + "step": 13350 + }, + { + "epoch": 1.7132598102077456, + "grad_norm": 0.7323270440101624, + "learning_rate": 0.0008578267931948362, + "loss": 0.427, + "step": 13360 + }, + { + "epoch": 1.7145421903052065, + "grad_norm": 0.798598051071167, + "learning_rate": 0.0008569718731298623, + "loss": 0.3831, + "step": 13370 + }, + { + "epoch": 1.7158245704026673, + "grad_norm": 0.7318591475486755, + "learning_rate": 0.0008561169530648885, + "loss": 0.5691, + "step": 13380 + }, + { + "epoch": 1.7171069505001282, + "grad_norm": 0.951507031917572, + "learning_rate": 0.0008552620329999145, + "loss": 0.8433, + "step": 13390 + }, + { + "epoch": 1.718389330597589, + "grad_norm": 0.7206411361694336, + "learning_rate": 0.0008544071129349405, + "loss": 0.5138, + "step": 13400 + }, + { + "epoch": 1.7196717106950499, + "grad_norm": 0.9043763875961304, + "learning_rate": 0.0008535521928699667, + "loss": 0.5955, + "step": 13410 + }, + { + "epoch": 1.720954090792511, + "grad_norm": 2.0460662841796875, + "learning_rate": 0.0008526972728049928, + "loss": 0.5227, + "step": 13420 + }, + { + "epoch": 1.7222364708899718, + "grad_norm": 1.167067289352417, + "learning_rate": 0.0008518423527400188, + "loss": 0.4719, + "step": 13430 + }, + { + "epoch": 1.7235188509874326, + "grad_norm": 0.7188780307769775, + "learning_rate": 0.0008509874326750449, + "loss": 0.5359, + "step": 13440 + }, + { + "epoch": 1.7248012310848937, + "grad_norm": 1.6519628763198853, + "learning_rate": 0.000850132512610071, + "loss": 0.5869, + "step": 13450 + }, + { + "epoch": 1.7260836111823545, + "grad_norm": 1.1188615560531616, + "learning_rate": 0.000849277592545097, + "loss": 0.6952, + "step": 13460 + }, + { + "epoch": 1.7273659912798154, + "grad_norm": 0.7881381511688232, + "learning_rate": 0.0008484226724801232, + "loss": 0.5809, + "step": 13470 + }, + { + "epoch": 1.7286483713772762, + "grad_norm": 0.39665651321411133, + "learning_rate": 0.0008475677524151492, + "loss": 0.6488, + "step": 13480 + }, + { + "epoch": 1.729930751474737, + "grad_norm": 0.7501810193061829, + "learning_rate": 0.0008467128323501752, + "loss": 0.6566, + "step": 13490 + }, + { + "epoch": 1.731213131572198, + "grad_norm": 0.7444621324539185, + "learning_rate": 0.0008458579122852014, + "loss": 0.559, + "step": 13500 + }, + { + "epoch": 1.7324955116696588, + "grad_norm": 1.1984418630599976, + "learning_rate": 0.0008450029922202275, + "loss": 0.5926, + "step": 13510 + }, + { + "epoch": 1.7337778917671198, + "grad_norm": 1.1419836282730103, + "learning_rate": 0.0008441480721552534, + "loss": 0.4816, + "step": 13520 + }, + { + "epoch": 1.7350602718645807, + "grad_norm": 1.021096110343933, + "learning_rate": 0.0008432931520902796, + "loss": 0.534, + "step": 13530 + }, + { + "epoch": 1.7363426519620415, + "grad_norm": 0.5502016544342041, + "learning_rate": 0.0008424382320253057, + "loss": 0.5077, + "step": 13540 + }, + { + "epoch": 1.7376250320595026, + "grad_norm": 1.1149070262908936, + "learning_rate": 0.0008415833119603318, + "loss": 0.6677, + "step": 13550 + }, + { + "epoch": 1.7389074121569634, + "grad_norm": 1.017102837562561, + "learning_rate": 0.0008407283918953577, + "loss": 0.5076, + "step": 13560 + }, + { + "epoch": 1.7401897922544243, + "grad_norm": 1.4042975902557373, + "learning_rate": 0.0008398734718303839, + "loss": 0.588, + "step": 13570 + }, + { + "epoch": 1.7414721723518851, + "grad_norm": 1.2214784622192383, + "learning_rate": 0.00083901855176541, + "loss": 0.6285, + "step": 13580 + }, + { + "epoch": 1.742754552449346, + "grad_norm": 0.9134330153465271, + "learning_rate": 0.000838163631700436, + "loss": 0.4326, + "step": 13590 + }, + { + "epoch": 1.7440369325468068, + "grad_norm": 1.0869004726409912, + "learning_rate": 0.0008373087116354621, + "loss": 0.4825, + "step": 13600 + }, + { + "epoch": 1.7453193126442677, + "grad_norm": 0.8992425799369812, + "learning_rate": 0.0008364537915704882, + "loss": 0.6364, + "step": 13610 + }, + { + "epoch": 1.7466016927417285, + "grad_norm": 1.2545522451400757, + "learning_rate": 0.0008355988715055142, + "loss": 0.4515, + "step": 13620 + }, + { + "epoch": 1.7478840728391896, + "grad_norm": 0.7109204530715942, + "learning_rate": 0.0008347439514405403, + "loss": 0.5838, + "step": 13630 + }, + { + "epoch": 1.7491664529366504, + "grad_norm": 1.2190492153167725, + "learning_rate": 0.0008338890313755664, + "loss": 0.4962, + "step": 13640 + }, + { + "epoch": 1.7504488330341115, + "grad_norm": 0.9201902151107788, + "learning_rate": 0.0008330341113105924, + "loss": 0.4827, + "step": 13650 + }, + { + "epoch": 1.7517312131315723, + "grad_norm": 1.5981885194778442, + "learning_rate": 0.0008321791912456186, + "loss": 0.6323, + "step": 13660 + }, + { + "epoch": 1.7530135932290332, + "grad_norm": 0.8127802014350891, + "learning_rate": 0.0008313242711806446, + "loss": 0.407, + "step": 13670 + }, + { + "epoch": 1.754295973326494, + "grad_norm": 0.7639079689979553, + "learning_rate": 0.0008304693511156706, + "loss": 0.455, + "step": 13680 + }, + { + "epoch": 1.7555783534239549, + "grad_norm": 1.8039822578430176, + "learning_rate": 0.0008296144310506968, + "loss": 0.53, + "step": 13690 + }, + { + "epoch": 1.7568607335214157, + "grad_norm": 1.4500998258590698, + "learning_rate": 0.0008287595109857229, + "loss": 0.6162, + "step": 13700 + }, + { + "epoch": 1.7581431136188765, + "grad_norm": 2.520433187484741, + "learning_rate": 0.0008279045909207489, + "loss": 0.5441, + "step": 13710 + }, + { + "epoch": 1.7594254937163374, + "grad_norm": 1.0140107870101929, + "learning_rate": 0.000827049670855775, + "loss": 0.4956, + "step": 13720 + }, + { + "epoch": 1.7607078738137985, + "grad_norm": 0.4604131281375885, + "learning_rate": 0.0008261947507908011, + "loss": 0.51, + "step": 13730 + }, + { + "epoch": 1.7619902539112593, + "grad_norm": 0.9998058080673218, + "learning_rate": 0.0008253398307258272, + "loss": 0.5095, + "step": 13740 + }, + { + "epoch": 1.7632726340087201, + "grad_norm": 0.8125320076942444, + "learning_rate": 0.0008244849106608532, + "loss": 0.4844, + "step": 13750 + }, + { + "epoch": 1.7645550141061812, + "grad_norm": 2.0400047302246094, + "learning_rate": 0.0008236299905958793, + "loss": 0.521, + "step": 13760 + }, + { + "epoch": 1.765837394203642, + "grad_norm": 1.3145325183868408, + "learning_rate": 0.0008227750705309054, + "loss": 0.5762, + "step": 13770 + }, + { + "epoch": 1.767119774301103, + "grad_norm": 1.6746065616607666, + "learning_rate": 0.0008219201504659315, + "loss": 0.5174, + "step": 13780 + }, + { + "epoch": 1.7684021543985637, + "grad_norm": 1.6866681575775146, + "learning_rate": 0.0008210652304009575, + "loss": 0.4871, + "step": 13790 + }, + { + "epoch": 1.7696845344960246, + "grad_norm": 1.2878737449645996, + "learning_rate": 0.0008202103103359836, + "loss": 0.6269, + "step": 13800 + }, + { + "epoch": 1.7709669145934854, + "grad_norm": 1.4274048805236816, + "learning_rate": 0.0008193553902710096, + "loss": 0.5801, + "step": 13810 + }, + { + "epoch": 1.7722492946909463, + "grad_norm": 0.7440363168716431, + "learning_rate": 0.0008185004702060358, + "loss": 0.4935, + "step": 13820 + }, + { + "epoch": 1.7735316747884071, + "grad_norm": 0.7374436259269714, + "learning_rate": 0.0008176455501410618, + "loss": 0.4152, + "step": 13830 + }, + { + "epoch": 1.7748140548858682, + "grad_norm": 0.7930302619934082, + "learning_rate": 0.0008167906300760878, + "loss": 0.5713, + "step": 13840 + }, + { + "epoch": 1.776096434983329, + "grad_norm": 0.8752467036247253, + "learning_rate": 0.000815935710011114, + "loss": 0.5714, + "step": 13850 + }, + { + "epoch": 1.77737881508079, + "grad_norm": 0.8803384900093079, + "learning_rate": 0.0008150807899461401, + "loss": 0.4603, + "step": 13860 + }, + { + "epoch": 1.778661195178251, + "grad_norm": 0.8935397863388062, + "learning_rate": 0.0008142258698811661, + "loss": 0.432, + "step": 13870 + }, + { + "epoch": 1.7799435752757118, + "grad_norm": 1.1395505666732788, + "learning_rate": 0.0008133709498161922, + "loss": 0.5874, + "step": 13880 + }, + { + "epoch": 1.7812259553731726, + "grad_norm": 1.5835202932357788, + "learning_rate": 0.0008125160297512183, + "loss": 0.5225, + "step": 13890 + }, + { + "epoch": 1.7825083354706335, + "grad_norm": 0.9241839647293091, + "learning_rate": 0.0008116611096862444, + "loss": 0.4985, + "step": 13900 + }, + { + "epoch": 1.7837907155680943, + "grad_norm": 0.7671691179275513, + "learning_rate": 0.0008108061896212704, + "loss": 0.6306, + "step": 13910 + }, + { + "epoch": 1.7850730956655552, + "grad_norm": 0.9022935628890991, + "learning_rate": 0.0008099512695562965, + "loss": 0.5379, + "step": 13920 + }, + { + "epoch": 1.786355475763016, + "grad_norm": 1.420850157737732, + "learning_rate": 0.0008090963494913226, + "loss": 0.3985, + "step": 13930 + }, + { + "epoch": 1.787637855860477, + "grad_norm": 0.733504593372345, + "learning_rate": 0.0008082414294263487, + "loss": 0.7121, + "step": 13940 + }, + { + "epoch": 1.788920235957938, + "grad_norm": 1.4567188024520874, + "learning_rate": 0.0008073865093613747, + "loss": 0.6869, + "step": 13950 + }, + { + "epoch": 1.7902026160553988, + "grad_norm": 0.6763759255409241, + "learning_rate": 0.0008065315892964008, + "loss": 0.621, + "step": 13960 + }, + { + "epoch": 1.7914849961528598, + "grad_norm": 0.8437899947166443, + "learning_rate": 0.0008056766692314269, + "loss": 0.5373, + "step": 13970 + }, + { + "epoch": 1.7927673762503207, + "grad_norm": 0.4620661437511444, + "learning_rate": 0.0008048217491664529, + "loss": 0.5236, + "step": 13980 + }, + { + "epoch": 1.7940497563477815, + "grad_norm": 0.7685003280639648, + "learning_rate": 0.000803966829101479, + "loss": 0.4337, + "step": 13990 + }, + { + "epoch": 1.7953321364452424, + "grad_norm": 1.1188052892684937, + "learning_rate": 0.0008031119090365052, + "loss": 0.691, + "step": 14000 + }, + { + "epoch": 1.7966145165427032, + "grad_norm": 0.8180050849914551, + "learning_rate": 0.0008022569889715312, + "loss": 0.403, + "step": 14010 + }, + { + "epoch": 1.797896896640164, + "grad_norm": 0.6858202219009399, + "learning_rate": 0.0008014020689065572, + "loss": 0.4301, + "step": 14020 + }, + { + "epoch": 1.799179276737625, + "grad_norm": 0.5204628705978394, + "learning_rate": 0.0008005471488415833, + "loss": 0.5291, + "step": 14030 + }, + { + "epoch": 1.800461656835086, + "grad_norm": 1.3663321733474731, + "learning_rate": 0.0007996922287766094, + "loss": 0.5184, + "step": 14040 + }, + { + "epoch": 1.8017440369325468, + "grad_norm": 1.3524236679077148, + "learning_rate": 0.0007988373087116355, + "loss": 0.5009, + "step": 14050 + }, + { + "epoch": 1.8030264170300077, + "grad_norm": 0.8444858193397522, + "learning_rate": 0.0007979823886466615, + "loss": 0.5588, + "step": 14060 + }, + { + "epoch": 1.8043087971274687, + "grad_norm": 1.0178775787353516, + "learning_rate": 0.0007971274685816876, + "loss": 0.4677, + "step": 14070 + }, + { + "epoch": 1.8055911772249296, + "grad_norm": 0.7170798778533936, + "learning_rate": 0.0007962725485167137, + "loss": 0.6669, + "step": 14080 + }, + { + "epoch": 1.8068735573223904, + "grad_norm": 0.8029789328575134, + "learning_rate": 0.0007954176284517398, + "loss": 0.4776, + "step": 14090 + }, + { + "epoch": 1.8081559374198513, + "grad_norm": 1.1515179872512817, + "learning_rate": 0.0007945627083867658, + "loss": 0.3991, + "step": 14100 + }, + { + "epoch": 1.809438317517312, + "grad_norm": 1.060905933380127, + "learning_rate": 0.0007937077883217919, + "loss": 0.5106, + "step": 14110 + }, + { + "epoch": 1.810720697614773, + "grad_norm": 0.6174659132957458, + "learning_rate": 0.000792852868256818, + "loss": 0.5666, + "step": 14120 + }, + { + "epoch": 1.8120030777122338, + "grad_norm": 1.2170026302337646, + "learning_rate": 0.0007919979481918442, + "loss": 0.6201, + "step": 14130 + }, + { + "epoch": 1.8132854578096946, + "grad_norm": 1.1146901845932007, + "learning_rate": 0.0007911430281268701, + "loss": 0.4525, + "step": 14140 + }, + { + "epoch": 1.8145678379071557, + "grad_norm": 0.9617615342140198, + "learning_rate": 0.0007902881080618962, + "loss": 0.4363, + "step": 14150 + }, + { + "epoch": 1.8158502180046165, + "grad_norm": 0.9604726433753967, + "learning_rate": 0.0007894331879969224, + "loss": 0.4125, + "step": 14160 + }, + { + "epoch": 1.8171325981020776, + "grad_norm": 1.3785549402236938, + "learning_rate": 0.0007885782679319484, + "loss": 0.5336, + "step": 14170 + }, + { + "epoch": 1.8184149781995385, + "grad_norm": 1.3045930862426758, + "learning_rate": 0.0007877233478669744, + "loss": 0.6223, + "step": 14180 + }, + { + "epoch": 1.8196973582969993, + "grad_norm": 1.0294426679611206, + "learning_rate": 0.0007868684278020006, + "loss": 0.567, + "step": 14190 + }, + { + "epoch": 1.8209797383944601, + "grad_norm": 0.7799472808837891, + "learning_rate": 0.0007860135077370266, + "loss": 0.5708, + "step": 14200 + }, + { + "epoch": 1.822262118491921, + "grad_norm": 0.9570887684822083, + "learning_rate": 0.0007851585876720527, + "loss": 0.7051, + "step": 14210 + }, + { + "epoch": 1.8235444985893818, + "grad_norm": 1.0479843616485596, + "learning_rate": 0.0007843036676070788, + "loss": 0.5006, + "step": 14220 + }, + { + "epoch": 1.8248268786868427, + "grad_norm": 0.8999461531639099, + "learning_rate": 0.0007834487475421048, + "loss": 0.5086, + "step": 14230 + }, + { + "epoch": 1.8261092587843035, + "grad_norm": 0.9917542934417725, + "learning_rate": 0.0007825938274771309, + "loss": 0.7195, + "step": 14240 + }, + { + "epoch": 1.8273916388817646, + "grad_norm": 0.7102358341217041, + "learning_rate": 0.000781738907412157, + "loss": 0.3533, + "step": 14250 + }, + { + "epoch": 1.8286740189792254, + "grad_norm": 0.8428940176963806, + "learning_rate": 0.000780883987347183, + "loss": 0.47, + "step": 14260 + }, + { + "epoch": 1.8299563990766863, + "grad_norm": 1.4296576976776123, + "learning_rate": 0.0007800290672822091, + "loss": 0.4745, + "step": 14270 + }, + { + "epoch": 1.8312387791741473, + "grad_norm": 0.6871338486671448, + "learning_rate": 0.0007791741472172352, + "loss": 0.4092, + "step": 14280 + }, + { + "epoch": 1.8325211592716082, + "grad_norm": 1.1090123653411865, + "learning_rate": 0.0007783192271522614, + "loss": 0.6469, + "step": 14290 + }, + { + "epoch": 1.833803539369069, + "grad_norm": 0.88601154088974, + "learning_rate": 0.0007774643070872873, + "loss": 0.55, + "step": 14300 + }, + { + "epoch": 1.8350859194665299, + "grad_norm": 1.2753747701644897, + "learning_rate": 0.0007766093870223134, + "loss": 0.5905, + "step": 14310 + }, + { + "epoch": 1.8363682995639907, + "grad_norm": 1.068947196006775, + "learning_rate": 0.0007757544669573396, + "loss": 0.557, + "step": 14320 + }, + { + "epoch": 1.8376506796614516, + "grad_norm": 0.49127456545829773, + "learning_rate": 0.0007748995468923655, + "loss": 0.5636, + "step": 14330 + }, + { + "epoch": 1.8389330597589124, + "grad_norm": 0.5474888682365417, + "learning_rate": 0.0007740446268273916, + "loss": 0.5462, + "step": 14340 + }, + { + "epoch": 1.8402154398563735, + "grad_norm": 0.7848386168479919, + "learning_rate": 0.0007731897067624178, + "loss": 0.4884, + "step": 14350 + }, + { + "epoch": 1.8414978199538343, + "grad_norm": 1.106774091720581, + "learning_rate": 0.0007723347866974438, + "loss": 0.5223, + "step": 14360 + }, + { + "epoch": 1.8427802000512952, + "grad_norm": 1.2404162883758545, + "learning_rate": 0.0007714798666324698, + "loss": 0.5551, + "step": 14370 + }, + { + "epoch": 1.8440625801487562, + "grad_norm": 1.1383230686187744, + "learning_rate": 0.000770624946567496, + "loss": 0.4654, + "step": 14380 + }, + { + "epoch": 1.845344960246217, + "grad_norm": 0.90556800365448, + "learning_rate": 0.000769770026502522, + "loss": 0.5061, + "step": 14390 + }, + { + "epoch": 1.846627340343678, + "grad_norm": 0.922673761844635, + "learning_rate": 0.0007689151064375481, + "loss": 0.5311, + "step": 14400 + }, + { + "epoch": 1.8479097204411388, + "grad_norm": 1.4559677839279175, + "learning_rate": 0.0007680601863725742, + "loss": 0.5076, + "step": 14410 + }, + { + "epoch": 1.8491921005385996, + "grad_norm": 0.6838889718055725, + "learning_rate": 0.0007672052663076002, + "loss": 0.6319, + "step": 14420 + }, + { + "epoch": 1.8504744806360605, + "grad_norm": 0.5238109230995178, + "learning_rate": 0.0007663503462426263, + "loss": 0.3426, + "step": 14430 + }, + { + "epoch": 1.8517568607335213, + "grad_norm": 0.9826338887214661, + "learning_rate": 0.0007654954261776525, + "loss": 0.6484, + "step": 14440 + }, + { + "epoch": 1.8530392408309821, + "grad_norm": 0.6289449334144592, + "learning_rate": 0.0007646405061126785, + "loss": 0.5969, + "step": 14450 + }, + { + "epoch": 1.8543216209284432, + "grad_norm": 0.7887721657752991, + "learning_rate": 0.0007637855860477045, + "loss": 0.6056, + "step": 14460 + }, + { + "epoch": 1.855604001025904, + "grad_norm": 0.9152905344963074, + "learning_rate": 0.0007629306659827306, + "loss": 0.568, + "step": 14470 + }, + { + "epoch": 1.8568863811233651, + "grad_norm": 1.7868821620941162, + "learning_rate": 0.0007620757459177568, + "loss": 0.5455, + "step": 14480 + }, + { + "epoch": 1.858168761220826, + "grad_norm": 0.6593843102455139, + "learning_rate": 0.0007612208258527827, + "loss": 0.5395, + "step": 14490 + }, + { + "epoch": 1.8594511413182868, + "grad_norm": 1.7809274196624756, + "learning_rate": 0.0007603659057878088, + "loss": 0.5335, + "step": 14500 + }, + { + "epoch": 1.8607335214157477, + "grad_norm": 0.5800178050994873, + "learning_rate": 0.000759510985722835, + "loss": 0.5194, + "step": 14510 + }, + { + "epoch": 1.8620159015132085, + "grad_norm": 1.2198340892791748, + "learning_rate": 0.000758656065657861, + "loss": 0.5906, + "step": 14520 + }, + { + "epoch": 1.8632982816106693, + "grad_norm": 0.9660494327545166, + "learning_rate": 0.000757801145592887, + "loss": 0.6645, + "step": 14530 + }, + { + "epoch": 1.8645806617081302, + "grad_norm": 0.9636703729629517, + "learning_rate": 0.0007569462255279132, + "loss": 0.4877, + "step": 14540 + }, + { + "epoch": 1.865863041805591, + "grad_norm": 1.3353137969970703, + "learning_rate": 0.0007560913054629392, + "loss": 0.5084, + "step": 14550 + }, + { + "epoch": 1.867145421903052, + "grad_norm": 0.5856647491455078, + "learning_rate": 0.0007552363853979653, + "loss": 0.4878, + "step": 14560 + }, + { + "epoch": 1.868427802000513, + "grad_norm": 0.8503313064575195, + "learning_rate": 0.0007543814653329914, + "loss": 0.522, + "step": 14570 + }, + { + "epoch": 1.8697101820979738, + "grad_norm": 0.700071394443512, + "learning_rate": 0.0007535265452680175, + "loss": 0.6389, + "step": 14580 + }, + { + "epoch": 1.8709925621954349, + "grad_norm": 0.879586398601532, + "learning_rate": 0.0007526716252030435, + "loss": 0.5235, + "step": 14590 + }, + { + "epoch": 1.8722749422928957, + "grad_norm": 0.9506930708885193, + "learning_rate": 0.0007518167051380697, + "loss": 0.4805, + "step": 14600 + }, + { + "epoch": 1.8735573223903565, + "grad_norm": 1.2647199630737305, + "learning_rate": 0.0007509617850730957, + "loss": 0.5111, + "step": 14610 + }, + { + "epoch": 1.8748397024878174, + "grad_norm": 0.6026537418365479, + "learning_rate": 0.0007501068650081217, + "loss": 0.3779, + "step": 14620 + }, + { + "epoch": 1.8761220825852782, + "grad_norm": 1.2891452312469482, + "learning_rate": 0.0007492519449431479, + "loss": 0.5285, + "step": 14630 + }, + { + "epoch": 1.877404462682739, + "grad_norm": 0.638653039932251, + "learning_rate": 0.000748397024878174, + "loss": 0.6281, + "step": 14640 + }, + { + "epoch": 1.8786868427802, + "grad_norm": 0.8396057486534119, + "learning_rate": 0.0007475421048131999, + "loss": 0.5179, + "step": 14650 + }, + { + "epoch": 1.8799692228776608, + "grad_norm": 0.5984233021736145, + "learning_rate": 0.000746687184748226, + "loss": 0.3829, + "step": 14660 + }, + { + "epoch": 1.8812516029751218, + "grad_norm": 0.9279236793518066, + "learning_rate": 0.0007458322646832522, + "loss": 0.4392, + "step": 14670 + }, + { + "epoch": 1.8825339830725827, + "grad_norm": 0.736960768699646, + "learning_rate": 0.0007449773446182781, + "loss": 0.5099, + "step": 14680 + }, + { + "epoch": 1.8838163631700438, + "grad_norm": 2.048767566680908, + "learning_rate": 0.0007441224245533042, + "loss": 0.5781, + "step": 14690 + }, + { + "epoch": 1.8850987432675046, + "grad_norm": 0.7400988340377808, + "learning_rate": 0.0007432675044883304, + "loss": 0.4852, + "step": 14700 + }, + { + "epoch": 1.8863811233649654, + "grad_norm": 1.009475827217102, + "learning_rate": 0.0007424125844233565, + "loss": 0.5739, + "step": 14710 + }, + { + "epoch": 1.8876635034624263, + "grad_norm": 0.7888931035995483, + "learning_rate": 0.0007415576643583824, + "loss": 0.6543, + "step": 14720 + }, + { + "epoch": 1.8889458835598871, + "grad_norm": 1.3015084266662598, + "learning_rate": 0.0007407027442934086, + "loss": 0.673, + "step": 14730 + }, + { + "epoch": 1.890228263657348, + "grad_norm": 0.8470888137817383, + "learning_rate": 0.0007398478242284347, + "loss": 0.5653, + "step": 14740 + }, + { + "epoch": 1.8915106437548088, + "grad_norm": 1.296543002128601, + "learning_rate": 0.0007389929041634607, + "loss": 0.4443, + "step": 14750 + }, + { + "epoch": 1.8927930238522697, + "grad_norm": 0.8180189728736877, + "learning_rate": 0.0007381379840984868, + "loss": 0.5177, + "step": 14760 + }, + { + "epoch": 1.8940754039497307, + "grad_norm": 1.1298378705978394, + "learning_rate": 0.0007372830640335129, + "loss": 0.4627, + "step": 14770 + }, + { + "epoch": 1.8953577840471916, + "grad_norm": 1.1105875968933105, + "learning_rate": 0.0007364281439685389, + "loss": 0.6339, + "step": 14780 + }, + { + "epoch": 1.8966401641446526, + "grad_norm": 0.8860158324241638, + "learning_rate": 0.0007355732239035651, + "loss": 0.4824, + "step": 14790 + }, + { + "epoch": 1.8979225442421135, + "grad_norm": 1.0457231998443604, + "learning_rate": 0.0007347183038385911, + "loss": 0.625, + "step": 14800 + }, + { + "epoch": 1.8992049243395743, + "grad_norm": 1.6747428178787231, + "learning_rate": 0.0007338633837736171, + "loss": 0.6475, + "step": 14810 + }, + { + "epoch": 1.9004873044370352, + "grad_norm": 0.7799145579338074, + "learning_rate": 0.0007330084637086433, + "loss": 0.6254, + "step": 14820 + }, + { + "epoch": 1.901769684534496, + "grad_norm": 0.7187017798423767, + "learning_rate": 0.0007321535436436694, + "loss": 0.5614, + "step": 14830 + }, + { + "epoch": 1.9030520646319569, + "grad_norm": 1.1263870000839233, + "learning_rate": 0.0007312986235786953, + "loss": 0.5503, + "step": 14840 + }, + { + "epoch": 1.9043344447294177, + "grad_norm": 0.8937992453575134, + "learning_rate": 0.0007304437035137215, + "loss": 0.6243, + "step": 14850 + }, + { + "epoch": 1.9056168248268786, + "grad_norm": 0.9904497861862183, + "learning_rate": 0.0007295887834487476, + "loss": 0.616, + "step": 14860 + }, + { + "epoch": 1.9068992049243396, + "grad_norm": 1.0227149724960327, + "learning_rate": 0.0007287338633837737, + "loss": 0.4517, + "step": 14870 + }, + { + "epoch": 1.9081815850218005, + "grad_norm": 0.5922604203224182, + "learning_rate": 0.0007278789433187996, + "loss": 0.3924, + "step": 14880 + }, + { + "epoch": 1.9094639651192613, + "grad_norm": 0.9521912336349487, + "learning_rate": 0.0007270240232538258, + "loss": 0.3611, + "step": 14890 + }, + { + "epoch": 1.9107463452167224, + "grad_norm": 1.0910950899124146, + "learning_rate": 0.0007261691031888519, + "loss": 0.5036, + "step": 14900 + }, + { + "epoch": 1.9120287253141832, + "grad_norm": 0.8863834738731384, + "learning_rate": 0.000725314183123878, + "loss": 0.5765, + "step": 14910 + }, + { + "epoch": 1.913311105411644, + "grad_norm": 0.6470763683319092, + "learning_rate": 0.000724459263058904, + "loss": 0.518, + "step": 14920 + }, + { + "epoch": 1.914593485509105, + "grad_norm": 1.0323649644851685, + "learning_rate": 0.0007236043429939301, + "loss": 0.4739, + "step": 14930 + }, + { + "epoch": 1.9158758656065658, + "grad_norm": 1.0393568277359009, + "learning_rate": 0.0007227494229289561, + "loss": 0.5144, + "step": 14940 + }, + { + "epoch": 1.9171582457040266, + "grad_norm": 0.9331060647964478, + "learning_rate": 0.0007218945028639823, + "loss": 0.4523, + "step": 14950 + }, + { + "epoch": 1.9184406258014874, + "grad_norm": 0.44560134410858154, + "learning_rate": 0.0007210395827990083, + "loss": 0.5049, + "step": 14960 + }, + { + "epoch": 1.9197230058989483, + "grad_norm": 0.3747738003730774, + "learning_rate": 0.0007201846627340343, + "loss": 0.5326, + "step": 14970 + }, + { + "epoch": 1.9210053859964094, + "grad_norm": 1.22909414768219, + "learning_rate": 0.0007193297426690605, + "loss": 0.6298, + "step": 14980 + }, + { + "epoch": 1.9222877660938702, + "grad_norm": 0.871557354927063, + "learning_rate": 0.0007184748226040866, + "loss": 0.4603, + "step": 14990 + }, + { + "epoch": 1.9235701461913313, + "grad_norm": 0.933385968208313, + "learning_rate": 0.0007176199025391125, + "loss": 0.4374, + "step": 15000 + }, + { + "epoch": 1.924852526288792, + "grad_norm": 1.254412293434143, + "learning_rate": 0.0007167649824741387, + "loss": 0.5107, + "step": 15010 + }, + { + "epoch": 1.926134906386253, + "grad_norm": 0.7056450247764587, + "learning_rate": 0.0007159100624091648, + "loss": 0.6199, + "step": 15020 + }, + { + "epoch": 1.9274172864837138, + "grad_norm": 1.064945936203003, + "learning_rate": 0.0007150551423441908, + "loss": 0.6342, + "step": 15030 + }, + { + "epoch": 1.9286996665811746, + "grad_norm": 1.5574430227279663, + "learning_rate": 0.0007142002222792169, + "loss": 0.452, + "step": 15040 + }, + { + "epoch": 1.9299820466786355, + "grad_norm": 1.3377269506454468, + "learning_rate": 0.000713345302214243, + "loss": 0.6947, + "step": 15050 + }, + { + "epoch": 1.9312644267760963, + "grad_norm": 2.000349760055542, + "learning_rate": 0.0007124903821492691, + "loss": 0.555, + "step": 15060 + }, + { + "epoch": 1.9325468068735572, + "grad_norm": 1.7576501369476318, + "learning_rate": 0.000711635462084295, + "loss": 0.5503, + "step": 15070 + }, + { + "epoch": 1.9338291869710182, + "grad_norm": 0.6069478392601013, + "learning_rate": 0.0007107805420193212, + "loss": 0.4408, + "step": 15080 + }, + { + "epoch": 1.935111567068479, + "grad_norm": 0.8294945955276489, + "learning_rate": 0.0007099256219543473, + "loss": 0.4914, + "step": 15090 + }, + { + "epoch": 1.93639394716594, + "grad_norm": 0.6512126922607422, + "learning_rate": 0.0007090707018893733, + "loss": 0.5977, + "step": 15100 + }, + { + "epoch": 1.937676327263401, + "grad_norm": 0.736539363861084, + "learning_rate": 0.0007082157818243994, + "loss": 0.4151, + "step": 15110 + }, + { + "epoch": 1.9389587073608618, + "grad_norm": 0.33729881048202515, + "learning_rate": 0.0007073608617594255, + "loss": 0.5454, + "step": 15120 + }, + { + "epoch": 1.9402410874583227, + "grad_norm": 0.603800356388092, + "learning_rate": 0.0007065059416944515, + "loss": 0.3752, + "step": 15130 + }, + { + "epoch": 1.9415234675557835, + "grad_norm": 1.2846564054489136, + "learning_rate": 0.0007056510216294777, + "loss": 0.4826, + "step": 15140 + }, + { + "epoch": 1.9428058476532444, + "grad_norm": 0.5370314717292786, + "learning_rate": 0.0007047961015645037, + "loss": 0.4963, + "step": 15150 + }, + { + "epoch": 1.9440882277507052, + "grad_norm": 1.2183728218078613, + "learning_rate": 0.0007039411814995298, + "loss": 0.6168, + "step": 15160 + }, + { + "epoch": 1.945370607848166, + "grad_norm": 1.1323776245117188, + "learning_rate": 0.0007030862614345559, + "loss": 0.5229, + "step": 15170 + }, + { + "epoch": 1.9466529879456271, + "grad_norm": 0.6309476494789124, + "learning_rate": 0.000702231341369582, + "loss": 0.5992, + "step": 15180 + }, + { + "epoch": 1.947935368043088, + "grad_norm": 1.0059658288955688, + "learning_rate": 0.000701376421304608, + "loss": 0.6053, + "step": 15190 + }, + { + "epoch": 1.9492177481405488, + "grad_norm": 1.3484851121902466, + "learning_rate": 0.0007005215012396341, + "loss": 0.6799, + "step": 15200 + }, + { + "epoch": 1.95050012823801, + "grad_norm": 1.7294602394104004, + "learning_rate": 0.0006996665811746602, + "loss": 0.5543, + "step": 15210 + }, + { + "epoch": 1.9517825083354707, + "grad_norm": 0.3680081367492676, + "learning_rate": 0.0006988116611096863, + "loss": 0.6018, + "step": 15220 + }, + { + "epoch": 1.9530648884329316, + "grad_norm": 0.649849534034729, + "learning_rate": 0.0006979567410447123, + "loss": 0.4314, + "step": 15230 + }, + { + "epoch": 1.9543472685303924, + "grad_norm": 1.2836802005767822, + "learning_rate": 0.0006971018209797384, + "loss": 0.5788, + "step": 15240 + }, + { + "epoch": 1.9556296486278533, + "grad_norm": 0.961693525314331, + "learning_rate": 0.0006962469009147645, + "loss": 0.4917, + "step": 15250 + }, + { + "epoch": 1.9569120287253141, + "grad_norm": 0.6490185856819153, + "learning_rate": 0.0006953919808497906, + "loss": 0.5938, + "step": 15260 + }, + { + "epoch": 1.958194408822775, + "grad_norm": 1.116169810295105, + "learning_rate": 0.0006945370607848166, + "loss": 0.4643, + "step": 15270 + }, + { + "epoch": 1.9594767889202358, + "grad_norm": 0.6350299715995789, + "learning_rate": 0.0006936821407198427, + "loss": 0.3787, + "step": 15280 + }, + { + "epoch": 1.9607591690176969, + "grad_norm": 1.2164093255996704, + "learning_rate": 0.0006928272206548689, + "loss": 0.5109, + "step": 15290 + }, + { + "epoch": 1.9620415491151577, + "grad_norm": 0.6592891812324524, + "learning_rate": 0.0006919723005898949, + "loss": 0.4356, + "step": 15300 + }, + { + "epoch": 1.9633239292126188, + "grad_norm": 0.9608144760131836, + "learning_rate": 0.0006911173805249209, + "loss": 0.4848, + "step": 15310 + }, + { + "epoch": 1.9646063093100796, + "grad_norm": 1.3843706846237183, + "learning_rate": 0.000690262460459947, + "loss": 0.6653, + "step": 15320 + }, + { + "epoch": 1.9658886894075405, + "grad_norm": 0.7894043922424316, + "learning_rate": 0.0006894075403949731, + "loss": 0.5874, + "step": 15330 + }, + { + "epoch": 1.9671710695050013, + "grad_norm": 0.7226264476776123, + "learning_rate": 0.0006885526203299992, + "loss": 0.4726, + "step": 15340 + }, + { + "epoch": 1.9684534496024622, + "grad_norm": 1.4548835754394531, + "learning_rate": 0.0006876977002650252, + "loss": 0.525, + "step": 15350 + }, + { + "epoch": 1.969735829699923, + "grad_norm": 0.6473925709724426, + "learning_rate": 0.0006868427802000513, + "loss": 0.4194, + "step": 15360 + }, + { + "epoch": 1.9710182097973838, + "grad_norm": 0.42092105746269226, + "learning_rate": 0.0006859878601350774, + "loss": 0.3743, + "step": 15370 + }, + { + "epoch": 1.9723005898948447, + "grad_norm": 0.8969188332557678, + "learning_rate": 0.0006851329400701034, + "loss": 0.5525, + "step": 15380 + }, + { + "epoch": 1.9735829699923058, + "grad_norm": 0.8764629364013672, + "learning_rate": 0.0006842780200051295, + "loss": 0.6307, + "step": 15390 + }, + { + "epoch": 1.9748653500897666, + "grad_norm": 0.4493338167667389, + "learning_rate": 0.0006834230999401556, + "loss": 0.5157, + "step": 15400 + }, + { + "epoch": 1.9761477301872274, + "grad_norm": 1.2919282913208008, + "learning_rate": 0.0006825681798751817, + "loss": 0.6005, + "step": 15410 + }, + { + "epoch": 1.9774301102846885, + "grad_norm": 0.78176349401474, + "learning_rate": 0.0006817132598102077, + "loss": 0.4165, + "step": 15420 + }, + { + "epoch": 1.9787124903821494, + "grad_norm": 0.7286581993103027, + "learning_rate": 0.0006808583397452338, + "loss": 0.481, + "step": 15430 + }, + { + "epoch": 1.9799948704796102, + "grad_norm": 0.9931614995002747, + "learning_rate": 0.0006800034196802599, + "loss": 0.5327, + "step": 15440 + }, + { + "epoch": 1.981277250577071, + "grad_norm": 0.9504096508026123, + "learning_rate": 0.0006791484996152861, + "loss": 0.5105, + "step": 15450 + }, + { + "epoch": 1.982559630674532, + "grad_norm": 1.473724365234375, + "learning_rate": 0.000678293579550312, + "loss": 0.5, + "step": 15460 + }, + { + "epoch": 1.9838420107719927, + "grad_norm": 0.9803527593612671, + "learning_rate": 0.0006774386594853381, + "loss": 0.4608, + "step": 15470 + }, + { + "epoch": 1.9851243908694536, + "grad_norm": 0.7079563736915588, + "learning_rate": 0.0006765837394203643, + "loss": 0.3944, + "step": 15480 + }, + { + "epoch": 1.9864067709669146, + "grad_norm": 1.4155352115631104, + "learning_rate": 0.0006757288193553903, + "loss": 0.5133, + "step": 15490 + }, + { + "epoch": 1.9876891510643755, + "grad_norm": 1.1894326210021973, + "learning_rate": 0.0006748738992904163, + "loss": 0.4759, + "step": 15500 + }, + { + "epoch": 1.9889715311618363, + "grad_norm": 0.5845767259597778, + "learning_rate": 0.0006740189792254425, + "loss": 0.4065, + "step": 15510 + }, + { + "epoch": 1.9902539112592974, + "grad_norm": 0.3843328654766083, + "learning_rate": 0.0006731640591604685, + "loss": 0.3548, + "step": 15520 + }, + { + "epoch": 1.9915362913567582, + "grad_norm": 1.3628671169281006, + "learning_rate": 0.0006723091390954946, + "loss": 0.3994, + "step": 15530 + }, + { + "epoch": 1.992818671454219, + "grad_norm": 0.7082588076591492, + "learning_rate": 0.0006714542190305206, + "loss": 0.3953, + "step": 15540 + }, + { + "epoch": 1.99410105155168, + "grad_norm": 0.56044602394104, + "learning_rate": 0.0006705992989655467, + "loss": 0.4706, + "step": 15550 + }, + { + "epoch": 1.9953834316491408, + "grad_norm": 0.6746466159820557, + "learning_rate": 0.0006697443789005728, + "loss": 0.4086, + "step": 15560 + }, + { + "epoch": 1.9966658117466016, + "grad_norm": 0.8921716213226318, + "learning_rate": 0.000668889458835599, + "loss": 0.4959, + "step": 15570 + }, + { + "epoch": 1.9979481918440625, + "grad_norm": 1.0937660932540894, + "learning_rate": 0.0006680345387706249, + "loss": 0.4432, + "step": 15580 + }, + { + "epoch": 1.9992305719415233, + "grad_norm": 0.7332781553268433, + "learning_rate": 0.000667179618705651, + "loss": 0.4807, + "step": 15590 + }, + { + "epoch": 2.000512952038984, + "grad_norm": 0.775030791759491, + "learning_rate": 0.0006663246986406771, + "loss": 0.4139, + "step": 15600 + }, + { + "epoch": 2.0017953321364454, + "grad_norm": 0.6231206059455872, + "learning_rate": 0.0006654697785757033, + "loss": 0.5095, + "step": 15610 + }, + { + "epoch": 2.0030777122339063, + "grad_norm": 0.7950479388237, + "learning_rate": 0.0006646148585107292, + "loss": 0.4397, + "step": 15620 + }, + { + "epoch": 2.004360092331367, + "grad_norm": 0.970693051815033, + "learning_rate": 0.0006637599384457553, + "loss": 0.4875, + "step": 15630 + }, + { + "epoch": 2.005642472428828, + "grad_norm": 0.5207669138908386, + "learning_rate": 0.0006629050183807815, + "loss": 0.4713, + "step": 15640 + }, + { + "epoch": 2.006924852526289, + "grad_norm": 0.8894481062889099, + "learning_rate": 0.0006620500983158075, + "loss": 0.3892, + "step": 15650 + }, + { + "epoch": 2.0082072326237497, + "grad_norm": 0.9765975475311279, + "learning_rate": 0.0006611951782508335, + "loss": 0.5046, + "step": 15660 + }, + { + "epoch": 2.0094896127212105, + "grad_norm": 0.6186564564704895, + "learning_rate": 0.0006603402581858597, + "loss": 0.435, + "step": 15670 + }, + { + "epoch": 2.0107719928186714, + "grad_norm": 1.0802409648895264, + "learning_rate": 0.0006594853381208857, + "loss": 0.379, + "step": 15680 + }, + { + "epoch": 2.012054372916132, + "grad_norm": 0.519303560256958, + "learning_rate": 0.0006586304180559118, + "loss": 0.3067, + "step": 15690 + }, + { + "epoch": 2.013336753013593, + "grad_norm": 0.6543425917625427, + "learning_rate": 0.0006577754979909379, + "loss": 0.4907, + "step": 15700 + }, + { + "epoch": 2.0146191331110543, + "grad_norm": 1.0013840198516846, + "learning_rate": 0.0006569205779259639, + "loss": 0.3811, + "step": 15710 + }, + { + "epoch": 2.015901513208515, + "grad_norm": 1.0863186120986938, + "learning_rate": 0.00065606565786099, + "loss": 0.4327, + "step": 15720 + }, + { + "epoch": 2.017183893305976, + "grad_norm": 0.8166930079460144, + "learning_rate": 0.000655210737796016, + "loss": 0.4129, + "step": 15730 + }, + { + "epoch": 2.018466273403437, + "grad_norm": 0.8001251220703125, + "learning_rate": 0.0006543558177310422, + "loss": 0.3394, + "step": 15740 + }, + { + "epoch": 2.0197486535008977, + "grad_norm": 1.3382858037948608, + "learning_rate": 0.0006535008976660682, + "loss": 0.4998, + "step": 15750 + }, + { + "epoch": 2.0210310335983586, + "grad_norm": 0.8801462054252625, + "learning_rate": 0.0006526459776010944, + "loss": 0.4464, + "step": 15760 + }, + { + "epoch": 2.0223134136958194, + "grad_norm": 0.940180778503418, + "learning_rate": 0.0006517910575361204, + "loss": 0.4152, + "step": 15770 + }, + { + "epoch": 2.0235957937932803, + "grad_norm": 0.6335304379463196, + "learning_rate": 0.0006509361374711464, + "loss": 0.3804, + "step": 15780 + }, + { + "epoch": 2.024878173890741, + "grad_norm": 0.5638919472694397, + "learning_rate": 0.0006500812174061725, + "loss": 0.4404, + "step": 15790 + }, + { + "epoch": 2.026160553988202, + "grad_norm": 1.3646224737167358, + "learning_rate": 0.0006492262973411987, + "loss": 0.4917, + "step": 15800 + }, + { + "epoch": 2.027442934085663, + "grad_norm": 0.39091867208480835, + "learning_rate": 0.0006483713772762246, + "loss": 0.4169, + "step": 15810 + }, + { + "epoch": 2.028725314183124, + "grad_norm": 1.3595271110534668, + "learning_rate": 0.0006475164572112507, + "loss": 0.4892, + "step": 15820 + }, + { + "epoch": 2.030007694280585, + "grad_norm": 0.7239012718200684, + "learning_rate": 0.0006466615371462769, + "loss": 0.4574, + "step": 15830 + }, + { + "epoch": 2.0312900743780458, + "grad_norm": 1.1888518333435059, + "learning_rate": 0.0006458066170813029, + "loss": 0.3847, + "step": 15840 + }, + { + "epoch": 2.0325724544755066, + "grad_norm": 0.48686522245407104, + "learning_rate": 0.0006449516970163289, + "loss": 0.2997, + "step": 15850 + }, + { + "epoch": 2.0338548345729675, + "grad_norm": 0.963004469871521, + "learning_rate": 0.0006440967769513551, + "loss": 0.4397, + "step": 15860 + }, + { + "epoch": 2.0351372146704283, + "grad_norm": 0.45735833048820496, + "learning_rate": 0.0006432418568863811, + "loss": 0.461, + "step": 15870 + }, + { + "epoch": 2.036419594767889, + "grad_norm": 1.019104242324829, + "learning_rate": 0.0006423869368214072, + "loss": 0.4187, + "step": 15880 + }, + { + "epoch": 2.03770197486535, + "grad_norm": 0.6047408580780029, + "learning_rate": 0.0006415320167564333, + "loss": 0.4521, + "step": 15890 + }, + { + "epoch": 2.038984354962811, + "grad_norm": 1.1490784883499146, + "learning_rate": 0.0006406770966914594, + "loss": 0.378, + "step": 15900 + }, + { + "epoch": 2.0402667350602717, + "grad_norm": 1.2890042066574097, + "learning_rate": 0.0006398221766264854, + "loss": 0.3913, + "step": 15910 + }, + { + "epoch": 2.041549115157733, + "grad_norm": 0.7499234676361084, + "learning_rate": 0.0006389672565615116, + "loss": 0.3937, + "step": 15920 + }, + { + "epoch": 2.042831495255194, + "grad_norm": 0.600645899772644, + "learning_rate": 0.0006381123364965376, + "loss": 0.3343, + "step": 15930 + }, + { + "epoch": 2.0441138753526547, + "grad_norm": 1.029549479484558, + "learning_rate": 0.0006372574164315636, + "loss": 0.4275, + "step": 15940 + }, + { + "epoch": 2.0453962554501155, + "grad_norm": 1.660400629043579, + "learning_rate": 0.0006364024963665898, + "loss": 0.5022, + "step": 15950 + }, + { + "epoch": 2.0466786355475763, + "grad_norm": 0.932639479637146, + "learning_rate": 0.0006355475763016159, + "loss": 0.3318, + "step": 15960 + }, + { + "epoch": 2.047961015645037, + "grad_norm": 0.5352082252502441, + "learning_rate": 0.0006346926562366418, + "loss": 0.459, + "step": 15970 + }, + { + "epoch": 2.049243395742498, + "grad_norm": 0.41553980112075806, + "learning_rate": 0.000633837736171668, + "loss": 0.4066, + "step": 15980 + }, + { + "epoch": 2.050525775839959, + "grad_norm": 0.6084936261177063, + "learning_rate": 0.0006329828161066941, + "loss": 0.3279, + "step": 15990 + }, + { + "epoch": 2.0518081559374197, + "grad_norm": 1.441450834274292, + "learning_rate": 0.0006321278960417201, + "loss": 0.4136, + "step": 16000 + }, + { + "epoch": 2.0530905360348806, + "grad_norm": 0.9884285926818848, + "learning_rate": 0.0006312729759767461, + "loss": 0.3913, + "step": 16010 + }, + { + "epoch": 2.0543729161323414, + "grad_norm": 1.6738002300262451, + "learning_rate": 0.0006304180559117723, + "loss": 0.4175, + "step": 16020 + }, + { + "epoch": 2.0556552962298027, + "grad_norm": 1.0428452491760254, + "learning_rate": 0.0006295631358467984, + "loss": 0.4279, + "step": 16030 + }, + { + "epoch": 2.0569376763272635, + "grad_norm": 1.673563838005066, + "learning_rate": 0.0006287082157818244, + "loss": 0.4554, + "step": 16040 + }, + { + "epoch": 2.0582200564247244, + "grad_norm": 0.5701791048049927, + "learning_rate": 0.0006278532957168505, + "loss": 0.3846, + "step": 16050 + }, + { + "epoch": 2.0595024365221852, + "grad_norm": 0.9378145337104797, + "learning_rate": 0.0006269983756518766, + "loss": 0.4478, + "step": 16060 + }, + { + "epoch": 2.060784816619646, + "grad_norm": 0.7080726623535156, + "learning_rate": 0.0006261434555869026, + "loss": 0.3637, + "step": 16070 + }, + { + "epoch": 2.062067196717107, + "grad_norm": 1.104427456855774, + "learning_rate": 0.0006252885355219287, + "loss": 0.3889, + "step": 16080 + }, + { + "epoch": 2.0633495768145678, + "grad_norm": 0.49368712306022644, + "learning_rate": 0.0006244336154569548, + "loss": 0.3342, + "step": 16090 + }, + { + "epoch": 2.0646319569120286, + "grad_norm": 0.5924476385116577, + "learning_rate": 0.0006235786953919808, + "loss": 0.383, + "step": 16100 + }, + { + "epoch": 2.0659143370094895, + "grad_norm": 0.8648740649223328, + "learning_rate": 0.000622723775327007, + "loss": 0.3354, + "step": 16110 + }, + { + "epoch": 2.0671967171069503, + "grad_norm": 0.9857394695281982, + "learning_rate": 0.000621868855262033, + "loss": 0.2874, + "step": 16120 + }, + { + "epoch": 2.0684790972044116, + "grad_norm": 0.6319371461868286, + "learning_rate": 0.000621013935197059, + "loss": 0.3847, + "step": 16130 + }, + { + "epoch": 2.0697614773018724, + "grad_norm": 1.4830057621002197, + "learning_rate": 0.0006201590151320852, + "loss": 0.3945, + "step": 16140 + }, + { + "epoch": 2.0710438573993333, + "grad_norm": 1.0306016206741333, + "learning_rate": 0.0006193040950671113, + "loss": 0.4081, + "step": 16150 + }, + { + "epoch": 2.072326237496794, + "grad_norm": 0.6749256253242493, + "learning_rate": 0.0006184491750021372, + "loss": 0.314, + "step": 16160 + }, + { + "epoch": 2.073608617594255, + "grad_norm": 0.7656669020652771, + "learning_rate": 0.0006175942549371633, + "loss": 0.3983, + "step": 16170 + }, + { + "epoch": 2.074890997691716, + "grad_norm": 0.7537424564361572, + "learning_rate": 0.0006167393348721895, + "loss": 0.3204, + "step": 16180 + }, + { + "epoch": 2.0761733777891767, + "grad_norm": 0.45361366868019104, + "learning_rate": 0.0006158844148072156, + "loss": 0.4101, + "step": 16190 + }, + { + "epoch": 2.0774557578866375, + "grad_norm": 1.6658540964126587, + "learning_rate": 0.0006150294947422415, + "loss": 0.4625, + "step": 16200 + }, + { + "epoch": 2.0787381379840983, + "grad_norm": 0.9616145491600037, + "learning_rate": 0.0006141745746772677, + "loss": 0.343, + "step": 16210 + }, + { + "epoch": 2.080020518081559, + "grad_norm": 1.1583889722824097, + "learning_rate": 0.0006133196546122938, + "loss": 0.3612, + "step": 16220 + }, + { + "epoch": 2.08130289817902, + "grad_norm": 0.46162256598472595, + "learning_rate": 0.0006124647345473198, + "loss": 0.3976, + "step": 16230 + }, + { + "epoch": 2.0825852782764813, + "grad_norm": 0.5580847859382629, + "learning_rate": 0.0006116098144823459, + "loss": 0.3302, + "step": 16240 + }, + { + "epoch": 2.083867658373942, + "grad_norm": 0.9140333533287048, + "learning_rate": 0.000610754894417372, + "loss": 0.4096, + "step": 16250 + }, + { + "epoch": 2.085150038471403, + "grad_norm": 1.2011090517044067, + "learning_rate": 0.000609899974352398, + "loss": 0.4436, + "step": 16260 + }, + { + "epoch": 2.086432418568864, + "grad_norm": 0.5058355331420898, + "learning_rate": 0.0006090450542874242, + "loss": 0.4824, + "step": 16270 + }, + { + "epoch": 2.0877147986663247, + "grad_norm": 0.9225788712501526, + "learning_rate": 0.0006081901342224502, + "loss": 0.3696, + "step": 16280 + }, + { + "epoch": 2.0889971787637855, + "grad_norm": 0.8377031683921814, + "learning_rate": 0.0006073352141574762, + "loss": 0.4635, + "step": 16290 + }, + { + "epoch": 2.0902795588612464, + "grad_norm": 1.528792142868042, + "learning_rate": 0.0006064802940925024, + "loss": 0.4176, + "step": 16300 + }, + { + "epoch": 2.0915619389587072, + "grad_norm": 0.798425555229187, + "learning_rate": 0.0006056253740275285, + "loss": 0.3938, + "step": 16310 + }, + { + "epoch": 2.092844319056168, + "grad_norm": 0.49224352836608887, + "learning_rate": 0.0006047704539625545, + "loss": 0.3357, + "step": 16320 + }, + { + "epoch": 2.094126699153629, + "grad_norm": 0.5816643238067627, + "learning_rate": 0.0006039155338975806, + "loss": 0.3523, + "step": 16330 + }, + { + "epoch": 2.09540907925109, + "grad_norm": 0.7259325385093689, + "learning_rate": 0.0006030606138326067, + "loss": 0.4355, + "step": 16340 + }, + { + "epoch": 2.096691459348551, + "grad_norm": 0.8192687630653381, + "learning_rate": 0.0006022056937676328, + "loss": 0.4217, + "step": 16350 + }, + { + "epoch": 2.097973839446012, + "grad_norm": 1.0315042734146118, + "learning_rate": 0.0006013507737026588, + "loss": 0.4108, + "step": 16360 + }, + { + "epoch": 2.0992562195434727, + "grad_norm": 0.7295234203338623, + "learning_rate": 0.0006004958536376849, + "loss": 0.3817, + "step": 16370 + }, + { + "epoch": 2.1005385996409336, + "grad_norm": 0.8055635094642639, + "learning_rate": 0.000599640933572711, + "loss": 0.4461, + "step": 16380 + }, + { + "epoch": 2.1018209797383944, + "grad_norm": 0.9838399887084961, + "learning_rate": 0.000598786013507737, + "loss": 0.4033, + "step": 16390 + }, + { + "epoch": 2.1031033598358553, + "grad_norm": 0.9164043068885803, + "learning_rate": 0.0005979310934427631, + "loss": 0.4508, + "step": 16400 + }, + { + "epoch": 2.104385739933316, + "grad_norm": 1.4616590738296509, + "learning_rate": 0.0005970761733777892, + "loss": 0.4832, + "step": 16410 + }, + { + "epoch": 2.105668120030777, + "grad_norm": 0.7076154351234436, + "learning_rate": 0.0005962212533128152, + "loss": 0.3766, + "step": 16420 + }, + { + "epoch": 2.106950500128238, + "grad_norm": 0.9713407754898071, + "learning_rate": 0.0005953663332478413, + "loss": 0.4856, + "step": 16430 + }, + { + "epoch": 2.108232880225699, + "grad_norm": 0.5862424373626709, + "learning_rate": 0.0005945114131828674, + "loss": 0.3552, + "step": 16440 + }, + { + "epoch": 2.10951526032316, + "grad_norm": 1.3312978744506836, + "learning_rate": 0.0005936564931178934, + "loss": 0.4499, + "step": 16450 + }, + { + "epoch": 2.110797640420621, + "grad_norm": 0.790224552154541, + "learning_rate": 0.0005928015730529196, + "loss": 0.4647, + "step": 16460 + }, + { + "epoch": 2.1120800205180816, + "grad_norm": 0.6152584552764893, + "learning_rate": 0.0005919466529879456, + "loss": 0.4688, + "step": 16470 + }, + { + "epoch": 2.1133624006155425, + "grad_norm": 0.586744487285614, + "learning_rate": 0.0005910917329229717, + "loss": 0.3254, + "step": 16480 + }, + { + "epoch": 2.1146447807130033, + "grad_norm": 0.9276888370513916, + "learning_rate": 0.0005902368128579978, + "loss": 0.4067, + "step": 16490 + }, + { + "epoch": 2.115927160810464, + "grad_norm": 0.5232440829277039, + "learning_rate": 0.0005893818927930239, + "loss": 0.2955, + "step": 16500 + }, + { + "epoch": 2.117209540907925, + "grad_norm": 1.1610713005065918, + "learning_rate": 0.0005885269727280499, + "loss": 0.3482, + "step": 16510 + }, + { + "epoch": 2.118491921005386, + "grad_norm": 0.8713477849960327, + "learning_rate": 0.000587672052663076, + "loss": 0.3607, + "step": 16520 + }, + { + "epoch": 2.1197743011028467, + "grad_norm": 1.2299057245254517, + "learning_rate": 0.0005868171325981021, + "loss": 0.489, + "step": 16530 + }, + { + "epoch": 2.121056681200308, + "grad_norm": 0.8449939489364624, + "learning_rate": 0.0005859622125331282, + "loss": 0.4024, + "step": 16540 + }, + { + "epoch": 2.122339061297769, + "grad_norm": 1.0268441438674927, + "learning_rate": 0.0005851072924681542, + "loss": 0.4341, + "step": 16550 + }, + { + "epoch": 2.1236214413952297, + "grad_norm": 0.7868974804878235, + "learning_rate": 0.0005842523724031803, + "loss": 0.3437, + "step": 16560 + }, + { + "epoch": 2.1249038214926905, + "grad_norm": 0.45466360449790955, + "learning_rate": 0.0005833974523382064, + "loss": 0.3059, + "step": 16570 + }, + { + "epoch": 2.1261862015901514, + "grad_norm": 0.604418933391571, + "learning_rate": 0.0005825425322732325, + "loss": 0.4058, + "step": 16580 + }, + { + "epoch": 2.127468581687612, + "grad_norm": 1.0346992015838623, + "learning_rate": 0.0005816876122082585, + "loss": 0.3903, + "step": 16590 + }, + { + "epoch": 2.128750961785073, + "grad_norm": 0.8088748455047607, + "learning_rate": 0.0005808326921432846, + "loss": 0.4438, + "step": 16600 + }, + { + "epoch": 2.130033341882534, + "grad_norm": 1.0457253456115723, + "learning_rate": 0.0005799777720783108, + "loss": 0.444, + "step": 16610 + }, + { + "epoch": 2.1313157219799947, + "grad_norm": 1.0352778434753418, + "learning_rate": 0.0005791228520133368, + "loss": 0.3832, + "step": 16620 + }, + { + "epoch": 2.1325981020774556, + "grad_norm": 0.9149858355522156, + "learning_rate": 0.0005782679319483628, + "loss": 0.3834, + "step": 16630 + }, + { + "epoch": 2.1338804821749164, + "grad_norm": 0.8805481791496277, + "learning_rate": 0.000577413011883389, + "loss": 0.4027, + "step": 16640 + }, + { + "epoch": 2.1351628622723777, + "grad_norm": 1.2850439548492432, + "learning_rate": 0.000576558091818415, + "loss": 0.5138, + "step": 16650 + }, + { + "epoch": 2.1364452423698386, + "grad_norm": 1.2789738178253174, + "learning_rate": 0.0005757031717534411, + "loss": 0.3797, + "step": 16660 + }, + { + "epoch": 2.1377276224672994, + "grad_norm": 1.1163911819458008, + "learning_rate": 0.0005748482516884671, + "loss": 0.4236, + "step": 16670 + }, + { + "epoch": 2.1390100025647603, + "grad_norm": 1.351048469543457, + "learning_rate": 0.0005739933316234932, + "loss": 0.5308, + "step": 16680 + }, + { + "epoch": 2.140292382662221, + "grad_norm": 0.5716691613197327, + "learning_rate": 0.0005731384115585193, + "loss": 0.4708, + "step": 16690 + }, + { + "epoch": 2.141574762759682, + "grad_norm": 0.4324432909488678, + "learning_rate": 0.0005722834914935454, + "loss": 0.4613, + "step": 16700 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 1.034508228302002, + "learning_rate": 0.0005714285714285714, + "loss": 0.4185, + "step": 16710 + }, + { + "epoch": 2.1441395229546036, + "grad_norm": 0.8945391774177551, + "learning_rate": 0.0005705736513635975, + "loss": 0.3495, + "step": 16720 + }, + { + "epoch": 2.1454219030520645, + "grad_norm": 1.2209150791168213, + "learning_rate": 0.0005697187312986236, + "loss": 0.3843, + "step": 16730 + }, + { + "epoch": 2.1467042831495253, + "grad_norm": 0.5386025309562683, + "learning_rate": 0.0005688638112336498, + "loss": 0.4124, + "step": 16740 + }, + { + "epoch": 2.1479866632469866, + "grad_norm": 0.7066617012023926, + "learning_rate": 0.0005680088911686757, + "loss": 0.3361, + "step": 16750 + }, + { + "epoch": 2.1492690433444475, + "grad_norm": 0.8300710916519165, + "learning_rate": 0.0005671539711037018, + "loss": 0.4056, + "step": 16760 + }, + { + "epoch": 2.1505514234419083, + "grad_norm": 1.522805094718933, + "learning_rate": 0.000566299051038728, + "loss": 0.4305, + "step": 16770 + }, + { + "epoch": 2.151833803539369, + "grad_norm": 0.5003076791763306, + "learning_rate": 0.0005654441309737539, + "loss": 0.3918, + "step": 16780 + }, + { + "epoch": 2.15311618363683, + "grad_norm": 1.1118338108062744, + "learning_rate": 0.00056458921090878, + "loss": 0.4591, + "step": 16790 + }, + { + "epoch": 2.154398563734291, + "grad_norm": 0.6473600268363953, + "learning_rate": 0.0005637342908438062, + "loss": 0.3684, + "step": 16800 + }, + { + "epoch": 2.1556809438317517, + "grad_norm": 0.5967467427253723, + "learning_rate": 0.0005628793707788322, + "loss": 0.4068, + "step": 16810 + }, + { + "epoch": 2.1569633239292125, + "grad_norm": 1.1594890356063843, + "learning_rate": 0.0005620244507138582, + "loss": 0.3506, + "step": 16820 + }, + { + "epoch": 2.1582457040266734, + "grad_norm": 0.6853704452514648, + "learning_rate": 0.0005611695306488844, + "loss": 0.4528, + "step": 16830 + }, + { + "epoch": 2.159528084124134, + "grad_norm": 0.7889552116394043, + "learning_rate": 0.0005603146105839104, + "loss": 0.4236, + "step": 16840 + }, + { + "epoch": 2.160810464221595, + "grad_norm": 1.3520945310592651, + "learning_rate": 0.0005594596905189365, + "loss": 0.521, + "step": 16850 + }, + { + "epoch": 2.1620928443190564, + "grad_norm": 1.283141851425171, + "learning_rate": 0.0005586047704539625, + "loss": 0.3847, + "step": 16860 + }, + { + "epoch": 2.163375224416517, + "grad_norm": 0.6394121050834656, + "learning_rate": 0.0005577498503889886, + "loss": 0.4256, + "step": 16870 + }, + { + "epoch": 2.164657604513978, + "grad_norm": 0.9717941880226135, + "learning_rate": 0.0005568949303240147, + "loss": 0.3616, + "step": 16880 + }, + { + "epoch": 2.165939984611439, + "grad_norm": 1.2002935409545898, + "learning_rate": 0.0005560400102590408, + "loss": 0.3632, + "step": 16890 + }, + { + "epoch": 2.1672223647088997, + "grad_norm": 1.209804654121399, + "learning_rate": 0.0005551850901940669, + "loss": 0.3692, + "step": 16900 + }, + { + "epoch": 2.1685047448063606, + "grad_norm": 1.1191928386688232, + "learning_rate": 0.0005543301701290929, + "loss": 0.3921, + "step": 16910 + }, + { + "epoch": 2.1697871249038214, + "grad_norm": 1.0837756395339966, + "learning_rate": 0.000553475250064119, + "loss": 0.3838, + "step": 16920 + }, + { + "epoch": 2.1710695050012823, + "grad_norm": 0.950324296951294, + "learning_rate": 0.0005526203299991452, + "loss": 0.4226, + "step": 16930 + }, + { + "epoch": 2.172351885098743, + "grad_norm": 0.4663751423358917, + "learning_rate": 0.0005517654099341711, + "loss": 0.412, + "step": 16940 + }, + { + "epoch": 2.173634265196204, + "grad_norm": 1.0437508821487427, + "learning_rate": 0.0005509104898691972, + "loss": 0.3732, + "step": 16950 + }, + { + "epoch": 2.1749166452936652, + "grad_norm": 0.5461912155151367, + "learning_rate": 0.0005500555698042234, + "loss": 0.4139, + "step": 16960 + }, + { + "epoch": 2.176199025391126, + "grad_norm": 0.7421871423721313, + "learning_rate": 0.0005492006497392494, + "loss": 0.3798, + "step": 16970 + }, + { + "epoch": 2.177481405488587, + "grad_norm": 0.5213621854782104, + "learning_rate": 0.0005483457296742754, + "loss": 0.4407, + "step": 16980 + }, + { + "epoch": 2.1787637855860478, + "grad_norm": 1.0616869926452637, + "learning_rate": 0.0005474908096093016, + "loss": 0.4209, + "step": 16990 + }, + { + "epoch": 2.1800461656835086, + "grad_norm": 0.861587405204773, + "learning_rate": 0.0005466358895443276, + "loss": 0.3885, + "step": 17000 + }, + { + "epoch": 2.1813285457809695, + "grad_norm": 0.4381602704524994, + "learning_rate": 0.0005457809694793537, + "loss": 0.5041, + "step": 17010 + }, + { + "epoch": 2.1826109258784303, + "grad_norm": 0.8949865102767944, + "learning_rate": 0.0005449260494143798, + "loss": 0.5102, + "step": 17020 + }, + { + "epoch": 2.183893305975891, + "grad_norm": 0.879464328289032, + "learning_rate": 0.0005440711293494058, + "loss": 0.536, + "step": 17030 + }, + { + "epoch": 2.185175686073352, + "grad_norm": 0.9006835222244263, + "learning_rate": 0.0005432162092844319, + "loss": 0.3617, + "step": 17040 + }, + { + "epoch": 2.186458066170813, + "grad_norm": 1.1125102043151855, + "learning_rate": 0.0005423612892194581, + "loss": 0.3968, + "step": 17050 + }, + { + "epoch": 2.1877404462682737, + "grad_norm": 0.6102015376091003, + "learning_rate": 0.0005415063691544841, + "loss": 0.2718, + "step": 17060 + }, + { + "epoch": 2.189022826365735, + "grad_norm": 1.4588760137557983, + "learning_rate": 0.0005406514490895101, + "loss": 0.5102, + "step": 17070 + }, + { + "epoch": 2.190305206463196, + "grad_norm": 1.0599772930145264, + "learning_rate": 0.0005397965290245362, + "loss": 0.3631, + "step": 17080 + }, + { + "epoch": 2.1915875865606567, + "grad_norm": 1.4130918979644775, + "learning_rate": 0.0005389416089595624, + "loss": 0.3659, + "step": 17090 + }, + { + "epoch": 2.1928699666581175, + "grad_norm": 0.6802207231521606, + "learning_rate": 0.0005380866888945883, + "loss": 0.3489, + "step": 17100 + }, + { + "epoch": 2.1941523467555784, + "grad_norm": 0.5897672176361084, + "learning_rate": 0.0005372317688296144, + "loss": 0.3961, + "step": 17110 + }, + { + "epoch": 2.195434726853039, + "grad_norm": 1.033302664756775, + "learning_rate": 0.0005363768487646406, + "loss": 0.4143, + "step": 17120 + }, + { + "epoch": 2.1967171069505, + "grad_norm": 0.8548007011413574, + "learning_rate": 0.0005355219286996665, + "loss": 0.3485, + "step": 17130 + }, + { + "epoch": 2.197999487047961, + "grad_norm": 1.1116507053375244, + "learning_rate": 0.0005346670086346926, + "loss": 0.444, + "step": 17140 + }, + { + "epoch": 2.1992818671454217, + "grad_norm": 1.2781219482421875, + "learning_rate": 0.0005338120885697188, + "loss": 0.4845, + "step": 17150 + }, + { + "epoch": 2.200564247242883, + "grad_norm": 1.2359662055969238, + "learning_rate": 0.0005329571685047448, + "loss": 0.4639, + "step": 17160 + }, + { + "epoch": 2.201846627340344, + "grad_norm": 1.5580798387527466, + "learning_rate": 0.0005321022484397708, + "loss": 0.4575, + "step": 17170 + }, + { + "epoch": 2.2031290074378047, + "grad_norm": 1.4028860330581665, + "learning_rate": 0.000531247328374797, + "loss": 0.324, + "step": 17180 + }, + { + "epoch": 2.2044113875352656, + "grad_norm": 0.6842575669288635, + "learning_rate": 0.0005303924083098231, + "loss": 0.4808, + "step": 17190 + }, + { + "epoch": 2.2056937676327264, + "grad_norm": 1.1696909666061401, + "learning_rate": 0.0005295374882448491, + "loss": 0.5438, + "step": 17200 + }, + { + "epoch": 2.2069761477301872, + "grad_norm": 0.7407712936401367, + "learning_rate": 0.0005286825681798752, + "loss": 0.3779, + "step": 17210 + }, + { + "epoch": 2.208258527827648, + "grad_norm": 1.0011707544326782, + "learning_rate": 0.0005278276481149013, + "loss": 0.5065, + "step": 17220 + }, + { + "epoch": 2.209540907925109, + "grad_norm": 0.871257483959198, + "learning_rate": 0.0005269727280499273, + "loss": 0.3252, + "step": 17230 + }, + { + "epoch": 2.2108232880225698, + "grad_norm": 0.9432925581932068, + "learning_rate": 0.0005261178079849535, + "loss": 0.344, + "step": 17240 + }, + { + "epoch": 2.2121056681200306, + "grad_norm": 0.726510763168335, + "learning_rate": 0.0005252628879199795, + "loss": 0.431, + "step": 17250 + }, + { + "epoch": 2.2133880482174915, + "grad_norm": 0.698881983757019, + "learning_rate": 0.0005244079678550055, + "loss": 0.4012, + "step": 17260 + }, + { + "epoch": 2.2146704283149528, + "grad_norm": 1.3157625198364258, + "learning_rate": 0.0005235530477900317, + "loss": 0.4714, + "step": 17270 + }, + { + "epoch": 2.2159528084124136, + "grad_norm": 1.106425166130066, + "learning_rate": 0.0005226981277250578, + "loss": 0.4249, + "step": 17280 + }, + { + "epoch": 2.2172351885098744, + "grad_norm": 1.1882113218307495, + "learning_rate": 0.0005218432076600837, + "loss": 0.4074, + "step": 17290 + }, + { + "epoch": 2.2185175686073353, + "grad_norm": 1.2039605379104614, + "learning_rate": 0.0005209882875951098, + "loss": 0.5073, + "step": 17300 + }, + { + "epoch": 2.219799948704796, + "grad_norm": 1.7524374723434448, + "learning_rate": 0.000520133367530136, + "loss": 0.4022, + "step": 17310 + }, + { + "epoch": 2.221082328802257, + "grad_norm": 0.8379983901977539, + "learning_rate": 0.0005192784474651621, + "loss": 0.4739, + "step": 17320 + }, + { + "epoch": 2.222364708899718, + "grad_norm": 1.3615164756774902, + "learning_rate": 0.000518423527400188, + "loss": 0.4061, + "step": 17330 + }, + { + "epoch": 2.2236470889971787, + "grad_norm": 1.1694985628128052, + "learning_rate": 0.0005175686073352142, + "loss": 0.3913, + "step": 17340 + }, + { + "epoch": 2.2249294690946395, + "grad_norm": 0.9127678871154785, + "learning_rate": 0.0005167136872702403, + "loss": 0.3678, + "step": 17350 + }, + { + "epoch": 2.2262118491921004, + "grad_norm": 1.2487945556640625, + "learning_rate": 0.0005158587672052663, + "loss": 0.4025, + "step": 17360 + }, + { + "epoch": 2.2274942292895616, + "grad_norm": 0.7297146916389465, + "learning_rate": 0.0005150038471402924, + "loss": 0.3391, + "step": 17370 + }, + { + "epoch": 2.2287766093870225, + "grad_norm": 0.7297811508178711, + "learning_rate": 0.0005141489270753185, + "loss": 0.404, + "step": 17380 + }, + { + "epoch": 2.2300589894844833, + "grad_norm": 1.2100460529327393, + "learning_rate": 0.0005132940070103445, + "loss": 0.5015, + "step": 17390 + }, + { + "epoch": 2.231341369581944, + "grad_norm": 1.232190728187561, + "learning_rate": 0.0005124390869453707, + "loss": 0.4982, + "step": 17400 + }, + { + "epoch": 2.232623749679405, + "grad_norm": 1.462148904800415, + "learning_rate": 0.0005115841668803967, + "loss": 0.4461, + "step": 17410 + }, + { + "epoch": 2.233906129776866, + "grad_norm": 0.9447479844093323, + "learning_rate": 0.0005107292468154227, + "loss": 0.3381, + "step": 17420 + }, + { + "epoch": 2.2351885098743267, + "grad_norm": 1.2533239126205444, + "learning_rate": 0.0005098743267504489, + "loss": 0.3932, + "step": 17430 + }, + { + "epoch": 2.2364708899717876, + "grad_norm": 0.4960061013698578, + "learning_rate": 0.000509019406685475, + "loss": 0.4423, + "step": 17440 + }, + { + "epoch": 2.2377532700692484, + "grad_norm": 1.033347487449646, + "learning_rate": 0.0005081644866205009, + "loss": 0.3857, + "step": 17450 + }, + { + "epoch": 2.2390356501667092, + "grad_norm": 0.45185425877571106, + "learning_rate": 0.000507309566555527, + "loss": 0.3485, + "step": 17460 + }, + { + "epoch": 2.24031803026417, + "grad_norm": 0.7259741425514221, + "learning_rate": 0.0005064546464905532, + "loss": 0.4259, + "step": 17470 + }, + { + "epoch": 2.2416004103616314, + "grad_norm": 1.2143189907073975, + "learning_rate": 0.0005055997264255792, + "loss": 0.439, + "step": 17480 + }, + { + "epoch": 2.2428827904590922, + "grad_norm": 0.7752086520195007, + "learning_rate": 0.0005047448063606052, + "loss": 0.3788, + "step": 17490 + }, + { + "epoch": 2.244165170556553, + "grad_norm": 1.4273003339767456, + "learning_rate": 0.0005038898862956314, + "loss": 0.4544, + "step": 17500 + }, + { + "epoch": 2.245447550654014, + "grad_norm": 0.5938236713409424, + "learning_rate": 0.0005030349662306575, + "loss": 0.3911, + "step": 17510 + }, + { + "epoch": 2.2467299307514748, + "grad_norm": 1.0833735466003418, + "learning_rate": 0.0005021800461656834, + "loss": 0.4827, + "step": 17520 + }, + { + "epoch": 2.2480123108489356, + "grad_norm": 0.9137888550758362, + "learning_rate": 0.0005013251261007096, + "loss": 0.4302, + "step": 17530 + }, + { + "epoch": 2.2492946909463964, + "grad_norm": 1.2359901666641235, + "learning_rate": 0.0005004702060357357, + "loss": 0.4328, + "step": 17540 + }, + { + "epoch": 2.2505770710438573, + "grad_norm": 0.5860967636108398, + "learning_rate": 0.0004996152859707617, + "loss": 0.3769, + "step": 17550 + }, + { + "epoch": 2.251859451141318, + "grad_norm": 0.7964845299720764, + "learning_rate": 0.0004987603659057879, + "loss": 0.4499, + "step": 17560 + }, + { + "epoch": 2.253141831238779, + "grad_norm": 0.6681275367736816, + "learning_rate": 0.0004979054458408139, + "loss": 0.5417, + "step": 17570 + }, + { + "epoch": 2.2544242113362403, + "grad_norm": 0.5192536115646362, + "learning_rate": 0.0004970505257758399, + "loss": 0.3263, + "step": 17580 + }, + { + "epoch": 2.255706591433701, + "grad_norm": 0.7628294229507446, + "learning_rate": 0.0004961956057108661, + "loss": 0.2887, + "step": 17590 + }, + { + "epoch": 2.256988971531162, + "grad_norm": 0.8533459901809692, + "learning_rate": 0.0004953406856458921, + "loss": 0.3149, + "step": 17600 + }, + { + "epoch": 2.258271351628623, + "grad_norm": 0.5388279557228088, + "learning_rate": 0.0004944857655809181, + "loss": 0.4372, + "step": 17610 + }, + { + "epoch": 2.2595537317260836, + "grad_norm": 0.8363872766494751, + "learning_rate": 0.0004936308455159443, + "loss": 0.5383, + "step": 17620 + }, + { + "epoch": 2.2608361118235445, + "grad_norm": 1.2380322217941284, + "learning_rate": 0.0004927759254509703, + "loss": 0.3947, + "step": 17630 + }, + { + "epoch": 2.2621184919210053, + "grad_norm": 0.5750362277030945, + "learning_rate": 0.0004919210053859964, + "loss": 0.3112, + "step": 17640 + }, + { + "epoch": 2.263400872018466, + "grad_norm": 1.3540990352630615, + "learning_rate": 0.0004910660853210225, + "loss": 0.4291, + "step": 17650 + }, + { + "epoch": 2.264683252115927, + "grad_norm": 1.2334551811218262, + "learning_rate": 0.0004902111652560486, + "loss": 0.3534, + "step": 17660 + }, + { + "epoch": 2.265965632213388, + "grad_norm": 1.0018736124038696, + "learning_rate": 0.0004893562451910746, + "loss": 0.3656, + "step": 17670 + }, + { + "epoch": 2.2672480123108487, + "grad_norm": 1.0932631492614746, + "learning_rate": 0.0004885013251261008, + "loss": 0.3613, + "step": 17680 + }, + { + "epoch": 2.26853039240831, + "grad_norm": 0.900193989276886, + "learning_rate": 0.0004876464050611268, + "loss": 0.3352, + "step": 17690 + }, + { + "epoch": 2.269812772505771, + "grad_norm": 0.511600136756897, + "learning_rate": 0.00048679148499615287, + "loss": 0.5532, + "step": 17700 + }, + { + "epoch": 2.2710951526032317, + "grad_norm": 1.1176284551620483, + "learning_rate": 0.00048593656493117895, + "loss": 0.2997, + "step": 17710 + }, + { + "epoch": 2.2723775327006925, + "grad_norm": 1.379473090171814, + "learning_rate": 0.00048508164486620503, + "loss": 0.3758, + "step": 17720 + }, + { + "epoch": 2.2736599127981534, + "grad_norm": 0.7329534888267517, + "learning_rate": 0.0004842267248012311, + "loss": 0.3541, + "step": 17730 + }, + { + "epoch": 2.2749422928956142, + "grad_norm": 1.0883692502975464, + "learning_rate": 0.0004833718047362572, + "loss": 0.4156, + "step": 17740 + }, + { + "epoch": 2.276224672993075, + "grad_norm": 1.1010819673538208, + "learning_rate": 0.0004825168846712832, + "loss": 0.5474, + "step": 17750 + }, + { + "epoch": 2.277507053090536, + "grad_norm": 1.4709731340408325, + "learning_rate": 0.00048166196460630936, + "loss": 0.3586, + "step": 17760 + }, + { + "epoch": 2.2787894331879968, + "grad_norm": 1.0419952869415283, + "learning_rate": 0.0004808070445413354, + "loss": 0.3637, + "step": 17770 + }, + { + "epoch": 2.280071813285458, + "grad_norm": 0.6669880747795105, + "learning_rate": 0.00047995212447636147, + "loss": 0.4534, + "step": 17780 + }, + { + "epoch": 2.281354193382919, + "grad_norm": 0.7150077223777771, + "learning_rate": 0.00047909720441138755, + "loss": 0.3625, + "step": 17790 + }, + { + "epoch": 2.2826365734803797, + "grad_norm": 0.8918224573135376, + "learning_rate": 0.00047824228434641364, + "loss": 0.469, + "step": 17800 + }, + { + "epoch": 2.2839189535778406, + "grad_norm": 1.1246883869171143, + "learning_rate": 0.0004773873642814397, + "loss": 0.3144, + "step": 17810 + }, + { + "epoch": 2.2852013336753014, + "grad_norm": 0.7975451946258545, + "learning_rate": 0.00047653244421646575, + "loss": 0.3007, + "step": 17820 + }, + { + "epoch": 2.2864837137727623, + "grad_norm": 1.3306605815887451, + "learning_rate": 0.00047567752415149183, + "loss": 0.5171, + "step": 17830 + }, + { + "epoch": 2.287766093870223, + "grad_norm": 0.8955139517784119, + "learning_rate": 0.0004748226040865179, + "loss": 0.4836, + "step": 17840 + }, + { + "epoch": 2.289048473967684, + "grad_norm": 1.8671926259994507, + "learning_rate": 0.000473967684021544, + "loss": 0.4806, + "step": 17850 + }, + { + "epoch": 2.290330854065145, + "grad_norm": 0.8943301439285278, + "learning_rate": 0.00047311276395657, + "loss": 0.3481, + "step": 17860 + }, + { + "epoch": 2.2916132341626057, + "grad_norm": 0.938799262046814, + "learning_rate": 0.00047225784389159616, + "loss": 0.3384, + "step": 17870 + }, + { + "epoch": 2.2928956142600665, + "grad_norm": 0.9175413846969604, + "learning_rate": 0.0004714029238266222, + "loss": 0.3455, + "step": 17880 + }, + { + "epoch": 2.2941779943575273, + "grad_norm": 0.8490305542945862, + "learning_rate": 0.0004705480037616483, + "loss": 0.4001, + "step": 17890 + }, + { + "epoch": 2.2954603744549886, + "grad_norm": 0.525170087814331, + "learning_rate": 0.00046969308369667435, + "loss": 0.3755, + "step": 17900 + }, + { + "epoch": 2.2967427545524495, + "grad_norm": 0.45375433564186096, + "learning_rate": 0.00046883816363170043, + "loss": 0.4539, + "step": 17910 + }, + { + "epoch": 2.2980251346499103, + "grad_norm": 0.6057801246643066, + "learning_rate": 0.0004679832435667265, + "loss": 0.3656, + "step": 17920 + }, + { + "epoch": 2.299307514747371, + "grad_norm": 1.6983225345611572, + "learning_rate": 0.0004671283235017526, + "loss": 0.4801, + "step": 17930 + }, + { + "epoch": 2.300589894844832, + "grad_norm": 0.8477333188056946, + "learning_rate": 0.0004662734034367787, + "loss": 0.3411, + "step": 17940 + }, + { + "epoch": 2.301872274942293, + "grad_norm": 1.024043321609497, + "learning_rate": 0.00046541848337180476, + "loss": 0.3007, + "step": 17950 + }, + { + "epoch": 2.3031546550397537, + "grad_norm": 1.2260679006576538, + "learning_rate": 0.0004645635633068308, + "loss": 0.4926, + "step": 17960 + }, + { + "epoch": 2.3044370351372145, + "grad_norm": 0.626004159450531, + "learning_rate": 0.00046370864324185693, + "loss": 0.3906, + "step": 17970 + }, + { + "epoch": 2.3057194152346754, + "grad_norm": 0.8693203330039978, + "learning_rate": 0.00046285372317688296, + "loss": 0.3919, + "step": 17980 + }, + { + "epoch": 2.3070017953321367, + "grad_norm": 0.8525885343551636, + "learning_rate": 0.00046199880311190904, + "loss": 0.4073, + "step": 17990 + }, + { + "epoch": 2.3082841754295975, + "grad_norm": 0.7898913025856018, + "learning_rate": 0.0004611438830469351, + "loss": 0.3701, + "step": 18000 + }, + { + "epoch": 2.3095665555270584, + "grad_norm": 0.6249486804008484, + "learning_rate": 0.0004602889629819612, + "loss": 0.4573, + "step": 18010 + }, + { + "epoch": 2.310848935624519, + "grad_norm": 0.5609285831451416, + "learning_rate": 0.0004594340429169873, + "loss": 0.2935, + "step": 18020 + }, + { + "epoch": 2.31213131572198, + "grad_norm": 0.6433789730072021, + "learning_rate": 0.00045857912285201337, + "loss": 0.386, + "step": 18030 + }, + { + "epoch": 2.313413695819441, + "grad_norm": 1.4051841497421265, + "learning_rate": 0.0004577242027870394, + "loss": 0.4438, + "step": 18040 + }, + { + "epoch": 2.3146960759169017, + "grad_norm": 0.8757970929145813, + "learning_rate": 0.00045686928272206553, + "loss": 0.3941, + "step": 18050 + }, + { + "epoch": 2.3159784560143626, + "grad_norm": 0.6573584675788879, + "learning_rate": 0.00045601436265709156, + "loss": 0.412, + "step": 18060 + }, + { + "epoch": 2.3172608361118234, + "grad_norm": 0.6750732064247131, + "learning_rate": 0.00045515944259211764, + "loss": 0.33, + "step": 18070 + }, + { + "epoch": 2.3185432162092843, + "grad_norm": 0.9263201951980591, + "learning_rate": 0.0004543045225271437, + "loss": 0.4086, + "step": 18080 + }, + { + "epoch": 2.319825596306745, + "grad_norm": 0.9872358441352844, + "learning_rate": 0.0004534496024621698, + "loss": 0.4036, + "step": 18090 + }, + { + "epoch": 2.321107976404206, + "grad_norm": 1.5108319520950317, + "learning_rate": 0.0004525946823971959, + "loss": 0.3106, + "step": 18100 + }, + { + "epoch": 2.3223903565016673, + "grad_norm": 0.9161720871925354, + "learning_rate": 0.00045173976233222197, + "loss": 0.3777, + "step": 18110 + }, + { + "epoch": 2.323672736599128, + "grad_norm": 1.0512194633483887, + "learning_rate": 0.000450884842267248, + "loss": 0.4419, + "step": 18120 + }, + { + "epoch": 2.324955116696589, + "grad_norm": 0.6393684148788452, + "learning_rate": 0.00045002992220227414, + "loss": 0.4628, + "step": 18130 + }, + { + "epoch": 2.32623749679405, + "grad_norm": 0.9643192887306213, + "learning_rate": 0.00044917500213730017, + "loss": 0.4549, + "step": 18140 + }, + { + "epoch": 2.3275198768915106, + "grad_norm": 1.658616542816162, + "learning_rate": 0.00044832008207232625, + "loss": 0.3435, + "step": 18150 + }, + { + "epoch": 2.3288022569889715, + "grad_norm": 0.7164269685745239, + "learning_rate": 0.00044746516200735233, + "loss": 0.2776, + "step": 18160 + }, + { + "epoch": 2.3300846370864323, + "grad_norm": 1.204102873802185, + "learning_rate": 0.00044661024194237836, + "loss": 0.399, + "step": 18170 + }, + { + "epoch": 2.331367017183893, + "grad_norm": 0.719174325466156, + "learning_rate": 0.0004457553218774045, + "loss": 0.3717, + "step": 18180 + }, + { + "epoch": 2.332649397281354, + "grad_norm": 0.8231685757637024, + "learning_rate": 0.0004449004018124305, + "loss": 0.3388, + "step": 18190 + }, + { + "epoch": 2.3339317773788153, + "grad_norm": 0.542766809463501, + "learning_rate": 0.0004440454817474566, + "loss": 0.3687, + "step": 18200 + }, + { + "epoch": 2.335214157476276, + "grad_norm": 0.7932581305503845, + "learning_rate": 0.0004431905616824827, + "loss": 0.4434, + "step": 18210 + }, + { + "epoch": 2.336496537573737, + "grad_norm": 1.064727544784546, + "learning_rate": 0.00044233564161750877, + "loss": 0.4495, + "step": 18220 + }, + { + "epoch": 2.337778917671198, + "grad_norm": 0.7613261342048645, + "learning_rate": 0.00044148072155253485, + "loss": 0.4192, + "step": 18230 + }, + { + "epoch": 2.3390612977686587, + "grad_norm": 1.3468183279037476, + "learning_rate": 0.00044062580148756093, + "loss": 0.454, + "step": 18240 + }, + { + "epoch": 2.3403436778661195, + "grad_norm": 1.017491102218628, + "learning_rate": 0.00043977088142258696, + "loss": 0.3561, + "step": 18250 + }, + { + "epoch": 2.3416260579635804, + "grad_norm": 1.4051862955093384, + "learning_rate": 0.0004389159613576131, + "loss": 0.4603, + "step": 18260 + }, + { + "epoch": 2.342908438061041, + "grad_norm": 0.8021685481071472, + "learning_rate": 0.00043806104129263913, + "loss": 0.3409, + "step": 18270 + }, + { + "epoch": 2.344190818158502, + "grad_norm": 0.889196515083313, + "learning_rate": 0.0004372061212276652, + "loss": 0.3659, + "step": 18280 + }, + { + "epoch": 2.345473198255963, + "grad_norm": 1.0410467386245728, + "learning_rate": 0.0004363512011626913, + "loss": 0.3478, + "step": 18290 + }, + { + "epoch": 2.3467555783534237, + "grad_norm": 0.5652367472648621, + "learning_rate": 0.0004354962810977174, + "loss": 0.3194, + "step": 18300 + }, + { + "epoch": 2.348037958450885, + "grad_norm": 1.7215555906295776, + "learning_rate": 0.00043464136103274346, + "loss": 0.39, + "step": 18310 + }, + { + "epoch": 2.349320338548346, + "grad_norm": 0.96045982837677, + "learning_rate": 0.00043378644096776954, + "loss": 0.3795, + "step": 18320 + }, + { + "epoch": 2.3506027186458067, + "grad_norm": 1.5710773468017578, + "learning_rate": 0.00043293152090279557, + "loss": 0.39, + "step": 18330 + }, + { + "epoch": 2.3518850987432676, + "grad_norm": 1.176043152809143, + "learning_rate": 0.0004320766008378217, + "loss": 0.3328, + "step": 18340 + }, + { + "epoch": 2.3531674788407284, + "grad_norm": 1.4193735122680664, + "learning_rate": 0.00043122168077284773, + "loss": 0.36, + "step": 18350 + }, + { + "epoch": 2.3544498589381893, + "grad_norm": 0.6019266247749329, + "learning_rate": 0.0004303667607078738, + "loss": 0.3065, + "step": 18360 + }, + { + "epoch": 2.35573223903565, + "grad_norm": 0.5137869715690613, + "learning_rate": 0.0004295118406428999, + "loss": 0.4731, + "step": 18370 + }, + { + "epoch": 2.357014619133111, + "grad_norm": 1.5411295890808105, + "learning_rate": 0.000428656920577926, + "loss": 0.4936, + "step": 18380 + }, + { + "epoch": 2.358296999230572, + "grad_norm": 0.8280097842216492, + "learning_rate": 0.00042780200051295206, + "loss": 0.4289, + "step": 18390 + }, + { + "epoch": 2.359579379328033, + "grad_norm": 0.6101049184799194, + "learning_rate": 0.00042694708044797814, + "loss": 0.2947, + "step": 18400 + }, + { + "epoch": 2.360861759425494, + "grad_norm": 1.0666029453277588, + "learning_rate": 0.00042609216038300417, + "loss": 0.3594, + "step": 18410 + }, + { + "epoch": 2.3621441395229548, + "grad_norm": 0.8030332326889038, + "learning_rate": 0.0004252372403180303, + "loss": 0.4313, + "step": 18420 + }, + { + "epoch": 2.3634265196204156, + "grad_norm": 1.3051592111587524, + "learning_rate": 0.00042438232025305634, + "loss": 0.3878, + "step": 18430 + }, + { + "epoch": 2.3647088997178765, + "grad_norm": 0.7515511512756348, + "learning_rate": 0.0004235274001880824, + "loss": 0.4099, + "step": 18440 + }, + { + "epoch": 2.3659912798153373, + "grad_norm": 0.8009350895881653, + "learning_rate": 0.0004226724801231085, + "loss": 0.5788, + "step": 18450 + }, + { + "epoch": 2.367273659912798, + "grad_norm": 0.7808216214179993, + "learning_rate": 0.0004218175600581346, + "loss": 0.3801, + "step": 18460 + }, + { + "epoch": 2.368556040010259, + "grad_norm": 0.9818991422653198, + "learning_rate": 0.00042096263999316067, + "loss": 0.4815, + "step": 18470 + }, + { + "epoch": 2.36983842010772, + "grad_norm": 0.838982343673706, + "learning_rate": 0.00042010771992818675, + "loss": 0.4868, + "step": 18480 + }, + { + "epoch": 2.3711208002051807, + "grad_norm": 1.2091493606567383, + "learning_rate": 0.0004192527998632128, + "loss": 0.4504, + "step": 18490 + }, + { + "epoch": 2.3724031803026415, + "grad_norm": 0.793835461139679, + "learning_rate": 0.0004183978797982389, + "loss": 0.3619, + "step": 18500 + }, + { + "epoch": 2.3736855604001024, + "grad_norm": 0.6502864956855774, + "learning_rate": 0.00041754295973326494, + "loss": 0.313, + "step": 18510 + }, + { + "epoch": 2.3749679404975637, + "grad_norm": 0.6209380626678467, + "learning_rate": 0.000416688039668291, + "loss": 0.466, + "step": 18520 + }, + { + "epoch": 2.3762503205950245, + "grad_norm": 0.6486326456069946, + "learning_rate": 0.0004158331196033171, + "loss": 0.3264, + "step": 18530 + }, + { + "epoch": 2.3775327006924853, + "grad_norm": 1.1120644807815552, + "learning_rate": 0.00041497819953834313, + "loss": 0.4322, + "step": 18540 + }, + { + "epoch": 2.378815080789946, + "grad_norm": 0.805433452129364, + "learning_rate": 0.00041412327947336927, + "loss": 0.3712, + "step": 18550 + }, + { + "epoch": 2.380097460887407, + "grad_norm": 1.1664881706237793, + "learning_rate": 0.0004132683594083953, + "loss": 0.3844, + "step": 18560 + }, + { + "epoch": 2.381379840984868, + "grad_norm": 0.5431153178215027, + "learning_rate": 0.0004124134393434214, + "loss": 0.419, + "step": 18570 + }, + { + "epoch": 2.3826622210823287, + "grad_norm": 1.0935227870941162, + "learning_rate": 0.00041155851927844746, + "loss": 0.3942, + "step": 18580 + }, + { + "epoch": 2.3839446011797896, + "grad_norm": 0.9874739050865173, + "learning_rate": 0.00041070359921347355, + "loss": 0.4566, + "step": 18590 + }, + { + "epoch": 2.3852269812772504, + "grad_norm": 1.6212762594223022, + "learning_rate": 0.00040984867914849963, + "loss": 0.5304, + "step": 18600 + }, + { + "epoch": 2.3865093613747117, + "grad_norm": 0.9659703969955444, + "learning_rate": 0.0004089937590835257, + "loss": 0.3745, + "step": 18610 + }, + { + "epoch": 2.3877917414721725, + "grad_norm": 1.1413301229476929, + "learning_rate": 0.00040813883901855174, + "loss": 0.3498, + "step": 18620 + }, + { + "epoch": 2.3890741215696334, + "grad_norm": 0.9907665848731995, + "learning_rate": 0.0004072839189535779, + "loss": 0.3246, + "step": 18630 + }, + { + "epoch": 2.3903565016670942, + "grad_norm": 1.7018821239471436, + "learning_rate": 0.0004064289988886039, + "loss": 0.4562, + "step": 18640 + }, + { + "epoch": 2.391638881764555, + "grad_norm": 0.7171698808670044, + "learning_rate": 0.00040557407882363, + "loss": 0.3188, + "step": 18650 + }, + { + "epoch": 2.392921261862016, + "grad_norm": 1.6024487018585205, + "learning_rate": 0.00040471915875865607, + "loss": 0.4274, + "step": 18660 + }, + { + "epoch": 2.3942036419594768, + "grad_norm": 0.6559688448905945, + "learning_rate": 0.00040386423869368215, + "loss": 0.3052, + "step": 18670 + }, + { + "epoch": 2.3954860220569376, + "grad_norm": 0.2720082402229309, + "learning_rate": 0.00040300931862870823, + "loss": 0.309, + "step": 18680 + }, + { + "epoch": 2.3967684021543985, + "grad_norm": 1.082115650177002, + "learning_rate": 0.0004021543985637343, + "loss": 0.3961, + "step": 18690 + }, + { + "epoch": 2.3980507822518593, + "grad_norm": 1.2949116230010986, + "learning_rate": 0.00040129947849876034, + "loss": 0.4343, + "step": 18700 + }, + { + "epoch": 2.39933316234932, + "grad_norm": 1.1575446128845215, + "learning_rate": 0.0004004445584337865, + "loss": 0.3872, + "step": 18710 + }, + { + "epoch": 2.400615542446781, + "grad_norm": 1.3714033365249634, + "learning_rate": 0.0003995896383688125, + "loss": 0.403, + "step": 18720 + }, + { + "epoch": 2.4018979225442423, + "grad_norm": 0.7358514070510864, + "learning_rate": 0.0003987347183038386, + "loss": 0.3598, + "step": 18730 + }, + { + "epoch": 2.403180302641703, + "grad_norm": 0.6895415186882019, + "learning_rate": 0.0003978797982388647, + "loss": 0.3644, + "step": 18740 + }, + { + "epoch": 2.404462682739164, + "grad_norm": 0.7910656332969666, + "learning_rate": 0.00039702487817389076, + "loss": 0.3489, + "step": 18750 + }, + { + "epoch": 2.405745062836625, + "grad_norm": 0.6187024712562561, + "learning_rate": 0.00039616995810891684, + "loss": 0.4018, + "step": 18760 + }, + { + "epoch": 2.4070274429340857, + "grad_norm": 1.0988044738769531, + "learning_rate": 0.0003953150380439429, + "loss": 0.4284, + "step": 18770 + }, + { + "epoch": 2.4083098230315465, + "grad_norm": 1.2347112894058228, + "learning_rate": 0.00039446011797896895, + "loss": 0.4418, + "step": 18780 + }, + { + "epoch": 2.4095922031290073, + "grad_norm": 0.756648600101471, + "learning_rate": 0.0003936051979139951, + "loss": 0.2948, + "step": 18790 + }, + { + "epoch": 2.410874583226468, + "grad_norm": 0.7087267637252808, + "learning_rate": 0.0003927502778490211, + "loss": 0.3494, + "step": 18800 + }, + { + "epoch": 2.412156963323929, + "grad_norm": 0.8558051586151123, + "learning_rate": 0.00039189535778404725, + "loss": 0.3838, + "step": 18810 + }, + { + "epoch": 2.4134393434213903, + "grad_norm": 0.669138491153717, + "learning_rate": 0.0003910404377190733, + "loss": 0.3141, + "step": 18820 + }, + { + "epoch": 2.414721723518851, + "grad_norm": 0.7983182072639465, + "learning_rate": 0.00039018551765409936, + "loss": 0.3701, + "step": 18830 + }, + { + "epoch": 2.416004103616312, + "grad_norm": 0.9110289812088013, + "learning_rate": 0.00038933059758912544, + "loss": 0.3892, + "step": 18840 + }, + { + "epoch": 2.417286483713773, + "grad_norm": 0.7137938141822815, + "learning_rate": 0.0003884756775241515, + "loss": 0.4112, + "step": 18850 + }, + { + "epoch": 2.4185688638112337, + "grad_norm": 1.2632485628128052, + "learning_rate": 0.00038762075745917755, + "loss": 0.5279, + "step": 18860 + }, + { + "epoch": 2.4198512439086945, + "grad_norm": 0.7221540212631226, + "learning_rate": 0.00038676583739420364, + "loss": 0.3697, + "step": 18870 + }, + { + "epoch": 2.4211336240061554, + "grad_norm": 0.3167746365070343, + "learning_rate": 0.0003859109173292297, + "loss": 0.2561, + "step": 18880 + }, + { + "epoch": 2.4224160041036162, + "grad_norm": 1.2461453676223755, + "learning_rate": 0.0003850559972642558, + "loss": 0.4454, + "step": 18890 + }, + { + "epoch": 2.423698384201077, + "grad_norm": 1.2429416179656982, + "learning_rate": 0.0003842010771992819, + "loss": 0.3414, + "step": 18900 + }, + { + "epoch": 2.424980764298538, + "grad_norm": 0.8229495882987976, + "learning_rate": 0.0003833461571343079, + "loss": 0.5697, + "step": 18910 + }, + { + "epoch": 2.4262631443959988, + "grad_norm": 1.0524449348449707, + "learning_rate": 0.00038249123706933405, + "loss": 0.4613, + "step": 18920 + }, + { + "epoch": 2.4275455244934596, + "grad_norm": 1.0772918462753296, + "learning_rate": 0.0003816363170043601, + "loss": 0.3401, + "step": 18930 + }, + { + "epoch": 2.428827904590921, + "grad_norm": 1.0349977016448975, + "learning_rate": 0.00038078139693938616, + "loss": 0.4301, + "step": 18940 + }, + { + "epoch": 2.4301102846883817, + "grad_norm": 1.188043236732483, + "learning_rate": 0.00037992647687441224, + "loss": 0.4321, + "step": 18950 + }, + { + "epoch": 2.4313926647858426, + "grad_norm": 0.5111313462257385, + "learning_rate": 0.0003790715568094383, + "loss": 0.406, + "step": 18960 + }, + { + "epoch": 2.4326750448833034, + "grad_norm": 0.7800171375274658, + "learning_rate": 0.0003782166367444644, + "loss": 0.336, + "step": 18970 + }, + { + "epoch": 2.4339574249807643, + "grad_norm": 1.0893301963806152, + "learning_rate": 0.0003773617166794905, + "loss": 0.3267, + "step": 18980 + }, + { + "epoch": 2.435239805078225, + "grad_norm": 1.028470754623413, + "learning_rate": 0.0003765067966145165, + "loss": 0.3977, + "step": 18990 + }, + { + "epoch": 2.436522185175686, + "grad_norm": 1.0852724313735962, + "learning_rate": 0.00037565187654954265, + "loss": 0.3528, + "step": 19000 + }, + { + "epoch": 2.437804565273147, + "grad_norm": 0.8436377644538879, + "learning_rate": 0.0003747969564845687, + "loss": 0.4934, + "step": 19010 + }, + { + "epoch": 2.4390869453706077, + "grad_norm": 0.8028691411018372, + "learning_rate": 0.00037394203641959476, + "loss": 0.2514, + "step": 19020 + }, + { + "epoch": 2.440369325468069, + "grad_norm": 0.6978164911270142, + "learning_rate": 0.00037308711635462084, + "loss": 0.2909, + "step": 19030 + }, + { + "epoch": 2.44165170556553, + "grad_norm": 0.9961578249931335, + "learning_rate": 0.0003722321962896469, + "loss": 0.5097, + "step": 19040 + }, + { + "epoch": 2.4429340856629906, + "grad_norm": 0.8044784069061279, + "learning_rate": 0.000371377276224673, + "loss": 0.3696, + "step": 19050 + }, + { + "epoch": 2.4442164657604515, + "grad_norm": 0.9142523407936096, + "learning_rate": 0.0003705223561596991, + "loss": 0.2975, + "step": 19060 + }, + { + "epoch": 2.4454988458579123, + "grad_norm": 0.6743261814117432, + "learning_rate": 0.0003696674360947251, + "loss": 0.3438, + "step": 19070 + }, + { + "epoch": 2.446781225955373, + "grad_norm": 0.9086779356002808, + "learning_rate": 0.00036881251602975126, + "loss": 0.3507, + "step": 19080 + }, + { + "epoch": 2.448063606052834, + "grad_norm": 0.8643527030944824, + "learning_rate": 0.0003679575959647773, + "loss": 0.4654, + "step": 19090 + }, + { + "epoch": 2.449345986150295, + "grad_norm": 0.6658887267112732, + "learning_rate": 0.0003671026758998034, + "loss": 0.2952, + "step": 19100 + }, + { + "epoch": 2.4506283662477557, + "grad_norm": 1.4154678583145142, + "learning_rate": 0.00036624775583482945, + "loss": 0.4116, + "step": 19110 + }, + { + "epoch": 2.4519107463452166, + "grad_norm": 0.9834240674972534, + "learning_rate": 0.00036539283576985553, + "loss": 0.4582, + "step": 19120 + }, + { + "epoch": 2.4531931264426774, + "grad_norm": 1.1444348096847534, + "learning_rate": 0.0003645379157048816, + "loss": 0.4559, + "step": 19130 + }, + { + "epoch": 2.4544755065401387, + "grad_norm": 1.2544337511062622, + "learning_rate": 0.0003636829956399077, + "loss": 0.3877, + "step": 19140 + }, + { + "epoch": 2.4557578866375995, + "grad_norm": 0.7545201182365417, + "learning_rate": 0.0003628280755749337, + "loss": 0.3751, + "step": 19150 + }, + { + "epoch": 2.4570402667350604, + "grad_norm": 1.476630449295044, + "learning_rate": 0.00036197315550995986, + "loss": 0.5416, + "step": 19160 + }, + { + "epoch": 2.458322646832521, + "grad_norm": 0.867030143737793, + "learning_rate": 0.0003611182354449859, + "loss": 0.3719, + "step": 19170 + }, + { + "epoch": 2.459605026929982, + "grad_norm": 0.511754035949707, + "learning_rate": 0.000360263315380012, + "loss": 0.41, + "step": 19180 + }, + { + "epoch": 2.460887407027443, + "grad_norm": 1.1626338958740234, + "learning_rate": 0.00035940839531503805, + "loss": 0.4129, + "step": 19190 + }, + { + "epoch": 2.4621697871249038, + "grad_norm": 0.35824307799339294, + "learning_rate": 0.00035855347525006414, + "loss": 0.319, + "step": 19200 + }, + { + "epoch": 2.4634521672223646, + "grad_norm": 1.2998716831207275, + "learning_rate": 0.0003576985551850902, + "loss": 0.3915, + "step": 19210 + }, + { + "epoch": 2.4647345473198254, + "grad_norm": 0.6478980183601379, + "learning_rate": 0.00035684363512011625, + "loss": 0.3292, + "step": 19220 + }, + { + "epoch": 2.4660169274172867, + "grad_norm": 1.1961947679519653, + "learning_rate": 0.00035598871505514233, + "loss": 0.4412, + "step": 19230 + }, + { + "epoch": 2.4672993075147476, + "grad_norm": 0.7244174480438232, + "learning_rate": 0.0003551337949901684, + "loss": 0.3171, + "step": 19240 + }, + { + "epoch": 2.4685816876122084, + "grad_norm": 0.6592457294464111, + "learning_rate": 0.0003542788749251945, + "loss": 0.3354, + "step": 19250 + }, + { + "epoch": 2.4698640677096693, + "grad_norm": 0.946502685546875, + "learning_rate": 0.0003534239548602206, + "loss": 0.3961, + "step": 19260 + }, + { + "epoch": 2.47114644780713, + "grad_norm": 0.8770771026611328, + "learning_rate": 0.00035256903479524666, + "loss": 0.4879, + "step": 19270 + }, + { + "epoch": 2.472428827904591, + "grad_norm": 0.7424082159996033, + "learning_rate": 0.0003517141147302727, + "loss": 0.3845, + "step": 19280 + }, + { + "epoch": 2.473711208002052, + "grad_norm": 0.8747217655181885, + "learning_rate": 0.0003508591946652988, + "loss": 0.3393, + "step": 19290 + }, + { + "epoch": 2.4749935880995126, + "grad_norm": 1.3483731746673584, + "learning_rate": 0.00035000427460032485, + "loss": 0.3517, + "step": 19300 + }, + { + "epoch": 2.4762759681969735, + "grad_norm": 0.5340741276741028, + "learning_rate": 0.00034914935453535093, + "loss": 0.4246, + "step": 19310 + }, + { + "epoch": 2.4775583482944343, + "grad_norm": 1.0605217218399048, + "learning_rate": 0.000348294434470377, + "loss": 0.5212, + "step": 19320 + }, + { + "epoch": 2.478840728391895, + "grad_norm": 1.1678279638290405, + "learning_rate": 0.0003474395144054031, + "loss": 0.3448, + "step": 19330 + }, + { + "epoch": 2.480123108489356, + "grad_norm": 1.3842048645019531, + "learning_rate": 0.0003465845943404292, + "loss": 0.4713, + "step": 19340 + }, + { + "epoch": 2.4814054885868173, + "grad_norm": 0.9531245231628418, + "learning_rate": 0.00034572967427545526, + "loss": 0.4228, + "step": 19350 + }, + { + "epoch": 2.482687868684278, + "grad_norm": 1.5676864385604858, + "learning_rate": 0.0003448747542104813, + "loss": 0.397, + "step": 19360 + }, + { + "epoch": 2.483970248781739, + "grad_norm": 0.8071860671043396, + "learning_rate": 0.00034401983414550743, + "loss": 0.3884, + "step": 19370 + }, + { + "epoch": 2.4852526288792, + "grad_norm": 1.1921252012252808, + "learning_rate": 0.00034316491408053346, + "loss": 0.3698, + "step": 19380 + }, + { + "epoch": 2.4865350089766607, + "grad_norm": 0.7575945854187012, + "learning_rate": 0.0003423099940155596, + "loss": 0.4962, + "step": 19390 + }, + { + "epoch": 2.4878173890741215, + "grad_norm": 0.9211723804473877, + "learning_rate": 0.0003414550739505856, + "loss": 0.4712, + "step": 19400 + }, + { + "epoch": 2.4890997691715824, + "grad_norm": 1.3572173118591309, + "learning_rate": 0.0003406001538856117, + "loss": 0.3369, + "step": 19410 + }, + { + "epoch": 2.490382149269043, + "grad_norm": 0.8064128160476685, + "learning_rate": 0.0003397452338206378, + "loss": 0.3977, + "step": 19420 + }, + { + "epoch": 2.491664529366504, + "grad_norm": 0.708720326423645, + "learning_rate": 0.00033889031375566387, + "loss": 0.4332, + "step": 19430 + }, + { + "epoch": 2.4929469094639654, + "grad_norm": 0.34566161036491394, + "learning_rate": 0.0003380353936906899, + "loss": 0.366, + "step": 19440 + }, + { + "epoch": 2.494229289561426, + "grad_norm": 0.815828263759613, + "learning_rate": 0.00033718047362571603, + "loss": 0.4021, + "step": 19450 + }, + { + "epoch": 2.495511669658887, + "grad_norm": 0.8650433421134949, + "learning_rate": 0.00033632555356074206, + "loss": 0.3124, + "step": 19460 + }, + { + "epoch": 2.496794049756348, + "grad_norm": 1.2092469930648804, + "learning_rate": 0.0003354706334957682, + "loss": 0.4188, + "step": 19470 + }, + { + "epoch": 2.4980764298538087, + "grad_norm": 0.8805145025253296, + "learning_rate": 0.0003346157134307942, + "loss": 0.3866, + "step": 19480 + }, + { + "epoch": 2.4993588099512696, + "grad_norm": 0.9097617864608765, + "learning_rate": 0.0003337607933658203, + "loss": 0.3674, + "step": 19490 + }, + { + "epoch": 2.5006411900487304, + "grad_norm": 0.8548180460929871, + "learning_rate": 0.0003329058733008464, + "loss": 0.4066, + "step": 19500 + }, + { + "epoch": 2.5019235701461913, + "grad_norm": 0.5404782295227051, + "learning_rate": 0.00033205095323587247, + "loss": 0.3742, + "step": 19510 + }, + { + "epoch": 2.503205950243652, + "grad_norm": 0.4802301526069641, + "learning_rate": 0.0003311960331708985, + "loss": 0.3414, + "step": 19520 + }, + { + "epoch": 2.504488330341113, + "grad_norm": 0.5459701418876648, + "learning_rate": 0.00033034111310592464, + "loss": 0.3094, + "step": 19530 + }, + { + "epoch": 2.505770710438574, + "grad_norm": 1.0268832445144653, + "learning_rate": 0.00032948619304095067, + "loss": 0.3653, + "step": 19540 + }, + { + "epoch": 2.5070530905360346, + "grad_norm": 1.0585857629776, + "learning_rate": 0.0003286312729759768, + "loss": 0.4984, + "step": 19550 + }, + { + "epoch": 2.508335470633496, + "grad_norm": 0.943658709526062, + "learning_rate": 0.00032777635291100283, + "loss": 0.3321, + "step": 19560 + }, + { + "epoch": 2.509617850730957, + "grad_norm": 1.1988105773925781, + "learning_rate": 0.00032692143284602886, + "loss": 0.3189, + "step": 19570 + }, + { + "epoch": 2.5109002308284176, + "grad_norm": 1.466678261756897, + "learning_rate": 0.000326066512781055, + "loss": 0.4272, + "step": 19580 + }, + { + "epoch": 2.5121826109258785, + "grad_norm": 0.9461327791213989, + "learning_rate": 0.000325211592716081, + "loss": 0.5022, + "step": 19590 + }, + { + "epoch": 2.5134649910233393, + "grad_norm": 0.9493967294692993, + "learning_rate": 0.0003243566726511071, + "loss": 0.2942, + "step": 19600 + }, + { + "epoch": 2.5147473711208, + "grad_norm": 0.6060981154441833, + "learning_rate": 0.0003235017525861332, + "loss": 0.3608, + "step": 19610 + }, + { + "epoch": 2.516029751218261, + "grad_norm": 1.081632137298584, + "learning_rate": 0.00032264683252115927, + "loss": 0.3932, + "step": 19620 + }, + { + "epoch": 2.517312131315722, + "grad_norm": 0.272013396024704, + "learning_rate": 0.00032179191245618535, + "loss": 0.3491, + "step": 19630 + }, + { + "epoch": 2.518594511413183, + "grad_norm": 0.7338408827781677, + "learning_rate": 0.00032093699239121144, + "loss": 0.4811, + "step": 19640 + }, + { + "epoch": 2.519876891510644, + "grad_norm": 0.6062107086181641, + "learning_rate": 0.00032008207232623746, + "loss": 0.3817, + "step": 19650 + }, + { + "epoch": 2.521159271608105, + "grad_norm": 1.2783069610595703, + "learning_rate": 0.0003192271522612636, + "loss": 0.3512, + "step": 19660 + }, + { + "epoch": 2.5224416517055657, + "grad_norm": 1.2621718645095825, + "learning_rate": 0.00031837223219628963, + "loss": 0.416, + "step": 19670 + }, + { + "epoch": 2.5237240318030265, + "grad_norm": 0.6203981637954712, + "learning_rate": 0.00031751731213131576, + "loss": 0.307, + "step": 19680 + }, + { + "epoch": 2.5250064119004874, + "grad_norm": 0.8723649978637695, + "learning_rate": 0.0003166623920663418, + "loss": 0.3647, + "step": 19690 + }, + { + "epoch": 2.526288791997948, + "grad_norm": 0.887333333492279, + "learning_rate": 0.0003158074720013679, + "loss": 0.5004, + "step": 19700 + }, + { + "epoch": 2.527571172095409, + "grad_norm": 0.40670618414878845, + "learning_rate": 0.00031495255193639396, + "loss": 0.3769, + "step": 19710 + }, + { + "epoch": 2.52885355219287, + "grad_norm": 0.5381103157997131, + "learning_rate": 0.00031409763187142004, + "loss": 0.259, + "step": 19720 + }, + { + "epoch": 2.5301359322903307, + "grad_norm": 0.7360714673995972, + "learning_rate": 0.00031324271180644607, + "loss": 0.4182, + "step": 19730 + }, + { + "epoch": 2.5314183123877916, + "grad_norm": 0.8640091419219971, + "learning_rate": 0.0003123877917414722, + "loss": 0.3245, + "step": 19740 + }, + { + "epoch": 2.5327006924852524, + "grad_norm": 0.5540979504585266, + "learning_rate": 0.00031153287167649823, + "loss": 0.3386, + "step": 19750 + }, + { + "epoch": 2.5339830725827133, + "grad_norm": 0.7436388731002808, + "learning_rate": 0.00031067795161152437, + "loss": 0.3423, + "step": 19760 + }, + { + "epoch": 2.5352654526801746, + "grad_norm": 0.657111644744873, + "learning_rate": 0.0003098230315465504, + "loss": 0.5047, + "step": 19770 + }, + { + "epoch": 2.5365478327776354, + "grad_norm": 0.8611753582954407, + "learning_rate": 0.0003089681114815765, + "loss": 0.4797, + "step": 19780 + }, + { + "epoch": 2.5378302128750962, + "grad_norm": 0.834993302822113, + "learning_rate": 0.00030811319141660256, + "loss": 0.3757, + "step": 19790 + }, + { + "epoch": 2.539112592972557, + "grad_norm": 1.335398554801941, + "learning_rate": 0.00030725827135162864, + "loss": 0.4135, + "step": 19800 + }, + { + "epoch": 2.540394973070018, + "grad_norm": 0.5498932600021362, + "learning_rate": 0.00030640335128665467, + "loss": 0.4223, + "step": 19810 + }, + { + "epoch": 2.541677353167479, + "grad_norm": 0.754500150680542, + "learning_rate": 0.0003055484312216808, + "loss": 0.3339, + "step": 19820 + }, + { + "epoch": 2.5429597332649396, + "grad_norm": 1.278773307800293, + "learning_rate": 0.00030469351115670684, + "loss": 0.5497, + "step": 19830 + }, + { + "epoch": 2.5442421133624005, + "grad_norm": 0.549717903137207, + "learning_rate": 0.000303838591091733, + "loss": 0.4507, + "step": 19840 + }, + { + "epoch": 2.5455244934598618, + "grad_norm": 0.7708590626716614, + "learning_rate": 0.000302983671026759, + "loss": 0.363, + "step": 19850 + }, + { + "epoch": 2.5468068735573226, + "grad_norm": 0.4803219437599182, + "learning_rate": 0.0003021287509617851, + "loss": 0.2542, + "step": 19860 + }, + { + "epoch": 2.5480892536547834, + "grad_norm": 0.9697148203849792, + "learning_rate": 0.00030127383089681117, + "loss": 0.3473, + "step": 19870 + }, + { + "epoch": 2.5493716337522443, + "grad_norm": 1.0347312688827515, + "learning_rate": 0.00030041891083183725, + "loss": 0.3943, + "step": 19880 + }, + { + "epoch": 2.550654013849705, + "grad_norm": 0.8918094635009766, + "learning_rate": 0.0002995639907668633, + "loss": 0.295, + "step": 19890 + }, + { + "epoch": 2.551936393947166, + "grad_norm": 0.8626148700714111, + "learning_rate": 0.0002987090707018894, + "loss": 0.3919, + "step": 19900 + }, + { + "epoch": 2.553218774044627, + "grad_norm": 1.0296040773391724, + "learning_rate": 0.00029785415063691544, + "loss": 0.3154, + "step": 19910 + }, + { + "epoch": 2.5545011541420877, + "grad_norm": 0.8652689456939697, + "learning_rate": 0.0002969992305719415, + "loss": 0.4078, + "step": 19920 + }, + { + "epoch": 2.5557835342395485, + "grad_norm": 0.6881958246231079, + "learning_rate": 0.0002961443105069676, + "loss": 0.3029, + "step": 19930 + }, + { + "epoch": 2.5570659143370094, + "grad_norm": 0.627172589302063, + "learning_rate": 0.00029528939044199363, + "loss": 0.3965, + "step": 19940 + }, + { + "epoch": 2.55834829443447, + "grad_norm": 0.9632807970046997, + "learning_rate": 0.00029443447037701977, + "loss": 0.3817, + "step": 19950 + }, + { + "epoch": 2.559630674531931, + "grad_norm": 0.7820205688476562, + "learning_rate": 0.0002935795503120458, + "loss": 0.2984, + "step": 19960 + }, + { + "epoch": 2.560913054629392, + "grad_norm": 0.7165734767913818, + "learning_rate": 0.00029272463024707194, + "loss": 0.3589, + "step": 19970 + }, + { + "epoch": 2.562195434726853, + "grad_norm": 0.43464410305023193, + "learning_rate": 0.00029186971018209796, + "loss": 0.2894, + "step": 19980 + }, + { + "epoch": 2.563477814824314, + "grad_norm": 0.7545332312583923, + "learning_rate": 0.00029101479011712405, + "loss": 0.3861, + "step": 19990 + }, + { + "epoch": 2.564760194921775, + "grad_norm": 0.6315227746963501, + "learning_rate": 0.00029015987005215013, + "loss": 0.3933, + "step": 20000 + }, + { + "epoch": 2.5660425750192357, + "grad_norm": 0.8390231132507324, + "learning_rate": 0.0002893049499871762, + "loss": 0.4576, + "step": 20010 + }, + { + "epoch": 2.5673249551166966, + "grad_norm": 1.075249433517456, + "learning_rate": 0.00028845002992220224, + "loss": 0.394, + "step": 20020 + }, + { + "epoch": 2.5686073352141574, + "grad_norm": 0.9567833542823792, + "learning_rate": 0.0002875951098572284, + "loss": 0.3557, + "step": 20030 + }, + { + "epoch": 2.5698897153116183, + "grad_norm": 1.3885024785995483, + "learning_rate": 0.0002867401897922544, + "loss": 0.367, + "step": 20040 + }, + { + "epoch": 2.571172095409079, + "grad_norm": 0.8868879079818726, + "learning_rate": 0.00028588526972728054, + "loss": 0.3346, + "step": 20050 + }, + { + "epoch": 2.5724544755065404, + "grad_norm": 0.37503331899642944, + "learning_rate": 0.00028503034966230657, + "loss": 0.3142, + "step": 20060 + }, + { + "epoch": 2.5737368556040012, + "grad_norm": 1.0467469692230225, + "learning_rate": 0.00028417542959733265, + "loss": 0.379, + "step": 20070 + }, + { + "epoch": 2.575019235701462, + "grad_norm": 1.1559125185012817, + "learning_rate": 0.00028332050953235873, + "loss": 0.3753, + "step": 20080 + }, + { + "epoch": 2.576301615798923, + "grad_norm": 0.7905515432357788, + "learning_rate": 0.0002824655894673848, + "loss": 0.3534, + "step": 20090 + }, + { + "epoch": 2.5775839958963838, + "grad_norm": 0.44288355112075806, + "learning_rate": 0.00028161066940241084, + "loss": 0.3416, + "step": 20100 + }, + { + "epoch": 2.5788663759938446, + "grad_norm": 0.7493765950202942, + "learning_rate": 0.000280755749337437, + "loss": 0.4072, + "step": 20110 + }, + { + "epoch": 2.5801487560913055, + "grad_norm": 0.42998605966567993, + "learning_rate": 0.000279900829272463, + "loss": 0.317, + "step": 20120 + }, + { + "epoch": 2.5814311361887663, + "grad_norm": 1.049352765083313, + "learning_rate": 0.00027904590920748915, + "loss": 0.4015, + "step": 20130 + }, + { + "epoch": 2.582713516286227, + "grad_norm": 0.5475008487701416, + "learning_rate": 0.0002781909891425152, + "loss": 0.3598, + "step": 20140 + }, + { + "epoch": 2.583995896383688, + "grad_norm": 0.8483502864837646, + "learning_rate": 0.00027733606907754126, + "loss": 0.4564, + "step": 20150 + }, + { + "epoch": 2.585278276481149, + "grad_norm": 1.3677246570587158, + "learning_rate": 0.00027648114901256734, + "loss": 0.4583, + "step": 20160 + }, + { + "epoch": 2.5865606565786097, + "grad_norm": 1.5475443601608276, + "learning_rate": 0.0002756262289475934, + "loss": 0.4106, + "step": 20170 + }, + { + "epoch": 2.5878430366760705, + "grad_norm": 0.5748480558395386, + "learning_rate": 0.00027477130888261945, + "loss": 0.416, + "step": 20180 + }, + { + "epoch": 2.589125416773532, + "grad_norm": 0.9539164304733276, + "learning_rate": 0.0002739163888176456, + "loss": 0.4095, + "step": 20190 + }, + { + "epoch": 2.5904077968709927, + "grad_norm": 0.8380826115608215, + "learning_rate": 0.0002730614687526716, + "loss": 0.404, + "step": 20200 + }, + { + "epoch": 2.5916901769684535, + "grad_norm": 1.1457738876342773, + "learning_rate": 0.00027220654868769775, + "loss": 0.478, + "step": 20210 + }, + { + "epoch": 2.5929725570659143, + "grad_norm": 0.5963801741600037, + "learning_rate": 0.0002713516286227238, + "loss": 0.3425, + "step": 20220 + }, + { + "epoch": 2.594254937163375, + "grad_norm": 1.2528159618377686, + "learning_rate": 0.00027049670855774986, + "loss": 0.4059, + "step": 20230 + }, + { + "epoch": 2.595537317260836, + "grad_norm": 1.1081477403640747, + "learning_rate": 0.00026964178849277594, + "loss": 0.2923, + "step": 20240 + }, + { + "epoch": 2.596819697358297, + "grad_norm": 1.046190857887268, + "learning_rate": 0.000268786868427802, + "loss": 0.3084, + "step": 20250 + }, + { + "epoch": 2.598102077455758, + "grad_norm": 0.7045506238937378, + "learning_rate": 0.0002679319483628281, + "loss": 0.3575, + "step": 20260 + }, + { + "epoch": 2.599384457553219, + "grad_norm": 0.8695869445800781, + "learning_rate": 0.00026707702829785414, + "loss": 0.4175, + "step": 20270 + }, + { + "epoch": 2.60066683765068, + "grad_norm": 0.9905348420143127, + "learning_rate": 0.0002662221082328802, + "loss": 0.4376, + "step": 20280 + }, + { + "epoch": 2.6019492177481407, + "grad_norm": 1.3747539520263672, + "learning_rate": 0.0002653671881679063, + "loss": 0.4679, + "step": 20290 + }, + { + "epoch": 2.6032315978456015, + "grad_norm": 1.023525595664978, + "learning_rate": 0.0002645122681029324, + "loss": 0.4441, + "step": 20300 + }, + { + "epoch": 2.6045139779430624, + "grad_norm": 0.8504759669303894, + "learning_rate": 0.0002636573480379584, + "loss": 0.539, + "step": 20310 + }, + { + "epoch": 2.6057963580405232, + "grad_norm": 0.48631325364112854, + "learning_rate": 0.00026280242797298455, + "loss": 0.5464, + "step": 20320 + }, + { + "epoch": 2.607078738137984, + "grad_norm": 0.42857420444488525, + "learning_rate": 0.0002619475079080106, + "loss": 0.3781, + "step": 20330 + }, + { + "epoch": 2.608361118235445, + "grad_norm": 0.6672760844230652, + "learning_rate": 0.0002610925878430367, + "loss": 0.4347, + "step": 20340 + }, + { + "epoch": 2.6096434983329058, + "grad_norm": 0.5698977112770081, + "learning_rate": 0.00026023766777806274, + "loss": 0.4583, + "step": 20350 + }, + { + "epoch": 2.6109258784303666, + "grad_norm": 1.0976148843765259, + "learning_rate": 0.0002593827477130888, + "loss": 0.3995, + "step": 20360 + }, + { + "epoch": 2.6122082585278275, + "grad_norm": 1.1578220129013062, + "learning_rate": 0.0002585278276481149, + "loss": 0.3532, + "step": 20370 + }, + { + "epoch": 2.6134906386252883, + "grad_norm": 1.0207488536834717, + "learning_rate": 0.000257672907583141, + "loss": 0.4577, + "step": 20380 + }, + { + "epoch": 2.6147730187227496, + "grad_norm": 1.2871861457824707, + "learning_rate": 0.000256817987518167, + "loss": 0.336, + "step": 20390 + }, + { + "epoch": 2.6160553988202104, + "grad_norm": 0.5854607224464417, + "learning_rate": 0.00025596306745319315, + "loss": 0.3488, + "step": 20400 + }, + { + "epoch": 2.6173377789176713, + "grad_norm": 1.5783365964889526, + "learning_rate": 0.0002551081473882192, + "loss": 0.4988, + "step": 20410 + }, + { + "epoch": 2.618620159015132, + "grad_norm": 1.0990679264068604, + "learning_rate": 0.0002542532273232453, + "loss": 0.4439, + "step": 20420 + }, + { + "epoch": 2.619902539112593, + "grad_norm": 0.5611817836761475, + "learning_rate": 0.00025339830725827135, + "loss": 0.2935, + "step": 20430 + }, + { + "epoch": 2.621184919210054, + "grad_norm": 1.3916196823120117, + "learning_rate": 0.00025254338719329743, + "loss": 0.3949, + "step": 20440 + }, + { + "epoch": 2.6224672993075147, + "grad_norm": 0.7436792254447937, + "learning_rate": 0.0002516884671283235, + "loss": 0.444, + "step": 20450 + }, + { + "epoch": 2.6237496794049755, + "grad_norm": 1.4927194118499756, + "learning_rate": 0.0002508335470633496, + "loss": 0.384, + "step": 20460 + }, + { + "epoch": 2.625032059502437, + "grad_norm": 1.047260046005249, + "learning_rate": 0.0002499786269983756, + "loss": 0.3375, + "step": 20470 + }, + { + "epoch": 2.6263144395998976, + "grad_norm": 0.9535210728645325, + "learning_rate": 0.0002491237069334017, + "loss": 0.3326, + "step": 20480 + }, + { + "epoch": 2.6275968196973585, + "grad_norm": 1.1021919250488281, + "learning_rate": 0.0002482687868684278, + "loss": 0.4592, + "step": 20490 + }, + { + "epoch": 2.6288791997948193, + "grad_norm": 0.6787020564079285, + "learning_rate": 0.00024741386680345387, + "loss": 0.3567, + "step": 20500 + }, + { + "epoch": 2.63016157989228, + "grad_norm": 0.5073117017745972, + "learning_rate": 0.00024655894673847995, + "loss": 0.2802, + "step": 20510 + }, + { + "epoch": 2.631443959989741, + "grad_norm": 0.7730292677879333, + "learning_rate": 0.00024570402667350603, + "loss": 0.3604, + "step": 20520 + }, + { + "epoch": 2.632726340087202, + "grad_norm": 1.1327155828475952, + "learning_rate": 0.0002448491066085321, + "loss": 0.4408, + "step": 20530 + }, + { + "epoch": 2.6340087201846627, + "grad_norm": 0.8838372826576233, + "learning_rate": 0.00024399418654355817, + "loss": 0.5678, + "step": 20540 + }, + { + "epoch": 2.6352911002821235, + "grad_norm": 0.5180802345275879, + "learning_rate": 0.00024313926647858425, + "loss": 0.3285, + "step": 20550 + }, + { + "epoch": 2.6365734803795844, + "grad_norm": 0.879054605960846, + "learning_rate": 0.00024228434641361033, + "loss": 0.4184, + "step": 20560 + }, + { + "epoch": 2.6378558604770452, + "grad_norm": 0.9276881814002991, + "learning_rate": 0.00024142942634863642, + "loss": 0.3412, + "step": 20570 + }, + { + "epoch": 2.639138240574506, + "grad_norm": 1.4996106624603271, + "learning_rate": 0.00024057450628366247, + "loss": 0.3899, + "step": 20580 + }, + { + "epoch": 2.640420620671967, + "grad_norm": 1.0205820798873901, + "learning_rate": 0.00023971958621868855, + "loss": 0.3253, + "step": 20590 + }, + { + "epoch": 2.641703000769428, + "grad_norm": 1.2541202306747437, + "learning_rate": 0.00023886466615371464, + "loss": 0.579, + "step": 20600 + }, + { + "epoch": 2.642985380866889, + "grad_norm": 1.1668142080307007, + "learning_rate": 0.00023800974608874072, + "loss": 0.4951, + "step": 20610 + }, + { + "epoch": 2.64426776096435, + "grad_norm": 0.9040181636810303, + "learning_rate": 0.00023715482602376677, + "loss": 0.3967, + "step": 20620 + }, + { + "epoch": 2.6455501410618107, + "grad_norm": 1.3997057676315308, + "learning_rate": 0.00023629990595879286, + "loss": 0.414, + "step": 20630 + }, + { + "epoch": 2.6468325211592716, + "grad_norm": 0.3811419904232025, + "learning_rate": 0.00023544498589381894, + "loss": 0.4728, + "step": 20640 + }, + { + "epoch": 2.6481149012567324, + "grad_norm": 0.7340693473815918, + "learning_rate": 0.00023459006582884502, + "loss": 0.4724, + "step": 20650 + }, + { + "epoch": 2.6493972813541933, + "grad_norm": 0.602635383605957, + "learning_rate": 0.00023373514576387108, + "loss": 0.305, + "step": 20660 + }, + { + "epoch": 2.650679661451654, + "grad_norm": 1.357358694076538, + "learning_rate": 0.00023288022569889716, + "loss": 0.3615, + "step": 20670 + }, + { + "epoch": 2.6519620415491154, + "grad_norm": 1.631966233253479, + "learning_rate": 0.00023202530563392324, + "loss": 0.3992, + "step": 20680 + }, + { + "epoch": 2.6532444216465763, + "grad_norm": 1.3770554065704346, + "learning_rate": 0.00023117038556894932, + "loss": 0.4558, + "step": 20690 + }, + { + "epoch": 2.654526801744037, + "grad_norm": 1.2345914840698242, + "learning_rate": 0.00023031546550397538, + "loss": 0.53, + "step": 20700 + }, + { + "epoch": 2.655809181841498, + "grad_norm": 0.8587237000465393, + "learning_rate": 0.00022946054543900146, + "loss": 0.326, + "step": 20710 + }, + { + "epoch": 2.657091561938959, + "grad_norm": 0.5960670709609985, + "learning_rate": 0.00022860562537402754, + "loss": 0.321, + "step": 20720 + }, + { + "epoch": 2.6583739420364196, + "grad_norm": 0.8732848763465881, + "learning_rate": 0.00022775070530905363, + "loss": 0.3383, + "step": 20730 + }, + { + "epoch": 2.6596563221338805, + "grad_norm": 1.1003626585006714, + "learning_rate": 0.00022689578524407968, + "loss": 0.2627, + "step": 20740 + }, + { + "epoch": 2.6609387022313413, + "grad_norm": 0.8450182676315308, + "learning_rate": 0.00022604086517910576, + "loss": 0.3813, + "step": 20750 + }, + { + "epoch": 2.662221082328802, + "grad_norm": 1.0902912616729736, + "learning_rate": 0.00022518594511413185, + "loss": 0.4519, + "step": 20760 + }, + { + "epoch": 2.663503462426263, + "grad_norm": 0.6427618861198425, + "learning_rate": 0.00022433102504915793, + "loss": 0.355, + "step": 20770 + }, + { + "epoch": 2.664785842523724, + "grad_norm": 1.0089173316955566, + "learning_rate": 0.00022347610498418396, + "loss": 0.316, + "step": 20780 + }, + { + "epoch": 2.6660682226211847, + "grad_norm": 0.8819127082824707, + "learning_rate": 0.00022262118491921004, + "loss": 0.3964, + "step": 20790 + }, + { + "epoch": 2.6673506027186455, + "grad_norm": 1.0088744163513184, + "learning_rate": 0.00022176626485423612, + "loss": 0.4064, + "step": 20800 + }, + { + "epoch": 2.668632982816107, + "grad_norm": 0.45587706565856934, + "learning_rate": 0.0002209113447892622, + "loss": 0.4366, + "step": 20810 + }, + { + "epoch": 2.6699153629135677, + "grad_norm": 0.8636410236358643, + "learning_rate": 0.0002200564247242883, + "loss": 0.2694, + "step": 20820 + }, + { + "epoch": 2.6711977430110285, + "grad_norm": 0.5451250672340393, + "learning_rate": 0.00021920150465931434, + "loss": 0.3359, + "step": 20830 + }, + { + "epoch": 2.6724801231084894, + "grad_norm": 1.1861648559570312, + "learning_rate": 0.00021834658459434042, + "loss": 0.4072, + "step": 20840 + }, + { + "epoch": 2.67376250320595, + "grad_norm": 1.4000024795532227, + "learning_rate": 0.0002174916645293665, + "loss": 0.4205, + "step": 20850 + }, + { + "epoch": 2.675044883303411, + "grad_norm": 0.5738406181335449, + "learning_rate": 0.0002166367444643926, + "loss": 0.5497, + "step": 20860 + }, + { + "epoch": 2.676327263400872, + "grad_norm": 0.49758780002593994, + "learning_rate": 0.00021578182439941864, + "loss": 0.2646, + "step": 20870 + }, + { + "epoch": 2.6776096434983327, + "grad_norm": 0.9785353541374207, + "learning_rate": 0.00021492690433444473, + "loss": 0.4507, + "step": 20880 + }, + { + "epoch": 2.678892023595794, + "grad_norm": 0.9146700501441956, + "learning_rate": 0.0002140719842694708, + "loss": 0.3821, + "step": 20890 + }, + { + "epoch": 2.680174403693255, + "grad_norm": 0.8285348415374756, + "learning_rate": 0.0002132170642044969, + "loss": 0.2993, + "step": 20900 + }, + { + "epoch": 2.6814567837907157, + "grad_norm": 0.9271901845932007, + "learning_rate": 0.00021236214413952295, + "loss": 0.3949, + "step": 20910 + }, + { + "epoch": 2.6827391638881766, + "grad_norm": 0.5074835419654846, + "learning_rate": 0.00021150722407454903, + "loss": 0.2717, + "step": 20920 + }, + { + "epoch": 2.6840215439856374, + "grad_norm": 0.8156689405441284, + "learning_rate": 0.0002106523040095751, + "loss": 0.3361, + "step": 20930 + }, + { + "epoch": 2.6853039240830983, + "grad_norm": 0.5775778293609619, + "learning_rate": 0.0002097973839446012, + "loss": 0.3324, + "step": 20940 + }, + { + "epoch": 2.686586304180559, + "grad_norm": 0.9868631958961487, + "learning_rate": 0.00020894246387962725, + "loss": 0.2793, + "step": 20950 + }, + { + "epoch": 2.68786868427802, + "grad_norm": 0.631433367729187, + "learning_rate": 0.00020808754381465333, + "loss": 0.4849, + "step": 20960 + }, + { + "epoch": 2.689151064375481, + "grad_norm": 0.5689303874969482, + "learning_rate": 0.0002072326237496794, + "loss": 0.3136, + "step": 20970 + }, + { + "epoch": 2.6904334444729416, + "grad_norm": 1.3480650186538696, + "learning_rate": 0.0002063777036847055, + "loss": 0.3781, + "step": 20980 + }, + { + "epoch": 2.6917158245704025, + "grad_norm": 0.9957194328308105, + "learning_rate": 0.00020552278361973155, + "loss": 0.3369, + "step": 20990 + }, + { + "epoch": 2.6929982046678633, + "grad_norm": 1.6954656839370728, + "learning_rate": 0.00020466786355475763, + "loss": 0.4473, + "step": 21000 + }, + { + "epoch": 2.694280584765324, + "grad_norm": 0.7176766991615295, + "learning_rate": 0.00020381294348978372, + "loss": 0.2455, + "step": 21010 + }, + { + "epoch": 2.6955629648627855, + "grad_norm": 1.3188832998275757, + "learning_rate": 0.0002029580234248098, + "loss": 0.4423, + "step": 21020 + }, + { + "epoch": 2.6968453449602463, + "grad_norm": 1.5381674766540527, + "learning_rate": 0.00020210310335983585, + "loss": 0.375, + "step": 21030 + }, + { + "epoch": 2.698127725057707, + "grad_norm": 0.8166912794113159, + "learning_rate": 0.00020124818329486194, + "loss": 0.4334, + "step": 21040 + }, + { + "epoch": 2.699410105155168, + "grad_norm": 1.0441280603408813, + "learning_rate": 0.00020039326322988802, + "loss": 0.3905, + "step": 21050 + }, + { + "epoch": 2.700692485252629, + "grad_norm": 0.7561319470405579, + "learning_rate": 0.0001995383431649141, + "loss": 0.316, + "step": 21060 + }, + { + "epoch": 2.7019748653500897, + "grad_norm": 0.8861315250396729, + "learning_rate": 0.00019868342309994016, + "loss": 0.3531, + "step": 21070 + }, + { + "epoch": 2.7032572454475505, + "grad_norm": 0.887611448764801, + "learning_rate": 0.00019782850303496624, + "loss": 0.4692, + "step": 21080 + }, + { + "epoch": 2.704539625545012, + "grad_norm": 0.985729455947876, + "learning_rate": 0.00019697358296999232, + "loss": 0.4464, + "step": 21090 + }, + { + "epoch": 2.7058220056424727, + "grad_norm": 0.8642510175704956, + "learning_rate": 0.0001961186629050184, + "loss": 0.3553, + "step": 21100 + }, + { + "epoch": 2.7071043857399335, + "grad_norm": 0.7116155028343201, + "learning_rate": 0.00019526374284004449, + "loss": 0.4635, + "step": 21110 + }, + { + "epoch": 2.7083867658373943, + "grad_norm": 0.9211375713348389, + "learning_rate": 0.00019440882277507054, + "loss": 0.356, + "step": 21120 + }, + { + "epoch": 2.709669145934855, + "grad_norm": 0.6295248866081238, + "learning_rate": 0.0001935539027100966, + "loss": 0.3683, + "step": 21130 + }, + { + "epoch": 2.710951526032316, + "grad_norm": 0.596107006072998, + "learning_rate": 0.00019269898264512268, + "loss": 0.274, + "step": 21140 + }, + { + "epoch": 2.712233906129777, + "grad_norm": 1.4957377910614014, + "learning_rate": 0.00019184406258014876, + "loss": 0.3562, + "step": 21150 + }, + { + "epoch": 2.7135162862272377, + "grad_norm": 1.4567288160324097, + "learning_rate": 0.00019098914251517482, + "loss": 0.3553, + "step": 21160 + }, + { + "epoch": 2.7147986663246986, + "grad_norm": 0.44168442487716675, + "learning_rate": 0.0001901342224502009, + "loss": 0.434, + "step": 21170 + }, + { + "epoch": 2.7160810464221594, + "grad_norm": 1.3469419479370117, + "learning_rate": 0.00018927930238522698, + "loss": 0.5778, + "step": 21180 + }, + { + "epoch": 2.7173634265196203, + "grad_norm": 0.3783499300479889, + "learning_rate": 0.00018842438232025306, + "loss": 0.2967, + "step": 21190 + }, + { + "epoch": 2.718645806617081, + "grad_norm": 0.9081128239631653, + "learning_rate": 0.00018756946225527912, + "loss": 0.4134, + "step": 21200 + }, + { + "epoch": 2.719928186714542, + "grad_norm": 1.2152372598648071, + "learning_rate": 0.0001867145421903052, + "loss": 0.4139, + "step": 21210 + }, + { + "epoch": 2.7212105668120032, + "grad_norm": 0.8168225288391113, + "learning_rate": 0.00018585962212533128, + "loss": 0.3853, + "step": 21220 + }, + { + "epoch": 2.722492946909464, + "grad_norm": 0.8900707364082336, + "learning_rate": 0.00018500470206035737, + "loss": 0.3369, + "step": 21230 + }, + { + "epoch": 2.723775327006925, + "grad_norm": 0.8105087280273438, + "learning_rate": 0.00018414978199538342, + "loss": 0.3421, + "step": 21240 + }, + { + "epoch": 2.7250577071043858, + "grad_norm": 1.3624615669250488, + "learning_rate": 0.0001832948619304095, + "loss": 0.4038, + "step": 21250 + }, + { + "epoch": 2.7263400872018466, + "grad_norm": 0.7589739561080933, + "learning_rate": 0.00018243994186543559, + "loss": 0.2604, + "step": 21260 + }, + { + "epoch": 2.7276224672993075, + "grad_norm": 0.9599238038063049, + "learning_rate": 0.00018158502180046167, + "loss": 0.4537, + "step": 21270 + }, + { + "epoch": 2.7289048473967683, + "grad_norm": 0.5657153725624084, + "learning_rate": 0.00018073010173548772, + "loss": 0.4679, + "step": 21280 + }, + { + "epoch": 2.730187227494229, + "grad_norm": 1.2358009815216064, + "learning_rate": 0.0001798751816705138, + "loss": 0.3009, + "step": 21290 + }, + { + "epoch": 2.7314696075916904, + "grad_norm": 0.7661507725715637, + "learning_rate": 0.0001790202616055399, + "loss": 0.3509, + "step": 21300 + }, + { + "epoch": 2.7327519876891513, + "grad_norm": 2.557483673095703, + "learning_rate": 0.00017816534154056597, + "loss": 0.4234, + "step": 21310 + }, + { + "epoch": 2.734034367786612, + "grad_norm": 0.7089506387710571, + "learning_rate": 0.00017731042147559203, + "loss": 0.3774, + "step": 21320 + }, + { + "epoch": 2.735316747884073, + "grad_norm": 1.5683780908584595, + "learning_rate": 0.0001764555014106181, + "loss": 0.3991, + "step": 21330 + }, + { + "epoch": 2.736599127981534, + "grad_norm": 0.6015053987503052, + "learning_rate": 0.0001756005813456442, + "loss": 0.332, + "step": 21340 + }, + { + "epoch": 2.7378815080789947, + "grad_norm": 0.6616173386573792, + "learning_rate": 0.00017474566128067027, + "loss": 0.3991, + "step": 21350 + }, + { + "epoch": 2.7391638881764555, + "grad_norm": 1.1823351383209229, + "learning_rate": 0.00017389074121569633, + "loss": 0.4555, + "step": 21360 + }, + { + "epoch": 2.7404462682739164, + "grad_norm": 0.7915502190589905, + "learning_rate": 0.0001730358211507224, + "loss": 0.3425, + "step": 21370 + }, + { + "epoch": 2.741728648371377, + "grad_norm": 1.186974287033081, + "learning_rate": 0.0001721809010857485, + "loss": 0.5156, + "step": 21380 + }, + { + "epoch": 2.743011028468838, + "grad_norm": 0.8260472416877747, + "learning_rate": 0.00017132598102077457, + "loss": 0.4004, + "step": 21390 + }, + { + "epoch": 2.744293408566299, + "grad_norm": 1.5226585865020752, + "learning_rate": 0.00017047106095580066, + "loss": 0.3825, + "step": 21400 + }, + { + "epoch": 2.7455757886637597, + "grad_norm": 0.7888827919960022, + "learning_rate": 0.0001696161408908267, + "loss": 0.3379, + "step": 21410 + }, + { + "epoch": 2.7468581687612206, + "grad_norm": 1.188528060913086, + "learning_rate": 0.0001687612208258528, + "loss": 0.2501, + "step": 21420 + }, + { + "epoch": 2.748140548858682, + "grad_norm": 1.040313720703125, + "learning_rate": 0.00016790630076087888, + "loss": 0.5706, + "step": 21430 + }, + { + "epoch": 2.7494229289561427, + "grad_norm": 1.1419790983200073, + "learning_rate": 0.00016705138069590496, + "loss": 0.345, + "step": 21440 + }, + { + "epoch": 2.7507053090536036, + "grad_norm": 1.0169458389282227, + "learning_rate": 0.00016619646063093101, + "loss": 0.4079, + "step": 21450 + }, + { + "epoch": 2.7519876891510644, + "grad_norm": 1.201564073562622, + "learning_rate": 0.0001653415405659571, + "loss": 0.2988, + "step": 21460 + }, + { + "epoch": 2.7532700692485252, + "grad_norm": 0.5512075424194336, + "learning_rate": 0.00016448662050098318, + "loss": 0.3651, + "step": 21470 + }, + { + "epoch": 2.754552449345986, + "grad_norm": 1.1715940237045288, + "learning_rate": 0.00016363170043600923, + "loss": 0.3561, + "step": 21480 + }, + { + "epoch": 2.755834829443447, + "grad_norm": 1.5060564279556274, + "learning_rate": 0.0001627767803710353, + "loss": 0.4964, + "step": 21490 + }, + { + "epoch": 2.7571172095409078, + "grad_norm": 1.0363975763320923, + "learning_rate": 0.00016192186030606137, + "loss": 0.4628, + "step": 21500 + }, + { + "epoch": 2.758399589638369, + "grad_norm": 0.6451253890991211, + "learning_rate": 0.00016106694024108745, + "loss": 0.5482, + "step": 21510 + }, + { + "epoch": 2.75968196973583, + "grad_norm": 0.8802538514137268, + "learning_rate": 0.00016021202017611354, + "loss": 0.4662, + "step": 21520 + }, + { + "epoch": 2.7609643498332908, + "grad_norm": 0.6708236336708069, + "learning_rate": 0.0001593571001111396, + "loss": 0.2757, + "step": 21530 + }, + { + "epoch": 2.7622467299307516, + "grad_norm": 0.5467422604560852, + "learning_rate": 0.00015850218004616567, + "loss": 0.4559, + "step": 21540 + }, + { + "epoch": 2.7635291100282124, + "grad_norm": 0.9822036623954773, + "learning_rate": 0.00015764725998119176, + "loss": 0.3517, + "step": 21550 + }, + { + "epoch": 2.7648114901256733, + "grad_norm": 0.6225240230560303, + "learning_rate": 0.00015679233991621784, + "loss": 0.3221, + "step": 21560 + }, + { + "epoch": 2.766093870223134, + "grad_norm": 0.5968758463859558, + "learning_rate": 0.0001559374198512439, + "loss": 0.4548, + "step": 21570 + }, + { + "epoch": 2.767376250320595, + "grad_norm": 0.8913034200668335, + "learning_rate": 0.00015508249978626998, + "loss": 0.4053, + "step": 21580 + }, + { + "epoch": 2.768658630418056, + "grad_norm": 1.6031399965286255, + "learning_rate": 0.00015422757972129606, + "loss": 0.3838, + "step": 21590 + }, + { + "epoch": 2.7699410105155167, + "grad_norm": 0.9392004609107971, + "learning_rate": 0.00015337265965632214, + "loss": 0.3233, + "step": 21600 + }, + { + "epoch": 2.7712233906129775, + "grad_norm": 0.7516948580741882, + "learning_rate": 0.0001525177395913482, + "loss": 0.25, + "step": 21610 + }, + { + "epoch": 2.7725057707104384, + "grad_norm": 0.7983139157295227, + "learning_rate": 0.00015166281952637428, + "loss": 0.3126, + "step": 21620 + }, + { + "epoch": 2.773788150807899, + "grad_norm": 0.7680755853652954, + "learning_rate": 0.00015080789946140036, + "loss": 0.3528, + "step": 21630 + }, + { + "epoch": 2.7750705309053605, + "grad_norm": 0.7174438834190369, + "learning_rate": 0.00014995297939642644, + "loss": 0.4091, + "step": 21640 + }, + { + "epoch": 2.7763529110028213, + "grad_norm": 0.8676108717918396, + "learning_rate": 0.0001490980593314525, + "loss": 0.5095, + "step": 21650 + }, + { + "epoch": 2.777635291100282, + "grad_norm": 0.7086964249610901, + "learning_rate": 0.00014824313926647858, + "loss": 0.2819, + "step": 21660 + }, + { + "epoch": 2.778917671197743, + "grad_norm": 1.6894848346710205, + "learning_rate": 0.00014738821920150466, + "loss": 0.3624, + "step": 21670 + }, + { + "epoch": 2.780200051295204, + "grad_norm": 0.7783902287483215, + "learning_rate": 0.00014653329913653075, + "loss": 0.4304, + "step": 21680 + }, + { + "epoch": 2.7814824313926647, + "grad_norm": 0.7895000576972961, + "learning_rate": 0.00014567837907155683, + "loss": 0.3576, + "step": 21690 + }, + { + "epoch": 2.7827648114901256, + "grad_norm": 0.5636423826217651, + "learning_rate": 0.00014482345900658288, + "loss": 0.3083, + "step": 21700 + }, + { + "epoch": 2.7840471915875864, + "grad_norm": 1.1489410400390625, + "learning_rate": 0.00014396853894160897, + "loss": 0.3091, + "step": 21710 + }, + { + "epoch": 2.7853295716850477, + "grad_norm": 0.59771728515625, + "learning_rate": 0.00014311361887663505, + "loss": 0.4021, + "step": 21720 + }, + { + "epoch": 2.7866119517825085, + "grad_norm": 0.722762405872345, + "learning_rate": 0.00014225869881166113, + "loss": 0.3188, + "step": 21730 + }, + { + "epoch": 2.7878943318799694, + "grad_norm": 0.6990886330604553, + "learning_rate": 0.00014140377874668719, + "loss": 0.3733, + "step": 21740 + }, + { + "epoch": 2.7891767119774302, + "grad_norm": 0.8142735362052917, + "learning_rate": 0.00014054885868171327, + "loss": 0.3454, + "step": 21750 + }, + { + "epoch": 2.790459092074891, + "grad_norm": 1.8750430345535278, + "learning_rate": 0.00013969393861673935, + "loss": 0.501, + "step": 21760 + }, + { + "epoch": 2.791741472172352, + "grad_norm": 0.7295469641685486, + "learning_rate": 0.00013883901855176543, + "loss": 0.3375, + "step": 21770 + }, + { + "epoch": 2.7930238522698128, + "grad_norm": 0.9579476118087769, + "learning_rate": 0.0001379840984867915, + "loss": 0.4572, + "step": 21780 + }, + { + "epoch": 2.7943062323672736, + "grad_norm": 0.9507008790969849, + "learning_rate": 0.00013712917842181757, + "loss": 0.3134, + "step": 21790 + }, + { + "epoch": 2.7955886124647344, + "grad_norm": 1.0686496496200562, + "learning_rate": 0.00013627425835684365, + "loss": 0.365, + "step": 21800 + }, + { + "epoch": 2.7968709925621953, + "grad_norm": 0.6618695855140686, + "learning_rate": 0.00013541933829186974, + "loss": 0.5158, + "step": 21810 + }, + { + "epoch": 2.798153372659656, + "grad_norm": 0.7745763659477234, + "learning_rate": 0.0001345644182268958, + "loss": 0.4521, + "step": 21820 + }, + { + "epoch": 2.799435752757117, + "grad_norm": 0.9630032777786255, + "learning_rate": 0.00013370949816192185, + "loss": 0.383, + "step": 21830 + }, + { + "epoch": 2.800718132854578, + "grad_norm": 0.9685844779014587, + "learning_rate": 0.00013285457809694793, + "loss": 0.351, + "step": 21840 + }, + { + "epoch": 2.802000512952039, + "grad_norm": 1.8922075033187866, + "learning_rate": 0.000131999658031974, + "loss": 0.3359, + "step": 21850 + }, + { + "epoch": 2.8032828930495, + "grad_norm": 1.1599595546722412, + "learning_rate": 0.00013114473796700007, + "loss": 0.5045, + "step": 21860 + }, + { + "epoch": 2.804565273146961, + "grad_norm": 0.761369526386261, + "learning_rate": 0.00013028981790202615, + "loss": 0.3795, + "step": 21870 + }, + { + "epoch": 2.8058476532444216, + "grad_norm": 0.4400559663772583, + "learning_rate": 0.00012943489783705223, + "loss": 0.4146, + "step": 21880 + }, + { + "epoch": 2.8071300333418825, + "grad_norm": 0.6165184378623962, + "learning_rate": 0.0001285799777720783, + "loss": 0.3696, + "step": 21890 + }, + { + "epoch": 2.8084124134393433, + "grad_norm": 1.1559704542160034, + "learning_rate": 0.00012772505770710437, + "loss": 0.3715, + "step": 21900 + }, + { + "epoch": 2.809694793536804, + "grad_norm": 0.7321136593818665, + "learning_rate": 0.00012687013764213045, + "loss": 0.3048, + "step": 21910 + }, + { + "epoch": 2.8109771736342655, + "grad_norm": 0.5283898711204529, + "learning_rate": 0.00012601521757715653, + "loss": 0.2636, + "step": 21920 + }, + { + "epoch": 2.8122595537317263, + "grad_norm": 0.8270158171653748, + "learning_rate": 0.00012516029751218262, + "loss": 0.4423, + "step": 21930 + }, + { + "epoch": 2.813541933829187, + "grad_norm": 0.872068464756012, + "learning_rate": 0.00012430537744720867, + "loss": 0.2926, + "step": 21940 + }, + { + "epoch": 2.814824313926648, + "grad_norm": 1.1108500957489014, + "learning_rate": 0.00012345045738223475, + "loss": 0.349, + "step": 21950 + }, + { + "epoch": 2.816106694024109, + "grad_norm": 1.0009726285934448, + "learning_rate": 0.00012259553731726084, + "loss": 0.3206, + "step": 21960 + }, + { + "epoch": 2.8173890741215697, + "grad_norm": 0.44574859738349915, + "learning_rate": 0.00012174061725228692, + "loss": 0.2925, + "step": 21970 + }, + { + "epoch": 2.8186714542190305, + "grad_norm": 0.8400396704673767, + "learning_rate": 0.00012088569718731299, + "loss": 0.3977, + "step": 21980 + }, + { + "epoch": 2.8199538343164914, + "grad_norm": 0.853813111782074, + "learning_rate": 0.00012003077712233907, + "loss": 0.4258, + "step": 21990 + }, + { + "epoch": 2.8212362144139522, + "grad_norm": 0.6891235709190369, + "learning_rate": 0.00011917585705736514, + "loss": 0.2868, + "step": 22000 + }, + { + "epoch": 2.822518594511413, + "grad_norm": 0.9624373316764832, + "learning_rate": 0.00011832093699239122, + "loss": 0.4167, + "step": 22010 + }, + { + "epoch": 2.823800974608874, + "grad_norm": 0.8667474389076233, + "learning_rate": 0.00011746601692741729, + "loss": 0.3906, + "step": 22020 + }, + { + "epoch": 2.8250833547063348, + "grad_norm": 0.9315304756164551, + "learning_rate": 0.00011661109686244337, + "loss": 0.2763, + "step": 22030 + }, + { + "epoch": 2.8263657348037956, + "grad_norm": 0.48842424154281616, + "learning_rate": 0.00011575617679746944, + "loss": 0.3595, + "step": 22040 + }, + { + "epoch": 2.827648114901257, + "grad_norm": 2.02878737449646, + "learning_rate": 0.00011490125673249552, + "loss": 0.432, + "step": 22050 + }, + { + "epoch": 2.8289304949987177, + "grad_norm": 1.5318242311477661, + "learning_rate": 0.00011404633666752159, + "loss": 0.4374, + "step": 22060 + }, + { + "epoch": 2.8302128750961786, + "grad_norm": 1.2656123638153076, + "learning_rate": 0.00011319141660254767, + "loss": 0.3225, + "step": 22070 + }, + { + "epoch": 2.8314952551936394, + "grad_norm": 1.2422733306884766, + "learning_rate": 0.00011233649653757374, + "loss": 0.4328, + "step": 22080 + }, + { + "epoch": 2.8327776352911003, + "grad_norm": 0.769603967666626, + "learning_rate": 0.00011148157647259981, + "loss": 0.301, + "step": 22090 + }, + { + "epoch": 2.834060015388561, + "grad_norm": 1.1890935897827148, + "learning_rate": 0.00011062665640762588, + "loss": 0.4696, + "step": 22100 + }, + { + "epoch": 2.835342395486022, + "grad_norm": 0.8918318748474121, + "learning_rate": 0.00010977173634265196, + "loss": 0.3997, + "step": 22110 + }, + { + "epoch": 2.836624775583483, + "grad_norm": 0.7001236081123352, + "learning_rate": 0.00010891681627767803, + "loss": 0.4216, + "step": 22120 + }, + { + "epoch": 2.837907155680944, + "grad_norm": 0.84539794921875, + "learning_rate": 0.00010806189621270411, + "loss": 0.3192, + "step": 22130 + }, + { + "epoch": 2.839189535778405, + "grad_norm": 0.9644067287445068, + "learning_rate": 0.00010720697614773018, + "loss": 0.474, + "step": 22140 + }, + { + "epoch": 2.840471915875866, + "grad_norm": 0.9339047074317932, + "learning_rate": 0.00010635205608275626, + "loss": 0.3042, + "step": 22150 + }, + { + "epoch": 2.8417542959733266, + "grad_norm": 0.7227121591567993, + "learning_rate": 0.00010549713601778233, + "loss": 0.2391, + "step": 22160 + }, + { + "epoch": 2.8430366760707875, + "grad_norm": 0.7822548747062683, + "learning_rate": 0.00010464221595280842, + "loss": 0.4902, + "step": 22170 + }, + { + "epoch": 2.8443190561682483, + "grad_norm": 0.9597374200820923, + "learning_rate": 0.00010378729588783448, + "loss": 0.3201, + "step": 22180 + }, + { + "epoch": 2.845601436265709, + "grad_norm": 1.0328844785690308, + "learning_rate": 0.00010293237582286057, + "loss": 0.3916, + "step": 22190 + }, + { + "epoch": 2.84688381636317, + "grad_norm": 0.6888856291770935, + "learning_rate": 0.00010207745575788664, + "loss": 0.415, + "step": 22200 + }, + { + "epoch": 2.848166196460631, + "grad_norm": 1.4465842247009277, + "learning_rate": 0.00010122253569291272, + "loss": 0.3799, + "step": 22210 + }, + { + "epoch": 2.8494485765580917, + "grad_norm": 1.1186655759811401, + "learning_rate": 0.00010036761562793879, + "loss": 0.3732, + "step": 22220 + }, + { + "epoch": 2.8507309566555525, + "grad_norm": 0.5343247056007385, + "learning_rate": 9.951269556296487e-05, + "loss": 0.3953, + "step": 22230 + }, + { + "epoch": 2.8520133367530134, + "grad_norm": 0.5710815191268921, + "learning_rate": 9.865777549799095e-05, + "loss": 0.4029, + "step": 22240 + }, + { + "epoch": 2.8532957168504742, + "grad_norm": 1.0526983737945557, + "learning_rate": 9.780285543301702e-05, + "loss": 0.4013, + "step": 22250 + }, + { + "epoch": 2.8545780969479355, + "grad_norm": 0.9180122017860413, + "learning_rate": 9.694793536804309e-05, + "loss": 0.3656, + "step": 22260 + }, + { + "epoch": 2.8558604770453964, + "grad_norm": 0.5228607654571533, + "learning_rate": 9.609301530306916e-05, + "loss": 0.388, + "step": 22270 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.7112893462181091, + "learning_rate": 9.523809523809524e-05, + "loss": 0.2866, + "step": 22280 + }, + { + "epoch": 2.858425237240318, + "grad_norm": 1.2582242488861084, + "learning_rate": 9.438317517312131e-05, + "loss": 0.3768, + "step": 22290 + }, + { + "epoch": 2.859707617337779, + "grad_norm": 0.9449999332427979, + "learning_rate": 9.352825510814739e-05, + "loss": 0.4034, + "step": 22300 + }, + { + "epoch": 2.8609899974352397, + "grad_norm": 0.7868074774742126, + "learning_rate": 9.267333504317346e-05, + "loss": 0.4197, + "step": 22310 + }, + { + "epoch": 2.8622723775327006, + "grad_norm": 0.5401546359062195, + "learning_rate": 9.181841497819954e-05, + "loss": 0.3198, + "step": 22320 + }, + { + "epoch": 2.8635547576301614, + "grad_norm": 1.1672154664993286, + "learning_rate": 9.096349491322561e-05, + "loss": 0.3383, + "step": 22330 + }, + { + "epoch": 2.8648371377276227, + "grad_norm": 0.43170639872550964, + "learning_rate": 9.01085748482517e-05, + "loss": 0.3007, + "step": 22340 + }, + { + "epoch": 2.8661195178250836, + "grad_norm": 1.1403450965881348, + "learning_rate": 8.925365478327776e-05, + "loss": 0.3279, + "step": 22350 + }, + { + "epoch": 2.8674018979225444, + "grad_norm": 1.2685964107513428, + "learning_rate": 8.839873471830385e-05, + "loss": 0.5152, + "step": 22360 + }, + { + "epoch": 2.8686842780200053, + "grad_norm": 0.43280231952667236, + "learning_rate": 8.754381465332991e-05, + "loss": 0.3793, + "step": 22370 + }, + { + "epoch": 2.869966658117466, + "grad_norm": 0.7950090169906616, + "learning_rate": 8.6688894588356e-05, + "loss": 0.4339, + "step": 22380 + }, + { + "epoch": 2.871249038214927, + "grad_norm": 0.9394015669822693, + "learning_rate": 8.583397452338207e-05, + "loss": 0.4288, + "step": 22390 + }, + { + "epoch": 2.872531418312388, + "grad_norm": 1.5615211725234985, + "learning_rate": 8.497905445840815e-05, + "loss": 0.5027, + "step": 22400 + }, + { + "epoch": 2.8738137984098486, + "grad_norm": 0.9067406058311462, + "learning_rate": 8.412413439343422e-05, + "loss": 0.4342, + "step": 22410 + }, + { + "epoch": 2.8750961785073095, + "grad_norm": 1.3683377504348755, + "learning_rate": 8.32692143284603e-05, + "loss": 0.3952, + "step": 22420 + }, + { + "epoch": 2.8763785586047703, + "grad_norm": 0.5947908163070679, + "learning_rate": 8.241429426348637e-05, + "loss": 0.3272, + "step": 22430 + }, + { + "epoch": 2.877660938702231, + "grad_norm": 0.5604143142700195, + "learning_rate": 8.155937419851244e-05, + "loss": 0.3151, + "step": 22440 + }, + { + "epoch": 2.878943318799692, + "grad_norm": 0.4945407509803772, + "learning_rate": 8.07044541335385e-05, + "loss": 0.3728, + "step": 22450 + }, + { + "epoch": 2.880225698897153, + "grad_norm": 1.287941336631775, + "learning_rate": 7.984953406856459e-05, + "loss": 0.2752, + "step": 22460 + }, + { + "epoch": 2.881508078994614, + "grad_norm": 0.7874084115028381, + "learning_rate": 7.899461400359066e-05, + "loss": 0.4383, + "step": 22470 + }, + { + "epoch": 2.882790459092075, + "grad_norm": 0.8812971115112305, + "learning_rate": 7.813969393861674e-05, + "loss": 0.4036, + "step": 22480 + }, + { + "epoch": 2.884072839189536, + "grad_norm": 0.5514728426933289, + "learning_rate": 7.728477387364281e-05, + "loss": 0.2882, + "step": 22490 + }, + { + "epoch": 2.8853552192869967, + "grad_norm": 0.7565945386886597, + "learning_rate": 7.642985380866889e-05, + "loss": 0.3155, + "step": 22500 + } + ], + "logging_steps": 10, + "max_steps": 23394, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1628484544661760.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}