{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 531.25, "eval_steps": 500, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.625, "grad_norm": 0.9679650664329529, "learning_rate": 1.8e-07, "loss": 1.4147, "step": 10 }, { "epoch": 1.25, "grad_norm": 0.9715470671653748, "learning_rate": 3.8e-07, "loss": 1.4102, "step": 20 }, { "epoch": 1.875, "grad_norm": 0.9312183260917664, "learning_rate": 5.8e-07, "loss": 1.4117, "step": 30 }, { "epoch": 2.5, "grad_norm": 0.9487130641937256, "learning_rate": 7.8e-07, "loss": 1.4086, "step": 40 }, { "epoch": 3.125, "grad_norm": 0.8825913071632385, "learning_rate": 9.8e-07, "loss": 1.3962, "step": 50 }, { "epoch": 3.75, "grad_norm": 0.8918140530586243, "learning_rate": 1.18e-06, "loss": 1.3851, "step": 60 }, { "epoch": 4.375, "grad_norm": 0.8738917708396912, "learning_rate": 1.3800000000000001e-06, "loss": 1.3686, "step": 70 }, { "epoch": 5.0, "grad_norm": 0.9906870126724243, "learning_rate": 1.5800000000000003e-06, "loss": 1.3532, "step": 80 }, { "epoch": 5.625, "grad_norm": 1.1455429792404175, "learning_rate": 1.7800000000000001e-06, "loss": 1.3182, "step": 90 }, { "epoch": 6.25, "grad_norm": 1.236258864402771, "learning_rate": 1.98e-06, "loss": 1.2925, "step": 100 }, { "epoch": 6.875, "grad_norm": 1.327654242515564, "learning_rate": 2.1800000000000003e-06, "loss": 1.258, "step": 110 }, { "epoch": 7.5, "grad_norm": 1.2924314737319946, "learning_rate": 2.38e-06, "loss": 1.2277, "step": 120 }, { "epoch": 8.125, "grad_norm": 1.167385220527649, "learning_rate": 2.5800000000000003e-06, "loss": 1.1915, "step": 130 }, { "epoch": 8.75, "grad_norm": 0.9448131322860718, "learning_rate": 2.78e-06, "loss": 1.1615, "step": 140 }, { "epoch": 9.375, "grad_norm": 0.6841535568237305, "learning_rate": 2.9800000000000003e-06, "loss": 1.1314, "step": 150 }, { "epoch": 10.0, "grad_norm": 0.41633787751197815, "learning_rate": 3.1800000000000005e-06, "loss": 1.1138, "step": 160 }, { "epoch": 10.625, "grad_norm": 0.3072815537452698, "learning_rate": 3.38e-06, "loss": 1.1009, "step": 170 }, { "epoch": 11.25, "grad_norm": 0.28385525941848755, "learning_rate": 3.58e-06, "loss": 1.0909, "step": 180 }, { "epoch": 11.875, "grad_norm": 0.26593610644340515, "learning_rate": 3.7800000000000002e-06, "loss": 1.0844, "step": 190 }, { "epoch": 12.5, "grad_norm": 0.2532358169555664, "learning_rate": 3.98e-06, "loss": 1.0767, "step": 200 }, { "epoch": 13.125, "grad_norm": 0.2549743354320526, "learning_rate": 4.18e-06, "loss": 1.0693, "step": 210 }, { "epoch": 13.75, "grad_norm": 0.22734442353248596, "learning_rate": 4.38e-06, "loss": 1.0659, "step": 220 }, { "epoch": 14.375, "grad_norm": 0.22863343358039856, "learning_rate": 4.58e-06, "loss": 1.0584, "step": 230 }, { "epoch": 15.0, "grad_norm": 0.22417008876800537, "learning_rate": 4.780000000000001e-06, "loss": 1.0538, "step": 240 }, { "epoch": 15.625, "grad_norm": 0.20044955611228943, "learning_rate": 4.98e-06, "loss": 1.0501, "step": 250 }, { "epoch": 16.25, "grad_norm": 0.20292679965496063, "learning_rate": 5.18e-06, "loss": 1.0471, "step": 260 }, { "epoch": 16.875, "grad_norm": 0.18847720324993134, "learning_rate": 5.38e-06, "loss": 1.0408, "step": 270 }, { "epoch": 17.5, "grad_norm": 0.16966596245765686, "learning_rate": 5.580000000000001e-06, "loss": 1.0372, "step": 280 }, { "epoch": 18.125, "grad_norm": 0.189141184091568, "learning_rate": 5.78e-06, "loss": 1.0352, "step": 290 }, { "epoch": 18.75, "grad_norm": 0.18442021310329437, "learning_rate": 5.98e-06, "loss": 1.0313, "step": 300 }, { "epoch": 19.375, "grad_norm": 0.1687787026166916, "learning_rate": 6.18e-06, "loss": 1.0297, "step": 310 }, { "epoch": 20.0, "grad_norm": 0.20706340670585632, "learning_rate": 6.38e-06, "loss": 1.0266, "step": 320 }, { "epoch": 20.625, "grad_norm": 0.17096994817256927, "learning_rate": 6.58e-06, "loss": 1.0222, "step": 330 }, { "epoch": 21.25, "grad_norm": 0.18136513233184814, "learning_rate": 6.78e-06, "loss": 1.0181, "step": 340 }, { "epoch": 21.875, "grad_norm": 0.18053822219371796, "learning_rate": 6.98e-06, "loss": 1.0154, "step": 350 }, { "epoch": 22.5, "grad_norm": 0.2154332548379898, "learning_rate": 7.180000000000001e-06, "loss": 1.0116, "step": 360 }, { "epoch": 23.125, "grad_norm": 0.23920832574367523, "learning_rate": 7.3800000000000005e-06, "loss": 1.0056, "step": 370 }, { "epoch": 23.75, "grad_norm": 0.24692080914974213, "learning_rate": 7.580000000000001e-06, "loss": 0.9982, "step": 380 }, { "epoch": 24.375, "grad_norm": 0.32462307810783386, "learning_rate": 7.78e-06, "loss": 0.988, "step": 390 }, { "epoch": 25.0, "grad_norm": 0.3690284788608551, "learning_rate": 7.98e-06, "loss": 0.9721, "step": 400 }, { "epoch": 25.625, "grad_norm": 0.44107159972190857, "learning_rate": 8.18e-06, "loss": 0.9429, "step": 410 }, { "epoch": 26.25, "grad_norm": 0.5854696035385132, "learning_rate": 8.380000000000001e-06, "loss": 0.8986, "step": 420 }, { "epoch": 26.875, "grad_norm": 0.7961719632148743, "learning_rate": 8.580000000000001e-06, "loss": 0.8538, "step": 430 }, { "epoch": 27.5, "grad_norm": 0.9634988307952881, "learning_rate": 8.78e-06, "loss": 0.8043, "step": 440 }, { "epoch": 28.125, "grad_norm": 0.8321980834007263, "learning_rate": 8.98e-06, "loss": 0.7597, "step": 450 }, { "epoch": 28.75, "grad_norm": 0.784283459186554, "learning_rate": 9.180000000000002e-06, "loss": 0.7166, "step": 460 }, { "epoch": 29.375, "grad_norm": 0.8551204800605774, "learning_rate": 9.38e-06, "loss": 0.6809, "step": 470 }, { "epoch": 30.0, "grad_norm": 0.774193525314331, "learning_rate": 9.58e-06, "loss": 0.6511, "step": 480 }, { "epoch": 30.625, "grad_norm": 0.8994619846343994, "learning_rate": 9.78e-06, "loss": 0.6267, "step": 490 }, { "epoch": 31.25, "grad_norm": 1.0884357690811157, "learning_rate": 9.980000000000001e-06, "loss": 0.6061, "step": 500 }, { "epoch": 31.875, "grad_norm": 1.0175591707229614, "learning_rate": 1.018e-05, "loss": 0.5858, "step": 510 }, { "epoch": 32.5, "grad_norm": 1.1407521963119507, "learning_rate": 1.038e-05, "loss": 0.5714, "step": 520 }, { "epoch": 33.125, "grad_norm": 1.042667269706726, "learning_rate": 1.058e-05, "loss": 0.5586, "step": 530 }, { "epoch": 33.75, "grad_norm": 0.9843167662620544, "learning_rate": 1.0780000000000002e-05, "loss": 0.5492, "step": 540 }, { "epoch": 34.375, "grad_norm": 1.0415880680084229, "learning_rate": 1.098e-05, "loss": 0.5431, "step": 550 }, { "epoch": 35.0, "grad_norm": 1.1198561191558838, "learning_rate": 1.118e-05, "loss": 0.5375, "step": 560 }, { "epoch": 35.625, "grad_norm": 1.2082597017288208, "learning_rate": 1.1380000000000001e-05, "loss": 0.5338, "step": 570 }, { "epoch": 36.25, "grad_norm": 1.2071870565414429, "learning_rate": 1.1580000000000001e-05, "loss": 0.5298, "step": 580 }, { "epoch": 36.875, "grad_norm": 1.3737869262695312, "learning_rate": 1.178e-05, "loss": 0.5287, "step": 590 }, { "epoch": 37.5, "grad_norm": 1.0583442449569702, "learning_rate": 1.198e-05, "loss": 0.5243, "step": 600 }, { "epoch": 38.125, "grad_norm": 1.0685936212539673, "learning_rate": 1.2180000000000002e-05, "loss": 0.5229, "step": 610 }, { "epoch": 38.75, "grad_norm": 1.2425378561019897, "learning_rate": 1.238e-05, "loss": 0.5215, "step": 620 }, { "epoch": 39.375, "grad_norm": 1.1828131675720215, "learning_rate": 1.258e-05, "loss": 0.5194, "step": 630 }, { "epoch": 40.0, "grad_norm": 1.0766721963882446, "learning_rate": 1.278e-05, "loss": 0.519, "step": 640 }, { "epoch": 40.625, "grad_norm": 1.0481817722320557, "learning_rate": 1.2980000000000001e-05, "loss": 0.5154, "step": 650 }, { "epoch": 41.25, "grad_norm": 1.0572658777236938, "learning_rate": 1.3180000000000001e-05, "loss": 0.5159, "step": 660 }, { "epoch": 41.875, "grad_norm": 1.203803539276123, "learning_rate": 1.338e-05, "loss": 0.5121, "step": 670 }, { "epoch": 42.5, "grad_norm": 0.872924268245697, "learning_rate": 1.358e-05, "loss": 0.512, "step": 680 }, { "epoch": 43.125, "grad_norm": 1.162695050239563, "learning_rate": 1.3780000000000002e-05, "loss": 0.5064, "step": 690 }, { "epoch": 43.75, "grad_norm": 1.1628501415252686, "learning_rate": 1.3980000000000002e-05, "loss": 0.5082, "step": 700 }, { "epoch": 44.375, "grad_norm": 1.217319369316101, "learning_rate": 1.4180000000000001e-05, "loss": 0.5067, "step": 710 }, { "epoch": 45.0, "grad_norm": 1.1384787559509277, "learning_rate": 1.4380000000000001e-05, "loss": 0.506, "step": 720 }, { "epoch": 45.625, "grad_norm": 1.2870819568634033, "learning_rate": 1.4580000000000003e-05, "loss": 0.505, "step": 730 }, { "epoch": 46.25, "grad_norm": 1.1514242887496948, "learning_rate": 1.4779999999999999e-05, "loss": 0.5064, "step": 740 }, { "epoch": 46.875, "grad_norm": 1.144319772720337, "learning_rate": 1.4979999999999999e-05, "loss": 0.5017, "step": 750 }, { "epoch": 47.5, "grad_norm": 1.22275710105896, "learning_rate": 1.518e-05, "loss": 0.5013, "step": 760 }, { "epoch": 48.125, "grad_norm": 1.118972659111023, "learning_rate": 1.538e-05, "loss": 0.4997, "step": 770 }, { "epoch": 48.75, "grad_norm": 1.051975965499878, "learning_rate": 1.558e-05, "loss": 0.5008, "step": 780 }, { "epoch": 49.375, "grad_norm": 1.4525421857833862, "learning_rate": 1.578e-05, "loss": 0.4996, "step": 790 }, { "epoch": 50.0, "grad_norm": 1.183190941810608, "learning_rate": 1.598e-05, "loss": 0.4963, "step": 800 }, { "epoch": 50.625, "grad_norm": 1.2612169981002808, "learning_rate": 1.618e-05, "loss": 0.4974, "step": 810 }, { "epoch": 51.25, "grad_norm": 0.9744483828544617, "learning_rate": 1.6380000000000002e-05, "loss": 0.4954, "step": 820 }, { "epoch": 51.875, "grad_norm": 1.1227869987487793, "learning_rate": 1.658e-05, "loss": 0.4959, "step": 830 }, { "epoch": 52.5, "grad_norm": 1.043039321899414, "learning_rate": 1.6780000000000002e-05, "loss": 0.4927, "step": 840 }, { "epoch": 53.125, "grad_norm": 1.2920984029769897, "learning_rate": 1.698e-05, "loss": 0.4918, "step": 850 }, { "epoch": 53.75, "grad_norm": 1.2774907350540161, "learning_rate": 1.718e-05, "loss": 0.4907, "step": 860 }, { "epoch": 54.375, "grad_norm": 1.1010966300964355, "learning_rate": 1.7380000000000003e-05, "loss": 0.4914, "step": 870 }, { "epoch": 55.0, "grad_norm": 1.0284886360168457, "learning_rate": 1.758e-05, "loss": 0.4873, "step": 880 }, { "epoch": 55.625, "grad_norm": 1.3238409757614136, "learning_rate": 1.7780000000000003e-05, "loss": 0.4883, "step": 890 }, { "epoch": 56.25, "grad_norm": 1.127139687538147, "learning_rate": 1.798e-05, "loss": 0.4871, "step": 900 }, { "epoch": 56.875, "grad_norm": 1.3855187892913818, "learning_rate": 1.818e-05, "loss": 0.4828, "step": 910 }, { "epoch": 57.5, "grad_norm": 1.536786437034607, "learning_rate": 1.838e-05, "loss": 0.4812, "step": 920 }, { "epoch": 58.125, "grad_norm": 2.120702028274536, "learning_rate": 1.858e-05, "loss": 0.4776, "step": 930 }, { "epoch": 58.75, "grad_norm": 1.6191856861114502, "learning_rate": 1.878e-05, "loss": 0.4761, "step": 940 }, { "epoch": 59.375, "grad_norm": 1.6021475791931152, "learning_rate": 1.898e-05, "loss": 0.4743, "step": 950 }, { "epoch": 60.0, "grad_norm": 1.604246735572815, "learning_rate": 1.918e-05, "loss": 0.4705, "step": 960 }, { "epoch": 60.625, "grad_norm": 3.4168691635131836, "learning_rate": 1.938e-05, "loss": 0.4732, "step": 970 }, { "epoch": 61.25, "grad_norm": 1.997071623802185, "learning_rate": 1.9580000000000002e-05, "loss": 0.4709, "step": 980 }, { "epoch": 61.875, "grad_norm": 2.351092576980591, "learning_rate": 1.978e-05, "loss": 0.4678, "step": 990 }, { "epoch": 62.5, "grad_norm": 1.9072397947311401, "learning_rate": 1.9980000000000002e-05, "loss": 0.469, "step": 1000 }, { "epoch": 63.125, "grad_norm": 1.777255892753601, "learning_rate": 2.0180000000000003e-05, "loss": 0.4642, "step": 1010 }, { "epoch": 63.75, "grad_norm": 1.3945808410644531, "learning_rate": 2.038e-05, "loss": 0.4613, "step": 1020 }, { "epoch": 64.375, "grad_norm": 1.5424234867095947, "learning_rate": 2.0580000000000003e-05, "loss": 0.4621, "step": 1030 }, { "epoch": 65.0, "grad_norm": 1.4399698972702026, "learning_rate": 2.078e-05, "loss": 0.4596, "step": 1040 }, { "epoch": 65.625, "grad_norm": 1.5211741924285889, "learning_rate": 2.098e-05, "loss": 0.4571, "step": 1050 }, { "epoch": 66.25, "grad_norm": 1.6879644393920898, "learning_rate": 2.118e-05, "loss": 0.4535, "step": 1060 }, { "epoch": 66.875, "grad_norm": 1.7169924974441528, "learning_rate": 2.138e-05, "loss": 0.4506, "step": 1070 }, { "epoch": 67.5, "grad_norm": 2.5419564247131348, "learning_rate": 2.158e-05, "loss": 0.4497, "step": 1080 }, { "epoch": 68.125, "grad_norm": 1.9424971342086792, "learning_rate": 2.178e-05, "loss": 0.446, "step": 1090 }, { "epoch": 68.75, "grad_norm": 4.030938148498535, "learning_rate": 2.198e-05, "loss": 0.4459, "step": 1100 }, { "epoch": 69.375, "grad_norm": 2.3670308589935303, "learning_rate": 2.218e-05, "loss": 0.4454, "step": 1110 }, { "epoch": 70.0, "grad_norm": 2.562795400619507, "learning_rate": 2.2380000000000003e-05, "loss": 0.4397, "step": 1120 }, { "epoch": 70.625, "grad_norm": 2.4600791931152344, "learning_rate": 2.258e-05, "loss": 0.4359, "step": 1130 }, { "epoch": 71.25, "grad_norm": 2.77167010307312, "learning_rate": 2.2780000000000002e-05, "loss": 0.4325, "step": 1140 }, { "epoch": 71.875, "grad_norm": 3.150618314743042, "learning_rate": 2.298e-05, "loss": 0.4285, "step": 1150 }, { "epoch": 72.5, "grad_norm": 2.5932984352111816, "learning_rate": 2.318e-05, "loss": 0.4247, "step": 1160 }, { "epoch": 73.125, "grad_norm": 2.8484175205230713, "learning_rate": 2.3380000000000003e-05, "loss": 0.4199, "step": 1170 }, { "epoch": 73.75, "grad_norm": 3.6509993076324463, "learning_rate": 2.358e-05, "loss": 0.4183, "step": 1180 }, { "epoch": 74.375, "grad_norm": 5.74982213973999, "learning_rate": 2.3780000000000003e-05, "loss": 0.4146, "step": 1190 }, { "epoch": 75.0, "grad_norm": 5.700360298156738, "learning_rate": 2.398e-05, "loss": 0.4322, "step": 1200 }, { "epoch": 75.625, "grad_norm": 3.3000857830047607, "learning_rate": 2.418e-05, "loss": 0.4228, "step": 1210 }, { "epoch": 76.25, "grad_norm": 2.5206165313720703, "learning_rate": 2.438e-05, "loss": 0.4072, "step": 1220 }, { "epoch": 76.875, "grad_norm": 4.3755083084106445, "learning_rate": 2.4580000000000002e-05, "loss": 0.4025, "step": 1230 }, { "epoch": 77.5, "grad_norm": 3.973033905029297, "learning_rate": 2.478e-05, "loss": 0.3953, "step": 1240 }, { "epoch": 78.125, "grad_norm": 3.5819246768951416, "learning_rate": 2.498e-05, "loss": 0.3909, "step": 1250 }, { "epoch": 78.75, "grad_norm": 3.183445930480957, "learning_rate": 2.5180000000000003e-05, "loss": 0.3852, "step": 1260 }, { "epoch": 79.375, "grad_norm": 3.4347057342529297, "learning_rate": 2.5380000000000004e-05, "loss": 0.3785, "step": 1270 }, { "epoch": 80.0, "grad_norm": 5.1768388748168945, "learning_rate": 2.5580000000000002e-05, "loss": 0.3717, "step": 1280 }, { "epoch": 80.625, "grad_norm": 3.9438490867614746, "learning_rate": 2.5779999999999997e-05, "loss": 0.3681, "step": 1290 }, { "epoch": 81.25, "grad_norm": 4.442440986633301, "learning_rate": 2.598e-05, "loss": 0.3571, "step": 1300 }, { "epoch": 81.875, "grad_norm": 5.6028900146484375, "learning_rate": 2.618e-05, "loss": 0.3553, "step": 1310 }, { "epoch": 82.5, "grad_norm": 5.537353038787842, "learning_rate": 2.6379999999999998e-05, "loss": 0.3515, "step": 1320 }, { "epoch": 83.125, "grad_norm": 6.435239315032959, "learning_rate": 2.658e-05, "loss": 0.3486, "step": 1330 }, { "epoch": 83.75, "grad_norm": 3.413828134536743, "learning_rate": 2.678e-05, "loss": 0.3373, "step": 1340 }, { "epoch": 84.375, "grad_norm": 3.34212327003479, "learning_rate": 2.698e-05, "loss": 0.3297, "step": 1350 }, { "epoch": 85.0, "grad_norm": 3.8223774433135986, "learning_rate": 2.718e-05, "loss": 0.322, "step": 1360 }, { "epoch": 85.625, "grad_norm": 4.211275577545166, "learning_rate": 2.738e-05, "loss": 0.314, "step": 1370 }, { "epoch": 86.25, "grad_norm": 5.035346984863281, "learning_rate": 2.758e-05, "loss": 0.3089, "step": 1380 }, { "epoch": 86.875, "grad_norm": 5.702706813812256, "learning_rate": 2.778e-05, "loss": 0.3017, "step": 1390 }, { "epoch": 87.5, "grad_norm": 3.9658496379852295, "learning_rate": 2.798e-05, "loss": 0.3002, "step": 1400 }, { "epoch": 88.125, "grad_norm": 3.6658170223236084, "learning_rate": 2.818e-05, "loss": 0.2877, "step": 1410 }, { "epoch": 88.75, "grad_norm": 4.639245986938477, "learning_rate": 2.8380000000000003e-05, "loss": 0.2773, "step": 1420 }, { "epoch": 89.375, "grad_norm": 4.7095947265625, "learning_rate": 2.858e-05, "loss": 0.2739, "step": 1430 }, { "epoch": 90.0, "grad_norm": 5.2461700439453125, "learning_rate": 2.8780000000000002e-05, "loss": 0.267, "step": 1440 }, { "epoch": 90.625, "grad_norm": 9.186333656311035, "learning_rate": 2.898e-05, "loss": 0.2638, "step": 1450 }, { "epoch": 91.25, "grad_norm": 5.92840051651001, "learning_rate": 2.9180000000000002e-05, "loss": 0.2639, "step": 1460 }, { "epoch": 91.875, "grad_norm": 4.675635814666748, "learning_rate": 2.9380000000000003e-05, "loss": 0.2485, "step": 1470 }, { "epoch": 92.5, "grad_norm": 4.761613845825195, "learning_rate": 2.958e-05, "loss": 0.2392, "step": 1480 }, { "epoch": 93.125, "grad_norm": 5.566701889038086, "learning_rate": 2.9780000000000003e-05, "loss": 0.2341, "step": 1490 }, { "epoch": 93.75, "grad_norm": 4.253500461578369, "learning_rate": 2.998e-05, "loss": 0.2222, "step": 1500 }, { "epoch": 94.375, "grad_norm": 4.602695465087891, "learning_rate": 3.0180000000000002e-05, "loss": 0.2183, "step": 1510 }, { "epoch": 95.0, "grad_norm": 5.938004493713379, "learning_rate": 3.0380000000000004e-05, "loss": 0.213, "step": 1520 }, { "epoch": 95.625, "grad_norm": 6.689606189727783, "learning_rate": 3.058e-05, "loss": 0.2123, "step": 1530 }, { "epoch": 96.25, "grad_norm": 5.87599515914917, "learning_rate": 3.078e-05, "loss": 0.2011, "step": 1540 }, { "epoch": 96.875, "grad_norm": 5.971210956573486, "learning_rate": 3.0980000000000005e-05, "loss": 0.1966, "step": 1550 }, { "epoch": 97.5, "grad_norm": 5.859025001525879, "learning_rate": 3.118e-05, "loss": 0.1946, "step": 1560 }, { "epoch": 98.125, "grad_norm": 5.950936317443848, "learning_rate": 3.138e-05, "loss": 0.1814, "step": 1570 }, { "epoch": 98.75, "grad_norm": 4.768659591674805, "learning_rate": 3.1580000000000006e-05, "loss": 0.1768, "step": 1580 }, { "epoch": 99.375, "grad_norm": 5.677441596984863, "learning_rate": 3.1780000000000004e-05, "loss": 0.1727, "step": 1590 }, { "epoch": 100.0, "grad_norm": 4.38816499710083, "learning_rate": 3.198e-05, "loss": 0.162, "step": 1600 }, { "epoch": 100.625, "grad_norm": 4.571051120758057, "learning_rate": 3.218e-05, "loss": 0.1606, "step": 1610 }, { "epoch": 101.25, "grad_norm": 6.406070709228516, "learning_rate": 3.238e-05, "loss": 0.1494, "step": 1620 }, { "epoch": 101.875, "grad_norm": 4.923046588897705, "learning_rate": 3.2579999999999996e-05, "loss": 0.1479, "step": 1630 }, { "epoch": 102.5, "grad_norm": 6.735141277313232, "learning_rate": 3.278e-05, "loss": 0.1468, "step": 1640 }, { "epoch": 103.125, "grad_norm": 5.201277732849121, "learning_rate": 3.298e-05, "loss": 0.137, "step": 1650 }, { "epoch": 103.75, "grad_norm": 5.127175331115723, "learning_rate": 3.318e-05, "loss": 0.1377, "step": 1660 }, { "epoch": 104.375, "grad_norm": 5.3002471923828125, "learning_rate": 3.338e-05, "loss": 0.1277, "step": 1670 }, { "epoch": 105.0, "grad_norm": 5.290287017822266, "learning_rate": 3.358e-05, "loss": 0.1224, "step": 1680 }, { "epoch": 105.625, "grad_norm": 6.6339311599731445, "learning_rate": 3.378e-05, "loss": 0.121, "step": 1690 }, { "epoch": 106.25, "grad_norm": 6.3351826667785645, "learning_rate": 3.398e-05, "loss": 0.1202, "step": 1700 }, { "epoch": 106.875, "grad_norm": 6.30771017074585, "learning_rate": 3.418e-05, "loss": 0.1182, "step": 1710 }, { "epoch": 107.5, "grad_norm": 5.454580307006836, "learning_rate": 3.438e-05, "loss": 0.1132, "step": 1720 }, { "epoch": 108.125, "grad_norm": 5.418821811676025, "learning_rate": 3.4580000000000004e-05, "loss": 0.1064, "step": 1730 }, { "epoch": 108.75, "grad_norm": 4.996350288391113, "learning_rate": 3.478e-05, "loss": 0.0993, "step": 1740 }, { "epoch": 109.375, "grad_norm": 6.961830139160156, "learning_rate": 3.498e-05, "loss": 0.0994, "step": 1750 }, { "epoch": 110.0, "grad_norm": 6.240096569061279, "learning_rate": 3.518e-05, "loss": 0.1032, "step": 1760 }, { "epoch": 110.625, "grad_norm": 5.9896111488342285, "learning_rate": 3.5380000000000003e-05, "loss": 0.0931, "step": 1770 }, { "epoch": 111.25, "grad_norm": 4.381375312805176, "learning_rate": 3.558e-05, "loss": 0.0899, "step": 1780 }, { "epoch": 111.875, "grad_norm": 4.847299098968506, "learning_rate": 3.578e-05, "loss": 0.0845, "step": 1790 }, { "epoch": 112.5, "grad_norm": 6.442471981048584, "learning_rate": 3.5980000000000004e-05, "loss": 0.0903, "step": 1800 }, { "epoch": 113.125, "grad_norm": 6.36176872253418, "learning_rate": 3.618e-05, "loss": 0.0871, "step": 1810 }, { "epoch": 113.75, "grad_norm": 5.581753730773926, "learning_rate": 3.638e-05, "loss": 0.0832, "step": 1820 }, { "epoch": 114.375, "grad_norm": 5.413025379180908, "learning_rate": 3.6580000000000006e-05, "loss": 0.0894, "step": 1830 }, { "epoch": 115.0, "grad_norm": 3.545975685119629, "learning_rate": 3.6780000000000004e-05, "loss": 0.0816, "step": 1840 }, { "epoch": 115.625, "grad_norm": 5.415703773498535, "learning_rate": 3.698e-05, "loss": 0.0847, "step": 1850 }, { "epoch": 116.25, "grad_norm": 5.82867956161499, "learning_rate": 3.7180000000000007e-05, "loss": 0.0781, "step": 1860 }, { "epoch": 116.875, "grad_norm": 6.405238151550293, "learning_rate": 3.7380000000000005e-05, "loss": 0.082, "step": 1870 }, { "epoch": 117.5, "grad_norm": 3.8289811611175537, "learning_rate": 3.758e-05, "loss": 0.0766, "step": 1880 }, { "epoch": 118.125, "grad_norm": 4.225410461425781, "learning_rate": 3.778000000000001e-05, "loss": 0.0753, "step": 1890 }, { "epoch": 118.75, "grad_norm": 3.565117120742798, "learning_rate": 3.7980000000000006e-05, "loss": 0.0714, "step": 1900 }, { "epoch": 119.375, "grad_norm": 4.679031848907471, "learning_rate": 3.818e-05, "loss": 0.0727, "step": 1910 }, { "epoch": 120.0, "grad_norm": 3.9762325286865234, "learning_rate": 3.838e-05, "loss": 0.0746, "step": 1920 }, { "epoch": 120.625, "grad_norm": 5.354043960571289, "learning_rate": 3.858e-05, "loss": 0.0715, "step": 1930 }, { "epoch": 121.25, "grad_norm": 4.858035564422607, "learning_rate": 3.878e-05, "loss": 0.0736, "step": 1940 }, { "epoch": 121.875, "grad_norm": 5.547657012939453, "learning_rate": 3.898e-05, "loss": 0.0747, "step": 1950 }, { "epoch": 122.5, "grad_norm": 4.109276294708252, "learning_rate": 3.918e-05, "loss": 0.0671, "step": 1960 }, { "epoch": 123.125, "grad_norm": 2.984168291091919, "learning_rate": 3.938e-05, "loss": 0.0675, "step": 1970 }, { "epoch": 123.75, "grad_norm": 3.5227620601654053, "learning_rate": 3.958e-05, "loss": 0.0704, "step": 1980 }, { "epoch": 124.375, "grad_norm": 4.113119125366211, "learning_rate": 3.978e-05, "loss": 0.0684, "step": 1990 }, { "epoch": 125.0, "grad_norm": 4.130417823791504, "learning_rate": 3.998e-05, "loss": 0.0696, "step": 2000 }, { "epoch": 125.625, "grad_norm": 4.076992034912109, "learning_rate": 4.018e-05, "loss": 0.0648, "step": 2010 }, { "epoch": 126.25, "grad_norm": 3.623624801635742, "learning_rate": 4.038e-05, "loss": 0.0634, "step": 2020 }, { "epoch": 126.875, "grad_norm": 3.0136911869049072, "learning_rate": 4.058e-05, "loss": 0.0619, "step": 2030 }, { "epoch": 127.5, "grad_norm": 2.3207767009735107, "learning_rate": 4.078e-05, "loss": 0.0602, "step": 2040 }, { "epoch": 128.125, "grad_norm": 6.006433963775635, "learning_rate": 4.0980000000000004e-05, "loss": 0.0618, "step": 2050 }, { "epoch": 128.75, "grad_norm": 4.211705684661865, "learning_rate": 4.118e-05, "loss": 0.0614, "step": 2060 }, { "epoch": 129.375, "grad_norm": 3.0991692543029785, "learning_rate": 4.138e-05, "loss": 0.0596, "step": 2070 }, { "epoch": 130.0, "grad_norm": 3.5333359241485596, "learning_rate": 4.1580000000000005e-05, "loss": 0.0594, "step": 2080 }, { "epoch": 130.625, "grad_norm": 2.464125394821167, "learning_rate": 4.178e-05, "loss": 0.0579, "step": 2090 }, { "epoch": 131.25, "grad_norm": 3.499553680419922, "learning_rate": 4.198e-05, "loss": 0.058, "step": 2100 }, { "epoch": 131.875, "grad_norm": 4.001912593841553, "learning_rate": 4.2180000000000006e-05, "loss": 0.0585, "step": 2110 }, { "epoch": 132.5, "grad_norm": 3.2040934562683105, "learning_rate": 4.2380000000000004e-05, "loss": 0.0597, "step": 2120 }, { "epoch": 133.125, "grad_norm": 3.650881767272949, "learning_rate": 4.258e-05, "loss": 0.0594, "step": 2130 }, { "epoch": 133.75, "grad_norm": 3.5435853004455566, "learning_rate": 4.278e-05, "loss": 0.0577, "step": 2140 }, { "epoch": 134.375, "grad_norm": 3.881361484527588, "learning_rate": 4.2980000000000005e-05, "loss": 0.0586, "step": 2150 }, { "epoch": 135.0, "grad_norm": 3.0621840953826904, "learning_rate": 4.318e-05, "loss": 0.0562, "step": 2160 }, { "epoch": 135.625, "grad_norm": 3.5643880367279053, "learning_rate": 4.338e-05, "loss": 0.0573, "step": 2170 }, { "epoch": 136.25, "grad_norm": 3.4029245376586914, "learning_rate": 4.3580000000000006e-05, "loss": 0.0538, "step": 2180 }, { "epoch": 136.875, "grad_norm": 2.941638469696045, "learning_rate": 4.3780000000000004e-05, "loss": 0.0518, "step": 2190 }, { "epoch": 137.5, "grad_norm": 3.19802188873291, "learning_rate": 4.398e-05, "loss": 0.0578, "step": 2200 }, { "epoch": 138.125, "grad_norm": 3.2176332473754883, "learning_rate": 4.418000000000001e-05, "loss": 0.0553, "step": 2210 }, { "epoch": 138.75, "grad_norm": 3.1325228214263916, "learning_rate": 4.438e-05, "loss": 0.0544, "step": 2220 }, { "epoch": 139.375, "grad_norm": 2.721820116043091, "learning_rate": 4.458e-05, "loss": 0.0537, "step": 2230 }, { "epoch": 140.0, "grad_norm": 2.2297821044921875, "learning_rate": 4.478e-05, "loss": 0.0536, "step": 2240 }, { "epoch": 140.625, "grad_norm": 3.736509323120117, "learning_rate": 4.498e-05, "loss": 0.0568, "step": 2250 }, { "epoch": 141.25, "grad_norm": 3.414687156677246, "learning_rate": 4.518e-05, "loss": 0.0535, "step": 2260 }, { "epoch": 141.875, "grad_norm": 3.533870220184326, "learning_rate": 4.538e-05, "loss": 0.0528, "step": 2270 }, { "epoch": 142.5, "grad_norm": 2.922818422317505, "learning_rate": 4.558e-05, "loss": 0.0509, "step": 2280 }, { "epoch": 143.125, "grad_norm": 3.248502731323242, "learning_rate": 4.578e-05, "loss": 0.0499, "step": 2290 }, { "epoch": 143.75, "grad_norm": 2.737330913543701, "learning_rate": 4.5980000000000004e-05, "loss": 0.0504, "step": 2300 }, { "epoch": 144.375, "grad_norm": 2.7490787506103516, "learning_rate": 4.618e-05, "loss": 0.0494, "step": 2310 }, { "epoch": 145.0, "grad_norm": 3.3917601108551025, "learning_rate": 4.638e-05, "loss": 0.0529, "step": 2320 }, { "epoch": 145.625, "grad_norm": 3.115227699279785, "learning_rate": 4.6580000000000005e-05, "loss": 0.0487, "step": 2330 }, { "epoch": 146.25, "grad_norm": 3.6642770767211914, "learning_rate": 4.678e-05, "loss": 0.0511, "step": 2340 }, { "epoch": 146.875, "grad_norm": 3.4796688556671143, "learning_rate": 4.698e-05, "loss": 0.048, "step": 2350 }, { "epoch": 147.5, "grad_norm": 2.7523436546325684, "learning_rate": 4.718e-05, "loss": 0.0478, "step": 2360 }, { "epoch": 148.125, "grad_norm": 3.309631824493408, "learning_rate": 4.7380000000000004e-05, "loss": 0.0489, "step": 2370 }, { "epoch": 148.75, "grad_norm": 3.5280392169952393, "learning_rate": 4.758e-05, "loss": 0.045, "step": 2380 }, { "epoch": 149.375, "grad_norm": 3.055738925933838, "learning_rate": 4.778e-05, "loss": 0.0443, "step": 2390 }, { "epoch": 150.0, "grad_norm": 2.935150146484375, "learning_rate": 4.7980000000000005e-05, "loss": 0.047, "step": 2400 }, { "epoch": 150.625, "grad_norm": 3.540233612060547, "learning_rate": 4.818e-05, "loss": 0.0483, "step": 2410 }, { "epoch": 151.25, "grad_norm": 3.3195087909698486, "learning_rate": 4.838e-05, "loss": 0.0461, "step": 2420 }, { "epoch": 151.875, "grad_norm": 3.5009474754333496, "learning_rate": 4.8580000000000006e-05, "loss": 0.0478, "step": 2430 }, { "epoch": 152.5, "grad_norm": 3.110968589782715, "learning_rate": 4.8780000000000004e-05, "loss": 0.0476, "step": 2440 }, { "epoch": 153.125, "grad_norm": 2.5114879608154297, "learning_rate": 4.898e-05, "loss": 0.0457, "step": 2450 }, { "epoch": 153.75, "grad_norm": 2.591670513153076, "learning_rate": 4.918000000000001e-05, "loss": 0.0425, "step": 2460 }, { "epoch": 154.375, "grad_norm": 2.149576187133789, "learning_rate": 4.9380000000000005e-05, "loss": 0.0432, "step": 2470 }, { "epoch": 155.0, "grad_norm": 2.866494655609131, "learning_rate": 4.958e-05, "loss": 0.047, "step": 2480 }, { "epoch": 155.625, "grad_norm": 3.465266227722168, "learning_rate": 4.978e-05, "loss": 0.0458, "step": 2490 }, { "epoch": 156.25, "grad_norm": 2.855782985687256, "learning_rate": 4.9980000000000006e-05, "loss": 0.043, "step": 2500 }, { "epoch": 156.875, "grad_norm": 2.906052350997925, "learning_rate": 5.0180000000000004e-05, "loss": 0.042, "step": 2510 }, { "epoch": 157.5, "grad_norm": 3.16371488571167, "learning_rate": 5.038e-05, "loss": 0.0427, "step": 2520 }, { "epoch": 158.125, "grad_norm": 2.54278826713562, "learning_rate": 5.058000000000001e-05, "loss": 0.0433, "step": 2530 }, { "epoch": 158.75, "grad_norm": 2.1072380542755127, "learning_rate": 5.0780000000000005e-05, "loss": 0.0378, "step": 2540 }, { "epoch": 159.375, "grad_norm": 2.749347448348999, "learning_rate": 5.098e-05, "loss": 0.0412, "step": 2550 }, { "epoch": 160.0, "grad_norm": 3.022982120513916, "learning_rate": 5.118000000000001e-05, "loss": 0.0413, "step": 2560 }, { "epoch": 160.625, "grad_norm": 3.038039207458496, "learning_rate": 5.1380000000000006e-05, "loss": 0.0418, "step": 2570 }, { "epoch": 161.25, "grad_norm": 2.538886070251465, "learning_rate": 5.1580000000000004e-05, "loss": 0.0407, "step": 2580 }, { "epoch": 161.875, "grad_norm": 2.79771089553833, "learning_rate": 5.178000000000001e-05, "loss": 0.0421, "step": 2590 }, { "epoch": 162.5, "grad_norm": 2.6892521381378174, "learning_rate": 5.198000000000001e-05, "loss": 0.0408, "step": 2600 }, { "epoch": 163.125, "grad_norm": 2.829843759536743, "learning_rate": 5.2180000000000005e-05, "loss": 0.0386, "step": 2610 }, { "epoch": 163.75, "grad_norm": 2.272169828414917, "learning_rate": 5.238000000000001e-05, "loss": 0.0392, "step": 2620 }, { "epoch": 164.375, "grad_norm": 2.683228015899658, "learning_rate": 5.258000000000001e-05, "loss": 0.0403, "step": 2630 }, { "epoch": 165.0, "grad_norm": 2.4979324340820312, "learning_rate": 5.2780000000000006e-05, "loss": 0.0412, "step": 2640 }, { "epoch": 165.625, "grad_norm": 2.7030258178710938, "learning_rate": 5.2980000000000004e-05, "loss": 0.0386, "step": 2650 }, { "epoch": 166.25, "grad_norm": 2.9168074131011963, "learning_rate": 5.318000000000001e-05, "loss": 0.041, "step": 2660 }, { "epoch": 166.875, "grad_norm": 2.2548749446868896, "learning_rate": 5.338000000000001e-05, "loss": 0.0386, "step": 2670 }, { "epoch": 167.5, "grad_norm": 2.6179001331329346, "learning_rate": 5.3580000000000005e-05, "loss": 0.0399, "step": 2680 }, { "epoch": 168.125, "grad_norm": 3.1817469596862793, "learning_rate": 5.378e-05, "loss": 0.0391, "step": 2690 }, { "epoch": 168.75, "grad_norm": 2.606260061264038, "learning_rate": 5.3979999999999995e-05, "loss": 0.0358, "step": 2700 }, { "epoch": 169.375, "grad_norm": 2.6046321392059326, "learning_rate": 5.418e-05, "loss": 0.0366, "step": 2710 }, { "epoch": 170.0, "grad_norm": 2.150594711303711, "learning_rate": 5.438e-05, "loss": 0.0366, "step": 2720 }, { "epoch": 170.625, "grad_norm": 1.9119679927825928, "learning_rate": 5.4579999999999996e-05, "loss": 0.0359, "step": 2730 }, { "epoch": 171.25, "grad_norm": 2.6968297958374023, "learning_rate": 5.478e-05, "loss": 0.0358, "step": 2740 }, { "epoch": 171.875, "grad_norm": 2.433364152908325, "learning_rate": 5.498e-05, "loss": 0.0396, "step": 2750 }, { "epoch": 172.5, "grad_norm": 2.7723114490509033, "learning_rate": 5.518e-05, "loss": 0.0369, "step": 2760 }, { "epoch": 173.125, "grad_norm": 1.9324524402618408, "learning_rate": 5.538e-05, "loss": 0.0366, "step": 2770 }, { "epoch": 173.75, "grad_norm": 2.4898505210876465, "learning_rate": 5.558e-05, "loss": 0.0357, "step": 2780 }, { "epoch": 174.375, "grad_norm": 3.377042293548584, "learning_rate": 5.578e-05, "loss": 0.0356, "step": 2790 }, { "epoch": 175.0, "grad_norm": 2.3189809322357178, "learning_rate": 5.5979999999999996e-05, "loss": 0.0383, "step": 2800 }, { "epoch": 175.625, "grad_norm": 2.4106035232543945, "learning_rate": 5.618e-05, "loss": 0.0377, "step": 2810 }, { "epoch": 176.25, "grad_norm": 2.3675427436828613, "learning_rate": 5.638e-05, "loss": 0.034, "step": 2820 }, { "epoch": 176.875, "grad_norm": 2.3263936042785645, "learning_rate": 5.658e-05, "loss": 0.0329, "step": 2830 }, { "epoch": 177.5, "grad_norm": 2.6326184272766113, "learning_rate": 5.678e-05, "loss": 0.0372, "step": 2840 }, { "epoch": 178.125, "grad_norm": 2.5026683807373047, "learning_rate": 5.698e-05, "loss": 0.0384, "step": 2850 }, { "epoch": 178.75, "grad_norm": 2.7007641792297363, "learning_rate": 5.718e-05, "loss": 0.0345, "step": 2860 }, { "epoch": 179.375, "grad_norm": 2.948171854019165, "learning_rate": 5.738e-05, "loss": 0.0371, "step": 2870 }, { "epoch": 180.0, "grad_norm": 2.368053674697876, "learning_rate": 5.758e-05, "loss": 0.0358, "step": 2880 }, { "epoch": 180.625, "grad_norm": 2.625312328338623, "learning_rate": 5.778e-05, "loss": 0.0382, "step": 2890 }, { "epoch": 181.25, "grad_norm": 2.2241172790527344, "learning_rate": 5.7980000000000004e-05, "loss": 0.0346, "step": 2900 }, { "epoch": 181.875, "grad_norm": 2.2202515602111816, "learning_rate": 5.818e-05, "loss": 0.0391, "step": 2910 }, { "epoch": 182.5, "grad_norm": 2.5838396549224854, "learning_rate": 5.838e-05, "loss": 0.0332, "step": 2920 }, { "epoch": 183.125, "grad_norm": 2.4340357780456543, "learning_rate": 5.858e-05, "loss": 0.0341, "step": 2930 }, { "epoch": 183.75, "grad_norm": 3.3191001415252686, "learning_rate": 5.878e-05, "loss": 0.0372, "step": 2940 }, { "epoch": 184.375, "grad_norm": 2.798825263977051, "learning_rate": 5.898e-05, "loss": 0.0344, "step": 2950 }, { "epoch": 185.0, "grad_norm": 2.0992839336395264, "learning_rate": 5.918e-05, "loss": 0.0364, "step": 2960 }, { "epoch": 185.625, "grad_norm": 2.3140695095062256, "learning_rate": 5.9380000000000004e-05, "loss": 0.0345, "step": 2970 }, { "epoch": 186.25, "grad_norm": 2.1252496242523193, "learning_rate": 5.958e-05, "loss": 0.0341, "step": 2980 }, { "epoch": 186.875, "grad_norm": 1.9925975799560547, "learning_rate": 5.978e-05, "loss": 0.0371, "step": 2990 }, { "epoch": 187.5, "grad_norm": 1.8534867763519287, "learning_rate": 5.9980000000000005e-05, "loss": 0.0324, "step": 3000 }, { "epoch": 188.125, "grad_norm": 1.8940081596374512, "learning_rate": 6.018e-05, "loss": 0.0313, "step": 3010 }, { "epoch": 188.75, "grad_norm": 3.098815679550171, "learning_rate": 6.038e-05, "loss": 0.0316, "step": 3020 }, { "epoch": 189.375, "grad_norm": 2.562849521636963, "learning_rate": 6.0580000000000006e-05, "loss": 0.034, "step": 3030 }, { "epoch": 190.0, "grad_norm": 2.3118202686309814, "learning_rate": 6.0780000000000004e-05, "loss": 0.0324, "step": 3040 }, { "epoch": 190.625, "grad_norm": 1.8349565267562866, "learning_rate": 6.098e-05, "loss": 0.0316, "step": 3050 }, { "epoch": 191.25, "grad_norm": 2.3919525146484375, "learning_rate": 6.118000000000001e-05, "loss": 0.0341, "step": 3060 }, { "epoch": 191.875, "grad_norm": 2.795734405517578, "learning_rate": 6.138e-05, "loss": 0.0321, "step": 3070 }, { "epoch": 192.5, "grad_norm": 2.4285318851470947, "learning_rate": 6.158e-05, "loss": 0.0338, "step": 3080 }, { "epoch": 193.125, "grad_norm": 2.724107265472412, "learning_rate": 6.178000000000001e-05, "loss": 0.0325, "step": 3090 }, { "epoch": 193.75, "grad_norm": 2.212014675140381, "learning_rate": 6.198e-05, "loss": 0.0297, "step": 3100 }, { "epoch": 194.375, "grad_norm": 1.8803651332855225, "learning_rate": 6.218e-05, "loss": 0.0298, "step": 3110 }, { "epoch": 195.0, "grad_norm": 1.7469961643218994, "learning_rate": 6.238000000000001e-05, "loss": 0.0291, "step": 3120 }, { "epoch": 195.625, "grad_norm": 2.5273945331573486, "learning_rate": 6.258e-05, "loss": 0.0309, "step": 3130 }, { "epoch": 196.25, "grad_norm": 2.398287773132324, "learning_rate": 6.278e-05, "loss": 0.0317, "step": 3140 }, { "epoch": 196.875, "grad_norm": 1.9407683610916138, "learning_rate": 6.298000000000001e-05, "loss": 0.0299, "step": 3150 }, { "epoch": 197.5, "grad_norm": 1.6159769296646118, "learning_rate": 6.318e-05, "loss": 0.0286, "step": 3160 }, { "epoch": 198.125, "grad_norm": 2.744300603866577, "learning_rate": 6.338e-05, "loss": 0.0303, "step": 3170 }, { "epoch": 198.75, "grad_norm": 2.6293482780456543, "learning_rate": 6.358000000000001e-05, "loss": 0.0328, "step": 3180 }, { "epoch": 199.375, "grad_norm": 2.2811481952667236, "learning_rate": 6.378e-05, "loss": 0.0328, "step": 3190 }, { "epoch": 200.0, "grad_norm": 2.951794385910034, "learning_rate": 6.398000000000001e-05, "loss": 0.033, "step": 3200 }, { "epoch": 200.625, "grad_norm": 2.3573927879333496, "learning_rate": 6.418000000000001e-05, "loss": 0.0312, "step": 3210 }, { "epoch": 201.25, "grad_norm": 2.088592529296875, "learning_rate": 6.438e-05, "loss": 0.0314, "step": 3220 }, { "epoch": 201.875, "grad_norm": 2.646054983139038, "learning_rate": 6.458000000000001e-05, "loss": 0.0295, "step": 3230 }, { "epoch": 202.5, "grad_norm": 2.5917739868164062, "learning_rate": 6.478000000000001e-05, "loss": 0.032, "step": 3240 }, { "epoch": 203.125, "grad_norm": 2.122236490249634, "learning_rate": 6.498e-05, "loss": 0.0306, "step": 3250 }, { "epoch": 203.75, "grad_norm": 2.2258174419403076, "learning_rate": 6.518000000000001e-05, "loss": 0.0289, "step": 3260 }, { "epoch": 204.375, "grad_norm": 2.1164627075195312, "learning_rate": 6.538000000000001e-05, "loss": 0.0298, "step": 3270 }, { "epoch": 205.0, "grad_norm": 2.397019386291504, "learning_rate": 6.558e-05, "loss": 0.0298, "step": 3280 }, { "epoch": 205.625, "grad_norm": 2.260453701019287, "learning_rate": 6.578000000000001e-05, "loss": 0.0279, "step": 3290 }, { "epoch": 206.25, "grad_norm": 2.1338107585906982, "learning_rate": 6.598e-05, "loss": 0.0292, "step": 3300 }, { "epoch": 206.875, "grad_norm": 1.875387191772461, "learning_rate": 6.618e-05, "loss": 0.0276, "step": 3310 }, { "epoch": 207.5, "grad_norm": 1.619683027267456, "learning_rate": 6.638e-05, "loss": 0.0286, "step": 3320 }, { "epoch": 208.125, "grad_norm": 2.5062685012817383, "learning_rate": 6.658e-05, "loss": 0.031, "step": 3330 }, { "epoch": 208.75, "grad_norm": 2.3004539012908936, "learning_rate": 6.678e-05, "loss": 0.0305, "step": 3340 }, { "epoch": 209.375, "grad_norm": 2.2835469245910645, "learning_rate": 6.698e-05, "loss": 0.0281, "step": 3350 }, { "epoch": 210.0, "grad_norm": 2.0576257705688477, "learning_rate": 6.718e-05, "loss": 0.0318, "step": 3360 }, { "epoch": 210.625, "grad_norm": 2.0494043827056885, "learning_rate": 6.738e-05, "loss": 0.0284, "step": 3370 }, { "epoch": 211.25, "grad_norm": 1.5460221767425537, "learning_rate": 6.758e-05, "loss": 0.0274, "step": 3380 }, { "epoch": 211.875, "grad_norm": 2.422177791595459, "learning_rate": 6.778e-05, "loss": 0.0287, "step": 3390 }, { "epoch": 212.5, "grad_norm": 2.38964581489563, "learning_rate": 6.798e-05, "loss": 0.0329, "step": 3400 }, { "epoch": 213.125, "grad_norm": 2.0634000301361084, "learning_rate": 6.818e-05, "loss": 0.0273, "step": 3410 }, { "epoch": 213.75, "grad_norm": 2.5334651470184326, "learning_rate": 6.838e-05, "loss": 0.0312, "step": 3420 }, { "epoch": 214.375, "grad_norm": 2.527052402496338, "learning_rate": 6.858e-05, "loss": 0.0297, "step": 3430 }, { "epoch": 215.0, "grad_norm": 2.3704299926757812, "learning_rate": 6.878e-05, "loss": 0.0313, "step": 3440 }, { "epoch": 215.625, "grad_norm": 1.926483154296875, "learning_rate": 6.898e-05, "loss": 0.0256, "step": 3450 }, { "epoch": 216.25, "grad_norm": 1.56046724319458, "learning_rate": 6.918e-05, "loss": 0.0278, "step": 3460 }, { "epoch": 216.875, "grad_norm": 1.8307677507400513, "learning_rate": 6.938e-05, "loss": 0.0269, "step": 3470 }, { "epoch": 217.5, "grad_norm": 1.9908180236816406, "learning_rate": 6.958e-05, "loss": 0.0276, "step": 3480 }, { "epoch": 218.125, "grad_norm": 2.067988395690918, "learning_rate": 6.978e-05, "loss": 0.0268, "step": 3490 }, { "epoch": 218.75, "grad_norm": 1.8545929193496704, "learning_rate": 6.998e-05, "loss": 0.0268, "step": 3500 }, { "epoch": 219.375, "grad_norm": 2.052927017211914, "learning_rate": 7.018e-05, "loss": 0.0253, "step": 3510 }, { "epoch": 220.0, "grad_norm": 2.1113545894622803, "learning_rate": 7.038e-05, "loss": 0.0245, "step": 3520 }, { "epoch": 220.625, "grad_norm": 1.541675329208374, "learning_rate": 7.058e-05, "loss": 0.0253, "step": 3530 }, { "epoch": 221.25, "grad_norm": 1.7272151708602905, "learning_rate": 7.078e-05, "loss": 0.0251, "step": 3540 }, { "epoch": 221.875, "grad_norm": 1.7178980112075806, "learning_rate": 7.098e-05, "loss": 0.026, "step": 3550 }, { "epoch": 222.5, "grad_norm": 2.246424913406372, "learning_rate": 7.118e-05, "loss": 0.0267, "step": 3560 }, { "epoch": 223.125, "grad_norm": 1.9230071306228638, "learning_rate": 7.138e-05, "loss": 0.0268, "step": 3570 }, { "epoch": 223.75, "grad_norm": 1.9361920356750488, "learning_rate": 7.158e-05, "loss": 0.0268, "step": 3580 }, { "epoch": 224.375, "grad_norm": 1.6865476369857788, "learning_rate": 7.178000000000001e-05, "loss": 0.0248, "step": 3590 }, { "epoch": 225.0, "grad_norm": 2.019584894180298, "learning_rate": 7.198e-05, "loss": 0.0258, "step": 3600 }, { "epoch": 225.625, "grad_norm": 1.8740990161895752, "learning_rate": 7.218e-05, "loss": 0.0243, "step": 3610 }, { "epoch": 226.25, "grad_norm": 2.088883399963379, "learning_rate": 7.238000000000001e-05, "loss": 0.0253, "step": 3620 }, { "epoch": 226.875, "grad_norm": 2.107874870300293, "learning_rate": 7.258e-05, "loss": 0.0265, "step": 3630 }, { "epoch": 227.5, "grad_norm": 1.690873622894287, "learning_rate": 7.278e-05, "loss": 0.0262, "step": 3640 }, { "epoch": 228.125, "grad_norm": 2.7033252716064453, "learning_rate": 7.298000000000001e-05, "loss": 0.025, "step": 3650 }, { "epoch": 228.75, "grad_norm": 1.91816246509552, "learning_rate": 7.318e-05, "loss": 0.0265, "step": 3660 }, { "epoch": 229.375, "grad_norm": 1.9548629522323608, "learning_rate": 7.338e-05, "loss": 0.0251, "step": 3670 }, { "epoch": 230.0, "grad_norm": 1.911120891571045, "learning_rate": 7.358000000000001e-05, "loss": 0.0245, "step": 3680 }, { "epoch": 230.625, "grad_norm": 1.6720895767211914, "learning_rate": 7.378e-05, "loss": 0.0252, "step": 3690 }, { "epoch": 231.25, "grad_norm": 1.9147329330444336, "learning_rate": 7.398e-05, "loss": 0.0247, "step": 3700 }, { "epoch": 231.875, "grad_norm": 2.1456077098846436, "learning_rate": 7.418000000000001e-05, "loss": 0.0252, "step": 3710 }, { "epoch": 232.5, "grad_norm": 1.9418590068817139, "learning_rate": 7.438e-05, "loss": 0.0257, "step": 3720 }, { "epoch": 233.125, "grad_norm": 1.9458227157592773, "learning_rate": 7.458000000000001e-05, "loss": 0.0271, "step": 3730 }, { "epoch": 233.75, "grad_norm": 1.9564207792282104, "learning_rate": 7.478e-05, "loss": 0.0262, "step": 3740 }, { "epoch": 234.375, "grad_norm": 1.4478167295455933, "learning_rate": 7.498e-05, "loss": 0.0254, "step": 3750 }, { "epoch": 235.0, "grad_norm": 2.14218807220459, "learning_rate": 7.518000000000001e-05, "loss": 0.0251, "step": 3760 }, { "epoch": 235.625, "grad_norm": 2.029665946960449, "learning_rate": 7.538e-05, "loss": 0.0276, "step": 3770 }, { "epoch": 236.25, "grad_norm": 1.8243962526321411, "learning_rate": 7.558e-05, "loss": 0.0267, "step": 3780 }, { "epoch": 236.875, "grad_norm": 1.6162742376327515, "learning_rate": 7.578000000000001e-05, "loss": 0.0224, "step": 3790 }, { "epoch": 237.5, "grad_norm": 2.0405139923095703, "learning_rate": 7.598e-05, "loss": 0.0248, "step": 3800 }, { "epoch": 238.125, "grad_norm": 1.9894390106201172, "learning_rate": 7.618e-05, "loss": 0.0239, "step": 3810 }, { "epoch": 238.75, "grad_norm": 1.7805562019348145, "learning_rate": 7.638000000000001e-05, "loss": 0.0245, "step": 3820 }, { "epoch": 239.375, "grad_norm": 2.0249173641204834, "learning_rate": 7.658e-05, "loss": 0.0221, "step": 3830 }, { "epoch": 240.0, "grad_norm": 1.8023134469985962, "learning_rate": 7.678000000000001e-05, "loss": 0.0234, "step": 3840 }, { "epoch": 240.625, "grad_norm": 1.5592528581619263, "learning_rate": 7.698000000000001e-05, "loss": 0.0245, "step": 3850 }, { "epoch": 241.25, "grad_norm": 2.1557257175445557, "learning_rate": 7.718e-05, "loss": 0.0243, "step": 3860 }, { "epoch": 241.875, "grad_norm": 1.9655349254608154, "learning_rate": 7.738000000000001e-05, "loss": 0.0223, "step": 3870 }, { "epoch": 242.5, "grad_norm": 1.616184115409851, "learning_rate": 7.758000000000001e-05, "loss": 0.0249, "step": 3880 }, { "epoch": 243.125, "grad_norm": 2.146557331085205, "learning_rate": 7.778e-05, "loss": 0.0243, "step": 3890 }, { "epoch": 243.75, "grad_norm": 1.6077772378921509, "learning_rate": 7.798000000000001e-05, "loss": 0.0237, "step": 3900 }, { "epoch": 244.375, "grad_norm": 2.073211431503296, "learning_rate": 7.818000000000001e-05, "loss": 0.0211, "step": 3910 }, { "epoch": 245.0, "grad_norm": 1.7445831298828125, "learning_rate": 7.838e-05, "loss": 0.0225, "step": 3920 }, { "epoch": 245.625, "grad_norm": 1.5558561086654663, "learning_rate": 7.858000000000001e-05, "loss": 0.0215, "step": 3930 }, { "epoch": 246.25, "grad_norm": 1.4040555953979492, "learning_rate": 7.878e-05, "loss": 0.0219, "step": 3940 }, { "epoch": 246.875, "grad_norm": 1.6972527503967285, "learning_rate": 7.897999999999999e-05, "loss": 0.0247, "step": 3950 }, { "epoch": 247.5, "grad_norm": 1.862613320350647, "learning_rate": 7.918e-05, "loss": 0.0235, "step": 3960 }, { "epoch": 248.125, "grad_norm": 1.9567930698394775, "learning_rate": 7.938e-05, "loss": 0.0243, "step": 3970 }, { "epoch": 248.75, "grad_norm": 1.8193110227584839, "learning_rate": 7.958e-05, "loss": 0.0232, "step": 3980 }, { "epoch": 249.375, "grad_norm": 1.8279744386672974, "learning_rate": 7.978e-05, "loss": 0.022, "step": 3990 }, { "epoch": 250.0, "grad_norm": 1.9170351028442383, "learning_rate": 7.998e-05, "loss": 0.0238, "step": 4000 }, { "epoch": 250.625, "grad_norm": 1.7806050777435303, "learning_rate": 8.018e-05, "loss": 0.0224, "step": 4010 }, { "epoch": 251.25, "grad_norm": 1.618657112121582, "learning_rate": 8.038e-05, "loss": 0.0228, "step": 4020 }, { "epoch": 251.875, "grad_norm": 1.3448606729507446, "learning_rate": 8.058e-05, "loss": 0.0213, "step": 4030 }, { "epoch": 252.5, "grad_norm": 2.1564993858337402, "learning_rate": 8.078e-05, "loss": 0.0224, "step": 4040 }, { "epoch": 253.125, "grad_norm": 1.9321818351745605, "learning_rate": 8.098e-05, "loss": 0.0258, "step": 4050 }, { "epoch": 253.75, "grad_norm": 1.6877397298812866, "learning_rate": 8.118e-05, "loss": 0.0235, "step": 4060 }, { "epoch": 254.375, "grad_norm": 1.899335265159607, "learning_rate": 8.138e-05, "loss": 0.0243, "step": 4070 }, { "epoch": 255.0, "grad_norm": 1.6680128574371338, "learning_rate": 8.158e-05, "loss": 0.0247, "step": 4080 }, { "epoch": 255.625, "grad_norm": 1.4403914213180542, "learning_rate": 8.178e-05, "loss": 0.0224, "step": 4090 }, { "epoch": 256.25, "grad_norm": 1.8112647533416748, "learning_rate": 8.198e-05, "loss": 0.022, "step": 4100 }, { "epoch": 256.875, "grad_norm": 1.6451849937438965, "learning_rate": 8.218e-05, "loss": 0.0216, "step": 4110 }, { "epoch": 257.5, "grad_norm": 1.4328521490097046, "learning_rate": 8.238000000000001e-05, "loss": 0.0217, "step": 4120 }, { "epoch": 258.125, "grad_norm": 1.8865714073181152, "learning_rate": 8.258e-05, "loss": 0.0218, "step": 4130 }, { "epoch": 258.75, "grad_norm": 1.6151604652404785, "learning_rate": 8.278e-05, "loss": 0.0207, "step": 4140 }, { "epoch": 259.375, "grad_norm": 1.576856017112732, "learning_rate": 8.298000000000001e-05, "loss": 0.0227, "step": 4150 }, { "epoch": 260.0, "grad_norm": 1.9383561611175537, "learning_rate": 8.318e-05, "loss": 0.0211, "step": 4160 }, { "epoch": 260.625, "grad_norm": 1.417213797569275, "learning_rate": 8.338e-05, "loss": 0.0218, "step": 4170 }, { "epoch": 261.25, "grad_norm": 1.4880584478378296, "learning_rate": 8.358e-05, "loss": 0.0215, "step": 4180 }, { "epoch": 261.875, "grad_norm": 1.7698973417282104, "learning_rate": 8.378e-05, "loss": 0.0209, "step": 4190 }, { "epoch": 262.5, "grad_norm": 1.4688743352890015, "learning_rate": 8.398e-05, "loss": 0.022, "step": 4200 }, { "epoch": 263.125, "grad_norm": 1.563480019569397, "learning_rate": 8.418e-05, "loss": 0.021, "step": 4210 }, { "epoch": 263.75, "grad_norm": 1.6026536226272583, "learning_rate": 8.438e-05, "loss": 0.0196, "step": 4220 }, { "epoch": 264.375, "grad_norm": 1.390167236328125, "learning_rate": 8.458e-05, "loss": 0.0203, "step": 4230 }, { "epoch": 265.0, "grad_norm": 1.3945834636688232, "learning_rate": 8.478e-05, "loss": 0.0187, "step": 4240 }, { "epoch": 265.625, "grad_norm": 1.6028813123703003, "learning_rate": 8.498e-05, "loss": 0.0211, "step": 4250 }, { "epoch": 266.25, "grad_norm": 1.5985839366912842, "learning_rate": 8.518000000000001e-05, "loss": 0.021, "step": 4260 }, { "epoch": 266.875, "grad_norm": 1.3894219398498535, "learning_rate": 8.538e-05, "loss": 0.0203, "step": 4270 }, { "epoch": 267.5, "grad_norm": 1.9198909997940063, "learning_rate": 8.558e-05, "loss": 0.0217, "step": 4280 }, { "epoch": 268.125, "grad_norm": 1.6992826461791992, "learning_rate": 8.578000000000001e-05, "loss": 0.0218, "step": 4290 }, { "epoch": 268.75, "grad_norm": 1.5295377969741821, "learning_rate": 8.598e-05, "loss": 0.0209, "step": 4300 }, { "epoch": 269.375, "grad_norm": 1.9647233486175537, "learning_rate": 8.618e-05, "loss": 0.0208, "step": 4310 }, { "epoch": 270.0, "grad_norm": 1.6796159744262695, "learning_rate": 8.638000000000001e-05, "loss": 0.0207, "step": 4320 }, { "epoch": 270.625, "grad_norm": 1.7937408685684204, "learning_rate": 8.658e-05, "loss": 0.0212, "step": 4330 }, { "epoch": 271.25, "grad_norm": 1.944583535194397, "learning_rate": 8.678e-05, "loss": 0.0214, "step": 4340 }, { "epoch": 271.875, "grad_norm": 1.501273512840271, "learning_rate": 8.698000000000001e-05, "loss": 0.0227, "step": 4350 }, { "epoch": 272.5, "grad_norm": 1.630289077758789, "learning_rate": 8.718e-05, "loss": 0.0187, "step": 4360 }, { "epoch": 273.125, "grad_norm": 1.558972716331482, "learning_rate": 8.738000000000001e-05, "loss": 0.0201, "step": 4370 }, { "epoch": 273.75, "grad_norm": 1.319100260734558, "learning_rate": 8.758000000000001e-05, "loss": 0.0199, "step": 4380 }, { "epoch": 274.375, "grad_norm": 1.687119722366333, "learning_rate": 8.778e-05, "loss": 0.0222, "step": 4390 }, { "epoch": 275.0, "grad_norm": 1.5666712522506714, "learning_rate": 8.798000000000001e-05, "loss": 0.0218, "step": 4400 }, { "epoch": 275.625, "grad_norm": 1.374186396598816, "learning_rate": 8.818000000000001e-05, "loss": 0.0201, "step": 4410 }, { "epoch": 276.25, "grad_norm": 1.4911551475524902, "learning_rate": 8.838e-05, "loss": 0.0197, "step": 4420 }, { "epoch": 276.875, "grad_norm": 1.391093134880066, "learning_rate": 8.858000000000001e-05, "loss": 0.019, "step": 4430 }, { "epoch": 277.5, "grad_norm": 1.5734379291534424, "learning_rate": 8.878000000000001e-05, "loss": 0.0182, "step": 4440 }, { "epoch": 278.125, "grad_norm": 1.5925443172454834, "learning_rate": 8.898e-05, "loss": 0.0196, "step": 4450 }, { "epoch": 278.75, "grad_norm": 1.6269075870513916, "learning_rate": 8.918000000000001e-05, "loss": 0.0211, "step": 4460 }, { "epoch": 279.375, "grad_norm": 1.5029900074005127, "learning_rate": 8.938e-05, "loss": 0.0208, "step": 4470 }, { "epoch": 280.0, "grad_norm": 1.56442391872406, "learning_rate": 8.958e-05, "loss": 0.0196, "step": 4480 }, { "epoch": 280.625, "grad_norm": 1.3483182191848755, "learning_rate": 8.978000000000001e-05, "loss": 0.0185, "step": 4490 }, { "epoch": 281.25, "grad_norm": 1.2249255180358887, "learning_rate": 8.998e-05, "loss": 0.0182, "step": 4500 }, { "epoch": 281.875, "grad_norm": 1.206023097038269, "learning_rate": 9.018000000000001e-05, "loss": 0.0191, "step": 4510 }, { "epoch": 282.5, "grad_norm": 1.9158329963684082, "learning_rate": 9.038000000000001e-05, "loss": 0.0202, "step": 4520 }, { "epoch": 283.125, "grad_norm": 1.6515963077545166, "learning_rate": 9.058e-05, "loss": 0.0199, "step": 4530 }, { "epoch": 283.75, "grad_norm": 1.7891855239868164, "learning_rate": 9.078000000000001e-05, "loss": 0.0213, "step": 4540 }, { "epoch": 284.375, "grad_norm": 1.5916194915771484, "learning_rate": 9.098000000000001e-05, "loss": 0.0204, "step": 4550 }, { "epoch": 285.0, "grad_norm": 1.6548500061035156, "learning_rate": 9.118e-05, "loss": 0.0206, "step": 4560 }, { "epoch": 285.625, "grad_norm": 1.7890138626098633, "learning_rate": 9.138e-05, "loss": 0.0216, "step": 4570 }, { "epoch": 286.25, "grad_norm": 1.3698619604110718, "learning_rate": 9.158e-05, "loss": 0.021, "step": 4580 }, { "epoch": 286.875, "grad_norm": 1.3164348602294922, "learning_rate": 9.178e-05, "loss": 0.0201, "step": 4590 }, { "epoch": 287.5, "grad_norm": 1.2602595090866089, "learning_rate": 9.198e-05, "loss": 0.0206, "step": 4600 }, { "epoch": 288.125, "grad_norm": 1.6356364488601685, "learning_rate": 9.218e-05, "loss": 0.0206, "step": 4610 }, { "epoch": 288.75, "grad_norm": 1.339037299156189, "learning_rate": 9.238e-05, "loss": 0.0194, "step": 4620 }, { "epoch": 289.375, "grad_norm": 1.5343581438064575, "learning_rate": 9.258e-05, "loss": 0.0214, "step": 4630 }, { "epoch": 290.0, "grad_norm": 1.7950295209884644, "learning_rate": 9.278e-05, "loss": 0.02, "step": 4640 }, { "epoch": 290.625, "grad_norm": 1.346240758895874, "learning_rate": 9.298e-05, "loss": 0.0202, "step": 4650 }, { "epoch": 291.25, "grad_norm": 1.1901124715805054, "learning_rate": 9.318e-05, "loss": 0.0176, "step": 4660 }, { "epoch": 291.875, "grad_norm": 1.3559141159057617, "learning_rate": 9.338e-05, "loss": 0.0165, "step": 4670 }, { "epoch": 292.5, "grad_norm": 1.342185139656067, "learning_rate": 9.358e-05, "loss": 0.0191, "step": 4680 }, { "epoch": 293.125, "grad_norm": 1.5401999950408936, "learning_rate": 9.378e-05, "loss": 0.0193, "step": 4690 }, { "epoch": 293.75, "grad_norm": 1.4412999153137207, "learning_rate": 9.398e-05, "loss": 0.0191, "step": 4700 }, { "epoch": 294.375, "grad_norm": 1.2340666055679321, "learning_rate": 9.418e-05, "loss": 0.0182, "step": 4710 }, { "epoch": 295.0, "grad_norm": 1.1816933155059814, "learning_rate": 9.438e-05, "loss": 0.0175, "step": 4720 }, { "epoch": 295.625, "grad_norm": 1.2440204620361328, "learning_rate": 9.458e-05, "loss": 0.0178, "step": 4730 }, { "epoch": 296.25, "grad_norm": 1.4980961084365845, "learning_rate": 9.478e-05, "loss": 0.0173, "step": 4740 }, { "epoch": 296.875, "grad_norm": 1.4015268087387085, "learning_rate": 9.498e-05, "loss": 0.0198, "step": 4750 }, { "epoch": 297.5, "grad_norm": 1.420882225036621, "learning_rate": 9.518000000000001e-05, "loss": 0.019, "step": 4760 }, { "epoch": 298.125, "grad_norm": 1.2662218809127808, "learning_rate": 9.538e-05, "loss": 0.0195, "step": 4770 }, { "epoch": 298.75, "grad_norm": 1.528330683708191, "learning_rate": 9.558e-05, "loss": 0.0213, "step": 4780 }, { "epoch": 299.375, "grad_norm": 1.3324357271194458, "learning_rate": 9.578000000000001e-05, "loss": 0.0194, "step": 4790 }, { "epoch": 300.0, "grad_norm": 1.3170146942138672, "learning_rate": 9.598e-05, "loss": 0.0186, "step": 4800 }, { "epoch": 300.625, "grad_norm": 1.4495036602020264, "learning_rate": 9.618e-05, "loss": 0.0178, "step": 4810 }, { "epoch": 301.25, "grad_norm": 1.6242793798446655, "learning_rate": 9.638000000000001e-05, "loss": 0.0204, "step": 4820 }, { "epoch": 301.875, "grad_norm": 1.4832464456558228, "learning_rate": 9.658e-05, "loss": 0.0203, "step": 4830 }, { "epoch": 302.5, "grad_norm": 1.3549563884735107, "learning_rate": 9.678e-05, "loss": 0.0177, "step": 4840 }, { "epoch": 303.125, "grad_norm": 1.804412841796875, "learning_rate": 9.698000000000001e-05, "loss": 0.021, "step": 4850 }, { "epoch": 303.75, "grad_norm": 1.5907257795333862, "learning_rate": 9.718e-05, "loss": 0.0209, "step": 4860 }, { "epoch": 304.375, "grad_norm": 1.4540935754776, "learning_rate": 9.738e-05, "loss": 0.017, "step": 4870 }, { "epoch": 305.0, "grad_norm": 1.223158597946167, "learning_rate": 9.758000000000001e-05, "loss": 0.018, "step": 4880 }, { "epoch": 305.625, "grad_norm": 1.2038943767547607, "learning_rate": 9.778e-05, "loss": 0.0176, "step": 4890 }, { "epoch": 306.25, "grad_norm": 1.110867977142334, "learning_rate": 9.798000000000001e-05, "loss": 0.0173, "step": 4900 }, { "epoch": 306.875, "grad_norm": 1.414939522743225, "learning_rate": 9.818000000000001e-05, "loss": 0.017, "step": 4910 }, { "epoch": 307.5, "grad_norm": 1.3866313695907593, "learning_rate": 9.838e-05, "loss": 0.02, "step": 4920 }, { "epoch": 308.125, "grad_norm": 1.5799922943115234, "learning_rate": 9.858000000000001e-05, "loss": 0.0162, "step": 4930 }, { "epoch": 308.75, "grad_norm": 1.261763334274292, "learning_rate": 9.878e-05, "loss": 0.0189, "step": 4940 }, { "epoch": 309.375, "grad_norm": 1.474787950515747, "learning_rate": 9.898e-05, "loss": 0.0181, "step": 4950 }, { "epoch": 310.0, "grad_norm": 1.287822961807251, "learning_rate": 9.918000000000001e-05, "loss": 0.0184, "step": 4960 }, { "epoch": 310.625, "grad_norm": 1.0713199377059937, "learning_rate": 9.938e-05, "loss": 0.0179, "step": 4970 }, { "epoch": 311.25, "grad_norm": 1.2200391292572021, "learning_rate": 9.958e-05, "loss": 0.018, "step": 4980 }, { "epoch": 311.875, "grad_norm": 1.5587009191513062, "learning_rate": 9.978000000000001e-05, "loss": 0.0199, "step": 4990 }, { "epoch": 312.5, "grad_norm": 1.4640460014343262, "learning_rate": 9.998e-05, "loss": 0.017, "step": 5000 }, { "epoch": 313.125, "grad_norm": 1.4215519428253174, "learning_rate": 9.999999778549045e-05, "loss": 0.0171, "step": 5010 }, { "epoch": 313.75, "grad_norm": 1.1879425048828125, "learning_rate": 9.999999013039593e-05, "loss": 0.016, "step": 5020 }, { "epoch": 314.375, "grad_norm": 1.231829047203064, "learning_rate": 9.999997700737766e-05, "loss": 0.0158, "step": 5030 }, { "epoch": 315.0, "grad_norm": 1.224221110343933, "learning_rate": 9.999995841643709e-05, "loss": 0.0164, "step": 5040 }, { "epoch": 315.625, "grad_norm": 1.491013765335083, "learning_rate": 9.999993435757623e-05, "loss": 0.0166, "step": 5050 }, { "epoch": 316.25, "grad_norm": 1.2551881074905396, "learning_rate": 9.999990483079773e-05, "loss": 0.0187, "step": 5060 }, { "epoch": 316.875, "grad_norm": 1.3919192552566528, "learning_rate": 9.999986983610481e-05, "loss": 0.0167, "step": 5070 }, { "epoch": 317.5, "grad_norm": 1.145408272743225, "learning_rate": 9.99998293735013e-05, "loss": 0.0169, "step": 5080 }, { "epoch": 318.125, "grad_norm": 1.5774271488189697, "learning_rate": 9.999978344299161e-05, "loss": 0.0171, "step": 5090 }, { "epoch": 318.75, "grad_norm": 1.4125555753707886, "learning_rate": 9.99997320445808e-05, "loss": 0.0191, "step": 5100 }, { "epoch": 319.375, "grad_norm": 1.110128402709961, "learning_rate": 9.999967517827444e-05, "loss": 0.0159, "step": 5110 }, { "epoch": 320.0, "grad_norm": 1.3442533016204834, "learning_rate": 9.999961284407879e-05, "loss": 0.0177, "step": 5120 }, { "epoch": 320.625, "grad_norm": 1.3384839296340942, "learning_rate": 9.999954504200067e-05, "loss": 0.0154, "step": 5130 }, { "epoch": 321.25, "grad_norm": 1.1482480764389038, "learning_rate": 9.999947177204744e-05, "loss": 0.0166, "step": 5140 }, { "epoch": 321.875, "grad_norm": 1.2519944906234741, "learning_rate": 9.999939303422718e-05, "loss": 0.0172, "step": 5150 }, { "epoch": 322.5, "grad_norm": 1.3870333433151245, "learning_rate": 9.999930882854847e-05, "loss": 0.0168, "step": 5160 }, { "epoch": 323.125, "grad_norm": 1.366909146308899, "learning_rate": 9.999921915502051e-05, "loss": 0.016, "step": 5170 }, { "epoch": 323.75, "grad_norm": 1.1931958198547363, "learning_rate": 9.99991240136531e-05, "loss": 0.0186, "step": 5180 }, { "epoch": 324.375, "grad_norm": 1.1246201992034912, "learning_rate": 9.999902340445668e-05, "loss": 0.0151, "step": 5190 }, { "epoch": 325.0, "grad_norm": 1.2969485521316528, "learning_rate": 9.999891732744224e-05, "loss": 0.0154, "step": 5200 }, { "epoch": 325.625, "grad_norm": 1.1869677305221558, "learning_rate": 9.999880578262135e-05, "loss": 0.0167, "step": 5210 }, { "epoch": 326.25, "grad_norm": 1.221058964729309, "learning_rate": 9.999868877000624e-05, "loss": 0.0164, "step": 5220 }, { "epoch": 326.875, "grad_norm": 1.2182931900024414, "learning_rate": 9.99985662896097e-05, "loss": 0.0175, "step": 5230 }, { "epoch": 327.5, "grad_norm": 1.2568279504776, "learning_rate": 9.999843834144513e-05, "loss": 0.0159, "step": 5240 }, { "epoch": 328.125, "grad_norm": 1.254540205001831, "learning_rate": 9.99983049255265e-05, "loss": 0.0161, "step": 5250 }, { "epoch": 328.75, "grad_norm": 1.2322643995285034, "learning_rate": 9.999816604186843e-05, "loss": 0.0168, "step": 5260 }, { "epoch": 329.375, "grad_norm": 0.9582310914993286, "learning_rate": 9.999802169048609e-05, "loss": 0.0149, "step": 5270 }, { "epoch": 330.0, "grad_norm": 0.900672197341919, "learning_rate": 9.999787187139527e-05, "loss": 0.0141, "step": 5280 }, { "epoch": 330.625, "grad_norm": 1.049651026725769, "learning_rate": 9.999771658461234e-05, "loss": 0.0153, "step": 5290 }, { "epoch": 331.25, "grad_norm": 1.0110572576522827, "learning_rate": 9.999755583015431e-05, "loss": 0.0145, "step": 5300 }, { "epoch": 331.875, "grad_norm": 1.1884170770645142, "learning_rate": 9.999738960803874e-05, "loss": 0.0152, "step": 5310 }, { "epoch": 332.5, "grad_norm": 1.4686788320541382, "learning_rate": 9.99972179182838e-05, "loss": 0.0136, "step": 5320 }, { "epoch": 333.125, "grad_norm": 1.0699830055236816, "learning_rate": 9.99970407609083e-05, "loss": 0.0161, "step": 5330 }, { "epoch": 333.75, "grad_norm": 1.8003672361373901, "learning_rate": 9.999685813593159e-05, "loss": 0.0177, "step": 5340 }, { "epoch": 334.375, "grad_norm": 1.38191556930542, "learning_rate": 9.999667004337362e-05, "loss": 0.0161, "step": 5350 }, { "epoch": 335.0, "grad_norm": 1.199036717414856, "learning_rate": 9.9996476483255e-05, "loss": 0.0164, "step": 5360 }, { "epoch": 335.625, "grad_norm": 1.1064685583114624, "learning_rate": 9.999627745559688e-05, "loss": 0.0153, "step": 5370 }, { "epoch": 336.25, "grad_norm": 0.968438982963562, "learning_rate": 9.999607296042101e-05, "loss": 0.015, "step": 5380 }, { "epoch": 336.875, "grad_norm": 1.3204340934753418, "learning_rate": 9.99958629977498e-05, "loss": 0.0144, "step": 5390 }, { "epoch": 337.5, "grad_norm": 1.0026376247406006, "learning_rate": 9.999564756760615e-05, "loss": 0.0144, "step": 5400 }, { "epoch": 338.125, "grad_norm": 1.094014048576355, "learning_rate": 9.999542667001366e-05, "loss": 0.0143, "step": 5410 }, { "epoch": 338.75, "grad_norm": 1.0915470123291016, "learning_rate": 9.999520030499647e-05, "loss": 0.0138, "step": 5420 }, { "epoch": 339.375, "grad_norm": 1.0048651695251465, "learning_rate": 9.999496847257936e-05, "loss": 0.0146, "step": 5430 }, { "epoch": 340.0, "grad_norm": 1.138767123222351, "learning_rate": 9.999473117278764e-05, "loss": 0.0162, "step": 5440 }, { "epoch": 340.625, "grad_norm": 1.3121551275253296, "learning_rate": 9.999448840564731e-05, "loss": 0.0144, "step": 5450 }, { "epoch": 341.25, "grad_norm": 1.2357908487319946, "learning_rate": 9.999424017118488e-05, "loss": 0.0155, "step": 5460 }, { "epoch": 341.875, "grad_norm": 1.4110485315322876, "learning_rate": 9.999398646942751e-05, "loss": 0.0171, "step": 5470 }, { "epoch": 342.5, "grad_norm": 1.231876015663147, "learning_rate": 9.999372730040296e-05, "loss": 0.0148, "step": 5480 }, { "epoch": 343.125, "grad_norm": 1.1513409614562988, "learning_rate": 9.999346266413953e-05, "loss": 0.0155, "step": 5490 }, { "epoch": 343.75, "grad_norm": 1.0324758291244507, "learning_rate": 9.99931925606662e-05, "loss": 0.0155, "step": 5500 }, { "epoch": 344.375, "grad_norm": 1.2001458406448364, "learning_rate": 9.99929169900125e-05, "loss": 0.0142, "step": 5510 }, { "epoch": 345.0, "grad_norm": 0.9090719819068909, "learning_rate": 9.999263595220855e-05, "loss": 0.0133, "step": 5520 }, { "epoch": 345.625, "grad_norm": 0.9517356157302856, "learning_rate": 9.99923494472851e-05, "loss": 0.0131, "step": 5530 }, { "epoch": 346.25, "grad_norm": 0.9557884931564331, "learning_rate": 9.999205747527348e-05, "loss": 0.0153, "step": 5540 }, { "epoch": 346.875, "grad_norm": 1.039165735244751, "learning_rate": 9.999176003620561e-05, "loss": 0.0141, "step": 5550 }, { "epoch": 347.5, "grad_norm": 0.930853545665741, "learning_rate": 9.999145713011405e-05, "loss": 0.0143, "step": 5560 }, { "epoch": 348.125, "grad_norm": 0.956095278263092, "learning_rate": 9.999114875703186e-05, "loss": 0.0141, "step": 5570 }, { "epoch": 348.75, "grad_norm": 0.771486222743988, "learning_rate": 9.999083491699281e-05, "loss": 0.0143, "step": 5580 }, { "epoch": 349.375, "grad_norm": 0.6893032193183899, "learning_rate": 9.999051561003123e-05, "loss": 0.0144, "step": 5590 }, { "epoch": 350.0, "grad_norm": 1.0121644735336304, "learning_rate": 9.999019083618202e-05, "loss": 0.0151, "step": 5600 }, { "epoch": 350.625, "grad_norm": 1.1058743000030518, "learning_rate": 9.99898605954807e-05, "loss": 0.0162, "step": 5610 }, { "epoch": 351.25, "grad_norm": 1.0109678506851196, "learning_rate": 9.998952488796338e-05, "loss": 0.015, "step": 5620 }, { "epoch": 351.875, "grad_norm": 0.8328022360801697, "learning_rate": 9.998918371366676e-05, "loss": 0.0142, "step": 5630 }, { "epoch": 352.5, "grad_norm": 0.836746096611023, "learning_rate": 9.99888370726282e-05, "loss": 0.0137, "step": 5640 }, { "epoch": 353.125, "grad_norm": 0.9082058072090149, "learning_rate": 9.998848496488556e-05, "loss": 0.0141, "step": 5650 }, { "epoch": 353.75, "grad_norm": 0.9380905628204346, "learning_rate": 9.998812739047736e-05, "loss": 0.0149, "step": 5660 }, { "epoch": 354.375, "grad_norm": 0.9345435500144958, "learning_rate": 9.99877643494427e-05, "loss": 0.0144, "step": 5670 }, { "epoch": 355.0, "grad_norm": 0.8377882242202759, "learning_rate": 9.998739584182128e-05, "loss": 0.0151, "step": 5680 }, { "epoch": 355.625, "grad_norm": 1.1241296529769897, "learning_rate": 9.998702186765342e-05, "loss": 0.0145, "step": 5690 }, { "epoch": 356.25, "grad_norm": 1.0022445917129517, "learning_rate": 9.998664242698e-05, "loss": 0.0137, "step": 5700 }, { "epoch": 356.875, "grad_norm": 1.14398992061615, "learning_rate": 9.998625751984251e-05, "loss": 0.0122, "step": 5710 }, { "epoch": 357.5, "grad_norm": 1.511240839958191, "learning_rate": 9.998586714628307e-05, "loss": 0.0141, "step": 5720 }, { "epoch": 358.125, "grad_norm": 1.257946252822876, "learning_rate": 9.998547130634432e-05, "loss": 0.0157, "step": 5730 }, { "epoch": 358.75, "grad_norm": 1.1702454090118408, "learning_rate": 9.99850700000696e-05, "loss": 0.0144, "step": 5740 }, { "epoch": 359.375, "grad_norm": 0.8067399859428406, "learning_rate": 9.998466322750278e-05, "loss": 0.0136, "step": 5750 }, { "epoch": 360.0, "grad_norm": 0.8550326228141785, "learning_rate": 9.998425098868834e-05, "loss": 0.0129, "step": 5760 }, { "epoch": 360.625, "grad_norm": 0.9919332265853882, "learning_rate": 9.998383328367136e-05, "loss": 0.013, "step": 5770 }, { "epoch": 361.25, "grad_norm": 0.9598110914230347, "learning_rate": 9.99834101124975e-05, "loss": 0.0136, "step": 5780 }, { "epoch": 361.875, "grad_norm": 0.8677031397819519, "learning_rate": 9.998298147521309e-05, "loss": 0.0137, "step": 5790 }, { "epoch": 362.5, "grad_norm": 0.9038897156715393, "learning_rate": 9.998254737186496e-05, "loss": 0.0124, "step": 5800 }, { "epoch": 363.125, "grad_norm": 0.9390170574188232, "learning_rate": 9.99821078025006e-05, "loss": 0.0119, "step": 5810 }, { "epoch": 363.75, "grad_norm": 1.011299967765808, "learning_rate": 9.998166276716807e-05, "loss": 0.0131, "step": 5820 }, { "epoch": 364.375, "grad_norm": 0.7727632522583008, "learning_rate": 9.998121226591606e-05, "loss": 0.0124, "step": 5830 }, { "epoch": 365.0, "grad_norm": 0.9111457467079163, "learning_rate": 9.998075629879382e-05, "loss": 0.0122, "step": 5840 }, { "epoch": 365.625, "grad_norm": 0.8254387378692627, "learning_rate": 9.99802948658512e-05, "loss": 0.0122, "step": 5850 }, { "epoch": 366.25, "grad_norm": 0.8419124484062195, "learning_rate": 9.99798279671387e-05, "loss": 0.0136, "step": 5860 }, { "epoch": 366.875, "grad_norm": 0.9950329661369324, "learning_rate": 9.997935560270734e-05, "loss": 0.0139, "step": 5870 }, { "epoch": 367.5, "grad_norm": 0.8446523547172546, "learning_rate": 9.997887777260879e-05, "loss": 0.0128, "step": 5880 }, { "epoch": 368.125, "grad_norm": 0.8795507550239563, "learning_rate": 9.997839447689532e-05, "loss": 0.0142, "step": 5890 }, { "epoch": 368.75, "grad_norm": 0.9794557094573975, "learning_rate": 9.997790571561978e-05, "loss": 0.0134, "step": 5900 }, { "epoch": 369.375, "grad_norm": 0.9027246236801147, "learning_rate": 9.99774114888356e-05, "loss": 0.0126, "step": 5910 }, { "epoch": 370.0, "grad_norm": 0.8756938576698303, "learning_rate": 9.997691179659684e-05, "loss": 0.014, "step": 5920 }, { "epoch": 370.625, "grad_norm": 1.2023380994796753, "learning_rate": 9.997640663895815e-05, "loss": 0.0131, "step": 5930 }, { "epoch": 371.25, "grad_norm": 1.141804814338684, "learning_rate": 9.997589601597477e-05, "loss": 0.015, "step": 5940 }, { "epoch": 371.875, "grad_norm": 0.9179847836494446, "learning_rate": 9.997537992770252e-05, "loss": 0.0126, "step": 5950 }, { "epoch": 372.5, "grad_norm": 0.8151926398277283, "learning_rate": 9.997485837419788e-05, "loss": 0.013, "step": 5960 }, { "epoch": 373.125, "grad_norm": 0.6601715683937073, "learning_rate": 9.997433135551786e-05, "loss": 0.0123, "step": 5970 }, { "epoch": 373.75, "grad_norm": 0.8281500935554504, "learning_rate": 9.997379887172009e-05, "loss": 0.0115, "step": 5980 }, { "epoch": 374.375, "grad_norm": 0.8727806806564331, "learning_rate": 9.997326092286281e-05, "loss": 0.0128, "step": 5990 }, { "epoch": 375.0, "grad_norm": 0.8489688038825989, "learning_rate": 9.997271750900486e-05, "loss": 0.0129, "step": 6000 }, { "epoch": 375.625, "grad_norm": 0.6510198712348938, "learning_rate": 9.997216863020565e-05, "loss": 0.0117, "step": 6010 }, { "epoch": 376.25, "grad_norm": 0.8793591856956482, "learning_rate": 9.99716142865252e-05, "loss": 0.012, "step": 6020 }, { "epoch": 376.875, "grad_norm": 0.7070950269699097, "learning_rate": 9.997105447802415e-05, "loss": 0.0118, "step": 6030 }, { "epoch": 377.5, "grad_norm": 0.8314371109008789, "learning_rate": 9.997048920476373e-05, "loss": 0.0118, "step": 6040 }, { "epoch": 378.125, "grad_norm": 0.761350154876709, "learning_rate": 9.996991846680572e-05, "loss": 0.0127, "step": 6050 }, { "epoch": 378.75, "grad_norm": 0.7484061121940613, "learning_rate": 9.996934226421257e-05, "loss": 0.0119, "step": 6060 }, { "epoch": 379.375, "grad_norm": 0.7929844260215759, "learning_rate": 9.996876059704726e-05, "loss": 0.012, "step": 6070 }, { "epoch": 380.0, "grad_norm": 0.8181713819503784, "learning_rate": 9.996817346537343e-05, "loss": 0.0142, "step": 6080 }, { "epoch": 380.625, "grad_norm": 0.9369438290596008, "learning_rate": 9.996758086925526e-05, "loss": 0.0132, "step": 6090 }, { "epoch": 381.25, "grad_norm": 0.8046433925628662, "learning_rate": 9.996698280875759e-05, "loss": 0.012, "step": 6100 }, { "epoch": 381.875, "grad_norm": 0.7803655862808228, "learning_rate": 9.99663792839458e-05, "loss": 0.0134, "step": 6110 }, { "epoch": 382.5, "grad_norm": 0.7660366296768188, "learning_rate": 9.99657702948859e-05, "loss": 0.0124, "step": 6120 }, { "epoch": 383.125, "grad_norm": 0.6417670845985413, "learning_rate": 9.996515584164448e-05, "loss": 0.012, "step": 6130 }, { "epoch": 383.75, "grad_norm": 0.8960108160972595, "learning_rate": 9.996453592428873e-05, "loss": 0.0117, "step": 6140 }, { "epoch": 384.375, "grad_norm": 0.8871966600418091, "learning_rate": 9.996391054288646e-05, "loss": 0.0116, "step": 6150 }, { "epoch": 385.0, "grad_norm": 0.8760678172111511, "learning_rate": 9.996327969750605e-05, "loss": 0.0117, "step": 6160 }, { "epoch": 385.625, "grad_norm": 0.865280032157898, "learning_rate": 9.996264338821649e-05, "loss": 0.011, "step": 6170 }, { "epoch": 386.25, "grad_norm": 1.1085981130599976, "learning_rate": 9.996200161508735e-05, "loss": 0.0128, "step": 6180 }, { "epoch": 386.875, "grad_norm": 1.0455905199050903, "learning_rate": 9.996135437818885e-05, "loss": 0.0121, "step": 6190 }, { "epoch": 387.5, "grad_norm": 0.8136721253395081, "learning_rate": 9.996070167759175e-05, "loss": 0.013, "step": 6200 }, { "epoch": 388.125, "grad_norm": 0.7488872408866882, "learning_rate": 9.996004351336743e-05, "loss": 0.0126, "step": 6210 }, { "epoch": 388.75, "grad_norm": 0.8310092091560364, "learning_rate": 9.995937988558785e-05, "loss": 0.0136, "step": 6220 }, { "epoch": 389.375, "grad_norm": 0.8811050653457642, "learning_rate": 9.995871079432561e-05, "loss": 0.0132, "step": 6230 }, { "epoch": 390.0, "grad_norm": 0.9369884133338928, "learning_rate": 9.995803623965389e-05, "loss": 0.0133, "step": 6240 }, { "epoch": 390.625, "grad_norm": 0.9472755193710327, "learning_rate": 9.995735622164641e-05, "loss": 0.0132, "step": 6250 }, { "epoch": 391.25, "grad_norm": 1.1913206577301025, "learning_rate": 9.995667074037758e-05, "loss": 0.0134, "step": 6260 }, { "epoch": 391.875, "grad_norm": 0.8896439075469971, "learning_rate": 9.995597979592232e-05, "loss": 0.0134, "step": 6270 }, { "epoch": 392.5, "grad_norm": 0.8965170383453369, "learning_rate": 9.995528338835625e-05, "loss": 0.0124, "step": 6280 }, { "epoch": 393.125, "grad_norm": 0.8789317011833191, "learning_rate": 9.995458151775547e-05, "loss": 0.0126, "step": 6290 }, { "epoch": 393.75, "grad_norm": 0.7865223288536072, "learning_rate": 9.995387418419677e-05, "loss": 0.0119, "step": 6300 }, { "epoch": 394.375, "grad_norm": 0.7527452111244202, "learning_rate": 9.99531613877575e-05, "loss": 0.0118, "step": 6310 }, { "epoch": 395.0, "grad_norm": 0.7900567650794983, "learning_rate": 9.995244312851559e-05, "loss": 0.0116, "step": 6320 }, { "epoch": 395.625, "grad_norm": 0.7366781234741211, "learning_rate": 9.995171940654961e-05, "loss": 0.0112, "step": 6330 }, { "epoch": 396.25, "grad_norm": 0.8073196411132812, "learning_rate": 9.995099022193871e-05, "loss": 0.0116, "step": 6340 }, { "epoch": 396.875, "grad_norm": 0.924555242061615, "learning_rate": 9.995025557476261e-05, "loss": 0.0109, "step": 6350 }, { "epoch": 397.5, "grad_norm": 0.8284614682197571, "learning_rate": 9.994951546510165e-05, "loss": 0.0117, "step": 6360 }, { "epoch": 398.125, "grad_norm": 0.8100062012672424, "learning_rate": 9.994876989303679e-05, "loss": 0.0127, "step": 6370 }, { "epoch": 398.75, "grad_norm": 0.9377039670944214, "learning_rate": 9.994801885864955e-05, "loss": 0.0122, "step": 6380 }, { "epoch": 399.375, "grad_norm": 0.9842908978462219, "learning_rate": 9.994726236202205e-05, "loss": 0.013, "step": 6390 }, { "epoch": 400.0, "grad_norm": 1.1019262075424194, "learning_rate": 9.994650040323704e-05, "loss": 0.0134, "step": 6400 }, { "epoch": 400.625, "grad_norm": 1.0751221179962158, "learning_rate": 9.994573298237784e-05, "loss": 0.0118, "step": 6410 }, { "epoch": 401.25, "grad_norm": 0.898923933506012, "learning_rate": 9.994496009952837e-05, "loss": 0.012, "step": 6420 }, { "epoch": 401.875, "grad_norm": 0.8281941413879395, "learning_rate": 9.994418175477316e-05, "loss": 0.0124, "step": 6430 }, { "epoch": 402.5, "grad_norm": 0.692079484462738, "learning_rate": 9.994339794819733e-05, "loss": 0.011, "step": 6440 }, { "epoch": 403.125, "grad_norm": 0.7526706457138062, "learning_rate": 9.994260867988658e-05, "loss": 0.0121, "step": 6450 }, { "epoch": 403.75, "grad_norm": 0.8704769015312195, "learning_rate": 9.994181394992723e-05, "loss": 0.0109, "step": 6460 }, { "epoch": 404.375, "grad_norm": 0.8282954096794128, "learning_rate": 9.994101375840618e-05, "loss": 0.0107, "step": 6470 }, { "epoch": 405.0, "grad_norm": 0.7742241621017456, "learning_rate": 9.994020810541098e-05, "loss": 0.0115, "step": 6480 }, { "epoch": 405.625, "grad_norm": 0.7262750267982483, "learning_rate": 9.99393969910297e-05, "loss": 0.011, "step": 6490 }, { "epoch": 406.25, "grad_norm": 0.8099271655082703, "learning_rate": 9.993858041535104e-05, "loss": 0.0126, "step": 6500 }, { "epoch": 406.875, "grad_norm": 0.8308644890785217, "learning_rate": 9.99377583784643e-05, "loss": 0.0119, "step": 6510 }, { "epoch": 407.5, "grad_norm": 0.900124728679657, "learning_rate": 9.993693088045939e-05, "loss": 0.0112, "step": 6520 }, { "epoch": 408.125, "grad_norm": 0.8921932578086853, "learning_rate": 9.99360979214268e-05, "loss": 0.0112, "step": 6530 }, { "epoch": 408.75, "grad_norm": 0.9405972361564636, "learning_rate": 9.99352595014576e-05, "loss": 0.0107, "step": 6540 }, { "epoch": 409.375, "grad_norm": 0.8436768651008606, "learning_rate": 9.993441562064354e-05, "loss": 0.0113, "step": 6550 }, { "epoch": 410.0, "grad_norm": 0.804934024810791, "learning_rate": 9.993356627907685e-05, "loss": 0.0117, "step": 6560 }, { "epoch": 410.625, "grad_norm": 0.945950984954834, "learning_rate": 9.99327114768504e-05, "loss": 0.0125, "step": 6570 }, { "epoch": 411.25, "grad_norm": 0.925611674785614, "learning_rate": 9.99318512140577e-05, "loss": 0.0121, "step": 6580 }, { "epoch": 411.875, "grad_norm": 0.9319164156913757, "learning_rate": 9.993098549079284e-05, "loss": 0.012, "step": 6590 }, { "epoch": 412.5, "grad_norm": 1.0740889310836792, "learning_rate": 9.993011430715047e-05, "loss": 0.0137, "step": 6600 }, { "epoch": 413.125, "grad_norm": 1.1442779302597046, "learning_rate": 9.992923766322586e-05, "loss": 0.0125, "step": 6610 }, { "epoch": 413.75, "grad_norm": 0.8353562355041504, "learning_rate": 9.99283555591149e-05, "loss": 0.0119, "step": 6620 }, { "epoch": 414.375, "grad_norm": 0.720020592212677, "learning_rate": 9.992746799491404e-05, "loss": 0.012, "step": 6630 }, { "epoch": 415.0, "grad_norm": 0.7117792367935181, "learning_rate": 9.992657497072033e-05, "loss": 0.0118, "step": 6640 }, { "epoch": 415.625, "grad_norm": 0.8013281226158142, "learning_rate": 9.992567648663147e-05, "loss": 0.0134, "step": 6650 }, { "epoch": 416.25, "grad_norm": 0.8130918145179749, "learning_rate": 9.992477254274568e-05, "loss": 0.0118, "step": 6660 }, { "epoch": 416.875, "grad_norm": 0.7213727235794067, "learning_rate": 9.992386313916183e-05, "loss": 0.0111, "step": 6670 }, { "epoch": 417.5, "grad_norm": 0.6564821004867554, "learning_rate": 9.992294827597934e-05, "loss": 0.0113, "step": 6680 }, { "epoch": 418.125, "grad_norm": 0.6905478239059448, "learning_rate": 9.992202795329831e-05, "loss": 0.012, "step": 6690 }, { "epoch": 418.75, "grad_norm": 0.821371853351593, "learning_rate": 9.992110217121936e-05, "loss": 0.0128, "step": 6700 }, { "epoch": 419.375, "grad_norm": 0.8834856152534485, "learning_rate": 9.992017092984372e-05, "loss": 0.0106, "step": 6710 }, { "epoch": 420.0, "grad_norm": 0.8281375765800476, "learning_rate": 9.991923422927326e-05, "loss": 0.0115, "step": 6720 }, { "epoch": 420.625, "grad_norm": 0.7310401797294617, "learning_rate": 9.991829206961037e-05, "loss": 0.0101, "step": 6730 }, { "epoch": 421.25, "grad_norm": 0.7845788598060608, "learning_rate": 9.991734445095813e-05, "loss": 0.0105, "step": 6740 }, { "epoch": 421.875, "grad_norm": 0.8412182331085205, "learning_rate": 9.991639137342015e-05, "loss": 0.0111, "step": 6750 }, { "epoch": 422.5, "grad_norm": 0.7537260055541992, "learning_rate": 9.991543283710064e-05, "loss": 0.0113, "step": 6760 }, { "epoch": 423.125, "grad_norm": 0.6647925972938538, "learning_rate": 9.991446884210445e-05, "loss": 0.0119, "step": 6770 }, { "epoch": 423.75, "grad_norm": 0.7035212516784668, "learning_rate": 9.9913499388537e-05, "loss": 0.0097, "step": 6780 }, { "epoch": 424.375, "grad_norm": 0.7553647756576538, "learning_rate": 9.99125244765043e-05, "loss": 0.01, "step": 6790 }, { "epoch": 425.0, "grad_norm": 0.7420441508293152, "learning_rate": 9.991154410611296e-05, "loss": 0.0114, "step": 6800 }, { "epoch": 425.625, "grad_norm": 0.6657722592353821, "learning_rate": 9.99105582774702e-05, "loss": 0.0109, "step": 6810 }, { "epoch": 426.25, "grad_norm": 0.7254708409309387, "learning_rate": 9.990956699068384e-05, "loss": 0.0116, "step": 6820 }, { "epoch": 426.875, "grad_norm": 0.8595172166824341, "learning_rate": 9.990857024586224e-05, "loss": 0.0113, "step": 6830 }, { "epoch": 427.5, "grad_norm": 0.9384058117866516, "learning_rate": 9.990756804311446e-05, "loss": 0.0112, "step": 6840 }, { "epoch": 428.125, "grad_norm": 0.8805230855941772, "learning_rate": 9.990656038255006e-05, "loss": 0.0097, "step": 6850 }, { "epoch": 428.75, "grad_norm": 0.8175788521766663, "learning_rate": 9.990554726427926e-05, "loss": 0.0111, "step": 6860 }, { "epoch": 429.375, "grad_norm": 0.8853816390037537, "learning_rate": 9.990452868841284e-05, "loss": 0.0119, "step": 6870 }, { "epoch": 430.0, "grad_norm": 0.8857107758522034, "learning_rate": 9.99035046550622e-05, "loss": 0.0111, "step": 6880 }, { "epoch": 430.625, "grad_norm": 0.7299500107765198, "learning_rate": 9.99024751643393e-05, "loss": 0.0113, "step": 6890 }, { "epoch": 431.25, "grad_norm": 0.6400433778762817, "learning_rate": 9.990144021635677e-05, "loss": 0.0106, "step": 6900 }, { "epoch": 431.875, "grad_norm": 0.6998341083526611, "learning_rate": 9.990039981122775e-05, "loss": 0.0117, "step": 6910 }, { "epoch": 432.5, "grad_norm": 0.6614553928375244, "learning_rate": 9.989935394906602e-05, "loss": 0.0108, "step": 6920 }, { "epoch": 433.125, "grad_norm": 0.8393372893333435, "learning_rate": 9.989830262998598e-05, "loss": 0.013, "step": 6930 }, { "epoch": 433.75, "grad_norm": 0.7657507061958313, "learning_rate": 9.989724585410259e-05, "loss": 0.0115, "step": 6940 }, { "epoch": 434.375, "grad_norm": 0.6534095406532288, "learning_rate": 9.989618362153139e-05, "loss": 0.0116, "step": 6950 }, { "epoch": 435.0, "grad_norm": 0.5554938316345215, "learning_rate": 9.989511593238859e-05, "loss": 0.0101, "step": 6960 }, { "epoch": 435.625, "grad_norm": 0.633482813835144, "learning_rate": 9.98940427867909e-05, "loss": 0.0105, "step": 6970 }, { "epoch": 436.25, "grad_norm": 0.5705388784408569, "learning_rate": 9.989296418485573e-05, "loss": 0.0127, "step": 6980 }, { "epoch": 436.875, "grad_norm": 0.560118556022644, "learning_rate": 9.989188012670101e-05, "loss": 0.0102, "step": 6990 }, { "epoch": 437.5, "grad_norm": 0.5680054426193237, "learning_rate": 9.989079061244528e-05, "loss": 0.0108, "step": 7000 }, { "epoch": 438.125, "grad_norm": 0.6862987875938416, "learning_rate": 9.988969564220769e-05, "loss": 0.011, "step": 7010 }, { "epoch": 438.75, "grad_norm": 0.6537038683891296, "learning_rate": 9.988859521610801e-05, "loss": 0.011, "step": 7020 }, { "epoch": 439.375, "grad_norm": 0.7102747559547424, "learning_rate": 9.988748933426656e-05, "loss": 0.0114, "step": 7030 }, { "epoch": 440.0, "grad_norm": 0.7743424773216248, "learning_rate": 9.988637799680428e-05, "loss": 0.0114, "step": 7040 }, { "epoch": 440.625, "grad_norm": 0.7385320663452148, "learning_rate": 9.98852612038427e-05, "loss": 0.0102, "step": 7050 }, { "epoch": 441.25, "grad_norm": 0.7324809432029724, "learning_rate": 9.988413895550397e-05, "loss": 0.0095, "step": 7060 }, { "epoch": 441.875, "grad_norm": 0.6916730999946594, "learning_rate": 9.98830112519108e-05, "loss": 0.012, "step": 7070 }, { "epoch": 442.5, "grad_norm": 0.5611207485198975, "learning_rate": 9.98818780931865e-05, "loss": 0.0099, "step": 7080 }, { "epoch": 443.125, "grad_norm": 0.6533907055854797, "learning_rate": 9.988073947945502e-05, "loss": 0.0097, "step": 7090 }, { "epoch": 443.75, "grad_norm": 0.8114432096481323, "learning_rate": 9.987959541084087e-05, "loss": 0.0096, "step": 7100 }, { "epoch": 444.375, "grad_norm": 0.5615887641906738, "learning_rate": 9.987844588746915e-05, "loss": 0.0085, "step": 7110 }, { "epoch": 445.0, "grad_norm": 0.6930294632911682, "learning_rate": 9.987729090946558e-05, "loss": 0.0096, "step": 7120 }, { "epoch": 445.625, "grad_norm": 0.7661396265029907, "learning_rate": 9.987613047695647e-05, "loss": 0.0099, "step": 7130 }, { "epoch": 446.25, "grad_norm": 0.7148370146751404, "learning_rate": 9.987496459006871e-05, "loss": 0.0092, "step": 7140 }, { "epoch": 446.875, "grad_norm": 0.9166419506072998, "learning_rate": 9.987379324892982e-05, "loss": 0.0113, "step": 7150 }, { "epoch": 447.5, "grad_norm": 0.8479866981506348, "learning_rate": 9.987261645366788e-05, "loss": 0.0101, "step": 7160 }, { "epoch": 448.125, "grad_norm": 0.6642943620681763, "learning_rate": 9.987143420441158e-05, "loss": 0.01, "step": 7170 }, { "epoch": 448.75, "grad_norm": 0.6536929607391357, "learning_rate": 9.987024650129022e-05, "loss": 0.0102, "step": 7180 }, { "epoch": 449.375, "grad_norm": 0.8466352820396423, "learning_rate": 9.986905334443368e-05, "loss": 0.0117, "step": 7190 }, { "epoch": 450.0, "grad_norm": 0.7626696228981018, "learning_rate": 9.986785473397245e-05, "loss": 0.0103, "step": 7200 }, { "epoch": 450.625, "grad_norm": 0.7776815891265869, "learning_rate": 9.98666506700376e-05, "loss": 0.0115, "step": 7210 }, { "epoch": 451.25, "grad_norm": 1.0069994926452637, "learning_rate": 9.986544115276081e-05, "loss": 0.0128, "step": 7220 }, { "epoch": 451.875, "grad_norm": 0.8917898535728455, "learning_rate": 9.986422618227433e-05, "loss": 0.0109, "step": 7230 }, { "epoch": 452.5, "grad_norm": 0.7967373728752136, "learning_rate": 9.986300575871106e-05, "loss": 0.0116, "step": 7240 }, { "epoch": 453.125, "grad_norm": 0.6768915057182312, "learning_rate": 9.986177988220444e-05, "loss": 0.0099, "step": 7250 }, { "epoch": 453.75, "grad_norm": 0.7261281609535217, "learning_rate": 9.986054855288856e-05, "loss": 0.0103, "step": 7260 }, { "epoch": 454.375, "grad_norm": 0.7023577094078064, "learning_rate": 9.985931177089802e-05, "loss": 0.0112, "step": 7270 }, { "epoch": 455.0, "grad_norm": 0.5902547836303711, "learning_rate": 9.985806953636814e-05, "loss": 0.0098, "step": 7280 }, { "epoch": 455.625, "grad_norm": 0.6153225302696228, "learning_rate": 9.985682184943471e-05, "loss": 0.0111, "step": 7290 }, { "epoch": 456.25, "grad_norm": 0.6180372834205627, "learning_rate": 9.98555687102342e-05, "loss": 0.0096, "step": 7300 }, { "epoch": 456.875, "grad_norm": 0.7004512548446655, "learning_rate": 9.985431011890367e-05, "loss": 0.0107, "step": 7310 }, { "epoch": 457.5, "grad_norm": 0.8018707036972046, "learning_rate": 9.985304607558075e-05, "loss": 0.0104, "step": 7320 }, { "epoch": 458.125, "grad_norm": 0.6335276365280151, "learning_rate": 9.985177658040364e-05, "loss": 0.0102, "step": 7330 }, { "epoch": 458.75, "grad_norm": 0.8146379590034485, "learning_rate": 9.985050163351119e-05, "loss": 0.0106, "step": 7340 }, { "epoch": 459.375, "grad_norm": 0.7131094336509705, "learning_rate": 9.984922123504286e-05, "loss": 0.0093, "step": 7350 }, { "epoch": 460.0, "grad_norm": 0.647261381149292, "learning_rate": 9.984793538513862e-05, "loss": 0.0103, "step": 7360 }, { "epoch": 460.625, "grad_norm": 0.6319265961647034, "learning_rate": 9.984664408393912e-05, "loss": 0.01, "step": 7370 }, { "epoch": 461.25, "grad_norm": 0.5086030960083008, "learning_rate": 9.984534733158556e-05, "loss": 0.0105, "step": 7380 }, { "epoch": 461.875, "grad_norm": 0.6072356104850769, "learning_rate": 9.984404512821977e-05, "loss": 0.0089, "step": 7390 }, { "epoch": 462.5, "grad_norm": 0.6429985165596008, "learning_rate": 9.984273747398411e-05, "loss": 0.0102, "step": 7400 }, { "epoch": 463.125, "grad_norm": 0.5790389776229858, "learning_rate": 9.984142436902165e-05, "loss": 0.0104, "step": 7410 }, { "epoch": 463.75, "grad_norm": 0.701302170753479, "learning_rate": 9.984010581347596e-05, "loss": 0.0089, "step": 7420 }, { "epoch": 464.375, "grad_norm": 0.6150535941123962, "learning_rate": 9.983878180749121e-05, "loss": 0.0098, "step": 7430 }, { "epoch": 465.0, "grad_norm": 0.6264737248420715, "learning_rate": 9.983745235121222e-05, "loss": 0.0093, "step": 7440 }, { "epoch": 465.625, "grad_norm": 0.5422685146331787, "learning_rate": 9.983611744478438e-05, "loss": 0.0104, "step": 7450 }, { "epoch": 466.25, "grad_norm": 0.6225709915161133, "learning_rate": 9.983477708835365e-05, "loss": 0.0101, "step": 7460 }, { "epoch": 466.875, "grad_norm": 0.5819153785705566, "learning_rate": 9.983343128206664e-05, "loss": 0.0106, "step": 7470 }, { "epoch": 467.5, "grad_norm": 0.7224307060241699, "learning_rate": 9.983208002607049e-05, "loss": 0.0107, "step": 7480 }, { "epoch": 468.125, "grad_norm": 0.7039912939071655, "learning_rate": 9.9830723320513e-05, "loss": 0.0103, "step": 7490 }, { "epoch": 468.75, "grad_norm": 0.6855049133300781, "learning_rate": 9.982936116554254e-05, "loss": 0.0088, "step": 7500 }, { "epoch": 469.375, "grad_norm": 0.6290692687034607, "learning_rate": 9.982799356130803e-05, "loss": 0.0106, "step": 7510 }, { "epoch": 470.0, "grad_norm": 0.5659773945808411, "learning_rate": 9.982662050795908e-05, "loss": 0.0106, "step": 7520 }, { "epoch": 470.625, "grad_norm": 0.5781753063201904, "learning_rate": 9.982524200564583e-05, "loss": 0.0104, "step": 7530 }, { "epoch": 471.25, "grad_norm": 0.6644128561019897, "learning_rate": 9.982385805451901e-05, "loss": 0.0103, "step": 7540 }, { "epoch": 471.875, "grad_norm": 0.7858973145484924, "learning_rate": 9.982246865472998e-05, "loss": 0.0093, "step": 7550 }, { "epoch": 472.5, "grad_norm": 0.7751241326332092, "learning_rate": 9.982107380643069e-05, "loss": 0.0101, "step": 7560 }, { "epoch": 473.125, "grad_norm": 0.8384363055229187, "learning_rate": 9.981967350977368e-05, "loss": 0.0107, "step": 7570 }, { "epoch": 473.75, "grad_norm": 0.8584528565406799, "learning_rate": 9.981826776491208e-05, "loss": 0.0095, "step": 7580 }, { "epoch": 474.375, "grad_norm": 0.995509922504425, "learning_rate": 9.98168565719996e-05, "loss": 0.0115, "step": 7590 }, { "epoch": 475.0, "grad_norm": 0.8218001127243042, "learning_rate": 9.98154399311906e-05, "loss": 0.011, "step": 7600 }, { "epoch": 475.625, "grad_norm": 0.7269605994224548, "learning_rate": 9.981401784263997e-05, "loss": 0.0103, "step": 7610 }, { "epoch": 476.25, "grad_norm": 0.6630864143371582, "learning_rate": 9.981259030650326e-05, "loss": 0.0092, "step": 7620 }, { "epoch": 476.875, "grad_norm": 0.7081972360610962, "learning_rate": 9.981115732293655e-05, "loss": 0.0084, "step": 7630 }, { "epoch": 477.5, "grad_norm": 0.6908837556838989, "learning_rate": 9.980971889209659e-05, "loss": 0.0096, "step": 7640 }, { "epoch": 478.125, "grad_norm": 0.6863625645637512, "learning_rate": 9.980827501414064e-05, "loss": 0.0094, "step": 7650 }, { "epoch": 478.75, "grad_norm": 0.628754734992981, "learning_rate": 9.980682568922663e-05, "loss": 0.0087, "step": 7660 }, { "epoch": 479.375, "grad_norm": 0.6461851000785828, "learning_rate": 9.980537091751304e-05, "loss": 0.0091, "step": 7670 }, { "epoch": 480.0, "grad_norm": 0.6353027820587158, "learning_rate": 9.980391069915897e-05, "loss": 0.009, "step": 7680 }, { "epoch": 480.625, "grad_norm": 0.5868967175483704, "learning_rate": 9.98024450343241e-05, "loss": 0.0101, "step": 7690 }, { "epoch": 481.25, "grad_norm": 0.6688029766082764, "learning_rate": 9.980097392316872e-05, "loss": 0.0083, "step": 7700 }, { "epoch": 481.875, "grad_norm": 0.5620129108428955, "learning_rate": 9.97994973658537e-05, "loss": 0.0088, "step": 7710 }, { "epoch": 482.5, "grad_norm": 0.6990760564804077, "learning_rate": 9.979801536254054e-05, "loss": 0.008, "step": 7720 }, { "epoch": 483.125, "grad_norm": 0.5271959900856018, "learning_rate": 9.979652791339127e-05, "loss": 0.01, "step": 7730 }, { "epoch": 483.75, "grad_norm": 0.717219352722168, "learning_rate": 9.97950350185686e-05, "loss": 0.0104, "step": 7740 }, { "epoch": 484.375, "grad_norm": 0.5886634588241577, "learning_rate": 9.979353667823574e-05, "loss": 0.0086, "step": 7750 }, { "epoch": 485.0, "grad_norm": 0.7227773070335388, "learning_rate": 9.979203289255658e-05, "loss": 0.0094, "step": 7760 }, { "epoch": 485.625, "grad_norm": 0.6355369687080383, "learning_rate": 9.979052366169557e-05, "loss": 0.0098, "step": 7770 }, { "epoch": 486.25, "grad_norm": 0.6813123226165771, "learning_rate": 9.978900898581775e-05, "loss": 0.01, "step": 7780 }, { "epoch": 486.875, "grad_norm": 0.659970223903656, "learning_rate": 9.978748886508875e-05, "loss": 0.0088, "step": 7790 }, { "epoch": 487.5, "grad_norm": 0.7737880349159241, "learning_rate": 9.978596329967484e-05, "loss": 0.0106, "step": 7800 }, { "epoch": 488.125, "grad_norm": 0.7581619024276733, "learning_rate": 9.978443228974284e-05, "loss": 0.0087, "step": 7810 }, { "epoch": 488.75, "grad_norm": 0.7430512309074402, "learning_rate": 9.978289583546015e-05, "loss": 0.0093, "step": 7820 }, { "epoch": 489.375, "grad_norm": 0.6579586863517761, "learning_rate": 9.978135393699484e-05, "loss": 0.0092, "step": 7830 }, { "epoch": 490.0, "grad_norm": 0.6156346797943115, "learning_rate": 9.977980659451548e-05, "loss": 0.0099, "step": 7840 }, { "epoch": 490.625, "grad_norm": 0.6920315623283386, "learning_rate": 9.977825380819135e-05, "loss": 0.0101, "step": 7850 }, { "epoch": 491.25, "grad_norm": 0.7143272161483765, "learning_rate": 9.97766955781922e-05, "loss": 0.0102, "step": 7860 }, { "epoch": 491.875, "grad_norm": 0.6715136170387268, "learning_rate": 9.977513190468848e-05, "loss": 0.0092, "step": 7870 }, { "epoch": 492.5, "grad_norm": 0.792335569858551, "learning_rate": 9.977356278785116e-05, "loss": 0.0094, "step": 7880 }, { "epoch": 493.125, "grad_norm": 0.8089608550071716, "learning_rate": 9.977198822785184e-05, "loss": 0.0099, "step": 7890 }, { "epoch": 493.75, "grad_norm": 0.727393627166748, "learning_rate": 9.977040822486273e-05, "loss": 0.0093, "step": 7900 }, { "epoch": 494.375, "grad_norm": 0.7314863204956055, "learning_rate": 9.97688227790566e-05, "loss": 0.01, "step": 7910 }, { "epoch": 495.0, "grad_norm": 0.6197735667228699, "learning_rate": 9.976723189060684e-05, "loss": 0.0093, "step": 7920 }, { "epoch": 495.625, "grad_norm": 0.6258811950683594, "learning_rate": 9.976563555968742e-05, "loss": 0.0089, "step": 7930 }, { "epoch": 496.25, "grad_norm": 0.6613799929618835, "learning_rate": 9.976403378647292e-05, "loss": 0.0099, "step": 7940 }, { "epoch": 496.875, "grad_norm": 0.5219643115997314, "learning_rate": 9.97624265711385e-05, "loss": 0.0102, "step": 7950 }, { "epoch": 497.5, "grad_norm": 0.5938867330551147, "learning_rate": 9.976081391385993e-05, "loss": 0.0101, "step": 7960 }, { "epoch": 498.125, "grad_norm": 0.5493279099464417, "learning_rate": 9.975919581481356e-05, "loss": 0.01, "step": 7970 }, { "epoch": 498.75, "grad_norm": 0.5064048767089844, "learning_rate": 9.975757227417634e-05, "loss": 0.0092, "step": 7980 }, { "epoch": 499.375, "grad_norm": 0.5940008163452148, "learning_rate": 9.975594329212586e-05, "loss": 0.0097, "step": 7990 }, { "epoch": 500.0, "grad_norm": 0.5561034083366394, "learning_rate": 9.97543088688402e-05, "loss": 0.0092, "step": 8000 }, { "epoch": 500.625, "grad_norm": 0.587040901184082, "learning_rate": 9.975266900449814e-05, "loss": 0.0105, "step": 8010 }, { "epoch": 501.25, "grad_norm": 0.6578340530395508, "learning_rate": 9.975102369927898e-05, "loss": 0.0088, "step": 8020 }, { "epoch": 501.875, "grad_norm": 0.6301031708717346, "learning_rate": 9.974937295336269e-05, "loss": 0.0096, "step": 8030 }, { "epoch": 502.5, "grad_norm": 0.49646562337875366, "learning_rate": 9.974771676692975e-05, "loss": 0.0094, "step": 8040 }, { "epoch": 503.125, "grad_norm": 0.5952965021133423, "learning_rate": 9.974605514016131e-05, "loss": 0.0088, "step": 8050 }, { "epoch": 503.75, "grad_norm": 0.6772691607475281, "learning_rate": 9.974438807323907e-05, "loss": 0.0093, "step": 8060 }, { "epoch": 504.375, "grad_norm": 0.5597459673881531, "learning_rate": 9.974271556634535e-05, "loss": 0.0088, "step": 8070 }, { "epoch": 505.0, "grad_norm": 0.8469547033309937, "learning_rate": 9.974103761966302e-05, "loss": 0.0106, "step": 8080 }, { "epoch": 505.625, "grad_norm": 0.775303065776825, "learning_rate": 9.973935423337563e-05, "loss": 0.0097, "step": 8090 }, { "epoch": 506.25, "grad_norm": 0.7015887498855591, "learning_rate": 9.973766540766722e-05, "loss": 0.0095, "step": 8100 }, { "epoch": 506.875, "grad_norm": 0.6640006303787231, "learning_rate": 9.97359711427225e-05, "loss": 0.0111, "step": 8110 }, { "epoch": 507.5, "grad_norm": 0.6578481793403625, "learning_rate": 9.973427143872677e-05, "loss": 0.0088, "step": 8120 }, { "epoch": 508.125, "grad_norm": 0.6807109713554382, "learning_rate": 9.973256629586589e-05, "loss": 0.0102, "step": 8130 }, { "epoch": 508.75, "grad_norm": 0.5422506332397461, "learning_rate": 9.973085571432632e-05, "loss": 0.0101, "step": 8140 }, { "epoch": 509.375, "grad_norm": 0.5136811137199402, "learning_rate": 9.972913969429513e-05, "loss": 0.0097, "step": 8150 }, { "epoch": 510.0, "grad_norm": 0.693134069442749, "learning_rate": 9.972741823596e-05, "loss": 0.0094, "step": 8160 }, { "epoch": 510.625, "grad_norm": 0.611960232257843, "learning_rate": 9.972569133950917e-05, "loss": 0.0089, "step": 8170 }, { "epoch": 511.25, "grad_norm": 0.617396354675293, "learning_rate": 9.972395900513151e-05, "loss": 0.0088, "step": 8180 }, { "epoch": 511.875, "grad_norm": 0.6016327738761902, "learning_rate": 9.972222123301645e-05, "loss": 0.0095, "step": 8190 }, { "epoch": 512.5, "grad_norm": 0.5470365881919861, "learning_rate": 9.972047802335403e-05, "loss": 0.0096, "step": 8200 }, { "epoch": 513.125, "grad_norm": 0.6275759935379028, "learning_rate": 9.971872937633488e-05, "loss": 0.0085, "step": 8210 }, { "epoch": 513.75, "grad_norm": 0.5876614451408386, "learning_rate": 9.971697529215024e-05, "loss": 0.0093, "step": 8220 }, { "epoch": 514.375, "grad_norm": 0.57300865650177, "learning_rate": 9.971521577099192e-05, "loss": 0.0091, "step": 8230 }, { "epoch": 515.0, "grad_norm": 0.6590330600738525, "learning_rate": 9.971345081305236e-05, "loss": 0.0094, "step": 8240 }, { "epoch": 515.625, "grad_norm": 0.7168742418289185, "learning_rate": 9.971168041852456e-05, "loss": 0.0091, "step": 8250 }, { "epoch": 516.25, "grad_norm": 0.7002500295639038, "learning_rate": 9.970990458760215e-05, "loss": 0.0082, "step": 8260 }, { "epoch": 516.875, "grad_norm": 0.5979912877082825, "learning_rate": 9.970812332047929e-05, "loss": 0.0083, "step": 8270 }, { "epoch": 517.5, "grad_norm": 0.6995880603790283, "learning_rate": 9.97063366173508e-05, "loss": 0.0083, "step": 8280 }, { "epoch": 518.125, "grad_norm": 0.6054606437683105, "learning_rate": 9.970454447841207e-05, "loss": 0.0086, "step": 8290 }, { "epoch": 518.75, "grad_norm": 0.6761727333068848, "learning_rate": 9.970274690385909e-05, "loss": 0.0091, "step": 8300 }, { "epoch": 519.375, "grad_norm": 0.7297013401985168, "learning_rate": 9.970094389388844e-05, "loss": 0.0101, "step": 8310 }, { "epoch": 520.0, "grad_norm": 0.6933302879333496, "learning_rate": 9.969913544869728e-05, "loss": 0.009, "step": 8320 }, { "epoch": 520.625, "grad_norm": 0.632068932056427, "learning_rate": 9.96973215684834e-05, "loss": 0.0092, "step": 8330 }, { "epoch": 521.25, "grad_norm": 0.5213248133659363, "learning_rate": 9.969550225344513e-05, "loss": 0.0095, "step": 8340 }, { "epoch": 521.875, "grad_norm": 0.5387685298919678, "learning_rate": 9.969367750378147e-05, "loss": 0.0072, "step": 8350 }, { "epoch": 522.5, "grad_norm": 0.5790697336196899, "learning_rate": 9.969184731969194e-05, "loss": 0.0098, "step": 8360 }, { "epoch": 523.125, "grad_norm": 0.6181520819664001, "learning_rate": 9.96900117013767e-05, "loss": 0.0094, "step": 8370 }, { "epoch": 523.75, "grad_norm": 0.6647499799728394, "learning_rate": 9.96881706490365e-05, "loss": 0.0092, "step": 8380 }, { "epoch": 524.375, "grad_norm": 0.5274850726127625, "learning_rate": 9.968632416287265e-05, "loss": 0.0092, "step": 8390 }, { "epoch": 525.0, "grad_norm": 0.5954369902610779, "learning_rate": 9.96844722430871e-05, "loss": 0.0083, "step": 8400 }, { "epoch": 525.625, "grad_norm": 0.5637514591217041, "learning_rate": 9.968261488988235e-05, "loss": 0.0096, "step": 8410 }, { "epoch": 526.25, "grad_norm": 0.5467987656593323, "learning_rate": 9.968075210346155e-05, "loss": 0.0087, "step": 8420 }, { "epoch": 526.875, "grad_norm": 0.6766216158866882, "learning_rate": 9.967888388402839e-05, "loss": 0.0098, "step": 8430 }, { "epoch": 527.5, "grad_norm": 0.689804196357727, "learning_rate": 9.967701023178717e-05, "loss": 0.0094, "step": 8440 }, { "epoch": 528.125, "grad_norm": 0.6711739301681519, "learning_rate": 9.967513114694282e-05, "loss": 0.0098, "step": 8450 }, { "epoch": 528.75, "grad_norm": 0.7609061002731323, "learning_rate": 9.967324662970079e-05, "loss": 0.0091, "step": 8460 }, { "epoch": 529.375, "grad_norm": 0.6599430441856384, "learning_rate": 9.96713566802672e-05, "loss": 0.0096, "step": 8470 }, { "epoch": 530.0, "grad_norm": 0.6817207932472229, "learning_rate": 9.966946129884873e-05, "loss": 0.0093, "step": 8480 }, { "epoch": 530.625, "grad_norm": 0.8081104755401611, "learning_rate": 9.966756048565265e-05, "loss": 0.01, "step": 8490 }, { "epoch": 531.25, "grad_norm": 0.6982617378234863, "learning_rate": 9.966565424088681e-05, "loss": 0.0088, "step": 8500 } ], "logging_steps": 10, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 6250, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 256, "trial_name": null, "trial_params": null }