{ "best_global_step": 9800, "best_metric": 0.8585858585858586, "best_model_checkpoint": "ctsinov1/checkpoint-9800", "epoch": 49.02, "eval_steps": 500, "global_step": 17500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005714285714285715, "grad_norm": 6.513674259185791, "learning_rate": 5.142857142857143e-08, "loss": 0.696, "step": 10 }, { "epoch": 0.001142857142857143, "grad_norm": 2.7451493740081787, "learning_rate": 1.0857142857142857e-07, "loss": 0.6997, "step": 20 }, { "epoch": 0.0017142857142857142, "grad_norm": 2.719616174697876, "learning_rate": 1.657142857142857e-07, "loss": 0.6917, "step": 30 }, { "epoch": 0.002285714285714286, "grad_norm": 2.852513074874878, "learning_rate": 2.228571428571429e-07, "loss": 0.6825, "step": 40 }, { "epoch": 0.002857142857142857, "grad_norm": 6.225112438201904, "learning_rate": 2.8e-07, "loss": 0.685, "step": 50 }, { "epoch": 0.0034285714285714284, "grad_norm": 2.038743495941162, "learning_rate": 3.371428571428572e-07, "loss": 0.691, "step": 60 }, { "epoch": 0.004, "grad_norm": 2.501828908920288, "learning_rate": 3.9428571428571436e-07, "loss": 0.701, "step": 70 }, { "epoch": 0.004571428571428572, "grad_norm": 5.308904647827148, "learning_rate": 4.514285714285715e-07, "loss": 0.6841, "step": 80 }, { "epoch": 0.005142857142857143, "grad_norm": 2.171583890914917, "learning_rate": 5.085714285714286e-07, "loss": 0.6845, "step": 90 }, { "epoch": 0.005714285714285714, "grad_norm": 1.9283939599990845, "learning_rate": 5.657142857142857e-07, "loss": 0.6901, "step": 100 }, { "epoch": 0.006285714285714286, "grad_norm": 5.542513370513916, "learning_rate": 6.228571428571429e-07, "loss": 0.6768, "step": 110 }, { "epoch": 0.006857142857142857, "grad_norm": 6.506889820098877, "learning_rate": 6.800000000000001e-07, "loss": 0.666, "step": 120 }, { "epoch": 0.0074285714285714285, "grad_norm": 3.021516799926758, "learning_rate": 7.371428571428572e-07, "loss": 0.6799, "step": 130 }, { "epoch": 0.008, "grad_norm": 3.173678398132324, "learning_rate": 7.942857142857144e-07, "loss": 0.6749, "step": 140 }, { "epoch": 0.008571428571428572, "grad_norm": 6.398750305175781, "learning_rate": 8.514285714285716e-07, "loss": 0.7071, "step": 150 }, { "epoch": 0.009142857142857144, "grad_norm": 4.193422317504883, "learning_rate": 9.085714285714286e-07, "loss": 0.7018, "step": 160 }, { "epoch": 0.009714285714285713, "grad_norm": 6.120445728302002, "learning_rate": 9.657142857142857e-07, "loss": 0.6685, "step": 170 }, { "epoch": 0.010285714285714285, "grad_norm": 9.130088806152344, "learning_rate": 1.0228571428571429e-06, "loss": 0.6791, "step": 180 }, { "epoch": 0.010857142857142857, "grad_norm": 4.139962673187256, "learning_rate": 1.08e-06, "loss": 0.6318, "step": 190 }, { "epoch": 0.011428571428571429, "grad_norm": 10.299067497253418, "learning_rate": 1.1371428571428572e-06, "loss": 0.6648, "step": 200 }, { "epoch": 0.012, "grad_norm": 8.751834869384766, "learning_rate": 1.1942857142857144e-06, "loss": 0.7039, "step": 210 }, { "epoch": 0.012571428571428572, "grad_norm": 6.685519695281982, "learning_rate": 1.2514285714285715e-06, "loss": 0.6488, "step": 220 }, { "epoch": 0.013142857142857144, "grad_norm": 12.093446731567383, "learning_rate": 1.3085714285714287e-06, "loss": 0.6134, "step": 230 }, { "epoch": 0.013714285714285714, "grad_norm": 10.574247360229492, "learning_rate": 1.3657142857142857e-06, "loss": 0.6786, "step": 240 }, { "epoch": 0.014285714285714285, "grad_norm": 12.45152473449707, "learning_rate": 1.422857142857143e-06, "loss": 0.6112, "step": 250 }, { "epoch": 0.014857142857142857, "grad_norm": 20.439712524414062, "learning_rate": 1.48e-06, "loss": 0.6244, "step": 260 }, { "epoch": 0.015428571428571429, "grad_norm": 14.572081565856934, "learning_rate": 1.5371428571428574e-06, "loss": 0.6225, "step": 270 }, { "epoch": 0.016, "grad_norm": 12.138855934143066, "learning_rate": 1.5942857142857144e-06, "loss": 0.6541, "step": 280 }, { "epoch": 0.01657142857142857, "grad_norm": 10.430656433105469, "learning_rate": 1.6514285714285715e-06, "loss": 0.6444, "step": 290 }, { "epoch": 0.017142857142857144, "grad_norm": 20.285198211669922, "learning_rate": 1.7085714285714287e-06, "loss": 0.6046, "step": 300 }, { "epoch": 0.017714285714285714, "grad_norm": 14.162407875061035, "learning_rate": 1.7657142857142859e-06, "loss": 0.5698, "step": 310 }, { "epoch": 0.018285714285714287, "grad_norm": 12.920883178710938, "learning_rate": 1.8228571428571428e-06, "loss": 0.5905, "step": 320 }, { "epoch": 0.018857142857142857, "grad_norm": 13.900638580322266, "learning_rate": 1.8800000000000002e-06, "loss": 0.5498, "step": 330 }, { "epoch": 0.019428571428571427, "grad_norm": 12.032271385192871, "learning_rate": 1.9371428571428576e-06, "loss": 0.5398, "step": 340 }, { "epoch": 0.02, "grad_norm": 10.851804733276367, "learning_rate": 1.9942857142857146e-06, "loss": 0.5787, "step": 350 }, { "epoch": 0.02, "eval_accuracy": 0.7104377104377104, "eval_loss": 0.5787273049354553, "eval_runtime": 141.3287, "eval_samples_per_second": 2.101, "eval_steps_per_second": 1.054, "step": 350 }, { "epoch": 1.0005714285714287, "grad_norm": 21.635120391845703, "learning_rate": 2.0514285714285715e-06, "loss": 0.5574, "step": 360 }, { "epoch": 1.0011428571428571, "grad_norm": 8.378313064575195, "learning_rate": 2.108571428571429e-06, "loss": 0.6273, "step": 370 }, { "epoch": 1.0017142857142858, "grad_norm": 25.1311092376709, "learning_rate": 2.165714285714286e-06, "loss": 0.4157, "step": 380 }, { "epoch": 1.0022857142857142, "grad_norm": 68.14948272705078, "learning_rate": 2.222857142857143e-06, "loss": 0.6408, "step": 390 }, { "epoch": 1.002857142857143, "grad_norm": 7.816642761230469, "learning_rate": 2.28e-06, "loss": 0.5284, "step": 400 }, { "epoch": 1.0034285714285713, "grad_norm": 23.304162979125977, "learning_rate": 2.337142857142857e-06, "loss": 0.6163, "step": 410 }, { "epoch": 1.004, "grad_norm": 48.6783332824707, "learning_rate": 2.3942857142857145e-06, "loss": 0.5474, "step": 420 }, { "epoch": 1.0045714285714287, "grad_norm": 17.72844696044922, "learning_rate": 2.4514285714285715e-06, "loss": 0.5792, "step": 430 }, { "epoch": 1.0051428571428571, "grad_norm": 8.830244064331055, "learning_rate": 2.5085714285714285e-06, "loss": 0.5057, "step": 440 }, { "epoch": 1.0057142857142858, "grad_norm": 63.260135650634766, "learning_rate": 2.565714285714286e-06, "loss": 0.5394, "step": 450 }, { "epoch": 1.0062857142857142, "grad_norm": 3.5656652450561523, "learning_rate": 2.6228571428571432e-06, "loss": 0.4402, "step": 460 }, { "epoch": 1.006857142857143, "grad_norm": 31.8543758392334, "learning_rate": 2.68e-06, "loss": 0.5913, "step": 470 }, { "epoch": 1.0074285714285713, "grad_norm": 3.7504777908325195, "learning_rate": 2.737142857142857e-06, "loss": 0.579, "step": 480 }, { "epoch": 1.008, "grad_norm": 13.457418441772461, "learning_rate": 2.7942857142857145e-06, "loss": 0.3719, "step": 490 }, { "epoch": 1.0085714285714287, "grad_norm": 8.717528343200684, "learning_rate": 2.8514285714285715e-06, "loss": 0.5286, "step": 500 }, { "epoch": 1.0091428571428571, "grad_norm": 51.75168991088867, "learning_rate": 2.908571428571429e-06, "loss": 0.5389, "step": 510 }, { "epoch": 1.0097142857142858, "grad_norm": 4.350738048553467, "learning_rate": 2.9657142857142862e-06, "loss": 0.1809, "step": 520 }, { "epoch": 1.0102857142857142, "grad_norm": 51.596439361572266, "learning_rate": 3.0228571428571428e-06, "loss": 0.4441, "step": 530 }, { "epoch": 1.010857142857143, "grad_norm": 60.40998840332031, "learning_rate": 3.08e-06, "loss": 0.6152, "step": 540 }, { "epoch": 1.0114285714285713, "grad_norm": 13.335647583007812, "learning_rate": 3.1371428571428575e-06, "loss": 0.3943, "step": 550 }, { "epoch": 1.012, "grad_norm": 0.6494603157043457, "learning_rate": 3.194285714285715e-06, "loss": 0.4774, "step": 560 }, { "epoch": 1.0125714285714287, "grad_norm": 2.483837127685547, "learning_rate": 3.2514285714285715e-06, "loss": 0.5635, "step": 570 }, { "epoch": 1.0131428571428571, "grad_norm": 79.77306365966797, "learning_rate": 3.308571428571429e-06, "loss": 1.0499, "step": 580 }, { "epoch": 1.0137142857142858, "grad_norm": 1.0996540784835815, "learning_rate": 3.3657142857142862e-06, "loss": 0.5509, "step": 590 }, { "epoch": 1.0142857142857142, "grad_norm": 48.051490783691406, "learning_rate": 3.422857142857143e-06, "loss": 1.0613, "step": 600 }, { "epoch": 1.014857142857143, "grad_norm": 1.7596064805984497, "learning_rate": 3.48e-06, "loss": 0.2783, "step": 610 }, { "epoch": 1.0154285714285713, "grad_norm": 0.31064021587371826, "learning_rate": 3.5371428571428575e-06, "loss": 0.649, "step": 620 }, { "epoch": 1.016, "grad_norm": 0.5567959547042847, "learning_rate": 3.5942857142857145e-06, "loss": 0.2289, "step": 630 }, { "epoch": 1.0165714285714285, "grad_norm": 2.9928736686706543, "learning_rate": 3.651428571428572e-06, "loss": 0.2232, "step": 640 }, { "epoch": 1.0171428571428571, "grad_norm": 56.99592208862305, "learning_rate": 3.7085714285714284e-06, "loss": 0.3615, "step": 650 }, { "epoch": 1.0177142857142858, "grad_norm": 1.2344051599502563, "learning_rate": 3.7657142857142858e-06, "loss": 1.8052, "step": 660 }, { "epoch": 1.0182857142857142, "grad_norm": 44.120845794677734, "learning_rate": 3.822857142857143e-06, "loss": 0.7242, "step": 670 }, { "epoch": 1.018857142857143, "grad_norm": 1.3891487121582031, "learning_rate": 3.88e-06, "loss": 0.737, "step": 680 }, { "epoch": 1.0194285714285714, "grad_norm": 31.26248550415039, "learning_rate": 3.937142857142858e-06, "loss": 0.5545, "step": 690 }, { "epoch": 1.02, "grad_norm": 39.20292663574219, "learning_rate": 3.994285714285714e-06, "loss": 0.5175, "step": 700 }, { "epoch": 1.02, "eval_accuracy": 0.8080808080808081, "eval_loss": 0.7401928901672363, "eval_runtime": 137.8773, "eval_samples_per_second": 2.154, "eval_steps_per_second": 1.081, "step": 700 }, { "epoch": 2.0005714285714284, "grad_norm": 0.3124562203884125, "learning_rate": 4.051428571428572e-06, "loss": 1.2603, "step": 710 }, { "epoch": 2.0011428571428573, "grad_norm": 20.57735252380371, "learning_rate": 4.108571428571429e-06, "loss": 0.693, "step": 720 }, { "epoch": 2.001714285714286, "grad_norm": 49.15703582763672, "learning_rate": 4.165714285714287e-06, "loss": 1.4011, "step": 730 }, { "epoch": 2.0022857142857142, "grad_norm": 21.79818344116211, "learning_rate": 4.222857142857143e-06, "loss": 1.7675, "step": 740 }, { "epoch": 2.0028571428571427, "grad_norm": 8.412939071655273, "learning_rate": 4.2800000000000005e-06, "loss": 0.7266, "step": 750 }, { "epoch": 2.0034285714285716, "grad_norm": 16.968673706054688, "learning_rate": 4.3371428571428575e-06, "loss": 0.7732, "step": 760 }, { "epoch": 2.004, "grad_norm": 70.78853607177734, "learning_rate": 4.3942857142857144e-06, "loss": 0.7847, "step": 770 }, { "epoch": 2.0045714285714284, "grad_norm": 6.835943698883057, "learning_rate": 4.451428571428571e-06, "loss": 0.8395, "step": 780 }, { "epoch": 2.0051428571428573, "grad_norm": 24.16145133972168, "learning_rate": 4.508571428571429e-06, "loss": 0.531, "step": 790 }, { "epoch": 2.005714285714286, "grad_norm": 1.41634202003479, "learning_rate": 4.565714285714286e-06, "loss": 0.4976, "step": 800 }, { "epoch": 2.0062857142857142, "grad_norm": 2.940735340118408, "learning_rate": 4.622857142857143e-06, "loss": 0.4175, "step": 810 }, { "epoch": 2.0068571428571427, "grad_norm": 2.9641330242156982, "learning_rate": 4.680000000000001e-06, "loss": 0.2843, "step": 820 }, { "epoch": 2.0074285714285716, "grad_norm": 46.827030181884766, "learning_rate": 4.737142857142857e-06, "loss": 1.0676, "step": 830 }, { "epoch": 2.008, "grad_norm": 92.20220184326172, "learning_rate": 4.794285714285715e-06, "loss": 0.789, "step": 840 }, { "epoch": 2.0085714285714285, "grad_norm": 181.356201171875, "learning_rate": 4.851428571428572e-06, "loss": 0.6685, "step": 850 }, { "epoch": 2.0091428571428573, "grad_norm": 0.2601974904537201, "learning_rate": 4.90857142857143e-06, "loss": 0.6834, "step": 860 }, { "epoch": 2.009714285714286, "grad_norm": 123.33201599121094, "learning_rate": 4.965714285714286e-06, "loss": 0.8586, "step": 870 }, { "epoch": 2.0102857142857142, "grad_norm": 143.15835571289062, "learning_rate": 5.0228571428571435e-06, "loss": 0.8327, "step": 880 }, { "epoch": 2.0108571428571427, "grad_norm": 2.790085792541504, "learning_rate": 5.0800000000000005e-06, "loss": 1.6946, "step": 890 }, { "epoch": 2.0114285714285716, "grad_norm": 50.61666488647461, "learning_rate": 5.1371428571428574e-06, "loss": 0.3813, "step": 900 }, { "epoch": 2.012, "grad_norm": 4.110306739807129, "learning_rate": 5.194285714285715e-06, "loss": 0.5825, "step": 910 }, { "epoch": 2.0125714285714285, "grad_norm": 2.2944891452789307, "learning_rate": 5.251428571428571e-06, "loss": 0.9097, "step": 920 }, { "epoch": 2.0131428571428573, "grad_norm": 0.1945544183254242, "learning_rate": 5.308571428571428e-06, "loss": 0.2092, "step": 930 }, { "epoch": 2.013714285714286, "grad_norm": 0.31246262788772583, "learning_rate": 5.365714285714286e-06, "loss": 0.6408, "step": 940 }, { "epoch": 2.0142857142857142, "grad_norm": 0.27790567278862, "learning_rate": 5.422857142857143e-06, "loss": 0.2627, "step": 950 }, { "epoch": 2.0148571428571427, "grad_norm": 0.11815852671861649, "learning_rate": 5.480000000000001e-06, "loss": 0.7804, "step": 960 }, { "epoch": 2.0154285714285716, "grad_norm": 81.77935028076172, "learning_rate": 5.537142857142858e-06, "loss": 0.9296, "step": 970 }, { "epoch": 2.016, "grad_norm": 0.16326524317264557, "learning_rate": 5.594285714285714e-06, "loss": 0.811, "step": 980 }, { "epoch": 2.0165714285714285, "grad_norm": 0.6108279228210449, "learning_rate": 5.651428571428572e-06, "loss": 0.0187, "step": 990 }, { "epoch": 2.0171428571428573, "grad_norm": 0.0926850363612175, "learning_rate": 5.708571428571429e-06, "loss": 1.1172, "step": 1000 }, { "epoch": 2.017714285714286, "grad_norm": 27.080734252929688, "learning_rate": 5.7657142857142865e-06, "loss": 0.2747, "step": 1010 }, { "epoch": 2.0182857142857142, "grad_norm": 32.981544494628906, "learning_rate": 5.8228571428571435e-06, "loss": 0.754, "step": 1020 }, { "epoch": 2.0188571428571427, "grad_norm": 30.81633186340332, "learning_rate": 5.8800000000000005e-06, "loss": 0.3928, "step": 1030 }, { "epoch": 2.0194285714285716, "grad_norm": 0.45116323232650757, "learning_rate": 5.937142857142858e-06, "loss": 0.3754, "step": 1040 }, { "epoch": 2.02, "grad_norm": 0.12361976504325867, "learning_rate": 5.994285714285714e-06, "loss": 0.4062, "step": 1050 }, { "epoch": 2.02, "eval_accuracy": 0.8282828282828283, "eval_loss": 0.8531781435012817, "eval_runtime": 137.2114, "eval_samples_per_second": 2.165, "eval_steps_per_second": 1.086, "step": 1050 }, { "epoch": 3.0005714285714284, "grad_norm": 0.03133957087993622, "learning_rate": 6.051428571428571e-06, "loss": 0.9268, "step": 1060 }, { "epoch": 3.0011428571428573, "grad_norm": 2.317265748977661, "learning_rate": 6.108571428571429e-06, "loss": 0.602, "step": 1070 }, { "epoch": 3.001714285714286, "grad_norm": 0.7146720290184021, "learning_rate": 6.165714285714286e-06, "loss": 0.2092, "step": 1080 }, { "epoch": 3.0022857142857142, "grad_norm": 0.33893364667892456, "learning_rate": 6.222857142857144e-06, "loss": 1.658, "step": 1090 }, { "epoch": 3.0028571428571427, "grad_norm": 45.28017807006836, "learning_rate": 6.280000000000001e-06, "loss": 0.2583, "step": 1100 }, { "epoch": 3.0034285714285716, "grad_norm": 0.11307206004858017, "learning_rate": 6.337142857142857e-06, "loss": 0.381, "step": 1110 }, { "epoch": 3.004, "grad_norm": 0.7531479597091675, "learning_rate": 6.394285714285715e-06, "loss": 0.5401, "step": 1120 }, { "epoch": 3.0045714285714284, "grad_norm": 0.3606512248516083, "learning_rate": 6.451428571428572e-06, "loss": 0.8567, "step": 1130 }, { "epoch": 3.0051428571428573, "grad_norm": 1.0117719173431396, "learning_rate": 6.5085714285714295e-06, "loss": 0.8051, "step": 1140 }, { "epoch": 3.005714285714286, "grad_norm": 26.470552444458008, "learning_rate": 6.5657142857142865e-06, "loss": 0.8478, "step": 1150 }, { "epoch": 3.0062857142857142, "grad_norm": 0.5472680926322937, "learning_rate": 6.6228571428571435e-06, "loss": 0.2333, "step": 1160 }, { "epoch": 3.0068571428571427, "grad_norm": 56.5042839050293, "learning_rate": 6.680000000000001e-06, "loss": 1.0095, "step": 1170 }, { "epoch": 3.0074285714285716, "grad_norm": 0.2309829443693161, "learning_rate": 6.737142857142857e-06, "loss": 0.495, "step": 1180 }, { "epoch": 3.008, "grad_norm": 0.2706996202468872, "learning_rate": 6.794285714285714e-06, "loss": 0.7256, "step": 1190 }, { "epoch": 3.0085714285714285, "grad_norm": 31.313552856445312, "learning_rate": 6.851428571428572e-06, "loss": 1.3175, "step": 1200 }, { "epoch": 3.0091428571428573, "grad_norm": 37.99323272705078, "learning_rate": 6.908571428571429e-06, "loss": 0.7056, "step": 1210 }, { "epoch": 3.009714285714286, "grad_norm": 34.16461181640625, "learning_rate": 6.965714285714287e-06, "loss": 0.4093, "step": 1220 }, { "epoch": 3.0102857142857142, "grad_norm": 0.45271262526512146, "learning_rate": 7.022857142857144e-06, "loss": 0.3888, "step": 1230 }, { "epoch": 3.0108571428571427, "grad_norm": 15.342572212219238, "learning_rate": 7.08e-06, "loss": 0.6555, "step": 1240 }, { "epoch": 3.0114285714285716, "grad_norm": 209.43014526367188, "learning_rate": 7.137142857142858e-06, "loss": 1.0105, "step": 1250 }, { "epoch": 3.012, "grad_norm": 46.23904800415039, "learning_rate": 7.194285714285715e-06, "loss": 1.2347, "step": 1260 }, { "epoch": 3.0125714285714285, "grad_norm": 120.28668975830078, "learning_rate": 7.251428571428572e-06, "loss": 1.229, "step": 1270 }, { "epoch": 3.0131428571428573, "grad_norm": 15.921452522277832, "learning_rate": 7.3085714285714295e-06, "loss": 0.8367, "step": 1280 }, { "epoch": 3.013714285714286, "grad_norm": 0.2926520109176636, "learning_rate": 7.365714285714286e-06, "loss": 0.2382, "step": 1290 }, { "epoch": 3.0142857142857142, "grad_norm": 0.1254614293575287, "learning_rate": 7.422857142857144e-06, "loss": 0.857, "step": 1300 }, { "epoch": 3.0148571428571427, "grad_norm": 0.10389392822980881, "learning_rate": 7.48e-06, "loss": 0.0128, "step": 1310 }, { "epoch": 3.0154285714285716, "grad_norm": 0.1118193119764328, "learning_rate": 7.537142857142857e-06, "loss": 0.01, "step": 1320 }, { "epoch": 3.016, "grad_norm": 0.023473914712667465, "learning_rate": 7.594285714285715e-06, "loss": 1.3965, "step": 1330 }, { "epoch": 3.0165714285714285, "grad_norm": 79.61312103271484, "learning_rate": 7.651428571428571e-06, "loss": 1.6107, "step": 1340 }, { "epoch": 3.0171428571428573, "grad_norm": 4.563874244689941, "learning_rate": 7.708571428571429e-06, "loss": 0.3567, "step": 1350 }, { "epoch": 3.017714285714286, "grad_norm": 33.95674514770508, "learning_rate": 7.765714285714287e-06, "loss": 1.1828, "step": 1360 }, { "epoch": 3.0182857142857142, "grad_norm": 36.74417495727539, "learning_rate": 7.822857142857143e-06, "loss": 0.9229, "step": 1370 }, { "epoch": 3.0188571428571427, "grad_norm": 63.20321273803711, "learning_rate": 7.88e-06, "loss": 0.3935, "step": 1380 }, { "epoch": 3.0194285714285716, "grad_norm": 29.39291000366211, "learning_rate": 7.937142857142857e-06, "loss": 0.7986, "step": 1390 }, { "epoch": 3.02, "grad_norm": 29.3126277923584, "learning_rate": 7.994285714285715e-06, "loss": 0.7962, "step": 1400 }, { "epoch": 3.02, "eval_accuracy": 0.8114478114478114, "eval_loss": 0.7183520197868347, "eval_runtime": 137.6694, "eval_samples_per_second": 2.157, "eval_steps_per_second": 1.082, "step": 1400 }, { "epoch": 4.000571428571429, "grad_norm": 2.059375047683716, "learning_rate": 8.051428571428573e-06, "loss": 0.7828, "step": 1410 }, { "epoch": 4.001142857142857, "grad_norm": 47.22036361694336, "learning_rate": 8.108571428571429e-06, "loss": 0.9877, "step": 1420 }, { "epoch": 4.001714285714286, "grad_norm": 0.17464332282543182, "learning_rate": 8.165714285714286e-06, "loss": 0.6497, "step": 1430 }, { "epoch": 4.002285714285715, "grad_norm": 1.4610955715179443, "learning_rate": 8.222857142857144e-06, "loss": 0.5103, "step": 1440 }, { "epoch": 4.002857142857143, "grad_norm": 0.10568273067474365, "learning_rate": 8.28e-06, "loss": 0.0078, "step": 1450 }, { "epoch": 4.003428571428572, "grad_norm": 0.04568742215633392, "learning_rate": 8.337142857142858e-06, "loss": 0.0033, "step": 1460 }, { "epoch": 4.004, "grad_norm": 0.1109299287199974, "learning_rate": 8.394285714285714e-06, "loss": 0.8707, "step": 1470 }, { "epoch": 4.0045714285714284, "grad_norm": 0.08762586861848831, "learning_rate": 8.451428571428572e-06, "loss": 0.7401, "step": 1480 }, { "epoch": 4.005142857142857, "grad_norm": 0.16827107965946198, "learning_rate": 8.50857142857143e-06, "loss": 0.6692, "step": 1490 }, { "epoch": 4.005714285714285, "grad_norm": 0.9331682324409485, "learning_rate": 8.565714285714286e-06, "loss": 0.005, "step": 1500 }, { "epoch": 4.006285714285714, "grad_norm": 0.14198555052280426, "learning_rate": 8.622857142857144e-06, "loss": 0.0065, "step": 1510 }, { "epoch": 4.006857142857143, "grad_norm": 0.1513282060623169, "learning_rate": 8.68e-06, "loss": 0.5806, "step": 1520 }, { "epoch": 4.007428571428571, "grad_norm": 0.06693244725465775, "learning_rate": 8.737142857142858e-06, "loss": 0.7874, "step": 1530 }, { "epoch": 4.008, "grad_norm": 0.12991634011268616, "learning_rate": 8.794285714285716e-06, "loss": 0.2847, "step": 1540 }, { "epoch": 4.008571428571429, "grad_norm": 19.66016960144043, "learning_rate": 8.851428571428572e-06, "loss": 1.0498, "step": 1550 }, { "epoch": 4.009142857142857, "grad_norm": 2.3184783458709717, "learning_rate": 8.90857142857143e-06, "loss": 0.2343, "step": 1560 }, { "epoch": 4.009714285714286, "grad_norm": 0.2929948568344116, "learning_rate": 8.965714285714287e-06, "loss": 0.5086, "step": 1570 }, { "epoch": 4.010285714285715, "grad_norm": 22.23305892944336, "learning_rate": 9.022857142857143e-06, "loss": 0.5249, "step": 1580 }, { "epoch": 4.010857142857143, "grad_norm": 15.748601913452148, "learning_rate": 9.080000000000001e-06, "loss": 1.3767, "step": 1590 }, { "epoch": 4.011428571428572, "grad_norm": 0.976803183555603, "learning_rate": 9.137142857142857e-06, "loss": 0.4042, "step": 1600 }, { "epoch": 4.012, "grad_norm": 0.44382333755493164, "learning_rate": 9.194285714285715e-06, "loss": 0.4972, "step": 1610 }, { "epoch": 4.0125714285714285, "grad_norm": 27.20537757873535, "learning_rate": 9.251428571428573e-06, "loss": 0.9029, "step": 1620 }, { "epoch": 4.013142857142857, "grad_norm": 14.854856491088867, "learning_rate": 9.308571428571429e-06, "loss": 1.1477, "step": 1630 }, { "epoch": 4.013714285714285, "grad_norm": 14.011804580688477, "learning_rate": 9.365714285714287e-06, "loss": 0.4896, "step": 1640 }, { "epoch": 4.014285714285714, "grad_norm": 0.38275304436683655, "learning_rate": 9.422857142857143e-06, "loss": 0.8805, "step": 1650 }, { "epoch": 4.014857142857143, "grad_norm": 28.851638793945312, "learning_rate": 9.48e-06, "loss": 1.3966, "step": 1660 }, { "epoch": 4.015428571428571, "grad_norm": 35.203712463378906, "learning_rate": 9.537142857142859e-06, "loss": 0.613, "step": 1670 }, { "epoch": 4.016, "grad_norm": 0.1584346443414688, "learning_rate": 9.594285714285715e-06, "loss": 0.2728, "step": 1680 }, { "epoch": 4.016571428571429, "grad_norm": 0.10078814625740051, "learning_rate": 9.651428571428572e-06, "loss": 0.4286, "step": 1690 }, { "epoch": 4.017142857142857, "grad_norm": 13.510937690734863, "learning_rate": 9.70857142857143e-06, "loss": 0.6424, "step": 1700 }, { "epoch": 4.017714285714286, "grad_norm": 17.916301727294922, "learning_rate": 9.765714285714286e-06, "loss": 0.993, "step": 1710 }, { "epoch": 4.018285714285715, "grad_norm": 35.681949615478516, "learning_rate": 9.822857142857144e-06, "loss": 1.2221, "step": 1720 }, { "epoch": 4.018857142857143, "grad_norm": 15.966533660888672, "learning_rate": 9.88e-06, "loss": 0.7781, "step": 1730 }, { "epoch": 4.019428571428572, "grad_norm": 6.339288234710693, "learning_rate": 9.937142857142858e-06, "loss": 0.4475, "step": 1740 }, { "epoch": 4.02, "grad_norm": 145.03355407714844, "learning_rate": 9.994285714285716e-06, "loss": 0.8225, "step": 1750 }, { "epoch": 4.02, "eval_accuracy": 0.5656565656565656, "eval_loss": 1.686766266822815, "eval_runtime": 137.779, "eval_samples_per_second": 2.156, "eval_steps_per_second": 1.081, "step": 1750 }, { "epoch": 5.000571428571429, "grad_norm": 9.777188301086426, "learning_rate": 9.994285714285716e-06, "loss": 1.4855, "step": 1760 }, { "epoch": 5.001142857142857, "grad_norm": 39.02730178833008, "learning_rate": 9.987936507936509e-06, "loss": 1.214, "step": 1770 }, { "epoch": 5.001714285714286, "grad_norm": 40.402408599853516, "learning_rate": 9.981587301587303e-06, "loss": 0.5619, "step": 1780 }, { "epoch": 5.002285714285715, "grad_norm": 13.327921867370605, "learning_rate": 9.975238095238095e-06, "loss": 0.6943, "step": 1790 }, { "epoch": 5.002857142857143, "grad_norm": 7.069613456726074, "learning_rate": 9.96888888888889e-06, "loss": 0.5466, "step": 1800 }, { "epoch": 5.003428571428572, "grad_norm": 81.00372314453125, "learning_rate": 9.962539682539684e-06, "loss": 1.0447, "step": 1810 }, { "epoch": 5.004, "grad_norm": 0.9917150735855103, "learning_rate": 9.956190476190477e-06, "loss": 0.5806, "step": 1820 }, { "epoch": 5.0045714285714284, "grad_norm": 0.22874107956886292, "learning_rate": 9.949841269841271e-06, "loss": 0.4238, "step": 1830 }, { "epoch": 5.005142857142857, "grad_norm": 13.64168643951416, "learning_rate": 9.943492063492064e-06, "loss": 0.7042, "step": 1840 }, { "epoch": 5.005714285714285, "grad_norm": 0.5768861174583435, "learning_rate": 9.937142857142858e-06, "loss": 0.4288, "step": 1850 }, { "epoch": 5.006285714285714, "grad_norm": 31.998321533203125, "learning_rate": 9.930793650793652e-06, "loss": 1.2366, "step": 1860 }, { "epoch": 5.006857142857143, "grad_norm": 13.20121955871582, "learning_rate": 9.924444444444445e-06, "loss": 0.6062, "step": 1870 }, { "epoch": 5.007428571428571, "grad_norm": 0.0679963082075119, "learning_rate": 9.91809523809524e-06, "loss": 0.5082, "step": 1880 }, { "epoch": 5.008, "grad_norm": 0.2697165608406067, "learning_rate": 9.911746031746032e-06, "loss": 0.8584, "step": 1890 }, { "epoch": 5.008571428571429, "grad_norm": 12.50837230682373, "learning_rate": 9.905396825396826e-06, "loss": 0.5164, "step": 1900 }, { "epoch": 5.009142857142857, "grad_norm": 28.091419219970703, "learning_rate": 9.89904761904762e-06, "loss": 0.9162, "step": 1910 }, { "epoch": 5.009714285714286, "grad_norm": 0.1433819681406021, "learning_rate": 9.892698412698413e-06, "loss": 0.39, "step": 1920 }, { "epoch": 5.010285714285715, "grad_norm": 0.6932072043418884, "learning_rate": 9.886349206349208e-06, "loss": 0.5809, "step": 1930 }, { "epoch": 5.010857142857143, "grad_norm": 0.5466342568397522, "learning_rate": 9.88e-06, "loss": 0.5664, "step": 1940 }, { "epoch": 5.011428571428572, "grad_norm": 10.084834098815918, "learning_rate": 9.873650793650795e-06, "loss": 0.4083, "step": 1950 }, { "epoch": 5.012, "grad_norm": 0.201524555683136, "learning_rate": 9.867301587301587e-06, "loss": 0.9247, "step": 1960 }, { "epoch": 5.0125714285714285, "grad_norm": 49.69109344482422, "learning_rate": 9.860952380952382e-06, "loss": 0.4362, "step": 1970 }, { "epoch": 5.013142857142857, "grad_norm": 2.8617489337921143, "learning_rate": 9.854603174603176e-06, "loss": 0.009, "step": 1980 }, { "epoch": 5.013714285714285, "grad_norm": 0.3022560775279999, "learning_rate": 9.848253968253969e-06, "loss": 0.849, "step": 1990 }, { "epoch": 5.014285714285714, "grad_norm": 0.2493607997894287, "learning_rate": 9.841904761904763e-06, "loss": 0.438, "step": 2000 }, { "epoch": 5.014857142857143, "grad_norm": 0.21475115418434143, "learning_rate": 9.835555555555556e-06, "loss": 0.846, "step": 2010 }, { "epoch": 5.015428571428571, "grad_norm": 0.6425090432167053, "learning_rate": 9.82920634920635e-06, "loss": 0.5353, "step": 2020 }, { "epoch": 5.016, "grad_norm": 0.29937419295310974, "learning_rate": 9.822857142857144e-06, "loss": 0.8212, "step": 2030 }, { "epoch": 5.016571428571429, "grad_norm": 5.154245853424072, "learning_rate": 9.816507936507937e-06, "loss": 0.9711, "step": 2040 }, { "epoch": 5.017142857142857, "grad_norm": 0.5371858477592468, "learning_rate": 9.810158730158731e-06, "loss": 0.4229, "step": 2050 }, { "epoch": 5.017714285714286, "grad_norm": 11.923803329467773, "learning_rate": 9.803809523809524e-06, "loss": 1.5601, "step": 2060 }, { "epoch": 5.018285714285715, "grad_norm": 25.41405487060547, "learning_rate": 9.797460317460318e-06, "loss": 0.374, "step": 2070 }, { "epoch": 5.018857142857143, "grad_norm": 12.134407043457031, "learning_rate": 9.791111111111112e-06, "loss": 1.0097, "step": 2080 }, { "epoch": 5.019428571428572, "grad_norm": 191.27777099609375, "learning_rate": 9.784761904761905e-06, "loss": 0.8402, "step": 2090 }, { "epoch": 5.02, "grad_norm": 48.240779876708984, "learning_rate": 9.7784126984127e-06, "loss": 0.724, "step": 2100 }, { "epoch": 5.02, "eval_accuracy": 0.7508417508417509, "eval_loss": 1.0066299438476562, "eval_runtime": 136.2857, "eval_samples_per_second": 2.179, "eval_steps_per_second": 1.093, "step": 2100 }, { "epoch": 6.000571428571429, "grad_norm": 16.156768798828125, "learning_rate": 9.772063492063492e-06, "loss": 0.4888, "step": 2110 }, { "epoch": 6.001142857142857, "grad_norm": 3.711266040802002, "learning_rate": 9.765714285714286e-06, "loss": 0.92, "step": 2120 }, { "epoch": 6.001714285714286, "grad_norm": 13.800188064575195, "learning_rate": 9.75936507936508e-06, "loss": 1.3366, "step": 2130 }, { "epoch": 6.002285714285715, "grad_norm": 42.781002044677734, "learning_rate": 9.753015873015873e-06, "loss": 0.4288, "step": 2140 }, { "epoch": 6.002857142857143, "grad_norm": 35.87137985229492, "learning_rate": 9.746666666666668e-06, "loss": 0.3171, "step": 2150 }, { "epoch": 6.003428571428572, "grad_norm": 0.34640470147132874, "learning_rate": 9.74031746031746e-06, "loss": 0.6315, "step": 2160 }, { "epoch": 6.004, "grad_norm": 0.15693581104278564, "learning_rate": 9.733968253968255e-06, "loss": 0.9196, "step": 2170 }, { "epoch": 6.0045714285714284, "grad_norm": 0.8390889763832092, "learning_rate": 9.727619047619047e-06, "loss": 0.5796, "step": 2180 }, { "epoch": 6.005142857142857, "grad_norm": 0.38496437668800354, "learning_rate": 9.721269841269843e-06, "loss": 0.7525, "step": 2190 }, { "epoch": 6.005714285714285, "grad_norm": 0.7874132990837097, "learning_rate": 9.714920634920636e-06, "loss": 0.5311, "step": 2200 }, { "epoch": 6.006285714285714, "grad_norm": 0.2642306983470917, "learning_rate": 9.70857142857143e-06, "loss": 0.3915, "step": 2210 }, { "epoch": 6.006857142857143, "grad_norm": 0.1062416136264801, "learning_rate": 9.702222222222223e-06, "loss": 0.4608, "step": 2220 }, { "epoch": 6.007428571428571, "grad_norm": 8.830906867980957, "learning_rate": 9.695873015873016e-06, "loss": 0.4464, "step": 2230 }, { "epoch": 6.008, "grad_norm": 69.506103515625, "learning_rate": 9.68952380952381e-06, "loss": 0.7923, "step": 2240 }, { "epoch": 6.008571428571429, "grad_norm": 42.58530807495117, "learning_rate": 9.683174603174604e-06, "loss": 0.2295, "step": 2250 }, { "epoch": 6.009142857142857, "grad_norm": 3.4575247764587402, "learning_rate": 9.676825396825399e-06, "loss": 0.0089, "step": 2260 }, { "epoch": 6.009714285714286, "grad_norm": 12.615706443786621, "learning_rate": 9.670476190476191e-06, "loss": 0.5269, "step": 2270 }, { "epoch": 6.010285714285715, "grad_norm": 0.2601805627346039, "learning_rate": 9.664126984126985e-06, "loss": 0.5875, "step": 2280 }, { "epoch": 6.010857142857143, "grad_norm": 0.12038259208202362, "learning_rate": 9.657777777777778e-06, "loss": 0.2318, "step": 2290 }, { "epoch": 6.011428571428572, "grad_norm": 0.11833731830120087, "learning_rate": 9.651428571428572e-06, "loss": 0.4668, "step": 2300 }, { "epoch": 6.012, "grad_norm": 13.778709411621094, "learning_rate": 9.645079365079367e-06, "loss": 1.5556, "step": 2310 }, { "epoch": 6.0125714285714285, "grad_norm": 12.374515533447266, "learning_rate": 9.63873015873016e-06, "loss": 0.7705, "step": 2320 }, { "epoch": 6.013142857142857, "grad_norm": 12.057218551635742, "learning_rate": 9.632380952380954e-06, "loss": 1.2163, "step": 2330 }, { "epoch": 6.013714285714285, "grad_norm": 0.848735511302948, "learning_rate": 9.626031746031746e-06, "loss": 0.3669, "step": 2340 }, { "epoch": 6.014285714285714, "grad_norm": 0.11727745085954666, "learning_rate": 9.61968253968254e-06, "loss": 0.6778, "step": 2350 }, { "epoch": 6.014857142857143, "grad_norm": 0.6909122467041016, "learning_rate": 9.613333333333335e-06, "loss": 0.2815, "step": 2360 }, { "epoch": 6.015428571428571, "grad_norm": 0.31187936663627625, "learning_rate": 9.606984126984128e-06, "loss": 0.418, "step": 2370 }, { "epoch": 6.016, "grad_norm": 12.6905517578125, "learning_rate": 9.600634920634922e-06, "loss": 0.6797, "step": 2380 }, { "epoch": 6.016571428571429, "grad_norm": 14.10185432434082, "learning_rate": 9.594285714285715e-06, "loss": 0.2239, "step": 2390 }, { "epoch": 6.017142857142857, "grad_norm": 0.6114095449447632, "learning_rate": 9.587936507936509e-06, "loss": 0.4369, "step": 2400 }, { "epoch": 6.017714285714286, "grad_norm": 86.14923095703125, "learning_rate": 9.581587301587303e-06, "loss": 0.7966, "step": 2410 }, { "epoch": 6.018285714285715, "grad_norm": 136.2497100830078, "learning_rate": 9.575238095238096e-06, "loss": 0.8023, "step": 2420 }, { "epoch": 6.018857142857143, "grad_norm": 22.98550796508789, "learning_rate": 9.56888888888889e-06, "loss": 0.2369, "step": 2430 }, { "epoch": 6.019428571428572, "grad_norm": 0.261092871427536, "learning_rate": 9.562539682539683e-06, "loss": 0.5792, "step": 2440 }, { "epoch": 6.02, "grad_norm": 0.12631256878376007, "learning_rate": 9.556190476190477e-06, "loss": 0.1468, "step": 2450 }, { "epoch": 6.02, "eval_accuracy": 0.8316498316498316, "eval_loss": 0.770273745059967, "eval_runtime": 137.4635, "eval_samples_per_second": 2.161, "eval_steps_per_second": 1.084, "step": 2450 }, { "epoch": 7.000571428571429, "grad_norm": 0.15533775091171265, "learning_rate": 9.54984126984127e-06, "loss": 0.6611, "step": 2460 }, { "epoch": 7.001142857142857, "grad_norm": 15.251904487609863, "learning_rate": 9.543492063492064e-06, "loss": 0.7168, "step": 2470 }, { "epoch": 7.001714285714286, "grad_norm": 0.15305683016777039, "learning_rate": 9.537142857142859e-06, "loss": 0.2702, "step": 2480 }, { "epoch": 7.002285714285715, "grad_norm": 0.5726995468139648, "learning_rate": 9.530793650793651e-06, "loss": 0.2093, "step": 2490 }, { "epoch": 7.002857142857143, "grad_norm": 0.23029519617557526, "learning_rate": 9.524444444444445e-06, "loss": 0.6895, "step": 2500 }, { "epoch": 7.003428571428572, "grad_norm": 0.13955456018447876, "learning_rate": 9.518095238095238e-06, "loss": 0.8783, "step": 2510 }, { "epoch": 7.004, "grad_norm": 0.01913376897573471, "learning_rate": 9.511746031746032e-06, "loss": 0.1524, "step": 2520 }, { "epoch": 7.0045714285714284, "grad_norm": 0.20704668760299683, "learning_rate": 9.505396825396827e-06, "loss": 1.607, "step": 2530 }, { "epoch": 7.005142857142857, "grad_norm": 12.19189167022705, "learning_rate": 9.49904761904762e-06, "loss": 0.5308, "step": 2540 }, { "epoch": 7.005714285714285, "grad_norm": 0.30487680435180664, "learning_rate": 9.492698412698414e-06, "loss": 0.4346, "step": 2550 }, { "epoch": 7.006285714285714, "grad_norm": 0.047250378876924515, "learning_rate": 9.486349206349206e-06, "loss": 0.4239, "step": 2560 }, { "epoch": 7.006857142857143, "grad_norm": 0.45693501830101013, "learning_rate": 9.48e-06, "loss": 0.477, "step": 2570 }, { "epoch": 7.007428571428571, "grad_norm": 0.449886292219162, "learning_rate": 9.473650793650795e-06, "loss": 0.8766, "step": 2580 }, { "epoch": 7.008, "grad_norm": 0.1275578886270523, "learning_rate": 9.467301587301588e-06, "loss": 0.4449, "step": 2590 }, { "epoch": 7.008571428571429, "grad_norm": 0.08520308881998062, "learning_rate": 9.460952380952382e-06, "loss": 0.4128, "step": 2600 }, { "epoch": 7.009142857142857, "grad_norm": 18.56478500366211, "learning_rate": 9.454603174603175e-06, "loss": 0.6254, "step": 2610 }, { "epoch": 7.009714285714286, "grad_norm": 0.32579270005226135, "learning_rate": 9.448253968253969e-06, "loss": 0.1916, "step": 2620 }, { "epoch": 7.010285714285715, "grad_norm": 0.2691892087459564, "learning_rate": 9.441904761904762e-06, "loss": 0.4455, "step": 2630 }, { "epoch": 7.010857142857143, "grad_norm": 0.18024860322475433, "learning_rate": 9.435555555555556e-06, "loss": 0.2448, "step": 2640 }, { "epoch": 7.011428571428572, "grad_norm": 12.539464950561523, "learning_rate": 9.42920634920635e-06, "loss": 0.6483, "step": 2650 }, { "epoch": 7.012, "grad_norm": 24.065021514892578, "learning_rate": 9.422857142857143e-06, "loss": 1.1989, "step": 2660 }, { "epoch": 7.0125714285714285, "grad_norm": 0.15510013699531555, "learning_rate": 9.416507936507937e-06, "loss": 0.4549, "step": 2670 }, { "epoch": 7.013142857142857, "grad_norm": 0.21933165192604065, "learning_rate": 9.41015873015873e-06, "loss": 0.4212, "step": 2680 }, { "epoch": 7.013714285714285, "grad_norm": 46.31543731689453, "learning_rate": 9.403809523809526e-06, "loss": 0.6977, "step": 2690 }, { "epoch": 7.014285714285714, "grad_norm": 0.49816974997520447, "learning_rate": 9.397460317460319e-06, "loss": 0.7104, "step": 2700 }, { "epoch": 7.014857142857143, "grad_norm": 0.6536113023757935, "learning_rate": 9.391111111111111e-06, "loss": 0.7153, "step": 2710 }, { "epoch": 7.015428571428571, "grad_norm": 40.60562515258789, "learning_rate": 9.384761904761906e-06, "loss": 0.3319, "step": 2720 }, { "epoch": 7.016, "grad_norm": 13.018871307373047, "learning_rate": 9.378412698412698e-06, "loss": 0.2629, "step": 2730 }, { "epoch": 7.016571428571429, "grad_norm": 0.34966105222702026, "learning_rate": 9.372063492063492e-06, "loss": 0.9186, "step": 2740 }, { "epoch": 7.017142857142857, "grad_norm": 57.22317123413086, "learning_rate": 9.365714285714287e-06, "loss": 1.1176, "step": 2750 }, { "epoch": 7.017714285714286, "grad_norm": 2.8083252906799316, "learning_rate": 9.359365079365081e-06, "loss": 0.5711, "step": 2760 }, { "epoch": 7.018285714285715, "grad_norm": 0.2513306140899658, "learning_rate": 9.353015873015874e-06, "loss": 0.3423, "step": 2770 }, { "epoch": 7.018857142857143, "grad_norm": 0.24741508066654205, "learning_rate": 9.346666666666666e-06, "loss": 0.412, "step": 2780 }, { "epoch": 7.019428571428572, "grad_norm": 0.17685921490192413, "learning_rate": 9.34031746031746e-06, "loss": 0.8817, "step": 2790 }, { "epoch": 7.02, "grad_norm": 0.2248346507549286, "learning_rate": 9.333968253968255e-06, "loss": 0.8406, "step": 2800 }, { "epoch": 7.02, "eval_accuracy": 0.8484848484848485, "eval_loss": 0.5862956047058105, "eval_runtime": 137.2695, "eval_samples_per_second": 2.164, "eval_steps_per_second": 1.085, "step": 2800 }, { "epoch": 8.000571428571428, "grad_norm": 0.10712999105453491, "learning_rate": 9.32761904761905e-06, "loss": 0.5924, "step": 2810 }, { "epoch": 8.001142857142858, "grad_norm": 0.37382298707962036, "learning_rate": 9.321269841269842e-06, "loss": 0.2847, "step": 2820 }, { "epoch": 8.001714285714286, "grad_norm": 0.08458230644464493, "learning_rate": 9.314920634920636e-06, "loss": 1.0296, "step": 2830 }, { "epoch": 8.002285714285714, "grad_norm": 2.4656920433044434, "learning_rate": 9.308571428571429e-06, "loss": 0.4784, "step": 2840 }, { "epoch": 8.002857142857144, "grad_norm": 10.618043899536133, "learning_rate": 9.302222222222223e-06, "loss": 0.733, "step": 2850 }, { "epoch": 8.003428571428572, "grad_norm": 0.1708851009607315, "learning_rate": 9.295873015873018e-06, "loss": 0.2806, "step": 2860 }, { "epoch": 8.004, "grad_norm": 0.06309555470943451, "learning_rate": 9.28952380952381e-06, "loss": 0.5611, "step": 2870 }, { "epoch": 8.00457142857143, "grad_norm": 0.46451622247695923, "learning_rate": 9.283174603174605e-06, "loss": 0.1565, "step": 2880 }, { "epoch": 8.005142857142857, "grad_norm": 13.138236999511719, "learning_rate": 9.276825396825397e-06, "loss": 0.7403, "step": 2890 }, { "epoch": 8.005714285714285, "grad_norm": 1.0157630443572998, "learning_rate": 9.270476190476192e-06, "loss": 0.3141, "step": 2900 }, { "epoch": 8.006285714285715, "grad_norm": 0.22003336250782013, "learning_rate": 9.264126984126986e-06, "loss": 0.7244, "step": 2910 }, { "epoch": 8.006857142857143, "grad_norm": 0.22460892796516418, "learning_rate": 9.257777777777779e-06, "loss": 0.397, "step": 2920 }, { "epoch": 8.007428571428571, "grad_norm": 1.3341542482376099, "learning_rate": 9.251428571428573e-06, "loss": 0.9219, "step": 2930 }, { "epoch": 8.008, "grad_norm": 0.180324986577034, "learning_rate": 9.245079365079366e-06, "loss": 1.0728, "step": 2940 }, { "epoch": 8.008571428571429, "grad_norm": 3.0605599880218506, "learning_rate": 9.23873015873016e-06, "loss": 0.6721, "step": 2950 }, { "epoch": 8.009142857142857, "grad_norm": 0.17587581276893616, "learning_rate": 9.232380952380952e-06, "loss": 0.3343, "step": 2960 }, { "epoch": 8.009714285714285, "grad_norm": 1.3076510429382324, "learning_rate": 9.226031746031747e-06, "loss": 0.5037, "step": 2970 }, { "epoch": 8.010285714285715, "grad_norm": 0.15406930446624756, "learning_rate": 9.219682539682541e-06, "loss": 0.4566, "step": 2980 }, { "epoch": 8.010857142857143, "grad_norm": 12.370487213134766, "learning_rate": 9.213333333333334e-06, "loss": 0.6773, "step": 2990 }, { "epoch": 8.01142857142857, "grad_norm": 24.417396545410156, "learning_rate": 9.206984126984128e-06, "loss": 0.8717, "step": 3000 }, { "epoch": 8.012, "grad_norm": 0.2499029040336609, "learning_rate": 9.20063492063492e-06, "loss": 0.9279, "step": 3010 }, { "epoch": 8.012571428571428, "grad_norm": 8.220014572143555, "learning_rate": 9.194285714285715e-06, "loss": 0.0187, "step": 3020 }, { "epoch": 8.013142857142856, "grad_norm": 0.12472938001155853, "learning_rate": 9.18793650793651e-06, "loss": 0.1618, "step": 3030 }, { "epoch": 8.013714285714286, "grad_norm": 12.041219711303711, "learning_rate": 9.181587301587302e-06, "loss": 0.7642, "step": 3040 }, { "epoch": 8.014285714285714, "grad_norm": 0.26260077953338623, "learning_rate": 9.175238095238096e-06, "loss": 0.586, "step": 3050 }, { "epoch": 8.014857142857142, "grad_norm": 0.13079126179218292, "learning_rate": 9.168888888888889e-06, "loss": 0.4182, "step": 3060 }, { "epoch": 8.015428571428572, "grad_norm": 0.1269873082637787, "learning_rate": 9.162539682539683e-06, "loss": 0.2187, "step": 3070 }, { "epoch": 8.016, "grad_norm": 0.14042362570762634, "learning_rate": 9.156190476190478e-06, "loss": 0.3016, "step": 3080 }, { "epoch": 8.016571428571428, "grad_norm": 13.570809364318848, "learning_rate": 9.14984126984127e-06, "loss": 0.8131, "step": 3090 }, { "epoch": 8.017142857142858, "grad_norm": 0.28256142139434814, "learning_rate": 9.143492063492065e-06, "loss": 0.2524, "step": 3100 }, { "epoch": 8.017714285714286, "grad_norm": 0.08613581210374832, "learning_rate": 9.137142857142857e-06, "loss": 0.9875, "step": 3110 }, { "epoch": 8.018285714285714, "grad_norm": 29.147151947021484, "learning_rate": 9.130793650793652e-06, "loss": 0.6377, "step": 3120 }, { "epoch": 8.018857142857144, "grad_norm": 119.630126953125, "learning_rate": 9.124444444444444e-06, "loss": 0.1833, "step": 3130 }, { "epoch": 8.019428571428572, "grad_norm": 0.17084644734859467, "learning_rate": 9.118095238095239e-06, "loss": 0.9565, "step": 3140 }, { "epoch": 8.02, "grad_norm": 12.522076606750488, "learning_rate": 9.111746031746033e-06, "loss": 0.4485, "step": 3150 }, { "epoch": 8.02, "eval_accuracy": 0.8383838383838383, "eval_loss": 0.6601792573928833, "eval_runtime": 137.764, "eval_samples_per_second": 2.156, "eval_steps_per_second": 1.082, "step": 3150 }, { "epoch": 9.000571428571428, "grad_norm": 0.1897997260093689, "learning_rate": 9.105396825396826e-06, "loss": 0.0101, "step": 3160 }, { "epoch": 9.001142857142858, "grad_norm": 26.98958396911621, "learning_rate": 9.09904761904762e-06, "loss": 1.1643, "step": 3170 }, { "epoch": 9.001714285714286, "grad_norm": 0.6562146544456482, "learning_rate": 9.092698412698412e-06, "loss": 0.5971, "step": 3180 }, { "epoch": 9.002285714285714, "grad_norm": 6.7756476402282715, "learning_rate": 9.086349206349207e-06, "loss": 0.448, "step": 3190 }, { "epoch": 9.002857142857144, "grad_norm": 0.3257388472557068, "learning_rate": 9.080000000000001e-06, "loss": 0.3954, "step": 3200 }, { "epoch": 9.003428571428572, "grad_norm": 0.04351123794913292, "learning_rate": 9.073650793650794e-06, "loss": 0.2449, "step": 3210 }, { "epoch": 9.004, "grad_norm": 0.12862706184387207, "learning_rate": 9.067301587301588e-06, "loss": 0.238, "step": 3220 }, { "epoch": 9.00457142857143, "grad_norm": 0.30041778087615967, "learning_rate": 9.06095238095238e-06, "loss": 0.0238, "step": 3230 }, { "epoch": 9.005142857142857, "grad_norm": 12.429978370666504, "learning_rate": 9.054603174603175e-06, "loss": 1.2523, "step": 3240 }, { "epoch": 9.005714285714285, "grad_norm": 12.339615821838379, "learning_rate": 9.04825396825397e-06, "loss": 0.2253, "step": 3250 }, { "epoch": 9.006285714285715, "grad_norm": 0.3683285117149353, "learning_rate": 9.041904761904762e-06, "loss": 0.5474, "step": 3260 }, { "epoch": 9.006857142857143, "grad_norm": 0.29264554381370544, "learning_rate": 9.035555555555556e-06, "loss": 0.0913, "step": 3270 }, { "epoch": 9.007428571428571, "grad_norm": 0.14638611674308777, "learning_rate": 9.029206349206349e-06, "loss": 0.0075, "step": 3280 }, { "epoch": 9.008, "grad_norm": 0.13496576249599457, "learning_rate": 9.022857142857143e-06, "loss": 0.3115, "step": 3290 }, { "epoch": 9.008571428571429, "grad_norm": 0.09361585974693298, "learning_rate": 9.016507936507938e-06, "loss": 0.5494, "step": 3300 }, { "epoch": 9.009142857142857, "grad_norm": 0.09114635735750198, "learning_rate": 9.010158730158732e-06, "loss": 0.4454, "step": 3310 }, { "epoch": 9.009714285714285, "grad_norm": 107.50813293457031, "learning_rate": 9.003809523809525e-06, "loss": 0.8347, "step": 3320 }, { "epoch": 9.010285714285715, "grad_norm": 0.29273521900177, "learning_rate": 8.997460317460317e-06, "loss": 0.5854, "step": 3330 }, { "epoch": 9.010857142857143, "grad_norm": 18.072633743286133, "learning_rate": 8.991111111111112e-06, "loss": 0.7517, "step": 3340 }, { "epoch": 9.01142857142857, "grad_norm": 0.3977367877960205, "learning_rate": 8.984761904761904e-06, "loss": 0.7607, "step": 3350 }, { "epoch": 9.012, "grad_norm": 0.37464699149131775, "learning_rate": 8.9784126984127e-06, "loss": 0.659, "step": 3360 }, { "epoch": 9.012571428571428, "grad_norm": 0.13560600578784943, "learning_rate": 8.972063492063493e-06, "loss": 0.3875, "step": 3370 }, { "epoch": 9.013142857142856, "grad_norm": 0.609217643737793, "learning_rate": 8.965714285714287e-06, "loss": 0.8813, "step": 3380 }, { "epoch": 9.013714285714286, "grad_norm": 0.6446564793586731, "learning_rate": 8.95936507936508e-06, "loss": 0.3148, "step": 3390 }, { "epoch": 9.014285714285714, "grad_norm": 48.195072174072266, "learning_rate": 8.953015873015874e-06, "loss": 1.2333, "step": 3400 }, { "epoch": 9.014857142857142, "grad_norm": 0.19967958331108093, "learning_rate": 8.946666666666669e-06, "loss": 0.3988, "step": 3410 }, { "epoch": 9.015428571428572, "grad_norm": 0.22969119250774384, "learning_rate": 8.940317460317461e-06, "loss": 0.2877, "step": 3420 }, { "epoch": 9.016, "grad_norm": 14.168766975402832, "learning_rate": 8.933968253968256e-06, "loss": 1.2265, "step": 3430 }, { "epoch": 9.016571428571428, "grad_norm": 0.15207406878471375, "learning_rate": 8.927619047619048e-06, "loss": 0.4666, "step": 3440 }, { "epoch": 9.017142857142858, "grad_norm": 0.09754037857055664, "learning_rate": 8.921269841269842e-06, "loss": 0.7216, "step": 3450 }, { "epoch": 9.017714285714286, "grad_norm": 13.291885375976562, "learning_rate": 8.914920634920635e-06, "loss": 0.4949, "step": 3460 }, { "epoch": 9.018285714285714, "grad_norm": 0.16420067846775055, "learning_rate": 8.90857142857143e-06, "loss": 0.1193, "step": 3470 }, { "epoch": 9.018857142857144, "grad_norm": 1.3726239204406738, "learning_rate": 8.902222222222224e-06, "loss": 0.4908, "step": 3480 }, { "epoch": 9.019428571428572, "grad_norm": 12.551621437072754, "learning_rate": 8.895873015873016e-06, "loss": 1.2967, "step": 3490 }, { "epoch": 9.02, "grad_norm": 0.05716940015554428, "learning_rate": 8.88952380952381e-06, "loss": 0.0134, "step": 3500 }, { "epoch": 9.02, "eval_accuracy": 0.8316498316498316, "eval_loss": 0.6907000541687012, "eval_runtime": 137.8377, "eval_samples_per_second": 2.155, "eval_steps_per_second": 1.081, "step": 3500 }, { "epoch": 10.000571428571428, "grad_norm": 63.84096145629883, "learning_rate": 8.883174603174603e-06, "loss": 1.0078, "step": 3510 }, { "epoch": 10.001142857142858, "grad_norm": 25.32430076599121, "learning_rate": 8.876825396825398e-06, "loss": 1.2722, "step": 3520 }, { "epoch": 10.001714285714286, "grad_norm": 126.55713653564453, "learning_rate": 8.870476190476192e-06, "loss": 0.5043, "step": 3530 }, { "epoch": 10.002285714285714, "grad_norm": 0.31926584243774414, "learning_rate": 8.864126984126985e-06, "loss": 0.122, "step": 3540 }, { "epoch": 10.002857142857144, "grad_norm": 75.52194213867188, "learning_rate": 8.857777777777779e-06, "loss": 0.579, "step": 3550 }, { "epoch": 10.003428571428572, "grad_norm": 59.787635803222656, "learning_rate": 8.851428571428572e-06, "loss": 0.5681, "step": 3560 }, { "epoch": 10.004, "grad_norm": 0.1938324123620987, "learning_rate": 8.845079365079366e-06, "loss": 0.7633, "step": 3570 }, { "epoch": 10.00457142857143, "grad_norm": 12.319451332092285, "learning_rate": 8.83873015873016e-06, "loss": 1.1426, "step": 3580 }, { "epoch": 10.005142857142857, "grad_norm": 12.19789981842041, "learning_rate": 8.832380952380953e-06, "loss": 0.4935, "step": 3590 }, { "epoch": 10.005714285714285, "grad_norm": 0.08483655005693436, "learning_rate": 8.826031746031747e-06, "loss": 0.4319, "step": 3600 }, { "epoch": 10.006285714285715, "grad_norm": 0.2407812774181366, "learning_rate": 8.81968253968254e-06, "loss": 0.4388, "step": 3610 }, { "epoch": 10.006857142857143, "grad_norm": 0.20790864527225494, "learning_rate": 8.813333333333334e-06, "loss": 0.6393, "step": 3620 }, { "epoch": 10.007428571428571, "grad_norm": 6.273917198181152, "learning_rate": 8.806984126984127e-06, "loss": 0.2051, "step": 3630 }, { "epoch": 10.008, "grad_norm": 0.6706716418266296, "learning_rate": 8.800634920634921e-06, "loss": 0.6428, "step": 3640 }, { "epoch": 10.008571428571429, "grad_norm": 0.3309767544269562, "learning_rate": 8.794285714285716e-06, "loss": 0.2073, "step": 3650 }, { "epoch": 10.009142857142857, "grad_norm": 0.3042340576648712, "learning_rate": 8.787936507936508e-06, "loss": 0.7154, "step": 3660 }, { "epoch": 10.009714285714285, "grad_norm": 0.23488643765449524, "learning_rate": 8.781587301587302e-06, "loss": 0.2957, "step": 3670 }, { "epoch": 10.010285714285715, "grad_norm": 0.09738802909851074, "learning_rate": 8.775238095238095e-06, "loss": 0.0073, "step": 3680 }, { "epoch": 10.010857142857143, "grad_norm": 0.10398265719413757, "learning_rate": 8.76888888888889e-06, "loss": 0.8261, "step": 3690 }, { "epoch": 10.01142857142857, "grad_norm": 0.1489027887582779, "learning_rate": 8.762539682539684e-06, "loss": 0.8143, "step": 3700 }, { "epoch": 10.012, "grad_norm": 0.2872755527496338, "learning_rate": 8.756190476190476e-06, "loss": 0.7231, "step": 3710 }, { "epoch": 10.012571428571428, "grad_norm": 0.08701366931200027, "learning_rate": 8.74984126984127e-06, "loss": 0.4884, "step": 3720 }, { "epoch": 10.013142857142856, "grad_norm": 0.18288986384868622, "learning_rate": 8.743492063492063e-06, "loss": 0.2193, "step": 3730 }, { "epoch": 10.013714285714286, "grad_norm": 0.14304602146148682, "learning_rate": 8.737142857142858e-06, "loss": 0.4951, "step": 3740 }, { "epoch": 10.014285714285714, "grad_norm": 0.23201577365398407, "learning_rate": 8.730793650793652e-06, "loss": 0.3121, "step": 3750 }, { "epoch": 10.014857142857142, "grad_norm": 196.32400512695312, "learning_rate": 8.724444444444445e-06, "loss": 0.5922, "step": 3760 }, { "epoch": 10.015428571428572, "grad_norm": 0.09482351690530777, "learning_rate": 8.718095238095239e-06, "loss": 0.4892, "step": 3770 }, { "epoch": 10.016, "grad_norm": 0.1551242470741272, "learning_rate": 8.711746031746032e-06, "loss": 0.523, "step": 3780 }, { "epoch": 10.016571428571428, "grad_norm": 0.15693677961826324, "learning_rate": 8.705396825396826e-06, "loss": 0.9376, "step": 3790 }, { "epoch": 10.017142857142858, "grad_norm": 0.38993221521377563, "learning_rate": 8.69904761904762e-06, "loss": 0.8241, "step": 3800 }, { "epoch": 10.017714285714286, "grad_norm": 0.5978397727012634, "learning_rate": 8.692698412698413e-06, "loss": 0.806, "step": 3810 }, { "epoch": 10.018285714285714, "grad_norm": 0.41364625096321106, "learning_rate": 8.686349206349207e-06, "loss": 0.5164, "step": 3820 }, { "epoch": 10.018857142857144, "grad_norm": 0.32526934146881104, "learning_rate": 8.68e-06, "loss": 0.5054, "step": 3830 }, { "epoch": 10.019428571428572, "grad_norm": 0.6359074711799622, "learning_rate": 8.673650793650794e-06, "loss": 0.31, "step": 3840 }, { "epoch": 10.02, "grad_norm": 85.47923278808594, "learning_rate": 8.667301587301587e-06, "loss": 0.11, "step": 3850 }, { "epoch": 10.02, "eval_accuracy": 0.8316498316498316, "eval_loss": 0.7097596526145935, "eval_runtime": 137.5142, "eval_samples_per_second": 2.16, "eval_steps_per_second": 1.084, "step": 3850 }, { "epoch": 11.000571428571428, "grad_norm": 0.1688612848520279, "learning_rate": 8.660952380952383e-06, "loss": 0.4141, "step": 3860 }, { "epoch": 11.001142857142858, "grad_norm": 12.029210090637207, "learning_rate": 8.654603174603176e-06, "loss": 0.8705, "step": 3870 }, { "epoch": 11.001714285714286, "grad_norm": 0.19282205402851105, "learning_rate": 8.648253968253968e-06, "loss": 0.3451, "step": 3880 }, { "epoch": 11.002285714285714, "grad_norm": 12.639680862426758, "learning_rate": 8.641904761904762e-06, "loss": 0.9504, "step": 3890 }, { "epoch": 11.002857142857144, "grad_norm": 0.34344059228897095, "learning_rate": 8.635555555555555e-06, "loss": 0.3582, "step": 3900 }, { "epoch": 11.003428571428572, "grad_norm": 193.6134033203125, "learning_rate": 8.62920634920635e-06, "loss": 0.1138, "step": 3910 }, { "epoch": 11.004, "grad_norm": 0.36762064695358276, "learning_rate": 8.622857142857144e-06, "loss": 0.556, "step": 3920 }, { "epoch": 11.00457142857143, "grad_norm": 0.19313932955265045, "learning_rate": 8.616507936507938e-06, "loss": 0.5323, "step": 3930 }, { "epoch": 11.005142857142857, "grad_norm": 0.26507726311683655, "learning_rate": 8.61015873015873e-06, "loss": 0.4584, "step": 3940 }, { "epoch": 11.005714285714285, "grad_norm": 0.09229809045791626, "learning_rate": 8.603809523809525e-06, "loss": 0.4409, "step": 3950 }, { "epoch": 11.006285714285715, "grad_norm": 0.14882345497608185, "learning_rate": 8.597460317460318e-06, "loss": 0.0062, "step": 3960 }, { "epoch": 11.006857142857143, "grad_norm": 12.128185272216797, "learning_rate": 8.591111111111112e-06, "loss": 0.2344, "step": 3970 }, { "epoch": 11.007428571428571, "grad_norm": 0.22297517955303192, "learning_rate": 8.584761904761906e-06, "loss": 0.1457, "step": 3980 }, { "epoch": 11.008, "grad_norm": 119.80109405517578, "learning_rate": 8.578412698412699e-06, "loss": 0.1712, "step": 3990 }, { "epoch": 11.008571428571429, "grad_norm": 0.09167554974555969, "learning_rate": 8.572063492063493e-06, "loss": 0.5816, "step": 4000 }, { "epoch": 11.009142857142857, "grad_norm": 21.545888900756836, "learning_rate": 8.565714285714286e-06, "loss": 0.9225, "step": 4010 }, { "epoch": 11.009714285714285, "grad_norm": 0.07726138085126877, "learning_rate": 8.55936507936508e-06, "loss": 0.3713, "step": 4020 }, { "epoch": 11.010285714285715, "grad_norm": 0.14826586842536926, "learning_rate": 8.553015873015875e-06, "loss": 0.2916, "step": 4030 }, { "epoch": 11.010857142857143, "grad_norm": 13.104598045349121, "learning_rate": 8.546666666666667e-06, "loss": 1.1717, "step": 4040 }, { "epoch": 11.01142857142857, "grad_norm": 0.14416873455047607, "learning_rate": 8.540317460317462e-06, "loss": 0.5383, "step": 4050 }, { "epoch": 11.012, "grad_norm": 0.22047214210033417, "learning_rate": 8.533968253968254e-06, "loss": 0.4027, "step": 4060 }, { "epoch": 11.012571428571428, "grad_norm": 1.8622279167175293, "learning_rate": 8.527619047619049e-06, "loss": 0.4155, "step": 4070 }, { "epoch": 11.013142857142856, "grad_norm": 18.07794952392578, "learning_rate": 8.521269841269843e-06, "loss": 0.2927, "step": 4080 }, { "epoch": 11.013714285714286, "grad_norm": 30.948429107666016, "learning_rate": 8.514920634920636e-06, "loss": 0.7502, "step": 4090 }, { "epoch": 11.014285714285714, "grad_norm": 12.959928512573242, "learning_rate": 8.50857142857143e-06, "loss": 0.2332, "step": 4100 }, { "epoch": 11.014857142857142, "grad_norm": 147.560546875, "learning_rate": 8.502222222222223e-06, "loss": 0.3589, "step": 4110 }, { "epoch": 11.015428571428572, "grad_norm": 0.5723152756690979, "learning_rate": 8.495873015873017e-06, "loss": 0.7663, "step": 4120 }, { "epoch": 11.016, "grad_norm": 0.13386109471321106, "learning_rate": 8.48952380952381e-06, "loss": 1.0655, "step": 4130 }, { "epoch": 11.016571428571428, "grad_norm": 0.2565602660179138, "learning_rate": 8.483174603174604e-06, "loss": 0.6372, "step": 4140 }, { "epoch": 11.017142857142858, "grad_norm": 0.517414391040802, "learning_rate": 8.476825396825398e-06, "loss": 0.2303, "step": 4150 }, { "epoch": 11.017714285714286, "grad_norm": 0.18709756433963776, "learning_rate": 8.47047619047619e-06, "loss": 0.8909, "step": 4160 }, { "epoch": 11.018285714285714, "grad_norm": 241.76470947265625, "learning_rate": 8.464126984126985e-06, "loss": 1.3007, "step": 4170 }, { "epoch": 11.018857142857144, "grad_norm": 0.4402504861354828, "learning_rate": 8.457777777777778e-06, "loss": 0.1889, "step": 4180 }, { "epoch": 11.019428571428572, "grad_norm": 0.13897043466567993, "learning_rate": 8.451428571428572e-06, "loss": 0.5257, "step": 4190 }, { "epoch": 11.02, "grad_norm": 12.089799880981445, "learning_rate": 8.445079365079366e-06, "loss": 0.6557, "step": 4200 }, { "epoch": 11.02, "eval_accuracy": 0.8383838383838383, "eval_loss": 0.650736927986145, "eval_runtime": 126.7199, "eval_samples_per_second": 2.344, "eval_steps_per_second": 1.176, "step": 4200 }, { "epoch": 12.000571428571428, "grad_norm": 0.39973193407058716, "learning_rate": 8.438730158730159e-06, "loss": 0.4371, "step": 4210 }, { "epoch": 12.001142857142858, "grad_norm": 11.976874351501465, "learning_rate": 8.432380952380953e-06, "loss": 0.4166, "step": 4220 }, { "epoch": 12.001714285714286, "grad_norm": 0.3415261209011078, "learning_rate": 8.426031746031746e-06, "loss": 0.3675, "step": 4230 }, { "epoch": 12.002285714285714, "grad_norm": 0.4809642732143402, "learning_rate": 8.41968253968254e-06, "loss": 0.2013, "step": 4240 }, { "epoch": 12.002857142857144, "grad_norm": 0.682577908039093, "learning_rate": 8.413333333333335e-06, "loss": 0.2326, "step": 4250 }, { "epoch": 12.003428571428572, "grad_norm": 0.03049817495048046, "learning_rate": 8.406984126984127e-06, "loss": 0.7267, "step": 4260 }, { "epoch": 12.004, "grad_norm": 0.10638611763715744, "learning_rate": 8.400634920634922e-06, "loss": 0.8069, "step": 4270 }, { "epoch": 12.00457142857143, "grad_norm": 0.041142355650663376, "learning_rate": 8.394285714285714e-06, "loss": 0.1764, "step": 4280 }, { "epoch": 12.005142857142857, "grad_norm": 0.5415995121002197, "learning_rate": 8.387936507936509e-06, "loss": 1.0413, "step": 4290 }, { "epoch": 12.005714285714285, "grad_norm": 0.4838305711746216, "learning_rate": 8.381587301587303e-06, "loss": 0.4062, "step": 4300 }, { "epoch": 12.006285714285715, "grad_norm": 23.351930618286133, "learning_rate": 8.375238095238096e-06, "loss": 0.4338, "step": 4310 }, { "epoch": 12.006857142857143, "grad_norm": 15.27731704711914, "learning_rate": 8.36888888888889e-06, "loss": 0.682, "step": 4320 }, { "epoch": 12.007428571428571, "grad_norm": 0.21492832899093628, "learning_rate": 8.362539682539683e-06, "loss": 0.2962, "step": 4330 }, { "epoch": 12.008, "grad_norm": 0.39623013138771057, "learning_rate": 8.356190476190477e-06, "loss": 0.4511, "step": 4340 }, { "epoch": 12.008571428571429, "grad_norm": 0.8621640205383301, "learning_rate": 8.34984126984127e-06, "loss": 0.7211, "step": 4350 }, { "epoch": 12.009142857142857, "grad_norm": 0.17645128071308136, "learning_rate": 8.343492063492064e-06, "loss": 0.609, "step": 4360 }, { "epoch": 12.009714285714285, "grad_norm": 1.3618069887161255, "learning_rate": 8.337142857142858e-06, "loss": 0.1656, "step": 4370 }, { "epoch": 12.010285714285715, "grad_norm": 12.654424667358398, "learning_rate": 8.33079365079365e-06, "loss": 0.715, "step": 4380 }, { "epoch": 12.010857142857143, "grad_norm": 14.740551948547363, "learning_rate": 8.324444444444445e-06, "loss": 1.23, "step": 4390 }, { "epoch": 12.01142857142857, "grad_norm": 33.95640182495117, "learning_rate": 8.318095238095238e-06, "loss": 0.6692, "step": 4400 }, { "epoch": 12.012, "grad_norm": 7.696796417236328, "learning_rate": 8.311746031746032e-06, "loss": 0.8213, "step": 4410 }, { "epoch": 12.012571428571428, "grad_norm": 0.5786144733428955, "learning_rate": 8.305396825396826e-06, "loss": 0.2544, "step": 4420 }, { "epoch": 12.013142857142856, "grad_norm": 0.296314537525177, "learning_rate": 8.29904761904762e-06, "loss": 0.0166, "step": 4430 }, { "epoch": 12.013714285714286, "grad_norm": 0.23542876541614532, "learning_rate": 8.292698412698413e-06, "loss": 0.2817, "step": 4440 }, { "epoch": 12.014285714285714, "grad_norm": 16.194673538208008, "learning_rate": 8.286349206349206e-06, "loss": 0.7401, "step": 4450 }, { "epoch": 12.014857142857142, "grad_norm": 0.1849094033241272, "learning_rate": 8.28e-06, "loss": 0.5863, "step": 4460 }, { "epoch": 12.015428571428572, "grad_norm": 0.14007309079170227, "learning_rate": 8.273650793650795e-06, "loss": 1.4727, "step": 4470 }, { "epoch": 12.016, "grad_norm": 74.8141098022461, "learning_rate": 8.267301587301589e-06, "loss": 0.4532, "step": 4480 }, { "epoch": 12.016571428571428, "grad_norm": 0.1595589816570282, "learning_rate": 8.260952380952382e-06, "loss": 0.2142, "step": 4490 }, { "epoch": 12.017142857142858, "grad_norm": 23.603548049926758, "learning_rate": 8.254603174603176e-06, "loss": 0.2174, "step": 4500 }, { "epoch": 12.017714285714286, "grad_norm": 0.32662734389305115, "learning_rate": 8.248253968253969e-06, "loss": 0.4129, "step": 4510 }, { "epoch": 12.018285714285714, "grad_norm": 15.74620532989502, "learning_rate": 8.241904761904761e-06, "loss": 0.5215, "step": 4520 }, { "epoch": 12.018857142857144, "grad_norm": 0.10295995324850082, "learning_rate": 8.235555555555557e-06, "loss": 0.2909, "step": 4530 }, { "epoch": 12.019428571428572, "grad_norm": 0.2319801300764084, "learning_rate": 8.22920634920635e-06, "loss": 1.1379, "step": 4540 }, { "epoch": 12.02, "grad_norm": 0.1083206757903099, "learning_rate": 8.222857142857144e-06, "loss": 0.2642, "step": 4550 }, { "epoch": 12.02, "eval_accuracy": 0.8518518518518519, "eval_loss": 0.6555494070053101, "eval_runtime": 127.2639, "eval_samples_per_second": 2.334, "eval_steps_per_second": 1.171, "step": 4550 }, { "epoch": 13.000571428571428, "grad_norm": 0.09389404952526093, "learning_rate": 8.216507936507937e-06, "loss": 0.2617, "step": 4560 }, { "epoch": 13.001142857142858, "grad_norm": 0.11154793202877045, "learning_rate": 8.210158730158731e-06, "loss": 0.0051, "step": 4570 }, { "epoch": 13.001714285714286, "grad_norm": 0.1510355919599533, "learning_rate": 8.203809523809526e-06, "loss": 0.0096, "step": 4580 }, { "epoch": 13.002285714285714, "grad_norm": 14.562790870666504, "learning_rate": 8.197460317460318e-06, "loss": 1.4925, "step": 4590 }, { "epoch": 13.002857142857144, "grad_norm": 0.1331201046705246, "learning_rate": 8.191111111111112e-06, "loss": 0.5703, "step": 4600 }, { "epoch": 13.003428571428572, "grad_norm": 0.2739226520061493, "learning_rate": 8.184761904761905e-06, "loss": 0.8987, "step": 4610 }, { "epoch": 13.004, "grad_norm": 0.46435093879699707, "learning_rate": 8.1784126984127e-06, "loss": 0.4531, "step": 4620 }, { "epoch": 13.00457142857143, "grad_norm": 0.17606890201568604, "learning_rate": 8.172063492063492e-06, "loss": 0.7506, "step": 4630 }, { "epoch": 13.005142857142857, "grad_norm": 0.20931817591190338, "learning_rate": 8.165714285714286e-06, "loss": 0.2208, "step": 4640 }, { "epoch": 13.005714285714285, "grad_norm": 0.1586388796567917, "learning_rate": 8.15936507936508e-06, "loss": 0.1573, "step": 4650 }, { "epoch": 13.006285714285715, "grad_norm": 0.2133455127477646, "learning_rate": 8.153015873015873e-06, "loss": 1.0149, "step": 4660 }, { "epoch": 13.006857142857143, "grad_norm": 0.10580915212631226, "learning_rate": 8.146666666666668e-06, "loss": 0.6209, "step": 4670 }, { "epoch": 13.007428571428571, "grad_norm": 0.05809102952480316, "learning_rate": 8.14031746031746e-06, "loss": 0.4834, "step": 4680 }, { "epoch": 13.008, "grad_norm": 0.18141770362854004, "learning_rate": 8.133968253968255e-06, "loss": 0.9549, "step": 4690 }, { "epoch": 13.008571428571429, "grad_norm": 0.3100875914096832, "learning_rate": 8.127619047619049e-06, "loss": 0.4513, "step": 4700 }, { "epoch": 13.009142857142857, "grad_norm": 12.605453491210938, "learning_rate": 8.121269841269842e-06, "loss": 0.9802, "step": 4710 }, { "epoch": 13.009714285714285, "grad_norm": 18.329814910888672, "learning_rate": 8.114920634920636e-06, "loss": 0.4474, "step": 4720 }, { "epoch": 13.010285714285715, "grad_norm": 0.18632696568965912, "learning_rate": 8.108571428571429e-06, "loss": 0.5227, "step": 4730 }, { "epoch": 13.010857142857143, "grad_norm": 0.15728500485420227, "learning_rate": 8.102222222222223e-06, "loss": 0.3175, "step": 4740 }, { "epoch": 13.01142857142857, "grad_norm": 16.559619903564453, "learning_rate": 8.095873015873017e-06, "loss": 0.4173, "step": 4750 }, { "epoch": 13.012, "grad_norm": 0.04901667311787605, "learning_rate": 8.08952380952381e-06, "loss": 0.3976, "step": 4760 }, { "epoch": 13.012571428571428, "grad_norm": 0.22993479669094086, "learning_rate": 8.083174603174604e-06, "loss": 0.1813, "step": 4770 }, { "epoch": 13.013142857142856, "grad_norm": 0.3767968416213989, "learning_rate": 8.076825396825397e-06, "loss": 0.7901, "step": 4780 }, { "epoch": 13.013714285714286, "grad_norm": 0.03023025207221508, "learning_rate": 8.070476190476191e-06, "loss": 0.1889, "step": 4790 }, { "epoch": 13.014285714285714, "grad_norm": 0.9322307109832764, "learning_rate": 8.064126984126984e-06, "loss": 1.3871, "step": 4800 }, { "epoch": 13.014857142857142, "grad_norm": 25.164409637451172, "learning_rate": 8.057777777777778e-06, "loss": 0.9618, "step": 4810 }, { "epoch": 13.015428571428572, "grad_norm": 94.88408660888672, "learning_rate": 8.051428571428573e-06, "loss": 0.632, "step": 4820 }, { "epoch": 13.016, "grad_norm": 0.15524975955486298, "learning_rate": 8.045079365079365e-06, "loss": 0.7624, "step": 4830 }, { "epoch": 13.016571428571428, "grad_norm": 0.06509774178266525, "learning_rate": 8.03873015873016e-06, "loss": 0.2143, "step": 4840 }, { "epoch": 13.017142857142858, "grad_norm": 0.24536223709583282, "learning_rate": 8.032380952380952e-06, "loss": 0.2895, "step": 4850 }, { "epoch": 13.017714285714286, "grad_norm": 58.50824737548828, "learning_rate": 8.026031746031746e-06, "loss": 0.0836, "step": 4860 }, { "epoch": 13.018285714285714, "grad_norm": 0.14553256332874298, "learning_rate": 8.01968253968254e-06, "loss": 0.672, "step": 4870 }, { "epoch": 13.018857142857144, "grad_norm": 0.06745839864015579, "learning_rate": 8.013333333333333e-06, "loss": 0.6554, "step": 4880 }, { "epoch": 13.019428571428572, "grad_norm": 0.15893854200839996, "learning_rate": 8.006984126984128e-06, "loss": 0.6895, "step": 4890 }, { "epoch": 13.02, "grad_norm": 0.23196399211883545, "learning_rate": 8.00063492063492e-06, "loss": 0.2413, "step": 4900 }, { "epoch": 13.02, "eval_accuracy": 0.8518518518518519, "eval_loss": 0.6480634808540344, "eval_runtime": 126.6312, "eval_samples_per_second": 2.345, "eval_steps_per_second": 1.177, "step": 4900 }, { "epoch": 14.000571428571428, "grad_norm": 0.1645181030035019, "learning_rate": 7.994285714285715e-06, "loss": 0.6897, "step": 4910 }, { "epoch": 14.001142857142858, "grad_norm": 168.658203125, "learning_rate": 7.987936507936509e-06, "loss": 0.8059, "step": 4920 }, { "epoch": 14.001714285714286, "grad_norm": 0.17724451422691345, "learning_rate": 7.981587301587302e-06, "loss": 0.4341, "step": 4930 }, { "epoch": 14.002285714285714, "grad_norm": 0.25565865635871887, "learning_rate": 7.975238095238096e-06, "loss": 0.588, "step": 4940 }, { "epoch": 14.002857142857144, "grad_norm": 0.2059396207332611, "learning_rate": 7.968888888888889e-06, "loss": 0.2607, "step": 4950 }, { "epoch": 14.003428571428572, "grad_norm": 21.05925941467285, "learning_rate": 7.962539682539683e-06, "loss": 0.676, "step": 4960 }, { "epoch": 14.004, "grad_norm": 0.20337478816509247, "learning_rate": 7.956190476190477e-06, "loss": 0.235, "step": 4970 }, { "epoch": 14.00457142857143, "grad_norm": 12.385817527770996, "learning_rate": 7.949841269841272e-06, "loss": 1.1436, "step": 4980 }, { "epoch": 14.005142857142857, "grad_norm": 169.90371704101562, "learning_rate": 7.943492063492064e-06, "loss": 0.4043, "step": 4990 }, { "epoch": 14.005714285714285, "grad_norm": 0.9865061044692993, "learning_rate": 7.937142857142857e-06, "loss": 0.5453, "step": 5000 }, { "epoch": 14.006285714285715, "grad_norm": 0.21574608981609344, "learning_rate": 7.930793650793651e-06, "loss": 0.4185, "step": 5010 }, { "epoch": 14.006857142857143, "grad_norm": 0.22090749442577362, "learning_rate": 7.924444444444444e-06, "loss": 0.6939, "step": 5020 }, { "epoch": 14.007428571428571, "grad_norm": 0.02600650116801262, "learning_rate": 7.91809523809524e-06, "loss": 0.8623, "step": 5030 }, { "epoch": 14.008, "grad_norm": 0.13779287040233612, "learning_rate": 7.911746031746033e-06, "loss": 0.0248, "step": 5040 }, { "epoch": 14.008571428571429, "grad_norm": 12.554780960083008, "learning_rate": 7.905396825396827e-06, "loss": 0.9988, "step": 5050 }, { "epoch": 14.009142857142857, "grad_norm": 0.41419580578804016, "learning_rate": 7.89904761904762e-06, "loss": 0.2302, "step": 5060 }, { "epoch": 14.009714285714285, "grad_norm": 0.16763442754745483, "learning_rate": 7.892698412698412e-06, "loss": 0.8218, "step": 5070 }, { "epoch": 14.010285714285715, "grad_norm": 120.92338562011719, "learning_rate": 7.886349206349208e-06, "loss": 0.5738, "step": 5080 }, { "epoch": 14.010857142857143, "grad_norm": 0.15212488174438477, "learning_rate": 7.88e-06, "loss": 0.8477, "step": 5090 }, { "epoch": 14.01142857142857, "grad_norm": 0.14688020944595337, "learning_rate": 7.873650793650795e-06, "loss": 0.2236, "step": 5100 }, { "epoch": 14.012, "grad_norm": 0.11865352839231491, "learning_rate": 7.867301587301588e-06, "loss": 0.2869, "step": 5110 }, { "epoch": 14.012571428571428, "grad_norm": 13.466479301452637, "learning_rate": 7.860952380952382e-06, "loss": 0.4885, "step": 5120 }, { "epoch": 14.013142857142856, "grad_norm": 0.07300411909818649, "learning_rate": 7.854603174603175e-06, "loss": 0.2464, "step": 5130 }, { "epoch": 14.013714285714286, "grad_norm": 0.052624981850385666, "learning_rate": 7.848253968253969e-06, "loss": 0.946, "step": 5140 }, { "epoch": 14.014285714285714, "grad_norm": 0.09969048947095871, "learning_rate": 7.841904761904763e-06, "loss": 0.3992, "step": 5150 }, { "epoch": 14.014857142857142, "grad_norm": 49.853572845458984, "learning_rate": 7.835555555555556e-06, "loss": 0.0232, "step": 5160 }, { "epoch": 14.015428571428572, "grad_norm": 0.3982076644897461, "learning_rate": 7.82920634920635e-06, "loss": 0.3344, "step": 5170 }, { "epoch": 14.016, "grad_norm": 1.8964109420776367, "learning_rate": 7.822857142857143e-06, "loss": 0.4479, "step": 5180 }, { "epoch": 14.016571428571428, "grad_norm": 13.034674644470215, "learning_rate": 7.816507936507937e-06, "loss": 0.4343, "step": 5190 }, { "epoch": 14.017142857142858, "grad_norm": 0.1686258316040039, "learning_rate": 7.810158730158732e-06, "loss": 0.4579, "step": 5200 }, { "epoch": 14.017714285714286, "grad_norm": 0.14098533987998962, "learning_rate": 7.803809523809524e-06, "loss": 0.49, "step": 5210 }, { "epoch": 14.018285714285714, "grad_norm": 13.026440620422363, "learning_rate": 7.797460317460319e-06, "loss": 0.2029, "step": 5220 }, { "epoch": 14.018857142857144, "grad_norm": 12.317622184753418, "learning_rate": 7.791111111111111e-06, "loss": 0.7561, "step": 5230 }, { "epoch": 14.019428571428572, "grad_norm": 0.3012208938598633, "learning_rate": 7.784761904761906e-06, "loss": 0.2343, "step": 5240 }, { "epoch": 14.02, "grad_norm": 151.4730987548828, "learning_rate": 7.7784126984127e-06, "loss": 0.6278, "step": 5250 }, { "epoch": 14.02, "eval_accuracy": 0.8552188552188552, "eval_loss": 0.6555138826370239, "eval_runtime": 126.3021, "eval_samples_per_second": 2.352, "eval_steps_per_second": 1.18, "step": 5250 }, { "epoch": 15.000571428571428, "grad_norm": 0.2991902828216553, "learning_rate": 7.772063492063493e-06, "loss": 0.8981, "step": 5260 }, { "epoch": 15.001142857142858, "grad_norm": 0.3251695930957794, "learning_rate": 7.765714285714287e-06, "loss": 0.552, "step": 5270 }, { "epoch": 15.001714285714286, "grad_norm": 0.38806581497192383, "learning_rate": 7.75936507936508e-06, "loss": 0.8149, "step": 5280 }, { "epoch": 15.002285714285714, "grad_norm": 1.2852102518081665, "learning_rate": 7.753015873015874e-06, "loss": 0.7302, "step": 5290 }, { "epoch": 15.002857142857144, "grad_norm": 0.33824622631073, "learning_rate": 7.746666666666666e-06, "loss": 0.6803, "step": 5300 }, { "epoch": 15.003428571428572, "grad_norm": 0.7059460282325745, "learning_rate": 7.74031746031746e-06, "loss": 0.8826, "step": 5310 }, { "epoch": 15.004, "grad_norm": 0.17795021831989288, "learning_rate": 7.733968253968255e-06, "loss": 0.3081, "step": 5320 }, { "epoch": 15.00457142857143, "grad_norm": 0.21305586397647858, "learning_rate": 7.727619047619048e-06, "loss": 0.4185, "step": 5330 }, { "epoch": 15.005142857142857, "grad_norm": 0.458345890045166, "learning_rate": 7.721269841269842e-06, "loss": 0.6545, "step": 5340 }, { "epoch": 15.005714285714285, "grad_norm": 0.11840073764324188, "learning_rate": 7.714920634920635e-06, "loss": 0.2937, "step": 5350 }, { "epoch": 15.006285714285715, "grad_norm": 0.17106355726718903, "learning_rate": 7.708571428571429e-06, "loss": 0.2317, "step": 5360 }, { "epoch": 15.006857142857143, "grad_norm": 0.09697124361991882, "learning_rate": 7.702222222222223e-06, "loss": 0.4099, "step": 5370 }, { "epoch": 15.007428571428571, "grad_norm": 0.021129153668880463, "learning_rate": 7.695873015873016e-06, "loss": 0.2169, "step": 5380 }, { "epoch": 15.008, "grad_norm": 0.4217149019241333, "learning_rate": 7.68952380952381e-06, "loss": 0.784, "step": 5390 }, { "epoch": 15.008571428571429, "grad_norm": 0.17383898794651031, "learning_rate": 7.683174603174603e-06, "loss": 0.2046, "step": 5400 }, { "epoch": 15.009142857142857, "grad_norm": 0.08642620593309402, "learning_rate": 7.676825396825397e-06, "loss": 0.2429, "step": 5410 }, { "epoch": 15.009714285714285, "grad_norm": 13.887992858886719, "learning_rate": 7.670476190476192e-06, "loss": 0.9628, "step": 5420 }, { "epoch": 15.010285714285715, "grad_norm": 0.249220609664917, "learning_rate": 7.664126984126984e-06, "loss": 0.5269, "step": 5430 }, { "epoch": 15.010857142857143, "grad_norm": 0.07579600065946579, "learning_rate": 7.657777777777779e-06, "loss": 0.4449, "step": 5440 }, { "epoch": 15.01142857142857, "grad_norm": 0.12848280370235443, "learning_rate": 7.651428571428571e-06, "loss": 0.245, "step": 5450 }, { "epoch": 15.012, "grad_norm": 13.671841621398926, "learning_rate": 7.645079365079366e-06, "loss": 0.7608, "step": 5460 }, { "epoch": 15.012571428571428, "grad_norm": 0.19747650623321533, "learning_rate": 7.63873015873016e-06, "loss": 0.3063, "step": 5470 }, { "epoch": 15.013142857142856, "grad_norm": 0.2382912039756775, "learning_rate": 7.632380952380953e-06, "loss": 0.6737, "step": 5480 }, { "epoch": 15.013714285714286, "grad_norm": 0.34730061888694763, "learning_rate": 7.626031746031747e-06, "loss": 0.3126, "step": 5490 }, { "epoch": 15.014285714285714, "grad_norm": 12.084749221801758, "learning_rate": 7.61968253968254e-06, "loss": 0.4176, "step": 5500 }, { "epoch": 15.014857142857142, "grad_norm": 0.24228954315185547, "learning_rate": 7.613333333333334e-06, "loss": 0.4131, "step": 5510 }, { "epoch": 15.015428571428572, "grad_norm": 40.641319274902344, "learning_rate": 7.606984126984127e-06, "loss": 0.5213, "step": 5520 }, { "epoch": 15.016, "grad_norm": 13.184736251831055, "learning_rate": 7.600634920634922e-06, "loss": 0.7655, "step": 5530 }, { "epoch": 15.016571428571428, "grad_norm": 0.10150767862796783, "learning_rate": 7.594285714285715e-06, "loss": 0.7638, "step": 5540 }, { "epoch": 15.017142857142858, "grad_norm": 0.16732892394065857, "learning_rate": 7.587936507936509e-06, "loss": 0.2141, "step": 5550 }, { "epoch": 15.017714285714286, "grad_norm": 0.31903275847435, "learning_rate": 7.581587301587302e-06, "loss": 0.44, "step": 5560 }, { "epoch": 15.018285714285714, "grad_norm": 0.04827267304062843, "learning_rate": 7.575238095238096e-06, "loss": 0.0074, "step": 5570 }, { "epoch": 15.018857142857144, "grad_norm": 0.05066690221428871, "learning_rate": 7.56888888888889e-06, "loss": 1.0622, "step": 5580 }, { "epoch": 15.019428571428572, "grad_norm": 0.14147022366523743, "learning_rate": 7.562539682539683e-06, "loss": 0.5052, "step": 5590 }, { "epoch": 15.02, "grad_norm": 0.4057610332965851, "learning_rate": 7.556190476190477e-06, "loss": 0.0107, "step": 5600 }, { "epoch": 15.02, "eval_accuracy": 0.8518518518518519, "eval_loss": 0.6550477147102356, "eval_runtime": 126.4968, "eval_samples_per_second": 2.348, "eval_steps_per_second": 1.178, "step": 5600 }, { "epoch": 16.00057142857143, "grad_norm": 0.12237854301929474, "learning_rate": 7.54984126984127e-06, "loss": 0.0925, "step": 5610 }, { "epoch": 16.001142857142856, "grad_norm": 0.1167961061000824, "learning_rate": 7.543492063492064e-06, "loss": 0.6409, "step": 5620 }, { "epoch": 16.001714285714286, "grad_norm": 0.33305051922798157, "learning_rate": 7.537142857142857e-06, "loss": 0.961, "step": 5630 }, { "epoch": 16.002285714285716, "grad_norm": 0.10626853257417679, "learning_rate": 7.530793650793652e-06, "loss": 1.2608, "step": 5640 }, { "epoch": 16.002857142857142, "grad_norm": 67.93856048583984, "learning_rate": 7.524444444444445e-06, "loss": 0.7227, "step": 5650 }, { "epoch": 16.00342857142857, "grad_norm": 0.09368494153022766, "learning_rate": 7.518095238095239e-06, "loss": 0.0237, "step": 5660 }, { "epoch": 16.004, "grad_norm": 0.1382751762866974, "learning_rate": 7.511746031746032e-06, "loss": 0.4677, "step": 5670 }, { "epoch": 16.004571428571428, "grad_norm": 0.31585273146629333, "learning_rate": 7.505396825396826e-06, "loss": 0.8341, "step": 5680 }, { "epoch": 16.005142857142857, "grad_norm": 0.2557847797870636, "learning_rate": 7.499047619047619e-06, "loss": 0.2347, "step": 5690 }, { "epoch": 16.005714285714287, "grad_norm": 0.14112916588783264, "learning_rate": 7.492698412698413e-06, "loss": 0.7506, "step": 5700 }, { "epoch": 16.006285714285713, "grad_norm": 109.31643676757812, "learning_rate": 7.486349206349207e-06, "loss": 0.6344, "step": 5710 }, { "epoch": 16.006857142857143, "grad_norm": 0.407155305147171, "learning_rate": 7.48e-06, "loss": 0.2471, "step": 5720 }, { "epoch": 16.007428571428573, "grad_norm": 0.037655897438526154, "learning_rate": 7.473650793650794e-06, "loss": 0.4707, "step": 5730 }, { "epoch": 16.008, "grad_norm": 0.08762380480766296, "learning_rate": 7.467301587301587e-06, "loss": 0.0635, "step": 5740 }, { "epoch": 16.00857142857143, "grad_norm": 0.15955060720443726, "learning_rate": 7.460952380952382e-06, "loss": 0.5162, "step": 5750 }, { "epoch": 16.00914285714286, "grad_norm": 13.772893905639648, "learning_rate": 7.454603174603175e-06, "loss": 0.6237, "step": 5760 }, { "epoch": 16.009714285714285, "grad_norm": 20.581626892089844, "learning_rate": 7.448253968253969e-06, "loss": 0.7149, "step": 5770 }, { "epoch": 16.010285714285715, "grad_norm": 0.14489834010601044, "learning_rate": 7.441904761904762e-06, "loss": 0.0079, "step": 5780 }, { "epoch": 16.010857142857144, "grad_norm": 0.1383170783519745, "learning_rate": 7.435555555555556e-06, "loss": 0.2527, "step": 5790 }, { "epoch": 16.01142857142857, "grad_norm": 68.98406219482422, "learning_rate": 7.429206349206349e-06, "loss": 0.3456, "step": 5800 }, { "epoch": 16.012, "grad_norm": 0.11179056763648987, "learning_rate": 7.422857142857144e-06, "loss": 0.0647, "step": 5810 }, { "epoch": 16.01257142857143, "grad_norm": 0.18071648478507996, "learning_rate": 7.416507936507937e-06, "loss": 0.5227, "step": 5820 }, { "epoch": 16.013142857142856, "grad_norm": 67.35832214355469, "learning_rate": 7.41015873015873e-06, "loss": 0.3971, "step": 5830 }, { "epoch": 16.013714285714286, "grad_norm": 0.1837083250284195, "learning_rate": 7.403809523809524e-06, "loss": 0.8512, "step": 5840 }, { "epoch": 16.014285714285716, "grad_norm": 0.038320187479257584, "learning_rate": 7.397460317460317e-06, "loss": 0.0037, "step": 5850 }, { "epoch": 16.014857142857142, "grad_norm": 20.66285514831543, "learning_rate": 7.3911111111111125e-06, "loss": 1.123, "step": 5860 }, { "epoch": 16.015428571428572, "grad_norm": 133.61224365234375, "learning_rate": 7.384761904761906e-06, "loss": 0.3835, "step": 5870 }, { "epoch": 16.016, "grad_norm": 0.17678341269493103, "learning_rate": 7.3784126984126995e-06, "loss": 0.1001, "step": 5880 }, { "epoch": 16.016571428571428, "grad_norm": 0.09736228734254837, "learning_rate": 7.372063492063493e-06, "loss": 0.2866, "step": 5890 }, { "epoch": 16.017142857142858, "grad_norm": 0.09368955343961716, "learning_rate": 7.365714285714286e-06, "loss": 1.0441, "step": 5900 }, { "epoch": 16.017714285714284, "grad_norm": 0.10775759816169739, "learning_rate": 7.359365079365079e-06, "loss": 0.3626, "step": 5910 }, { "epoch": 16.018285714285714, "grad_norm": 0.2658800482749939, "learning_rate": 7.353015873015874e-06, "loss": 0.3755, "step": 5920 }, { "epoch": 16.018857142857144, "grad_norm": 0.24060992896556854, "learning_rate": 7.346666666666668e-06, "loss": 1.028, "step": 5930 }, { "epoch": 16.01942857142857, "grad_norm": 0.12600265443325043, "learning_rate": 7.340317460317461e-06, "loss": 0.5612, "step": 5940 }, { "epoch": 16.02, "grad_norm": 0.12872040271759033, "learning_rate": 7.333968253968255e-06, "loss": 0.3013, "step": 5950 }, { "epoch": 16.02, "eval_accuracy": 0.8484848484848485, "eval_loss": 0.7404991984367371, "eval_runtime": 126.748, "eval_samples_per_second": 2.343, "eval_steps_per_second": 1.176, "step": 5950 }, { "epoch": 17.00057142857143, "grad_norm": 0.23484548926353455, "learning_rate": 7.327619047619048e-06, "loss": 0.4457, "step": 5960 }, { "epoch": 17.001142857142856, "grad_norm": 0.14240051805973053, "learning_rate": 7.3212698412698425e-06, "loss": 0.0042, "step": 5970 }, { "epoch": 17.001714285714286, "grad_norm": 0.15564581751823425, "learning_rate": 7.314920634920636e-06, "loss": 0.6894, "step": 5980 }, { "epoch": 17.002285714285716, "grad_norm": 127.26374816894531, "learning_rate": 7.3085714285714295e-06, "loss": 1.2778, "step": 5990 }, { "epoch": 17.002857142857142, "grad_norm": 0.1205485537648201, "learning_rate": 7.302222222222223e-06, "loss": 0.2591, "step": 6000 }, { "epoch": 17.00342857142857, "grad_norm": 0.14839039742946625, "learning_rate": 7.2958730158730165e-06, "loss": 0.2446, "step": 6010 }, { "epoch": 17.004, "grad_norm": 0.2482178807258606, "learning_rate": 7.28952380952381e-06, "loss": 1.1471, "step": 6020 }, { "epoch": 17.004571428571428, "grad_norm": 0.6684224009513855, "learning_rate": 7.283174603174604e-06, "loss": 0.1935, "step": 6030 }, { "epoch": 17.005142857142857, "grad_norm": 0.31114834547042847, "learning_rate": 7.276825396825398e-06, "loss": 0.374, "step": 6040 }, { "epoch": 17.005714285714287, "grad_norm": 0.241834819316864, "learning_rate": 7.270476190476191e-06, "loss": 0.932, "step": 6050 }, { "epoch": 17.006285714285713, "grad_norm": 17.917661666870117, "learning_rate": 7.264126984126985e-06, "loss": 0.5023, "step": 6060 }, { "epoch": 17.006857142857143, "grad_norm": 0.11860672384500504, "learning_rate": 7.257777777777778e-06, "loss": 0.2441, "step": 6070 }, { "epoch": 17.007428571428573, "grad_norm": 0.12156573683023453, "learning_rate": 7.251428571428572e-06, "loss": 0.4809, "step": 6080 }, { "epoch": 17.008, "grad_norm": 32.46079635620117, "learning_rate": 7.245079365079366e-06, "loss": 1.0452, "step": 6090 }, { "epoch": 17.00857142857143, "grad_norm": 1.8947296142578125, "learning_rate": 7.2387301587301595e-06, "loss": 0.2181, "step": 6100 }, { "epoch": 17.00914285714286, "grad_norm": 1.0444539785385132, "learning_rate": 7.232380952380953e-06, "loss": 0.2129, "step": 6110 }, { "epoch": 17.009714285714285, "grad_norm": 9.648567199707031, "learning_rate": 7.2260317460317465e-06, "loss": 0.2156, "step": 6120 }, { "epoch": 17.010285714285715, "grad_norm": 0.24711275100708008, "learning_rate": 7.21968253968254e-06, "loss": 0.0108, "step": 6130 }, { "epoch": 17.010857142857144, "grad_norm": 0.22460851073265076, "learning_rate": 7.213333333333334e-06, "loss": 0.645, "step": 6140 }, { "epoch": 17.01142857142857, "grad_norm": 49.333404541015625, "learning_rate": 7.206984126984128e-06, "loss": 0.6497, "step": 6150 }, { "epoch": 17.012, "grad_norm": 0.22554242610931396, "learning_rate": 7.200634920634921e-06, "loss": 0.7637, "step": 6160 }, { "epoch": 17.01257142857143, "grad_norm": 0.2542533576488495, "learning_rate": 7.194285714285715e-06, "loss": 0.6032, "step": 6170 }, { "epoch": 17.013142857142856, "grad_norm": 0.03277142718434334, "learning_rate": 7.187936507936508e-06, "loss": 0.4751, "step": 6180 }, { "epoch": 17.013714285714286, "grad_norm": 0.09158936887979507, "learning_rate": 7.181587301587302e-06, "loss": 0.433, "step": 6190 }, { "epoch": 17.014285714285716, "grad_norm": 0.2807057201862335, "learning_rate": 7.175238095238096e-06, "loss": 0.2971, "step": 6200 }, { "epoch": 17.014857142857142, "grad_norm": 80.08293914794922, "learning_rate": 7.1688888888888895e-06, "loss": 0.4443, "step": 6210 }, { "epoch": 17.015428571428572, "grad_norm": 0.19456472992897034, "learning_rate": 7.162539682539683e-06, "loss": 0.5821, "step": 6220 }, { "epoch": 17.016, "grad_norm": 0.07997258752584457, "learning_rate": 7.1561904761904765e-06, "loss": 1.0668, "step": 6230 }, { "epoch": 17.016571428571428, "grad_norm": 0.1534426212310791, "learning_rate": 7.14984126984127e-06, "loss": 0.4562, "step": 6240 }, { "epoch": 17.017142857142858, "grad_norm": 0.36759310960769653, "learning_rate": 7.143492063492064e-06, "loss": 0.2592, "step": 6250 }, { "epoch": 17.017714285714284, "grad_norm": 0.15657585859298706, "learning_rate": 7.137142857142858e-06, "loss": 0.5768, "step": 6260 }, { "epoch": 17.018285714285714, "grad_norm": 0.13546155393123627, "learning_rate": 7.130793650793651e-06, "loss": 0.2564, "step": 6270 }, { "epoch": 17.018857142857144, "grad_norm": 0.14879682660102844, "learning_rate": 7.124444444444445e-06, "loss": 0.987, "step": 6280 }, { "epoch": 17.01942857142857, "grad_norm": 20.40616226196289, "learning_rate": 7.118095238095238e-06, "loss": 0.8061, "step": 6290 }, { "epoch": 17.02, "grad_norm": 0.3881203234195709, "learning_rate": 7.111746031746032e-06, "loss": 0.5055, "step": 6300 }, { "epoch": 17.02, "eval_accuracy": 0.8451178451178452, "eval_loss": 0.6562865376472473, "eval_runtime": 127.2408, "eval_samples_per_second": 2.334, "eval_steps_per_second": 1.171, "step": 6300 }, { "epoch": 18.00057142857143, "grad_norm": 1.636939287185669, "learning_rate": 7.105396825396826e-06, "loss": 0.9626, "step": 6310 }, { "epoch": 18.001142857142856, "grad_norm": 1.2696601152420044, "learning_rate": 7.0990476190476195e-06, "loss": 1.1254, "step": 6320 }, { "epoch": 18.001714285714286, "grad_norm": 31.090442657470703, "learning_rate": 7.092698412698413e-06, "loss": 0.2333, "step": 6330 }, { "epoch": 18.002285714285716, "grad_norm": 122.07534790039062, "learning_rate": 7.0863492063492065e-06, "loss": 0.3172, "step": 6340 }, { "epoch": 18.002857142857142, "grad_norm": 0.2747611701488495, "learning_rate": 7.08e-06, "loss": 0.3612, "step": 6350 }, { "epoch": 18.00342857142857, "grad_norm": 0.3476632535457611, "learning_rate": 7.073650793650795e-06, "loss": 0.8514, "step": 6360 }, { "epoch": 18.004, "grad_norm": 0.03347006067633629, "learning_rate": 7.067301587301589e-06, "loss": 0.3275, "step": 6370 }, { "epoch": 18.004571428571428, "grad_norm": 84.09701538085938, "learning_rate": 7.060952380952381e-06, "loss": 0.5278, "step": 6380 }, { "epoch": 18.005142857142857, "grad_norm": 0.21676389873027802, "learning_rate": 7.054603174603175e-06, "loss": 0.612, "step": 6390 }, { "epoch": 18.005714285714287, "grad_norm": 160.529541015625, "learning_rate": 7.048253968253968e-06, "loss": 0.4884, "step": 6400 }, { "epoch": 18.006285714285713, "grad_norm": 0.364534467458725, "learning_rate": 7.041904761904762e-06, "loss": 0.0083, "step": 6410 }, { "epoch": 18.006857142857143, "grad_norm": 0.11234438419342041, "learning_rate": 7.035555555555557e-06, "loss": 0.0106, "step": 6420 }, { "epoch": 18.007428571428573, "grad_norm": 1.1077744960784912, "learning_rate": 7.02920634920635e-06, "loss": 0.0066, "step": 6430 }, { "epoch": 18.008, "grad_norm": 0.13264243304729462, "learning_rate": 7.022857142857144e-06, "loss": 0.4173, "step": 6440 }, { "epoch": 18.00857142857143, "grad_norm": 0.1175365149974823, "learning_rate": 7.016507936507937e-06, "loss": 0.6565, "step": 6450 }, { "epoch": 18.00914285714286, "grad_norm": 0.2617458701133728, "learning_rate": 7.01015873015873e-06, "loss": 0.2389, "step": 6460 }, { "epoch": 18.009714285714285, "grad_norm": 46.34348678588867, "learning_rate": 7.0038095238095235e-06, "loss": 0.4956, "step": 6470 }, { "epoch": 18.010285714285715, "grad_norm": 153.98577880859375, "learning_rate": 6.997460317460319e-06, "loss": 1.0497, "step": 6480 }, { "epoch": 18.010857142857144, "grad_norm": 0.19409751892089844, "learning_rate": 6.991111111111112e-06, "loss": 0.412, "step": 6490 }, { "epoch": 18.01142857142857, "grad_norm": 0.12201213091611862, "learning_rate": 6.984761904761906e-06, "loss": 0.4613, "step": 6500 }, { "epoch": 18.012, "grad_norm": 0.020613886415958405, "learning_rate": 6.978412698412699e-06, "loss": 0.3496, "step": 6510 }, { "epoch": 18.01257142857143, "grad_norm": 0.1668790876865387, "learning_rate": 6.9720634920634926e-06, "loss": 0.4584, "step": 6520 }, { "epoch": 18.013142857142856, "grad_norm": 0.15408121049404144, "learning_rate": 6.965714285714287e-06, "loss": 0.6383, "step": 6530 }, { "epoch": 18.013714285714286, "grad_norm": 12.476696968078613, "learning_rate": 6.95936507936508e-06, "loss": 0.4078, "step": 6540 }, { "epoch": 18.014285714285716, "grad_norm": 0.240266352891922, "learning_rate": 6.953015873015874e-06, "loss": 0.4204, "step": 6550 }, { "epoch": 18.014857142857142, "grad_norm": 0.20230837166309357, "learning_rate": 6.946666666666667e-06, "loss": 0.4396, "step": 6560 }, { "epoch": 18.015428571428572, "grad_norm": 33.412925720214844, "learning_rate": 6.940317460317461e-06, "loss": 0.7438, "step": 6570 }, { "epoch": 18.016, "grad_norm": 21.571928024291992, "learning_rate": 6.933968253968254e-06, "loss": 0.4501, "step": 6580 }, { "epoch": 18.016571428571428, "grad_norm": 0.3260933458805084, "learning_rate": 6.927619047619049e-06, "loss": 0.1881, "step": 6590 }, { "epoch": 18.017142857142858, "grad_norm": 0.07090619206428528, "learning_rate": 6.921269841269842e-06, "loss": 0.7295, "step": 6600 }, { "epoch": 18.017714285714284, "grad_norm": 0.2709910571575165, "learning_rate": 6.914920634920636e-06, "loss": 0.0077, "step": 6610 }, { "epoch": 18.018285714285714, "grad_norm": 0.2425944209098816, "learning_rate": 6.908571428571429e-06, "loss": 0.7675, "step": 6620 }, { "epoch": 18.018857142857144, "grad_norm": 12.433034896850586, "learning_rate": 6.902222222222223e-06, "loss": 0.5643, "step": 6630 }, { "epoch": 18.01942857142857, "grad_norm": 0.23004376888275146, "learning_rate": 6.895873015873017e-06, "loss": 0.2302, "step": 6640 }, { "epoch": 18.02, "grad_norm": 0.1142105832695961, "learning_rate": 6.88952380952381e-06, "loss": 0.0059, "step": 6650 }, { "epoch": 18.02, "eval_accuracy": 0.8484848484848485, "eval_loss": 0.6916897296905518, "eval_runtime": 126.3823, "eval_samples_per_second": 2.35, "eval_steps_per_second": 1.179, "step": 6650 }, { "epoch": 19.00057142857143, "grad_norm": 0.010271835140883923, "learning_rate": 6.883174603174604e-06, "loss": 0.158, "step": 6660 }, { "epoch": 19.001142857142856, "grad_norm": 15.551984786987305, "learning_rate": 6.876825396825397e-06, "loss": 0.7788, "step": 6670 }, { "epoch": 19.001714285714286, "grad_norm": 0.12954816222190857, "learning_rate": 6.870476190476191e-06, "loss": 0.3656, "step": 6680 }, { "epoch": 19.002285714285716, "grad_norm": 4.656391620635986, "learning_rate": 6.864126984126984e-06, "loss": 0.689, "step": 6690 }, { "epoch": 19.002857142857142, "grad_norm": 0.13569696247577667, "learning_rate": 6.857777777777779e-06, "loss": 0.1878, "step": 6700 }, { "epoch": 19.00342857142857, "grad_norm": 0.16348059475421906, "learning_rate": 6.851428571428572e-06, "loss": 0.8311, "step": 6710 }, { "epoch": 19.004, "grad_norm": 12.783551216125488, "learning_rate": 6.845079365079366e-06, "loss": 0.46, "step": 6720 }, { "epoch": 19.004571428571428, "grad_norm": 0.24606812000274658, "learning_rate": 6.838730158730159e-06, "loss": 0.587, "step": 6730 }, { "epoch": 19.005142857142857, "grad_norm": 0.12452604621648788, "learning_rate": 6.832380952380953e-06, "loss": 0.0415, "step": 6740 }, { "epoch": 19.005714285714287, "grad_norm": 14.892534255981445, "learning_rate": 6.826031746031747e-06, "loss": 0.7738, "step": 6750 }, { "epoch": 19.006285714285713, "grad_norm": 0.1208721324801445, "learning_rate": 6.81968253968254e-06, "loss": 0.2166, "step": 6760 }, { "epoch": 19.006857142857143, "grad_norm": 0.14853811264038086, "learning_rate": 6.813333333333334e-06, "loss": 0.0052, "step": 6770 }, { "epoch": 19.007428571428573, "grad_norm": 0.3613605201244354, "learning_rate": 6.806984126984127e-06, "loss": 0.2232, "step": 6780 }, { "epoch": 19.008, "grad_norm": 11.996315956115723, "learning_rate": 6.800634920634921e-06, "loss": 0.4984, "step": 6790 }, { "epoch": 19.00857142857143, "grad_norm": 11.95859146118164, "learning_rate": 6.794285714285714e-06, "loss": 0.525, "step": 6800 }, { "epoch": 19.00914285714286, "grad_norm": 0.1372225433588028, "learning_rate": 6.787936507936509e-06, "loss": 0.0046, "step": 6810 }, { "epoch": 19.009714285714285, "grad_norm": 13.401540756225586, "learning_rate": 6.781587301587302e-06, "loss": 0.2888, "step": 6820 }, { "epoch": 19.010285714285715, "grad_norm": 0.1333763599395752, "learning_rate": 6.775238095238096e-06, "loss": 0.4673, "step": 6830 }, { "epoch": 19.010857142857144, "grad_norm": 0.28985071182250977, "learning_rate": 6.768888888888889e-06, "loss": 0.4904, "step": 6840 }, { "epoch": 19.01142857142857, "grad_norm": 0.5672123432159424, "learning_rate": 6.762539682539683e-06, "loss": 0.38, "step": 6850 }, { "epoch": 19.012, "grad_norm": 0.18549109995365143, "learning_rate": 6.756190476190476e-06, "loss": 0.4654, "step": 6860 }, { "epoch": 19.01257142857143, "grad_norm": 0.005712473299354315, "learning_rate": 6.74984126984127e-06, "loss": 0.4693, "step": 6870 }, { "epoch": 19.013142857142856, "grad_norm": 0.20795473456382751, "learning_rate": 6.743492063492064e-06, "loss": 0.6987, "step": 6880 }, { "epoch": 19.013714285714286, "grad_norm": 0.2660701274871826, "learning_rate": 6.737142857142857e-06, "loss": 0.3575, "step": 6890 }, { "epoch": 19.014285714285716, "grad_norm": 12.896868705749512, "learning_rate": 6.730793650793651e-06, "loss": 1.0003, "step": 6900 }, { "epoch": 19.014857142857142, "grad_norm": 90.52599334716797, "learning_rate": 6.724444444444444e-06, "loss": 0.5481, "step": 6910 }, { "epoch": 19.015428571428572, "grad_norm": 0.11832074075937271, "learning_rate": 6.7180952380952395e-06, "loss": 0.461, "step": 6920 }, { "epoch": 19.016, "grad_norm": 0.021740248426795006, "learning_rate": 6.711746031746032e-06, "loss": 0.94, "step": 6930 }, { "epoch": 19.016571428571428, "grad_norm": 0.14676664769649506, "learning_rate": 6.705396825396826e-06, "loss": 0.2698, "step": 6940 }, { "epoch": 19.017142857142858, "grad_norm": 0.01525693666189909, "learning_rate": 6.699047619047619e-06, "loss": 0.5851, "step": 6950 }, { "epoch": 19.017714285714284, "grad_norm": 0.5535984039306641, "learning_rate": 6.692698412698413e-06, "loss": 0.3717, "step": 6960 }, { "epoch": 19.018285714285714, "grad_norm": 0.43303757905960083, "learning_rate": 6.686349206349206e-06, "loss": 0.5912, "step": 6970 }, { "epoch": 19.018857142857144, "grad_norm": 0.15641167759895325, "learning_rate": 6.680000000000001e-06, "loss": 0.2061, "step": 6980 }, { "epoch": 19.01942857142857, "grad_norm": 0.11748301982879639, "learning_rate": 6.673650793650795e-06, "loss": 0.326, "step": 6990 }, { "epoch": 19.02, "grad_norm": 0.05292058736085892, "learning_rate": 6.667301587301588e-06, "loss": 0.4332, "step": 7000 }, { "epoch": 19.02, "eval_accuracy": 0.8383838383838383, "eval_loss": 0.6888241767883301, "eval_runtime": 134.2388, "eval_samples_per_second": 2.212, "eval_steps_per_second": 1.11, "step": 7000 }, { "epoch": 20.00057142857143, "grad_norm": 0.4945196211338043, "learning_rate": 6.660952380952381e-06, "loss": 0.103, "step": 7010 }, { "epoch": 20.001142857142856, "grad_norm": 16.183570861816406, "learning_rate": 6.654603174603174e-06, "loss": 0.2379, "step": 7020 }, { "epoch": 20.001714285714286, "grad_norm": 0.07635916769504547, "learning_rate": 6.6482539682539695e-06, "loss": 0.6419, "step": 7030 }, { "epoch": 20.002285714285716, "grad_norm": 159.50653076171875, "learning_rate": 6.641904761904763e-06, "loss": 0.5751, "step": 7040 }, { "epoch": 20.002857142857142, "grad_norm": 0.09001730382442474, "learning_rate": 6.6355555555555565e-06, "loss": 0.254, "step": 7050 }, { "epoch": 20.00342857142857, "grad_norm": 0.07919283211231232, "learning_rate": 6.62920634920635e-06, "loss": 0.7664, "step": 7060 }, { "epoch": 20.004, "grad_norm": 280.9998779296875, "learning_rate": 6.6228571428571435e-06, "loss": 0.2373, "step": 7070 }, { "epoch": 20.004571428571428, "grad_norm": 0.3689022362232208, "learning_rate": 6.616507936507937e-06, "loss": 0.4658, "step": 7080 }, { "epoch": 20.005142857142857, "grad_norm": 0.10695375502109528, "learning_rate": 6.610158730158731e-06, "loss": 0.3698, "step": 7090 }, { "epoch": 20.005714285714287, "grad_norm": 0.13924677670001984, "learning_rate": 6.603809523809525e-06, "loss": 0.5151, "step": 7100 }, { "epoch": 20.006285714285713, "grad_norm": 0.05997047573328018, "learning_rate": 6.597460317460318e-06, "loss": 0.3443, "step": 7110 }, { "epoch": 20.006857142857143, "grad_norm": 15.691723823547363, "learning_rate": 6.591111111111112e-06, "loss": 0.9827, "step": 7120 }, { "epoch": 20.007428571428573, "grad_norm": 0.18234623968601227, "learning_rate": 6.584761904761905e-06, "loss": 0.5862, "step": 7130 }, { "epoch": 20.008, "grad_norm": 0.27075400948524475, "learning_rate": 6.5784126984126995e-06, "loss": 0.1941, "step": 7140 }, { "epoch": 20.00857142857143, "grad_norm": 1.7042484283447266, "learning_rate": 6.572063492063493e-06, "loss": 0.5991, "step": 7150 }, { "epoch": 20.00914285714286, "grad_norm": 0.16708026826381683, "learning_rate": 6.5657142857142865e-06, "loss": 0.3161, "step": 7160 }, { "epoch": 20.009714285714285, "grad_norm": 0.2559516131877899, "learning_rate": 6.55936507936508e-06, "loss": 0.3776, "step": 7170 }, { "epoch": 20.010285714285715, "grad_norm": 0.19110074639320374, "learning_rate": 6.5530158730158735e-06, "loss": 0.9548, "step": 7180 }, { "epoch": 20.010857142857144, "grad_norm": 56.03752517700195, "learning_rate": 6.546666666666667e-06, "loss": 0.2375, "step": 7190 }, { "epoch": 20.01142857142857, "grad_norm": 16.357572555541992, "learning_rate": 6.540317460317461e-06, "loss": 0.5034, "step": 7200 }, { "epoch": 20.012, "grad_norm": 0.40159788727760315, "learning_rate": 6.533968253968255e-06, "loss": 0.4687, "step": 7210 }, { "epoch": 20.01257142857143, "grad_norm": 0.009314529597759247, "learning_rate": 6.527619047619048e-06, "loss": 0.4526, "step": 7220 }, { "epoch": 20.013142857142856, "grad_norm": 0.13374114036560059, "learning_rate": 6.521269841269842e-06, "loss": 0.9903, "step": 7230 }, { "epoch": 20.013714285714286, "grad_norm": 0.020518776029348373, "learning_rate": 6.514920634920635e-06, "loss": 0.0067, "step": 7240 }, { "epoch": 20.014285714285716, "grad_norm": 0.15277713537216187, "learning_rate": 6.5085714285714295e-06, "loss": 0.0422, "step": 7250 }, { "epoch": 20.014857142857142, "grad_norm": 0.10491526871919632, "learning_rate": 6.502222222222223e-06, "loss": 0.0047, "step": 7260 }, { "epoch": 20.015428571428572, "grad_norm": 15.871146202087402, "learning_rate": 6.4958730158730165e-06, "loss": 0.2825, "step": 7270 }, { "epoch": 20.016, "grad_norm": 0.11634642630815506, "learning_rate": 6.48952380952381e-06, "loss": 0.0032, "step": 7280 }, { "epoch": 20.016571428571428, "grad_norm": 0.014390116557478905, "learning_rate": 6.4831746031746035e-06, "loss": 0.6739, "step": 7290 }, { "epoch": 20.017142857142858, "grad_norm": 0.012950708158314228, "learning_rate": 6.476825396825397e-06, "loss": 1.2164, "step": 7300 }, { "epoch": 20.017714285714284, "grad_norm": 0.13078835606575012, "learning_rate": 6.470476190476191e-06, "loss": 0.212, "step": 7310 }, { "epoch": 20.018285714285714, "grad_norm": 0.28521332144737244, "learning_rate": 6.464126984126985e-06, "loss": 0.6134, "step": 7320 }, { "epoch": 20.018857142857144, "grad_norm": 0.14261774718761444, "learning_rate": 6.457777777777778e-06, "loss": 0.0948, "step": 7330 }, { "epoch": 20.01942857142857, "grad_norm": 0.2758195400238037, "learning_rate": 6.451428571428572e-06, "loss": 0.1811, "step": 7340 }, { "epoch": 20.02, "grad_norm": 0.05165260285139084, "learning_rate": 6.445079365079365e-06, "loss": 0.2602, "step": 7350 }, { "epoch": 20.02, "eval_accuracy": 0.8417508417508418, "eval_loss": 0.7993361353874207, "eval_runtime": 133.8895, "eval_samples_per_second": 2.218, "eval_steps_per_second": 1.113, "step": 7350 }, { "epoch": 21.00057142857143, "grad_norm": 0.03576577454805374, "learning_rate": 6.438730158730159e-06, "loss": 0.2528, "step": 7360 }, { "epoch": 21.001142857142856, "grad_norm": 0.05446856468915939, "learning_rate": 6.432380952380953e-06, "loss": 0.0034, "step": 7370 }, { "epoch": 21.001714285714286, "grad_norm": 50.52811050415039, "learning_rate": 6.4260317460317465e-06, "loss": 1.0569, "step": 7380 }, { "epoch": 21.002285714285716, "grad_norm": 26.091224670410156, "learning_rate": 6.41968253968254e-06, "loss": 0.2889, "step": 7390 }, { "epoch": 21.002857142857142, "grad_norm": 0.09547077119350433, "learning_rate": 6.4133333333333335e-06, "loss": 0.0366, "step": 7400 }, { "epoch": 21.00342857142857, "grad_norm": 0.08614878356456757, "learning_rate": 6.406984126984127e-06, "loss": 0.2737, "step": 7410 }, { "epoch": 21.004, "grad_norm": 0.03283363953232765, "learning_rate": 6.400634920634921e-06, "loss": 0.0022, "step": 7420 }, { "epoch": 21.004571428571428, "grad_norm": 4.929975509643555, "learning_rate": 6.394285714285715e-06, "loss": 0.5315, "step": 7430 }, { "epoch": 21.005142857142857, "grad_norm": 106.62588500976562, "learning_rate": 6.387936507936508e-06, "loss": 0.4519, "step": 7440 }, { "epoch": 21.005714285714287, "grad_norm": 0.04590607061982155, "learning_rate": 6.381587301587302e-06, "loss": 0.4353, "step": 7450 }, { "epoch": 21.006285714285713, "grad_norm": 0.10316039621829987, "learning_rate": 6.375238095238095e-06, "loss": 0.2364, "step": 7460 }, { "epoch": 21.006857142857143, "grad_norm": 28.117801666259766, "learning_rate": 6.368888888888889e-06, "loss": 0.6987, "step": 7470 }, { "epoch": 21.007428571428573, "grad_norm": 0.1716347187757492, "learning_rate": 6.362539682539683e-06, "loss": 1.1197, "step": 7480 }, { "epoch": 21.008, "grad_norm": 17.753265380859375, "learning_rate": 6.3561904761904765e-06, "loss": 0.1678, "step": 7490 }, { "epoch": 21.00857142857143, "grad_norm": 0.021444451063871384, "learning_rate": 6.34984126984127e-06, "loss": 0.1589, "step": 7500 }, { "epoch": 21.00914285714286, "grad_norm": 0.013452350161969662, "learning_rate": 6.3434920634920635e-06, "loss": 0.4614, "step": 7510 }, { "epoch": 21.009714285714285, "grad_norm": 0.023600058630108833, "learning_rate": 6.337142857142857e-06, "loss": 0.25, "step": 7520 }, { "epoch": 21.010285714285715, "grad_norm": 0.13624207675457, "learning_rate": 6.330793650793652e-06, "loss": 0.334, "step": 7530 }, { "epoch": 21.010857142857144, "grad_norm": 0.15217389166355133, "learning_rate": 6.324444444444446e-06, "loss": 0.3943, "step": 7540 }, { "epoch": 21.01142857142857, "grad_norm": 0.11624295264482498, "learning_rate": 6.318095238095239e-06, "loss": 0.0209, "step": 7550 }, { "epoch": 21.012, "grad_norm": 0.5223240256309509, "learning_rate": 6.311746031746033e-06, "loss": 0.4057, "step": 7560 }, { "epoch": 21.01257142857143, "grad_norm": 0.059054210782051086, "learning_rate": 6.305396825396825e-06, "loss": 0.8236, "step": 7570 }, { "epoch": 21.013142857142856, "grad_norm": 0.0346602126955986, "learning_rate": 6.299047619047619e-06, "loss": 0.4062, "step": 7580 }, { "epoch": 21.013714285714286, "grad_norm": 0.2160838097333908, "learning_rate": 6.292698412698414e-06, "loss": 0.7491, "step": 7590 }, { "epoch": 21.014285714285716, "grad_norm": 0.17393529415130615, "learning_rate": 6.286349206349207e-06, "loss": 0.0512, "step": 7600 }, { "epoch": 21.014857142857142, "grad_norm": 12.681178092956543, "learning_rate": 6.280000000000001e-06, "loss": 0.6529, "step": 7610 }, { "epoch": 21.015428571428572, "grad_norm": 0.008576265536248684, "learning_rate": 6.273650793650794e-06, "loss": 0.8273, "step": 7620 }, { "epoch": 21.016, "grad_norm": 0.006474177818745375, "learning_rate": 6.267301587301588e-06, "loss": 0.0061, "step": 7630 }, { "epoch": 21.016571428571428, "grad_norm": 0.42665791511535645, "learning_rate": 6.260952380952382e-06, "loss": 0.924, "step": 7640 }, { "epoch": 21.017142857142858, "grad_norm": 37.20500946044922, "learning_rate": 6.254603174603176e-06, "loss": 0.2282, "step": 7650 }, { "epoch": 21.017714285714284, "grad_norm": 0.03610096871852875, "learning_rate": 6.248253968253969e-06, "loss": 0.2507, "step": 7660 }, { "epoch": 21.018285714285714, "grad_norm": 0.17031346261501312, "learning_rate": 6.241904761904763e-06, "loss": 0.483, "step": 7670 }, { "epoch": 21.018857142857144, "grad_norm": 13.083247184753418, "learning_rate": 6.235555555555556e-06, "loss": 0.3114, "step": 7680 }, { "epoch": 21.01942857142857, "grad_norm": 0.013108008541166782, "learning_rate": 6.2292063492063496e-06, "loss": 0.9822, "step": 7690 }, { "epoch": 21.02, "grad_norm": 0.19977082312107086, "learning_rate": 6.222857142857144e-06, "loss": 0.2142, "step": 7700 }, { "epoch": 21.02, "eval_accuracy": 0.8451178451178452, "eval_loss": 0.7130899429321289, "eval_runtime": 134.5619, "eval_samples_per_second": 2.207, "eval_steps_per_second": 1.107, "step": 7700 }, { "epoch": 22.00057142857143, "grad_norm": 0.12899892032146454, "learning_rate": 6.216507936507937e-06, "loss": 0.3791, "step": 7710 }, { "epoch": 22.001142857142856, "grad_norm": 0.06919983774423599, "learning_rate": 6.210158730158731e-06, "loss": 0.4267, "step": 7720 }, { "epoch": 22.001714285714286, "grad_norm": 0.082898810505867, "learning_rate": 6.203809523809524e-06, "loss": 0.2237, "step": 7730 }, { "epoch": 22.002285714285716, "grad_norm": 0.08086587488651276, "learning_rate": 6.197460317460318e-06, "loss": 0.3335, "step": 7740 }, { "epoch": 22.002857142857142, "grad_norm": 0.007148704957216978, "learning_rate": 6.191111111111111e-06, "loss": 0.261, "step": 7750 }, { "epoch": 22.00342857142857, "grad_norm": 13.431198120117188, "learning_rate": 6.184761904761906e-06, "loss": 0.9781, "step": 7760 }, { "epoch": 22.004, "grad_norm": 0.16154451668262482, "learning_rate": 6.178412698412699e-06, "loss": 0.0853, "step": 7770 }, { "epoch": 22.004571428571428, "grad_norm": 0.14190372824668884, "learning_rate": 6.172063492063493e-06, "loss": 0.035, "step": 7780 }, { "epoch": 22.005142857142857, "grad_norm": 13.651422500610352, "learning_rate": 6.165714285714286e-06, "loss": 0.7239, "step": 7790 }, { "epoch": 22.005714285714287, "grad_norm": 0.16447778046131134, "learning_rate": 6.15936507936508e-06, "loss": 0.3368, "step": 7800 }, { "epoch": 22.006285714285713, "grad_norm": 0.36849430203437805, "learning_rate": 6.153015873015874e-06, "loss": 0.2213, "step": 7810 }, { "epoch": 22.006857142857143, "grad_norm": 0.5472358465194702, "learning_rate": 6.146666666666667e-06, "loss": 0.7931, "step": 7820 }, { "epoch": 22.007428571428573, "grad_norm": 0.8773165941238403, "learning_rate": 6.140317460317461e-06, "loss": 0.1896, "step": 7830 }, { "epoch": 22.008, "grad_norm": 0.20025408267974854, "learning_rate": 6.133968253968254e-06, "loss": 0.72, "step": 7840 }, { "epoch": 22.00857142857143, "grad_norm": 0.0833997055888176, "learning_rate": 6.127619047619048e-06, "loss": 0.0071, "step": 7850 }, { "epoch": 22.00914285714286, "grad_norm": 0.018751641735434532, "learning_rate": 6.121269841269841e-06, "loss": 0.2319, "step": 7860 }, { "epoch": 22.009714285714285, "grad_norm": 12.105188369750977, "learning_rate": 6.114920634920636e-06, "loss": 0.7782, "step": 7870 }, { "epoch": 22.010285714285715, "grad_norm": 0.0960543304681778, "learning_rate": 6.108571428571429e-06, "loss": 0.0327, "step": 7880 }, { "epoch": 22.010857142857144, "grad_norm": 12.760798454284668, "learning_rate": 6.102222222222223e-06, "loss": 0.5455, "step": 7890 }, { "epoch": 22.01142857142857, "grad_norm": 0.19047430157661438, "learning_rate": 6.095873015873016e-06, "loss": 0.1856, "step": 7900 }, { "epoch": 22.012, "grad_norm": 12.85616683959961, "learning_rate": 6.08952380952381e-06, "loss": 0.2649, "step": 7910 }, { "epoch": 22.01257142857143, "grad_norm": 0.1496925801038742, "learning_rate": 6.083174603174604e-06, "loss": 0.756, "step": 7920 }, { "epoch": 22.013142857142856, "grad_norm": 0.1137370839715004, "learning_rate": 6.076825396825397e-06, "loss": 1.1056, "step": 7930 }, { "epoch": 22.013714285714286, "grad_norm": 14.750025749206543, "learning_rate": 6.070476190476191e-06, "loss": 0.5389, "step": 7940 }, { "epoch": 22.014285714285716, "grad_norm": 0.15537060797214508, "learning_rate": 6.064126984126984e-06, "loss": 0.2203, "step": 7950 }, { "epoch": 22.014857142857142, "grad_norm": 0.17418305575847626, "learning_rate": 6.057777777777778e-06, "loss": 0.5758, "step": 7960 }, { "epoch": 22.015428571428572, "grad_norm": 0.3206419050693512, "learning_rate": 6.051428571428571e-06, "loss": 0.0278, "step": 7970 }, { "epoch": 22.016, "grad_norm": 0.10264712572097778, "learning_rate": 6.045079365079366e-06, "loss": 0.4427, "step": 7980 }, { "epoch": 22.016571428571428, "grad_norm": 0.08854754269123077, "learning_rate": 6.038730158730159e-06, "loss": 0.1925, "step": 7990 }, { "epoch": 22.017142857142858, "grad_norm": 0.1728421151638031, "learning_rate": 6.032380952380953e-06, "loss": 0.3517, "step": 8000 }, { "epoch": 22.017714285714284, "grad_norm": 0.07346148788928986, "learning_rate": 6.026031746031746e-06, "loss": 0.1056, "step": 8010 }, { "epoch": 22.018285714285714, "grad_norm": 20.421218872070312, "learning_rate": 6.01968253968254e-06, "loss": 0.7583, "step": 8020 }, { "epoch": 22.018857142857144, "grad_norm": 0.0952727198600769, "learning_rate": 6.013333333333335e-06, "loss": 0.3924, "step": 8030 }, { "epoch": 22.01942857142857, "grad_norm": 139.4682159423828, "learning_rate": 6.006984126984127e-06, "loss": 0.1965, "step": 8040 }, { "epoch": 22.02, "grad_norm": 0.15866141021251678, "learning_rate": 6.000634920634921e-06, "loss": 0.5742, "step": 8050 }, { "epoch": 22.02, "eval_accuracy": 0.797979797979798, "eval_loss": 0.9735172986984253, "eval_runtime": 134.4584, "eval_samples_per_second": 2.209, "eval_steps_per_second": 1.108, "step": 8050 }, { "epoch": 23.00057142857143, "grad_norm": 0.1783732771873474, "learning_rate": 5.994285714285714e-06, "loss": 0.1797, "step": 8060 }, { "epoch": 23.001142857142856, "grad_norm": 0.5633581876754761, "learning_rate": 5.987936507936508e-06, "loss": 0.0058, "step": 8070 }, { "epoch": 23.001714285714286, "grad_norm": 0.052664969116449356, "learning_rate": 5.981587301587301e-06, "loss": 0.3705, "step": 8080 }, { "epoch": 23.002285714285716, "grad_norm": 1.9416977167129517, "learning_rate": 5.9752380952380965e-06, "loss": 0.8398, "step": 8090 }, { "epoch": 23.002857142857142, "grad_norm": 2.0996170043945312, "learning_rate": 5.96888888888889e-06, "loss": 0.5296, "step": 8100 }, { "epoch": 23.00342857142857, "grad_norm": 0.32839661836624146, "learning_rate": 5.9625396825396835e-06, "loss": 0.2443, "step": 8110 }, { "epoch": 23.004, "grad_norm": 0.007853704504668713, "learning_rate": 5.956190476190476e-06, "loss": 0.0044, "step": 8120 }, { "epoch": 23.004571428571428, "grad_norm": 12.491561889648438, "learning_rate": 5.94984126984127e-06, "loss": 0.4636, "step": 8130 }, { "epoch": 23.005142857142857, "grad_norm": 0.10937748849391937, "learning_rate": 5.943492063492063e-06, "loss": 0.664, "step": 8140 }, { "epoch": 23.005714285714287, "grad_norm": 0.1766849309206009, "learning_rate": 5.937142857142858e-06, "loss": 0.4253, "step": 8150 }, { "epoch": 23.006285714285713, "grad_norm": 55.246822357177734, "learning_rate": 5.930793650793652e-06, "loss": 0.5592, "step": 8160 }, { "epoch": 23.006857142857143, "grad_norm": 1.2145743370056152, "learning_rate": 5.924444444444445e-06, "loss": 0.0061, "step": 8170 }, { "epoch": 23.007428571428573, "grad_norm": 5.3756794929504395, "learning_rate": 5.918095238095239e-06, "loss": 0.2953, "step": 8180 }, { "epoch": 23.008, "grad_norm": 3.9541385173797607, "learning_rate": 5.911746031746032e-06, "loss": 0.5192, "step": 8190 }, { "epoch": 23.00857142857143, "grad_norm": 0.25989338755607605, "learning_rate": 5.9053968253968265e-06, "loss": 0.4489, "step": 8200 }, { "epoch": 23.00914285714286, "grad_norm": 0.16681231558322906, "learning_rate": 5.89904761904762e-06, "loss": 0.5296, "step": 8210 }, { "epoch": 23.009714285714285, "grad_norm": 0.2039841264486313, "learning_rate": 5.8926984126984135e-06, "loss": 0.5324, "step": 8220 }, { "epoch": 23.010285714285715, "grad_norm": 0.3426172137260437, "learning_rate": 5.886349206349207e-06, "loss": 0.663, "step": 8230 }, { "epoch": 23.010857142857144, "grad_norm": 0.007993980310857296, "learning_rate": 5.8800000000000005e-06, "loss": 0.2322, "step": 8240 }, { "epoch": 23.01142857142857, "grad_norm": 0.28963732719421387, "learning_rate": 5.873650793650794e-06, "loss": 0.2086, "step": 8250 }, { "epoch": 23.012, "grad_norm": 0.18354713916778564, "learning_rate": 5.867301587301588e-06, "loss": 0.0054, "step": 8260 }, { "epoch": 23.01257142857143, "grad_norm": 0.2161131054162979, "learning_rate": 5.860952380952382e-06, "loss": 0.0039, "step": 8270 }, { "epoch": 23.013142857142856, "grad_norm": 0.11740400642156601, "learning_rate": 5.854603174603175e-06, "loss": 0.4064, "step": 8280 }, { "epoch": 23.013714285714286, "grad_norm": 150.37094116210938, "learning_rate": 5.848253968253969e-06, "loss": 0.0248, "step": 8290 }, { "epoch": 23.014285714285716, "grad_norm": 0.1555628478527069, "learning_rate": 5.841904761904762e-06, "loss": 0.0036, "step": 8300 }, { "epoch": 23.014857142857142, "grad_norm": 0.13599923253059387, "learning_rate": 5.8355555555555565e-06, "loss": 0.2426, "step": 8310 }, { "epoch": 23.015428571428572, "grad_norm": 0.1228395327925682, "learning_rate": 5.82920634920635e-06, "loss": 0.8494, "step": 8320 }, { "epoch": 23.016, "grad_norm": 295.5143127441406, "learning_rate": 5.8228571428571435e-06, "loss": 0.459, "step": 8330 }, { "epoch": 23.016571428571428, "grad_norm": 14.874744415283203, "learning_rate": 5.816507936507937e-06, "loss": 0.4832, "step": 8340 }, { "epoch": 23.017142857142858, "grad_norm": 0.10557269304990768, "learning_rate": 5.8101587301587305e-06, "loss": 0.2448, "step": 8350 }, { "epoch": 23.017714285714284, "grad_norm": 0.11553741991519928, "learning_rate": 5.803809523809524e-06, "loss": 0.1697, "step": 8360 }, { "epoch": 23.018285714285714, "grad_norm": 0.08351312577724457, "learning_rate": 5.797460317460318e-06, "loss": 0.4602, "step": 8370 }, { "epoch": 23.018857142857144, "grad_norm": 6.845589637756348, "learning_rate": 5.791111111111112e-06, "loss": 0.3568, "step": 8380 }, { "epoch": 23.01942857142857, "grad_norm": 0.0753844752907753, "learning_rate": 5.784761904761905e-06, "loss": 0.0102, "step": 8390 }, { "epoch": 23.02, "grad_norm": 0.12463897466659546, "learning_rate": 5.778412698412699e-06, "loss": 0.2504, "step": 8400 }, { "epoch": 23.02, "eval_accuracy": 0.8383838383838383, "eval_loss": 0.8313712477684021, "eval_runtime": 134.2181, "eval_samples_per_second": 2.213, "eval_steps_per_second": 1.11, "step": 8400 }, { "epoch": 24.00057142857143, "grad_norm": 0.052970390766859055, "learning_rate": 5.772063492063492e-06, "loss": 0.2138, "step": 8410 }, { "epoch": 24.001142857142856, "grad_norm": 0.05925741419196129, "learning_rate": 5.7657142857142865e-06, "loss": 0.2863, "step": 8420 }, { "epoch": 24.001714285714286, "grad_norm": 0.0537530817091465, "learning_rate": 5.75936507936508e-06, "loss": 0.3034, "step": 8430 }, { "epoch": 24.002285714285716, "grad_norm": 0.23769424855709076, "learning_rate": 5.7530158730158735e-06, "loss": 0.4167, "step": 8440 }, { "epoch": 24.002857142857142, "grad_norm": 19.706758499145508, "learning_rate": 5.746666666666667e-06, "loss": 0.2434, "step": 8450 }, { "epoch": 24.00342857142857, "grad_norm": 0.2779850363731384, "learning_rate": 5.7403174603174605e-06, "loss": 0.8818, "step": 8460 }, { "epoch": 24.004, "grad_norm": 0.29331010580062866, "learning_rate": 5.733968253968254e-06, "loss": 0.226, "step": 8470 }, { "epoch": 24.004571428571428, "grad_norm": 15.611747741699219, "learning_rate": 5.727619047619048e-06, "loss": 0.2733, "step": 8480 }, { "epoch": 24.005142857142857, "grad_norm": 17.301841735839844, "learning_rate": 5.721269841269842e-06, "loss": 0.2518, "step": 8490 }, { "epoch": 24.005714285714287, "grad_norm": 0.828689455986023, "learning_rate": 5.714920634920635e-06, "loss": 0.0048, "step": 8500 }, { "epoch": 24.006285714285713, "grad_norm": 0.07569330185651779, "learning_rate": 5.708571428571429e-06, "loss": 0.2393, "step": 8510 }, { "epoch": 24.006857142857143, "grad_norm": 0.027848446741700172, "learning_rate": 5.702222222222222e-06, "loss": 1.2843, "step": 8520 }, { "epoch": 24.007428571428573, "grad_norm": 0.09675378352403641, "learning_rate": 5.6958730158730165e-06, "loss": 0.006, "step": 8530 }, { "epoch": 24.008, "grad_norm": 0.08907628059387207, "learning_rate": 5.68952380952381e-06, "loss": 0.7083, "step": 8540 }, { "epoch": 24.00857142857143, "grad_norm": 0.0546133778989315, "learning_rate": 5.6831746031746035e-06, "loss": 0.5869, "step": 8550 }, { "epoch": 24.00914285714286, "grad_norm": 0.08426523208618164, "learning_rate": 5.676825396825397e-06, "loss": 0.2361, "step": 8560 }, { "epoch": 24.009714285714285, "grad_norm": 0.1244712769985199, "learning_rate": 5.6704761904761905e-06, "loss": 0.0052, "step": 8570 }, { "epoch": 24.010285714285715, "grad_norm": 0.15895043313503265, "learning_rate": 5.664126984126984e-06, "loss": 0.4814, "step": 8580 }, { "epoch": 24.010857142857144, "grad_norm": 0.24632836878299713, "learning_rate": 5.657777777777778e-06, "loss": 0.0043, "step": 8590 }, { "epoch": 24.01142857142857, "grad_norm": 245.2207794189453, "learning_rate": 5.651428571428572e-06, "loss": 0.7302, "step": 8600 }, { "epoch": 24.012, "grad_norm": 0.1819450557231903, "learning_rate": 5.645079365079365e-06, "loss": 0.0047, "step": 8610 }, { "epoch": 24.01257142857143, "grad_norm": 0.07300709933042526, "learning_rate": 5.638730158730159e-06, "loss": 0.6173, "step": 8620 }, { "epoch": 24.013142857142856, "grad_norm": 0.014348522759974003, "learning_rate": 5.632380952380952e-06, "loss": 0.3765, "step": 8630 }, { "epoch": 24.013714285714286, "grad_norm": 76.59358978271484, "learning_rate": 5.626031746031746e-06, "loss": 0.7209, "step": 8640 }, { "epoch": 24.014285714285716, "grad_norm": 0.1954093724489212, "learning_rate": 5.619682539682541e-06, "loss": 0.2304, "step": 8650 }, { "epoch": 24.014857142857142, "grad_norm": 13.528507232666016, "learning_rate": 5.613333333333334e-06, "loss": 0.6376, "step": 8660 }, { "epoch": 24.015428571428572, "grad_norm": 12.869405746459961, "learning_rate": 5.606984126984127e-06, "loss": 0.5984, "step": 8670 }, { "epoch": 24.016, "grad_norm": 0.1572108119726181, "learning_rate": 5.6006349206349205e-06, "loss": 0.0176, "step": 8680 }, { "epoch": 24.016571428571428, "grad_norm": 0.20609256625175476, "learning_rate": 5.594285714285714e-06, "loss": 0.4308, "step": 8690 }, { "epoch": 24.017142857142858, "grad_norm": 0.07268011569976807, "learning_rate": 5.587936507936509e-06, "loss": 0.4066, "step": 8700 }, { "epoch": 24.017714285714284, "grad_norm": 13.543063163757324, "learning_rate": 5.581587301587303e-06, "loss": 0.7049, "step": 8710 }, { "epoch": 24.018285714285714, "grad_norm": 19.77041244506836, "learning_rate": 5.575238095238096e-06, "loss": 0.4488, "step": 8720 }, { "epoch": 24.018857142857144, "grad_norm": 0.14645476639270782, "learning_rate": 5.56888888888889e-06, "loss": 0.7006, "step": 8730 }, { "epoch": 24.01942857142857, "grad_norm": 0.013115695677697659, "learning_rate": 5.562539682539683e-06, "loss": 0.0082, "step": 8740 }, { "epoch": 24.02, "grad_norm": 0.41332682967185974, "learning_rate": 5.556190476190476e-06, "loss": 0.8514, "step": 8750 }, { "epoch": 24.02, "eval_accuracy": 0.8417508417508418, "eval_loss": 0.7481423616409302, "eval_runtime": 137.4085, "eval_samples_per_second": 2.161, "eval_steps_per_second": 1.084, "step": 8750 }, { "epoch": 25.00057142857143, "grad_norm": 0.371246874332428, "learning_rate": 5.549841269841271e-06, "loss": 0.6345, "step": 8760 }, { "epoch": 25.001142857142856, "grad_norm": 0.7499510645866394, "learning_rate": 5.543492063492064e-06, "loss": 0.1388, "step": 8770 }, { "epoch": 25.001714285714286, "grad_norm": 0.013739760965108871, "learning_rate": 5.537142857142858e-06, "loss": 0.5583, "step": 8780 }, { "epoch": 25.002285714285716, "grad_norm": 190.9663543701172, "learning_rate": 5.530793650793651e-06, "loss": 0.7159, "step": 8790 }, { "epoch": 25.002857142857142, "grad_norm": 256.3753356933594, "learning_rate": 5.524444444444445e-06, "loss": 0.0527, "step": 8800 }, { "epoch": 25.00342857142857, "grad_norm": 7.645998001098633, "learning_rate": 5.518095238095239e-06, "loss": 0.5753, "step": 8810 }, { "epoch": 25.004, "grad_norm": 0.10563361644744873, "learning_rate": 5.511746031746033e-06, "loss": 0.632, "step": 8820 }, { "epoch": 25.004571428571428, "grad_norm": 27.83478546142578, "learning_rate": 5.505396825396826e-06, "loss": 0.3274, "step": 8830 }, { "epoch": 25.005142857142857, "grad_norm": 0.09404099732637405, "learning_rate": 5.49904761904762e-06, "loss": 0.0059, "step": 8840 }, { "epoch": 25.005714285714287, "grad_norm": 0.031015774235129356, "learning_rate": 5.492698412698413e-06, "loss": 0.0043, "step": 8850 }, { "epoch": 25.006285714285713, "grad_norm": 0.20480813086032867, "learning_rate": 5.4863492063492066e-06, "loss": 0.5294, "step": 8860 }, { "epoch": 25.006857142857143, "grad_norm": 125.27932739257812, "learning_rate": 5.480000000000001e-06, "loss": 1.0979, "step": 8870 }, { "epoch": 25.007428571428573, "grad_norm": 0.12436144798994064, "learning_rate": 5.473650793650794e-06, "loss": 0.4211, "step": 8880 }, { "epoch": 25.008, "grad_norm": 0.09650959074497223, "learning_rate": 5.467301587301588e-06, "loss": 0.2128, "step": 8890 }, { "epoch": 25.00857142857143, "grad_norm": 0.09640531986951828, "learning_rate": 5.460952380952381e-06, "loss": 0.0061, "step": 8900 }, { "epoch": 25.00914285714286, "grad_norm": 0.5898492932319641, "learning_rate": 5.454603174603175e-06, "loss": 0.3704, "step": 8910 }, { "epoch": 25.009714285714285, "grad_norm": 0.0638417899608612, "learning_rate": 5.448253968253969e-06, "loss": 0.2117, "step": 8920 }, { "epoch": 25.010285714285715, "grad_norm": 0.08020645380020142, "learning_rate": 5.441904761904763e-06, "loss": 0.4894, "step": 8930 }, { "epoch": 25.010857142857144, "grad_norm": 0.15978746116161346, "learning_rate": 5.435555555555556e-06, "loss": 0.3888, "step": 8940 }, { "epoch": 25.01142857142857, "grad_norm": 0.06692436337471008, "learning_rate": 5.42920634920635e-06, "loss": 0.2216, "step": 8950 }, { "epoch": 25.012, "grad_norm": 0.41444236040115356, "learning_rate": 5.422857142857143e-06, "loss": 0.7492, "step": 8960 }, { "epoch": 25.01257142857143, "grad_norm": 0.08462107926607132, "learning_rate": 5.4165079365079366e-06, "loss": 0.9321, "step": 8970 }, { "epoch": 25.013142857142856, "grad_norm": 0.08357837796211243, "learning_rate": 5.410158730158731e-06, "loss": 0.0149, "step": 8980 }, { "epoch": 25.013714285714286, "grad_norm": 20.435392379760742, "learning_rate": 5.403809523809524e-06, "loss": 0.6908, "step": 8990 }, { "epoch": 25.014285714285716, "grad_norm": 0.18590255081653595, "learning_rate": 5.397460317460318e-06, "loss": 0.2268, "step": 9000 }, { "epoch": 25.014857142857142, "grad_norm": 0.013719220645725727, "learning_rate": 5.391111111111111e-06, "loss": 0.198, "step": 9010 }, { "epoch": 25.015428571428572, "grad_norm": 0.004588362295180559, "learning_rate": 5.384761904761905e-06, "loss": 0.0098, "step": 9020 }, { "epoch": 25.016, "grad_norm": 0.07363064587116241, "learning_rate": 5.378412698412698e-06, "loss": 0.3217, "step": 9030 }, { "epoch": 25.016571428571428, "grad_norm": 0.056400805711746216, "learning_rate": 5.372063492063493e-06, "loss": 0.2697, "step": 9040 }, { "epoch": 25.017142857142858, "grad_norm": 84.72920989990234, "learning_rate": 5.365714285714286e-06, "loss": 0.0159, "step": 9050 }, { "epoch": 25.017714285714284, "grad_norm": 17.841381072998047, "learning_rate": 5.35936507936508e-06, "loss": 0.3147, "step": 9060 }, { "epoch": 25.018285714285714, "grad_norm": 17.061647415161133, "learning_rate": 5.353015873015873e-06, "loss": 0.5654, "step": 9070 }, { "epoch": 25.018857142857144, "grad_norm": 0.8570396304130554, "learning_rate": 5.346666666666667e-06, "loss": 0.5723, "step": 9080 }, { "epoch": 25.01942857142857, "grad_norm": 58.06008529663086, "learning_rate": 5.340317460317461e-06, "loss": 1.2434, "step": 9090 }, { "epoch": 25.02, "grad_norm": 207.67483520507812, "learning_rate": 5.333968253968254e-06, "loss": 0.8148, "step": 9100 }, { "epoch": 25.02, "eval_accuracy": 0.8383838383838383, "eval_loss": 0.7210359573364258, "eval_runtime": 137.5062, "eval_samples_per_second": 2.16, "eval_steps_per_second": 1.084, "step": 9100 }, { "epoch": 26.00057142857143, "grad_norm": 0.09444784373044968, "learning_rate": 5.327619047619048e-06, "loss": 0.3524, "step": 9110 }, { "epoch": 26.001142857142856, "grad_norm": 0.26179543137550354, "learning_rate": 5.321269841269841e-06, "loss": 0.5438, "step": 9120 }, { "epoch": 26.001714285714286, "grad_norm": 2.8941266536712646, "learning_rate": 5.314920634920635e-06, "loss": 0.4125, "step": 9130 }, { "epoch": 26.002285714285716, "grad_norm": 0.1553201973438263, "learning_rate": 5.308571428571428e-06, "loss": 0.0053, "step": 9140 }, { "epoch": 26.002857142857142, "grad_norm": 0.08845008164644241, "learning_rate": 5.302222222222223e-06, "loss": 0.5051, "step": 9150 }, { "epoch": 26.00342857142857, "grad_norm": 0.11578679084777832, "learning_rate": 5.295873015873016e-06, "loss": 0.2748, "step": 9160 }, { "epoch": 26.004, "grad_norm": 0.14951874315738678, "learning_rate": 5.28952380952381e-06, "loss": 0.1894, "step": 9170 }, { "epoch": 26.004571428571428, "grad_norm": 0.0727708712220192, "learning_rate": 5.283174603174603e-06, "loss": 0.6826, "step": 9180 }, { "epoch": 26.005142857142857, "grad_norm": 0.05796307697892189, "learning_rate": 5.276825396825397e-06, "loss": 0.5291, "step": 9190 }, { "epoch": 26.005714285714287, "grad_norm": 0.10209472477436066, "learning_rate": 5.270476190476192e-06, "loss": 0.0044, "step": 9200 }, { "epoch": 26.006285714285713, "grad_norm": 0.008814401924610138, "learning_rate": 5.264126984126985e-06, "loss": 0.5146, "step": 9210 }, { "epoch": 26.006857142857143, "grad_norm": 0.3405003249645233, "learning_rate": 5.257777777777779e-06, "loss": 0.3465, "step": 9220 }, { "epoch": 26.007428571428573, "grad_norm": 0.09109217673540115, "learning_rate": 5.251428571428571e-06, "loss": 0.1963, "step": 9230 }, { "epoch": 26.008, "grad_norm": 0.05053064227104187, "learning_rate": 5.245079365079365e-06, "loss": 0.197, "step": 9240 }, { "epoch": 26.00857142857143, "grad_norm": 0.01660529151558876, "learning_rate": 5.238730158730158e-06, "loss": 0.4736, "step": 9250 }, { "epoch": 26.00914285714286, "grad_norm": 0.015270842239260674, "learning_rate": 5.2323809523809535e-06, "loss": 0.4788, "step": 9260 }, { "epoch": 26.009714285714285, "grad_norm": 0.09976387768983841, "learning_rate": 5.226031746031747e-06, "loss": 0.002, "step": 9270 }, { "epoch": 26.010285714285715, "grad_norm": 0.19991345703601837, "learning_rate": 5.2196825396825405e-06, "loss": 0.0036, "step": 9280 }, { "epoch": 26.010857142857144, "grad_norm": 0.12031792104244232, "learning_rate": 5.213333333333334e-06, "loss": 0.6917, "step": 9290 }, { "epoch": 26.01142857142857, "grad_norm": 0.051638491451740265, "learning_rate": 5.2069841269841274e-06, "loss": 0.7381, "step": 9300 }, { "epoch": 26.012, "grad_norm": 91.03575897216797, "learning_rate": 5.200634920634922e-06, "loss": 0.2566, "step": 9310 }, { "epoch": 26.01257142857143, "grad_norm": 0.1020565778017044, "learning_rate": 5.194285714285715e-06, "loss": 0.2807, "step": 9320 }, { "epoch": 26.013142857142856, "grad_norm": 0.1554216891527176, "learning_rate": 5.187936507936509e-06, "loss": 0.2259, "step": 9330 }, { "epoch": 26.013714285714286, "grad_norm": 0.09728459268808365, "learning_rate": 5.181587301587302e-06, "loss": 0.4738, "step": 9340 }, { "epoch": 26.014285714285716, "grad_norm": 0.7819724082946777, "learning_rate": 5.175238095238096e-06, "loss": 0.2462, "step": 9350 }, { "epoch": 26.014857142857142, "grad_norm": 0.19601747393608093, "learning_rate": 5.168888888888889e-06, "loss": 0.3972, "step": 9360 }, { "epoch": 26.015428571428572, "grad_norm": 0.21019776165485382, "learning_rate": 5.1625396825396835e-06, "loss": 0.5798, "step": 9370 }, { "epoch": 26.016, "grad_norm": 0.13152237236499786, "learning_rate": 5.156190476190477e-06, "loss": 0.0422, "step": 9380 }, { "epoch": 26.016571428571428, "grad_norm": 42.05175018310547, "learning_rate": 5.1498412698412705e-06, "loss": 0.2365, "step": 9390 }, { "epoch": 26.017142857142858, "grad_norm": 0.06394017487764359, "learning_rate": 5.143492063492064e-06, "loss": 0.2123, "step": 9400 }, { "epoch": 26.017714285714284, "grad_norm": 0.1293639838695526, "learning_rate": 5.1371428571428574e-06, "loss": 0.2474, "step": 9410 }, { "epoch": 26.018285714285714, "grad_norm": 0.46589070558547974, "learning_rate": 5.130793650793651e-06, "loss": 0.5852, "step": 9420 }, { "epoch": 26.018857142857144, "grad_norm": 0.04075726494193077, "learning_rate": 5.124444444444445e-06, "loss": 0.4732, "step": 9430 }, { "epoch": 26.01942857142857, "grad_norm": 0.08269976824522018, "learning_rate": 5.118095238095239e-06, "loss": 0.1798, "step": 9440 }, { "epoch": 26.02, "grad_norm": 0.038522034883499146, "learning_rate": 5.111746031746032e-06, "loss": 0.2594, "step": 9450 }, { "epoch": 26.02, "eval_accuracy": 0.8249158249158249, "eval_loss": 0.9979982972145081, "eval_runtime": 126.233, "eval_samples_per_second": 2.353, "eval_steps_per_second": 1.18, "step": 9450 }, { "epoch": 27.00057142857143, "grad_norm": 0.03514016792178154, "learning_rate": 5.105396825396826e-06, "loss": 0.6331, "step": 9460 }, { "epoch": 27.001142857142856, "grad_norm": 0.039175134152173996, "learning_rate": 5.099047619047619e-06, "loss": 0.0031, "step": 9470 }, { "epoch": 27.001714285714286, "grad_norm": 0.017468813806772232, "learning_rate": 5.0926984126984135e-06, "loss": 0.0028, "step": 9480 }, { "epoch": 27.002285714285716, "grad_norm": 3.9538137912750244, "learning_rate": 5.086349206349207e-06, "loss": 0.0074, "step": 9490 }, { "epoch": 27.002857142857142, "grad_norm": 0.040926579385995865, "learning_rate": 5.0800000000000005e-06, "loss": 0.7674, "step": 9500 }, { "epoch": 27.00342857142857, "grad_norm": 0.15430638194084167, "learning_rate": 5.073650793650794e-06, "loss": 0.0668, "step": 9510 }, { "epoch": 27.004, "grad_norm": 36.84708786010742, "learning_rate": 5.0673015873015875e-06, "loss": 0.4732, "step": 9520 }, { "epoch": 27.004571428571428, "grad_norm": 0.04013565182685852, "learning_rate": 5.060952380952381e-06, "loss": 0.0228, "step": 9530 }, { "epoch": 27.005142857142857, "grad_norm": 0.029689166694879532, "learning_rate": 5.054603174603175e-06, "loss": 0.2372, "step": 9540 }, { "epoch": 27.005714285714287, "grad_norm": 0.0034933576826006174, "learning_rate": 5.048253968253969e-06, "loss": 0.2009, "step": 9550 }, { "epoch": 27.006285714285713, "grad_norm": 22.289688110351562, "learning_rate": 5.041904761904762e-06, "loss": 0.3468, "step": 9560 }, { "epoch": 27.006857142857143, "grad_norm": 0.05222174897789955, "learning_rate": 5.035555555555556e-06, "loss": 0.266, "step": 9570 }, { "epoch": 27.007428571428573, "grad_norm": 0.08454541116952896, "learning_rate": 5.029206349206349e-06, "loss": 0.543, "step": 9580 }, { "epoch": 27.008, "grad_norm": 15.091740608215332, "learning_rate": 5.0228571428571435e-06, "loss": 0.2849, "step": 9590 }, { "epoch": 27.00857142857143, "grad_norm": 108.28709411621094, "learning_rate": 5.016507936507937e-06, "loss": 0.5846, "step": 9600 }, { "epoch": 27.00914285714286, "grad_norm": 37.8171501159668, "learning_rate": 5.0101587301587305e-06, "loss": 0.3388, "step": 9610 }, { "epoch": 27.009714285714285, "grad_norm": 0.05814701318740845, "learning_rate": 5.003809523809524e-06, "loss": 0.1932, "step": 9620 }, { "epoch": 27.010285714285715, "grad_norm": 0.018001697957515717, "learning_rate": 4.997460317460318e-06, "loss": 0.1371, "step": 9630 }, { "epoch": 27.010857142857144, "grad_norm": 0.20318511128425598, "learning_rate": 4.991111111111112e-06, "loss": 0.3923, "step": 9640 }, { "epoch": 27.01142857142857, "grad_norm": 0.07720087468624115, "learning_rate": 4.984761904761905e-06, "loss": 0.0501, "step": 9650 }, { "epoch": 27.012, "grad_norm": 19.00177764892578, "learning_rate": 4.978412698412699e-06, "loss": 0.542, "step": 9660 }, { "epoch": 27.01257142857143, "grad_norm": 0.011382623575627804, "learning_rate": 4.972063492063492e-06, "loss": 0.2808, "step": 9670 }, { "epoch": 27.013142857142856, "grad_norm": 0.14707712829113007, "learning_rate": 4.965714285714286e-06, "loss": 0.3066, "step": 9680 }, { "epoch": 27.013714285714286, "grad_norm": 0.0880795419216156, "learning_rate": 4.95936507936508e-06, "loss": 0.3369, "step": 9690 }, { "epoch": 27.014285714285716, "grad_norm": 91.44270324707031, "learning_rate": 4.9530158730158735e-06, "loss": 0.1431, "step": 9700 }, { "epoch": 27.014857142857142, "grad_norm": 0.03979627415537834, "learning_rate": 4.946666666666667e-06, "loss": 0.2022, "step": 9710 }, { "epoch": 27.015428571428572, "grad_norm": 0.06051446869969368, "learning_rate": 4.9403174603174605e-06, "loss": 1.1248, "step": 9720 }, { "epoch": 27.016, "grad_norm": 0.011192934587597847, "learning_rate": 4.933968253968254e-06, "loss": 0.5604, "step": 9730 }, { "epoch": 27.016571428571428, "grad_norm": 0.013154531829059124, "learning_rate": 4.9276190476190475e-06, "loss": 0.1641, "step": 9740 }, { "epoch": 27.017142857142858, "grad_norm": 0.16332918405532837, "learning_rate": 4.921269841269842e-06, "loss": 0.2698, "step": 9750 }, { "epoch": 27.017714285714284, "grad_norm": 0.12617675960063934, "learning_rate": 4.914920634920635e-06, "loss": 0.5508, "step": 9760 }, { "epoch": 27.018285714285714, "grad_norm": 0.007707640528678894, "learning_rate": 4.90857142857143e-06, "loss": 0.5326, "step": 9770 }, { "epoch": 27.018857142857144, "grad_norm": 0.10187384486198425, "learning_rate": 4.902222222222222e-06, "loss": 0.0046, "step": 9780 }, { "epoch": 27.01942857142857, "grad_norm": 0.09700454026460648, "learning_rate": 4.895873015873016e-06, "loss": 0.4082, "step": 9790 }, { "epoch": 27.02, "grad_norm": 0.1347123682498932, "learning_rate": 4.88952380952381e-06, "loss": 0.6742, "step": 9800 }, { "epoch": 27.02, "eval_accuracy": 0.8585858585858586, "eval_loss": 0.7987341284751892, "eval_runtime": 126.3026, "eval_samples_per_second": 2.351, "eval_steps_per_second": 1.18, "step": 9800 }, { "epoch": 28.00057142857143, "grad_norm": 0.007627937477082014, "learning_rate": 4.8831746031746035e-06, "loss": 0.8211, "step": 9810 }, { "epoch": 28.001142857142856, "grad_norm": 0.5802029967308044, "learning_rate": 4.876825396825397e-06, "loss": 0.0109, "step": 9820 }, { "epoch": 28.001714285714286, "grad_norm": 0.045871149748563766, "learning_rate": 4.870476190476191e-06, "loss": 0.0037, "step": 9830 }, { "epoch": 28.002285714285716, "grad_norm": 17.630939483642578, "learning_rate": 4.864126984126985e-06, "loss": 1.0221, "step": 9840 }, { "epoch": 28.002857142857142, "grad_norm": 0.20201857388019562, "learning_rate": 4.857777777777778e-06, "loss": 0.3867, "step": 9850 }, { "epoch": 28.00342857142857, "grad_norm": 19.86452293395996, "learning_rate": 4.851428571428572e-06, "loss": 0.0709, "step": 9860 }, { "epoch": 28.004, "grad_norm": 24.085710525512695, "learning_rate": 4.845079365079365e-06, "loss": 0.1767, "step": 9870 }, { "epoch": 28.004571428571428, "grad_norm": 0.37033602595329285, "learning_rate": 4.83873015873016e-06, "loss": 0.3709, "step": 9880 }, { "epoch": 28.005142857142857, "grad_norm": 0.0676516443490982, "learning_rate": 4.832380952380953e-06, "loss": 0.6114, "step": 9890 }, { "epoch": 28.005714285714287, "grad_norm": 0.13778136670589447, "learning_rate": 4.826031746031747e-06, "loss": 0.1078, "step": 9900 }, { "epoch": 28.006285714285713, "grad_norm": 292.9512939453125, "learning_rate": 4.81968253968254e-06, "loss": 0.6608, "step": 9910 }, { "epoch": 28.006857142857143, "grad_norm": 0.08069964498281479, "learning_rate": 4.8133333333333336e-06, "loss": 0.6372, "step": 9920 }, { "epoch": 28.007428571428573, "grad_norm": 16.857074737548828, "learning_rate": 4.806984126984127e-06, "loss": 0.3145, "step": 9930 }, { "epoch": 28.008, "grad_norm": 0.1705380231142044, "learning_rate": 4.800634920634921e-06, "loss": 0.5308, "step": 9940 }, { "epoch": 28.00857142857143, "grad_norm": 0.19721141457557678, "learning_rate": 4.794285714285715e-06, "loss": 0.2503, "step": 9950 }, { "epoch": 28.00914285714286, "grad_norm": 0.018095174804329872, "learning_rate": 4.787936507936508e-06, "loss": 0.0216, "step": 9960 }, { "epoch": 28.009714285714285, "grad_norm": 0.058308668434619904, "learning_rate": 4.781587301587302e-06, "loss": 0.4907, "step": 9970 }, { "epoch": 28.010285714285715, "grad_norm": 0.18544158339500427, "learning_rate": 4.775238095238095e-06, "loss": 0.2361, "step": 9980 }, { "epoch": 28.010857142857144, "grad_norm": 0.18984903395175934, "learning_rate": 4.768888888888889e-06, "loss": 0.1339, "step": 9990 }, { "epoch": 28.01142857142857, "grad_norm": 117.79344177246094, "learning_rate": 4.762539682539683e-06, "loss": 0.496, "step": 10000 }, { "epoch": 28.012, "grad_norm": 0.006381293758749962, "learning_rate": 4.756190476190477e-06, "loss": 0.0039, "step": 10010 }, { "epoch": 28.01257142857143, "grad_norm": 321.21435546875, "learning_rate": 4.74984126984127e-06, "loss": 0.5793, "step": 10020 }, { "epoch": 28.013142857142856, "grad_norm": 0.06357376277446747, "learning_rate": 4.7434920634920636e-06, "loss": 0.0057, "step": 10030 }, { "epoch": 28.013714285714286, "grad_norm": 0.2022400200366974, "learning_rate": 4.737142857142857e-06, "loss": 0.3293, "step": 10040 }, { "epoch": 28.014285714285716, "grad_norm": 0.050252072513103485, "learning_rate": 4.730793650793651e-06, "loss": 0.0027, "step": 10050 }, { "epoch": 28.014857142857142, "grad_norm": 0.07211655378341675, "learning_rate": 4.724444444444445e-06, "loss": 0.0018, "step": 10060 }, { "epoch": 28.015428571428572, "grad_norm": 0.012892846018075943, "learning_rate": 4.718095238095238e-06, "loss": 0.3652, "step": 10070 }, { "epoch": 28.016, "grad_norm": 0.07106824219226837, "learning_rate": 4.711746031746033e-06, "loss": 0.6978, "step": 10080 }, { "epoch": 28.016571428571428, "grad_norm": 131.25204467773438, "learning_rate": 4.705396825396826e-06, "loss": 0.0722, "step": 10090 }, { "epoch": 28.017142857142858, "grad_norm": 0.002385256579145789, "learning_rate": 4.699047619047619e-06, "loss": 0.0026, "step": 10100 }, { "epoch": 28.017714285714284, "grad_norm": 0.15431569516658783, "learning_rate": 4.692698412698413e-06, "loss": 0.3684, "step": 10110 }, { "epoch": 28.018285714285714, "grad_norm": 0.08459474891424179, "learning_rate": 4.686349206349207e-06, "loss": 0.4475, "step": 10120 }, { "epoch": 28.018857142857144, "grad_norm": 0.012470588088035583, "learning_rate": 4.680000000000001e-06, "loss": 0.0172, "step": 10130 }, { "epoch": 28.01942857142857, "grad_norm": 19.614744186401367, "learning_rate": 4.673650793650794e-06, "loss": 0.6554, "step": 10140 }, { "epoch": 28.02, "grad_norm": 0.002492617815732956, "learning_rate": 4.667301587301588e-06, "loss": 0.0063, "step": 10150 }, { "epoch": 28.02, "eval_accuracy": 0.8316498316498316, "eval_loss": 0.9369211196899414, "eval_runtime": 126.2783, "eval_samples_per_second": 2.352, "eval_steps_per_second": 1.18, "step": 10150 }, { "epoch": 29.00057142857143, "grad_norm": 0.11610179394483566, "learning_rate": 4.660952380952381e-06, "loss": 0.0046, "step": 10160 }, { "epoch": 29.001142857142856, "grad_norm": 13.605859756469727, "learning_rate": 4.654603174603175e-06, "loss": 0.5286, "step": 10170 }, { "epoch": 29.001714285714286, "grad_norm": 0.03343842178583145, "learning_rate": 4.648253968253968e-06, "loss": 0.5034, "step": 10180 }, { "epoch": 29.002285714285716, "grad_norm": 0.25074559450149536, "learning_rate": 4.641904761904763e-06, "loss": 0.1946, "step": 10190 }, { "epoch": 29.002857142857142, "grad_norm": 0.0782301276922226, "learning_rate": 4.635555555555556e-06, "loss": 0.0034, "step": 10200 }, { "epoch": 29.00342857142857, "grad_norm": 0.037485599517822266, "learning_rate": 4.62920634920635e-06, "loss": 0.183, "step": 10210 }, { "epoch": 29.004, "grad_norm": 0.03207986056804657, "learning_rate": 4.622857142857143e-06, "loss": 0.7524, "step": 10220 }, { "epoch": 29.004571428571428, "grad_norm": 0.07413322478532791, "learning_rate": 4.616507936507937e-06, "loss": 0.2849, "step": 10230 }, { "epoch": 29.005142857142857, "grad_norm": 19.703533172607422, "learning_rate": 4.61015873015873e-06, "loss": 1.0924, "step": 10240 }, { "epoch": 29.005714285714287, "grad_norm": 13.900262832641602, "learning_rate": 4.603809523809524e-06, "loss": 0.6464, "step": 10250 }, { "epoch": 29.006285714285713, "grad_norm": 0.2331477850675583, "learning_rate": 4.597460317460318e-06, "loss": 0.0042, "step": 10260 }, { "epoch": 29.006857142857143, "grad_norm": 0.0446133092045784, "learning_rate": 4.591111111111111e-06, "loss": 0.0039, "step": 10270 }, { "epoch": 29.007428571428573, "grad_norm": 0.03788290172815323, "learning_rate": 4.584761904761905e-06, "loss": 0.4045, "step": 10280 }, { "epoch": 29.008, "grad_norm": 0.11208215355873108, "learning_rate": 4.578412698412698e-06, "loss": 0.2883, "step": 10290 }, { "epoch": 29.00857142857143, "grad_norm": 0.05422298610210419, "learning_rate": 4.572063492063493e-06, "loss": 0.0113, "step": 10300 }, { "epoch": 29.00914285714286, "grad_norm": 0.003405461786314845, "learning_rate": 4.565714285714286e-06, "loss": 0.0028, "step": 10310 }, { "epoch": 29.009714285714285, "grad_norm": 0.04415423423051834, "learning_rate": 4.55936507936508e-06, "loss": 0.2427, "step": 10320 }, { "epoch": 29.010285714285715, "grad_norm": 15.168233871459961, "learning_rate": 4.553015873015873e-06, "loss": 0.7346, "step": 10330 }, { "epoch": 29.010857142857144, "grad_norm": 0.021887382492423058, "learning_rate": 4.546666666666667e-06, "loss": 0.2257, "step": 10340 }, { "epoch": 29.01142857142857, "grad_norm": 0.03137199580669403, "learning_rate": 4.54031746031746e-06, "loss": 0.249, "step": 10350 }, { "epoch": 29.012, "grad_norm": 0.06243215128779411, "learning_rate": 4.5339682539682544e-06, "loss": 0.0026, "step": 10360 }, { "epoch": 29.01257142857143, "grad_norm": 0.0038311234675347805, "learning_rate": 4.527619047619048e-06, "loss": 0.1538, "step": 10370 }, { "epoch": 29.013142857142856, "grad_norm": 0.01766922138631344, "learning_rate": 4.521269841269841e-06, "loss": 0.273, "step": 10380 }, { "epoch": 29.013714285714286, "grad_norm": 0.0019730119965970516, "learning_rate": 4.514920634920636e-06, "loss": 0.0014, "step": 10390 }, { "epoch": 29.014285714285716, "grad_norm": 0.6263951659202576, "learning_rate": 4.508571428571429e-06, "loss": 0.2918, "step": 10400 }, { "epoch": 29.014857142857142, "grad_norm": 49.63083267211914, "learning_rate": 4.502222222222223e-06, "loss": 0.3073, "step": 10410 }, { "epoch": 29.015428571428572, "grad_norm": 37.1717643737793, "learning_rate": 4.495873015873016e-06, "loss": 0.3176, "step": 10420 }, { "epoch": 29.016, "grad_norm": 0.03757132217288017, "learning_rate": 4.48952380952381e-06, "loss": 0.1175, "step": 10430 }, { "epoch": 29.016571428571428, "grad_norm": 0.095696821808815, "learning_rate": 4.483174603174604e-06, "loss": 0.5835, "step": 10440 }, { "epoch": 29.017142857142858, "grad_norm": 0.12873250246047974, "learning_rate": 4.4768253968253975e-06, "loss": 0.1662, "step": 10450 }, { "epoch": 29.017714285714284, "grad_norm": 0.03826398402452469, "learning_rate": 4.470476190476191e-06, "loss": 0.0065, "step": 10460 }, { "epoch": 29.018285714285714, "grad_norm": 0.027511196210980415, "learning_rate": 4.4641269841269844e-06, "loss": 0.669, "step": 10470 }, { "epoch": 29.018857142857144, "grad_norm": 0.01353570818901062, "learning_rate": 4.457777777777778e-06, "loss": 0.0017, "step": 10480 }, { "epoch": 29.01942857142857, "grad_norm": 0.007252034731209278, "learning_rate": 4.451428571428571e-06, "loss": 0.3284, "step": 10490 }, { "epoch": 29.02, "grad_norm": 0.14840850234031677, "learning_rate": 4.445079365079366e-06, "loss": 0.5186, "step": 10500 }, { "epoch": 29.02, "eval_accuracy": 0.8148148148148148, "eval_loss": 1.0870999097824097, "eval_runtime": 126.3664, "eval_samples_per_second": 2.35, "eval_steps_per_second": 1.179, "step": 10500 }, { "epoch": 30.00057142857143, "grad_norm": 0.020524989813566208, "learning_rate": 4.438730158730159e-06, "loss": 0.0009, "step": 10510 }, { "epoch": 30.001142857142856, "grad_norm": 0.27530866861343384, "learning_rate": 4.432380952380953e-06, "loss": 1.0032, "step": 10520 }, { "epoch": 30.001714285714286, "grad_norm": 0.06338400393724442, "learning_rate": 4.426031746031746e-06, "loss": 0.0018, "step": 10530 }, { "epoch": 30.002285714285716, "grad_norm": 0.049876321107149124, "learning_rate": 4.41968253968254e-06, "loss": 0.0032, "step": 10540 }, { "epoch": 30.002857142857142, "grad_norm": 0.21788761019706726, "learning_rate": 4.413333333333334e-06, "loss": 0.0011, "step": 10550 }, { "epoch": 30.00342857142857, "grad_norm": 0.006480003707110882, "learning_rate": 4.4069841269841275e-06, "loss": 0.4141, "step": 10560 }, { "epoch": 30.004, "grad_norm": 0.011641621589660645, "learning_rate": 4.400634920634921e-06, "loss": 0.2498, "step": 10570 }, { "epoch": 30.004571428571428, "grad_norm": 0.06504371762275696, "learning_rate": 4.3942857142857144e-06, "loss": 0.5437, "step": 10580 }, { "epoch": 30.005142857142857, "grad_norm": 284.99957275390625, "learning_rate": 4.387936507936508e-06, "loss": 0.734, "step": 10590 }, { "epoch": 30.005714285714287, "grad_norm": 81.92947387695312, "learning_rate": 4.381587301587301e-06, "loss": 0.6903, "step": 10600 }, { "epoch": 30.006285714285713, "grad_norm": 0.0021508638747036457, "learning_rate": 4.375238095238096e-06, "loss": 0.0015, "step": 10610 }, { "epoch": 30.006857142857143, "grad_norm": 0.008287636563181877, "learning_rate": 4.368888888888889e-06, "loss": 0.0204, "step": 10620 }, { "epoch": 30.007428571428573, "grad_norm": 0.06526491791009903, "learning_rate": 4.362539682539683e-06, "loss": 0.2437, "step": 10630 }, { "epoch": 30.008, "grad_norm": 0.09128577262163162, "learning_rate": 4.356190476190477e-06, "loss": 0.1999, "step": 10640 }, { "epoch": 30.00857142857143, "grad_norm": 0.13880805671215057, "learning_rate": 4.34984126984127e-06, "loss": 0.2869, "step": 10650 }, { "epoch": 30.00914285714286, "grad_norm": 61.90774154663086, "learning_rate": 4.343492063492064e-06, "loss": 0.2872, "step": 10660 }, { "epoch": 30.009714285714285, "grad_norm": 0.1190374344587326, "learning_rate": 4.3371428571428575e-06, "loss": 1.0817, "step": 10670 }, { "epoch": 30.010285714285715, "grad_norm": 0.0040014442056417465, "learning_rate": 4.330793650793651e-06, "loss": 0.8608, "step": 10680 }, { "epoch": 30.010857142857144, "grad_norm": 0.08265765011310577, "learning_rate": 4.324444444444445e-06, "loss": 0.0143, "step": 10690 }, { "epoch": 30.01142857142857, "grad_norm": 0.08359917998313904, "learning_rate": 4.318095238095239e-06, "loss": 0.0016, "step": 10700 }, { "epoch": 30.012, "grad_norm": 0.03306671977043152, "learning_rate": 4.311746031746032e-06, "loss": 0.0031, "step": 10710 }, { "epoch": 30.01257142857143, "grad_norm": 45.719505310058594, "learning_rate": 4.305396825396826e-06, "loss": 0.3334, "step": 10720 }, { "epoch": 30.013142857142856, "grad_norm": 0.0030138203874230385, "learning_rate": 4.299047619047619e-06, "loss": 0.2682, "step": 10730 }, { "epoch": 30.013714285714286, "grad_norm": 0.015341831371188164, "learning_rate": 4.292698412698413e-06, "loss": 0.6864, "step": 10740 }, { "epoch": 30.014285714285716, "grad_norm": 1.9505524635314941, "learning_rate": 4.286349206349207e-06, "loss": 0.3386, "step": 10750 }, { "epoch": 30.014857142857142, "grad_norm": 15.920595169067383, "learning_rate": 4.2800000000000005e-06, "loss": 0.0109, "step": 10760 }, { "epoch": 30.015428571428572, "grad_norm": 0.03716844692826271, "learning_rate": 4.273650793650794e-06, "loss": 0.0024, "step": 10770 }, { "epoch": 30.016, "grad_norm": 0.012245182879269123, "learning_rate": 4.2673015873015875e-06, "loss": 0.0497, "step": 10780 }, { "epoch": 30.016571428571428, "grad_norm": 0.020435450598597527, "learning_rate": 4.260952380952381e-06, "loss": 0.3038, "step": 10790 }, { "epoch": 30.017142857142858, "grad_norm": 0.7739905714988708, "learning_rate": 4.254603174603175e-06, "loss": 0.0028, "step": 10800 }, { "epoch": 30.017714285714284, "grad_norm": 116.42400360107422, "learning_rate": 4.248253968253969e-06, "loss": 0.5479, "step": 10810 }, { "epoch": 30.018285714285714, "grad_norm": 0.060728929936885834, "learning_rate": 4.241904761904762e-06, "loss": 0.1333, "step": 10820 }, { "epoch": 30.018857142857144, "grad_norm": 2.810554265975952, "learning_rate": 4.235555555555556e-06, "loss": 1.2669, "step": 10830 }, { "epoch": 30.01942857142857, "grad_norm": 0.007885237224400043, "learning_rate": 4.229206349206349e-06, "loss": 0.8325, "step": 10840 }, { "epoch": 30.02, "grad_norm": 0.06585021317005157, "learning_rate": 4.222857142857143e-06, "loss": 0.3076, "step": 10850 }, { "epoch": 30.02, "eval_accuracy": 0.835016835016835, "eval_loss": 0.8930524587631226, "eval_runtime": 126.4231, "eval_samples_per_second": 2.349, "eval_steps_per_second": 1.179, "step": 10850 }, { "epoch": 31.00057142857143, "grad_norm": 0.008948814123868942, "learning_rate": 4.216507936507937e-06, "loss": 0.263, "step": 10860 }, { "epoch": 31.001142857142856, "grad_norm": 0.22963719069957733, "learning_rate": 4.2101587301587305e-06, "loss": 0.0015, "step": 10870 }, { "epoch": 31.001714285714286, "grad_norm": 0.2196054607629776, "learning_rate": 4.203809523809524e-06, "loss": 0.0021, "step": 10880 }, { "epoch": 31.002285714285716, "grad_norm": 0.03480219841003418, "learning_rate": 4.1974603174603175e-06, "loss": 0.6281, "step": 10890 }, { "epoch": 31.002857142857142, "grad_norm": 0.9133780002593994, "learning_rate": 4.191111111111111e-06, "loss": 0.0018, "step": 10900 }, { "epoch": 31.00342857142857, "grad_norm": 0.1275293380022049, "learning_rate": 4.184761904761905e-06, "loss": 0.4055, "step": 10910 }, { "epoch": 31.004, "grad_norm": 0.3001004755496979, "learning_rate": 4.178412698412699e-06, "loss": 0.2961, "step": 10920 }, { "epoch": 31.004571428571428, "grad_norm": 0.08802656084299088, "learning_rate": 4.172063492063492e-06, "loss": 0.537, "step": 10930 }, { "epoch": 31.005142857142857, "grad_norm": 159.4746551513672, "learning_rate": 4.165714285714287e-06, "loss": 0.3808, "step": 10940 }, { "epoch": 31.005714285714287, "grad_norm": 0.11664550751447678, "learning_rate": 4.15936507936508e-06, "loss": 0.7977, "step": 10950 }, { "epoch": 31.006285714285713, "grad_norm": 0.20757505297660828, "learning_rate": 4.1530158730158736e-06, "loss": 0.7499, "step": 10960 }, { "epoch": 31.006857142857143, "grad_norm": 0.0029254963155835867, "learning_rate": 4.146666666666667e-06, "loss": 0.0042, "step": 10970 }, { "epoch": 31.007428571428573, "grad_norm": 0.3312826454639435, "learning_rate": 4.1403174603174605e-06, "loss": 0.1451, "step": 10980 }, { "epoch": 31.008, "grad_norm": 235.10496520996094, "learning_rate": 4.133968253968254e-06, "loss": 0.2038, "step": 10990 }, { "epoch": 31.00857142857143, "grad_norm": 0.029614897444844246, "learning_rate": 4.127619047619048e-06, "loss": 0.3989, "step": 11000 }, { "epoch": 31.00914285714286, "grad_norm": 0.00274507119320333, "learning_rate": 4.121269841269842e-06, "loss": 0.0016, "step": 11010 }, { "epoch": 31.009714285714285, "grad_norm": 0.0025850103702396154, "learning_rate": 4.114920634920635e-06, "loss": 0.2211, "step": 11020 }, { "epoch": 31.010285714285715, "grad_norm": 0.003770750481635332, "learning_rate": 4.108571428571429e-06, "loss": 0.3602, "step": 11030 }, { "epoch": 31.010857142857144, "grad_norm": 0.08702629059553146, "learning_rate": 4.102222222222222e-06, "loss": 0.2275, "step": 11040 }, { "epoch": 31.01142857142857, "grad_norm": 0.03445100784301758, "learning_rate": 4.095873015873017e-06, "loss": 0.004, "step": 11050 }, { "epoch": 31.012, "grad_norm": 0.07224719971418381, "learning_rate": 4.08952380952381e-06, "loss": 0.1784, "step": 11060 }, { "epoch": 31.01257142857143, "grad_norm": 0.08405738323926926, "learning_rate": 4.083174603174604e-06, "loss": 0.275, "step": 11070 }, { "epoch": 31.013142857142856, "grad_norm": 0.12125347554683685, "learning_rate": 4.076825396825397e-06, "loss": 0.0013, "step": 11080 }, { "epoch": 31.013714285714286, "grad_norm": 1.5644397735595703, "learning_rate": 4.0704761904761905e-06, "loss": 0.2768, "step": 11090 }, { "epoch": 31.014285714285716, "grad_norm": 0.030137941241264343, "learning_rate": 4.064126984126984e-06, "loss": 0.1936, "step": 11100 }, { "epoch": 31.014857142857142, "grad_norm": 0.005889153108000755, "learning_rate": 4.057777777777778e-06, "loss": 0.3787, "step": 11110 }, { "epoch": 31.015428571428572, "grad_norm": 840.96142578125, "learning_rate": 4.051428571428572e-06, "loss": 0.1409, "step": 11120 }, { "epoch": 31.016, "grad_norm": 0.3630973994731903, "learning_rate": 4.045079365079365e-06, "loss": 0.5644, "step": 11130 }, { "epoch": 31.016571428571428, "grad_norm": 0.1463097482919693, "learning_rate": 4.038730158730159e-06, "loss": 0.4875, "step": 11140 }, { "epoch": 31.017142857142858, "grad_norm": 0.19403041899204254, "learning_rate": 4.032380952380952e-06, "loss": 0.0018, "step": 11150 }, { "epoch": 31.017714285714284, "grad_norm": 0.002485697390511632, "learning_rate": 4.026031746031747e-06, "loss": 0.7305, "step": 11160 }, { "epoch": 31.018285714285714, "grad_norm": 0.22555981576442719, "learning_rate": 4.01968253968254e-06, "loss": 0.0121, "step": 11170 }, { "epoch": 31.018857142857144, "grad_norm": 0.02071559987962246, "learning_rate": 4.013333333333334e-06, "loss": 0.5759, "step": 11180 }, { "epoch": 31.01942857142857, "grad_norm": 0.10109854489564896, "learning_rate": 4.006984126984128e-06, "loss": 0.3115, "step": 11190 }, { "epoch": 31.02, "grad_norm": 0.07400314509868622, "learning_rate": 4.000634920634921e-06, "loss": 0.1113, "step": 11200 }, { "epoch": 31.02, "eval_accuracy": 0.8383838383838383, "eval_loss": 1.0014111995697021, "eval_runtime": 126.4347, "eval_samples_per_second": 2.349, "eval_steps_per_second": 1.178, "step": 11200 }, { "epoch": 32.000571428571426, "grad_norm": 0.018319344148039818, "learning_rate": 3.994285714285714e-06, "loss": 1.0033, "step": 11210 }, { "epoch": 32.00114285714286, "grad_norm": 2.0360021591186523, "learning_rate": 3.987936507936508e-06, "loss": 0.1463, "step": 11220 }, { "epoch": 32.001714285714286, "grad_norm": 11.246064186096191, "learning_rate": 3.981587301587302e-06, "loss": 0.2805, "step": 11230 }, { "epoch": 32.00228571428571, "grad_norm": 0.21093471348285675, "learning_rate": 3.975238095238095e-06, "loss": 0.0017, "step": 11240 }, { "epoch": 32.002857142857145, "grad_norm": 26.97494125366211, "learning_rate": 3.96888888888889e-06, "loss": 0.0044, "step": 11250 }, { "epoch": 32.00342857142857, "grad_norm": 0.3112584352493286, "learning_rate": 3.962539682539683e-06, "loss": 0.2287, "step": 11260 }, { "epoch": 32.004, "grad_norm": 9.63143539428711, "learning_rate": 3.956190476190477e-06, "loss": 0.9842, "step": 11270 }, { "epoch": 32.00457142857143, "grad_norm": 0.25314244627952576, "learning_rate": 3.94984126984127e-06, "loss": 0.2338, "step": 11280 }, { "epoch": 32.00514285714286, "grad_norm": 0.11641905456781387, "learning_rate": 3.943492063492064e-06, "loss": 0.0021, "step": 11290 }, { "epoch": 32.005714285714284, "grad_norm": 83.00080871582031, "learning_rate": 3.937142857142858e-06, "loss": 1.038, "step": 11300 }, { "epoch": 32.00628571428572, "grad_norm": 0.11469433456659317, "learning_rate": 3.930793650793651e-06, "loss": 0.0026, "step": 11310 }, { "epoch": 32.00685714285714, "grad_norm": 0.014426827430725098, "learning_rate": 3.924444444444445e-06, "loss": 0.0023, "step": 11320 }, { "epoch": 32.00742857142857, "grad_norm": 0.0037542914506047964, "learning_rate": 3.918095238095238e-06, "loss": 0.4315, "step": 11330 }, { "epoch": 32.008, "grad_norm": 0.004907695110887289, "learning_rate": 3.911746031746032e-06, "loss": 0.2727, "step": 11340 }, { "epoch": 32.00857142857143, "grad_norm": 0.13894321024417877, "learning_rate": 3.905396825396825e-06, "loss": 0.1903, "step": 11350 }, { "epoch": 32.009142857142855, "grad_norm": 0.03604894503951073, "learning_rate": 3.89904761904762e-06, "loss": 1.1061, "step": 11360 }, { "epoch": 32.00971428571429, "grad_norm": 0.0288707222789526, "learning_rate": 3.892698412698413e-06, "loss": 0.0018, "step": 11370 }, { "epoch": 32.010285714285715, "grad_norm": 41.77061080932617, "learning_rate": 3.886349206349207e-06, "loss": 0.2094, "step": 11380 }, { "epoch": 32.01085714285714, "grad_norm": 0.09654809534549713, "learning_rate": 3.88e-06, "loss": 0.0016, "step": 11390 }, { "epoch": 32.011428571428574, "grad_norm": 0.002318542217835784, "learning_rate": 3.873650793650794e-06, "loss": 0.2902, "step": 11400 }, { "epoch": 32.012, "grad_norm": 0.1287265121936798, "learning_rate": 3.867301587301588e-06, "loss": 0.0023, "step": 11410 }, { "epoch": 32.01257142857143, "grad_norm": 0.025955529883503914, "learning_rate": 3.860952380952381e-06, "loss": 0.29, "step": 11420 }, { "epoch": 32.01314285714286, "grad_norm": 0.022403893992304802, "learning_rate": 3.854603174603175e-06, "loss": 0.2088, "step": 11430 }, { "epoch": 32.013714285714286, "grad_norm": 14.325145721435547, "learning_rate": 3.848253968253968e-06, "loss": 0.2963, "step": 11440 }, { "epoch": 32.01428571428571, "grad_norm": 0.15538744628429413, "learning_rate": 3.841904761904762e-06, "loss": 0.4815, "step": 11450 }, { "epoch": 32.014857142857146, "grad_norm": 0.07028108835220337, "learning_rate": 3.835555555555555e-06, "loss": 0.358, "step": 11460 }, { "epoch": 32.01542857142857, "grad_norm": 26.09341812133789, "learning_rate": 3.82920634920635e-06, "loss": 0.2815, "step": 11470 }, { "epoch": 32.016, "grad_norm": 0.17105203866958618, "learning_rate": 3.822857142857143e-06, "loss": 0.5399, "step": 11480 }, { "epoch": 32.01657142857143, "grad_norm": 124.22401428222656, "learning_rate": 3.816507936507937e-06, "loss": 0.3341, "step": 11490 }, { "epoch": 32.01714285714286, "grad_norm": 0.05575098842382431, "learning_rate": 3.8101587301587306e-06, "loss": 0.0021, "step": 11500 }, { "epoch": 32.017714285714284, "grad_norm": 0.025988677516579628, "learning_rate": 3.803809523809524e-06, "loss": 0.2548, "step": 11510 }, { "epoch": 32.01828571428572, "grad_norm": 31.389314651489258, "learning_rate": 3.7974603174603175e-06, "loss": 0.3914, "step": 11520 }, { "epoch": 32.018857142857144, "grad_norm": 0.030341658741235733, "learning_rate": 3.7911111111111114e-06, "loss": 0.0424, "step": 11530 }, { "epoch": 32.01942857142857, "grad_norm": 0.18274207413196564, "learning_rate": 3.784761904761905e-06, "loss": 0.4848, "step": 11540 }, { "epoch": 32.02, "grad_norm": 0.0034605495166033506, "learning_rate": 3.778412698412699e-06, "loss": 0.2201, "step": 11550 }, { "epoch": 32.02, "eval_accuracy": 0.8484848484848485, "eval_loss": 0.8628104329109192, "eval_runtime": 126.7935, "eval_samples_per_second": 2.342, "eval_steps_per_second": 1.175, "step": 11550 }, { "epoch": 33.000571428571426, "grad_norm": 0.009115273132920265, "learning_rate": 3.7720634920634923e-06, "loss": 0.0015, "step": 11560 }, { "epoch": 33.00114285714286, "grad_norm": 0.029986055567860603, "learning_rate": 3.7657142857142858e-06, "loss": 0.1856, "step": 11570 }, { "epoch": 33.001714285714286, "grad_norm": 0.04707956314086914, "learning_rate": 3.7593650793650797e-06, "loss": 0.3591, "step": 11580 }, { "epoch": 33.00228571428571, "grad_norm": 4.939572334289551, "learning_rate": 3.753015873015873e-06, "loss": 0.0061, "step": 11590 }, { "epoch": 33.002857142857145, "grad_norm": 0.019965268671512604, "learning_rate": 3.7466666666666667e-06, "loss": 0.002, "step": 11600 }, { "epoch": 33.00342857142857, "grad_norm": 0.4821774959564209, "learning_rate": 3.740317460317461e-06, "loss": 0.6994, "step": 11610 }, { "epoch": 33.004, "grad_norm": 359.18255615234375, "learning_rate": 3.733968253968254e-06, "loss": 0.2732, "step": 11620 }, { "epoch": 33.00457142857143, "grad_norm": 0.0022106466349214315, "learning_rate": 3.7276190476190475e-06, "loss": 0.0015, "step": 11630 }, { "epoch": 33.00514285714286, "grad_norm": 0.4608334004878998, "learning_rate": 3.721269841269842e-06, "loss": 0.0016, "step": 11640 }, { "epoch": 33.005714285714284, "grad_norm": 0.10803355276584625, "learning_rate": 3.7149206349206353e-06, "loss": 0.0874, "step": 11650 }, { "epoch": 33.00628571428572, "grad_norm": 0.07706461101770401, "learning_rate": 3.7085714285714284e-06, "loss": 0.0007, "step": 11660 }, { "epoch": 33.00685714285714, "grad_norm": 0.006313271354883909, "learning_rate": 3.7022222222222227e-06, "loss": 0.0019, "step": 11670 }, { "epoch": 33.00742857142857, "grad_norm": 0.037479300051927567, "learning_rate": 3.695873015873016e-06, "loss": 0.8099, "step": 11680 }, { "epoch": 33.008, "grad_norm": 0.08825170248746872, "learning_rate": 3.68952380952381e-06, "loss": 0.0036, "step": 11690 }, { "epoch": 33.00857142857143, "grad_norm": 196.14361572265625, "learning_rate": 3.6831746031746036e-06, "loss": 0.75, "step": 11700 }, { "epoch": 33.009142857142855, "grad_norm": 0.5264555215835571, "learning_rate": 3.676825396825397e-06, "loss": 0.4639, "step": 11710 }, { "epoch": 33.00971428571429, "grad_norm": 10.480401039123535, "learning_rate": 3.670476190476191e-06, "loss": 0.947, "step": 11720 }, { "epoch": 33.010285714285715, "grad_norm": 0.4620245695114136, "learning_rate": 3.6641269841269845e-06, "loss": 0.0035, "step": 11730 }, { "epoch": 33.01085714285714, "grad_norm": 0.13946720957756042, "learning_rate": 3.657777777777778e-06, "loss": 0.2192, "step": 11740 }, { "epoch": 33.011428571428574, "grad_norm": 0.10003682225942612, "learning_rate": 3.651428571428572e-06, "loss": 0.236, "step": 11750 }, { "epoch": 33.012, "grad_norm": 0.2135056108236313, "learning_rate": 3.6450793650793653e-06, "loss": 0.0021, "step": 11760 }, { "epoch": 33.01257142857143, "grad_norm": 0.05885402113199234, "learning_rate": 3.638730158730159e-06, "loss": 0.2531, "step": 11770 }, { "epoch": 33.01314285714286, "grad_norm": 0.011742881499230862, "learning_rate": 3.6323809523809527e-06, "loss": 0.4037, "step": 11780 }, { "epoch": 33.013714285714286, "grad_norm": 0.05668189749121666, "learning_rate": 3.6260317460317462e-06, "loss": 0.0036, "step": 11790 }, { "epoch": 33.01428571428571, "grad_norm": 104.82915496826172, "learning_rate": 3.61968253968254e-06, "loss": 0.1297, "step": 11800 }, { "epoch": 33.014857142857146, "grad_norm": 0.22662226855754852, "learning_rate": 3.6133333333333336e-06, "loss": 0.0031, "step": 11810 }, { "epoch": 33.01542857142857, "grad_norm": 0.0017571650678291917, "learning_rate": 3.606984126984127e-06, "loss": 0.3367, "step": 11820 }, { "epoch": 33.016, "grad_norm": 0.3255730867385864, "learning_rate": 3.600634920634921e-06, "loss": 0.0302, "step": 11830 }, { "epoch": 33.01657142857143, "grad_norm": 0.04617001861333847, "learning_rate": 3.5942857142857145e-06, "loss": 0.6351, "step": 11840 }, { "epoch": 33.01714285714286, "grad_norm": 0.001643249997869134, "learning_rate": 3.587936507936508e-06, "loss": 0.4685, "step": 11850 }, { "epoch": 33.017714285714284, "grad_norm": 66.71566772460938, "learning_rate": 3.581587301587302e-06, "loss": 0.4042, "step": 11860 }, { "epoch": 33.01828571428572, "grad_norm": 0.02842831425368786, "learning_rate": 3.5752380952380954e-06, "loss": 0.3481, "step": 11870 }, { "epoch": 33.018857142857144, "grad_norm": 0.8168884515762329, "learning_rate": 3.568888888888889e-06, "loss": 0.4965, "step": 11880 }, { "epoch": 33.01942857142857, "grad_norm": 109.5650863647461, "learning_rate": 3.562539682539683e-06, "loss": 0.5878, "step": 11890 }, { "epoch": 33.02, "grad_norm": 0.12658925354480743, "learning_rate": 3.5561904761904762e-06, "loss": 0.0324, "step": 11900 }, { "epoch": 33.02, "eval_accuracy": 0.835016835016835, "eval_loss": 0.9971614480018616, "eval_runtime": 126.5059, "eval_samples_per_second": 2.348, "eval_steps_per_second": 1.178, "step": 11900 }, { "epoch": 34.000571428571426, "grad_norm": 0.027432570233941078, "learning_rate": 3.5498412698412697e-06, "loss": 0.3804, "step": 11910 }, { "epoch": 34.00114285714286, "grad_norm": 0.004706758074462414, "learning_rate": 3.543492063492064e-06, "loss": 0.2754, "step": 11920 }, { "epoch": 34.001714285714286, "grad_norm": 0.03468727692961693, "learning_rate": 3.5371428571428575e-06, "loss": 0.0022, "step": 11930 }, { "epoch": 34.00228571428571, "grad_norm": 0.16161181032657623, "learning_rate": 3.5307936507936514e-06, "loss": 0.5878, "step": 11940 }, { "epoch": 34.002857142857145, "grad_norm": 0.0015723078977316618, "learning_rate": 3.524444444444445e-06, "loss": 0.0007, "step": 11950 }, { "epoch": 34.00342857142857, "grad_norm": 0.051679786294698715, "learning_rate": 3.5180952380952384e-06, "loss": 0.0014, "step": 11960 }, { "epoch": 34.004, "grad_norm": 0.07004716992378235, "learning_rate": 3.5117460317460323e-06, "loss": 0.4407, "step": 11970 }, { "epoch": 34.00457142857143, "grad_norm": 0.04986123740673065, "learning_rate": 3.5053968253968258e-06, "loss": 0.2698, "step": 11980 }, { "epoch": 34.00514285714286, "grad_norm": 0.007722809910774231, "learning_rate": 3.4990476190476193e-06, "loss": 0.0011, "step": 11990 }, { "epoch": 34.005714285714284, "grad_norm": 0.0745702013373375, "learning_rate": 3.492698412698413e-06, "loss": 0.2513, "step": 12000 }, { "epoch": 34.00628571428572, "grad_norm": 0.043245185166597366, "learning_rate": 3.4863492063492067e-06, "loss": 0.2987, "step": 12010 }, { "epoch": 34.00685714285714, "grad_norm": 0.09795431792736053, "learning_rate": 3.48e-06, "loss": 0.3349, "step": 12020 }, { "epoch": 34.00742857142857, "grad_norm": 0.09430580586194992, "learning_rate": 3.473650793650794e-06, "loss": 0.0011, "step": 12030 }, { "epoch": 34.008, "grad_norm": 0.022420106455683708, "learning_rate": 3.4673015873015875e-06, "loss": 0.267, "step": 12040 }, { "epoch": 34.00857142857143, "grad_norm": 0.045474231243133545, "learning_rate": 3.4609523809523814e-06, "loss": 0.2568, "step": 12050 }, { "epoch": 34.009142857142855, "grad_norm": 80.42121887207031, "learning_rate": 3.454603174603175e-06, "loss": 0.3101, "step": 12060 }, { "epoch": 34.00971428571429, "grad_norm": 20.72455406188965, "learning_rate": 3.4482539682539684e-06, "loss": 0.6013, "step": 12070 }, { "epoch": 34.010285714285715, "grad_norm": 33.085418701171875, "learning_rate": 3.4419047619047623e-06, "loss": 0.5322, "step": 12080 }, { "epoch": 34.01085714285714, "grad_norm": 209.3749542236328, "learning_rate": 3.435555555555556e-06, "loss": 0.2576, "step": 12090 }, { "epoch": 34.011428571428574, "grad_norm": 0.305754691362381, "learning_rate": 3.4292063492063493e-06, "loss": 0.1585, "step": 12100 }, { "epoch": 34.012, "grad_norm": 19.921072006225586, "learning_rate": 3.422857142857143e-06, "loss": 0.2088, "step": 12110 }, { "epoch": 34.01257142857143, "grad_norm": 0.02304167114198208, "learning_rate": 3.4165079365079367e-06, "loss": 0.167, "step": 12120 }, { "epoch": 34.01314285714286, "grad_norm": 0.08073779195547104, "learning_rate": 3.41015873015873e-06, "loss": 0.7815, "step": 12130 }, { "epoch": 34.013714285714286, "grad_norm": 94.09422302246094, "learning_rate": 3.403809523809524e-06, "loss": 0.0197, "step": 12140 }, { "epoch": 34.01428571428571, "grad_norm": 0.05119791999459267, "learning_rate": 3.3974603174603175e-06, "loss": 0.0028, "step": 12150 }, { "epoch": 34.014857142857146, "grad_norm": 15.968472480773926, "learning_rate": 3.391111111111111e-06, "loss": 0.3075, "step": 12160 }, { "epoch": 34.01542857142857, "grad_norm": 95.5150375366211, "learning_rate": 3.3847619047619053e-06, "loss": 0.5475, "step": 12170 }, { "epoch": 34.016, "grad_norm": 69.68689727783203, "learning_rate": 3.3784126984126984e-06, "loss": 0.293, "step": 12180 }, { "epoch": 34.01657142857143, "grad_norm": 0.052636485546827316, "learning_rate": 3.3720634920634927e-06, "loss": 0.2891, "step": 12190 }, { "epoch": 34.01714285714286, "grad_norm": 22.63859748840332, "learning_rate": 3.3657142857142862e-06, "loss": 0.2413, "step": 12200 }, { "epoch": 34.017714285714284, "grad_norm": 0.07254651933908463, "learning_rate": 3.3593650793650797e-06, "loss": 0.2965, "step": 12210 }, { "epoch": 34.01828571428572, "grad_norm": 0.016718924045562744, "learning_rate": 3.3530158730158736e-06, "loss": 0.0566, "step": 12220 }, { "epoch": 34.018857142857144, "grad_norm": 0.0034701621625572443, "learning_rate": 3.346666666666667e-06, "loss": 0.1736, "step": 12230 }, { "epoch": 34.01942857142857, "grad_norm": 0.4016928970813751, "learning_rate": 3.3403174603174606e-06, "loss": 0.3631, "step": 12240 }, { "epoch": 34.02, "grad_norm": 173.18507385253906, "learning_rate": 3.3339682539682545e-06, "loss": 0.4411, "step": 12250 }, { "epoch": 34.02, "eval_accuracy": 0.835016835016835, "eval_loss": 1.059208869934082, "eval_runtime": 126.2592, "eval_samples_per_second": 2.352, "eval_steps_per_second": 1.18, "step": 12250 }, { "epoch": 35.000571428571426, "grad_norm": 0.02301918901503086, "learning_rate": 3.327619047619048e-06, "loss": 0.264, "step": 12260 }, { "epoch": 35.00114285714286, "grad_norm": 0.14874428510665894, "learning_rate": 3.3212698412698414e-06, "loss": 0.0022, "step": 12270 }, { "epoch": 35.001714285714286, "grad_norm": 0.5483258962631226, "learning_rate": 3.3149206349206354e-06, "loss": 0.1753, "step": 12280 }, { "epoch": 35.00228571428571, "grad_norm": 0.0009629835840314627, "learning_rate": 3.308571428571429e-06, "loss": 0.1238, "step": 12290 }, { "epoch": 35.002857142857145, "grad_norm": 0.543134331703186, "learning_rate": 3.3022222222222223e-06, "loss": 0.1245, "step": 12300 }, { "epoch": 35.00342857142857, "grad_norm": 0.08552040904760361, "learning_rate": 3.2958730158730162e-06, "loss": 0.5037, "step": 12310 }, { "epoch": 35.004, "grad_norm": 0.015447727404534817, "learning_rate": 3.2895238095238097e-06, "loss": 0.5062, "step": 12320 }, { "epoch": 35.00457142857143, "grad_norm": 0.004400096833705902, "learning_rate": 3.2831746031746036e-06, "loss": 0.2552, "step": 12330 }, { "epoch": 35.00514285714286, "grad_norm": 0.018583452329039574, "learning_rate": 3.276825396825397e-06, "loss": 0.3969, "step": 12340 }, { "epoch": 35.005714285714284, "grad_norm": 0.04393948242068291, "learning_rate": 3.2704761904761906e-06, "loss": 0.2678, "step": 12350 }, { "epoch": 35.00628571428572, "grad_norm": 0.05897550284862518, "learning_rate": 3.2641269841269845e-06, "loss": 0.0057, "step": 12360 }, { "epoch": 35.00685714285714, "grad_norm": 0.004284753929823637, "learning_rate": 3.257777777777778e-06, "loss": 0.2586, "step": 12370 }, { "epoch": 35.00742857142857, "grad_norm": 0.007391383405774832, "learning_rate": 3.2514285714285715e-06, "loss": 0.6294, "step": 12380 }, { "epoch": 35.008, "grad_norm": 0.05153246968984604, "learning_rate": 3.2450793650793654e-06, "loss": 0.6797, "step": 12390 }, { "epoch": 35.00857142857143, "grad_norm": 20.67218017578125, "learning_rate": 3.238730158730159e-06, "loss": 0.2888, "step": 12400 }, { "epoch": 35.009142857142855, "grad_norm": 0.0017453532200306654, "learning_rate": 3.2323809523809523e-06, "loss": 0.0024, "step": 12410 }, { "epoch": 35.00971428571429, "grad_norm": 0.6204124689102173, "learning_rate": 3.2260317460317462e-06, "loss": 0.3306, "step": 12420 }, { "epoch": 35.010285714285715, "grad_norm": 0.11453156918287277, "learning_rate": 3.2196825396825397e-06, "loss": 0.6737, "step": 12430 }, { "epoch": 35.01085714285714, "grad_norm": 116.97684478759766, "learning_rate": 3.213333333333334e-06, "loss": 0.453, "step": 12440 }, { "epoch": 35.011428571428574, "grad_norm": 0.036407049745321274, "learning_rate": 3.206984126984127e-06, "loss": 0.3485, "step": 12450 }, { "epoch": 35.012, "grad_norm": 0.003050927072763443, "learning_rate": 3.2006349206349206e-06, "loss": 0.4884, "step": 12460 }, { "epoch": 35.01257142857143, "grad_norm": 0.2217385321855545, "learning_rate": 3.194285714285715e-06, "loss": 0.5697, "step": 12470 }, { "epoch": 35.01314285714286, "grad_norm": 4.006773471832275, "learning_rate": 3.1879365079365084e-06, "loss": 0.0041, "step": 12480 }, { "epoch": 35.013714285714286, "grad_norm": 0.09846015274524689, "learning_rate": 3.1815873015873015e-06, "loss": 0.0028, "step": 12490 }, { "epoch": 35.01428571428571, "grad_norm": 0.00579382898285985, "learning_rate": 3.175238095238096e-06, "loss": 0.0024, "step": 12500 }, { "epoch": 35.014857142857146, "grad_norm": 0.08295068144798279, "learning_rate": 3.1688888888888893e-06, "loss": 0.001, "step": 12510 }, { "epoch": 35.01542857142857, "grad_norm": 0.0009396121604368091, "learning_rate": 3.1625396825396828e-06, "loss": 0.0129, "step": 12520 }, { "epoch": 35.016, "grad_norm": 0.007435150910168886, "learning_rate": 3.1561904761904767e-06, "loss": 0.0014, "step": 12530 }, { "epoch": 35.01657142857143, "grad_norm": 0.03919130563735962, "learning_rate": 3.14984126984127e-06, "loss": 0.6772, "step": 12540 }, { "epoch": 35.01714285714286, "grad_norm": 0.03321586921811104, "learning_rate": 3.1434920634920636e-06, "loss": 0.4575, "step": 12550 }, { "epoch": 35.017714285714284, "grad_norm": 0.19160214066505432, "learning_rate": 3.1371428571428575e-06, "loss": 0.195, "step": 12560 }, { "epoch": 35.01828571428572, "grad_norm": 0.0026826318353414536, "learning_rate": 3.130793650793651e-06, "loss": 0.0045, "step": 12570 }, { "epoch": 35.018857142857144, "grad_norm": 0.038780469447374344, "learning_rate": 3.124444444444445e-06, "loss": 0.0011, "step": 12580 }, { "epoch": 35.01942857142857, "grad_norm": 0.07216961681842804, "learning_rate": 3.1180952380952384e-06, "loss": 0.0011, "step": 12590 }, { "epoch": 35.02, "grad_norm": 0.026877250522375107, "learning_rate": 3.111746031746032e-06, "loss": 0.0011, "step": 12600 }, { "epoch": 35.02, "eval_accuracy": 0.8282828282828283, "eval_loss": 1.0746172666549683, "eval_runtime": 126.6057, "eval_samples_per_second": 2.346, "eval_steps_per_second": 1.177, "step": 12600 }, { "epoch": 36.000571428571426, "grad_norm": 0.0886671394109726, "learning_rate": 3.105396825396826e-06, "loss": 0.2623, "step": 12610 }, { "epoch": 36.00114285714286, "grad_norm": 0.012463954277336597, "learning_rate": 3.0990476190476193e-06, "loss": 0.4357, "step": 12620 }, { "epoch": 36.001714285714286, "grad_norm": 0.026730680838227272, "learning_rate": 3.0926984126984128e-06, "loss": 0.2669, "step": 12630 }, { "epoch": 36.00228571428571, "grad_norm": 0.07718096673488617, "learning_rate": 3.0863492063492067e-06, "loss": 0.5815, "step": 12640 }, { "epoch": 36.002857142857145, "grad_norm": 0.2121714949607849, "learning_rate": 3.08e-06, "loss": 0.1494, "step": 12650 }, { "epoch": 36.00342857142857, "grad_norm": 0.1541670709848404, "learning_rate": 3.0736507936507936e-06, "loss": 0.0016, "step": 12660 }, { "epoch": 36.004, "grad_norm": 0.030359311029314995, "learning_rate": 3.0673015873015875e-06, "loss": 0.0706, "step": 12670 }, { "epoch": 36.00457142857143, "grad_norm": 0.07423070073127747, "learning_rate": 3.060952380952381e-06, "loss": 0.234, "step": 12680 }, { "epoch": 36.00514285714286, "grad_norm": 0.005180784966796637, "learning_rate": 3.054603174603175e-06, "loss": 0.2837, "step": 12690 }, { "epoch": 36.005714285714284, "grad_norm": 345.4854736328125, "learning_rate": 3.0482539682539684e-06, "loss": 0.3923, "step": 12700 }, { "epoch": 36.00628571428572, "grad_norm": 0.0064833336509764194, "learning_rate": 3.041904761904762e-06, "loss": 0.0015, "step": 12710 }, { "epoch": 36.00685714285714, "grad_norm": 0.04605857655405998, "learning_rate": 3.0355555555555562e-06, "loss": 0.0033, "step": 12720 }, { "epoch": 36.00742857142857, "grad_norm": 0.10902780294418335, "learning_rate": 3.0292063492063493e-06, "loss": 0.0027, "step": 12730 }, { "epoch": 36.008, "grad_norm": 0.006975686177611351, "learning_rate": 3.0228571428571428e-06, "loss": 0.4695, "step": 12740 }, { "epoch": 36.00857142857143, "grad_norm": 0.09180530905723572, "learning_rate": 3.016507936507937e-06, "loss": 0.0007, "step": 12750 }, { "epoch": 36.009142857142855, "grad_norm": 0.007026137318462133, "learning_rate": 3.0101587301587306e-06, "loss": 0.4985, "step": 12760 }, { "epoch": 36.00971428571429, "grad_norm": 0.005862717051059008, "learning_rate": 3.0038095238095236e-06, "loss": 0.163, "step": 12770 }, { "epoch": 36.010285714285715, "grad_norm": 0.05073744058609009, "learning_rate": 2.997460317460318e-06, "loss": 0.002, "step": 12780 }, { "epoch": 36.01085714285714, "grad_norm": 0.017824208363890648, "learning_rate": 2.9911111111111115e-06, "loss": 0.4371, "step": 12790 }, { "epoch": 36.011428571428574, "grad_norm": 13.799386024475098, "learning_rate": 2.984761904761905e-06, "loss": 0.6038, "step": 12800 }, { "epoch": 36.012, "grad_norm": 199.8282012939453, "learning_rate": 2.978412698412699e-06, "loss": 0.4218, "step": 12810 }, { "epoch": 36.01257142857143, "grad_norm": 0.15811072289943695, "learning_rate": 2.9720634920634923e-06, "loss": 0.0011, "step": 12820 }, { "epoch": 36.01314285714286, "grad_norm": 0.17585258185863495, "learning_rate": 2.9657142857142862e-06, "loss": 0.4932, "step": 12830 }, { "epoch": 36.013714285714286, "grad_norm": 16.838157653808594, "learning_rate": 2.9593650793650797e-06, "loss": 1.1193, "step": 12840 }, { "epoch": 36.01428571428571, "grad_norm": 0.04459202662110329, "learning_rate": 2.953015873015873e-06, "loss": 0.184, "step": 12850 }, { "epoch": 36.014857142857146, "grad_norm": 0.38909366726875305, "learning_rate": 2.946666666666667e-06, "loss": 0.2212, "step": 12860 }, { "epoch": 36.01542857142857, "grad_norm": 0.1248052716255188, "learning_rate": 2.9403174603174606e-06, "loss": 0.0016, "step": 12870 }, { "epoch": 36.016, "grad_norm": 0.5679760575294495, "learning_rate": 2.933968253968254e-06, "loss": 0.0029, "step": 12880 }, { "epoch": 36.01657142857143, "grad_norm": 0.02419651672244072, "learning_rate": 2.927619047619048e-06, "loss": 0.2692, "step": 12890 }, { "epoch": 36.01714285714286, "grad_norm": 0.2062833309173584, "learning_rate": 2.9212698412698415e-06, "loss": 0.1403, "step": 12900 }, { "epoch": 36.017714285714284, "grad_norm": 0.21458952128887177, "learning_rate": 2.914920634920635e-06, "loss": 0.4953, "step": 12910 }, { "epoch": 36.01828571428572, "grad_norm": 0.24665455520153046, "learning_rate": 2.908571428571429e-06, "loss": 0.7257, "step": 12920 }, { "epoch": 36.018857142857144, "grad_norm": 0.5908558368682861, "learning_rate": 2.9022222222222223e-06, "loss": 0.005, "step": 12930 }, { "epoch": 36.01942857142857, "grad_norm": 0.19099822640419006, "learning_rate": 2.895873015873016e-06, "loss": 0.0019, "step": 12940 }, { "epoch": 36.02, "grad_norm": 0.05261155590415001, "learning_rate": 2.8895238095238097e-06, "loss": 0.3917, "step": 12950 }, { "epoch": 36.02, "eval_accuracy": 0.8383838383838383, "eval_loss": 0.9695500135421753, "eval_runtime": 126.7723, "eval_samples_per_second": 2.343, "eval_steps_per_second": 1.175, "step": 12950 }, { "epoch": 37.000571428571426, "grad_norm": 0.07244765758514404, "learning_rate": 2.8831746031746032e-06, "loss": 0.0006, "step": 12960 }, { "epoch": 37.00114285714286, "grad_norm": 0.04542018845677376, "learning_rate": 2.876825396825397e-06, "loss": 0.4526, "step": 12970 }, { "epoch": 37.001714285714286, "grad_norm": 0.007704177405685186, "learning_rate": 2.8704761904761906e-06, "loss": 0.0015, "step": 12980 }, { "epoch": 37.00228571428571, "grad_norm": 0.01741478592157364, "learning_rate": 2.864126984126984e-06, "loss": 0.4346, "step": 12990 }, { "epoch": 37.002857142857145, "grad_norm": 0.020365318283438683, "learning_rate": 2.8577777777777784e-06, "loss": 0.2763, "step": 13000 }, { "epoch": 37.00342857142857, "grad_norm": 0.10666250437498093, "learning_rate": 2.8514285714285715e-06, "loss": 0.1876, "step": 13010 }, { "epoch": 37.004, "grad_norm": 0.0016732965596020222, "learning_rate": 2.845079365079365e-06, "loss": 0.0018, "step": 13020 }, { "epoch": 37.00457142857143, "grad_norm": 0.025395436212420464, "learning_rate": 2.8387301587301593e-06, "loss": 0.0012, "step": 13030 }, { "epoch": 37.00514285714286, "grad_norm": 2.5686378479003906, "learning_rate": 2.8323809523809528e-06, "loss": 0.197, "step": 13040 }, { "epoch": 37.005714285714284, "grad_norm": 0.044472586363554, "learning_rate": 2.826031746031746e-06, "loss": 0.0021, "step": 13050 }, { "epoch": 37.00628571428572, "grad_norm": 0.05117283761501312, "learning_rate": 2.81968253968254e-06, "loss": 0.0008, "step": 13060 }, { "epoch": 37.00685714285714, "grad_norm": 0.1931295245885849, "learning_rate": 2.8133333333333336e-06, "loss": 0.0012, "step": 13070 }, { "epoch": 37.00742857142857, "grad_norm": 0.0703316405415535, "learning_rate": 2.8069841269841276e-06, "loss": 0.0013, "step": 13080 }, { "epoch": 37.008, "grad_norm": 0.03455556929111481, "learning_rate": 2.800634920634921e-06, "loss": 0.0009, "step": 13090 }, { "epoch": 37.00857142857143, "grad_norm": 0.007254268042743206, "learning_rate": 2.7942857142857145e-06, "loss": 0.906, "step": 13100 }, { "epoch": 37.009142857142855, "grad_norm": 0.01259919349104166, "learning_rate": 2.7879365079365084e-06, "loss": 0.5051, "step": 13110 }, { "epoch": 37.00971428571429, "grad_norm": 0.05640314146876335, "learning_rate": 2.781587301587302e-06, "loss": 0.0017, "step": 13120 }, { "epoch": 37.010285714285715, "grad_norm": 0.0015288791619241238, "learning_rate": 2.7752380952380954e-06, "loss": 0.6821, "step": 13130 }, { "epoch": 37.01085714285714, "grad_norm": 0.1644653081893921, "learning_rate": 2.7688888888888893e-06, "loss": 0.1161, "step": 13140 }, { "epoch": 37.011428571428574, "grad_norm": 0.0032219102140516043, "learning_rate": 2.7625396825396828e-06, "loss": 0.0012, "step": 13150 }, { "epoch": 37.012, "grad_norm": 0.1719381958246231, "learning_rate": 2.7561904761904763e-06, "loss": 0.0014, "step": 13160 }, { "epoch": 37.01257142857143, "grad_norm": 0.04314820095896721, "learning_rate": 2.74984126984127e-06, "loss": 0.0016, "step": 13170 }, { "epoch": 37.01314285714286, "grad_norm": 0.05535136163234711, "learning_rate": 2.7434920634920637e-06, "loss": 0.0016, "step": 13180 }, { "epoch": 37.013714285714286, "grad_norm": 0.004680715035647154, "learning_rate": 2.737142857142857e-06, "loss": 0.0195, "step": 13190 }, { "epoch": 37.01428571428571, "grad_norm": 0.020550280809402466, "learning_rate": 2.730793650793651e-06, "loss": 0.0009, "step": 13200 }, { "epoch": 37.014857142857146, "grad_norm": 0.02597636915743351, "learning_rate": 2.7244444444444445e-06, "loss": 0.3391, "step": 13210 }, { "epoch": 37.01542857142857, "grad_norm": 0.022728653624653816, "learning_rate": 2.7180952380952384e-06, "loss": 0.3329, "step": 13220 }, { "epoch": 37.016, "grad_norm": 37.18050003051758, "learning_rate": 2.711746031746032e-06, "loss": 1.1542, "step": 13230 }, { "epoch": 37.01657142857143, "grad_norm": 0.11314905434846878, "learning_rate": 2.7053968253968254e-06, "loss": 0.3948, "step": 13240 }, { "epoch": 37.01714285714286, "grad_norm": 16.31825065612793, "learning_rate": 2.6990476190476193e-06, "loss": 0.4456, "step": 13250 }, { "epoch": 37.017714285714284, "grad_norm": 0.0014047048753127456, "learning_rate": 2.6926984126984128e-06, "loss": 0.2791, "step": 13260 }, { "epoch": 37.01828571428572, "grad_norm": 0.059588722884655, "learning_rate": 2.6863492063492063e-06, "loss": 0.2329, "step": 13270 }, { "epoch": 37.018857142857144, "grad_norm": 0.005540232639759779, "learning_rate": 2.68e-06, "loss": 0.0019, "step": 13280 }, { "epoch": 37.01942857142857, "grad_norm": 18.04555892944336, "learning_rate": 2.6736507936507937e-06, "loss": 0.7975, "step": 13290 }, { "epoch": 37.02, "grad_norm": 0.0029781581833958626, "learning_rate": 2.667301587301587e-06, "loss": 0.7268, "step": 13300 }, { "epoch": 37.02, "eval_accuracy": 0.8181818181818182, "eval_loss": 1.1061733961105347, "eval_runtime": 126.817, "eval_samples_per_second": 2.342, "eval_steps_per_second": 1.175, "step": 13300 }, { "epoch": 38.000571428571426, "grad_norm": 0.47502627968788147, "learning_rate": 2.6609523809523815e-06, "loss": 0.3805, "step": 13310 }, { "epoch": 38.00114285714286, "grad_norm": 0.08994001895189285, "learning_rate": 2.6546031746031745e-06, "loss": 0.003, "step": 13320 }, { "epoch": 38.001714285714286, "grad_norm": 0.06851596385240555, "learning_rate": 2.648253968253969e-06, "loss": 0.0035, "step": 13330 }, { "epoch": 38.00228571428571, "grad_norm": 0.1989380419254303, "learning_rate": 2.6419047619047623e-06, "loss": 0.3274, "step": 13340 }, { "epoch": 38.002857142857145, "grad_norm": 0.04935070872306824, "learning_rate": 2.635555555555556e-06, "loss": 0.4874, "step": 13350 }, { "epoch": 38.00342857142857, "grad_norm": 0.04223153740167618, "learning_rate": 2.6292063492063497e-06, "loss": 0.2244, "step": 13360 }, { "epoch": 38.004, "grad_norm": 0.06442692130804062, "learning_rate": 2.6228571428571432e-06, "loss": 0.005, "step": 13370 }, { "epoch": 38.00457142857143, "grad_norm": 0.04460546746850014, "learning_rate": 2.6165079365079367e-06, "loss": 0.238, "step": 13380 }, { "epoch": 38.00514285714286, "grad_norm": 0.0043782079592347145, "learning_rate": 2.6101587301587306e-06, "loss": 0.2346, "step": 13390 }, { "epoch": 38.005714285714284, "grad_norm": 0.05058957263827324, "learning_rate": 2.603809523809524e-06, "loss": 0.4129, "step": 13400 }, { "epoch": 38.00628571428572, "grad_norm": 0.020779475569725037, "learning_rate": 2.5974603174603176e-06, "loss": 0.154, "step": 13410 }, { "epoch": 38.00685714285714, "grad_norm": 0.3094046413898468, "learning_rate": 2.5911111111111115e-06, "loss": 0.3172, "step": 13420 }, { "epoch": 38.00742857142857, "grad_norm": 0.38032662868499756, "learning_rate": 2.584761904761905e-06, "loss": 0.2117, "step": 13430 }, { "epoch": 38.008, "grad_norm": 0.004349476657807827, "learning_rate": 2.5784126984126984e-06, "loss": 0.2803, "step": 13440 }, { "epoch": 38.00857142857143, "grad_norm": 0.06518778204917908, "learning_rate": 2.5720634920634924e-06, "loss": 0.2218, "step": 13450 }, { "epoch": 38.009142857142855, "grad_norm": 0.09332817047834396, "learning_rate": 2.565714285714286e-06, "loss": 0.2499, "step": 13460 }, { "epoch": 38.00971428571429, "grad_norm": 0.11110708862543106, "learning_rate": 2.5593650793650797e-06, "loss": 0.6162, "step": 13470 }, { "epoch": 38.010285714285715, "grad_norm": 0.07068169862031937, "learning_rate": 2.5530158730158732e-06, "loss": 0.4614, "step": 13480 }, { "epoch": 38.01085714285714, "grad_norm": 0.0015391431516036391, "learning_rate": 2.5466666666666667e-06, "loss": 0.3118, "step": 13490 }, { "epoch": 38.011428571428574, "grad_norm": 0.05324092507362366, "learning_rate": 2.5403174603174606e-06, "loss": 0.0024, "step": 13500 }, { "epoch": 38.012, "grad_norm": 0.03473285958170891, "learning_rate": 2.533968253968254e-06, "loss": 0.1767, "step": 13510 }, { "epoch": 38.01257142857143, "grad_norm": 0.05727458372712135, "learning_rate": 2.5276190476190476e-06, "loss": 0.2763, "step": 13520 }, { "epoch": 38.01314285714286, "grad_norm": 0.04184458777308464, "learning_rate": 2.5212698412698415e-06, "loss": 0.0069, "step": 13530 }, { "epoch": 38.013714285714286, "grad_norm": 0.003181879874318838, "learning_rate": 2.514920634920635e-06, "loss": 0.001, "step": 13540 }, { "epoch": 38.01428571428571, "grad_norm": 0.019657937809824944, "learning_rate": 2.5085714285714285e-06, "loss": 0.0081, "step": 13550 }, { "epoch": 38.014857142857146, "grad_norm": 0.032800447195768356, "learning_rate": 2.5022222222222224e-06, "loss": 0.0018, "step": 13560 }, { "epoch": 38.01542857142857, "grad_norm": 0.04382657632231712, "learning_rate": 2.495873015873016e-06, "loss": 0.0013, "step": 13570 }, { "epoch": 38.016, "grad_norm": 0.008959997445344925, "learning_rate": 2.4895238095238097e-06, "loss": 0.0004, "step": 13580 }, { "epoch": 38.01657142857143, "grad_norm": 0.013616718351840973, "learning_rate": 2.4831746031746037e-06, "loss": 0.0008, "step": 13590 }, { "epoch": 38.01714285714286, "grad_norm": 27.68400001525879, "learning_rate": 2.4768253968253967e-06, "loss": 0.4851, "step": 13600 }, { "epoch": 38.017714285714284, "grad_norm": 20.209680557250977, "learning_rate": 2.4704761904761906e-06, "loss": 0.3812, "step": 13610 }, { "epoch": 38.01828571428572, "grad_norm": 0.13937675952911377, "learning_rate": 2.4641269841269845e-06, "loss": 0.2243, "step": 13620 }, { "epoch": 38.018857142857144, "grad_norm": 0.0037647627759724855, "learning_rate": 2.457777777777778e-06, "loss": 0.6971, "step": 13630 }, { "epoch": 38.01942857142857, "grad_norm": 0.033469632267951965, "learning_rate": 2.4514285714285715e-06, "loss": 0.3434, "step": 13640 }, { "epoch": 38.02, "grad_norm": 0.023815317079424858, "learning_rate": 2.4450793650793654e-06, "loss": 0.3747, "step": 13650 }, { "epoch": 38.02, "eval_accuracy": 0.835016835016835, "eval_loss": 1.036763072013855, "eval_runtime": 126.6279, "eval_samples_per_second": 2.345, "eval_steps_per_second": 1.177, "step": 13650 }, { "epoch": 39.000571428571426, "grad_norm": 49.7794075012207, "learning_rate": 2.438730158730159e-06, "loss": 0.003, "step": 13660 }, { "epoch": 39.00114285714286, "grad_norm": 0.03126109018921852, "learning_rate": 2.4323809523809524e-06, "loss": 0.1622, "step": 13670 }, { "epoch": 39.001714285714286, "grad_norm": 0.047299765050411224, "learning_rate": 2.4260317460317463e-06, "loss": 0.2872, "step": 13680 }, { "epoch": 39.00228571428571, "grad_norm": 0.05949264392256737, "learning_rate": 2.4196825396825398e-06, "loss": 0.2092, "step": 13690 }, { "epoch": 39.002857142857145, "grad_norm": 0.026437293738126755, "learning_rate": 2.4133333333333337e-06, "loss": 0.0031, "step": 13700 }, { "epoch": 39.00342857142857, "grad_norm": 0.20057931542396545, "learning_rate": 2.406984126984127e-06, "loss": 0.4873, "step": 13710 }, { "epoch": 39.004, "grad_norm": 0.004649260081350803, "learning_rate": 2.4006349206349206e-06, "loss": 0.2073, "step": 13720 }, { "epoch": 39.00457142857143, "grad_norm": 0.5119889974594116, "learning_rate": 2.3942857142857145e-06, "loss": 0.4199, "step": 13730 }, { "epoch": 39.00514285714286, "grad_norm": 0.16033364832401276, "learning_rate": 2.387936507936508e-06, "loss": 0.0135, "step": 13740 }, { "epoch": 39.005714285714284, "grad_norm": 0.0032101422548294067, "learning_rate": 2.381587301587302e-06, "loss": 0.9572, "step": 13750 }, { "epoch": 39.00628571428572, "grad_norm": 0.026194965466856956, "learning_rate": 2.3752380952380954e-06, "loss": 1.2531, "step": 13760 }, { "epoch": 39.00685714285714, "grad_norm": 0.0062312232330441475, "learning_rate": 2.3688888888888893e-06, "loss": 0.2321, "step": 13770 }, { "epoch": 39.00742857142857, "grad_norm": 0.07098673284053802, "learning_rate": 2.362539682539683e-06, "loss": 0.0025, "step": 13780 }, { "epoch": 39.008, "grad_norm": 0.0009907495696097612, "learning_rate": 2.3561904761904763e-06, "loss": 0.563, "step": 13790 }, { "epoch": 39.00857142857143, "grad_norm": 0.0647115707397461, "learning_rate": 2.34984126984127e-06, "loss": 0.8326, "step": 13800 }, { "epoch": 39.009142857142855, "grad_norm": 0.0811251774430275, "learning_rate": 2.3434920634920637e-06, "loss": 0.6204, "step": 13810 }, { "epoch": 39.00971428571429, "grad_norm": 0.011888241395354271, "learning_rate": 2.337142857142857e-06, "loss": 0.0029, "step": 13820 }, { "epoch": 39.010285714285715, "grad_norm": 0.06965713948011398, "learning_rate": 2.330793650793651e-06, "loss": 0.0021, "step": 13830 }, { "epoch": 39.01085714285714, "grad_norm": 0.0806642472743988, "learning_rate": 2.3244444444444445e-06, "loss": 0.2988, "step": 13840 }, { "epoch": 39.011428571428574, "grad_norm": 0.007325103506445885, "learning_rate": 2.318095238095238e-06, "loss": 0.0013, "step": 13850 }, { "epoch": 39.012, "grad_norm": 0.11560201644897461, "learning_rate": 2.311746031746032e-06, "loss": 0.1653, "step": 13860 }, { "epoch": 39.01257142857143, "grad_norm": 0.033399466425180435, "learning_rate": 2.305396825396826e-06, "loss": 0.0027, "step": 13870 }, { "epoch": 39.01314285714286, "grad_norm": 1141.728515625, "learning_rate": 2.2990476190476193e-06, "loss": 0.0865, "step": 13880 }, { "epoch": 39.013714285714286, "grad_norm": 0.0021837761159986258, "learning_rate": 2.292698412698413e-06, "loss": 0.2472, "step": 13890 }, { "epoch": 39.01428571428571, "grad_norm": 0.008280741050839424, "learning_rate": 2.2863492063492067e-06, "loss": 0.2712, "step": 13900 }, { "epoch": 39.014857142857146, "grad_norm": 0.00286860391497612, "learning_rate": 2.28e-06, "loss": 0.2084, "step": 13910 }, { "epoch": 39.01542857142857, "grad_norm": 0.012107732705771923, "learning_rate": 2.2736507936507937e-06, "loss": 0.0006, "step": 13920 }, { "epoch": 39.016, "grad_norm": 0.0011101092677563429, "learning_rate": 2.2673015873015876e-06, "loss": 0.3675, "step": 13930 }, { "epoch": 39.01657142857143, "grad_norm": 0.03219223394989967, "learning_rate": 2.260952380952381e-06, "loss": 0.0032, "step": 13940 }, { "epoch": 39.01714285714286, "grad_norm": 0.0029516194481402636, "learning_rate": 2.254603174603175e-06, "loss": 0.0086, "step": 13950 }, { "epoch": 39.017714285714284, "grad_norm": 0.030327564105391502, "learning_rate": 2.2482539682539685e-06, "loss": 0.2982, "step": 13960 }, { "epoch": 39.01828571428572, "grad_norm": 0.010290997102856636, "learning_rate": 2.241904761904762e-06, "loss": 0.2149, "step": 13970 }, { "epoch": 39.018857142857144, "grad_norm": 0.031805507838726044, "learning_rate": 2.235555555555556e-06, "loss": 0.1666, "step": 13980 }, { "epoch": 39.01942857142857, "grad_norm": 0.05369047820568085, "learning_rate": 2.2292063492063493e-06, "loss": 0.0689, "step": 13990 }, { "epoch": 39.02, "grad_norm": 0.07717669010162354, "learning_rate": 2.222857142857143e-06, "loss": 0.5584, "step": 14000 }, { "epoch": 39.02, "eval_accuracy": 0.8417508417508418, "eval_loss": 1.0148627758026123, "eval_runtime": 126.757, "eval_samples_per_second": 2.343, "eval_steps_per_second": 1.175, "step": 14000 }, { "epoch": 40.000571428571426, "grad_norm": 0.2859209179878235, "learning_rate": 2.2165079365079367e-06, "loss": 0.001, "step": 14010 }, { "epoch": 40.00114285714286, "grad_norm": 0.06745479255914688, "learning_rate": 2.2101587301587306e-06, "loss": 0.0017, "step": 14020 }, { "epoch": 40.001714285714286, "grad_norm": 0.08140210062265396, "learning_rate": 2.203809523809524e-06, "loss": 0.0011, "step": 14030 }, { "epoch": 40.00228571428571, "grad_norm": 27.934417724609375, "learning_rate": 2.1974603174603176e-06, "loss": 0.1988, "step": 14040 }, { "epoch": 40.002857142857145, "grad_norm": 0.01658036932349205, "learning_rate": 2.1911111111111115e-06, "loss": 0.5945, "step": 14050 }, { "epoch": 40.00342857142857, "grad_norm": 0.0029238651040941477, "learning_rate": 2.184761904761905e-06, "loss": 0.001, "step": 14060 }, { "epoch": 40.004, "grad_norm": 0.5616852045059204, "learning_rate": 2.1784126984126985e-06, "loss": 0.2197, "step": 14070 }, { "epoch": 40.00457142857143, "grad_norm": 0.005956857465207577, "learning_rate": 2.1720634920634924e-06, "loss": 0.007, "step": 14080 }, { "epoch": 40.00514285714286, "grad_norm": 0.0020911735482513905, "learning_rate": 2.165714285714286e-06, "loss": 0.2357, "step": 14090 }, { "epoch": 40.005714285714284, "grad_norm": 0.010295256972312927, "learning_rate": 2.1593650793650793e-06, "loss": 0.2058, "step": 14100 }, { "epoch": 40.00628571428572, "grad_norm": 390.59674072265625, "learning_rate": 2.1530158730158732e-06, "loss": 0.4373, "step": 14110 }, { "epoch": 40.00685714285714, "grad_norm": 0.01252991147339344, "learning_rate": 2.1466666666666667e-06, "loss": 0.0004, "step": 14120 }, { "epoch": 40.00742857142857, "grad_norm": 0.0566883347928524, "learning_rate": 2.1403174603174606e-06, "loss": 0.3251, "step": 14130 }, { "epoch": 40.008, "grad_norm": 0.0027798376977443695, "learning_rate": 2.133968253968254e-06, "loss": 0.0004, "step": 14140 }, { "epoch": 40.00857142857143, "grad_norm": 0.049019478261470795, "learning_rate": 2.1276190476190476e-06, "loss": 0.0005, "step": 14150 }, { "epoch": 40.009142857142855, "grad_norm": 0.25185105204582214, "learning_rate": 2.1212698412698415e-06, "loss": 0.0016, "step": 14160 }, { "epoch": 40.00971428571429, "grad_norm": 0.05718778818845749, "learning_rate": 2.114920634920635e-06, "loss": 0.2511, "step": 14170 }, { "epoch": 40.010285714285715, "grad_norm": 0.017743902280926704, "learning_rate": 2.108571428571429e-06, "loss": 0.2719, "step": 14180 }, { "epoch": 40.01085714285714, "grad_norm": 0.003888359060510993, "learning_rate": 2.1022222222222224e-06, "loss": 0.001, "step": 14190 }, { "epoch": 40.011428571428574, "grad_norm": 0.1548689901828766, "learning_rate": 2.0958730158730163e-06, "loss": 0.2496, "step": 14200 }, { "epoch": 40.012, "grad_norm": 0.14938634634017944, "learning_rate": 2.0895238095238098e-06, "loss": 0.1626, "step": 14210 }, { "epoch": 40.01257142857143, "grad_norm": 0.0049010817892849445, "learning_rate": 2.0831746031746032e-06, "loss": 0.0009, "step": 14220 }, { "epoch": 40.01314285714286, "grad_norm": 0.06158607453107834, "learning_rate": 2.076825396825397e-06, "loss": 0.6294, "step": 14230 }, { "epoch": 40.013714285714286, "grad_norm": 0.396128386259079, "learning_rate": 2.0704761904761906e-06, "loss": 0.2451, "step": 14240 }, { "epoch": 40.01428571428571, "grad_norm": 0.7225034236907959, "learning_rate": 2.064126984126984e-06, "loss": 0.1585, "step": 14250 }, { "epoch": 40.014857142857146, "grad_norm": 28.033405303955078, "learning_rate": 2.057777777777778e-06, "loss": 0.2748, "step": 14260 }, { "epoch": 40.01542857142857, "grad_norm": 95.52556610107422, "learning_rate": 2.0514285714285715e-06, "loss": 0.1725, "step": 14270 }, { "epoch": 40.016, "grad_norm": 75.94400787353516, "learning_rate": 2.045079365079365e-06, "loss": 0.158, "step": 14280 }, { "epoch": 40.01657142857143, "grad_norm": 0.08160189539194107, "learning_rate": 2.038730158730159e-06, "loss": 0.0047, "step": 14290 }, { "epoch": 40.01714285714286, "grad_norm": 0.04288149252533913, "learning_rate": 2.032380952380953e-06, "loss": 0.002, "step": 14300 }, { "epoch": 40.017714285714284, "grad_norm": 0.013937451876699924, "learning_rate": 2.026031746031746e-06, "loss": 0.0041, "step": 14310 }, { "epoch": 40.01828571428572, "grad_norm": 91.30559539794922, "learning_rate": 2.0196825396825398e-06, "loss": 0.2636, "step": 14320 }, { "epoch": 40.018857142857144, "grad_norm": 0.051494400948286057, "learning_rate": 2.0133333333333337e-06, "loss": 0.0489, "step": 14330 }, { "epoch": 40.01942857142857, "grad_norm": 0.0007844906649552286, "learning_rate": 2.006984126984127e-06, "loss": 0.6524, "step": 14340 }, { "epoch": 40.02, "grad_norm": 19.62409210205078, "learning_rate": 2.0006349206349206e-06, "loss": 0.4637, "step": 14350 }, { "epoch": 40.02, "eval_accuracy": 0.8316498316498316, "eval_loss": 1.0104492902755737, "eval_runtime": 126.2637, "eval_samples_per_second": 2.352, "eval_steps_per_second": 1.18, "step": 14350 }, { "epoch": 41.000571428571426, "grad_norm": 0.10992828756570816, "learning_rate": 1.9942857142857146e-06, "loss": 0.1912, "step": 14360 }, { "epoch": 41.00114285714286, "grad_norm": 7.585694313049316, "learning_rate": 1.987936507936508e-06, "loss": 0.0027, "step": 14370 }, { "epoch": 41.001714285714286, "grad_norm": 0.027735279873013496, "learning_rate": 1.9815873015873015e-06, "loss": 0.3344, "step": 14380 }, { "epoch": 41.00228571428571, "grad_norm": 0.02809802256524563, "learning_rate": 1.9752380952380954e-06, "loss": 0.0989, "step": 14390 }, { "epoch": 41.002857142857145, "grad_norm": 0.0027615518774837255, "learning_rate": 1.968888888888889e-06, "loss": 0.0007, "step": 14400 }, { "epoch": 41.00342857142857, "grad_norm": 0.009083566255867481, "learning_rate": 1.962539682539683e-06, "loss": 0.0071, "step": 14410 }, { "epoch": 41.004, "grad_norm": 43.1031379699707, "learning_rate": 1.9561904761904763e-06, "loss": 0.213, "step": 14420 }, { "epoch": 41.00457142857143, "grad_norm": 0.06005506217479706, "learning_rate": 1.9498412698412698e-06, "loss": 0.0027, "step": 14430 }, { "epoch": 41.00514285714286, "grad_norm": 0.10170082747936249, "learning_rate": 1.9434920634920637e-06, "loss": 0.3536, "step": 14440 }, { "epoch": 41.005714285714284, "grad_norm": 0.0068178740330040455, "learning_rate": 1.9371428571428576e-06, "loss": 0.3189, "step": 14450 }, { "epoch": 41.00628571428572, "grad_norm": 381.4216003417969, "learning_rate": 1.930793650793651e-06, "loss": 0.1193, "step": 14460 }, { "epoch": 41.00685714285714, "grad_norm": 0.004170392639935017, "learning_rate": 1.9244444444444446e-06, "loss": 0.258, "step": 14470 }, { "epoch": 41.00742857142857, "grad_norm": 0.059858277440071106, "learning_rate": 1.9180952380952385e-06, "loss": 0.2108, "step": 14480 }, { "epoch": 41.008, "grad_norm": 0.0008723547798581421, "learning_rate": 1.911746031746032e-06, "loss": 0.2695, "step": 14490 }, { "epoch": 41.00857142857143, "grad_norm": 44.76935958862305, "learning_rate": 1.9053968253968254e-06, "loss": 0.7032, "step": 14500 }, { "epoch": 41.009142857142855, "grad_norm": 0.011419898830354214, "learning_rate": 1.8990476190476193e-06, "loss": 0.0005, "step": 14510 }, { "epoch": 41.00971428571429, "grad_norm": 0.001278414623811841, "learning_rate": 1.892698412698413e-06, "loss": 0.0007, "step": 14520 }, { "epoch": 41.010285714285715, "grad_norm": 0.011867762543261051, "learning_rate": 1.8863492063492065e-06, "loss": 0.0005, "step": 14530 }, { "epoch": 41.01085714285714, "grad_norm": 0.014038086868822575, "learning_rate": 1.8800000000000002e-06, "loss": 0.6461, "step": 14540 }, { "epoch": 41.011428571428574, "grad_norm": 32.356666564941406, "learning_rate": 1.873650793650794e-06, "loss": 0.5288, "step": 14550 }, { "epoch": 41.012, "grad_norm": 0.18130460381507874, "learning_rate": 1.8673015873015874e-06, "loss": 0.7912, "step": 14560 }, { "epoch": 41.01257142857143, "grad_norm": 0.2205055207014084, "learning_rate": 1.860952380952381e-06, "loss": 0.0011, "step": 14570 }, { "epoch": 41.01314285714286, "grad_norm": 0.002843776484951377, "learning_rate": 1.8546031746031748e-06, "loss": 0.0006, "step": 14580 }, { "epoch": 41.013714285714286, "grad_norm": 0.0019020799081772566, "learning_rate": 1.8482539682539685e-06, "loss": 0.0116, "step": 14590 }, { "epoch": 41.01428571428571, "grad_norm": 0.12416958808898926, "learning_rate": 1.841904761904762e-06, "loss": 0.2979, "step": 14600 }, { "epoch": 41.014857142857146, "grad_norm": 0.0510811023414135, "learning_rate": 1.8355555555555557e-06, "loss": 0.2946, "step": 14610 }, { "epoch": 41.01542857142857, "grad_norm": 0.04932905361056328, "learning_rate": 1.8292063492063493e-06, "loss": 0.557, "step": 14620 }, { "epoch": 41.016, "grad_norm": 6.153399467468262, "learning_rate": 1.8228571428571428e-06, "loss": 0.2424, "step": 14630 }, { "epoch": 41.01657142857143, "grad_norm": 0.1840139478445053, "learning_rate": 1.8165079365079365e-06, "loss": 0.3622, "step": 14640 }, { "epoch": 41.01714285714286, "grad_norm": 0.05411524698138237, "learning_rate": 1.8101587301587304e-06, "loss": 0.2168, "step": 14650 }, { "epoch": 41.017714285714284, "grad_norm": 0.02711281180381775, "learning_rate": 1.8038095238095241e-06, "loss": 0.3268, "step": 14660 }, { "epoch": 41.01828571428572, "grad_norm": 0.005898493342101574, "learning_rate": 1.7974603174603176e-06, "loss": 0.003, "step": 14670 }, { "epoch": 41.018857142857144, "grad_norm": 0.08160807937383652, "learning_rate": 1.7911111111111113e-06, "loss": 0.0009, "step": 14680 }, { "epoch": 41.01942857142857, "grad_norm": 0.0039462801069021225, "learning_rate": 1.784761904761905e-06, "loss": 0.0006, "step": 14690 }, { "epoch": 41.02, "grad_norm": 1.8989238739013672, "learning_rate": 1.7784126984126985e-06, "loss": 0.0014, "step": 14700 }, { "epoch": 41.02, "eval_accuracy": 0.8417508417508418, "eval_loss": 1.043727159500122, "eval_runtime": 126.4211, "eval_samples_per_second": 2.349, "eval_steps_per_second": 1.179, "step": 14700 }, { "epoch": 42.000571428571426, "grad_norm": 0.014098647981882095, "learning_rate": 1.7720634920634922e-06, "loss": 0.1607, "step": 14710 }, { "epoch": 42.00114285714286, "grad_norm": 20.41655158996582, "learning_rate": 1.7657142857142859e-06, "loss": 0.7527, "step": 14720 }, { "epoch": 42.001714285714286, "grad_norm": 0.023175185546278954, "learning_rate": 1.7593650793650796e-06, "loss": 0.2158, "step": 14730 }, { "epoch": 42.00228571428571, "grad_norm": 0.013949972577393055, "learning_rate": 1.753015873015873e-06, "loss": 0.0014, "step": 14740 }, { "epoch": 42.002857142857145, "grad_norm": 78.23534393310547, "learning_rate": 1.7466666666666667e-06, "loss": 0.2982, "step": 14750 }, { "epoch": 42.00342857142857, "grad_norm": 0.06348254531621933, "learning_rate": 1.7403174603174604e-06, "loss": 0.3159, "step": 14760 }, { "epoch": 42.004, "grad_norm": 0.0164299625903368, "learning_rate": 1.7339682539682543e-06, "loss": 0.0015, "step": 14770 }, { "epoch": 42.00457142857143, "grad_norm": 0.0010301744332537055, "learning_rate": 1.7276190476190476e-06, "loss": 0.0005, "step": 14780 }, { "epoch": 42.00514285714286, "grad_norm": 0.11905546486377716, "learning_rate": 1.7212698412698415e-06, "loss": 0.0194, "step": 14790 }, { "epoch": 42.005714285714284, "grad_norm": 0.05502673611044884, "learning_rate": 1.7149206349206352e-06, "loss": 0.0014, "step": 14800 }, { "epoch": 42.00628571428572, "grad_norm": 0.005338889546692371, "learning_rate": 1.7085714285714287e-06, "loss": 0.0021, "step": 14810 }, { "epoch": 42.00685714285714, "grad_norm": 0.021363425999879837, "learning_rate": 1.7022222222222224e-06, "loss": 0.2946, "step": 14820 }, { "epoch": 42.00742857142857, "grad_norm": 0.0004772421089001, "learning_rate": 1.695873015873016e-06, "loss": 0.3743, "step": 14830 }, { "epoch": 42.008, "grad_norm": 0.08759511262178421, "learning_rate": 1.6895238095238098e-06, "loss": 0.3515, "step": 14840 }, { "epoch": 42.00857142857143, "grad_norm": 0.03472195193171501, "learning_rate": 1.6831746031746033e-06, "loss": 0.0011, "step": 14850 }, { "epoch": 42.009142857142855, "grad_norm": 0.011670676060020924, "learning_rate": 1.676825396825397e-06, "loss": 0.3558, "step": 14860 }, { "epoch": 42.00971428571429, "grad_norm": 0.013489765115082264, "learning_rate": 1.6704761904761907e-06, "loss": 0.0845, "step": 14870 }, { "epoch": 42.010285714285715, "grad_norm": 0.005060871131718159, "learning_rate": 1.6641269841269841e-06, "loss": 0.0024, "step": 14880 }, { "epoch": 42.01085714285714, "grad_norm": 0.08271327614784241, "learning_rate": 1.6577777777777778e-06, "loss": 0.3332, "step": 14890 }, { "epoch": 42.011428571428574, "grad_norm": 0.0020348222460597754, "learning_rate": 1.6514285714285715e-06, "loss": 0.9036, "step": 14900 }, { "epoch": 42.012, "grad_norm": 0.020888779312372208, "learning_rate": 1.6450793650793654e-06, "loss": 0.0897, "step": 14910 }, { "epoch": 42.01257142857143, "grad_norm": 0.030984889715909958, "learning_rate": 1.6387301587301587e-06, "loss": 0.5513, "step": 14920 }, { "epoch": 42.01314285714286, "grad_norm": 90.73495483398438, "learning_rate": 1.6323809523809526e-06, "loss": 0.3941, "step": 14930 }, { "epoch": 42.013714285714286, "grad_norm": 0.013540121726691723, "learning_rate": 1.6260317460317463e-06, "loss": 0.0006, "step": 14940 }, { "epoch": 42.01428571428571, "grad_norm": 0.0985909178853035, "learning_rate": 1.6196825396825398e-06, "loss": 0.3027, "step": 14950 }, { "epoch": 42.014857142857146, "grad_norm": 0.001703809481114149, "learning_rate": 1.6133333333333335e-06, "loss": 0.3457, "step": 14960 }, { "epoch": 42.01542857142857, "grad_norm": 0.005111075472086668, "learning_rate": 1.6069841269841272e-06, "loss": 0.2664, "step": 14970 }, { "epoch": 42.016, "grad_norm": 0.0010470590787008405, "learning_rate": 1.6006349206349209e-06, "loss": 0.0113, "step": 14980 }, { "epoch": 42.01657142857143, "grad_norm": 0.08761300891637802, "learning_rate": 1.5942857142857144e-06, "loss": 0.327, "step": 14990 }, { "epoch": 42.01714285714286, "grad_norm": 0.0029850888531655073, "learning_rate": 1.587936507936508e-06, "loss": 0.0013, "step": 15000 }, { "epoch": 42.017714285714284, "grad_norm": 0.006845708005130291, "learning_rate": 1.5815873015873017e-06, "loss": 0.3322, "step": 15010 }, { "epoch": 42.01828571428572, "grad_norm": 0.036936238408088684, "learning_rate": 1.5752380952380952e-06, "loss": 0.0012, "step": 15020 }, { "epoch": 42.018857142857144, "grad_norm": 0.026531461626291275, "learning_rate": 1.568888888888889e-06, "loss": 0.6484, "step": 15030 }, { "epoch": 42.01942857142857, "grad_norm": 0.04399362578988075, "learning_rate": 1.5625396825396826e-06, "loss": 0.4781, "step": 15040 }, { "epoch": 42.02, "grad_norm": 0.03520611673593521, "learning_rate": 1.5561904761904763e-06, "loss": 0.6253, "step": 15050 }, { "epoch": 42.02, "eval_accuracy": 0.8148148148148148, "eval_loss": 1.1686880588531494, "eval_runtime": 126.3263, "eval_samples_per_second": 2.351, "eval_steps_per_second": 1.179, "step": 15050 }, { "epoch": 43.000571428571426, "grad_norm": 0.002064335159957409, "learning_rate": 1.5498412698412698e-06, "loss": 0.0016, "step": 15060 }, { "epoch": 43.00114285714286, "grad_norm": 833.5392456054688, "learning_rate": 1.5434920634920635e-06, "loss": 0.1628, "step": 15070 }, { "epoch": 43.001714285714286, "grad_norm": 0.02547420747578144, "learning_rate": 1.5371428571428574e-06, "loss": 0.4407, "step": 15080 }, { "epoch": 43.00228571428571, "grad_norm": 0.0005154515383765101, "learning_rate": 1.530793650793651e-06, "loss": 0.0059, "step": 15090 }, { "epoch": 43.002857142857145, "grad_norm": 0.019299926236271858, "learning_rate": 1.5244444444444446e-06, "loss": 0.1375, "step": 15100 }, { "epoch": 43.00342857142857, "grad_norm": 0.04400021582841873, "learning_rate": 1.5180952380952383e-06, "loss": 0.1776, "step": 15110 }, { "epoch": 43.004, "grad_norm": 0.37237548828125, "learning_rate": 1.511746031746032e-06, "loss": 0.0009, "step": 15120 }, { "epoch": 43.00457142857143, "grad_norm": 27.535037994384766, "learning_rate": 1.5053968253968255e-06, "loss": 0.302, "step": 15130 }, { "epoch": 43.00514285714286, "grad_norm": 0.016599314287304878, "learning_rate": 1.4990476190476191e-06, "loss": 0.1959, "step": 15140 }, { "epoch": 43.005714285714284, "grad_norm": 0.002258901484310627, "learning_rate": 1.4926984126984128e-06, "loss": 0.011, "step": 15150 }, { "epoch": 43.00628571428572, "grad_norm": 21.613733291625977, "learning_rate": 1.4863492063492065e-06, "loss": 0.2254, "step": 15160 }, { "epoch": 43.00685714285714, "grad_norm": 0.004936868790537119, "learning_rate": 1.48e-06, "loss": 0.0045, "step": 15170 }, { "epoch": 43.00742857142857, "grad_norm": 0.09461677819490433, "learning_rate": 1.4736507936507937e-06, "loss": 0.3332, "step": 15180 }, { "epoch": 43.008, "grad_norm": 0.001690115430392325, "learning_rate": 1.4673015873015874e-06, "loss": 0.6643, "step": 15190 }, { "epoch": 43.00857142857143, "grad_norm": 0.018435359001159668, "learning_rate": 1.4609523809523809e-06, "loss": 0.001, "step": 15200 }, { "epoch": 43.009142857142855, "grad_norm": 30.231048583984375, "learning_rate": 1.4546031746031746e-06, "loss": 0.2802, "step": 15210 }, { "epoch": 43.00971428571429, "grad_norm": 0.05070001631975174, "learning_rate": 1.4482539682539685e-06, "loss": 0.001, "step": 15220 }, { "epoch": 43.010285714285715, "grad_norm": 0.026784028857946396, "learning_rate": 1.4419047619047622e-06, "loss": 0.4937, "step": 15230 }, { "epoch": 43.01085714285714, "grad_norm": 0.002373154740780592, "learning_rate": 1.4355555555555557e-06, "loss": 0.0013, "step": 15240 }, { "epoch": 43.011428571428574, "grad_norm": 0.09937410801649094, "learning_rate": 1.4292063492063494e-06, "loss": 0.2796, "step": 15250 }, { "epoch": 43.012, "grad_norm": 0.026784038171172142, "learning_rate": 1.422857142857143e-06, "loss": 0.4131, "step": 15260 }, { "epoch": 43.01257142857143, "grad_norm": 0.0008246685029007494, "learning_rate": 1.4165079365079365e-06, "loss": 0.0036, "step": 15270 }, { "epoch": 43.01314285714286, "grad_norm": 0.013181746006011963, "learning_rate": 1.4101587301587302e-06, "loss": 0.0016, "step": 15280 }, { "epoch": 43.013714285714286, "grad_norm": 0.23996856808662415, "learning_rate": 1.403809523809524e-06, "loss": 0.2515, "step": 15290 }, { "epoch": 43.01428571428571, "grad_norm": 0.08559610694646835, "learning_rate": 1.3974603174603176e-06, "loss": 0.6153, "step": 15300 }, { "epoch": 43.014857142857146, "grad_norm": 0.03630613163113594, "learning_rate": 1.3911111111111111e-06, "loss": 0.2843, "step": 15310 }, { "epoch": 43.01542857142857, "grad_norm": 0.0006657622870989144, "learning_rate": 1.3847619047619048e-06, "loss": 0.0004, "step": 15320 }, { "epoch": 43.016, "grad_norm": 0.3452169895172119, "learning_rate": 1.3784126984126985e-06, "loss": 0.0034, "step": 15330 }, { "epoch": 43.01657142857143, "grad_norm": 2.725701332092285, "learning_rate": 1.372063492063492e-06, "loss": 0.1951, "step": 15340 }, { "epoch": 43.01714285714286, "grad_norm": 0.00891941599547863, "learning_rate": 1.3657142857142857e-06, "loss": 0.2923, "step": 15350 }, { "epoch": 43.017714285714284, "grad_norm": 0.0006919241859577596, "learning_rate": 1.3593650793650796e-06, "loss": 0.5058, "step": 15360 }, { "epoch": 43.01828571428572, "grad_norm": 0.04941738769412041, "learning_rate": 1.3530158730158733e-06, "loss": 0.0011, "step": 15370 }, { "epoch": 43.018857142857144, "grad_norm": 0.11309830099344254, "learning_rate": 1.3466666666666668e-06, "loss": 0.0771, "step": 15380 }, { "epoch": 43.01942857142857, "grad_norm": 0.019192902371287346, "learning_rate": 1.3403174603174605e-06, "loss": 0.4352, "step": 15390 }, { "epoch": 43.02, "grad_norm": 0.0010145236738026142, "learning_rate": 1.3339682539682542e-06, "loss": 0.0009, "step": 15400 }, { "epoch": 43.02, "eval_accuracy": 0.8417508417508418, "eval_loss": 1.0243438482284546, "eval_runtime": 126.4767, "eval_samples_per_second": 2.348, "eval_steps_per_second": 1.178, "step": 15400 }, { "epoch": 44.000571428571426, "grad_norm": 0.020503785461187363, "learning_rate": 1.3276190476190478e-06, "loss": 0.2084, "step": 15410 }, { "epoch": 44.00114285714286, "grad_norm": 0.03143817558884621, "learning_rate": 1.3212698412698413e-06, "loss": 0.0007, "step": 15420 }, { "epoch": 44.001714285714286, "grad_norm": 0.04019314423203468, "learning_rate": 1.314920634920635e-06, "loss": 0.0006, "step": 15430 }, { "epoch": 44.00228571428571, "grad_norm": 0.050954557955265045, "learning_rate": 1.3085714285714287e-06, "loss": 0.1784, "step": 15440 }, { "epoch": 44.002857142857145, "grad_norm": 0.03376142680644989, "learning_rate": 1.3022222222222222e-06, "loss": 0.001, "step": 15450 }, { "epoch": 44.00342857142857, "grad_norm": 0.012583643198013306, "learning_rate": 1.295873015873016e-06, "loss": 0.0007, "step": 15460 }, { "epoch": 44.004, "grad_norm": 0.03293371573090553, "learning_rate": 1.2895238095238096e-06, "loss": 0.0009, "step": 15470 }, { "epoch": 44.00457142857143, "grad_norm": 0.007353159133344889, "learning_rate": 1.2831746031746035e-06, "loss": 0.2371, "step": 15480 }, { "epoch": 44.00514285714286, "grad_norm": 0.1071564108133316, "learning_rate": 1.2768253968253968e-06, "loss": 0.001, "step": 15490 }, { "epoch": 44.005714285714284, "grad_norm": 0.03245130181312561, "learning_rate": 1.2704761904761907e-06, "loss": 0.0003, "step": 15500 }, { "epoch": 44.00628571428572, "grad_norm": 0.0014682890614494681, "learning_rate": 1.2641269841269844e-06, "loss": 0.0009, "step": 15510 }, { "epoch": 44.00685714285714, "grad_norm": 51.87623596191406, "learning_rate": 1.2577777777777779e-06, "loss": 0.1958, "step": 15520 }, { "epoch": 44.00742857142857, "grad_norm": 0.010897007770836353, "learning_rate": 1.2514285714285715e-06, "loss": 0.0008, "step": 15530 }, { "epoch": 44.008, "grad_norm": 0.36192965507507324, "learning_rate": 1.2450793650793652e-06, "loss": 0.7262, "step": 15540 }, { "epoch": 44.00857142857143, "grad_norm": 0.1297987550497055, "learning_rate": 1.2387301587301587e-06, "loss": 0.0005, "step": 15550 }, { "epoch": 44.009142857142855, "grad_norm": 0.002931043738499284, "learning_rate": 1.2323809523809526e-06, "loss": 0.0008, "step": 15560 }, { "epoch": 44.00971428571429, "grad_norm": 0.016998382285237312, "learning_rate": 1.2260317460317461e-06, "loss": 0.0034, "step": 15570 }, { "epoch": 44.010285714285715, "grad_norm": 0.013441096059978008, "learning_rate": 1.2196825396825398e-06, "loss": 0.0003, "step": 15580 }, { "epoch": 44.01085714285714, "grad_norm": 0.0027524903416633606, "learning_rate": 1.2133333333333335e-06, "loss": 0.001, "step": 15590 }, { "epoch": 44.011428571428574, "grad_norm": 0.028492752462625504, "learning_rate": 1.206984126984127e-06, "loss": 0.0005, "step": 15600 }, { "epoch": 44.012, "grad_norm": 0.011916798539459705, "learning_rate": 1.2006349206349207e-06, "loss": 0.342, "step": 15610 }, { "epoch": 44.01257142857143, "grad_norm": 0.0014567896723747253, "learning_rate": 1.1942857142857144e-06, "loss": 0.0004, "step": 15620 }, { "epoch": 44.01314285714286, "grad_norm": 0.029987970367074013, "learning_rate": 1.187936507936508e-06, "loss": 0.0193, "step": 15630 }, { "epoch": 44.013714285714286, "grad_norm": 0.08593633025884628, "learning_rate": 1.1815873015873018e-06, "loss": 0.0006, "step": 15640 }, { "epoch": 44.01428571428571, "grad_norm": 0.06858720630407333, "learning_rate": 1.1752380952380955e-06, "loss": 0.0006, "step": 15650 }, { "epoch": 44.014857142857146, "grad_norm": 0.16518321633338928, "learning_rate": 1.168888888888889e-06, "loss": 0.0037, "step": 15660 }, { "epoch": 44.01542857142857, "grad_norm": 0.006396264769136906, "learning_rate": 1.1625396825396826e-06, "loss": 0.288, "step": 15670 }, { "epoch": 44.016, "grad_norm": 0.018989071249961853, "learning_rate": 1.1561904761904763e-06, "loss": 0.2859, "step": 15680 }, { "epoch": 44.01657142857143, "grad_norm": 0.0026919255033135414, "learning_rate": 1.1498412698412698e-06, "loss": 0.0005, "step": 15690 }, { "epoch": 44.01714285714286, "grad_norm": 0.07339149713516235, "learning_rate": 1.1434920634920637e-06, "loss": 0.6642, "step": 15700 }, { "epoch": 44.017714285714284, "grad_norm": 0.07957630604505539, "learning_rate": 1.1371428571428572e-06, "loss": 0.0012, "step": 15710 }, { "epoch": 44.01828571428572, "grad_norm": 0.0010191805195063353, "learning_rate": 1.130793650793651e-06, "loss": 0.001, "step": 15720 }, { "epoch": 44.018857142857144, "grad_norm": 0.03606898710131645, "learning_rate": 1.1244444444444446e-06, "loss": 0.8374, "step": 15730 }, { "epoch": 44.01942857142857, "grad_norm": 0.13875551521778107, "learning_rate": 1.118095238095238e-06, "loss": 0.3241, "step": 15740 }, { "epoch": 44.02, "grad_norm": 0.023555980995297432, "learning_rate": 1.1117460317460318e-06, "loss": 0.0003, "step": 15750 }, { "epoch": 44.02, "eval_accuracy": 0.8316498316498316, "eval_loss": 1.0864453315734863, "eval_runtime": 126.4265, "eval_samples_per_second": 2.349, "eval_steps_per_second": 1.179, "step": 15750 }, { "epoch": 45.000571428571426, "grad_norm": 0.05806439369916916, "learning_rate": 1.1053968253968255e-06, "loss": 0.0007, "step": 15760 }, { "epoch": 45.00114285714286, "grad_norm": 0.00057839514920488, "learning_rate": 1.0990476190476192e-06, "loss": 0.1975, "step": 15770 }, { "epoch": 45.001714285714286, "grad_norm": 23.461442947387695, "learning_rate": 1.0926984126984129e-06, "loss": 0.2195, "step": 15780 }, { "epoch": 45.00228571428571, "grad_norm": 0.04696512967348099, "learning_rate": 1.0863492063492066e-06, "loss": 0.0007, "step": 15790 }, { "epoch": 45.002857142857145, "grad_norm": 0.008673385716974735, "learning_rate": 1.08e-06, "loss": 0.4171, "step": 15800 }, { "epoch": 45.00342857142857, "grad_norm": 0.005190796218812466, "learning_rate": 1.0736507936507937e-06, "loss": 0.2958, "step": 15810 }, { "epoch": 45.004, "grad_norm": 0.0010960869258269668, "learning_rate": 1.0673015873015874e-06, "loss": 0.0008, "step": 15820 }, { "epoch": 45.00457142857143, "grad_norm": 0.04663942754268646, "learning_rate": 1.060952380952381e-06, "loss": 0.1735, "step": 15830 }, { "epoch": 45.00514285714286, "grad_norm": 0.0008961122948676348, "learning_rate": 1.0546031746031748e-06, "loss": 0.2856, "step": 15840 }, { "epoch": 45.005714285714284, "grad_norm": 0.05738474056124687, "learning_rate": 1.0482539682539683e-06, "loss": 0.0014, "step": 15850 }, { "epoch": 45.00628571428572, "grad_norm": 0.04210129752755165, "learning_rate": 1.041904761904762e-06, "loss": 0.0007, "step": 15860 }, { "epoch": 45.00685714285714, "grad_norm": 0.003992922138422728, "learning_rate": 1.0355555555555557e-06, "loss": 0.6006, "step": 15870 }, { "epoch": 45.00742857142857, "grad_norm": 0.014494026079773903, "learning_rate": 1.0292063492063494e-06, "loss": 0.0008, "step": 15880 }, { "epoch": 45.008, "grad_norm": 0.09102319180965424, "learning_rate": 1.0228571428571429e-06, "loss": 0.1237, "step": 15890 }, { "epoch": 45.00857142857143, "grad_norm": 0.02508743479847908, "learning_rate": 1.0165079365079366e-06, "loss": 0.2364, "step": 15900 }, { "epoch": 45.009142857142855, "grad_norm": 0.002251436933875084, "learning_rate": 1.0101587301587303e-06, "loss": 0.4723, "step": 15910 }, { "epoch": 45.00971428571429, "grad_norm": 0.007445584516972303, "learning_rate": 1.003809523809524e-06, "loss": 0.2701, "step": 15920 }, { "epoch": 45.010285714285715, "grad_norm": 0.03148816525936127, "learning_rate": 9.974603174603176e-07, "loss": 0.001, "step": 15930 }, { "epoch": 45.01085714285714, "grad_norm": 0.0006346903392113745, "learning_rate": 9.911111111111111e-07, "loss": 0.2117, "step": 15940 }, { "epoch": 45.011428571428574, "grad_norm": 0.0017558577237650752, "learning_rate": 9.847619047619048e-07, "loss": 0.0015, "step": 15950 }, { "epoch": 45.012, "grad_norm": 0.029404617846012115, "learning_rate": 9.784126984126985e-07, "loss": 0.3177, "step": 15960 }, { "epoch": 45.01257142857143, "grad_norm": 0.008277475833892822, "learning_rate": 9.720634920634922e-07, "loss": 0.0016, "step": 15970 }, { "epoch": 45.01314285714286, "grad_norm": 0.002906983019784093, "learning_rate": 9.657142857142857e-07, "loss": 0.0123, "step": 15980 }, { "epoch": 45.013714285714286, "grad_norm": 0.1758769452571869, "learning_rate": 9.593650793650794e-07, "loss": 0.181, "step": 15990 }, { "epoch": 45.01428571428571, "grad_norm": 106.2284927368164, "learning_rate": 9.530158730158731e-07, "loss": 0.2344, "step": 16000 }, { "epoch": 45.014857142857146, "grad_norm": 0.5874964594841003, "learning_rate": 9.466666666666667e-07, "loss": 0.0037, "step": 16010 }, { "epoch": 45.01542857142857, "grad_norm": 26.586414337158203, "learning_rate": 9.403174603174605e-07, "loss": 0.2546, "step": 16020 }, { "epoch": 45.016, "grad_norm": 0.08056703954935074, "learning_rate": 9.339682539682541e-07, "loss": 0.3617, "step": 16030 }, { "epoch": 45.01657142857143, "grad_norm": 0.015912292525172234, "learning_rate": 9.276190476190478e-07, "loss": 0.0014, "step": 16040 }, { "epoch": 45.01714285714286, "grad_norm": 0.1449512094259262, "learning_rate": 9.212698412698413e-07, "loss": 0.0525, "step": 16050 }, { "epoch": 45.017714285714284, "grad_norm": 0.06955932825803757, "learning_rate": 9.149206349206349e-07, "loss": 0.0011, "step": 16060 }, { "epoch": 45.01828571428572, "grad_norm": 0.002916391473263502, "learning_rate": 9.085714285714286e-07, "loss": 0.0006, "step": 16070 }, { "epoch": 45.018857142857144, "grad_norm": 0.006070619914680719, "learning_rate": 9.022222222222222e-07, "loss": 0.0072, "step": 16080 }, { "epoch": 45.01942857142857, "grad_norm": 461.6020202636719, "learning_rate": 8.95873015873016e-07, "loss": 0.0531, "step": 16090 }, { "epoch": 45.02, "grad_norm": 0.0010222607525065541, "learning_rate": 8.895238095238096e-07, "loss": 0.291, "step": 16100 }, { "epoch": 45.02, "eval_accuracy": 0.8383838383838383, "eval_loss": 1.0647377967834473, "eval_runtime": 126.4165, "eval_samples_per_second": 2.349, "eval_steps_per_second": 1.179, "step": 16100 }, { "epoch": 46.000571428571426, "grad_norm": 0.047850172966718674, "learning_rate": 8.831746031746033e-07, "loss": 0.0005, "step": 16110 }, { "epoch": 46.00114285714286, "grad_norm": 0.11881715804338455, "learning_rate": 8.768253968253969e-07, "loss": 0.0008, "step": 16120 }, { "epoch": 46.001714285714286, "grad_norm": 1.4675657749176025, "learning_rate": 8.704761904761906e-07, "loss": 0.0016, "step": 16130 }, { "epoch": 46.00228571428571, "grad_norm": 0.005048373248428106, "learning_rate": 8.641269841269842e-07, "loss": 0.0005, "step": 16140 }, { "epoch": 46.002857142857145, "grad_norm": 0.008005023933947086, "learning_rate": 8.577777777777778e-07, "loss": 0.6526, "step": 16150 }, { "epoch": 46.00342857142857, "grad_norm": 0.24868465960025787, "learning_rate": 8.514285714285716e-07, "loss": 0.0015, "step": 16160 }, { "epoch": 46.004, "grad_norm": 0.000758437963668257, "learning_rate": 8.450793650793652e-07, "loss": 0.2208, "step": 16170 }, { "epoch": 46.00457142857143, "grad_norm": 0.001206890563480556, "learning_rate": 8.387301587301588e-07, "loss": 0.2555, "step": 16180 }, { "epoch": 46.00514285714286, "grad_norm": 0.08712995797395706, "learning_rate": 8.323809523809524e-07, "loss": 0.2027, "step": 16190 }, { "epoch": 46.005714285714284, "grad_norm": 0.021634329110383987, "learning_rate": 8.260317460317461e-07, "loss": 0.0004, "step": 16200 }, { "epoch": 46.00628571428572, "grad_norm": 0.017216574400663376, "learning_rate": 8.196825396825397e-07, "loss": 0.2594, "step": 16210 }, { "epoch": 46.00685714285714, "grad_norm": 0.03562033176422119, "learning_rate": 8.133333333333333e-07, "loss": 0.0008, "step": 16220 }, { "epoch": 46.00742857142857, "grad_norm": 0.06330721825361252, "learning_rate": 8.069841269841271e-07, "loss": 0.3, "step": 16230 }, { "epoch": 46.008, "grad_norm": 0.006224165204912424, "learning_rate": 8.006349206349207e-07, "loss": 0.0629, "step": 16240 }, { "epoch": 46.00857142857143, "grad_norm": 34.4676513671875, "learning_rate": 7.942857142857144e-07, "loss": 0.8066, "step": 16250 }, { "epoch": 46.009142857142855, "grad_norm": 0.00887818168848753, "learning_rate": 7.87936507936508e-07, "loss": 0.0873, "step": 16260 }, { "epoch": 46.00971428571429, "grad_norm": 0.02603054791688919, "learning_rate": 7.815873015873017e-07, "loss": 0.0004, "step": 16270 }, { "epoch": 46.010285714285715, "grad_norm": 0.11655969172716141, "learning_rate": 7.752380952380953e-07, "loss": 0.0017, "step": 16280 }, { "epoch": 46.01085714285714, "grad_norm": 0.09002801775932312, "learning_rate": 7.688888888888891e-07, "loss": 0.2952, "step": 16290 }, { "epoch": 46.011428571428574, "grad_norm": 0.06367355585098267, "learning_rate": 7.625396825396827e-07, "loss": 0.2261, "step": 16300 }, { "epoch": 46.012, "grad_norm": 0.002869822084903717, "learning_rate": 7.561904761904762e-07, "loss": 0.004, "step": 16310 }, { "epoch": 46.01257142857143, "grad_norm": 0.01801781728863716, "learning_rate": 7.498412698412699e-07, "loss": 0.0004, "step": 16320 }, { "epoch": 46.01314285714286, "grad_norm": 0.034441664814949036, "learning_rate": 7.434920634920635e-07, "loss": 0.3214, "step": 16330 }, { "epoch": 46.013714285714286, "grad_norm": 0.030220109969377518, "learning_rate": 7.371428571428572e-07, "loss": 0.1798, "step": 16340 }, { "epoch": 46.01428571428571, "grad_norm": 0.09933434426784515, "learning_rate": 7.307936507936508e-07, "loss": 0.0004, "step": 16350 }, { "epoch": 46.014857142857146, "grad_norm": 0.06845889985561371, "learning_rate": 7.244444444444446e-07, "loss": 0.3171, "step": 16360 }, { "epoch": 46.01542857142857, "grad_norm": 0.0007817966397851706, "learning_rate": 7.180952380952382e-07, "loss": 0.3191, "step": 16370 }, { "epoch": 46.016, "grad_norm": 19.301998138427734, "learning_rate": 7.117460317460318e-07, "loss": 0.4349, "step": 16380 }, { "epoch": 46.01657142857143, "grad_norm": 0.012864407151937485, "learning_rate": 7.053968253968255e-07, "loss": 0.2711, "step": 16390 }, { "epoch": 46.01714285714286, "grad_norm": 0.003735810751095414, "learning_rate": 6.990476190476191e-07, "loss": 0.2612, "step": 16400 }, { "epoch": 46.017714285714284, "grad_norm": 0.7684434652328491, "learning_rate": 6.926984126984128e-07, "loss": 0.2843, "step": 16410 }, { "epoch": 46.01828571428572, "grad_norm": 0.034190673381090164, "learning_rate": 6.863492063492064e-07, "loss": 0.3447, "step": 16420 }, { "epoch": 46.018857142857144, "grad_norm": 0.02056187205016613, "learning_rate": 6.800000000000001e-07, "loss": 0.3328, "step": 16430 }, { "epoch": 46.01942857142857, "grad_norm": 0.0970858708024025, "learning_rate": 6.736507936507936e-07, "loss": 0.753, "step": 16440 }, { "epoch": 46.02, "grad_norm": 0.03131880611181259, "learning_rate": 6.673015873015874e-07, "loss": 0.4962, "step": 16450 }, { "epoch": 46.02, "eval_accuracy": 0.8316498316498316, "eval_loss": 1.116576910018921, "eval_runtime": 126.7393, "eval_samples_per_second": 2.343, "eval_steps_per_second": 1.176, "step": 16450 }, { "epoch": 47.000571428571426, "grad_norm": 0.04706917330622673, "learning_rate": 6.60952380952381e-07, "loss": 0.4978, "step": 16460 }, { "epoch": 47.00114285714286, "grad_norm": 0.0006357289967127144, "learning_rate": 6.546031746031746e-07, "loss": 0.3942, "step": 16470 }, { "epoch": 47.001714285714286, "grad_norm": 0.02852693945169449, "learning_rate": 6.482539682539683e-07, "loss": 0.0281, "step": 16480 }, { "epoch": 47.00228571428571, "grad_norm": 0.01799050346016884, "learning_rate": 6.419047619047619e-07, "loss": 0.3552, "step": 16490 }, { "epoch": 47.002857142857145, "grad_norm": 0.0006481860764324665, "learning_rate": 6.355555555555556e-07, "loss": 0.001, "step": 16500 }, { "epoch": 47.00342857142857, "grad_norm": 0.20090606808662415, "learning_rate": 6.292063492063492e-07, "loss": 0.2303, "step": 16510 }, { "epoch": 47.004, "grad_norm": 0.0003543874772731215, "learning_rate": 6.228571428571429e-07, "loss": 0.1126, "step": 16520 }, { "epoch": 47.00457142857143, "grad_norm": 0.0005535692907869816, "learning_rate": 6.165079365079366e-07, "loss": 0.0008, "step": 16530 }, { "epoch": 47.00514285714286, "grad_norm": 0.0009550791000947356, "learning_rate": 6.101587301587302e-07, "loss": 0.0005, "step": 16540 }, { "epoch": 47.005714285714284, "grad_norm": 0.021146543323993683, "learning_rate": 6.038095238095239e-07, "loss": 0.001, "step": 16550 }, { "epoch": 47.00628571428572, "grad_norm": 0.007506783120334148, "learning_rate": 5.974603174603176e-07, "loss": 0.3273, "step": 16560 }, { "epoch": 47.00685714285714, "grad_norm": 0.008311262354254723, "learning_rate": 5.911111111111111e-07, "loss": 0.0786, "step": 16570 }, { "epoch": 47.00742857142857, "grad_norm": 0.7662389278411865, "learning_rate": 5.847619047619047e-07, "loss": 0.0013, "step": 16580 }, { "epoch": 47.008, "grad_norm": 0.0042674667201936245, "learning_rate": 5.784126984126984e-07, "loss": 0.0208, "step": 16590 }, { "epoch": 47.00857142857143, "grad_norm": 0.0019062272040173411, "learning_rate": 5.720634920634921e-07, "loss": 0.3439, "step": 16600 }, { "epoch": 47.009142857142855, "grad_norm": 0.12979654967784882, "learning_rate": 5.657142857142857e-07, "loss": 0.0246, "step": 16610 }, { "epoch": 47.00971428571429, "grad_norm": 196.23326110839844, "learning_rate": 5.593650793650794e-07, "loss": 1.4075, "step": 16620 }, { "epoch": 47.010285714285715, "grad_norm": 0.10944508761167526, "learning_rate": 5.530158730158731e-07, "loss": 0.001, "step": 16630 }, { "epoch": 47.01085714285714, "grad_norm": 0.17711256444454193, "learning_rate": 5.466666666666667e-07, "loss": 0.0017, "step": 16640 }, { "epoch": 47.011428571428574, "grad_norm": 0.026470154523849487, "learning_rate": 5.403174603174604e-07, "loss": 0.5992, "step": 16650 }, { "epoch": 47.012, "grad_norm": 0.04885806888341904, "learning_rate": 5.33968253968254e-07, "loss": 0.0008, "step": 16660 }, { "epoch": 47.01257142857143, "grad_norm": 0.1748293787240982, "learning_rate": 5.276190476190477e-07, "loss": 0.0011, "step": 16670 }, { "epoch": 47.01314285714286, "grad_norm": 0.028242330998182297, "learning_rate": 5.212698412698413e-07, "loss": 0.2365, "step": 16680 }, { "epoch": 47.013714285714286, "grad_norm": 499.1086120605469, "learning_rate": 5.14920634920635e-07, "loss": 0.2298, "step": 16690 }, { "epoch": 47.01428571428571, "grad_norm": 0.029000846669077873, "learning_rate": 5.085714285714286e-07, "loss": 0.0006, "step": 16700 }, { "epoch": 47.014857142857146, "grad_norm": 0.018016472458839417, "learning_rate": 5.022222222222222e-07, "loss": 0.1712, "step": 16710 }, { "epoch": 47.01542857142857, "grad_norm": 0.9705002903938293, "learning_rate": 4.958730158730159e-07, "loss": 0.1886, "step": 16720 }, { "epoch": 47.016, "grad_norm": 0.011911354027688503, "learning_rate": 4.895238095238096e-07, "loss": 0.0004, "step": 16730 }, { "epoch": 47.01657142857143, "grad_norm": 0.008706871420145035, "learning_rate": 4.831746031746032e-07, "loss": 0.0008, "step": 16740 }, { "epoch": 47.01714285714286, "grad_norm": 0.038937151432037354, "learning_rate": 4.7682539682539686e-07, "loss": 0.0004, "step": 16750 }, { "epoch": 47.017714285714284, "grad_norm": 0.006863133050501347, "learning_rate": 4.704761904761905e-07, "loss": 0.1916, "step": 16760 }, { "epoch": 47.01828571428572, "grad_norm": 34.81043243408203, "learning_rate": 4.6412698412698414e-07, "loss": 0.1632, "step": 16770 }, { "epoch": 47.018857142857144, "grad_norm": 0.004815808031708002, "learning_rate": 4.5777777777777784e-07, "loss": 0.3468, "step": 16780 }, { "epoch": 47.01942857142857, "grad_norm": 0.05895291268825531, "learning_rate": 4.514285714285715e-07, "loss": 0.2842, "step": 16790 }, { "epoch": 47.02, "grad_norm": 0.0008096261299215257, "learning_rate": 4.450793650793651e-07, "loss": 0.0919, "step": 16800 }, { "epoch": 47.02, "eval_accuracy": 0.8282828282828283, "eval_loss": 1.1209441423416138, "eval_runtime": 126.6719, "eval_samples_per_second": 2.345, "eval_steps_per_second": 1.176, "step": 16800 }, { "epoch": 48.000571428571426, "grad_norm": 174.62635803222656, "learning_rate": 4.387301587301588e-07, "loss": 0.008, "step": 16810 }, { "epoch": 48.00114285714286, "grad_norm": 0.04038200154900551, "learning_rate": 4.323809523809524e-07, "loss": 0.2735, "step": 16820 }, { "epoch": 48.001714285714286, "grad_norm": 0.012884361669421196, "learning_rate": 4.2603174603174605e-07, "loss": 0.2273, "step": 16830 }, { "epoch": 48.00228571428571, "grad_norm": 0.11547985672950745, "learning_rate": 4.196825396825397e-07, "loss": 0.2345, "step": 16840 }, { "epoch": 48.002857142857145, "grad_norm": 0.34670960903167725, "learning_rate": 4.133333333333334e-07, "loss": 0.0018, "step": 16850 }, { "epoch": 48.00342857142857, "grad_norm": 0.006787777412682772, "learning_rate": 4.06984126984127e-07, "loss": 0.1207, "step": 16860 }, { "epoch": 48.004, "grad_norm": 0.0013283508596941829, "learning_rate": 4.0063492063492067e-07, "loss": 0.0036, "step": 16870 }, { "epoch": 48.00457142857143, "grad_norm": 0.0029995145741850138, "learning_rate": 3.9428571428571436e-07, "loss": 0.0003, "step": 16880 }, { "epoch": 48.00514285714286, "grad_norm": 0.09618420153856277, "learning_rate": 3.87936507936508e-07, "loss": 0.0005, "step": 16890 }, { "epoch": 48.005714285714284, "grad_norm": 0.033103521913290024, "learning_rate": 3.815873015873016e-07, "loss": 0.0123, "step": 16900 }, { "epoch": 48.00628571428572, "grad_norm": 0.0023510020691901445, "learning_rate": 3.7523809523809523e-07, "loss": 0.0004, "step": 16910 }, { "epoch": 48.00685714285714, "grad_norm": 0.02988676354289055, "learning_rate": 3.6888888888888893e-07, "loss": 0.4017, "step": 16920 }, { "epoch": 48.00742857142857, "grad_norm": 0.014092681929469109, "learning_rate": 3.6253968253968257e-07, "loss": 0.295, "step": 16930 }, { "epoch": 48.008, "grad_norm": 0.014224675484001637, "learning_rate": 3.561904761904762e-07, "loss": 0.3255, "step": 16940 }, { "epoch": 48.00857142857143, "grad_norm": 0.09868843853473663, "learning_rate": 3.498412698412699e-07, "loss": 0.2179, "step": 16950 }, { "epoch": 48.009142857142855, "grad_norm": 0.0006300527020357549, "learning_rate": 3.4349206349206355e-07, "loss": 0.094, "step": 16960 }, { "epoch": 48.00971428571429, "grad_norm": 0.0006618179613724351, "learning_rate": 3.371428571428572e-07, "loss": 0.0007, "step": 16970 }, { "epoch": 48.010285714285715, "grad_norm": 0.12803080677986145, "learning_rate": 3.307936507936508e-07, "loss": 0.3201, "step": 16980 }, { "epoch": 48.01085714285714, "grad_norm": 0.033108729869127274, "learning_rate": 3.2444444444444447e-07, "loss": 0.2339, "step": 16990 }, { "epoch": 48.011428571428574, "grad_norm": 0.10151008516550064, "learning_rate": 3.180952380952381e-07, "loss": 0.0004, "step": 17000 }, { "epoch": 48.012, "grad_norm": 0.015769364312291145, "learning_rate": 3.1174603174603176e-07, "loss": 0.3187, "step": 17010 }, { "epoch": 48.01257142857143, "grad_norm": 0.009230137802660465, "learning_rate": 3.0539682539682545e-07, "loss": 0.4486, "step": 17020 }, { "epoch": 48.01314285714286, "grad_norm": 0.0004300758882891387, "learning_rate": 2.990476190476191e-07, "loss": 0.049, "step": 17030 }, { "epoch": 48.013714285714286, "grad_norm": 0.006301034241914749, "learning_rate": 2.9269841269841274e-07, "loss": 0.0007, "step": 17040 }, { "epoch": 48.01428571428571, "grad_norm": 0.009746396914124489, "learning_rate": 2.863492063492064e-07, "loss": 0.5487, "step": 17050 }, { "epoch": 48.014857142857146, "grad_norm": 0.009282475337386131, "learning_rate": 2.8e-07, "loss": 0.0008, "step": 17060 }, { "epoch": 48.01542857142857, "grad_norm": 0.002887505106627941, "learning_rate": 2.736507936507937e-07, "loss": 0.0004, "step": 17070 }, { "epoch": 48.016, "grad_norm": 0.025261027738451958, "learning_rate": 2.673015873015873e-07, "loss": 0.0006, "step": 17080 }, { "epoch": 48.01657142857143, "grad_norm": 0.016581503674387932, "learning_rate": 2.6095238095238094e-07, "loss": 0.5268, "step": 17090 }, { "epoch": 48.01714285714286, "grad_norm": 0.03690231218934059, "learning_rate": 2.5460317460317464e-07, "loss": 0.0583, "step": 17100 }, { "epoch": 48.017714285714284, "grad_norm": 0.16912665963172913, "learning_rate": 2.482539682539683e-07, "loss": 0.0008, "step": 17110 }, { "epoch": 48.01828571428572, "grad_norm": 0.01130374800413847, "learning_rate": 2.419047619047619e-07, "loss": 0.0004, "step": 17120 }, { "epoch": 48.018857142857144, "grad_norm": 0.3978990316390991, "learning_rate": 2.3555555555555556e-07, "loss": 0.001, "step": 17130 }, { "epoch": 48.01942857142857, "grad_norm": 0.003729065880179405, "learning_rate": 2.2920634920634923e-07, "loss": 0.6757, "step": 17140 }, { "epoch": 48.02, "grad_norm": 0.0032000578939914703, "learning_rate": 2.228571428571429e-07, "loss": 0.0007, "step": 17150 }, { "epoch": 48.02, "eval_accuracy": 0.8316498316498316, "eval_loss": 1.1259746551513672, "eval_runtime": 126.1783, "eval_samples_per_second": 2.354, "eval_steps_per_second": 1.181, "step": 17150 }, { "epoch": 49.000571428571426, "grad_norm": 0.021503791213035583, "learning_rate": 2.1650793650793652e-07, "loss": 0.188, "step": 17160 }, { "epoch": 49.00114285714286, "grad_norm": 0.009531227871775627, "learning_rate": 2.1015873015873019e-07, "loss": 0.0006, "step": 17170 }, { "epoch": 49.001714285714286, "grad_norm": 0.02791694551706314, "learning_rate": 2.0380952380952383e-07, "loss": 0.0036, "step": 17180 }, { "epoch": 49.00228571428571, "grad_norm": 0.014732033014297485, "learning_rate": 1.974603174603175e-07, "loss": 0.4474, "step": 17190 }, { "epoch": 49.002857142857145, "grad_norm": 0.0033705858513712883, "learning_rate": 1.911111111111111e-07, "loss": 0.001, "step": 17200 }, { "epoch": 49.00342857142857, "grad_norm": 0.076132632791996, "learning_rate": 1.8476190476190478e-07, "loss": 0.5201, "step": 17210 }, { "epoch": 49.004, "grad_norm": 0.008379080332815647, "learning_rate": 1.7841269841269842e-07, "loss": 0.0005, "step": 17220 }, { "epoch": 49.00457142857143, "grad_norm": 0.014781179837882519, "learning_rate": 1.720634920634921e-07, "loss": 0.2197, "step": 17230 }, { "epoch": 49.00514285714286, "grad_norm": 0.03410143777728081, "learning_rate": 1.657142857142857e-07, "loss": 0.001, "step": 17240 }, { "epoch": 49.005714285714284, "grad_norm": 0.0006565088406205177, "learning_rate": 1.5936507936507937e-07, "loss": 0.0015, "step": 17250 }, { "epoch": 49.00628571428572, "grad_norm": 0.014354717917740345, "learning_rate": 1.5301587301587304e-07, "loss": 0.5642, "step": 17260 }, { "epoch": 49.00685714285714, "grad_norm": 0.09152337163686752, "learning_rate": 1.4666666666666668e-07, "loss": 0.276, "step": 17270 }, { "epoch": 49.00742857142857, "grad_norm": 0.03440091758966446, "learning_rate": 1.4031746031746032e-07, "loss": 0.0006, "step": 17280 }, { "epoch": 49.008, "grad_norm": 0.004568720702081919, "learning_rate": 1.3396825396825397e-07, "loss": 0.0073, "step": 17290 }, { "epoch": 49.00857142857143, "grad_norm": 0.021223975345492363, "learning_rate": 1.2761904761904763e-07, "loss": 0.0004, "step": 17300 }, { "epoch": 49.009142857142855, "grad_norm": 0.007963555864989758, "learning_rate": 1.2126984126984128e-07, "loss": 0.002, "step": 17310 }, { "epoch": 49.00971428571429, "grad_norm": 0.0007649322506040335, "learning_rate": 1.1492063492063493e-07, "loss": 0.3511, "step": 17320 }, { "epoch": 49.010285714285715, "grad_norm": 0.057293448597192764, "learning_rate": 1.0857142857142857e-07, "loss": 0.3396, "step": 17330 }, { "epoch": 49.01085714285714, "grad_norm": 0.006769323721528053, "learning_rate": 1.0222222222222224e-07, "loss": 0.0022, "step": 17340 }, { "epoch": 49.011428571428574, "grad_norm": 0.0010334831895306706, "learning_rate": 9.587301587301588e-08, "loss": 0.2595, "step": 17350 }, { "epoch": 49.012, "grad_norm": 0.015894345939159393, "learning_rate": 8.952380952380954e-08, "loss": 0.2016, "step": 17360 }, { "epoch": 49.01257142857143, "grad_norm": 0.013702361844480038, "learning_rate": 8.317460317460318e-08, "loss": 0.0008, "step": 17370 }, { "epoch": 49.01314285714286, "grad_norm": 0.007162360940128565, "learning_rate": 7.682539682539682e-08, "loss": 0.2925, "step": 17380 }, { "epoch": 49.013714285714286, "grad_norm": 0.009392665699124336, "learning_rate": 7.047619047619048e-08, "loss": 0.007, "step": 17390 }, { "epoch": 49.01428571428571, "grad_norm": 5.373441219329834, "learning_rate": 6.412698412698413e-08, "loss": 0.2043, "step": 17400 }, { "epoch": 49.014857142857146, "grad_norm": 0.020008977502584457, "learning_rate": 5.777777777777778e-08, "loss": 0.0742, "step": 17410 }, { "epoch": 49.01542857142857, "grad_norm": 0.04810080677270889, "learning_rate": 5.142857142857143e-08, "loss": 0.0005, "step": 17420 }, { "epoch": 49.016, "grad_norm": 0.010244383476674557, "learning_rate": 4.507936507936508e-08, "loss": 0.4077, "step": 17430 }, { "epoch": 49.01657142857143, "grad_norm": 0.006721619050949812, "learning_rate": 3.873015873015873e-08, "loss": 0.8587, "step": 17440 }, { "epoch": 49.01714285714286, "grad_norm": 0.0023023684043437243, "learning_rate": 3.238095238095239e-08, "loss": 0.0006, "step": 17450 }, { "epoch": 49.017714285714284, "grad_norm": 0.19224895536899567, "learning_rate": 2.6031746031746037e-08, "loss": 0.0009, "step": 17460 }, { "epoch": 49.01828571428572, "grad_norm": 0.04239881783723831, "learning_rate": 1.9682539682539685e-08, "loss": 0.0007, "step": 17470 }, { "epoch": 49.018857142857144, "grad_norm": 0.0015393183566629887, "learning_rate": 1.3333333333333334e-08, "loss": 0.2342, "step": 17480 }, { "epoch": 49.01942857142857, "grad_norm": 0.05108032375574112, "learning_rate": 6.9841269841269845e-09, "loss": 0.187, "step": 17490 }, { "epoch": 49.02, "grad_norm": 0.0034502753987908363, "learning_rate": 6.34920634920635e-10, "loss": 0.0008, "step": 17500 }, { "epoch": 49.02, "eval_accuracy": 0.835016835016835, "eval_loss": 1.1138967275619507, "eval_runtime": 126.4744, "eval_samples_per_second": 2.348, "eval_steps_per_second": 1.178, "step": 17500 }, { "epoch": 49.02, "step": 17500, "total_flos": 1.5368592103538688e+20, "train_loss": 0.39184775381689624, "train_runtime": 51190.6894, "train_samples_per_second": 0.684, "train_steps_per_second": 0.342 }, { "epoch": 49.02, "eval_accuracy": 0.8585858585858586, "eval_loss": 0.7987341284751892, "eval_runtime": 125.2938, "eval_samples_per_second": 2.37, "eval_steps_per_second": 1.189, "step": 17500 }, { "epoch": 49.02, "eval_accuracy": 0.8585858585858586, "eval_loss": 0.798734188079834, "eval_runtime": 125.2382, "eval_samples_per_second": 2.371, "eval_steps_per_second": 1.19, "step": 17500 } ], "logging_steps": 10, "max_steps": 17500, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5368592103538688e+20, "train_batch_size": 2, "trial_name": null, "trial_params": null }