{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9278410299035432, "eval_steps": 500, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015464017165059054, "grad_norm": 4352.0, "learning_rate": 1.9972307692307693e-05, "loss": 10.9174, "step": 10 }, { "epoch": 0.0030928034330118107, "grad_norm": 71168.0, "learning_rate": 1.9941538461538464e-05, "loss": 11.9649, "step": 20 }, { "epoch": 0.004639205149517716, "grad_norm": 190.0, "learning_rate": 1.9910769230769232e-05, "loss": 5.27, "step": 30 }, { "epoch": 0.0061856068660236215, "grad_norm": 16.125, "learning_rate": 1.9880000000000003e-05, "loss": 0.3647, "step": 40 }, { "epoch": 0.007732008582529527, "grad_norm": 3.890625, "learning_rate": 1.984923076923077e-05, "loss": 0.3099, "step": 50 }, { "epoch": 0.009278410299035433, "grad_norm": 1.8828125, "learning_rate": 1.9818461538461538e-05, "loss": 0.2842, "step": 60 }, { "epoch": 0.010824812015541337, "grad_norm": 1.6953125, "learning_rate": 1.978769230769231e-05, "loss": 0.2943, "step": 70 }, { "epoch": 0.012371213732047243, "grad_norm": 1.1640625, "learning_rate": 1.9756923076923077e-05, "loss": 0.3539, "step": 80 }, { "epoch": 0.013917615448553147, "grad_norm": 1.046875, "learning_rate": 1.9726153846153848e-05, "loss": 0.259, "step": 90 }, { "epoch": 0.015464017165059053, "grad_norm": 1.203125, "learning_rate": 1.9695384615384616e-05, "loss": 0.2741, "step": 100 }, { "epoch": 0.01701041888156496, "grad_norm": 1.0390625, "learning_rate": 1.9664615384615387e-05, "loss": 0.281, "step": 110 }, { "epoch": 0.018556820598070865, "grad_norm": 0.921875, "learning_rate": 1.9633846153846155e-05, "loss": 0.2586, "step": 120 }, { "epoch": 0.020103222314576768, "grad_norm": 0.921875, "learning_rate": 1.9603076923076926e-05, "loss": 0.2776, "step": 130 }, { "epoch": 0.021649624031082674, "grad_norm": 1.140625, "learning_rate": 1.9572307692307693e-05, "loss": 0.3186, "step": 140 }, { "epoch": 0.02319602574758858, "grad_norm": 0.85546875, "learning_rate": 1.9541538461538464e-05, "loss": 0.3315, "step": 150 }, { "epoch": 0.024742427464094486, "grad_norm": 1.1171875, "learning_rate": 1.9510769230769232e-05, "loss": 0.257, "step": 160 }, { "epoch": 0.026288829180600392, "grad_norm": 1.1640625, "learning_rate": 1.948e-05, "loss": 0.2592, "step": 170 }, { "epoch": 0.027835230897106295, "grad_norm": 0.91015625, "learning_rate": 1.944923076923077e-05, "loss": 0.2703, "step": 180 }, { "epoch": 0.0293816326136122, "grad_norm": 0.94140625, "learning_rate": 1.941846153846154e-05, "loss": 0.2547, "step": 190 }, { "epoch": 0.030928034330118107, "grad_norm": 1.078125, "learning_rate": 1.938769230769231e-05, "loss": 0.3182, "step": 200 }, { "epoch": 0.03247443604662401, "grad_norm": 0.94140625, "learning_rate": 1.9356923076923077e-05, "loss": 0.3005, "step": 210 }, { "epoch": 0.03402083776312992, "grad_norm": 1.09375, "learning_rate": 1.932615384615385e-05, "loss": 0.2693, "step": 220 }, { "epoch": 0.035567239479635825, "grad_norm": 0.95703125, "learning_rate": 1.929538461538462e-05, "loss": 0.2925, "step": 230 }, { "epoch": 0.03711364119614173, "grad_norm": 1.078125, "learning_rate": 1.9264615384615387e-05, "loss": 0.3165, "step": 240 }, { "epoch": 0.03866004291264763, "grad_norm": 0.82421875, "learning_rate": 1.9233846153846155e-05, "loss": 0.2606, "step": 250 }, { "epoch": 0.040206444629153536, "grad_norm": 1.109375, "learning_rate": 1.9203076923076923e-05, "loss": 0.324, "step": 260 }, { "epoch": 0.04175284634565944, "grad_norm": 1.0390625, "learning_rate": 1.9172307692307694e-05, "loss": 0.2787, "step": 270 }, { "epoch": 0.04329924806216535, "grad_norm": 1.046875, "learning_rate": 1.914153846153846e-05, "loss": 0.3092, "step": 280 }, { "epoch": 0.044845649778671254, "grad_norm": 0.875, "learning_rate": 1.9110769230769233e-05, "loss": 0.2831, "step": 290 }, { "epoch": 0.04639205149517716, "grad_norm": 1.0390625, "learning_rate": 1.908e-05, "loss": 0.282, "step": 300 }, { "epoch": 0.047938453211683066, "grad_norm": 1.8125, "learning_rate": 1.904923076923077e-05, "loss": 0.3863, "step": 310 }, { "epoch": 0.04948485492818897, "grad_norm": 0.85546875, "learning_rate": 1.901846153846154e-05, "loss": 0.246, "step": 320 }, { "epoch": 0.05103125664469488, "grad_norm": 1.3984375, "learning_rate": 1.898769230769231e-05, "loss": 0.3483, "step": 330 }, { "epoch": 0.052577658361200784, "grad_norm": 1.0546875, "learning_rate": 1.8956923076923078e-05, "loss": 0.4107, "step": 340 }, { "epoch": 0.05412406007770668, "grad_norm": 1.0, "learning_rate": 1.892615384615385e-05, "loss": 0.2813, "step": 350 }, { "epoch": 0.05567046179421259, "grad_norm": 0.828125, "learning_rate": 1.8895384615384617e-05, "loss": 0.283, "step": 360 }, { "epoch": 0.057216863510718495, "grad_norm": 0.9765625, "learning_rate": 1.8864615384615384e-05, "loss": 0.268, "step": 370 }, { "epoch": 0.0587632652272244, "grad_norm": 1.0859375, "learning_rate": 1.8833846153846155e-05, "loss": 0.2852, "step": 380 }, { "epoch": 0.06030966694373031, "grad_norm": 0.921875, "learning_rate": 1.8803076923076923e-05, "loss": 0.2477, "step": 390 }, { "epoch": 0.06185606866023621, "grad_norm": 1.0, "learning_rate": 1.8772307692307694e-05, "loss": 0.2418, "step": 400 }, { "epoch": 0.06340247037674211, "grad_norm": 0.85546875, "learning_rate": 1.8741538461538462e-05, "loss": 0.2218, "step": 410 }, { "epoch": 0.06494887209324803, "grad_norm": 1.1484375, "learning_rate": 1.8710769230769233e-05, "loss": 0.2616, "step": 420 }, { "epoch": 0.06649527380975392, "grad_norm": 1.1171875, "learning_rate": 1.8680000000000004e-05, "loss": 0.3475, "step": 430 }, { "epoch": 0.06804167552625984, "grad_norm": 1.125, "learning_rate": 1.8649230769230772e-05, "loss": 0.3025, "step": 440 }, { "epoch": 0.06958807724276574, "grad_norm": 0.93359375, "learning_rate": 1.861846153846154e-05, "loss": 0.3119, "step": 450 }, { "epoch": 0.07113447895927165, "grad_norm": 0.90625, "learning_rate": 1.8587692307692307e-05, "loss": 0.3004, "step": 460 }, { "epoch": 0.07268088067577755, "grad_norm": 0.97265625, "learning_rate": 1.8556923076923078e-05, "loss": 0.2957, "step": 470 }, { "epoch": 0.07422728239228346, "grad_norm": 1.1328125, "learning_rate": 1.8526153846153846e-05, "loss": 0.3162, "step": 480 }, { "epoch": 0.07577368410878936, "grad_norm": 1.75, "learning_rate": 1.8495384615384617e-05, "loss": 0.3637, "step": 490 }, { "epoch": 0.07732008582529526, "grad_norm": 1.0390625, "learning_rate": 1.8464615384615385e-05, "loss": 0.2379, "step": 500 }, { "epoch": 0.07886648754180117, "grad_norm": 0.984375, "learning_rate": 1.8433846153846156e-05, "loss": 0.3098, "step": 510 }, { "epoch": 0.08041288925830707, "grad_norm": 0.99609375, "learning_rate": 1.8403076923076924e-05, "loss": 0.3977, "step": 520 }, { "epoch": 0.08195929097481298, "grad_norm": 0.9609375, "learning_rate": 1.8372307692307695e-05, "loss": 0.3034, "step": 530 }, { "epoch": 0.08350569269131888, "grad_norm": 0.7421875, "learning_rate": 1.8341538461538462e-05, "loss": 0.2327, "step": 540 }, { "epoch": 0.0850520944078248, "grad_norm": 1.0625, "learning_rate": 1.8310769230769233e-05, "loss": 0.2561, "step": 550 }, { "epoch": 0.0865984961243307, "grad_norm": 0.8515625, "learning_rate": 1.828e-05, "loss": 0.3739, "step": 560 }, { "epoch": 0.08814489784083661, "grad_norm": 0.87890625, "learning_rate": 1.824923076923077e-05, "loss": 0.3605, "step": 570 }, { "epoch": 0.08969129955734251, "grad_norm": 1.0, "learning_rate": 1.821846153846154e-05, "loss": 0.2557, "step": 580 }, { "epoch": 0.09123770127384842, "grad_norm": 1.03125, "learning_rate": 1.8187692307692308e-05, "loss": 0.2806, "step": 590 }, { "epoch": 0.09278410299035432, "grad_norm": 0.875, "learning_rate": 1.815692307692308e-05, "loss": 0.2977, "step": 600 }, { "epoch": 0.09433050470686022, "grad_norm": 0.7890625, "learning_rate": 1.8126153846153846e-05, "loss": 0.2845, "step": 610 }, { "epoch": 0.09587690642336613, "grad_norm": 0.859375, "learning_rate": 1.8095384615384618e-05, "loss": 0.3309, "step": 620 }, { "epoch": 0.09742330813987203, "grad_norm": 0.95703125, "learning_rate": 1.806461538461539e-05, "loss": 0.3197, "step": 630 }, { "epoch": 0.09896970985637794, "grad_norm": 0.83203125, "learning_rate": 1.8033846153846156e-05, "loss": 0.2654, "step": 640 }, { "epoch": 0.10051611157288384, "grad_norm": 1.09375, "learning_rate": 1.8003076923076924e-05, "loss": 0.2954, "step": 650 }, { "epoch": 0.10206251328938976, "grad_norm": 1.0390625, "learning_rate": 1.7972307692307692e-05, "loss": 0.3237, "step": 660 }, { "epoch": 0.10360891500589565, "grad_norm": 0.82421875, "learning_rate": 1.7941538461538463e-05, "loss": 0.2887, "step": 670 }, { "epoch": 0.10515531672240157, "grad_norm": 1.0546875, "learning_rate": 1.791076923076923e-05, "loss": 0.3018, "step": 680 }, { "epoch": 0.10670171843890747, "grad_norm": 0.890625, "learning_rate": 1.788e-05, "loss": 0.261, "step": 690 }, { "epoch": 0.10824812015541337, "grad_norm": 0.84375, "learning_rate": 1.784923076923077e-05, "loss": 0.254, "step": 700 }, { "epoch": 0.10979452187191928, "grad_norm": 0.96875, "learning_rate": 1.781846153846154e-05, "loss": 0.2944, "step": 710 }, { "epoch": 0.11134092358842518, "grad_norm": 0.9375, "learning_rate": 1.778769230769231e-05, "loss": 0.3163, "step": 720 }, { "epoch": 0.11288732530493109, "grad_norm": 0.640625, "learning_rate": 1.775692307692308e-05, "loss": 0.2838, "step": 730 }, { "epoch": 0.11443372702143699, "grad_norm": 0.98828125, "learning_rate": 1.7726153846153847e-05, "loss": 0.236, "step": 740 }, { "epoch": 0.1159801287379429, "grad_norm": 0.6796875, "learning_rate": 1.7695384615384618e-05, "loss": 0.2164, "step": 750 }, { "epoch": 0.1175265304544488, "grad_norm": 0.7109375, "learning_rate": 1.7664615384615386e-05, "loss": 0.3331, "step": 760 }, { "epoch": 0.11907293217095472, "grad_norm": 0.859375, "learning_rate": 1.7633846153846153e-05, "loss": 0.303, "step": 770 }, { "epoch": 0.12061933388746061, "grad_norm": 0.80078125, "learning_rate": 1.7603076923076924e-05, "loss": 0.3264, "step": 780 }, { "epoch": 0.12216573560396653, "grad_norm": 0.85546875, "learning_rate": 1.7572307692307692e-05, "loss": 0.2097, "step": 790 }, { "epoch": 0.12371213732047243, "grad_norm": 0.80859375, "learning_rate": 1.7541538461538463e-05, "loss": 0.2456, "step": 800 }, { "epoch": 0.12525853903697834, "grad_norm": 0.875, "learning_rate": 1.751076923076923e-05, "loss": 0.2877, "step": 810 }, { "epoch": 0.12680494075348422, "grad_norm": 0.93359375, "learning_rate": 1.7480000000000002e-05, "loss": 0.2902, "step": 820 }, { "epoch": 0.12835134246999014, "grad_norm": 0.875, "learning_rate": 1.7449230769230773e-05, "loss": 0.2357, "step": 830 }, { "epoch": 0.12989774418649605, "grad_norm": 0.79296875, "learning_rate": 1.741846153846154e-05, "loss": 0.2926, "step": 840 }, { "epoch": 0.13144414590300196, "grad_norm": 0.85546875, "learning_rate": 1.738769230769231e-05, "loss": 0.2301, "step": 850 }, { "epoch": 0.13299054761950785, "grad_norm": 1.1015625, "learning_rate": 1.7356923076923076e-05, "loss": 0.2501, "step": 860 }, { "epoch": 0.13453694933601376, "grad_norm": 1.0234375, "learning_rate": 1.7326153846153847e-05, "loss": 0.2393, "step": 870 }, { "epoch": 0.13608335105251967, "grad_norm": 1.0625, "learning_rate": 1.7295384615384615e-05, "loss": 0.2337, "step": 880 }, { "epoch": 0.1376297527690256, "grad_norm": 1.1171875, "learning_rate": 1.7264615384615386e-05, "loss": 0.3147, "step": 890 }, { "epoch": 0.13917615448553147, "grad_norm": 0.828125, "learning_rate": 1.7233846153846154e-05, "loss": 0.2949, "step": 900 }, { "epoch": 0.14072255620203739, "grad_norm": 1.1484375, "learning_rate": 1.7203076923076925e-05, "loss": 0.3394, "step": 910 }, { "epoch": 0.1422689579185433, "grad_norm": 0.72265625, "learning_rate": 1.7172307692307696e-05, "loss": 0.3119, "step": 920 }, { "epoch": 0.14381535963504918, "grad_norm": 0.8828125, "learning_rate": 1.7141538461538464e-05, "loss": 0.2959, "step": 930 }, { "epoch": 0.1453617613515551, "grad_norm": 0.86328125, "learning_rate": 1.711076923076923e-05, "loss": 0.2677, "step": 940 }, { "epoch": 0.146908163068061, "grad_norm": 0.99609375, "learning_rate": 1.7080000000000002e-05, "loss": 0.2575, "step": 950 }, { "epoch": 0.14845456478456692, "grad_norm": 1.1640625, "learning_rate": 1.704923076923077e-05, "loss": 0.2419, "step": 960 }, { "epoch": 0.1500009665010728, "grad_norm": 0.86328125, "learning_rate": 1.7018461538461538e-05, "loss": 0.2631, "step": 970 }, { "epoch": 0.15154736821757872, "grad_norm": 0.8125, "learning_rate": 1.698769230769231e-05, "loss": 0.2415, "step": 980 }, { "epoch": 0.15309376993408463, "grad_norm": 0.96484375, "learning_rate": 1.6956923076923077e-05, "loss": 0.2498, "step": 990 }, { "epoch": 0.15464017165059052, "grad_norm": 0.93359375, "learning_rate": 1.6926153846153848e-05, "loss": 0.2845, "step": 1000 }, { "epoch": 0.15618657336709643, "grad_norm": 1.0, "learning_rate": 1.6895384615384615e-05, "loss": 0.3159, "step": 1010 }, { "epoch": 0.15773297508360234, "grad_norm": 1.15625, "learning_rate": 1.6864615384615387e-05, "loss": 0.2969, "step": 1020 }, { "epoch": 0.15927937680010826, "grad_norm": 0.9765625, "learning_rate": 1.6833846153846158e-05, "loss": 0.3195, "step": 1030 }, { "epoch": 0.16082577851661414, "grad_norm": 0.96484375, "learning_rate": 1.6803076923076925e-05, "loss": 0.3086, "step": 1040 }, { "epoch": 0.16237218023312006, "grad_norm": 0.87109375, "learning_rate": 1.6772307692307693e-05, "loss": 0.297, "step": 1050 }, { "epoch": 0.16391858194962597, "grad_norm": 1.015625, "learning_rate": 1.674153846153846e-05, "loss": 0.2677, "step": 1060 }, { "epoch": 0.16546498366613188, "grad_norm": 1.015625, "learning_rate": 1.6710769230769232e-05, "loss": 0.294, "step": 1070 }, { "epoch": 0.16701138538263777, "grad_norm": 0.953125, "learning_rate": 1.668e-05, "loss": 0.2483, "step": 1080 }, { "epoch": 0.16855778709914368, "grad_norm": 0.9375, "learning_rate": 1.664923076923077e-05, "loss": 0.2564, "step": 1090 }, { "epoch": 0.1701041888156496, "grad_norm": 0.96484375, "learning_rate": 1.661846153846154e-05, "loss": 0.2363, "step": 1100 }, { "epoch": 0.17165059053215548, "grad_norm": 1.015625, "learning_rate": 1.658769230769231e-05, "loss": 0.2486, "step": 1110 }, { "epoch": 0.1731969922486614, "grad_norm": 1.046875, "learning_rate": 1.655692307692308e-05, "loss": 0.3142, "step": 1120 }, { "epoch": 0.1747433939651673, "grad_norm": 1.609375, "learning_rate": 1.6526153846153848e-05, "loss": 0.4319, "step": 1130 }, { "epoch": 0.17628979568167322, "grad_norm": 0.9609375, "learning_rate": 1.6495384615384616e-05, "loss": 0.2727, "step": 1140 }, { "epoch": 0.1778361973981791, "grad_norm": 0.92578125, "learning_rate": 1.6464615384615387e-05, "loss": 0.2472, "step": 1150 }, { "epoch": 0.17938259911468502, "grad_norm": 0.9609375, "learning_rate": 1.6433846153846155e-05, "loss": 0.3036, "step": 1160 }, { "epoch": 0.18092900083119093, "grad_norm": 0.99609375, "learning_rate": 1.6403076923076922e-05, "loss": 0.2199, "step": 1170 }, { "epoch": 0.18247540254769684, "grad_norm": 1.1484375, "learning_rate": 1.6372307692307693e-05, "loss": 0.2474, "step": 1180 }, { "epoch": 0.18402180426420273, "grad_norm": 0.9609375, "learning_rate": 1.634153846153846e-05, "loss": 0.2892, "step": 1190 }, { "epoch": 0.18556820598070864, "grad_norm": 0.94140625, "learning_rate": 1.6310769230769232e-05, "loss": 0.3317, "step": 1200 }, { "epoch": 0.18711460769721455, "grad_norm": 0.85546875, "learning_rate": 1.628e-05, "loss": 0.3066, "step": 1210 }, { "epoch": 0.18866100941372044, "grad_norm": 0.88671875, "learning_rate": 1.624923076923077e-05, "loss": 0.2811, "step": 1220 }, { "epoch": 0.19020741113022635, "grad_norm": 0.87890625, "learning_rate": 1.6218461538461542e-05, "loss": 0.2503, "step": 1230 }, { "epoch": 0.19175381284673226, "grad_norm": 0.9140625, "learning_rate": 1.618769230769231e-05, "loss": 0.3128, "step": 1240 }, { "epoch": 0.19330021456323818, "grad_norm": 0.953125, "learning_rate": 1.6156923076923078e-05, "loss": 0.3067, "step": 1250 }, { "epoch": 0.19484661627974406, "grad_norm": 1.203125, "learning_rate": 1.6126153846153845e-05, "loss": 0.2975, "step": 1260 }, { "epoch": 0.19639301799624997, "grad_norm": 1.140625, "learning_rate": 1.6095384615384616e-05, "loss": 0.3083, "step": 1270 }, { "epoch": 0.1979394197127559, "grad_norm": 0.66796875, "learning_rate": 1.6064615384615384e-05, "loss": 0.2786, "step": 1280 }, { "epoch": 0.19948582142926177, "grad_norm": 0.890625, "learning_rate": 1.6033846153846155e-05, "loss": 0.404, "step": 1290 }, { "epoch": 0.20103222314576769, "grad_norm": 0.8984375, "learning_rate": 1.6003076923076923e-05, "loss": 0.3213, "step": 1300 }, { "epoch": 0.2025786248622736, "grad_norm": 0.91015625, "learning_rate": 1.5972307692307694e-05, "loss": 0.24, "step": 1310 }, { "epoch": 0.2041250265787795, "grad_norm": 1.2890625, "learning_rate": 1.5941538461538465e-05, "loss": 0.2711, "step": 1320 }, { "epoch": 0.2056714282952854, "grad_norm": 0.83203125, "learning_rate": 1.5910769230769233e-05, "loss": 0.2493, "step": 1330 }, { "epoch": 0.2072178300117913, "grad_norm": 1.4609375, "learning_rate": 1.588e-05, "loss": 0.353, "step": 1340 }, { "epoch": 0.20876423172829722, "grad_norm": 0.9375, "learning_rate": 1.584923076923077e-05, "loss": 0.2534, "step": 1350 }, { "epoch": 0.21031063344480314, "grad_norm": 0.64453125, "learning_rate": 1.581846153846154e-05, "loss": 0.2088, "step": 1360 }, { "epoch": 0.21185703516130902, "grad_norm": 0.94921875, "learning_rate": 1.5787692307692307e-05, "loss": 0.3146, "step": 1370 }, { "epoch": 0.21340343687781493, "grad_norm": 1.125, "learning_rate": 1.5756923076923078e-05, "loss": 0.2947, "step": 1380 }, { "epoch": 0.21494983859432085, "grad_norm": 0.6796875, "learning_rate": 1.5726153846153846e-05, "loss": 0.2039, "step": 1390 }, { "epoch": 0.21649624031082673, "grad_norm": 0.828125, "learning_rate": 1.5695384615384617e-05, "loss": 0.252, "step": 1400 }, { "epoch": 0.21804264202733264, "grad_norm": 0.875, "learning_rate": 1.5664615384615388e-05, "loss": 0.2689, "step": 1410 }, { "epoch": 0.21958904374383856, "grad_norm": 1.0390625, "learning_rate": 1.5633846153846156e-05, "loss": 0.3239, "step": 1420 }, { "epoch": 0.22113544546034447, "grad_norm": 1.0859375, "learning_rate": 1.5603076923076927e-05, "loss": 0.2891, "step": 1430 }, { "epoch": 0.22268184717685036, "grad_norm": 0.75390625, "learning_rate": 1.5572307692307694e-05, "loss": 0.3306, "step": 1440 }, { "epoch": 0.22422824889335627, "grad_norm": 1.0859375, "learning_rate": 1.5541538461538462e-05, "loss": 0.2971, "step": 1450 }, { "epoch": 0.22577465060986218, "grad_norm": 0.953125, "learning_rate": 1.551076923076923e-05, "loss": 0.2892, "step": 1460 }, { "epoch": 0.2273210523263681, "grad_norm": 0.75390625, "learning_rate": 1.548e-05, "loss": 0.2773, "step": 1470 }, { "epoch": 0.22886745404287398, "grad_norm": 0.9453125, "learning_rate": 1.544923076923077e-05, "loss": 0.2767, "step": 1480 }, { "epoch": 0.2304138557593799, "grad_norm": 1.046875, "learning_rate": 1.541846153846154e-05, "loss": 0.2899, "step": 1490 }, { "epoch": 0.2319602574758858, "grad_norm": 0.890625, "learning_rate": 1.5387692307692307e-05, "loss": 0.2521, "step": 1500 }, { "epoch": 0.2335066591923917, "grad_norm": 1.03125, "learning_rate": 1.535692307692308e-05, "loss": 0.2479, "step": 1510 }, { "epoch": 0.2350530609088976, "grad_norm": 1.0, "learning_rate": 1.532615384615385e-05, "loss": 0.3154, "step": 1520 }, { "epoch": 0.23659946262540352, "grad_norm": 0.83984375, "learning_rate": 1.5295384615384617e-05, "loss": 0.3391, "step": 1530 }, { "epoch": 0.23814586434190943, "grad_norm": 1.2265625, "learning_rate": 1.5264615384615385e-05, "loss": 0.265, "step": 1540 }, { "epoch": 0.23969226605841532, "grad_norm": 0.99609375, "learning_rate": 1.5233846153846154e-05, "loss": 0.2949, "step": 1550 }, { "epoch": 0.24123866777492123, "grad_norm": 1.265625, "learning_rate": 1.5203076923076925e-05, "loss": 0.3136, "step": 1560 }, { "epoch": 0.24278506949142714, "grad_norm": 1.4609375, "learning_rate": 1.5172307692307693e-05, "loss": 0.3073, "step": 1570 }, { "epoch": 0.24433147120793305, "grad_norm": 1.140625, "learning_rate": 1.5141538461538463e-05, "loss": 0.3271, "step": 1580 }, { "epoch": 0.24587787292443894, "grad_norm": 1.03125, "learning_rate": 1.5110769230769232e-05, "loss": 0.2722, "step": 1590 }, { "epoch": 0.24742427464094485, "grad_norm": 0.87109375, "learning_rate": 1.5080000000000001e-05, "loss": 0.3513, "step": 1600 }, { "epoch": 0.24897067635745077, "grad_norm": 1.1796875, "learning_rate": 1.504923076923077e-05, "loss": 0.2212, "step": 1610 }, { "epoch": 0.2505170780739567, "grad_norm": 0.890625, "learning_rate": 1.501846153846154e-05, "loss": 0.2914, "step": 1620 }, { "epoch": 0.25206347979046256, "grad_norm": 0.83984375, "learning_rate": 1.498769230769231e-05, "loss": 0.281, "step": 1630 }, { "epoch": 0.25360988150696845, "grad_norm": 0.86328125, "learning_rate": 1.4956923076923077e-05, "loss": 0.2509, "step": 1640 }, { "epoch": 0.2551562832234744, "grad_norm": 1.2578125, "learning_rate": 1.4926153846153848e-05, "loss": 0.2994, "step": 1650 }, { "epoch": 0.2567026849399803, "grad_norm": 0.90625, "learning_rate": 1.4895384615384616e-05, "loss": 0.2839, "step": 1660 }, { "epoch": 0.2582490866564862, "grad_norm": 0.81640625, "learning_rate": 1.4864615384615385e-05, "loss": 0.229, "step": 1670 }, { "epoch": 0.2597954883729921, "grad_norm": 0.9453125, "learning_rate": 1.4833846153846155e-05, "loss": 0.2381, "step": 1680 }, { "epoch": 0.261341890089498, "grad_norm": 0.9140625, "learning_rate": 1.4803076923076924e-05, "loss": 0.3495, "step": 1690 }, { "epoch": 0.2628882918060039, "grad_norm": 0.7421875, "learning_rate": 1.4772307692307692e-05, "loss": 0.2756, "step": 1700 }, { "epoch": 0.2644346935225098, "grad_norm": 0.9375, "learning_rate": 1.4741538461538463e-05, "loss": 0.3189, "step": 1710 }, { "epoch": 0.2659810952390157, "grad_norm": 0.765625, "learning_rate": 1.4710769230769232e-05, "loss": 0.289, "step": 1720 }, { "epoch": 0.26752749695552164, "grad_norm": 1.0703125, "learning_rate": 1.4680000000000002e-05, "loss": 0.2848, "step": 1730 }, { "epoch": 0.2690738986720275, "grad_norm": 2.96875, "learning_rate": 1.4649230769230771e-05, "loss": 0.3115, "step": 1740 }, { "epoch": 0.2706203003885334, "grad_norm": 0.79296875, "learning_rate": 1.4618461538461539e-05, "loss": 0.3075, "step": 1750 }, { "epoch": 0.27216670210503935, "grad_norm": 1.0625, "learning_rate": 1.458769230769231e-05, "loss": 0.2705, "step": 1760 }, { "epoch": 0.27371310382154523, "grad_norm": 0.98046875, "learning_rate": 1.4556923076923078e-05, "loss": 0.266, "step": 1770 }, { "epoch": 0.2752595055380512, "grad_norm": 0.8203125, "learning_rate": 1.4526153846153847e-05, "loss": 0.3179, "step": 1780 }, { "epoch": 0.27680590725455706, "grad_norm": 0.953125, "learning_rate": 1.4495384615384616e-05, "loss": 0.2775, "step": 1790 }, { "epoch": 0.27835230897106295, "grad_norm": 0.87890625, "learning_rate": 1.4464615384615386e-05, "loss": 0.3279, "step": 1800 }, { "epoch": 0.2798987106875689, "grad_norm": 0.8515625, "learning_rate": 1.4433846153846155e-05, "loss": 0.2373, "step": 1810 }, { "epoch": 0.28144511240407477, "grad_norm": 0.80859375, "learning_rate": 1.4403076923076925e-05, "loss": 0.2216, "step": 1820 }, { "epoch": 0.28299151412058066, "grad_norm": 0.93359375, "learning_rate": 1.4372307692307694e-05, "loss": 0.3206, "step": 1830 }, { "epoch": 0.2845379158370866, "grad_norm": 1.109375, "learning_rate": 1.4341538461538462e-05, "loss": 0.2467, "step": 1840 }, { "epoch": 0.2860843175535925, "grad_norm": 0.9765625, "learning_rate": 1.4310769230769233e-05, "loss": 0.2818, "step": 1850 }, { "epoch": 0.28763071927009837, "grad_norm": 0.8984375, "learning_rate": 1.428e-05, "loss": 0.2477, "step": 1860 }, { "epoch": 0.2891771209866043, "grad_norm": 0.734375, "learning_rate": 1.4249230769230772e-05, "loss": 0.2576, "step": 1870 }, { "epoch": 0.2907235227031102, "grad_norm": 0.91015625, "learning_rate": 1.421846153846154e-05, "loss": 0.2841, "step": 1880 }, { "epoch": 0.2922699244196161, "grad_norm": 0.98828125, "learning_rate": 1.4187692307692309e-05, "loss": 0.3371, "step": 1890 }, { "epoch": 0.293816326136122, "grad_norm": 1.0, "learning_rate": 1.4156923076923076e-05, "loss": 0.3037, "step": 1900 }, { "epoch": 0.2953627278526279, "grad_norm": 1.015625, "learning_rate": 1.4126153846153847e-05, "loss": 0.2526, "step": 1910 }, { "epoch": 0.29690912956913385, "grad_norm": 0.98828125, "learning_rate": 1.4095384615384617e-05, "loss": 0.2125, "step": 1920 }, { "epoch": 0.29845553128563973, "grad_norm": 0.8125, "learning_rate": 1.4064615384615386e-05, "loss": 0.2783, "step": 1930 }, { "epoch": 0.3000019330021456, "grad_norm": 0.90625, "learning_rate": 1.4033846153846156e-05, "loss": 0.3131, "step": 1940 }, { "epoch": 0.30154833471865156, "grad_norm": 0.78125, "learning_rate": 1.4003076923076923e-05, "loss": 0.3226, "step": 1950 }, { "epoch": 0.30309473643515744, "grad_norm": 0.83203125, "learning_rate": 1.3972307692307694e-05, "loss": 0.2819, "step": 1960 }, { "epoch": 0.3046411381516633, "grad_norm": 0.78515625, "learning_rate": 1.3941538461538462e-05, "loss": 0.2868, "step": 1970 }, { "epoch": 0.30618753986816927, "grad_norm": 0.92578125, "learning_rate": 1.3910769230769232e-05, "loss": 0.2615, "step": 1980 }, { "epoch": 0.30773394158467515, "grad_norm": 0.87109375, "learning_rate": 1.3880000000000001e-05, "loss": 0.255, "step": 1990 }, { "epoch": 0.30928034330118104, "grad_norm": 0.82421875, "learning_rate": 1.384923076923077e-05, "loss": 0.251, "step": 2000 }, { "epoch": 0.310826745017687, "grad_norm": 0.9609375, "learning_rate": 1.3818461538461541e-05, "loss": 0.2983, "step": 2010 }, { "epoch": 0.31237314673419286, "grad_norm": 1.0859375, "learning_rate": 1.3787692307692309e-05, "loss": 0.2705, "step": 2020 }, { "epoch": 0.3139195484506988, "grad_norm": 0.890625, "learning_rate": 1.3756923076923079e-05, "loss": 0.2937, "step": 2030 }, { "epoch": 0.3154659501672047, "grad_norm": 1.1953125, "learning_rate": 1.3726153846153846e-05, "loss": 0.3296, "step": 2040 }, { "epoch": 0.3170123518837106, "grad_norm": 1.140625, "learning_rate": 1.3695384615384617e-05, "loss": 0.2666, "step": 2050 }, { "epoch": 0.3185587536002165, "grad_norm": 1.0390625, "learning_rate": 1.3664615384615385e-05, "loss": 0.3124, "step": 2060 }, { "epoch": 0.3201051553167224, "grad_norm": 1.015625, "learning_rate": 1.3633846153846156e-05, "loss": 0.3752, "step": 2070 }, { "epoch": 0.3216515570332283, "grad_norm": 0.9296875, "learning_rate": 1.3603076923076924e-05, "loss": 0.2622, "step": 2080 }, { "epoch": 0.3231979587497342, "grad_norm": 0.734375, "learning_rate": 1.3572307692307693e-05, "loss": 0.2526, "step": 2090 }, { "epoch": 0.3247443604662401, "grad_norm": 0.80078125, "learning_rate": 1.3541538461538464e-05, "loss": 0.2775, "step": 2100 }, { "epoch": 0.326290762182746, "grad_norm": 1.0546875, "learning_rate": 1.3510769230769232e-05, "loss": 0.3322, "step": 2110 }, { "epoch": 0.32783716389925194, "grad_norm": 1.0, "learning_rate": 1.3480000000000001e-05, "loss": 0.2897, "step": 2120 }, { "epoch": 0.3293835656157578, "grad_norm": 1.0234375, "learning_rate": 1.344923076923077e-05, "loss": 0.2815, "step": 2130 }, { "epoch": 0.33092996733226376, "grad_norm": 0.7109375, "learning_rate": 1.341846153846154e-05, "loss": 0.2959, "step": 2140 }, { "epoch": 0.33247636904876965, "grad_norm": 0.91015625, "learning_rate": 1.3387692307692308e-05, "loss": 0.2571, "step": 2150 }, { "epoch": 0.33402277076527553, "grad_norm": 0.91015625, "learning_rate": 1.3356923076923079e-05, "loss": 0.247, "step": 2160 }, { "epoch": 0.3355691724817815, "grad_norm": 0.828125, "learning_rate": 1.3326153846153847e-05, "loss": 0.248, "step": 2170 }, { "epoch": 0.33711557419828736, "grad_norm": 0.94140625, "learning_rate": 1.3295384615384616e-05, "loss": 0.2438, "step": 2180 }, { "epoch": 0.33866197591479325, "grad_norm": 1.2734375, "learning_rate": 1.3264615384615385e-05, "loss": 0.3612, "step": 2190 }, { "epoch": 0.3402083776312992, "grad_norm": 0.9609375, "learning_rate": 1.3233846153846155e-05, "loss": 0.3287, "step": 2200 }, { "epoch": 0.34175477934780507, "grad_norm": 0.9609375, "learning_rate": 1.3203076923076926e-05, "loss": 0.2756, "step": 2210 }, { "epoch": 0.34330118106431096, "grad_norm": 0.91796875, "learning_rate": 1.3172307692307694e-05, "loss": 0.2886, "step": 2220 }, { "epoch": 0.3448475827808169, "grad_norm": 0.87109375, "learning_rate": 1.3141538461538463e-05, "loss": 0.2446, "step": 2230 }, { "epoch": 0.3463939844973228, "grad_norm": 0.94140625, "learning_rate": 1.311076923076923e-05, "loss": 0.3064, "step": 2240 }, { "epoch": 0.3479403862138287, "grad_norm": 0.8203125, "learning_rate": 1.3080000000000002e-05, "loss": 0.2376, "step": 2250 }, { "epoch": 0.3494867879303346, "grad_norm": 0.8359375, "learning_rate": 1.304923076923077e-05, "loss": 0.2932, "step": 2260 }, { "epoch": 0.3510331896468405, "grad_norm": 1.4296875, "learning_rate": 1.301846153846154e-05, "loss": 0.2979, "step": 2270 }, { "epoch": 0.35257959136334643, "grad_norm": 0.84375, "learning_rate": 1.2987692307692308e-05, "loss": 0.2897, "step": 2280 }, { "epoch": 0.3541259930798523, "grad_norm": 0.99609375, "learning_rate": 1.2956923076923078e-05, "loss": 0.2744, "step": 2290 }, { "epoch": 0.3556723947963582, "grad_norm": 0.96484375, "learning_rate": 1.2926153846153849e-05, "loss": 0.2708, "step": 2300 }, { "epoch": 0.35721879651286415, "grad_norm": 0.81640625, "learning_rate": 1.2895384615384616e-05, "loss": 0.2224, "step": 2310 }, { "epoch": 0.35876519822937003, "grad_norm": 1.1015625, "learning_rate": 1.2864615384615386e-05, "loss": 0.2728, "step": 2320 }, { "epoch": 0.3603115999458759, "grad_norm": 1.0859375, "learning_rate": 1.2833846153846155e-05, "loss": 0.2661, "step": 2330 }, { "epoch": 0.36185800166238186, "grad_norm": 1.046875, "learning_rate": 1.2803076923076925e-05, "loss": 0.2892, "step": 2340 }, { "epoch": 0.36340440337888774, "grad_norm": 0.6640625, "learning_rate": 1.2772307692307692e-05, "loss": 0.3092, "step": 2350 }, { "epoch": 0.3649508050953937, "grad_norm": 0.84765625, "learning_rate": 1.2741538461538463e-05, "loss": 0.2542, "step": 2360 }, { "epoch": 0.36649720681189957, "grad_norm": 0.91796875, "learning_rate": 1.2710769230769231e-05, "loss": 0.3589, "step": 2370 }, { "epoch": 0.36804360852840545, "grad_norm": 0.71484375, "learning_rate": 1.268e-05, "loss": 0.2237, "step": 2380 }, { "epoch": 0.3695900102449114, "grad_norm": 1.0703125, "learning_rate": 1.264923076923077e-05, "loss": 0.3413, "step": 2390 }, { "epoch": 0.3711364119614173, "grad_norm": 1.0234375, "learning_rate": 1.261846153846154e-05, "loss": 0.2556, "step": 2400 }, { "epoch": 0.37268281367792316, "grad_norm": 0.9296875, "learning_rate": 1.258769230769231e-05, "loss": 0.3087, "step": 2410 }, { "epoch": 0.3742292153944291, "grad_norm": 0.95703125, "learning_rate": 1.2556923076923078e-05, "loss": 0.2609, "step": 2420 }, { "epoch": 0.375775617110935, "grad_norm": 1.1328125, "learning_rate": 1.2526153846153848e-05, "loss": 0.2572, "step": 2430 }, { "epoch": 0.3773220188274409, "grad_norm": 1.1484375, "learning_rate": 1.2495384615384615e-05, "loss": 0.3003, "step": 2440 }, { "epoch": 0.3788684205439468, "grad_norm": 1.1484375, "learning_rate": 1.2464615384615386e-05, "loss": 0.259, "step": 2450 }, { "epoch": 0.3804148222604527, "grad_norm": 1.203125, "learning_rate": 1.2433846153846154e-05, "loss": 0.2606, "step": 2460 }, { "epoch": 0.38196122397695864, "grad_norm": 0.85546875, "learning_rate": 1.2403076923076925e-05, "loss": 0.2351, "step": 2470 }, { "epoch": 0.3835076256934645, "grad_norm": 0.90625, "learning_rate": 1.2372307692307693e-05, "loss": 0.2664, "step": 2480 }, { "epoch": 0.3850540274099704, "grad_norm": 0.91796875, "learning_rate": 1.2341538461538462e-05, "loss": 0.245, "step": 2490 }, { "epoch": 0.38660042912647635, "grad_norm": 1.109375, "learning_rate": 1.2310769230769233e-05, "loss": 0.2781, "step": 2500 }, { "epoch": 0.38814683084298224, "grad_norm": 1.046875, "learning_rate": 1.2280000000000001e-05, "loss": 0.2847, "step": 2510 }, { "epoch": 0.3896932325594881, "grad_norm": 0.8203125, "learning_rate": 1.224923076923077e-05, "loss": 0.3223, "step": 2520 }, { "epoch": 0.39123963427599406, "grad_norm": 0.9765625, "learning_rate": 1.221846153846154e-05, "loss": 0.3068, "step": 2530 }, { "epoch": 0.39278603599249995, "grad_norm": 0.9609375, "learning_rate": 1.218769230769231e-05, "loss": 0.2393, "step": 2540 }, { "epoch": 0.39433243770900583, "grad_norm": 1.09375, "learning_rate": 1.2156923076923077e-05, "loss": 0.2918, "step": 2550 }, { "epoch": 0.3958788394255118, "grad_norm": 0.84375, "learning_rate": 1.2126153846153848e-05, "loss": 0.2146, "step": 2560 }, { "epoch": 0.39742524114201766, "grad_norm": 0.875, "learning_rate": 1.2095384615384616e-05, "loss": 0.3178, "step": 2570 }, { "epoch": 0.39897164285852355, "grad_norm": 0.78515625, "learning_rate": 1.2064615384615385e-05, "loss": 0.2247, "step": 2580 }, { "epoch": 0.4005180445750295, "grad_norm": 0.93359375, "learning_rate": 1.2033846153846154e-05, "loss": 0.2684, "step": 2590 }, { "epoch": 0.40206444629153537, "grad_norm": 0.99609375, "learning_rate": 1.2003076923076924e-05, "loss": 0.2332, "step": 2600 }, { "epoch": 0.4036108480080413, "grad_norm": 0.86328125, "learning_rate": 1.1972307692307695e-05, "loss": 0.4153, "step": 2610 }, { "epoch": 0.4051572497245472, "grad_norm": 0.96484375, "learning_rate": 1.1941538461538463e-05, "loss": 0.2559, "step": 2620 }, { "epoch": 0.4067036514410531, "grad_norm": 1.53125, "learning_rate": 1.1910769230769232e-05, "loss": 0.2974, "step": 2630 }, { "epoch": 0.408250053157559, "grad_norm": 1.0078125, "learning_rate": 1.188e-05, "loss": 0.3523, "step": 2640 }, { "epoch": 0.4097964548740649, "grad_norm": 1.1171875, "learning_rate": 1.1849230769230771e-05, "loss": 0.2619, "step": 2650 }, { "epoch": 0.4113428565905708, "grad_norm": 1.109375, "learning_rate": 1.1818461538461539e-05, "loss": 0.2833, "step": 2660 }, { "epoch": 0.41288925830707673, "grad_norm": 0.96875, "learning_rate": 1.178769230769231e-05, "loss": 0.244, "step": 2670 }, { "epoch": 0.4144356600235826, "grad_norm": 0.8046875, "learning_rate": 1.1756923076923077e-05, "loss": 0.2238, "step": 2680 }, { "epoch": 0.4159820617400885, "grad_norm": 0.87890625, "learning_rate": 1.1726153846153847e-05, "loss": 0.2839, "step": 2690 }, { "epoch": 0.41752846345659445, "grad_norm": 1.46875, "learning_rate": 1.1695384615384618e-05, "loss": 0.3264, "step": 2700 }, { "epoch": 0.41907486517310033, "grad_norm": 0.953125, "learning_rate": 1.1664615384615386e-05, "loss": 0.2501, "step": 2710 }, { "epoch": 0.42062126688960627, "grad_norm": 0.94140625, "learning_rate": 1.1633846153846155e-05, "loss": 0.3618, "step": 2720 }, { "epoch": 0.42216766860611216, "grad_norm": 1.015625, "learning_rate": 1.1603076923076924e-05, "loss": 0.2353, "step": 2730 }, { "epoch": 0.42371407032261804, "grad_norm": 0.79296875, "learning_rate": 1.1572307692307694e-05, "loss": 0.2745, "step": 2740 }, { "epoch": 0.425260472039124, "grad_norm": 1.1171875, "learning_rate": 1.1541538461538461e-05, "loss": 0.2673, "step": 2750 }, { "epoch": 0.42680687375562987, "grad_norm": 1.1796875, "learning_rate": 1.1510769230769232e-05, "loss": 0.2448, "step": 2760 }, { "epoch": 0.42835327547213575, "grad_norm": 0.82421875, "learning_rate": 1.148e-05, "loss": 0.2428, "step": 2770 }, { "epoch": 0.4298996771886417, "grad_norm": 0.97265625, "learning_rate": 1.144923076923077e-05, "loss": 0.2259, "step": 2780 }, { "epoch": 0.4314460789051476, "grad_norm": 1.0625, "learning_rate": 1.141846153846154e-05, "loss": 0.3214, "step": 2790 }, { "epoch": 0.43299248062165346, "grad_norm": 0.90625, "learning_rate": 1.1387692307692308e-05, "loss": 0.272, "step": 2800 }, { "epoch": 0.4345388823381594, "grad_norm": 0.7890625, "learning_rate": 1.135692307692308e-05, "loss": 0.3055, "step": 2810 }, { "epoch": 0.4360852840546653, "grad_norm": 1.3359375, "learning_rate": 1.1326153846153847e-05, "loss": 0.3783, "step": 2820 }, { "epoch": 0.43763168577117123, "grad_norm": 0.83203125, "learning_rate": 1.1295384615384617e-05, "loss": 0.2318, "step": 2830 }, { "epoch": 0.4391780874876771, "grad_norm": 1.0, "learning_rate": 1.1264615384615384e-05, "loss": 0.3072, "step": 2840 }, { "epoch": 0.440724489204183, "grad_norm": 1.453125, "learning_rate": 1.1233846153846155e-05, "loss": 0.3272, "step": 2850 }, { "epoch": 0.44227089092068894, "grad_norm": 1.015625, "learning_rate": 1.1203076923076923e-05, "loss": 0.2422, "step": 2860 }, { "epoch": 0.4438172926371948, "grad_norm": 0.83984375, "learning_rate": 1.1172307692307694e-05, "loss": 0.2424, "step": 2870 }, { "epoch": 0.4453636943537007, "grad_norm": 1.0703125, "learning_rate": 1.1141538461538462e-05, "loss": 0.288, "step": 2880 }, { "epoch": 0.44691009607020665, "grad_norm": 0.90234375, "learning_rate": 1.1110769230769231e-05, "loss": 0.2376, "step": 2890 }, { "epoch": 0.44845649778671254, "grad_norm": 0.80859375, "learning_rate": 1.1080000000000002e-05, "loss": 0.2541, "step": 2900 }, { "epoch": 0.4500028995032184, "grad_norm": 0.80078125, "learning_rate": 1.104923076923077e-05, "loss": 0.2928, "step": 2910 }, { "epoch": 0.45154930121972436, "grad_norm": 0.83984375, "learning_rate": 1.101846153846154e-05, "loss": 0.2582, "step": 2920 }, { "epoch": 0.45309570293623025, "grad_norm": 0.9140625, "learning_rate": 1.0987692307692309e-05, "loss": 0.2548, "step": 2930 }, { "epoch": 0.4546421046527362, "grad_norm": 1.421875, "learning_rate": 1.0956923076923078e-05, "loss": 0.3462, "step": 2940 }, { "epoch": 0.4561885063692421, "grad_norm": 1.0, "learning_rate": 1.0926153846153846e-05, "loss": 0.3076, "step": 2950 }, { "epoch": 0.45773490808574796, "grad_norm": 1.0546875, "learning_rate": 1.0895384615384617e-05, "loss": 0.2761, "step": 2960 }, { "epoch": 0.4592813098022539, "grad_norm": 0.859375, "learning_rate": 1.0864615384615385e-05, "loss": 0.3359, "step": 2970 }, { "epoch": 0.4608277115187598, "grad_norm": 0.86328125, "learning_rate": 1.0833846153846154e-05, "loss": 0.3213, "step": 2980 }, { "epoch": 0.46237411323526567, "grad_norm": 0.85546875, "learning_rate": 1.0803076923076925e-05, "loss": 0.2917, "step": 2990 }, { "epoch": 0.4639205149517716, "grad_norm": 0.953125, "learning_rate": 1.0772307692307693e-05, "loss": 0.2774, "step": 3000 }, { "epoch": 0.4654669166682775, "grad_norm": 0.7734375, "learning_rate": 1.0741538461538464e-05, "loss": 0.3373, "step": 3010 }, { "epoch": 0.4670133183847834, "grad_norm": 0.57421875, "learning_rate": 1.0710769230769232e-05, "loss": 0.248, "step": 3020 }, { "epoch": 0.4685597201012893, "grad_norm": 1.0078125, "learning_rate": 1.0680000000000001e-05, "loss": 0.2782, "step": 3030 }, { "epoch": 0.4701061218177952, "grad_norm": 0.80859375, "learning_rate": 1.0649230769230769e-05, "loss": 0.3041, "step": 3040 }, { "epoch": 0.47165252353430115, "grad_norm": 0.8359375, "learning_rate": 1.061846153846154e-05, "loss": 0.2109, "step": 3050 }, { "epoch": 0.47319892525080703, "grad_norm": 0.99609375, "learning_rate": 1.0587692307692308e-05, "loss": 0.2815, "step": 3060 }, { "epoch": 0.4747453269673129, "grad_norm": 1.1875, "learning_rate": 1.0556923076923079e-05, "loss": 0.2775, "step": 3070 }, { "epoch": 0.47629172868381886, "grad_norm": 1.15625, "learning_rate": 1.0526153846153846e-05, "loss": 0.2645, "step": 3080 }, { "epoch": 0.47783813040032475, "grad_norm": 0.91796875, "learning_rate": 1.0495384615384616e-05, "loss": 0.2738, "step": 3090 }, { "epoch": 0.47938453211683063, "grad_norm": 0.73828125, "learning_rate": 1.0464615384615387e-05, "loss": 0.2912, "step": 3100 }, { "epoch": 0.48093093383333657, "grad_norm": 1.046875, "learning_rate": 1.0433846153846155e-05, "loss": 0.217, "step": 3110 }, { "epoch": 0.48247733554984246, "grad_norm": 0.96875, "learning_rate": 1.0403076923076924e-05, "loss": 0.3397, "step": 3120 }, { "epoch": 0.48402373726634834, "grad_norm": 0.80859375, "learning_rate": 1.0372307692307693e-05, "loss": 0.215, "step": 3130 }, { "epoch": 0.4855701389828543, "grad_norm": 1.125, "learning_rate": 1.0341538461538463e-05, "loss": 0.2587, "step": 3140 }, { "epoch": 0.48711654069936017, "grad_norm": 0.9765625, "learning_rate": 1.031076923076923e-05, "loss": 0.2183, "step": 3150 }, { "epoch": 0.4886629424158661, "grad_norm": 1.140625, "learning_rate": 1.0280000000000002e-05, "loss": 0.2551, "step": 3160 }, { "epoch": 0.490209344132372, "grad_norm": 0.859375, "learning_rate": 1.024923076923077e-05, "loss": 0.2389, "step": 3170 }, { "epoch": 0.4917557458488779, "grad_norm": 0.859375, "learning_rate": 1.0218461538461539e-05, "loss": 0.2774, "step": 3180 }, { "epoch": 0.4933021475653838, "grad_norm": 0.8828125, "learning_rate": 1.018769230769231e-05, "loss": 0.2608, "step": 3190 }, { "epoch": 0.4948485492818897, "grad_norm": 1.0078125, "learning_rate": 1.0156923076923077e-05, "loss": 0.3287, "step": 3200 }, { "epoch": 0.4963949509983956, "grad_norm": 0.80859375, "learning_rate": 1.0126153846153849e-05, "loss": 0.236, "step": 3210 }, { "epoch": 0.49794135271490153, "grad_norm": 1.0, "learning_rate": 1.0095384615384616e-05, "loss": 0.259, "step": 3220 }, { "epoch": 0.4994877544314074, "grad_norm": 0.6796875, "learning_rate": 1.0064615384615386e-05, "loss": 0.2668, "step": 3230 }, { "epoch": 0.5010341561479134, "grad_norm": 0.86328125, "learning_rate": 1.0033846153846153e-05, "loss": 0.3078, "step": 3240 }, { "epoch": 0.5025805578644192, "grad_norm": 1.1328125, "learning_rate": 1.0003076923076924e-05, "loss": 0.2674, "step": 3250 }, { "epoch": 0.5041269595809251, "grad_norm": 1.03125, "learning_rate": 9.972307692307694e-06, "loss": 0.276, "step": 3260 }, { "epoch": 0.505673361297431, "grad_norm": 0.58203125, "learning_rate": 9.941538461538463e-06, "loss": 0.2331, "step": 3270 }, { "epoch": 0.5072197630139369, "grad_norm": 0.7734375, "learning_rate": 9.910769230769231e-06, "loss": 0.2518, "step": 3280 }, { "epoch": 0.5087661647304429, "grad_norm": 0.79296875, "learning_rate": 9.88e-06, "loss": 0.3217, "step": 3290 }, { "epoch": 0.5103125664469488, "grad_norm": 0.78125, "learning_rate": 9.84923076923077e-06, "loss": 0.2582, "step": 3300 }, { "epoch": 0.5118589681634547, "grad_norm": 0.921875, "learning_rate": 9.818461538461539e-06, "loss": 0.2967, "step": 3310 }, { "epoch": 0.5134053698799605, "grad_norm": 0.921875, "learning_rate": 9.787692307692308e-06, "loss": 0.2508, "step": 3320 }, { "epoch": 0.5149517715964664, "grad_norm": 1.3203125, "learning_rate": 9.756923076923078e-06, "loss": 0.247, "step": 3330 }, { "epoch": 0.5164981733129724, "grad_norm": 0.94140625, "learning_rate": 9.726153846153847e-06, "loss": 0.2664, "step": 3340 }, { "epoch": 0.5180445750294783, "grad_norm": 0.96875, "learning_rate": 9.695384615384617e-06, "loss": 0.2963, "step": 3350 }, { "epoch": 0.5195909767459842, "grad_norm": 1.2109375, "learning_rate": 9.664615384615386e-06, "loss": 0.2869, "step": 3360 }, { "epoch": 0.5211373784624901, "grad_norm": 0.796875, "learning_rate": 9.633846153846155e-06, "loss": 0.2125, "step": 3370 }, { "epoch": 0.522683780178996, "grad_norm": 0.83203125, "learning_rate": 9.603076923076923e-06, "loss": 0.2379, "step": 3380 }, { "epoch": 0.5242301818955019, "grad_norm": 1.2578125, "learning_rate": 9.572307692307693e-06, "loss": 0.2897, "step": 3390 }, { "epoch": 0.5257765836120079, "grad_norm": 0.94140625, "learning_rate": 9.541538461538462e-06, "loss": 0.2954, "step": 3400 }, { "epoch": 0.5273229853285137, "grad_norm": 0.75390625, "learning_rate": 9.510769230769231e-06, "loss": 0.257, "step": 3410 }, { "epoch": 0.5288693870450196, "grad_norm": 0.77734375, "learning_rate": 9.48e-06, "loss": 0.3107, "step": 3420 }, { "epoch": 0.5304157887615255, "grad_norm": 0.7890625, "learning_rate": 9.44923076923077e-06, "loss": 0.2178, "step": 3430 }, { "epoch": 0.5319621904780314, "grad_norm": 1.0625, "learning_rate": 9.41846153846154e-06, "loss": 0.2407, "step": 3440 }, { "epoch": 0.5335085921945374, "grad_norm": 1.2421875, "learning_rate": 9.387692307692309e-06, "loss": 0.284, "step": 3450 }, { "epoch": 0.5350549939110433, "grad_norm": 0.8984375, "learning_rate": 9.356923076923078e-06, "loss": 0.2358, "step": 3460 }, { "epoch": 0.5366013956275492, "grad_norm": 1.1328125, "learning_rate": 9.326153846153848e-06, "loss": 0.2455, "step": 3470 }, { "epoch": 0.538147797344055, "grad_norm": 1.171875, "learning_rate": 9.295384615384615e-06, "loss": 0.3416, "step": 3480 }, { "epoch": 0.5396941990605609, "grad_norm": 1.171875, "learning_rate": 9.264615384615385e-06, "loss": 0.2908, "step": 3490 }, { "epoch": 0.5412406007770668, "grad_norm": 0.83984375, "learning_rate": 9.233846153846154e-06, "loss": 0.2648, "step": 3500 }, { "epoch": 0.5427870024935728, "grad_norm": 0.984375, "learning_rate": 9.203076923076924e-06, "loss": 0.2159, "step": 3510 }, { "epoch": 0.5443334042100787, "grad_norm": 1.078125, "learning_rate": 9.172307692307693e-06, "loss": 0.3019, "step": 3520 }, { "epoch": 0.5458798059265846, "grad_norm": 1.4375, "learning_rate": 9.141538461538462e-06, "loss": 0.2886, "step": 3530 }, { "epoch": 0.5474262076430905, "grad_norm": 1.1484375, "learning_rate": 9.110769230769232e-06, "loss": 0.2674, "step": 3540 }, { "epoch": 0.5489726093595964, "grad_norm": 1.015625, "learning_rate": 9.080000000000001e-06, "loss": 0.2809, "step": 3550 }, { "epoch": 0.5505190110761023, "grad_norm": 0.94140625, "learning_rate": 9.04923076923077e-06, "loss": 0.3105, "step": 3560 }, { "epoch": 0.5520654127926082, "grad_norm": 0.859375, "learning_rate": 9.01846153846154e-06, "loss": 0.3115, "step": 3570 }, { "epoch": 0.5536118145091141, "grad_norm": 0.90234375, "learning_rate": 8.987692307692308e-06, "loss": 0.2605, "step": 3580 }, { "epoch": 0.55515821622562, "grad_norm": 0.703125, "learning_rate": 8.956923076923077e-06, "loss": 0.2281, "step": 3590 }, { "epoch": 0.5567046179421259, "grad_norm": 0.953125, "learning_rate": 8.926153846153846e-06, "loss": 0.2732, "step": 3600 }, { "epoch": 0.5582510196586318, "grad_norm": 1.0078125, "learning_rate": 8.895384615384616e-06, "loss": 0.2134, "step": 3610 }, { "epoch": 0.5597974213751378, "grad_norm": 1.140625, "learning_rate": 8.864615384615385e-06, "loss": 0.2788, "step": 3620 }, { "epoch": 0.5613438230916437, "grad_norm": 0.8984375, "learning_rate": 8.833846153846155e-06, "loss": 0.2558, "step": 3630 }, { "epoch": 0.5628902248081495, "grad_norm": 0.7265625, "learning_rate": 8.803076923076924e-06, "loss": 0.2719, "step": 3640 }, { "epoch": 0.5644366265246554, "grad_norm": 1.0703125, "learning_rate": 8.772307692307693e-06, "loss": 0.2596, "step": 3650 }, { "epoch": 0.5659830282411613, "grad_norm": 0.8828125, "learning_rate": 8.741538461538463e-06, "loss": 0.2484, "step": 3660 }, { "epoch": 0.5675294299576673, "grad_norm": 0.734375, "learning_rate": 8.710769230769232e-06, "loss": 0.2734, "step": 3670 }, { "epoch": 0.5690758316741732, "grad_norm": 0.91796875, "learning_rate": 8.68e-06, "loss": 0.2728, "step": 3680 }, { "epoch": 0.5706222333906791, "grad_norm": 0.9765625, "learning_rate": 8.64923076923077e-06, "loss": 0.2746, "step": 3690 }, { "epoch": 0.572168635107185, "grad_norm": 0.94140625, "learning_rate": 8.618461538461539e-06, "loss": 0.2767, "step": 3700 }, { "epoch": 0.5737150368236908, "grad_norm": 0.80859375, "learning_rate": 8.587692307692308e-06, "loss": 0.2798, "step": 3710 }, { "epoch": 0.5752614385401967, "grad_norm": 1.0703125, "learning_rate": 8.556923076923077e-06, "loss": 0.2573, "step": 3720 }, { "epoch": 0.5768078402567027, "grad_norm": 1.1015625, "learning_rate": 8.526153846153847e-06, "loss": 0.2756, "step": 3730 }, { "epoch": 0.5783542419732086, "grad_norm": 1.03125, "learning_rate": 8.495384615384616e-06, "loss": 0.2819, "step": 3740 }, { "epoch": 0.5799006436897145, "grad_norm": 0.8046875, "learning_rate": 8.464615384615386e-06, "loss": 0.22, "step": 3750 }, { "epoch": 0.5814470454062204, "grad_norm": 1.0078125, "learning_rate": 8.433846153846155e-06, "loss": 0.2857, "step": 3760 }, { "epoch": 0.5829934471227263, "grad_norm": 1.0703125, "learning_rate": 8.403076923076924e-06, "loss": 0.2803, "step": 3770 }, { "epoch": 0.5845398488392322, "grad_norm": 1.0625, "learning_rate": 8.372307692307692e-06, "loss": 0.2207, "step": 3780 }, { "epoch": 0.5860862505557382, "grad_norm": 1.375, "learning_rate": 8.341538461538462e-06, "loss": 0.2684, "step": 3790 }, { "epoch": 0.587632652272244, "grad_norm": 0.7578125, "learning_rate": 8.310769230769231e-06, "loss": 0.2353, "step": 3800 }, { "epoch": 0.5891790539887499, "grad_norm": 0.6484375, "learning_rate": 8.28e-06, "loss": 0.3244, "step": 3810 }, { "epoch": 0.5907254557052558, "grad_norm": 1.15625, "learning_rate": 8.24923076923077e-06, "loss": 0.288, "step": 3820 }, { "epoch": 0.5922718574217617, "grad_norm": 0.91015625, "learning_rate": 8.218461538461539e-06, "loss": 0.261, "step": 3830 }, { "epoch": 0.5938182591382677, "grad_norm": 0.78125, "learning_rate": 8.187692307692309e-06, "loss": 0.3277, "step": 3840 }, { "epoch": 0.5953646608547736, "grad_norm": 0.87890625, "learning_rate": 8.156923076923078e-06, "loss": 0.2727, "step": 3850 }, { "epoch": 0.5969110625712795, "grad_norm": 0.80078125, "learning_rate": 8.126153846153847e-06, "loss": 0.2319, "step": 3860 }, { "epoch": 0.5984574642877853, "grad_norm": 0.97265625, "learning_rate": 8.095384615384617e-06, "loss": 0.252, "step": 3870 }, { "epoch": 0.6000038660042912, "grad_norm": 0.96875, "learning_rate": 8.064615384615384e-06, "loss": 0.2683, "step": 3880 }, { "epoch": 0.6015502677207971, "grad_norm": 0.89453125, "learning_rate": 8.033846153846154e-06, "loss": 0.3251, "step": 3890 }, { "epoch": 0.6030966694373031, "grad_norm": 1.015625, "learning_rate": 8.003076923076923e-06, "loss": 0.3153, "step": 3900 }, { "epoch": 0.604643071153809, "grad_norm": 0.9296875, "learning_rate": 7.972307692307693e-06, "loss": 0.3234, "step": 3910 }, { "epoch": 0.6061894728703149, "grad_norm": 0.88671875, "learning_rate": 7.941538461538462e-06, "loss": 0.2812, "step": 3920 }, { "epoch": 0.6077358745868208, "grad_norm": 0.890625, "learning_rate": 7.910769230769231e-06, "loss": 0.2959, "step": 3930 }, { "epoch": 0.6092822763033267, "grad_norm": 0.98046875, "learning_rate": 7.88e-06, "loss": 0.2818, "step": 3940 }, { "epoch": 0.6108286780198326, "grad_norm": 1.15625, "learning_rate": 7.84923076923077e-06, "loss": 0.3152, "step": 3950 }, { "epoch": 0.6123750797363385, "grad_norm": 0.9375, "learning_rate": 7.81846153846154e-06, "loss": 0.2988, "step": 3960 }, { "epoch": 0.6139214814528444, "grad_norm": 0.984375, "learning_rate": 7.787692307692309e-06, "loss": 0.2835, "step": 3970 }, { "epoch": 0.6154678831693503, "grad_norm": 1.359375, "learning_rate": 7.756923076923077e-06, "loss": 0.3486, "step": 3980 }, { "epoch": 0.6170142848858562, "grad_norm": 1.3828125, "learning_rate": 7.726153846153846e-06, "loss": 0.2934, "step": 3990 }, { "epoch": 0.6185606866023621, "grad_norm": 0.92578125, "learning_rate": 7.695384615384615e-06, "loss": 0.2678, "step": 4000 }, { "epoch": 0.6201070883188681, "grad_norm": 0.90625, "learning_rate": 7.664615384615385e-06, "loss": 0.2608, "step": 4010 }, { "epoch": 0.621653490035374, "grad_norm": 1.0703125, "learning_rate": 7.633846153846154e-06, "loss": 0.289, "step": 4020 }, { "epoch": 0.6231998917518798, "grad_norm": 1.421875, "learning_rate": 7.6030769230769245e-06, "loss": 0.2856, "step": 4030 }, { "epoch": 0.6247462934683857, "grad_norm": 0.67578125, "learning_rate": 7.572307692307693e-06, "loss": 0.2569, "step": 4040 }, { "epoch": 0.6262926951848916, "grad_norm": 0.96484375, "learning_rate": 7.5415384615384624e-06, "loss": 0.2727, "step": 4050 }, { "epoch": 0.6278390969013976, "grad_norm": 0.75390625, "learning_rate": 7.510769230769232e-06, "loss": 0.279, "step": 4060 }, { "epoch": 0.6293854986179035, "grad_norm": 0.9453125, "learning_rate": 7.48e-06, "loss": 0.3606, "step": 4070 }, { "epoch": 0.6309319003344094, "grad_norm": 0.7109375, "learning_rate": 7.44923076923077e-06, "loss": 0.2959, "step": 4080 }, { "epoch": 0.6324783020509153, "grad_norm": 1.03125, "learning_rate": 7.418461538461539e-06, "loss": 0.2622, "step": 4090 }, { "epoch": 0.6340247037674211, "grad_norm": 0.8046875, "learning_rate": 7.387692307692308e-06, "loss": 0.2207, "step": 4100 }, { "epoch": 0.635571105483927, "grad_norm": 1.0234375, "learning_rate": 7.356923076923077e-06, "loss": 0.3007, "step": 4110 }, { "epoch": 0.637117507200433, "grad_norm": 1.09375, "learning_rate": 7.326153846153847e-06, "loss": 0.2815, "step": 4120 }, { "epoch": 0.6386639089169389, "grad_norm": 0.9453125, "learning_rate": 7.295384615384617e-06, "loss": 0.2587, "step": 4130 }, { "epoch": 0.6402103106334448, "grad_norm": 0.9453125, "learning_rate": 7.264615384615385e-06, "loss": 0.2999, "step": 4140 }, { "epoch": 0.6417567123499507, "grad_norm": 0.90234375, "learning_rate": 7.233846153846155e-06, "loss": 0.2398, "step": 4150 }, { "epoch": 0.6433031140664566, "grad_norm": 0.98046875, "learning_rate": 7.203076923076924e-06, "loss": 0.2716, "step": 4160 }, { "epoch": 0.6448495157829626, "grad_norm": 1.0234375, "learning_rate": 7.172307692307693e-06, "loss": 0.2222, "step": 4170 }, { "epoch": 0.6463959174994685, "grad_norm": 0.73828125, "learning_rate": 7.141538461538462e-06, "loss": 0.285, "step": 4180 }, { "epoch": 0.6479423192159743, "grad_norm": 0.74609375, "learning_rate": 7.1107692307692314e-06, "loss": 0.3562, "step": 4190 }, { "epoch": 0.6494887209324802, "grad_norm": 0.9921875, "learning_rate": 7.08e-06, "loss": 0.283, "step": 4200 }, { "epoch": 0.6510351226489861, "grad_norm": 0.85546875, "learning_rate": 7.049230769230769e-06, "loss": 0.2915, "step": 4210 }, { "epoch": 0.652581524365492, "grad_norm": 0.765625, "learning_rate": 7.01846153846154e-06, "loss": 0.2424, "step": 4220 }, { "epoch": 0.654127926081998, "grad_norm": 0.81640625, "learning_rate": 6.987692307692309e-06, "loss": 0.2456, "step": 4230 }, { "epoch": 0.6556743277985039, "grad_norm": 0.97265625, "learning_rate": 6.9569230769230776e-06, "loss": 0.2946, "step": 4240 }, { "epoch": 0.6572207295150098, "grad_norm": 0.77734375, "learning_rate": 6.926153846153847e-06, "loss": 0.3338, "step": 4250 }, { "epoch": 0.6587671312315156, "grad_norm": 1.2265625, "learning_rate": 6.895384615384616e-06, "loss": 0.2645, "step": 4260 }, { "epoch": 0.6603135329480215, "grad_norm": 0.96484375, "learning_rate": 6.864615384615385e-06, "loss": 0.2671, "step": 4270 }, { "epoch": 0.6618599346645275, "grad_norm": 1.09375, "learning_rate": 6.833846153846154e-06, "loss": 0.2627, "step": 4280 }, { "epoch": 0.6634063363810334, "grad_norm": 1.2890625, "learning_rate": 6.803076923076924e-06, "loss": 0.2972, "step": 4290 }, { "epoch": 0.6649527380975393, "grad_norm": 0.8359375, "learning_rate": 6.772307692307692e-06, "loss": 0.2637, "step": 4300 }, { "epoch": 0.6664991398140452, "grad_norm": 0.87109375, "learning_rate": 6.741538461538462e-06, "loss": 0.2459, "step": 4310 }, { "epoch": 0.6680455415305511, "grad_norm": 0.93359375, "learning_rate": 6.710769230769232e-06, "loss": 0.3008, "step": 4320 }, { "epoch": 0.669591943247057, "grad_norm": 1.109375, "learning_rate": 6.680000000000001e-06, "loss": 0.2881, "step": 4330 }, { "epoch": 0.671138344963563, "grad_norm": 0.67578125, "learning_rate": 6.64923076923077e-06, "loss": 0.2887, "step": 4340 }, { "epoch": 0.6726847466800688, "grad_norm": 0.875, "learning_rate": 6.618461538461539e-06, "loss": 0.3097, "step": 4350 }, { "epoch": 0.6742311483965747, "grad_norm": 0.8671875, "learning_rate": 6.587692307692309e-06, "loss": 0.2623, "step": 4360 }, { "epoch": 0.6757775501130806, "grad_norm": 0.8984375, "learning_rate": 6.556923076923077e-06, "loss": 0.2589, "step": 4370 }, { "epoch": 0.6773239518295865, "grad_norm": 0.77734375, "learning_rate": 6.5261538461538465e-06, "loss": 0.2149, "step": 4380 }, { "epoch": 0.6788703535460925, "grad_norm": 0.9453125, "learning_rate": 6.495384615384616e-06, "loss": 0.22, "step": 4390 }, { "epoch": 0.6804167552625984, "grad_norm": 0.98828125, "learning_rate": 6.4646153846153845e-06, "loss": 0.2636, "step": 4400 }, { "epoch": 0.6819631569791043, "grad_norm": 1.0234375, "learning_rate": 6.433846153846154e-06, "loss": 0.3088, "step": 4410 }, { "epoch": 0.6835095586956101, "grad_norm": 0.92578125, "learning_rate": 6.403076923076924e-06, "loss": 0.2378, "step": 4420 }, { "epoch": 0.685055960412116, "grad_norm": 0.91796875, "learning_rate": 6.3723076923076935e-06, "loss": 0.2405, "step": 4430 }, { "epoch": 0.6866023621286219, "grad_norm": 0.83203125, "learning_rate": 6.341538461538462e-06, "loss": 0.2825, "step": 4440 }, { "epoch": 0.6881487638451279, "grad_norm": 0.9296875, "learning_rate": 6.3107692307692315e-06, "loss": 0.3513, "step": 4450 }, { "epoch": 0.6896951655616338, "grad_norm": 1.078125, "learning_rate": 6.280000000000001e-06, "loss": 0.3271, "step": 4460 }, { "epoch": 0.6912415672781397, "grad_norm": 0.82421875, "learning_rate": 6.249230769230769e-06, "loss": 0.2742, "step": 4470 }, { "epoch": 0.6927879689946456, "grad_norm": 0.9921875, "learning_rate": 6.218461538461539e-06, "loss": 0.2642, "step": 4480 }, { "epoch": 0.6943343707111514, "grad_norm": 0.96875, "learning_rate": 6.187692307692308e-06, "loss": 0.2364, "step": 4490 }, { "epoch": 0.6958807724276574, "grad_norm": 1.1875, "learning_rate": 6.156923076923077e-06, "loss": 0.2883, "step": 4500 }, { "epoch": 0.6974271741441633, "grad_norm": 1.03125, "learning_rate": 6.126153846153846e-06, "loss": 0.2219, "step": 4510 }, { "epoch": 0.6989735758606692, "grad_norm": 0.9453125, "learning_rate": 6.095384615384616e-06, "loss": 0.2639, "step": 4520 }, { "epoch": 0.7005199775771751, "grad_norm": 0.80859375, "learning_rate": 6.064615384615386e-06, "loss": 0.2764, "step": 4530 }, { "epoch": 0.702066379293681, "grad_norm": 1.0390625, "learning_rate": 6.033846153846154e-06, "loss": 0.3097, "step": 4540 }, { "epoch": 0.7036127810101869, "grad_norm": 0.890625, "learning_rate": 6.003076923076924e-06, "loss": 0.2262, "step": 4550 }, { "epoch": 0.7051591827266929, "grad_norm": 0.82421875, "learning_rate": 5.972307692307693e-06, "loss": 0.2537, "step": 4560 }, { "epoch": 0.7067055844431988, "grad_norm": 0.7734375, "learning_rate": 5.941538461538462e-06, "loss": 0.347, "step": 4570 }, { "epoch": 0.7082519861597046, "grad_norm": 1.4140625, "learning_rate": 5.910769230769231e-06, "loss": 0.3193, "step": 4580 }, { "epoch": 0.7097983878762105, "grad_norm": 0.75, "learning_rate": 5.8800000000000005e-06, "loss": 0.273, "step": 4590 }, { "epoch": 0.7113447895927164, "grad_norm": 0.921875, "learning_rate": 5.849230769230769e-06, "loss": 0.2902, "step": 4600 }, { "epoch": 0.7128911913092224, "grad_norm": 0.8046875, "learning_rate": 5.818461538461538e-06, "loss": 0.3653, "step": 4610 }, { "epoch": 0.7144375930257283, "grad_norm": 0.76171875, "learning_rate": 5.787692307692309e-06, "loss": 0.3106, "step": 4620 }, { "epoch": 0.7159839947422342, "grad_norm": 0.953125, "learning_rate": 5.756923076923078e-06, "loss": 0.2368, "step": 4630 }, { "epoch": 0.7175303964587401, "grad_norm": 0.8203125, "learning_rate": 5.726153846153847e-06, "loss": 0.249, "step": 4640 }, { "epoch": 0.719076798175246, "grad_norm": 1.1328125, "learning_rate": 5.695384615384616e-06, "loss": 0.3709, "step": 4650 }, { "epoch": 0.7206231998917518, "grad_norm": 0.88671875, "learning_rate": 5.664615384615385e-06, "loss": 0.2921, "step": 4660 }, { "epoch": 0.7221696016082578, "grad_norm": 1.015625, "learning_rate": 5.633846153846154e-06, "loss": 0.3115, "step": 4670 }, { "epoch": 0.7237160033247637, "grad_norm": 0.87890625, "learning_rate": 5.603076923076923e-06, "loss": 0.2479, "step": 4680 }, { "epoch": 0.7252624050412696, "grad_norm": 1.046875, "learning_rate": 5.572307692307693e-06, "loss": 0.2297, "step": 4690 }, { "epoch": 0.7268088067577755, "grad_norm": 0.89453125, "learning_rate": 5.541538461538461e-06, "loss": 0.2454, "step": 4700 }, { "epoch": 0.7283552084742814, "grad_norm": 0.79296875, "learning_rate": 5.5107692307692315e-06, "loss": 0.2849, "step": 4710 }, { "epoch": 0.7299016101907874, "grad_norm": 0.74609375, "learning_rate": 5.480000000000001e-06, "loss": 0.2797, "step": 4720 }, { "epoch": 0.7314480119072932, "grad_norm": 1.0234375, "learning_rate": 5.44923076923077e-06, "loss": 0.3882, "step": 4730 }, { "epoch": 0.7329944136237991, "grad_norm": 0.90234375, "learning_rate": 5.418461538461539e-06, "loss": 0.2509, "step": 4740 }, { "epoch": 0.734540815340305, "grad_norm": 0.81640625, "learning_rate": 5.387692307692308e-06, "loss": 0.2408, "step": 4750 }, { "epoch": 0.7360872170568109, "grad_norm": 0.8828125, "learning_rate": 5.356923076923078e-06, "loss": 0.2413, "step": 4760 }, { "epoch": 0.7376336187733168, "grad_norm": 0.7890625, "learning_rate": 5.326153846153846e-06, "loss": 0.2432, "step": 4770 }, { "epoch": 0.7391800204898228, "grad_norm": 0.9921875, "learning_rate": 5.2953846153846156e-06, "loss": 0.277, "step": 4780 }, { "epoch": 0.7407264222063287, "grad_norm": 1.15625, "learning_rate": 5.264615384615385e-06, "loss": 0.2486, "step": 4790 }, { "epoch": 0.7422728239228346, "grad_norm": 0.8203125, "learning_rate": 5.2338461538461535e-06, "loss": 0.3, "step": 4800 }, { "epoch": 0.7438192256393404, "grad_norm": 0.87109375, "learning_rate": 5.203076923076924e-06, "loss": 0.26, "step": 4810 }, { "epoch": 0.7453656273558463, "grad_norm": 1.328125, "learning_rate": 5.172307692307693e-06, "loss": 0.2937, "step": 4820 }, { "epoch": 0.7469120290723523, "grad_norm": 0.9453125, "learning_rate": 5.1415384615384625e-06, "loss": 0.3057, "step": 4830 }, { "epoch": 0.7484584307888582, "grad_norm": 0.8828125, "learning_rate": 5.110769230769231e-06, "loss": 0.3284, "step": 4840 }, { "epoch": 0.7500048325053641, "grad_norm": 0.98046875, "learning_rate": 5.0800000000000005e-06, "loss": 0.2434, "step": 4850 }, { "epoch": 0.75155123422187, "grad_norm": 0.97265625, "learning_rate": 5.04923076923077e-06, "loss": 0.257, "step": 4860 }, { "epoch": 0.7530976359383759, "grad_norm": 0.91015625, "learning_rate": 5.0184615384615384e-06, "loss": 0.2714, "step": 4870 }, { "epoch": 0.7546440376548817, "grad_norm": 0.609375, "learning_rate": 4.987692307692308e-06, "loss": 0.2182, "step": 4880 }, { "epoch": 0.7561904393713877, "grad_norm": 0.85546875, "learning_rate": 4.956923076923077e-06, "loss": 0.2855, "step": 4890 }, { "epoch": 0.7577368410878936, "grad_norm": 1.0703125, "learning_rate": 4.926153846153847e-06, "loss": 0.2774, "step": 4900 }, { "epoch": 0.7592832428043995, "grad_norm": 0.86328125, "learning_rate": 4.895384615384616e-06, "loss": 0.2489, "step": 4910 }, { "epoch": 0.7608296445209054, "grad_norm": 1.1328125, "learning_rate": 4.8646153846153846e-06, "loss": 0.3157, "step": 4920 }, { "epoch": 0.7623760462374113, "grad_norm": 0.9609375, "learning_rate": 4.833846153846154e-06, "loss": 0.2704, "step": 4930 }, { "epoch": 0.7639224479539173, "grad_norm": 0.82421875, "learning_rate": 4.803076923076923e-06, "loss": 0.2995, "step": 4940 }, { "epoch": 0.7654688496704232, "grad_norm": 0.98046875, "learning_rate": 4.772307692307693e-06, "loss": 0.2422, "step": 4950 }, { "epoch": 0.767015251386929, "grad_norm": 1.296875, "learning_rate": 4.741538461538462e-06, "loss": 0.2692, "step": 4960 }, { "epoch": 0.7685616531034349, "grad_norm": 1.015625, "learning_rate": 4.710769230769231e-06, "loss": 0.2704, "step": 4970 }, { "epoch": 0.7701080548199408, "grad_norm": 0.85546875, "learning_rate": 4.680000000000001e-06, "loss": 0.3147, "step": 4980 }, { "epoch": 0.7716544565364467, "grad_norm": 0.9609375, "learning_rate": 4.6492307692307695e-06, "loss": 0.2867, "step": 4990 }, { "epoch": 0.7732008582529527, "grad_norm": 1.09375, "learning_rate": 4.618461538461539e-06, "loss": 0.2896, "step": 5000 }, { "epoch": 0.7747472599694586, "grad_norm": 1.0546875, "learning_rate": 4.587692307692308e-06, "loss": 0.2335, "step": 5010 }, { "epoch": 0.7762936616859645, "grad_norm": 1.0390625, "learning_rate": 4.556923076923077e-06, "loss": 0.2441, "step": 5020 }, { "epoch": 0.7778400634024704, "grad_norm": 1.4453125, "learning_rate": 4.526153846153847e-06, "loss": 0.3049, "step": 5030 }, { "epoch": 0.7793864651189762, "grad_norm": 1.09375, "learning_rate": 4.495384615384616e-06, "loss": 0.2605, "step": 5040 }, { "epoch": 0.7809328668354822, "grad_norm": 1.2265625, "learning_rate": 4.464615384615385e-06, "loss": 0.2876, "step": 5050 }, { "epoch": 0.7824792685519881, "grad_norm": 1.09375, "learning_rate": 4.433846153846154e-06, "loss": 0.3434, "step": 5060 }, { "epoch": 0.784025670268494, "grad_norm": 1.046875, "learning_rate": 4.403076923076923e-06, "loss": 0.2956, "step": 5070 }, { "epoch": 0.7855720719849999, "grad_norm": 1.0078125, "learning_rate": 4.372307692307693e-06, "loss": 0.3175, "step": 5080 }, { "epoch": 0.7871184737015058, "grad_norm": 1.1015625, "learning_rate": 4.341538461538462e-06, "loss": 0.2914, "step": 5090 }, { "epoch": 0.7886648754180117, "grad_norm": 1.125, "learning_rate": 4.310769230769231e-06, "loss": 0.2657, "step": 5100 }, { "epoch": 0.7902112771345177, "grad_norm": 0.9453125, "learning_rate": 4.2800000000000005e-06, "loss": 0.3168, "step": 5110 }, { "epoch": 0.7917576788510235, "grad_norm": 1.0390625, "learning_rate": 4.249230769230769e-06, "loss": 0.2422, "step": 5120 }, { "epoch": 0.7933040805675294, "grad_norm": 0.8125, "learning_rate": 4.218461538461539e-06, "loss": 0.2651, "step": 5130 }, { "epoch": 0.7948504822840353, "grad_norm": 0.984375, "learning_rate": 4.187692307692308e-06, "loss": 0.245, "step": 5140 }, { "epoch": 0.7963968840005412, "grad_norm": 0.82421875, "learning_rate": 4.156923076923077e-06, "loss": 0.3055, "step": 5150 }, { "epoch": 0.7979432857170471, "grad_norm": 1.015625, "learning_rate": 4.126153846153847e-06, "loss": 0.2992, "step": 5160 }, { "epoch": 0.7994896874335531, "grad_norm": 0.796875, "learning_rate": 4.095384615384615e-06, "loss": 0.3123, "step": 5170 }, { "epoch": 0.801036089150059, "grad_norm": 1.1796875, "learning_rate": 4.0646153846153854e-06, "loss": 0.2849, "step": 5180 }, { "epoch": 0.8025824908665649, "grad_norm": 0.84765625, "learning_rate": 4.033846153846154e-06, "loss": 0.317, "step": 5190 }, { "epoch": 0.8041288925830707, "grad_norm": 0.88671875, "learning_rate": 4.003076923076923e-06, "loss": 0.2567, "step": 5200 }, { "epoch": 0.8056752942995766, "grad_norm": 1.109375, "learning_rate": 3.972307692307693e-06, "loss": 0.2918, "step": 5210 }, { "epoch": 0.8072216960160826, "grad_norm": 0.9765625, "learning_rate": 3.941538461538461e-06, "loss": 0.3973, "step": 5220 }, { "epoch": 0.8087680977325885, "grad_norm": 1.1015625, "learning_rate": 3.9107692307692316e-06, "loss": 0.3034, "step": 5230 }, { "epoch": 0.8103144994490944, "grad_norm": 1.0546875, "learning_rate": 3.88e-06, "loss": 0.2369, "step": 5240 }, { "epoch": 0.8118609011656003, "grad_norm": 0.8125, "learning_rate": 3.8492307692307695e-06, "loss": 0.261, "step": 5250 }, { "epoch": 0.8134073028821062, "grad_norm": 1.015625, "learning_rate": 3.818461538461539e-06, "loss": 0.2657, "step": 5260 }, { "epoch": 0.814953704598612, "grad_norm": 0.91796875, "learning_rate": 3.787692307692308e-06, "loss": 0.2336, "step": 5270 }, { "epoch": 0.816500106315118, "grad_norm": 0.83984375, "learning_rate": 3.7569230769230773e-06, "loss": 0.2683, "step": 5280 }, { "epoch": 0.8180465080316239, "grad_norm": 1.09375, "learning_rate": 3.7261538461538467e-06, "loss": 0.2703, "step": 5290 }, { "epoch": 0.8195929097481298, "grad_norm": 1.28125, "learning_rate": 3.6953846153846156e-06, "loss": 0.2907, "step": 5300 }, { "epoch": 0.8211393114646357, "grad_norm": 0.95703125, "learning_rate": 3.6646153846153846e-06, "loss": 0.3177, "step": 5310 }, { "epoch": 0.8226857131811416, "grad_norm": 0.796875, "learning_rate": 3.633846153846154e-06, "loss": 0.3023, "step": 5320 }, { "epoch": 0.8242321148976476, "grad_norm": 0.8359375, "learning_rate": 3.6030769230769234e-06, "loss": 0.2169, "step": 5330 }, { "epoch": 0.8257785166141535, "grad_norm": 0.7890625, "learning_rate": 3.572307692307693e-06, "loss": 0.24, "step": 5340 }, { "epoch": 0.8273249183306594, "grad_norm": 1.0546875, "learning_rate": 3.5415384615384618e-06, "loss": 0.3394, "step": 5350 }, { "epoch": 0.8288713200471652, "grad_norm": 1.1875, "learning_rate": 3.5107692307692307e-06, "loss": 0.2527, "step": 5360 }, { "epoch": 0.8304177217636711, "grad_norm": 0.69921875, "learning_rate": 3.48e-06, "loss": 0.2447, "step": 5370 }, { "epoch": 0.831964123480177, "grad_norm": 1.0703125, "learning_rate": 3.4492307692307695e-06, "loss": 0.2509, "step": 5380 }, { "epoch": 0.833510525196683, "grad_norm": 0.80859375, "learning_rate": 3.418461538461539e-06, "loss": 0.3633, "step": 5390 }, { "epoch": 0.8350569269131889, "grad_norm": 0.69921875, "learning_rate": 3.387692307692308e-06, "loss": 0.3206, "step": 5400 }, { "epoch": 0.8366033286296948, "grad_norm": 1.5078125, "learning_rate": 3.356923076923077e-06, "loss": 0.3542, "step": 5410 }, { "epoch": 0.8381497303462007, "grad_norm": 1.0234375, "learning_rate": 3.3261538461538463e-06, "loss": 0.2731, "step": 5420 }, { "epoch": 0.8396961320627065, "grad_norm": 0.92578125, "learning_rate": 3.2953846153846157e-06, "loss": 0.3256, "step": 5430 }, { "epoch": 0.8412425337792125, "grad_norm": 0.8359375, "learning_rate": 3.264615384615385e-06, "loss": 0.2471, "step": 5440 }, { "epoch": 0.8427889354957184, "grad_norm": 0.9453125, "learning_rate": 3.233846153846154e-06, "loss": 0.2755, "step": 5450 }, { "epoch": 0.8443353372122243, "grad_norm": 1.046875, "learning_rate": 3.203076923076923e-06, "loss": 0.3139, "step": 5460 }, { "epoch": 0.8458817389287302, "grad_norm": 0.98046875, "learning_rate": 3.1723076923076924e-06, "loss": 0.2722, "step": 5470 }, { "epoch": 0.8474281406452361, "grad_norm": 0.828125, "learning_rate": 3.141538461538462e-06, "loss": 0.3058, "step": 5480 }, { "epoch": 0.848974542361742, "grad_norm": 1.03125, "learning_rate": 3.110769230769231e-06, "loss": 0.2424, "step": 5490 }, { "epoch": 0.850520944078248, "grad_norm": 0.98828125, "learning_rate": 3.08e-06, "loss": 0.2752, "step": 5500 }, { "epoch": 0.8520673457947538, "grad_norm": 0.6640625, "learning_rate": 3.049230769230769e-06, "loss": 0.2309, "step": 5510 }, { "epoch": 0.8536137475112597, "grad_norm": 0.85546875, "learning_rate": 3.0184615384615385e-06, "loss": 0.33, "step": 5520 }, { "epoch": 0.8551601492277656, "grad_norm": 0.765625, "learning_rate": 2.987692307692308e-06, "loss": 0.2942, "step": 5530 }, { "epoch": 0.8567065509442715, "grad_norm": 0.76171875, "learning_rate": 2.9569230769230773e-06, "loss": 0.3103, "step": 5540 }, { "epoch": 0.8582529526607775, "grad_norm": 0.91796875, "learning_rate": 2.9261538461538463e-06, "loss": 0.2775, "step": 5550 }, { "epoch": 0.8597993543772834, "grad_norm": 1.0234375, "learning_rate": 2.8953846153846153e-06, "loss": 0.2941, "step": 5560 }, { "epoch": 0.8613457560937893, "grad_norm": 0.72265625, "learning_rate": 2.8646153846153847e-06, "loss": 0.2591, "step": 5570 }, { "epoch": 0.8628921578102952, "grad_norm": 1.0390625, "learning_rate": 2.833846153846154e-06, "loss": 0.2801, "step": 5580 }, { "epoch": 0.864438559526801, "grad_norm": 0.859375, "learning_rate": 2.8030769230769234e-06, "loss": 0.3041, "step": 5590 }, { "epoch": 0.8659849612433069, "grad_norm": 1.2265625, "learning_rate": 2.7723076923076924e-06, "loss": 0.2866, "step": 5600 }, { "epoch": 0.8675313629598129, "grad_norm": 0.99609375, "learning_rate": 2.7415384615384614e-06, "loss": 0.3128, "step": 5610 }, { "epoch": 0.8690777646763188, "grad_norm": 1.0390625, "learning_rate": 2.710769230769231e-06, "loss": 0.3121, "step": 5620 }, { "epoch": 0.8706241663928247, "grad_norm": 0.7578125, "learning_rate": 2.68e-06, "loss": 0.2264, "step": 5630 }, { "epoch": 0.8721705681093306, "grad_norm": 1.0390625, "learning_rate": 2.6492307692307696e-06, "loss": 0.2619, "step": 5640 }, { "epoch": 0.8737169698258365, "grad_norm": 0.70703125, "learning_rate": 2.6184615384615385e-06, "loss": 0.2631, "step": 5650 }, { "epoch": 0.8752633715423425, "grad_norm": 0.9765625, "learning_rate": 2.587692307692308e-06, "loss": 0.2636, "step": 5660 }, { "epoch": 0.8768097732588483, "grad_norm": 1.03125, "learning_rate": 2.5569230769230773e-06, "loss": 0.3569, "step": 5670 }, { "epoch": 0.8783561749753542, "grad_norm": 0.76953125, "learning_rate": 2.5261538461538463e-06, "loss": 0.2297, "step": 5680 }, { "epoch": 0.8799025766918601, "grad_norm": 0.89453125, "learning_rate": 2.4953846153846157e-06, "loss": 0.2181, "step": 5690 }, { "epoch": 0.881448978408366, "grad_norm": 1.3359375, "learning_rate": 2.4646153846153847e-06, "loss": 0.3117, "step": 5700 }, { "epoch": 0.8829953801248719, "grad_norm": 0.9296875, "learning_rate": 2.433846153846154e-06, "loss": 0.3071, "step": 5710 }, { "epoch": 0.8845417818413779, "grad_norm": 0.828125, "learning_rate": 2.4030769230769235e-06, "loss": 0.2599, "step": 5720 }, { "epoch": 0.8860881835578838, "grad_norm": 1.0234375, "learning_rate": 2.3723076923076924e-06, "loss": 0.265, "step": 5730 }, { "epoch": 0.8876345852743897, "grad_norm": 1.0078125, "learning_rate": 2.341538461538462e-06, "loss": 0.2922, "step": 5740 }, { "epoch": 0.8891809869908955, "grad_norm": 0.984375, "learning_rate": 2.310769230769231e-06, "loss": 0.3616, "step": 5750 }, { "epoch": 0.8907273887074014, "grad_norm": 1.2578125, "learning_rate": 2.28e-06, "loss": 0.2587, "step": 5760 }, { "epoch": 0.8922737904239074, "grad_norm": 0.7890625, "learning_rate": 2.2492307692307696e-06, "loss": 0.335, "step": 5770 }, { "epoch": 0.8938201921404133, "grad_norm": 0.9375, "learning_rate": 2.218461538461539e-06, "loss": 0.288, "step": 5780 }, { "epoch": 0.8953665938569192, "grad_norm": 0.921875, "learning_rate": 2.187692307692308e-06, "loss": 0.2932, "step": 5790 }, { "epoch": 0.8969129955734251, "grad_norm": 1.109375, "learning_rate": 2.156923076923077e-06, "loss": 0.282, "step": 5800 }, { "epoch": 0.898459397289931, "grad_norm": 0.8671875, "learning_rate": 2.1261538461538463e-06, "loss": 0.2073, "step": 5810 }, { "epoch": 0.9000057990064368, "grad_norm": 0.87890625, "learning_rate": 2.0953846153846157e-06, "loss": 0.2583, "step": 5820 }, { "epoch": 0.9015522007229428, "grad_norm": 1.0390625, "learning_rate": 2.064615384615385e-06, "loss": 0.2805, "step": 5830 }, { "epoch": 0.9030986024394487, "grad_norm": 0.828125, "learning_rate": 2.033846153846154e-06, "loss": 0.2416, "step": 5840 }, { "epoch": 0.9046450041559546, "grad_norm": 0.7890625, "learning_rate": 2.003076923076923e-06, "loss": 0.2826, "step": 5850 }, { "epoch": 0.9061914058724605, "grad_norm": 0.9375, "learning_rate": 1.9723076923076924e-06, "loss": 0.3072, "step": 5860 }, { "epoch": 0.9077378075889664, "grad_norm": 1.171875, "learning_rate": 1.941538461538462e-06, "loss": 0.3551, "step": 5870 }, { "epoch": 0.9092842093054724, "grad_norm": 0.84765625, "learning_rate": 1.9107692307692312e-06, "loss": 0.3224, "step": 5880 }, { "epoch": 0.9108306110219783, "grad_norm": 0.890625, "learning_rate": 1.8800000000000002e-06, "loss": 0.2501, "step": 5890 }, { "epoch": 0.9123770127384841, "grad_norm": 0.671875, "learning_rate": 1.8492307692307692e-06, "loss": 0.2555, "step": 5900 }, { "epoch": 0.91392341445499, "grad_norm": 0.8671875, "learning_rate": 1.8184615384615386e-06, "loss": 0.329, "step": 5910 }, { "epoch": 0.9154698161714959, "grad_norm": 0.953125, "learning_rate": 1.7876923076923078e-06, "loss": 0.3193, "step": 5920 }, { "epoch": 0.9170162178880018, "grad_norm": 1.2265625, "learning_rate": 1.7569230769230772e-06, "loss": 0.3162, "step": 5930 }, { "epoch": 0.9185626196045078, "grad_norm": 0.87109375, "learning_rate": 1.7261538461538463e-06, "loss": 0.29, "step": 5940 }, { "epoch": 0.9201090213210137, "grad_norm": 0.859375, "learning_rate": 1.6953846153846153e-06, "loss": 0.3122, "step": 5950 }, { "epoch": 0.9216554230375196, "grad_norm": 0.75, "learning_rate": 1.6646153846153847e-06, "loss": 0.2374, "step": 5960 }, { "epoch": 0.9232018247540255, "grad_norm": 0.80859375, "learning_rate": 1.6338461538461539e-06, "loss": 0.2562, "step": 5970 }, { "epoch": 0.9247482264705313, "grad_norm": 1.0859375, "learning_rate": 1.6030769230769233e-06, "loss": 0.2854, "step": 5980 }, { "epoch": 0.9262946281870373, "grad_norm": 1.1015625, "learning_rate": 1.5723076923076925e-06, "loss": 0.3549, "step": 5990 }, { "epoch": 0.9278410299035432, "grad_norm": 1.1953125, "learning_rate": 1.5415384615384614e-06, "loss": 0.4152, "step": 6000 } ], "logging_steps": 10, "max_steps": 6500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3789563231171932e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }