{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.9523809523809526, "eval_steps": 500, "global_step": 1256, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.031746031746031744, "grad_norm": 0.557715117931366, "learning_rate": 1.984126984126984e-06, "loss": 3.7156, "step": 5 }, { "epoch": 0.06349206349206349, "grad_norm": 0.6210838556289673, "learning_rate": 3.968253968253968e-06, "loss": 3.9392, "step": 10 }, { "epoch": 0.09523809523809523, "grad_norm": 0.574651837348938, "learning_rate": 5.9523809523809525e-06, "loss": 3.9172, "step": 15 }, { "epoch": 0.12698412698412698, "grad_norm": 0.49391403794288635, "learning_rate": 7.936507936507936e-06, "loss": 3.7298, "step": 20 }, { "epoch": 0.15873015873015872, "grad_norm": 0.6184353232383728, "learning_rate": 9.92063492063492e-06, "loss": 3.8158, "step": 25 }, { "epoch": 0.19047619047619047, "grad_norm": 0.880232572555542, "learning_rate": 1.1904761904761905e-05, "loss": 3.7316, "step": 30 }, { "epoch": 0.2222222222222222, "grad_norm": 0.7022971510887146, "learning_rate": 1.388888888888889e-05, "loss": 3.8494, "step": 35 }, { "epoch": 0.25396825396825395, "grad_norm": 0.8043017387390137, "learning_rate": 1.5873015873015872e-05, "loss": 3.8021, "step": 40 }, { "epoch": 0.2857142857142857, "grad_norm": 0.975318431854248, "learning_rate": 1.785714285714286e-05, "loss": 3.5457, "step": 45 }, { "epoch": 0.31746031746031744, "grad_norm": 0.9920660853385925, "learning_rate": 1.984126984126984e-05, "loss": 3.6339, "step": 50 }, { "epoch": 0.3492063492063492, "grad_norm": 1.0266857147216797, "learning_rate": 2.1825396825396827e-05, "loss": 3.3562, "step": 55 }, { "epoch": 0.38095238095238093, "grad_norm": 1.0816854238510132, "learning_rate": 2.380952380952381e-05, "loss": 3.0271, "step": 60 }, { "epoch": 0.4126984126984127, "grad_norm": 0.9915212988853455, "learning_rate": 2.5793650793650796e-05, "loss": 2.5454, "step": 65 }, { "epoch": 0.4444444444444444, "grad_norm": 1.2306663990020752, "learning_rate": 2.777777777777778e-05, "loss": 2.4261, "step": 70 }, { "epoch": 0.47619047619047616, "grad_norm": 1.2502392530441284, "learning_rate": 2.9761904761904762e-05, "loss": 2.0371, "step": 75 }, { "epoch": 0.5079365079365079, "grad_norm": 1.5857068300247192, "learning_rate": 3.1746031746031745e-05, "loss": 1.9495, "step": 80 }, { "epoch": 0.5396825396825397, "grad_norm": 1.2740144729614258, "learning_rate": 3.3730158730158734e-05, "loss": 1.8084, "step": 85 }, { "epoch": 0.5714285714285714, "grad_norm": 1.813217043876648, "learning_rate": 3.571428571428572e-05, "loss": 1.4826, "step": 90 }, { "epoch": 0.6031746031746031, "grad_norm": 1.1831731796264648, "learning_rate": 3.76984126984127e-05, "loss": 1.1703, "step": 95 }, { "epoch": 0.6349206349206349, "grad_norm": 2.0489368438720703, "learning_rate": 3.968253968253968e-05, "loss": 1.2717, "step": 100 }, { "epoch": 0.6666666666666666, "grad_norm": 1.4326945543289185, "learning_rate": 4.166666666666667e-05, "loss": 1.0786, "step": 105 }, { "epoch": 0.6984126984126984, "grad_norm": 1.4672375917434692, "learning_rate": 4.3650793650793655e-05, "loss": 1.2461, "step": 110 }, { "epoch": 0.7301587301587301, "grad_norm": 1.7501994371414185, "learning_rate": 4.563492063492064e-05, "loss": 1.3453, "step": 115 }, { "epoch": 0.7619047619047619, "grad_norm": 1.3067994117736816, "learning_rate": 4.761904761904762e-05, "loss": 1.058, "step": 120 }, { "epoch": 0.7936507936507936, "grad_norm": 1.3366992473602295, "learning_rate": 4.960317460317461e-05, "loss": 1.2212, "step": 125 }, { "epoch": 0.8253968253968254, "grad_norm": 1.0570944547653198, "learning_rate": 4.9998454146340764e-05, "loss": 1.014, "step": 130 }, { "epoch": 0.8571428571428571, "grad_norm": 1.225695252418518, "learning_rate": 4.999217444349398e-05, "loss": 1.2156, "step": 135 }, { "epoch": 0.8888888888888888, "grad_norm": 1.4539202451705933, "learning_rate": 4.998106548810312e-05, "loss": 1.2021, "step": 140 }, { "epoch": 0.9206349206349206, "grad_norm": 1.0543771982192993, "learning_rate": 4.996512942675816e-05, "loss": 0.9998, "step": 145 }, { "epoch": 0.9523809523809523, "grad_norm": 1.205698013305664, "learning_rate": 4.99443693387936e-05, "loss": 0.8978, "step": 150 }, { "epoch": 0.9841269841269841, "grad_norm": 1.279111385345459, "learning_rate": 4.991878923569342e-05, "loss": 1.0797, "step": 155 }, { "epoch": 1.0126984126984127, "grad_norm": 1.5468761920928955, "learning_rate": 4.9888394060315975e-05, "loss": 1.0614, "step": 160 }, { "epoch": 1.0444444444444445, "grad_norm": 1.3189973831176758, "learning_rate": 4.9853189685938837e-05, "loss": 0.8396, "step": 165 }, { "epoch": 1.0761904761904761, "grad_norm": 1.0562372207641602, "learning_rate": 4.981318291512396e-05, "loss": 0.8926, "step": 170 }, { "epoch": 1.107936507936508, "grad_norm": 0.9939103722572327, "learning_rate": 4.976838147840314e-05, "loss": 0.9728, "step": 175 }, { "epoch": 1.1396825396825396, "grad_norm": 1.6342549324035645, "learning_rate": 4.971879403278432e-05, "loss": 0.8781, "step": 180 }, { "epoch": 1.1714285714285715, "grad_norm": 1.3527770042419434, "learning_rate": 4.966443016007873e-05, "loss": 1.0276, "step": 185 }, { "epoch": 1.2031746031746031, "grad_norm": 1.2593631744384766, "learning_rate": 4.960530036504942e-05, "loss": 0.9742, "step": 190 }, { "epoch": 1.234920634920635, "grad_norm": 1.2724119424819946, "learning_rate": 4.9541416073381395e-05, "loss": 0.7572, "step": 195 }, { "epoch": 1.2666666666666666, "grad_norm": 1.2241063117980957, "learning_rate": 4.947278962947387e-05, "loss": 0.8872, "step": 200 }, { "epoch": 1.2984126984126985, "grad_norm": 1.0864907503128052, "learning_rate": 4.9399434294054894e-05, "loss": 0.9463, "step": 205 }, { "epoch": 1.33015873015873, "grad_norm": 1.2503291368484497, "learning_rate": 4.9321364241618994e-05, "loss": 0.8187, "step": 210 }, { "epoch": 1.361904761904762, "grad_norm": 1.2996984720230103, "learning_rate": 4.9238594557688265e-05, "loss": 0.9949, "step": 215 }, { "epoch": 1.3936507936507936, "grad_norm": 1.2346172332763672, "learning_rate": 4.9151141235897324e-05, "loss": 0.8104, "step": 220 }, { "epoch": 1.4253968253968254, "grad_norm": 1.3184157609939575, "learning_rate": 4.905902117490291e-05, "loss": 0.9109, "step": 225 }, { "epoch": 1.457142857142857, "grad_norm": 1.4319523572921753, "learning_rate": 4.89622521751185e-05, "loss": 0.9854, "step": 230 }, { "epoch": 1.488888888888889, "grad_norm": 1.458723545074463, "learning_rate": 4.886085293527474e-05, "loss": 0.9916, "step": 235 }, { "epoch": 1.5206349206349206, "grad_norm": 1.2609443664550781, "learning_rate": 4.8754843048806296e-05, "loss": 0.9446, "step": 240 }, { "epoch": 1.5523809523809524, "grad_norm": 1.1162338256835938, "learning_rate": 4.864424300006579e-05, "loss": 0.746, "step": 245 }, { "epoch": 1.5841269841269843, "grad_norm": 1.4760407209396362, "learning_rate": 4.852907416036559e-05, "loss": 0.8856, "step": 250 }, { "epoch": 1.615873015873016, "grad_norm": 1.4606406688690186, "learning_rate": 4.8409358783848216e-05, "loss": 0.8501, "step": 255 }, { "epoch": 1.6476190476190475, "grad_norm": 1.4926954507827759, "learning_rate": 4.828512000318617e-05, "loss": 0.8581, "step": 260 }, { "epoch": 1.6793650793650794, "grad_norm": 1.7399059534072876, "learning_rate": 4.8156381825112006e-05, "loss": 0.8994, "step": 265 }, { "epoch": 1.7111111111111112, "grad_norm": 1.2087697982788086, "learning_rate": 4.8023169125779466e-05, "loss": 0.8272, "step": 270 }, { "epoch": 1.7428571428571429, "grad_norm": 1.5295592546463013, "learning_rate": 4.788550764595667e-05, "loss": 0.8555, "step": 275 }, { "epoch": 1.7746031746031745, "grad_norm": 1.6441818475723267, "learning_rate": 4.774342398605221e-05, "loss": 0.768, "step": 280 }, { "epoch": 1.8063492063492064, "grad_norm": 1.8932971954345703, "learning_rate": 4.759694560097513e-05, "loss": 0.9286, "step": 285 }, { "epoch": 1.8380952380952382, "grad_norm": 1.565648078918457, "learning_rate": 4.7446100794829785e-05, "loss": 0.7844, "step": 290 }, { "epoch": 1.8698412698412699, "grad_norm": 1.4803764820098877, "learning_rate": 4.7290918715446644e-05, "loss": 0.9625, "step": 295 }, { "epoch": 1.9015873015873015, "grad_norm": 1.2535064220428467, "learning_rate": 4.7131429348750055e-05, "loss": 0.7778, "step": 300 }, { "epoch": 1.9333333333333333, "grad_norm": 1.4900981187820435, "learning_rate": 4.6967663512963986e-05, "loss": 0.6377, "step": 305 }, { "epoch": 1.9650793650793652, "grad_norm": 1.5180715322494507, "learning_rate": 4.6799652852657064e-05, "loss": 0.8705, "step": 310 }, { "epoch": 1.9968253968253968, "grad_norm": 1.074388027191162, "learning_rate": 4.662742983262784e-05, "loss": 0.589, "step": 315 }, { "epoch": 2.0253968253968253, "grad_norm": 1.4143909215927124, "learning_rate": 4.64510277316316e-05, "loss": 0.675, "step": 320 }, { "epoch": 2.057142857142857, "grad_norm": 1.4869240522384644, "learning_rate": 4.6270480635949933e-05, "loss": 0.7768, "step": 325 }, { "epoch": 2.088888888888889, "grad_norm": 1.4446533918380737, "learning_rate": 4.6085823432804144e-05, "loss": 0.6516, "step": 330 }, { "epoch": 2.1206349206349207, "grad_norm": 1.8745191097259521, "learning_rate": 4.589709180361403e-05, "loss": 0.7022, "step": 335 }, { "epoch": 2.1523809523809523, "grad_norm": 1.678874135017395, "learning_rate": 4.5704322217103146e-05, "loss": 0.7443, "step": 340 }, { "epoch": 2.1841269841269844, "grad_norm": 2.3285534381866455, "learning_rate": 4.550755192225188e-05, "loss": 0.6574, "step": 345 }, { "epoch": 2.215873015873016, "grad_norm": 1.537413239479065, "learning_rate": 4.530681894109987e-05, "loss": 0.6493, "step": 350 }, { "epoch": 2.2476190476190476, "grad_norm": 1.8794249296188354, "learning_rate": 4.510216206139894e-05, "loss": 0.7287, "step": 355 }, { "epoch": 2.2793650793650793, "grad_norm": 1.5752090215682983, "learning_rate": 4.489362082911813e-05, "loss": 0.6171, "step": 360 }, { "epoch": 2.311111111111111, "grad_norm": 1.5545966625213623, "learning_rate": 4.4681235540802216e-05, "loss": 0.5651, "step": 365 }, { "epoch": 2.342857142857143, "grad_norm": 1.6034917831420898, "learning_rate": 4.446504723578519e-05, "loss": 0.6758, "step": 370 }, { "epoch": 2.3746031746031746, "grad_norm": 2.2822070121765137, "learning_rate": 4.424509768826017e-05, "loss": 0.7167, "step": 375 }, { "epoch": 2.4063492063492062, "grad_norm": 1.3968263864517212, "learning_rate": 4.4021429399207405e-05, "loss": 0.6388, "step": 380 }, { "epoch": 2.4380952380952383, "grad_norm": 2.0151493549346924, "learning_rate": 4.3794085588181725e-05, "loss": 0.8385, "step": 385 }, { "epoch": 2.46984126984127, "grad_norm": 2.2131729125976562, "learning_rate": 4.3563110184961235e-05, "loss": 0.8482, "step": 390 }, { "epoch": 2.5015873015873016, "grad_norm": 1.9444142580032349, "learning_rate": 4.332854782105875e-05, "loss": 0.7357, "step": 395 }, { "epoch": 2.533333333333333, "grad_norm": 1.3676568269729614, "learning_rate": 4.309044382109757e-05, "loss": 0.6352, "step": 400 }, { "epoch": 2.565079365079365, "grad_norm": 1.5921475887298584, "learning_rate": 4.2848844194053455e-05, "loss": 0.6742, "step": 405 }, { "epoch": 2.596825396825397, "grad_norm": 2.04608154296875, "learning_rate": 4.26037956243642e-05, "loss": 0.7472, "step": 410 }, { "epoch": 2.6285714285714286, "grad_norm": 1.6338012218475342, "learning_rate": 4.235534546290883e-05, "loss": 0.5659, "step": 415 }, { "epoch": 2.66031746031746, "grad_norm": 2.2755625247955322, "learning_rate": 4.2103541717857956e-05, "loss": 0.6775, "step": 420 }, { "epoch": 2.6920634920634923, "grad_norm": 1.8385895490646362, "learning_rate": 4.184843304539708e-05, "loss": 0.5902, "step": 425 }, { "epoch": 2.723809523809524, "grad_norm": 2.394862174987793, "learning_rate": 4.1590068740324806e-05, "loss": 0.623, "step": 430 }, { "epoch": 2.7555555555555555, "grad_norm": 1.9886717796325684, "learning_rate": 4.132849872652751e-05, "loss": 0.6361, "step": 435 }, { "epoch": 2.787301587301587, "grad_norm": 2.674502372741699, "learning_rate": 4.106377354733259e-05, "loss": 0.6835, "step": 440 }, { "epoch": 2.819047619047619, "grad_norm": 2.432657480239868, "learning_rate": 4.079594435574186e-05, "loss": 0.592, "step": 445 }, { "epoch": 2.850793650793651, "grad_norm": 1.8935447931289673, "learning_rate": 4.0525062904547276e-05, "loss": 0.629, "step": 450 }, { "epoch": 2.8825396825396825, "grad_norm": 2.635646343231201, "learning_rate": 4.025118153633075e-05, "loss": 0.7282, "step": 455 }, { "epoch": 2.914285714285714, "grad_norm": 2.8444085121154785, "learning_rate": 3.9974353173349886e-05, "loss": 0.7106, "step": 460 }, { "epoch": 2.9460317460317462, "grad_norm": 2.6861064434051514, "learning_rate": 3.969463130731183e-05, "loss": 0.6447, "step": 465 }, { "epoch": 2.977777777777778, "grad_norm": 2.7277615070343018, "learning_rate": 3.9412069989037015e-05, "loss": 0.563, "step": 470 }, { "epoch": 3.0063492063492063, "grad_norm": 1.8300037384033203, "learning_rate": 3.91267238180149e-05, "loss": 0.5555, "step": 475 }, { "epoch": 3.038095238095238, "grad_norm": 2.206399917602539, "learning_rate": 3.883864793185369e-05, "loss": 0.4926, "step": 480 }, { "epoch": 3.06984126984127, "grad_norm": 2.5881357192993164, "learning_rate": 3.854789799562602e-05, "loss": 0.5441, "step": 485 }, { "epoch": 3.1015873015873017, "grad_norm": 2.2896475791931152, "learning_rate": 3.825453019111281e-05, "loss": 0.498, "step": 490 }, { "epoch": 3.1333333333333333, "grad_norm": 2.622668981552124, "learning_rate": 3.7958601205947234e-05, "loss": 0.5055, "step": 495 }, { "epoch": 3.165079365079365, "grad_norm": 3.064993381500244, "learning_rate": 3.766016822266083e-05, "loss": 0.5348, "step": 500 }, { "epoch": 3.196825396825397, "grad_norm": 3.3422789573669434, "learning_rate": 3.7359288907634225e-05, "loss": 0.5869, "step": 505 }, { "epoch": 3.2285714285714286, "grad_norm": 2.460176467895508, "learning_rate": 3.705602139995416e-05, "loss": 0.5228, "step": 510 }, { "epoch": 3.2603174603174603, "grad_norm": 2.6845529079437256, "learning_rate": 3.675042430017923e-05, "loss": 0.4544, "step": 515 }, { "epoch": 3.292063492063492, "grad_norm": 2.378709077835083, "learning_rate": 3.6442556659016484e-05, "loss": 0.5201, "step": 520 }, { "epoch": 3.323809523809524, "grad_norm": 2.2331037521362305, "learning_rate": 3.613247796591101e-05, "loss": 0.4924, "step": 525 }, { "epoch": 3.3555555555555556, "grad_norm": 2.702828884124756, "learning_rate": 3.582024813755077e-05, "loss": 0.5358, "step": 530 }, { "epoch": 3.3873015873015873, "grad_norm": 2.3180782794952393, "learning_rate": 3.550592750628885e-05, "loss": 0.5862, "step": 535 }, { "epoch": 3.419047619047619, "grad_norm": 2.6038239002227783, "learning_rate": 3.5189576808485407e-05, "loss": 0.5303, "step": 540 }, { "epoch": 3.450793650793651, "grad_norm": 3.0068862438201904, "learning_rate": 3.4871257172771555e-05, "loss": 0.4848, "step": 545 }, { "epoch": 3.4825396825396826, "grad_norm": 2.6912105083465576, "learning_rate": 3.455103010823744e-05, "loss": 0.5653, "step": 550 }, { "epoch": 3.5142857142857142, "grad_norm": 3.0676960945129395, "learning_rate": 3.422895749254677e-05, "loss": 0.5993, "step": 555 }, { "epoch": 3.546031746031746, "grad_norm": 2.376481533050537, "learning_rate": 3.390510155998023e-05, "loss": 0.5484, "step": 560 }, { "epoch": 3.5777777777777775, "grad_norm": 2.4998154640197754, "learning_rate": 3.357952488940984e-05, "loss": 0.502, "step": 565 }, { "epoch": 3.6095238095238096, "grad_norm": 2.900434970855713, "learning_rate": 3.325229039220684e-05, "loss": 0.4531, "step": 570 }, { "epoch": 3.641269841269841, "grad_norm": 3.2575416564941406, "learning_rate": 3.2923461300085305e-05, "loss": 0.5703, "step": 575 }, { "epoch": 3.6730158730158733, "grad_norm": 2.449436902999878, "learning_rate": 3.2593101152883796e-05, "loss": 0.5894, "step": 580 }, { "epoch": 3.704761904761905, "grad_norm": 2.860034227371216, "learning_rate": 3.226127378628756e-05, "loss": 0.557, "step": 585 }, { "epoch": 3.7365079365079366, "grad_norm": 2.4020888805389404, "learning_rate": 3.19280433194935e-05, "loss": 0.4805, "step": 590 }, { "epoch": 3.768253968253968, "grad_norm": 2.56815767288208, "learning_rate": 3.15934741428204e-05, "loss": 0.5097, "step": 595 }, { "epoch": 3.8, "grad_norm": 1.385717749595642, "learning_rate": 3.1257630905266746e-05, "loss": 0.4136, "step": 600 }, { "epoch": 3.831746031746032, "grad_norm": 2.7889769077301025, "learning_rate": 3.092057850201855e-05, "loss": 0.4979, "step": 605 }, { "epoch": 3.8634920634920635, "grad_norm": 2.0486621856689453, "learning_rate": 3.058238206190962e-05, "loss": 0.4226, "step": 610 }, { "epoch": 3.895238095238095, "grad_norm": 2.6471216678619385, "learning_rate": 3.0243106934836686e-05, "loss": 0.5591, "step": 615 }, { "epoch": 3.9269841269841272, "grad_norm": 2.7665154933929443, "learning_rate": 2.9902818679131776e-05, "loss": 0.4856, "step": 620 }, { "epoch": 3.958730158730159, "grad_norm": 2.2888855934143066, "learning_rate": 2.9561583048894382e-05, "loss": 0.4661, "step": 625 }, { "epoch": 3.9904761904761905, "grad_norm": 2.540665864944458, "learning_rate": 2.9219465981285716e-05, "loss": 0.5027, "step": 630 }, { "epoch": 4.019047619047619, "grad_norm": 2.840465545654297, "learning_rate": 2.8876533583787647e-05, "loss": 0.5111, "step": 635 }, { "epoch": 4.050793650793651, "grad_norm": 2.362541437149048, "learning_rate": 2.8532852121428737e-05, "loss": 0.4652, "step": 640 }, { "epoch": 4.082539682539682, "grad_norm": 2.4384407997131348, "learning_rate": 2.8188488003979784e-05, "loss": 0.4227, "step": 645 }, { "epoch": 4.114285714285714, "grad_norm": 2.5572218894958496, "learning_rate": 2.784350777312142e-05, "loss": 0.3772, "step": 650 }, { "epoch": 4.146031746031746, "grad_norm": 2.4390878677368164, "learning_rate": 2.7497978089586236e-05, "loss": 0.4704, "step": 655 }, { "epoch": 4.177777777777778, "grad_norm": 3.3781797885894775, "learning_rate": 2.7151965720277893e-05, "loss": 0.4303, "step": 660 }, { "epoch": 4.20952380952381, "grad_norm": 2.8656809329986572, "learning_rate": 2.6805537525369713e-05, "loss": 0.4128, "step": 665 }, { "epoch": 4.241269841269841, "grad_norm": 4.3884100914001465, "learning_rate": 2.6458760445385216e-05, "loss": 0.4682, "step": 670 }, { "epoch": 4.273015873015873, "grad_norm": 3.1728439331054688, "learning_rate": 2.6111701488263224e-05, "loss": 0.4208, "step": 675 }, { "epoch": 4.304761904761905, "grad_norm": 2.4535300731658936, "learning_rate": 2.5764427716409815e-05, "loss": 0.4489, "step": 680 }, { "epoch": 4.336507936507936, "grad_norm": 2.231412887573242, "learning_rate": 2.5417006233739864e-05, "loss": 0.4079, "step": 685 }, { "epoch": 4.368253968253969, "grad_norm": 2.2064919471740723, "learning_rate": 2.5069504172710494e-05, "loss": 0.4157, "step": 690 }, { "epoch": 4.4, "grad_norm": 2.6554882526397705, "learning_rate": 2.4721988681349012e-05, "loss": 0.544, "step": 695 }, { "epoch": 4.431746031746032, "grad_norm": 2.739560842514038, "learning_rate": 2.437452691027789e-05, "loss": 0.4366, "step": 700 }, { "epoch": 4.463492063492064, "grad_norm": 2.2243738174438477, "learning_rate": 2.402718599973919e-05, "loss": 0.3831, "step": 705 }, { "epoch": 4.495238095238095, "grad_norm": 2.6665985584259033, "learning_rate": 2.3680033066621043e-05, "loss": 0.3785, "step": 710 }, { "epoch": 4.526984126984127, "grad_norm": 2.7318532466888428, "learning_rate": 2.3333135191488563e-05, "loss": 0.4055, "step": 715 }, { "epoch": 4.5587301587301585, "grad_norm": 2.6442530155181885, "learning_rate": 2.298655940562189e-05, "loss": 0.3967, "step": 720 }, { "epoch": 4.59047619047619, "grad_norm": 2.543417453765869, "learning_rate": 2.2640372678063628e-05, "loss": 0.3863, "step": 725 }, { "epoch": 4.622222222222222, "grad_norm": 2.905043363571167, "learning_rate": 2.2294641902678445e-05, "loss": 0.3996, "step": 730 }, { "epoch": 4.653968253968254, "grad_norm": 1.9619908332824707, "learning_rate": 2.19494338852271e-05, "loss": 0.4122, "step": 735 }, { "epoch": 4.685714285714286, "grad_norm": 2.6673147678375244, "learning_rate": 2.1604815330457512e-05, "loss": 0.478, "step": 740 }, { "epoch": 4.717460317460318, "grad_norm": 2.735572099685669, "learning_rate": 2.1260852829215406e-05, "loss": 0.4408, "step": 745 }, { "epoch": 4.749206349206349, "grad_norm": 2.9399704933166504, "learning_rate": 2.0917612845576885e-05, "loss": 0.4422, "step": 750 }, { "epoch": 4.780952380952381, "grad_norm": 2.5321192741394043, "learning_rate": 2.057516170400554e-05, "loss": 0.3983, "step": 755 }, { "epoch": 4.8126984126984125, "grad_norm": 2.2827346324920654, "learning_rate": 2.0233565576536567e-05, "loss": 0.4235, "step": 760 }, { "epoch": 4.844444444444444, "grad_norm": 3.497560739517212, "learning_rate": 1.9892890469990264e-05, "loss": 0.4277, "step": 765 }, { "epoch": 4.876190476190477, "grad_norm": 2.7729287147521973, "learning_rate": 1.955320221321754e-05, "loss": 0.3638, "step": 770 }, { "epoch": 4.907936507936508, "grad_norm": 3.169905424118042, "learning_rate": 1.921456644437972e-05, "loss": 0.4323, "step": 775 }, { "epoch": 4.93968253968254, "grad_norm": 3.217583417892456, "learning_rate": 1.8877048598265283e-05, "loss": 0.4467, "step": 780 }, { "epoch": 4.9714285714285715, "grad_norm": 2.79040789604187, "learning_rate": 1.8540713893645827e-05, "loss": 0.3856, "step": 785 }, { "epoch": 5.0, "grad_norm": 4.346138000488281, "learning_rate": 1.8205627320673837e-05, "loss": 0.4324, "step": 790 }, { "epoch": 5.031746031746032, "grad_norm": 2.163886070251465, "learning_rate": 1.787185362832459e-05, "loss": 0.3076, "step": 795 }, { "epoch": 5.063492063492063, "grad_norm": 2.5888731479644775, "learning_rate": 1.7539457311884676e-05, "loss": 0.3277, "step": 800 }, { "epoch": 5.095238095238095, "grad_norm": 2.4321916103363037, "learning_rate": 1.720850260048948e-05, "loss": 0.286, "step": 805 }, { "epoch": 5.1269841269841265, "grad_norm": 1.7569212913513184, "learning_rate": 1.687905344471226e-05, "loss": 0.3328, "step": 810 }, { "epoch": 5.158730158730159, "grad_norm": 2.9200055599212646, "learning_rate": 1.6551173504206853e-05, "loss": 0.4124, "step": 815 }, { "epoch": 5.190476190476191, "grad_norm": 3.5868027210235596, "learning_rate": 1.6224926135406693e-05, "loss": 0.3896, "step": 820 }, { "epoch": 5.222222222222222, "grad_norm": 2.3882710933685303, "learning_rate": 1.5900374379282445e-05, "loss": 0.4112, "step": 825 }, { "epoch": 5.253968253968254, "grad_norm": 2.136838912963867, "learning_rate": 1.557758094916053e-05, "loss": 0.3479, "step": 830 }, { "epoch": 5.285714285714286, "grad_norm": 2.6616430282592773, "learning_rate": 1.5256608218605015e-05, "loss": 0.3211, "step": 835 }, { "epoch": 5.317460317460317, "grad_norm": 4.097353458404541, "learning_rate": 1.493751820936511e-05, "loss": 0.4213, "step": 840 }, { "epoch": 5.349206349206349, "grad_norm": 3.042466402053833, "learning_rate": 1.4620372579390678e-05, "loss": 0.3856, "step": 845 }, { "epoch": 5.380952380952381, "grad_norm": 2.485643148422241, "learning_rate": 1.4305232610918045e-05, "loss": 0.3751, "step": 850 }, { "epoch": 5.412698412698413, "grad_norm": 2.9114632606506348, "learning_rate": 1.3992159198628373e-05, "loss": 0.3557, "step": 855 }, { "epoch": 5.444444444444445, "grad_norm": 1.9394385814666748, "learning_rate": 1.3681212837880978e-05, "loss": 0.357, "step": 860 }, { "epoch": 5.476190476190476, "grad_norm": 3.1689724922180176, "learning_rate": 1.3372453613023728e-05, "loss": 0.3665, "step": 865 }, { "epoch": 5.507936507936508, "grad_norm": 2.9183590412139893, "learning_rate": 1.3065941185782979e-05, "loss": 0.3786, "step": 870 }, { "epoch": 5.5396825396825395, "grad_norm": 3.520986795425415, "learning_rate": 1.2761734783735036e-05, "loss": 0.3562, "step": 875 }, { "epoch": 5.571428571428571, "grad_norm": 3.4199678897857666, "learning_rate": 1.2459893188861613e-05, "loss": 0.3871, "step": 880 }, { "epoch": 5.603174603174603, "grad_norm": 2.38875675201416, "learning_rate": 1.2160474726191323e-05, "loss": 0.3532, "step": 885 }, { "epoch": 5.634920634920634, "grad_norm": 2.554577350616455, "learning_rate": 1.186353725252955e-05, "loss": 0.3241, "step": 890 }, { "epoch": 5.666666666666667, "grad_norm": 2.4856674671173096, "learning_rate": 1.1569138145278696e-05, "loss": 0.3571, "step": 895 }, { "epoch": 5.698412698412699, "grad_norm": 3.3535118103027344, "learning_rate": 1.1277334291351147e-05, "loss": 0.3456, "step": 900 }, { "epoch": 5.73015873015873, "grad_norm": 2.534107208251953, "learning_rate": 1.0988182076176939e-05, "loss": 0.3895, "step": 905 }, { "epoch": 5.761904761904762, "grad_norm": 2.346252202987671, "learning_rate": 1.0701737372808432e-05, "loss": 0.3796, "step": 910 }, { "epoch": 5.7936507936507935, "grad_norm": 2.958519220352173, "learning_rate": 1.0418055531123857e-05, "loss": 0.392, "step": 915 }, { "epoch": 5.825396825396825, "grad_norm": 1.7530192136764526, "learning_rate": 1.013719136713208e-05, "loss": 0.2836, "step": 920 }, { "epoch": 5.857142857142857, "grad_norm": 3.0703108310699463, "learning_rate": 9.859199152380432e-06, "loss": 0.4059, "step": 925 }, { "epoch": 5.888888888888889, "grad_norm": 3.0952444076538086, "learning_rate": 9.584132603467827e-06, "loss": 0.3901, "step": 930 }, { "epoch": 5.920634920634921, "grad_norm": 2.6520533561706543, "learning_rate": 9.312044871665032e-06, "loss": 0.3285, "step": 935 }, { "epoch": 5.9523809523809526, "grad_norm": 2.382810354232788, "learning_rate": 9.04298853264425e-06, "loss": 0.3998, "step": 940 }, { "epoch": 5.984126984126984, "grad_norm": 3.09870982170105, "learning_rate": 8.777015576319869e-06, "loss": 0.3804, "step": 945 }, { "epoch": 6.012698412698413, "grad_norm": 2.542013645172119, "learning_rate": 8.514177396802428e-06, "loss": 0.3471, "step": 950 }, { "epoch": 6.044444444444444, "grad_norm": 3.0190534591674805, "learning_rate": 8.254524782467692e-06, "loss": 0.3275, "step": 955 }, { "epoch": 6.076190476190476, "grad_norm": 2.745600461959839, "learning_rate": 7.99810790614284e-06, "loss": 0.3089, "step": 960 }, { "epoch": 6.1079365079365076, "grad_norm": 2.2067768573760986, "learning_rate": 7.744976315411428e-06, "loss": 0.2902, "step": 965 }, { "epoch": 6.13968253968254, "grad_norm": 2.806065320968628, "learning_rate": 7.495178923039398e-06, "loss": 0.3349, "step": 970 }, { "epoch": 6.171428571428572, "grad_norm": 2.4265201091766357, "learning_rate": 7.248763997523561e-06, "loss": 0.3295, "step": 975 }, { "epoch": 6.203174603174603, "grad_norm": 2.9042932987213135, "learning_rate": 7.005779153764683e-06, "loss": 0.3285, "step": 980 }, { "epoch": 6.234920634920635, "grad_norm": 3.585085391998291, "learning_rate": 6.766271343866812e-06, "loss": 0.3635, "step": 985 }, { "epoch": 6.266666666666667, "grad_norm": 2.554619312286377, "learning_rate": 6.530286848064699e-06, "loss": 0.2845, "step": 990 }, { "epoch": 6.298412698412698, "grad_norm": 2.951251745223999, "learning_rate": 6.297871265781055e-06, "loss": 0.3259, "step": 995 }, { "epoch": 6.33015873015873, "grad_norm": 2.6404244899749756, "learning_rate": 6.069069506815325e-06, "loss": 0.2872, "step": 1000 }, { "epoch": 6.3619047619047615, "grad_norm": 2.684638261795044, "learning_rate": 5.843925782665754e-06, "loss": 0.3206, "step": 1005 }, { "epoch": 6.393650793650794, "grad_norm": 2.080413341522217, "learning_rate": 5.622483597986372e-06, "loss": 0.3099, "step": 1010 }, { "epoch": 6.425396825396826, "grad_norm": 3.5247743129730225, "learning_rate": 5.404785742180565e-06, "loss": 0.3275, "step": 1015 }, { "epoch": 6.457142857142857, "grad_norm": 3.3956446647644043, "learning_rate": 5.190874281132851e-06, "loss": 0.3097, "step": 1020 }, { "epoch": 6.488888888888889, "grad_norm": 2.5341458320617676, "learning_rate": 4.980790549080463e-06, "loss": 0.3346, "step": 1025 }, { "epoch": 6.520634920634921, "grad_norm": 3.2887229919433594, "learning_rate": 4.7745751406263165e-06, "loss": 0.3282, "step": 1030 }, { "epoch": 6.552380952380952, "grad_norm": 2.290588617324829, "learning_rate": 4.572267902894884e-06, "loss": 0.2956, "step": 1035 }, { "epoch": 6.584126984126984, "grad_norm": 4.084680080413818, "learning_rate": 4.3739079278325125e-06, "loss": 0.3452, "step": 1040 }, { "epoch": 6.6158730158730155, "grad_norm": 2.976604700088501, "learning_rate": 4.179533544653674e-06, "loss": 0.3414, "step": 1045 }, { "epoch": 6.647619047619048, "grad_norm": 3.1176273822784424, "learning_rate": 3.989182312434567e-06, "loss": 0.3179, "step": 1050 }, { "epoch": 6.67936507936508, "grad_norm": 2.9681129455566406, "learning_rate": 3.8028910128555804e-06, "loss": 0.2868, "step": 1055 }, { "epoch": 6.711111111111111, "grad_norm": 3.6012775897979736, "learning_rate": 3.6206956430939244e-06, "loss": 0.3446, "step": 1060 }, { "epoch": 6.742857142857143, "grad_norm": 2.876650810241699, "learning_rate": 3.442631408867894e-06, "loss": 0.3363, "step": 1065 }, { "epoch": 6.7746031746031745, "grad_norm": 2.6718950271606445, "learning_rate": 3.2687327176340322e-06, "loss": 0.2764, "step": 1070 }, { "epoch": 6.806349206349206, "grad_norm": 4.247652530670166, "learning_rate": 3.099033171938567e-06, "loss": 0.3326, "step": 1075 }, { "epoch": 6.838095238095238, "grad_norm": 3.0778801441192627, "learning_rate": 2.9335655629243645e-06, "loss": 0.3685, "step": 1080 }, { "epoch": 6.86984126984127, "grad_norm": 3.640263795852661, "learning_rate": 2.772361863994688e-06, "loss": 0.3011, "step": 1085 }, { "epoch": 6.901587301587302, "grad_norm": 3.859355926513672, "learning_rate": 2.6154532246349477e-06, "loss": 0.3587, "step": 1090 }, { "epoch": 6.933333333333334, "grad_norm": 2.7156548500061035, "learning_rate": 2.462869964393666e-06, "loss": 0.3402, "step": 1095 }, { "epoch": 6.965079365079365, "grad_norm": 3.4168052673339844, "learning_rate": 2.31464156702382e-06, "loss": 0.3349, "step": 1100 }, { "epoch": 6.996825396825397, "grad_norm": 2.9093616008758545, "learning_rate": 2.170796674785683e-06, "loss": 0.3261, "step": 1105 }, { "epoch": 7.025396825396825, "grad_norm": 2.839615821838379, "learning_rate": 2.0313630829122522e-06, "loss": 0.3031, "step": 1110 }, { "epoch": 7.057142857142857, "grad_norm": 3.0792160034179688, "learning_rate": 1.8963677342383663e-06, "loss": 0.3013, "step": 1115 }, { "epoch": 7.088888888888889, "grad_norm": 2.227898120880127, "learning_rate": 1.7658367139945232e-06, "loss": 0.2666, "step": 1120 }, { "epoch": 7.12063492063492, "grad_norm": 2.5210745334625244, "learning_rate": 1.6397952447664177e-06, "loss": 0.2693, "step": 1125 }, { "epoch": 7.152380952380953, "grad_norm": 3.3494973182678223, "learning_rate": 1.5182676816211633e-06, "loss": 0.3525, "step": 1130 }, { "epoch": 7.184126984126984, "grad_norm": 2.7284188270568848, "learning_rate": 1.4012775074011553e-06, "loss": 0.2972, "step": 1135 }, { "epoch": 7.215873015873016, "grad_norm": 2.1954691410064697, "learning_rate": 1.28884732818646e-06, "loss": 0.3037, "step": 1140 }, { "epoch": 7.247619047619048, "grad_norm": 2.8341176509857178, "learning_rate": 1.180998868926625e-06, "loss": 0.314, "step": 1145 }, { "epoch": 7.279365079365079, "grad_norm": 3.621948719024658, "learning_rate": 1.077752969242768e-06, "loss": 0.2931, "step": 1150 }, { "epoch": 7.311111111111111, "grad_norm": 3.365598440170288, "learning_rate": 9.791295794007172e-07, "loss": 0.344, "step": 1155 }, { "epoch": 7.3428571428571425, "grad_norm": 3.006441116333008, "learning_rate": 8.851477564560062e-07, "loss": 0.3121, "step": 1160 }, { "epoch": 7.374603174603175, "grad_norm": 2.3393473625183105, "learning_rate": 7.958256605714726e-07, "loss": 0.2873, "step": 1165 }, { "epoch": 7.406349206349207, "grad_norm": 2.6204307079315186, "learning_rate": 7.111805515081532e-07, "loss": 0.2897, "step": 1170 }, { "epoch": 7.438095238095238, "grad_norm": 3.66949725151062, "learning_rate": 6.312287852901832e-07, "loss": 0.323, "step": 1175 }, { "epoch": 7.46984126984127, "grad_norm": 3.176905393600464, "learning_rate": 5.559858110443017e-07, "loss": 0.3114, "step": 1180 }, { "epoch": 7.501587301587302, "grad_norm": 2.6001551151275635, "learning_rate": 4.854661680146033e-07, "loss": 0.3147, "step": 1185 }, { "epoch": 7.533333333333333, "grad_norm": 2.6408729553222656, "learning_rate": 4.1968348275312763e-07, "loss": 0.2926, "step": 1190 }, { "epoch": 7.565079365079365, "grad_norm": 2.4512081146240234, "learning_rate": 3.586504664867707e-07, "loss": 0.2575, "step": 1195 }, { "epoch": 7.5968253968253965, "grad_norm": 2.964920997619629, "learning_rate": 3.023789126611137e-07, "loss": 0.3054, "step": 1200 }, { "epoch": 7.628571428571428, "grad_norm": 3.3851871490478516, "learning_rate": 2.508796946615405e-07, "loss": 0.3041, "step": 1205 }, { "epoch": 7.660317460317461, "grad_norm": 2.6496877670288086, "learning_rate": 2.0416276371219289e-07, "loss": 0.3293, "step": 1210 }, { "epoch": 7.692063492063492, "grad_norm": 3.0116524696350098, "learning_rate": 1.6223714695306757e-07, "loss": 0.3449, "step": 1215 }, { "epoch": 7.723809523809524, "grad_norm": 3.024155855178833, "learning_rate": 1.2511094569571668e-07, "loss": 0.3051, "step": 1220 }, { "epoch": 7.7555555555555555, "grad_norm": 3.3377327919006348, "learning_rate": 9.27913338578057e-08, "loss": 0.3011, "step": 1225 }, { "epoch": 7.787301587301587, "grad_norm": 3.6456995010375977, "learning_rate": 6.528455657691113e-08, "loss": 0.3179, "step": 1230 }, { "epoch": 7.819047619047619, "grad_norm": 2.919464111328125, "learning_rate": 4.259592900376363e-08, "loss": 0.2799, "step": 1235 }, { "epoch": 7.85079365079365, "grad_norm": 3.5842061042785645, "learning_rate": 2.4729835275189018e-08, "loss": 0.3065, "step": 1240 }, { "epoch": 7.882539682539683, "grad_norm": 2.56817364692688, "learning_rate": 1.1689727666969186e-08, "loss": 0.2864, "step": 1245 }, { "epoch": 7.914285714285715, "grad_norm": 3.4104034900665283, "learning_rate": 3.478125926756337e-09, "loss": 0.3002, "step": 1250 }, { "epoch": 7.946031746031746, "grad_norm": 3.37762451171875, "learning_rate": 9.66167871790935e-11, "loss": 0.2886, "step": 1255 }, { "epoch": 7.9523809523809526, "step": 1256, "total_flos": 2.698353498390528e+16, "train_loss": 0.720484851841714, "train_runtime": 3774.6002, "train_samples_per_second": 2.67, "train_steps_per_second": 0.333 } ], "logging_steps": 5, "max_steps": 1256, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 157, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.698353498390528e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }