{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1295465243998117, "eval_steps": 500, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002510591558135886, "grad_norm": 2.53125, "learning_rate": 1.9964e-05, "loss": 1.2975, "step": 10 }, { "epoch": 0.005021183116271772, "grad_norm": 1.859375, "learning_rate": 1.9924e-05, "loss": 0.6792, "step": 20 }, { "epoch": 0.007531774674407657, "grad_norm": 1.921875, "learning_rate": 1.9884e-05, "loss": 0.6571, "step": 30 }, { "epoch": 0.010042366232543544, "grad_norm": 1.8046875, "learning_rate": 1.9844000000000002e-05, "loss": 0.6431, "step": 40 }, { "epoch": 0.012552957790679428, "grad_norm": 1.8984375, "learning_rate": 1.9804000000000002e-05, "loss": 0.6739, "step": 50 }, { "epoch": 0.015063549348815314, "grad_norm": 1.6953125, "learning_rate": 1.9764000000000003e-05, "loss": 0.6491, "step": 60 }, { "epoch": 0.0175741409069512, "grad_norm": 1.6796875, "learning_rate": 1.9724e-05, "loss": 0.5911, "step": 70 }, { "epoch": 0.020084732465087088, "grad_norm": 2.0, "learning_rate": 1.9684e-05, "loss": 0.6117, "step": 80 }, { "epoch": 0.022595324023222972, "grad_norm": 1.734375, "learning_rate": 1.9644e-05, "loss": 0.5892, "step": 90 }, { "epoch": 0.025105915581358856, "grad_norm": 2.125, "learning_rate": 1.9604e-05, "loss": 0.5893, "step": 100 }, { "epoch": 0.027616507139494744, "grad_norm": 1.953125, "learning_rate": 1.9564e-05, "loss": 0.6412, "step": 110 }, { "epoch": 0.03012709869763063, "grad_norm": 1.796875, "learning_rate": 1.9524e-05, "loss": 0.5514, "step": 120 }, { "epoch": 0.032637690255766516, "grad_norm": 1.7421875, "learning_rate": 1.9484000000000002e-05, "loss": 0.6029, "step": 130 }, { "epoch": 0.0351482818139024, "grad_norm": 1.78125, "learning_rate": 1.9444000000000002e-05, "loss": 0.5723, "step": 140 }, { "epoch": 0.037658873372038285, "grad_norm": 1.9453125, "learning_rate": 1.9404e-05, "loss": 0.5721, "step": 150 }, { "epoch": 0.040169464930174176, "grad_norm": 2.0625, "learning_rate": 1.9364e-05, "loss": 0.5911, "step": 160 }, { "epoch": 0.04268005648831006, "grad_norm": 1.7265625, "learning_rate": 1.9324e-05, "loss": 0.5532, "step": 170 }, { "epoch": 0.045190648046445944, "grad_norm": 1.796875, "learning_rate": 1.9284e-05, "loss": 0.5836, "step": 180 }, { "epoch": 0.04770123960458183, "grad_norm": 1.890625, "learning_rate": 1.9244000000000004e-05, "loss": 0.5294, "step": 190 }, { "epoch": 0.05021183116271771, "grad_norm": 1.8125, "learning_rate": 1.9204e-05, "loss": 0.5675, "step": 200 }, { "epoch": 0.052722422720853604, "grad_norm": 1.421875, "learning_rate": 1.9164e-05, "loss": 0.4689, "step": 210 }, { "epoch": 0.05523301427898949, "grad_norm": 1.6484375, "learning_rate": 1.9124000000000002e-05, "loss": 0.5765, "step": 220 }, { "epoch": 0.05774360583712537, "grad_norm": 2.078125, "learning_rate": 1.9084000000000002e-05, "loss": 0.5369, "step": 230 }, { "epoch": 0.06025419739526126, "grad_norm": 1.65625, "learning_rate": 1.9044000000000003e-05, "loss": 0.4895, "step": 240 }, { "epoch": 0.06276478895339714, "grad_norm": 2.234375, "learning_rate": 1.9004000000000003e-05, "loss": 0.5191, "step": 250 }, { "epoch": 0.06527538051153303, "grad_norm": 1.90625, "learning_rate": 1.8964000000000003e-05, "loss": 0.5655, "step": 260 }, { "epoch": 0.06778597206966891, "grad_norm": 1.9921875, "learning_rate": 1.8924000000000004e-05, "loss": 0.5454, "step": 270 }, { "epoch": 0.0702965636278048, "grad_norm": 1.9765625, "learning_rate": 1.8884e-05, "loss": 0.5656, "step": 280 }, { "epoch": 0.07280715518594069, "grad_norm": 1.7890625, "learning_rate": 1.8844e-05, "loss": 0.481, "step": 290 }, { "epoch": 0.07531774674407657, "grad_norm": 1.6328125, "learning_rate": 1.8804e-05, "loss": 0.5365, "step": 300 }, { "epoch": 0.07782833830221246, "grad_norm": 1.7578125, "learning_rate": 1.8764000000000002e-05, "loss": 0.5271, "step": 310 }, { "epoch": 0.08033892986034835, "grad_norm": 1.65625, "learning_rate": 1.8724000000000002e-05, "loss": 0.5145, "step": 320 }, { "epoch": 0.08284952141848423, "grad_norm": 1.90625, "learning_rate": 1.8684000000000003e-05, "loss": 0.5129, "step": 330 }, { "epoch": 0.08536011297662012, "grad_norm": 1.6484375, "learning_rate": 1.8644000000000003e-05, "loss": 0.5539, "step": 340 }, { "epoch": 0.087870704534756, "grad_norm": 1.9453125, "learning_rate": 1.8604000000000003e-05, "loss": 0.5051, "step": 350 }, { "epoch": 0.09038129609289189, "grad_norm": 2.09375, "learning_rate": 1.8564e-05, "loss": 0.5034, "step": 360 }, { "epoch": 0.09289188765102778, "grad_norm": 1.625, "learning_rate": 1.8524e-05, "loss": 0.4867, "step": 370 }, { "epoch": 0.09540247920916366, "grad_norm": 1.71875, "learning_rate": 1.8484e-05, "loss": 0.5237, "step": 380 }, { "epoch": 0.09791307076729955, "grad_norm": 2.109375, "learning_rate": 1.8444e-05, "loss": 0.5305, "step": 390 }, { "epoch": 0.10042366232543543, "grad_norm": 1.859375, "learning_rate": 1.8404000000000002e-05, "loss": 0.498, "step": 400 }, { "epoch": 0.10293425388357132, "grad_norm": 1.921875, "learning_rate": 1.8364000000000002e-05, "loss": 0.512, "step": 410 }, { "epoch": 0.10544484544170721, "grad_norm": 2.046875, "learning_rate": 1.8324000000000003e-05, "loss": 0.4874, "step": 420 }, { "epoch": 0.10795543699984309, "grad_norm": 2.03125, "learning_rate": 1.8284000000000003e-05, "loss": 0.5123, "step": 430 }, { "epoch": 0.11046602855797898, "grad_norm": 1.78125, "learning_rate": 1.8244e-05, "loss": 0.4691, "step": 440 }, { "epoch": 0.11297662011611485, "grad_norm": 1.640625, "learning_rate": 1.8204e-05, "loss": 0.4485, "step": 450 }, { "epoch": 0.11548721167425074, "grad_norm": 1.7890625, "learning_rate": 1.8164e-05, "loss": 0.4647, "step": 460 }, { "epoch": 0.11799780323238664, "grad_norm": 1.7265625, "learning_rate": 1.8124e-05, "loss": 0.4518, "step": 470 }, { "epoch": 0.12050839479052251, "grad_norm": 2.34375, "learning_rate": 1.8084e-05, "loss": 0.4848, "step": 480 }, { "epoch": 0.1230189863486584, "grad_norm": 1.828125, "learning_rate": 1.8044000000000002e-05, "loss": 0.4624, "step": 490 }, { "epoch": 0.12552957790679428, "grad_norm": 2.3125, "learning_rate": 1.8004000000000002e-05, "loss": 0.489, "step": 500 }, { "epoch": 0.1280401694649302, "grad_norm": 1.828125, "learning_rate": 1.7964000000000003e-05, "loss": 0.4556, "step": 510 }, { "epoch": 0.13055076102306606, "grad_norm": 2.15625, "learning_rate": 1.7924e-05, "loss": 0.4726, "step": 520 }, { "epoch": 0.13306135258120194, "grad_norm": 2.03125, "learning_rate": 1.7884e-05, "loss": 0.4686, "step": 530 }, { "epoch": 0.13557194413933782, "grad_norm": 1.96875, "learning_rate": 1.7844e-05, "loss": 0.4228, "step": 540 }, { "epoch": 0.13808253569747372, "grad_norm": 2.125, "learning_rate": 1.7804e-05, "loss": 0.4523, "step": 550 }, { "epoch": 0.1405931272556096, "grad_norm": 1.9921875, "learning_rate": 1.7764e-05, "loss": 0.4235, "step": 560 }, { "epoch": 0.14310371881374548, "grad_norm": 1.875, "learning_rate": 1.7724000000000002e-05, "loss": 0.4474, "step": 570 }, { "epoch": 0.14561431037188138, "grad_norm": 2.0625, "learning_rate": 1.7684000000000002e-05, "loss": 0.4576, "step": 580 }, { "epoch": 0.14812490193001726, "grad_norm": 2.296875, "learning_rate": 1.7644000000000003e-05, "loss": 0.4345, "step": 590 }, { "epoch": 0.15063549348815314, "grad_norm": 2.0, "learning_rate": 1.7604e-05, "loss": 0.4124, "step": 600 }, { "epoch": 0.15314608504628904, "grad_norm": 1.9453125, "learning_rate": 1.7564e-05, "loss": 0.4457, "step": 610 }, { "epoch": 0.15565667660442492, "grad_norm": 2.125, "learning_rate": 1.7524e-05, "loss": 0.4569, "step": 620 }, { "epoch": 0.1581672681625608, "grad_norm": 1.8203125, "learning_rate": 1.7484e-05, "loss": 0.4065, "step": 630 }, { "epoch": 0.1606778597206967, "grad_norm": 1.9921875, "learning_rate": 1.7444e-05, "loss": 0.4231, "step": 640 }, { "epoch": 0.16318845127883258, "grad_norm": 2.328125, "learning_rate": 1.7404e-05, "loss": 0.4306, "step": 650 }, { "epoch": 0.16569904283696846, "grad_norm": 1.8359375, "learning_rate": 1.7364000000000002e-05, "loss": 0.4261, "step": 660 }, { "epoch": 0.16820963439510433, "grad_norm": 1.9140625, "learning_rate": 1.7324000000000002e-05, "loss": 0.3939, "step": 670 }, { "epoch": 0.17072022595324024, "grad_norm": 2.34375, "learning_rate": 1.7284e-05, "loss": 0.3979, "step": 680 }, { "epoch": 0.17323081751137612, "grad_norm": 2.0, "learning_rate": 1.7244e-05, "loss": 0.419, "step": 690 }, { "epoch": 0.175741409069512, "grad_norm": 2.078125, "learning_rate": 1.7204e-05, "loss": 0.4181, "step": 700 }, { "epoch": 0.1782520006276479, "grad_norm": 2.0625, "learning_rate": 1.7164e-05, "loss": 0.4345, "step": 710 }, { "epoch": 0.18076259218578378, "grad_norm": 2.078125, "learning_rate": 1.7124e-05, "loss": 0.436, "step": 720 }, { "epoch": 0.18327318374391965, "grad_norm": 2.15625, "learning_rate": 1.7084e-05, "loss": 0.3904, "step": 730 }, { "epoch": 0.18578377530205556, "grad_norm": 1.7265625, "learning_rate": 1.7044e-05, "loss": 0.3941, "step": 740 }, { "epoch": 0.18829436686019144, "grad_norm": 1.9453125, "learning_rate": 1.7004000000000002e-05, "loss": 0.403, "step": 750 }, { "epoch": 0.19080495841832731, "grad_norm": 2.09375, "learning_rate": 1.6964e-05, "loss": 0.397, "step": 760 }, { "epoch": 0.1933155499764632, "grad_norm": 2.078125, "learning_rate": 1.6924e-05, "loss": 0.3869, "step": 770 }, { "epoch": 0.1958261415345991, "grad_norm": 2.046875, "learning_rate": 1.6884e-05, "loss": 0.428, "step": 780 }, { "epoch": 0.19833673309273497, "grad_norm": 2.375, "learning_rate": 1.6844e-05, "loss": 0.4024, "step": 790 }, { "epoch": 0.20084732465087085, "grad_norm": 2.03125, "learning_rate": 1.6804e-05, "loss": 0.4061, "step": 800 }, { "epoch": 0.20335791620900676, "grad_norm": 2.171875, "learning_rate": 1.6764e-05, "loss": 0.3882, "step": 810 }, { "epoch": 0.20586850776714263, "grad_norm": 2.140625, "learning_rate": 1.6724e-05, "loss": 0.3973, "step": 820 }, { "epoch": 0.2083790993252785, "grad_norm": 2.203125, "learning_rate": 1.6684e-05, "loss": 0.3871, "step": 830 }, { "epoch": 0.21088969088341442, "grad_norm": 2.09375, "learning_rate": 1.6644000000000002e-05, "loss": 0.3666, "step": 840 }, { "epoch": 0.2134002824415503, "grad_norm": 2.046875, "learning_rate": 1.6604000000000002e-05, "loss": 0.3853, "step": 850 }, { "epoch": 0.21591087399968617, "grad_norm": 2.078125, "learning_rate": 1.6564000000000003e-05, "loss": 0.3772, "step": 860 }, { "epoch": 0.21842146555782208, "grad_norm": 2.3125, "learning_rate": 1.6524000000000003e-05, "loss": 0.3926, "step": 870 }, { "epoch": 0.22093205711595795, "grad_norm": 2.0625, "learning_rate": 1.6484000000000003e-05, "loss": 0.3824, "step": 880 }, { "epoch": 0.22344264867409383, "grad_norm": 2.078125, "learning_rate": 1.6444000000000004e-05, "loss": 0.3777, "step": 890 }, { "epoch": 0.2259532402322297, "grad_norm": 1.8984375, "learning_rate": 1.6404e-05, "loss": 0.3886, "step": 900 }, { "epoch": 0.2284638317903656, "grad_norm": 2.25, "learning_rate": 1.6364e-05, "loss": 0.4085, "step": 910 }, { "epoch": 0.2309744233485015, "grad_norm": 2.125, "learning_rate": 1.6324e-05, "loss": 0.3692, "step": 920 }, { "epoch": 0.23348501490663737, "grad_norm": 2.125, "learning_rate": 1.6284000000000002e-05, "loss": 0.3528, "step": 930 }, { "epoch": 0.23599560646477327, "grad_norm": 2.0625, "learning_rate": 1.6244000000000002e-05, "loss": 0.3708, "step": 940 }, { "epoch": 0.23850619802290915, "grad_norm": 1.9453125, "learning_rate": 1.6204000000000003e-05, "loss": 0.3625, "step": 950 }, { "epoch": 0.24101678958104503, "grad_norm": 2.140625, "learning_rate": 1.6164000000000003e-05, "loss": 0.3596, "step": 960 }, { "epoch": 0.24352738113918093, "grad_norm": 1.7265625, "learning_rate": 1.6124000000000004e-05, "loss": 0.3576, "step": 970 }, { "epoch": 0.2460379726973168, "grad_norm": 1.6015625, "learning_rate": 1.6084e-05, "loss": 0.3481, "step": 980 }, { "epoch": 0.2485485642554527, "grad_norm": 2.296875, "learning_rate": 1.6044e-05, "loss": 0.3899, "step": 990 }, { "epoch": 0.25105915581358856, "grad_norm": 2.0, "learning_rate": 1.6004e-05, "loss": 0.3838, "step": 1000 }, { "epoch": 0.25356974737172444, "grad_norm": 1.78125, "learning_rate": 1.5964e-05, "loss": 0.3522, "step": 1010 }, { "epoch": 0.2560803389298604, "grad_norm": 2.0625, "learning_rate": 1.5924000000000002e-05, "loss": 0.3277, "step": 1020 }, { "epoch": 0.25859093048799625, "grad_norm": 2.109375, "learning_rate": 1.5884000000000002e-05, "loss": 0.3532, "step": 1030 }, { "epoch": 0.26110152204613213, "grad_norm": 2.125, "learning_rate": 1.5844000000000003e-05, "loss": 0.3558, "step": 1040 }, { "epoch": 0.263612113604268, "grad_norm": 2.375, "learning_rate": 1.5804000000000003e-05, "loss": 0.3617, "step": 1050 }, { "epoch": 0.2661227051624039, "grad_norm": 2.421875, "learning_rate": 1.5764e-05, "loss": 0.3607, "step": 1060 }, { "epoch": 0.26863329672053976, "grad_norm": 2.140625, "learning_rate": 1.5724e-05, "loss": 0.3948, "step": 1070 }, { "epoch": 0.27114388827867564, "grad_norm": 1.8203125, "learning_rate": 1.5684e-05, "loss": 0.3776, "step": 1080 }, { "epoch": 0.27365447983681157, "grad_norm": 2.15625, "learning_rate": 1.5644e-05, "loss": 0.3324, "step": 1090 }, { "epoch": 0.27616507139494745, "grad_norm": 2.125, "learning_rate": 1.5604000000000002e-05, "loss": 0.3536, "step": 1100 }, { "epoch": 0.2786756629530833, "grad_norm": 2.34375, "learning_rate": 1.5564000000000002e-05, "loss": 0.3591, "step": 1110 }, { "epoch": 0.2811862545112192, "grad_norm": 2.015625, "learning_rate": 1.5524000000000002e-05, "loss": 0.3598, "step": 1120 }, { "epoch": 0.2836968460693551, "grad_norm": 1.734375, "learning_rate": 1.5484000000000003e-05, "loss": 0.323, "step": 1130 }, { "epoch": 0.28620743762749096, "grad_norm": 1.421875, "learning_rate": 1.5444e-05, "loss": 0.3496, "step": 1140 }, { "epoch": 0.2887180291856269, "grad_norm": 2.1875, "learning_rate": 1.5404e-05, "loss": 0.3442, "step": 1150 }, { "epoch": 0.29122862074376277, "grad_norm": 2.09375, "learning_rate": 1.5364e-05, "loss": 0.3392, "step": 1160 }, { "epoch": 0.29373921230189864, "grad_norm": 1.9296875, "learning_rate": 1.5324e-05, "loss": 0.3689, "step": 1170 }, { "epoch": 0.2962498038600345, "grad_norm": 2.296875, "learning_rate": 1.5284e-05, "loss": 0.347, "step": 1180 }, { "epoch": 0.2987603954181704, "grad_norm": 1.96875, "learning_rate": 1.5244000000000002e-05, "loss": 0.3487, "step": 1190 }, { "epoch": 0.3012709869763063, "grad_norm": 2.46875, "learning_rate": 1.5204e-05, "loss": 0.3182, "step": 1200 }, { "epoch": 0.30378157853444215, "grad_norm": 2.109375, "learning_rate": 1.5164e-05, "loss": 0.3298, "step": 1210 }, { "epoch": 0.3062921700925781, "grad_norm": 2.40625, "learning_rate": 1.5124000000000001e-05, "loss": 0.3299, "step": 1220 }, { "epoch": 0.30880276165071396, "grad_norm": 2.53125, "learning_rate": 1.5084000000000002e-05, "loss": 0.3685, "step": 1230 }, { "epoch": 0.31131335320884984, "grad_norm": 2.09375, "learning_rate": 1.5044e-05, "loss": 0.3321, "step": 1240 }, { "epoch": 0.3138239447669857, "grad_norm": 1.9453125, "learning_rate": 1.5004e-05, "loss": 0.3397, "step": 1250 }, { "epoch": 0.3163345363251216, "grad_norm": 1.8359375, "learning_rate": 1.4964000000000001e-05, "loss": 0.3296, "step": 1260 }, { "epoch": 0.3188451278832575, "grad_norm": 1.875, "learning_rate": 1.4924000000000001e-05, "loss": 0.3122, "step": 1270 }, { "epoch": 0.3213557194413934, "grad_norm": 2.0, "learning_rate": 1.4884e-05, "loss": 0.3325, "step": 1280 }, { "epoch": 0.3238663109995293, "grad_norm": 1.9453125, "learning_rate": 1.4844e-05, "loss": 0.343, "step": 1290 }, { "epoch": 0.32637690255766516, "grad_norm": 2.21875, "learning_rate": 1.4804000000000001e-05, "loss": 0.332, "step": 1300 }, { "epoch": 0.32888749411580104, "grad_norm": 2.234375, "learning_rate": 1.4764000000000001e-05, "loss": 0.3303, "step": 1310 }, { "epoch": 0.3313980856739369, "grad_norm": 2.25, "learning_rate": 1.4724e-05, "loss": 0.3302, "step": 1320 }, { "epoch": 0.3339086772320728, "grad_norm": 1.6796875, "learning_rate": 1.4684e-05, "loss": 0.3304, "step": 1330 }, { "epoch": 0.33641926879020867, "grad_norm": 1.7734375, "learning_rate": 1.4644e-05, "loss": 0.3247, "step": 1340 }, { "epoch": 0.3389298603483446, "grad_norm": 1.9375, "learning_rate": 1.4604000000000001e-05, "loss": 0.324, "step": 1350 }, { "epoch": 0.3414404519064805, "grad_norm": 1.5390625, "learning_rate": 1.4564e-05, "loss": 0.3353, "step": 1360 }, { "epoch": 0.34395104346461636, "grad_norm": 2.03125, "learning_rate": 1.4524e-05, "loss": 0.321, "step": 1370 }, { "epoch": 0.34646163502275223, "grad_norm": 2.5625, "learning_rate": 1.4484e-05, "loss": 0.3133, "step": 1380 }, { "epoch": 0.3489722265808881, "grad_norm": 2.34375, "learning_rate": 1.4444000000000001e-05, "loss": 0.3516, "step": 1390 }, { "epoch": 0.351482818139024, "grad_norm": 1.6015625, "learning_rate": 1.4404e-05, "loss": 0.3093, "step": 1400 }, { "epoch": 0.35399340969715987, "grad_norm": 1.4765625, "learning_rate": 1.4364e-05, "loss": 0.3171, "step": 1410 }, { "epoch": 0.3565040012552958, "grad_norm": 2.0, "learning_rate": 1.4324e-05, "loss": 0.3321, "step": 1420 }, { "epoch": 0.3590145928134317, "grad_norm": 2.0625, "learning_rate": 1.4284e-05, "loss": 0.3111, "step": 1430 }, { "epoch": 0.36152518437156755, "grad_norm": 1.9375, "learning_rate": 1.4244000000000003e-05, "loss": 0.3403, "step": 1440 }, { "epoch": 0.36403577592970343, "grad_norm": 1.859375, "learning_rate": 1.4204000000000002e-05, "loss": 0.3174, "step": 1450 }, { "epoch": 0.3665463674878393, "grad_norm": 2.03125, "learning_rate": 1.4164000000000002e-05, "loss": 0.3332, "step": 1460 }, { "epoch": 0.3690569590459752, "grad_norm": 1.7734375, "learning_rate": 1.4124000000000002e-05, "loss": 0.3139, "step": 1470 }, { "epoch": 0.3715675506041111, "grad_norm": 1.9375, "learning_rate": 1.4084000000000003e-05, "loss": 0.328, "step": 1480 }, { "epoch": 0.374078142162247, "grad_norm": 2.015625, "learning_rate": 1.4044000000000001e-05, "loss": 0.3301, "step": 1490 }, { "epoch": 0.3765887337203829, "grad_norm": 2.6875, "learning_rate": 1.4004000000000002e-05, "loss": 0.3218, "step": 1500 }, { "epoch": 0.37909932527851875, "grad_norm": 2.453125, "learning_rate": 1.3964000000000002e-05, "loss": 0.3371, "step": 1510 }, { "epoch": 0.38160991683665463, "grad_norm": 1.515625, "learning_rate": 1.3924000000000003e-05, "loss": 0.3162, "step": 1520 }, { "epoch": 0.3841205083947905, "grad_norm": 1.9765625, "learning_rate": 1.3884000000000001e-05, "loss": 0.3092, "step": 1530 }, { "epoch": 0.3866310999529264, "grad_norm": 2.1875, "learning_rate": 1.3844000000000002e-05, "loss": 0.3224, "step": 1540 }, { "epoch": 0.3891416915110623, "grad_norm": 2.390625, "learning_rate": 1.3804000000000002e-05, "loss": 0.3027, "step": 1550 }, { "epoch": 0.3916522830691982, "grad_norm": 2.375, "learning_rate": 1.3764000000000002e-05, "loss": 0.3303, "step": 1560 }, { "epoch": 0.39416287462733407, "grad_norm": 1.828125, "learning_rate": 1.3724000000000001e-05, "loss": 0.3317, "step": 1570 }, { "epoch": 0.39667346618546995, "grad_norm": 1.9140625, "learning_rate": 1.3684000000000001e-05, "loss": 0.3195, "step": 1580 }, { "epoch": 0.3991840577436058, "grad_norm": 2.03125, "learning_rate": 1.3644000000000002e-05, "loss": 0.3185, "step": 1590 }, { "epoch": 0.4016946493017417, "grad_norm": 1.75, "learning_rate": 1.3604000000000002e-05, "loss": 0.2818, "step": 1600 }, { "epoch": 0.40420524085987763, "grad_norm": 1.8515625, "learning_rate": 1.3564000000000001e-05, "loss": 0.2996, "step": 1610 }, { "epoch": 0.4067158324180135, "grad_norm": 1.7578125, "learning_rate": 1.3524000000000001e-05, "loss": 0.3167, "step": 1620 }, { "epoch": 0.4092264239761494, "grad_norm": 2.25, "learning_rate": 1.3484000000000002e-05, "loss": 0.3257, "step": 1630 }, { "epoch": 0.41173701553428527, "grad_norm": 1.671875, "learning_rate": 1.3444000000000002e-05, "loss": 0.3145, "step": 1640 }, { "epoch": 0.41424760709242114, "grad_norm": 1.59375, "learning_rate": 1.3404e-05, "loss": 0.2936, "step": 1650 }, { "epoch": 0.416758198650557, "grad_norm": 1.953125, "learning_rate": 1.3364000000000001e-05, "loss": 0.3158, "step": 1660 }, { "epoch": 0.4192687902086929, "grad_norm": 1.53125, "learning_rate": 1.3324000000000002e-05, "loss": 0.3233, "step": 1670 }, { "epoch": 0.42177938176682883, "grad_norm": 2.28125, "learning_rate": 1.3284000000000002e-05, "loss": 0.2975, "step": 1680 }, { "epoch": 0.4242899733249647, "grad_norm": 2.046875, "learning_rate": 1.3244e-05, "loss": 0.3058, "step": 1690 }, { "epoch": 0.4268005648831006, "grad_norm": 2.15625, "learning_rate": 1.3204000000000001e-05, "loss": 0.3107, "step": 1700 }, { "epoch": 0.42931115644123646, "grad_norm": 1.8203125, "learning_rate": 1.3164000000000001e-05, "loss": 0.3096, "step": 1710 }, { "epoch": 0.43182174799937234, "grad_norm": 1.3828125, "learning_rate": 1.3124000000000002e-05, "loss": 0.3052, "step": 1720 }, { "epoch": 0.4343323395575082, "grad_norm": 1.8515625, "learning_rate": 1.3084e-05, "loss": 0.3029, "step": 1730 }, { "epoch": 0.43684293111564415, "grad_norm": 1.8203125, "learning_rate": 1.3044e-05, "loss": 0.3147, "step": 1740 }, { "epoch": 0.43935352267378003, "grad_norm": 1.984375, "learning_rate": 1.3004000000000001e-05, "loss": 0.3053, "step": 1750 }, { "epoch": 0.4418641142319159, "grad_norm": 1.953125, "learning_rate": 1.2964000000000002e-05, "loss": 0.2913, "step": 1760 }, { "epoch": 0.4443747057900518, "grad_norm": 1.8515625, "learning_rate": 1.2924e-05, "loss": 0.2916, "step": 1770 }, { "epoch": 0.44688529734818766, "grad_norm": 2.046875, "learning_rate": 1.2884e-05, "loss": 0.3159, "step": 1780 }, { "epoch": 0.44939588890632354, "grad_norm": 1.90625, "learning_rate": 1.2844000000000001e-05, "loss": 0.2987, "step": 1790 }, { "epoch": 0.4519064804644594, "grad_norm": 1.953125, "learning_rate": 1.2804000000000001e-05, "loss": 0.3136, "step": 1800 }, { "epoch": 0.45441707202259535, "grad_norm": 1.6015625, "learning_rate": 1.2764e-05, "loss": 0.3038, "step": 1810 }, { "epoch": 0.4569276635807312, "grad_norm": 2.0, "learning_rate": 1.2724e-05, "loss": 0.2916, "step": 1820 }, { "epoch": 0.4594382551388671, "grad_norm": 1.9765625, "learning_rate": 1.2684000000000001e-05, "loss": 0.3113, "step": 1830 }, { "epoch": 0.461948846697003, "grad_norm": 1.5625, "learning_rate": 1.2644000000000001e-05, "loss": 0.2728, "step": 1840 }, { "epoch": 0.46445943825513886, "grad_norm": 1.9296875, "learning_rate": 1.2604e-05, "loss": 0.2868, "step": 1850 }, { "epoch": 0.46697002981327473, "grad_norm": 2.140625, "learning_rate": 1.2564e-05, "loss": 0.289, "step": 1860 }, { "epoch": 0.4694806213714106, "grad_norm": 1.4921875, "learning_rate": 1.2524e-05, "loss": 0.2881, "step": 1870 }, { "epoch": 0.47199121292954654, "grad_norm": 1.8984375, "learning_rate": 1.2484000000000001e-05, "loss": 0.2803, "step": 1880 }, { "epoch": 0.4745018044876824, "grad_norm": 1.9375, "learning_rate": 1.2444e-05, "loss": 0.2785, "step": 1890 }, { "epoch": 0.4770123960458183, "grad_norm": 1.71875, "learning_rate": 1.2404e-05, "loss": 0.2976, "step": 1900 }, { "epoch": 0.4795229876039542, "grad_norm": 1.9296875, "learning_rate": 1.2364e-05, "loss": 0.2737, "step": 1910 }, { "epoch": 0.48203357916209005, "grad_norm": 1.7421875, "learning_rate": 1.2324000000000001e-05, "loss": 0.3237, "step": 1920 }, { "epoch": 0.48454417072022593, "grad_norm": 1.859375, "learning_rate": 1.2284e-05, "loss": 0.297, "step": 1930 }, { "epoch": 0.48705476227836186, "grad_norm": 2.046875, "learning_rate": 1.2244e-05, "loss": 0.2745, "step": 1940 }, { "epoch": 0.48956535383649774, "grad_norm": 2.1875, "learning_rate": 1.2204e-05, "loss": 0.3059, "step": 1950 }, { "epoch": 0.4920759453946336, "grad_norm": 2.359375, "learning_rate": 1.2164e-05, "loss": 0.2876, "step": 1960 }, { "epoch": 0.4945865369527695, "grad_norm": 2.6875, "learning_rate": 1.2124e-05, "loss": 0.2801, "step": 1970 }, { "epoch": 0.4970971285109054, "grad_norm": 1.921875, "learning_rate": 1.2084e-05, "loss": 0.2971, "step": 1980 }, { "epoch": 0.49960772006904125, "grad_norm": 1.6484375, "learning_rate": 1.2044e-05, "loss": 0.2979, "step": 1990 }, { "epoch": 0.5021183116271771, "grad_norm": 1.9296875, "learning_rate": 1.2004e-05, "loss": 0.306, "step": 2000 }, { "epoch": 0.5046289031853131, "grad_norm": 1.5859375, "learning_rate": 1.1964e-05, "loss": 0.2975, "step": 2010 }, { "epoch": 0.5071394947434489, "grad_norm": 1.9296875, "learning_rate": 1.1924e-05, "loss": 0.2771, "step": 2020 }, { "epoch": 0.5096500863015848, "grad_norm": 2.265625, "learning_rate": 1.1884e-05, "loss": 0.2903, "step": 2030 }, { "epoch": 0.5121606778597207, "grad_norm": 1.7109375, "learning_rate": 1.1844e-05, "loss": 0.2808, "step": 2040 }, { "epoch": 0.5146712694178566, "grad_norm": 1.7890625, "learning_rate": 1.1803999999999999e-05, "loss": 0.2856, "step": 2050 }, { "epoch": 0.5171818609759925, "grad_norm": 1.6796875, "learning_rate": 1.1764e-05, "loss": 0.2868, "step": 2060 }, { "epoch": 0.5196924525341283, "grad_norm": 1.7109375, "learning_rate": 1.1724000000000002e-05, "loss": 0.2973, "step": 2070 }, { "epoch": 0.5222030440922643, "grad_norm": 1.8828125, "learning_rate": 1.1684000000000002e-05, "loss": 0.3097, "step": 2080 }, { "epoch": 0.5247136356504001, "grad_norm": 1.6953125, "learning_rate": 1.1644000000000002e-05, "loss": 0.2731, "step": 2090 }, { "epoch": 0.527224227208536, "grad_norm": 2.0625, "learning_rate": 1.1604000000000003e-05, "loss": 0.2849, "step": 2100 }, { "epoch": 0.5297348187666719, "grad_norm": 2.203125, "learning_rate": 1.1564000000000001e-05, "loss": 0.2949, "step": 2110 }, { "epoch": 0.5322454103248078, "grad_norm": 2.203125, "learning_rate": 1.1524000000000002e-05, "loss": 0.3049, "step": 2120 }, { "epoch": 0.5347560018829437, "grad_norm": 1.7421875, "learning_rate": 1.1484000000000002e-05, "loss": 0.2851, "step": 2130 }, { "epoch": 0.5372665934410795, "grad_norm": 1.84375, "learning_rate": 1.1444000000000003e-05, "loss": 0.2919, "step": 2140 }, { "epoch": 0.5397771849992155, "grad_norm": 1.765625, "learning_rate": 1.1404000000000001e-05, "loss": 0.2967, "step": 2150 }, { "epoch": 0.5422877765573513, "grad_norm": 1.5546875, "learning_rate": 1.1364000000000002e-05, "loss": 0.2801, "step": 2160 }, { "epoch": 0.5447983681154872, "grad_norm": 2.0, "learning_rate": 1.1324000000000002e-05, "loss": 0.3012, "step": 2170 }, { "epoch": 0.5473089596736231, "grad_norm": 1.8984375, "learning_rate": 1.1284000000000002e-05, "loss": 0.2925, "step": 2180 }, { "epoch": 0.549819551231759, "grad_norm": 1.609375, "learning_rate": 1.1244000000000001e-05, "loss": 0.2954, "step": 2190 }, { "epoch": 0.5523301427898949, "grad_norm": 1.734375, "learning_rate": 1.1204000000000001e-05, "loss": 0.2799, "step": 2200 }, { "epoch": 0.5548407343480307, "grad_norm": 1.515625, "learning_rate": 1.1164000000000002e-05, "loss": 0.2704, "step": 2210 }, { "epoch": 0.5573513259061667, "grad_norm": 1.6953125, "learning_rate": 1.1124000000000002e-05, "loss": 0.2876, "step": 2220 }, { "epoch": 0.5598619174643025, "grad_norm": 1.7421875, "learning_rate": 1.1084000000000001e-05, "loss": 0.2633, "step": 2230 }, { "epoch": 0.5623725090224384, "grad_norm": 1.796875, "learning_rate": 1.1044000000000001e-05, "loss": 0.2867, "step": 2240 }, { "epoch": 0.5648831005805743, "grad_norm": 1.4609375, "learning_rate": 1.1004000000000002e-05, "loss": 0.2702, "step": 2250 }, { "epoch": 0.5673936921387102, "grad_norm": 1.828125, "learning_rate": 1.0964000000000002e-05, "loss": 0.2852, "step": 2260 }, { "epoch": 0.5699042836968461, "grad_norm": 1.703125, "learning_rate": 1.0924e-05, "loss": 0.2746, "step": 2270 }, { "epoch": 0.5724148752549819, "grad_norm": 1.90625, "learning_rate": 1.0884000000000001e-05, "loss": 0.2932, "step": 2280 }, { "epoch": 0.5749254668131178, "grad_norm": 1.84375, "learning_rate": 1.0844000000000002e-05, "loss": 0.2903, "step": 2290 }, { "epoch": 0.5774360583712538, "grad_norm": 2.15625, "learning_rate": 1.0804000000000002e-05, "loss": 0.2851, "step": 2300 }, { "epoch": 0.5799466499293896, "grad_norm": 2.0625, "learning_rate": 1.0764e-05, "loss": 0.2923, "step": 2310 }, { "epoch": 0.5824572414875255, "grad_norm": 2.109375, "learning_rate": 1.0724000000000001e-05, "loss": 0.3003, "step": 2320 }, { "epoch": 0.5849678330456614, "grad_norm": 2.046875, "learning_rate": 1.0684000000000001e-05, "loss": 0.291, "step": 2330 }, { "epoch": 0.5874784246037973, "grad_norm": 1.8828125, "learning_rate": 1.0644000000000002e-05, "loss": 0.2787, "step": 2340 }, { "epoch": 0.5899890161619331, "grad_norm": 1.8203125, "learning_rate": 1.0604e-05, "loss": 0.2764, "step": 2350 }, { "epoch": 0.592499607720069, "grad_norm": 1.65625, "learning_rate": 1.0564e-05, "loss": 0.2842, "step": 2360 }, { "epoch": 0.595010199278205, "grad_norm": 1.78125, "learning_rate": 1.0524000000000001e-05, "loss": 0.2922, "step": 2370 }, { "epoch": 0.5975207908363408, "grad_norm": 1.5625, "learning_rate": 1.0484000000000002e-05, "loss": 0.283, "step": 2380 }, { "epoch": 0.6000313823944767, "grad_norm": 1.5703125, "learning_rate": 1.0444e-05, "loss": 0.2796, "step": 2390 }, { "epoch": 0.6025419739526126, "grad_norm": 1.96875, "learning_rate": 1.0404e-05, "loss": 0.2828, "step": 2400 }, { "epoch": 0.6050525655107485, "grad_norm": 1.5703125, "learning_rate": 1.0364000000000001e-05, "loss": 0.2866, "step": 2410 }, { "epoch": 0.6075631570688843, "grad_norm": 2.109375, "learning_rate": 1.0324000000000001e-05, "loss": 0.2836, "step": 2420 }, { "epoch": 0.6100737486270202, "grad_norm": 1.875, "learning_rate": 1.0284e-05, "loss": 0.3037, "step": 2430 }, { "epoch": 0.6125843401851562, "grad_norm": 2.203125, "learning_rate": 1.0244e-05, "loss": 0.2774, "step": 2440 }, { "epoch": 0.615094931743292, "grad_norm": 1.8671875, "learning_rate": 1.0204000000000001e-05, "loss": 0.2922, "step": 2450 }, { "epoch": 0.6176055233014279, "grad_norm": 1.7109375, "learning_rate": 1.0164000000000001e-05, "loss": 0.283, "step": 2460 }, { "epoch": 0.6201161148595637, "grad_norm": 1.9453125, "learning_rate": 1.0124e-05, "loss": 0.2725, "step": 2470 }, { "epoch": 0.6226267064176997, "grad_norm": 1.6875, "learning_rate": 1.0084e-05, "loss": 0.2749, "step": 2480 }, { "epoch": 0.6251372979758355, "grad_norm": 1.703125, "learning_rate": 1.0044e-05, "loss": 0.2823, "step": 2490 }, { "epoch": 0.6276478895339714, "grad_norm": 1.9296875, "learning_rate": 1.0004000000000001e-05, "loss": 0.281, "step": 2500 }, { "epoch": 0.6301584810921074, "grad_norm": 1.9140625, "learning_rate": 9.964e-06, "loss": 0.2856, "step": 2510 }, { "epoch": 0.6326690726502432, "grad_norm": 1.4921875, "learning_rate": 9.924e-06, "loss": 0.28, "step": 2520 }, { "epoch": 0.6351796642083791, "grad_norm": 1.53125, "learning_rate": 9.884e-06, "loss": 0.267, "step": 2530 }, { "epoch": 0.637690255766515, "grad_norm": 1.5703125, "learning_rate": 9.844000000000001e-06, "loss": 0.2563, "step": 2540 }, { "epoch": 0.6402008473246509, "grad_norm": 1.46875, "learning_rate": 9.804000000000001e-06, "loss": 0.2911, "step": 2550 }, { "epoch": 0.6427114388827868, "grad_norm": 2.46875, "learning_rate": 9.764000000000002e-06, "loss": 0.2897, "step": 2560 }, { "epoch": 0.6452220304409226, "grad_norm": 1.90625, "learning_rate": 9.724e-06, "loss": 0.2975, "step": 2570 }, { "epoch": 0.6477326219990586, "grad_norm": 1.890625, "learning_rate": 9.684e-06, "loss": 0.2722, "step": 2580 }, { "epoch": 0.6502432135571944, "grad_norm": 1.6015625, "learning_rate": 9.644000000000001e-06, "loss": 0.2787, "step": 2590 }, { "epoch": 0.6527538051153303, "grad_norm": 2.046875, "learning_rate": 9.604000000000002e-06, "loss": 0.2822, "step": 2600 }, { "epoch": 0.6552643966734661, "grad_norm": 1.9296875, "learning_rate": 9.564e-06, "loss": 0.2903, "step": 2610 }, { "epoch": 0.6577749882316021, "grad_norm": 1.78125, "learning_rate": 9.524e-06, "loss": 0.2693, "step": 2620 }, { "epoch": 0.660285579789738, "grad_norm": 1.46875, "learning_rate": 9.484000000000001e-06, "loss": 0.2768, "step": 2630 }, { "epoch": 0.6627961713478738, "grad_norm": 1.734375, "learning_rate": 9.444000000000001e-06, "loss": 0.2707, "step": 2640 }, { "epoch": 0.6653067629060098, "grad_norm": 1.5703125, "learning_rate": 9.404e-06, "loss": 0.2791, "step": 2650 }, { "epoch": 0.6678173544641456, "grad_norm": 1.703125, "learning_rate": 9.364e-06, "loss": 0.2924, "step": 2660 }, { "epoch": 0.6703279460222815, "grad_norm": 1.9296875, "learning_rate": 9.324000000000001e-06, "loss": 0.2875, "step": 2670 }, { "epoch": 0.6728385375804173, "grad_norm": 1.78125, "learning_rate": 9.284000000000001e-06, "loss": 0.2602, "step": 2680 }, { "epoch": 0.6753491291385533, "grad_norm": 1.6875, "learning_rate": 9.244e-06, "loss": 0.2924, "step": 2690 }, { "epoch": 0.6778597206966892, "grad_norm": 1.578125, "learning_rate": 9.204e-06, "loss": 0.2841, "step": 2700 }, { "epoch": 0.680370312254825, "grad_norm": 1.390625, "learning_rate": 9.164e-06, "loss": 0.2745, "step": 2710 }, { "epoch": 0.682880903812961, "grad_norm": 2.28125, "learning_rate": 9.124000000000001e-06, "loss": 0.2876, "step": 2720 }, { "epoch": 0.6853914953710968, "grad_norm": 1.546875, "learning_rate": 9.084e-06, "loss": 0.2549, "step": 2730 }, { "epoch": 0.6879020869292327, "grad_norm": 1.3515625, "learning_rate": 9.044e-06, "loss": 0.2512, "step": 2740 }, { "epoch": 0.6904126784873685, "grad_norm": 2.203125, "learning_rate": 9.004e-06, "loss": 0.2686, "step": 2750 }, { "epoch": 0.6929232700455045, "grad_norm": 1.6484375, "learning_rate": 8.964000000000001e-06, "loss": 0.2714, "step": 2760 }, { "epoch": 0.6954338616036404, "grad_norm": 1.6796875, "learning_rate": 8.924e-06, "loss": 0.275, "step": 2770 }, { "epoch": 0.6979444531617762, "grad_norm": 1.8359375, "learning_rate": 8.884e-06, "loss": 0.2631, "step": 2780 }, { "epoch": 0.7004550447199122, "grad_norm": 1.3828125, "learning_rate": 8.844e-06, "loss": 0.2639, "step": 2790 }, { "epoch": 0.702965636278048, "grad_norm": 1.59375, "learning_rate": 8.804e-06, "loss": 0.2755, "step": 2800 }, { "epoch": 0.7054762278361839, "grad_norm": 1.8828125, "learning_rate": 8.764e-06, "loss": 0.282, "step": 2810 }, { "epoch": 0.7079868193943197, "grad_norm": 1.9140625, "learning_rate": 8.724e-06, "loss": 0.2672, "step": 2820 }, { "epoch": 0.7104974109524557, "grad_norm": 1.5859375, "learning_rate": 8.684e-06, "loss": 0.2663, "step": 2830 }, { "epoch": 0.7130080025105916, "grad_norm": 1.75, "learning_rate": 8.644e-06, "loss": 0.263, "step": 2840 }, { "epoch": 0.7155185940687274, "grad_norm": 1.9765625, "learning_rate": 8.604000000000001e-06, "loss": 0.276, "step": 2850 }, { "epoch": 0.7180291856268634, "grad_norm": 1.5078125, "learning_rate": 8.564000000000001e-06, "loss": 0.2622, "step": 2860 }, { "epoch": 0.7205397771849992, "grad_norm": 1.890625, "learning_rate": 8.524000000000002e-06, "loss": 0.2574, "step": 2870 }, { "epoch": 0.7230503687431351, "grad_norm": 1.65625, "learning_rate": 8.484e-06, "loss": 0.2586, "step": 2880 }, { "epoch": 0.725560960301271, "grad_norm": 2.234375, "learning_rate": 8.444e-06, "loss": 0.2694, "step": 2890 }, { "epoch": 0.7280715518594069, "grad_norm": 1.6953125, "learning_rate": 8.404000000000001e-06, "loss": 0.2817, "step": 2900 }, { "epoch": 0.7305821434175428, "grad_norm": 1.828125, "learning_rate": 8.364000000000002e-06, "loss": 0.2767, "step": 2910 }, { "epoch": 0.7330927349756786, "grad_norm": 1.40625, "learning_rate": 8.324e-06, "loss": 0.2547, "step": 2920 }, { "epoch": 0.7356033265338145, "grad_norm": 1.65625, "learning_rate": 8.284e-06, "loss": 0.2724, "step": 2930 }, { "epoch": 0.7381139180919504, "grad_norm": 2.078125, "learning_rate": 8.244000000000001e-06, "loss": 0.2972, "step": 2940 }, { "epoch": 0.7406245096500863, "grad_norm": 1.484375, "learning_rate": 8.204000000000001e-06, "loss": 0.2601, "step": 2950 }, { "epoch": 0.7431351012082222, "grad_norm": 2.015625, "learning_rate": 8.164e-06, "loss": 0.2552, "step": 2960 }, { "epoch": 0.7456456927663581, "grad_norm": 2.234375, "learning_rate": 8.124e-06, "loss": 0.274, "step": 2970 }, { "epoch": 0.748156284324494, "grad_norm": 1.8125, "learning_rate": 8.084000000000001e-06, "loss": 0.272, "step": 2980 }, { "epoch": 0.7506668758826298, "grad_norm": 1.6953125, "learning_rate": 8.044000000000001e-06, "loss": 0.2826, "step": 2990 }, { "epoch": 0.7531774674407657, "grad_norm": 1.7109375, "learning_rate": 8.004e-06, "loss": 0.2799, "step": 3000 }, { "epoch": 0.7556880589989016, "grad_norm": 1.640625, "learning_rate": 7.964e-06, "loss": 0.2919, "step": 3010 }, { "epoch": 0.7581986505570375, "grad_norm": 1.6953125, "learning_rate": 7.924e-06, "loss": 0.2634, "step": 3020 }, { "epoch": 0.7607092421151734, "grad_norm": 1.8515625, "learning_rate": 7.884000000000001e-06, "loss": 0.259, "step": 3030 }, { "epoch": 0.7632198336733093, "grad_norm": 1.7109375, "learning_rate": 7.844e-06, "loss": 0.2687, "step": 3040 }, { "epoch": 0.7657304252314452, "grad_norm": 1.6015625, "learning_rate": 7.804e-06, "loss": 0.2644, "step": 3050 }, { "epoch": 0.768241016789581, "grad_norm": 1.7421875, "learning_rate": 7.764e-06, "loss": 0.2541, "step": 3060 }, { "epoch": 0.7707516083477169, "grad_norm": 1.3203125, "learning_rate": 7.724000000000001e-06, "loss": 0.2555, "step": 3070 }, { "epoch": 0.7732621999058528, "grad_norm": 1.4609375, "learning_rate": 7.684e-06, "loss": 0.2745, "step": 3080 }, { "epoch": 0.7757727914639887, "grad_norm": 1.5546875, "learning_rate": 7.644e-06, "loss": 0.2924, "step": 3090 }, { "epoch": 0.7782833830221246, "grad_norm": 1.78125, "learning_rate": 7.604e-06, "loss": 0.2859, "step": 3100 }, { "epoch": 0.7807939745802605, "grad_norm": 1.75, "learning_rate": 7.564e-06, "loss": 0.2752, "step": 3110 }, { "epoch": 0.7833045661383964, "grad_norm": 1.6171875, "learning_rate": 7.524e-06, "loss": 0.2611, "step": 3120 }, { "epoch": 0.7858151576965322, "grad_norm": 1.578125, "learning_rate": 7.484e-06, "loss": 0.256, "step": 3130 }, { "epoch": 0.7883257492546681, "grad_norm": 1.578125, "learning_rate": 7.444e-06, "loss": 0.2707, "step": 3140 }, { "epoch": 0.790836340812804, "grad_norm": 1.765625, "learning_rate": 7.404e-06, "loss": 0.2711, "step": 3150 }, { "epoch": 0.7933469323709399, "grad_norm": 1.6484375, "learning_rate": 7.364000000000001e-06, "loss": 0.2588, "step": 3160 }, { "epoch": 0.7958575239290758, "grad_norm": 1.7890625, "learning_rate": 7.324000000000001e-06, "loss": 0.2618, "step": 3170 }, { "epoch": 0.7983681154872116, "grad_norm": 1.7734375, "learning_rate": 7.284000000000001e-06, "loss": 0.2755, "step": 3180 }, { "epoch": 0.8008787070453476, "grad_norm": 1.5703125, "learning_rate": 7.244000000000001e-06, "loss": 0.2727, "step": 3190 }, { "epoch": 0.8033892986034834, "grad_norm": 1.453125, "learning_rate": 7.204000000000001e-06, "loss": 0.2584, "step": 3200 }, { "epoch": 0.8058998901616193, "grad_norm": 1.7265625, "learning_rate": 7.164000000000001e-06, "loss": 0.2767, "step": 3210 }, { "epoch": 0.8084104817197553, "grad_norm": 1.5703125, "learning_rate": 7.124000000000001e-06, "loss": 0.2735, "step": 3220 }, { "epoch": 0.8109210732778911, "grad_norm": 1.8203125, "learning_rate": 7.084000000000001e-06, "loss": 0.277, "step": 3230 }, { "epoch": 0.813431664836027, "grad_norm": 1.8125, "learning_rate": 7.044000000000001e-06, "loss": 0.2743, "step": 3240 }, { "epoch": 0.8159422563941628, "grad_norm": 1.953125, "learning_rate": 7.004000000000001e-06, "loss": 0.2804, "step": 3250 }, { "epoch": 0.8184528479522988, "grad_norm": 1.71875, "learning_rate": 6.964000000000001e-06, "loss": 0.27, "step": 3260 }, { "epoch": 0.8209634395104346, "grad_norm": 1.578125, "learning_rate": 6.924000000000001e-06, "loss": 0.2672, "step": 3270 }, { "epoch": 0.8234740310685705, "grad_norm": 1.671875, "learning_rate": 6.8840000000000005e-06, "loss": 0.2787, "step": 3280 }, { "epoch": 0.8259846226267065, "grad_norm": 2.34375, "learning_rate": 6.844000000000001e-06, "loss": 0.2624, "step": 3290 }, { "epoch": 0.8284952141848423, "grad_norm": 1.9765625, "learning_rate": 6.804e-06, "loss": 0.2655, "step": 3300 }, { "epoch": 0.8310058057429782, "grad_norm": 1.65625, "learning_rate": 6.764000000000001e-06, "loss": 0.2677, "step": 3310 }, { "epoch": 0.833516397301114, "grad_norm": 2.015625, "learning_rate": 6.724e-06, "loss": 0.2655, "step": 3320 }, { "epoch": 0.83602698885925, "grad_norm": 1.5703125, "learning_rate": 6.684000000000001e-06, "loss": 0.27, "step": 3330 }, { "epoch": 0.8385375804173858, "grad_norm": 1.6015625, "learning_rate": 6.644e-06, "loss": 0.2503, "step": 3340 }, { "epoch": 0.8410481719755217, "grad_norm": 2.15625, "learning_rate": 6.604000000000001e-06, "loss": 0.2773, "step": 3350 }, { "epoch": 0.8435587635336577, "grad_norm": 1.609375, "learning_rate": 6.564e-06, "loss": 0.274, "step": 3360 }, { "epoch": 0.8460693550917935, "grad_norm": 1.578125, "learning_rate": 6.5240000000000006e-06, "loss": 0.2587, "step": 3370 }, { "epoch": 0.8485799466499294, "grad_norm": 1.828125, "learning_rate": 6.484e-06, "loss": 0.2699, "step": 3380 }, { "epoch": 0.8510905382080652, "grad_norm": 1.8125, "learning_rate": 6.4440000000000005e-06, "loss": 0.255, "step": 3390 }, { "epoch": 0.8536011297662012, "grad_norm": 1.65625, "learning_rate": 6.404e-06, "loss": 0.2532, "step": 3400 }, { "epoch": 0.856111721324337, "grad_norm": 1.6171875, "learning_rate": 6.364e-06, "loss": 0.2745, "step": 3410 }, { "epoch": 0.8586223128824729, "grad_norm": 1.7421875, "learning_rate": 6.324e-06, "loss": 0.2713, "step": 3420 }, { "epoch": 0.8611329044406089, "grad_norm": 1.6875, "learning_rate": 6.284e-06, "loss": 0.271, "step": 3430 }, { "epoch": 0.8636434959987447, "grad_norm": 1.4375, "learning_rate": 6.244e-06, "loss": 0.2485, "step": 3440 }, { "epoch": 0.8661540875568806, "grad_norm": 1.3203125, "learning_rate": 6.204e-06, "loss": 0.2648, "step": 3450 }, { "epoch": 0.8686646791150164, "grad_norm": 1.8203125, "learning_rate": 6.164e-06, "loss": 0.2683, "step": 3460 }, { "epoch": 0.8711752706731524, "grad_norm": 1.5703125, "learning_rate": 6.124000000000001e-06, "loss": 0.261, "step": 3470 }, { "epoch": 0.8736858622312883, "grad_norm": 1.4296875, "learning_rate": 6.084000000000001e-06, "loss": 0.2541, "step": 3480 }, { "epoch": 0.8761964537894241, "grad_norm": 1.859375, "learning_rate": 6.044000000000001e-06, "loss": 0.2607, "step": 3490 }, { "epoch": 0.8787070453475601, "grad_norm": 2.71875, "learning_rate": 6.004000000000001e-06, "loss": 0.2516, "step": 3500 }, { "epoch": 0.8812176369056959, "grad_norm": 1.578125, "learning_rate": 5.964000000000001e-06, "loss": 0.2678, "step": 3510 }, { "epoch": 0.8837282284638318, "grad_norm": 1.65625, "learning_rate": 5.924000000000001e-06, "loss": 0.2628, "step": 3520 }, { "epoch": 0.8862388200219676, "grad_norm": 1.5546875, "learning_rate": 5.884000000000001e-06, "loss": 0.2557, "step": 3530 }, { "epoch": 0.8887494115801036, "grad_norm": 1.8828125, "learning_rate": 5.844000000000001e-06, "loss": 0.2663, "step": 3540 }, { "epoch": 0.8912600031382395, "grad_norm": 2.125, "learning_rate": 5.804000000000001e-06, "loss": 0.2789, "step": 3550 }, { "epoch": 0.8937705946963753, "grad_norm": 1.78125, "learning_rate": 5.764000000000001e-06, "loss": 0.2646, "step": 3560 }, { "epoch": 0.8962811862545113, "grad_norm": 1.796875, "learning_rate": 5.724000000000001e-06, "loss": 0.2627, "step": 3570 }, { "epoch": 0.8987917778126471, "grad_norm": 1.609375, "learning_rate": 5.684000000000001e-06, "loss": 0.281, "step": 3580 }, { "epoch": 0.901302369370783, "grad_norm": 1.8515625, "learning_rate": 5.6440000000000005e-06, "loss": 0.2588, "step": 3590 }, { "epoch": 0.9038129609289188, "grad_norm": 1.9140625, "learning_rate": 5.604000000000001e-06, "loss": 0.267, "step": 3600 }, { "epoch": 0.9063235524870548, "grad_norm": 1.75, "learning_rate": 5.5640000000000004e-06, "loss": 0.2546, "step": 3610 }, { "epoch": 0.9088341440451907, "grad_norm": 1.484375, "learning_rate": 5.524000000000001e-06, "loss": 0.2607, "step": 3620 }, { "epoch": 0.9113447356033265, "grad_norm": 1.5703125, "learning_rate": 5.484e-06, "loss": 0.2804, "step": 3630 }, { "epoch": 0.9138553271614624, "grad_norm": 1.6328125, "learning_rate": 5.444000000000001e-06, "loss": 0.2595, "step": 3640 }, { "epoch": 0.9163659187195983, "grad_norm": 1.84375, "learning_rate": 5.404e-06, "loss": 0.2697, "step": 3650 }, { "epoch": 0.9188765102777342, "grad_norm": 1.5703125, "learning_rate": 5.364000000000001e-06, "loss": 0.2725, "step": 3660 }, { "epoch": 0.92138710183587, "grad_norm": 1.3984375, "learning_rate": 5.324e-06, "loss": 0.2613, "step": 3670 }, { "epoch": 0.923897693394006, "grad_norm": 1.8046875, "learning_rate": 5.2840000000000006e-06, "loss": 0.2708, "step": 3680 }, { "epoch": 0.9264082849521419, "grad_norm": 1.578125, "learning_rate": 5.244e-06, "loss": 0.2685, "step": 3690 }, { "epoch": 0.9289188765102777, "grad_norm": 1.8828125, "learning_rate": 5.2040000000000005e-06, "loss": 0.272, "step": 3700 }, { "epoch": 0.9314294680684136, "grad_norm": 1.6875, "learning_rate": 5.164e-06, "loss": 0.2801, "step": 3710 }, { "epoch": 0.9339400596265495, "grad_norm": 1.6015625, "learning_rate": 5.124e-06, "loss": 0.257, "step": 3720 }, { "epoch": 0.9364506511846854, "grad_norm": 1.5, "learning_rate": 5.084e-06, "loss": 0.2511, "step": 3730 }, { "epoch": 0.9389612427428212, "grad_norm": 1.5390625, "learning_rate": 5.044e-06, "loss": 0.2645, "step": 3740 }, { "epoch": 0.9414718343009572, "grad_norm": 1.6640625, "learning_rate": 5.004e-06, "loss": 0.2711, "step": 3750 }, { "epoch": 0.9439824258590931, "grad_norm": 1.7421875, "learning_rate": 4.964e-06, "loss": 0.2547, "step": 3760 }, { "epoch": 0.9464930174172289, "grad_norm": 1.7734375, "learning_rate": 4.924000000000001e-06, "loss": 0.2728, "step": 3770 }, { "epoch": 0.9490036089753648, "grad_norm": 1.421875, "learning_rate": 4.884e-06, "loss": 0.2666, "step": 3780 }, { "epoch": 0.9515142005335007, "grad_norm": 1.3515625, "learning_rate": 4.8440000000000005e-06, "loss": 0.2509, "step": 3790 }, { "epoch": 0.9540247920916366, "grad_norm": 1.6640625, "learning_rate": 4.804e-06, "loss": 0.2643, "step": 3800 }, { "epoch": 0.9565353836497725, "grad_norm": 1.5703125, "learning_rate": 4.7640000000000005e-06, "loss": 0.2568, "step": 3810 }, { "epoch": 0.9590459752079084, "grad_norm": 1.6796875, "learning_rate": 4.724e-06, "loss": 0.2673, "step": 3820 }, { "epoch": 0.9615565667660443, "grad_norm": 1.625, "learning_rate": 4.684e-06, "loss": 0.2622, "step": 3830 }, { "epoch": 0.9640671583241801, "grad_norm": 1.5625, "learning_rate": 4.644e-06, "loss": 0.2656, "step": 3840 }, { "epoch": 0.966577749882316, "grad_norm": 1.4609375, "learning_rate": 4.604e-06, "loss": 0.2581, "step": 3850 }, { "epoch": 0.9690883414404519, "grad_norm": 1.6875, "learning_rate": 4.564e-06, "loss": 0.2529, "step": 3860 }, { "epoch": 0.9715989329985878, "grad_norm": 1.8125, "learning_rate": 4.524e-06, "loss": 0.284, "step": 3870 }, { "epoch": 0.9741095245567237, "grad_norm": 2.484375, "learning_rate": 4.484000000000001e-06, "loss": 0.2596, "step": 3880 }, { "epoch": 0.9766201161148595, "grad_norm": 1.5390625, "learning_rate": 4.444e-06, "loss": 0.2759, "step": 3890 }, { "epoch": 0.9791307076729955, "grad_norm": 1.46875, "learning_rate": 4.4040000000000005e-06, "loss": 0.2563, "step": 3900 }, { "epoch": 0.9816412992311313, "grad_norm": 1.53125, "learning_rate": 4.364e-06, "loss": 0.25, "step": 3910 }, { "epoch": 0.9841518907892672, "grad_norm": 1.875, "learning_rate": 4.3240000000000004e-06, "loss": 0.2747, "step": 3920 }, { "epoch": 0.9866624823474031, "grad_norm": 1.6640625, "learning_rate": 4.284e-06, "loss": 0.2691, "step": 3930 }, { "epoch": 0.989173073905539, "grad_norm": 1.53125, "learning_rate": 4.244e-06, "loss": 0.2431, "step": 3940 }, { "epoch": 0.9916836654636749, "grad_norm": 1.46875, "learning_rate": 4.204e-06, "loss": 0.2422, "step": 3950 }, { "epoch": 0.9941942570218107, "grad_norm": 1.984375, "learning_rate": 4.164e-06, "loss": 0.2554, "step": 3960 }, { "epoch": 0.9967048485799467, "grad_norm": 1.671875, "learning_rate": 4.124e-06, "loss": 0.2659, "step": 3970 }, { "epoch": 0.9992154401380825, "grad_norm": 1.875, "learning_rate": 4.084e-06, "loss": 0.2563, "step": 3980 }, { "epoch": 1.0015063549348815, "grad_norm": 1.53125, "learning_rate": 4.044e-06, "loss": 0.2382, "step": 3990 }, { "epoch": 1.0040169464930173, "grad_norm": 1.34375, "learning_rate": 4.004e-06, "loss": 0.2385, "step": 4000 }, { "epoch": 1.0065275380511534, "grad_norm": 1.4921875, "learning_rate": 3.964e-06, "loss": 0.2354, "step": 4010 }, { "epoch": 1.0090381296092892, "grad_norm": 1.578125, "learning_rate": 3.924000000000001e-06, "loss": 0.2446, "step": 4020 }, { "epoch": 1.011548721167425, "grad_norm": 1.5703125, "learning_rate": 3.884e-06, "loss": 0.2355, "step": 4030 }, { "epoch": 1.014059312725561, "grad_norm": 1.4609375, "learning_rate": 3.844000000000001e-06, "loss": 0.2358, "step": 4040 }, { "epoch": 1.0165699042836969, "grad_norm": 1.6171875, "learning_rate": 3.8040000000000003e-06, "loss": 0.2387, "step": 4050 }, { "epoch": 1.0190804958418327, "grad_norm": 1.6015625, "learning_rate": 3.7640000000000003e-06, "loss": 0.2342, "step": 4060 }, { "epoch": 1.0215910873999685, "grad_norm": 1.5546875, "learning_rate": 3.7240000000000003e-06, "loss": 0.2381, "step": 4070 }, { "epoch": 1.0241016789581046, "grad_norm": 1.765625, "learning_rate": 3.6840000000000002e-06, "loss": 0.2392, "step": 4080 }, { "epoch": 1.0266122705162404, "grad_norm": 1.515625, "learning_rate": 3.644e-06, "loss": 0.2335, "step": 4090 }, { "epoch": 1.0291228620743762, "grad_norm": 1.453125, "learning_rate": 3.604e-06, "loss": 0.2544, "step": 4100 }, { "epoch": 1.0316334536325122, "grad_norm": 1.4296875, "learning_rate": 3.564e-06, "loss": 0.2254, "step": 4110 }, { "epoch": 1.034144045190648, "grad_norm": 1.7734375, "learning_rate": 3.524e-06, "loss": 0.2443, "step": 4120 }, { "epoch": 1.0366546367487839, "grad_norm": 1.6640625, "learning_rate": 3.484e-06, "loss": 0.2492, "step": 4130 }, { "epoch": 1.0391652283069197, "grad_norm": 1.4296875, "learning_rate": 3.444e-06, "loss": 0.2285, "step": 4140 }, { "epoch": 1.0416758198650558, "grad_norm": 1.7109375, "learning_rate": 3.404e-06, "loss": 0.2309, "step": 4150 }, { "epoch": 1.0441864114231916, "grad_norm": 1.78125, "learning_rate": 3.364e-06, "loss": 0.2477, "step": 4160 }, { "epoch": 1.0466970029813274, "grad_norm": 1.7265625, "learning_rate": 3.324e-06, "loss": 0.2395, "step": 4170 }, { "epoch": 1.0492075945394634, "grad_norm": 1.5234375, "learning_rate": 3.2840000000000007e-06, "loss": 0.2266, "step": 4180 }, { "epoch": 1.0517181860975993, "grad_norm": 2.046875, "learning_rate": 3.2440000000000006e-06, "loss": 0.2382, "step": 4190 }, { "epoch": 1.054228777655735, "grad_norm": 1.8125, "learning_rate": 3.2040000000000006e-06, "loss": 0.2315, "step": 4200 }, { "epoch": 1.056739369213871, "grad_norm": 1.7109375, "learning_rate": 3.1640000000000005e-06, "loss": 0.2399, "step": 4210 }, { "epoch": 1.059249960772007, "grad_norm": 1.5703125, "learning_rate": 3.1240000000000005e-06, "loss": 0.2287, "step": 4220 }, { "epoch": 1.0617605523301428, "grad_norm": 1.4453125, "learning_rate": 3.0840000000000005e-06, "loss": 0.2399, "step": 4230 }, { "epoch": 1.0642711438882786, "grad_norm": 1.6171875, "learning_rate": 3.0440000000000004e-06, "loss": 0.2334, "step": 4240 }, { "epoch": 1.0667817354464146, "grad_norm": 1.671875, "learning_rate": 3.0040000000000004e-06, "loss": 0.2324, "step": 4250 }, { "epoch": 1.0692923270045505, "grad_norm": 1.5859375, "learning_rate": 2.9640000000000003e-06, "loss": 0.231, "step": 4260 }, { "epoch": 1.0718029185626863, "grad_norm": 1.640625, "learning_rate": 2.9240000000000003e-06, "loss": 0.2465, "step": 4270 }, { "epoch": 1.074313510120822, "grad_norm": 1.796875, "learning_rate": 2.8840000000000003e-06, "loss": 0.2465, "step": 4280 }, { "epoch": 1.0768241016789581, "grad_norm": 1.4375, "learning_rate": 2.8440000000000002e-06, "loss": 0.2514, "step": 4290 }, { "epoch": 1.079334693237094, "grad_norm": 1.71875, "learning_rate": 2.804e-06, "loss": 0.2318, "step": 4300 }, { "epoch": 1.0818452847952298, "grad_norm": 1.5234375, "learning_rate": 2.764e-06, "loss": 0.2366, "step": 4310 }, { "epoch": 1.0843558763533658, "grad_norm": 1.46875, "learning_rate": 2.724e-06, "loss": 0.2328, "step": 4320 }, { "epoch": 1.0868664679115017, "grad_norm": 1.734375, "learning_rate": 2.6840000000000005e-06, "loss": 0.2566, "step": 4330 }, { "epoch": 1.0893770594696375, "grad_norm": 1.4296875, "learning_rate": 2.6440000000000004e-06, "loss": 0.2243, "step": 4340 }, { "epoch": 1.0918876510277735, "grad_norm": 1.703125, "learning_rate": 2.6040000000000004e-06, "loss": 0.2234, "step": 4350 }, { "epoch": 1.0943982425859093, "grad_norm": 1.7421875, "learning_rate": 2.5640000000000004e-06, "loss": 0.2287, "step": 4360 }, { "epoch": 1.0969088341440452, "grad_norm": 1.5703125, "learning_rate": 2.5240000000000003e-06, "loss": 0.2365, "step": 4370 }, { "epoch": 1.099419425702181, "grad_norm": 1.4296875, "learning_rate": 2.4840000000000003e-06, "loss": 0.2299, "step": 4380 }, { "epoch": 1.101930017260317, "grad_norm": 1.8359375, "learning_rate": 2.4440000000000002e-06, "loss": 0.24, "step": 4390 }, { "epoch": 1.1044406088184529, "grad_norm": 1.5703125, "learning_rate": 2.404e-06, "loss": 0.2292, "step": 4400 }, { "epoch": 1.1069512003765887, "grad_norm": 1.59375, "learning_rate": 2.364e-06, "loss": 0.2356, "step": 4410 }, { "epoch": 1.1094617919347247, "grad_norm": 1.8671875, "learning_rate": 2.324e-06, "loss": 0.2395, "step": 4420 }, { "epoch": 1.1119723834928605, "grad_norm": 1.7734375, "learning_rate": 2.284e-06, "loss": 0.2534, "step": 4430 }, { "epoch": 1.1144829750509964, "grad_norm": 1.6328125, "learning_rate": 2.244e-06, "loss": 0.2294, "step": 4440 }, { "epoch": 1.1169935666091322, "grad_norm": 1.6796875, "learning_rate": 2.2040000000000004e-06, "loss": 0.2428, "step": 4450 }, { "epoch": 1.1195041581672682, "grad_norm": 1.5, "learning_rate": 2.1640000000000004e-06, "loss": 0.2486, "step": 4460 }, { "epoch": 1.122014749725404, "grad_norm": 1.515625, "learning_rate": 2.1240000000000003e-06, "loss": 0.2428, "step": 4470 }, { "epoch": 1.1245253412835399, "grad_norm": 1.65625, "learning_rate": 2.0840000000000003e-06, "loss": 0.2438, "step": 4480 }, { "epoch": 1.127035932841676, "grad_norm": 1.796875, "learning_rate": 2.0440000000000003e-06, "loss": 0.2278, "step": 4490 }, { "epoch": 1.1295465243998117, "grad_norm": 1.640625, "learning_rate": 2.004e-06, "loss": 0.2306, "step": 4500 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.194052832115884e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }