{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.96, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 0.09734748303890228, "learning_rate": 9.9712e-06, "loss": 1.6245, "step": 10 }, { "epoch": 0.0064, "grad_norm": 0.0981544628739357, "learning_rate": 9.939200000000001e-06, "loss": 1.5375, "step": 20 }, { "epoch": 0.0096, "grad_norm": 0.09418202936649323, "learning_rate": 9.9072e-06, "loss": 1.5647, "step": 30 }, { "epoch": 0.0128, "grad_norm": 0.10748359560966492, "learning_rate": 9.8752e-06, "loss": 1.6781, "step": 40 }, { "epoch": 0.016, "grad_norm": 0.12658047676086426, "learning_rate": 9.843200000000001e-06, "loss": 1.5854, "step": 50 }, { "epoch": 0.0192, "grad_norm": 0.1334228664636612, "learning_rate": 9.8112e-06, "loss": 1.5453, "step": 60 }, { "epoch": 0.0224, "grad_norm": 0.15112873911857605, "learning_rate": 9.779200000000001e-06, "loss": 1.5721, "step": 70 }, { "epoch": 0.0256, "grad_norm": 0.140653595328331, "learning_rate": 9.7472e-06, "loss": 1.5162, "step": 80 }, { "epoch": 0.0288, "grad_norm": 0.16999679803848267, "learning_rate": 9.715200000000001e-06, "loss": 1.5689, "step": 90 }, { "epoch": 0.032, "grad_norm": 0.1928016096353531, "learning_rate": 9.6832e-06, "loss": 1.5845, "step": 100 }, { "epoch": 0.0352, "grad_norm": 0.19378426671028137, "learning_rate": 9.6512e-06, "loss": 1.5386, "step": 110 }, { "epoch": 0.0384, "grad_norm": 0.24590148031711578, "learning_rate": 9.619200000000001e-06, "loss": 1.4133, "step": 120 }, { "epoch": 0.0416, "grad_norm": 0.23824049532413483, "learning_rate": 9.5872e-06, "loss": 1.4573, "step": 130 }, { "epoch": 0.0448, "grad_norm": 0.19866596162319183, "learning_rate": 9.555200000000001e-06, "loss": 1.4357, "step": 140 }, { "epoch": 0.048, "grad_norm": 0.2909606993198395, "learning_rate": 9.5232e-06, "loss": 1.3924, "step": 150 }, { "epoch": 0.0512, "grad_norm": 0.48891496658325195, "learning_rate": 9.4912e-06, "loss": 1.4041, "step": 160 }, { "epoch": 0.0544, "grad_norm": 0.3921829164028168, "learning_rate": 9.4592e-06, "loss": 1.3158, "step": 170 }, { "epoch": 0.0576, "grad_norm": 0.293231338262558, "learning_rate": 9.4272e-06, "loss": 1.4709, "step": 180 }, { "epoch": 0.0608, "grad_norm": 0.27421411871910095, "learning_rate": 9.395200000000001e-06, "loss": 1.4046, "step": 190 }, { "epoch": 0.064, "grad_norm": 0.1971723437309265, "learning_rate": 9.3632e-06, "loss": 1.417, "step": 200 }, { "epoch": 0.0672, "grad_norm": 0.27423539757728577, "learning_rate": 9.3312e-06, "loss": 1.3721, "step": 210 }, { "epoch": 0.0704, "grad_norm": 0.4509432315826416, "learning_rate": 9.2992e-06, "loss": 1.4333, "step": 220 }, { "epoch": 0.0736, "grad_norm": 0.3389282822608948, "learning_rate": 9.2672e-06, "loss": 1.352, "step": 230 }, { "epoch": 0.0768, "grad_norm": 0.2814404368400574, "learning_rate": 9.235200000000001e-06, "loss": 1.3682, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.2661599814891815, "learning_rate": 9.2032e-06, "loss": 1.3661, "step": 250 }, { "epoch": 0.0832, "grad_norm": 0.29006555676460266, "learning_rate": 9.171200000000001e-06, "loss": 1.299, "step": 260 }, { "epoch": 0.0864, "grad_norm": 0.2795925438404083, "learning_rate": 9.1392e-06, "loss": 1.3144, "step": 270 }, { "epoch": 0.0896, "grad_norm": 0.25778502225875854, "learning_rate": 9.1072e-06, "loss": 1.2957, "step": 280 }, { "epoch": 0.0928, "grad_norm": 0.26814839243888855, "learning_rate": 9.0752e-06, "loss": 1.3356, "step": 290 }, { "epoch": 0.096, "grad_norm": 0.3247470557689667, "learning_rate": 9.0432e-06, "loss": 1.3458, "step": 300 }, { "epoch": 0.0992, "grad_norm": 0.36921611428260803, "learning_rate": 9.011200000000001e-06, "loss": 1.3601, "step": 310 }, { "epoch": 0.1024, "grad_norm": 0.31122124195098877, "learning_rate": 8.979200000000002e-06, "loss": 1.3131, "step": 320 }, { "epoch": 0.1056, "grad_norm": 0.3557804822921753, "learning_rate": 8.9472e-06, "loss": 1.426, "step": 330 }, { "epoch": 0.1088, "grad_norm": 0.3266560137271881, "learning_rate": 8.9152e-06, "loss": 1.3386, "step": 340 }, { "epoch": 0.112, "grad_norm": 0.3932088017463684, "learning_rate": 8.8832e-06, "loss": 1.3982, "step": 350 }, { "epoch": 0.1152, "grad_norm": 0.32620078325271606, "learning_rate": 8.851200000000001e-06, "loss": 1.3048, "step": 360 }, { "epoch": 0.1184, "grad_norm": 0.30419647693634033, "learning_rate": 8.819200000000002e-06, "loss": 1.3761, "step": 370 }, { "epoch": 0.1216, "grad_norm": 0.29732590913772583, "learning_rate": 8.7872e-06, "loss": 1.2566, "step": 380 }, { "epoch": 0.1248, "grad_norm": 0.28484678268432617, "learning_rate": 8.7552e-06, "loss": 1.3666, "step": 390 }, { "epoch": 0.128, "grad_norm": 0.4168960154056549, "learning_rate": 8.7232e-06, "loss": 1.3396, "step": 400 }, { "epoch": 0.1312, "grad_norm": 0.3573697507381439, "learning_rate": 8.6912e-06, "loss": 1.3684, "step": 410 }, { "epoch": 0.1344, "grad_norm": 0.4777122735977173, "learning_rate": 8.659200000000002e-06, "loss": 1.3059, "step": 420 }, { "epoch": 0.1376, "grad_norm": 0.26450300216674805, "learning_rate": 8.627200000000001e-06, "loss": 1.3283, "step": 430 }, { "epoch": 0.1408, "grad_norm": 0.37447720766067505, "learning_rate": 8.5952e-06, "loss": 1.2667, "step": 440 }, { "epoch": 0.144, "grad_norm": 0.30257123708724976, "learning_rate": 8.5632e-06, "loss": 1.3147, "step": 450 }, { "epoch": 0.1472, "grad_norm": 0.34745684266090393, "learning_rate": 8.5312e-06, "loss": 1.3603, "step": 460 }, { "epoch": 0.1504, "grad_norm": 0.2882753312587738, "learning_rate": 8.499200000000002e-06, "loss": 1.3087, "step": 470 }, { "epoch": 0.1536, "grad_norm": 0.3751160204410553, "learning_rate": 8.467200000000001e-06, "loss": 1.342, "step": 480 }, { "epoch": 0.1568, "grad_norm": 0.3185778260231018, "learning_rate": 8.435200000000002e-06, "loss": 1.35, "step": 490 }, { "epoch": 0.16, "grad_norm": 0.2853422164916992, "learning_rate": 8.4032e-06, "loss": 1.3105, "step": 500 }, { "epoch": 0.1632, "grad_norm": 0.3187882602214813, "learning_rate": 8.3712e-06, "loss": 1.2915, "step": 510 }, { "epoch": 0.1664, "grad_norm": 0.4516860842704773, "learning_rate": 8.339200000000001e-06, "loss": 1.3449, "step": 520 }, { "epoch": 0.1696, "grad_norm": 0.3336597681045532, "learning_rate": 8.3072e-06, "loss": 1.2989, "step": 530 }, { "epoch": 0.1728, "grad_norm": 0.4279087781906128, "learning_rate": 8.275200000000002e-06, "loss": 1.2412, "step": 540 }, { "epoch": 0.176, "grad_norm": 0.4071614742279053, "learning_rate": 8.243200000000001e-06, "loss": 1.414, "step": 550 }, { "epoch": 0.1792, "grad_norm": 0.3194911479949951, "learning_rate": 8.2112e-06, "loss": 1.2762, "step": 560 }, { "epoch": 0.1824, "grad_norm": 0.3617415428161621, "learning_rate": 8.179200000000001e-06, "loss": 1.3225, "step": 570 }, { "epoch": 0.1856, "grad_norm": 0.3274191915988922, "learning_rate": 8.1472e-06, "loss": 1.3464, "step": 580 }, { "epoch": 0.1888, "grad_norm": 0.35526078939437866, "learning_rate": 8.115200000000002e-06, "loss": 1.315, "step": 590 }, { "epoch": 0.192, "grad_norm": 0.3728134036064148, "learning_rate": 8.0832e-06, "loss": 1.3023, "step": 600 }, { "epoch": 0.1952, "grad_norm": 0.4048090875148773, "learning_rate": 8.0512e-06, "loss": 1.2751, "step": 610 }, { "epoch": 0.1984, "grad_norm": 0.41539278626441956, "learning_rate": 8.019200000000001e-06, "loss": 1.3533, "step": 620 }, { "epoch": 0.2016, "grad_norm": 0.3269357979297638, "learning_rate": 7.9872e-06, "loss": 1.2709, "step": 630 }, { "epoch": 0.2048, "grad_norm": 0.3444967567920685, "learning_rate": 7.955200000000001e-06, "loss": 1.3119, "step": 640 }, { "epoch": 0.208, "grad_norm": 0.34097886085510254, "learning_rate": 7.9232e-06, "loss": 1.3444, "step": 650 }, { "epoch": 0.2112, "grad_norm": 0.42459428310394287, "learning_rate": 7.891200000000002e-06, "loss": 1.325, "step": 660 }, { "epoch": 0.2144, "grad_norm": 0.3942951261997223, "learning_rate": 7.859200000000001e-06, "loss": 1.3732, "step": 670 }, { "epoch": 0.2176, "grad_norm": 0.33468231558799744, "learning_rate": 7.8272e-06, "loss": 1.2883, "step": 680 }, { "epoch": 0.2208, "grad_norm": 0.3964150547981262, "learning_rate": 7.795200000000001e-06, "loss": 1.4014, "step": 690 }, { "epoch": 0.224, "grad_norm": 0.3447844386100769, "learning_rate": 7.7632e-06, "loss": 1.3205, "step": 700 }, { "epoch": 0.2272, "grad_norm": 0.380398154258728, "learning_rate": 7.731200000000001e-06, "loss": 1.2819, "step": 710 }, { "epoch": 0.2304, "grad_norm": 0.3823450207710266, "learning_rate": 7.6992e-06, "loss": 1.3097, "step": 720 }, { "epoch": 0.2336, "grad_norm": 0.3383599817752838, "learning_rate": 7.6672e-06, "loss": 1.346, "step": 730 }, { "epoch": 0.2368, "grad_norm": 0.39140060544013977, "learning_rate": 7.635200000000001e-06, "loss": 1.2961, "step": 740 }, { "epoch": 0.24, "grad_norm": 0.32159295678138733, "learning_rate": 7.6032e-06, "loss": 1.3045, "step": 750 }, { "epoch": 0.2432, "grad_norm": 0.3853408098220825, "learning_rate": 7.5712000000000005e-06, "loss": 1.2935, "step": 760 }, { "epoch": 0.2464, "grad_norm": 0.39150312542915344, "learning_rate": 7.539200000000001e-06, "loss": 1.2976, "step": 770 }, { "epoch": 0.2496, "grad_norm": 0.39306044578552246, "learning_rate": 7.507200000000001e-06, "loss": 1.2588, "step": 780 }, { "epoch": 0.2528, "grad_norm": 0.39256688952445984, "learning_rate": 7.4752e-06, "loss": 1.3252, "step": 790 }, { "epoch": 0.256, "grad_norm": 0.3738512098789215, "learning_rate": 7.4432e-06, "loss": 1.3162, "step": 800 }, { "epoch": 0.2592, "grad_norm": 0.4799080491065979, "learning_rate": 7.4112e-06, "loss": 1.2993, "step": 810 }, { "epoch": 0.2624, "grad_norm": 0.4616535007953644, "learning_rate": 7.3792000000000004e-06, "loss": 1.3356, "step": 820 }, { "epoch": 0.2656, "grad_norm": 0.37460416555404663, "learning_rate": 7.347200000000001e-06, "loss": 1.2938, "step": 830 }, { "epoch": 0.2688, "grad_norm": 0.4229544997215271, "learning_rate": 7.3152e-06, "loss": 1.26, "step": 840 }, { "epoch": 0.272, "grad_norm": 0.5051556825637817, "learning_rate": 7.2832e-06, "loss": 1.2868, "step": 850 }, { "epoch": 0.2752, "grad_norm": 0.3845407962799072, "learning_rate": 7.2512e-06, "loss": 1.3255, "step": 860 }, { "epoch": 0.2784, "grad_norm": 0.43234601616859436, "learning_rate": 7.2192e-06, "loss": 1.2756, "step": 870 }, { "epoch": 0.2816, "grad_norm": 0.390572190284729, "learning_rate": 7.187200000000001e-06, "loss": 1.3053, "step": 880 }, { "epoch": 0.2848, "grad_norm": 0.385815292596817, "learning_rate": 7.155200000000001e-06, "loss": 1.2608, "step": 890 }, { "epoch": 0.288, "grad_norm": 0.4778871238231659, "learning_rate": 7.1232e-06, "loss": 1.3109, "step": 900 }, { "epoch": 0.2912, "grad_norm": 0.3777396082878113, "learning_rate": 7.0912e-06, "loss": 1.2723, "step": 910 }, { "epoch": 0.2944, "grad_norm": 0.4682841897010803, "learning_rate": 7.0592e-06, "loss": 1.3304, "step": 920 }, { "epoch": 0.2976, "grad_norm": 0.3837222754955292, "learning_rate": 7.027200000000001e-06, "loss": 1.3081, "step": 930 }, { "epoch": 0.3008, "grad_norm": 0.3792935907840729, "learning_rate": 6.995200000000001e-06, "loss": 1.3176, "step": 940 }, { "epoch": 0.304, "grad_norm": 0.476096510887146, "learning_rate": 6.963200000000001e-06, "loss": 1.2764, "step": 950 }, { "epoch": 0.3072, "grad_norm": 0.4119466543197632, "learning_rate": 6.9312e-06, "loss": 1.3563, "step": 960 }, { "epoch": 0.3104, "grad_norm": 0.40938565135002136, "learning_rate": 6.8992e-06, "loss": 1.2782, "step": 970 }, { "epoch": 0.3136, "grad_norm": 0.4305261969566345, "learning_rate": 6.867200000000001e-06, "loss": 1.3333, "step": 980 }, { "epoch": 0.3168, "grad_norm": 0.3533143997192383, "learning_rate": 6.835200000000001e-06, "loss": 1.3686, "step": 990 }, { "epoch": 0.32, "grad_norm": 0.43104642629623413, "learning_rate": 6.803200000000001e-06, "loss": 1.3461, "step": 1000 }, { "epoch": 0.3232, "grad_norm": 0.5197634696960449, "learning_rate": 6.771200000000001e-06, "loss": 1.3316, "step": 1010 }, { "epoch": 0.3264, "grad_norm": 0.4084891080856323, "learning_rate": 6.7392e-06, "loss": 1.2941, "step": 1020 }, { "epoch": 0.3296, "grad_norm": 0.4634837508201599, "learning_rate": 6.707200000000001e-06, "loss": 1.2982, "step": 1030 }, { "epoch": 0.3328, "grad_norm": 0.4361494183540344, "learning_rate": 6.675200000000001e-06, "loss": 1.334, "step": 1040 }, { "epoch": 0.336, "grad_norm": 0.36735212802886963, "learning_rate": 6.643200000000001e-06, "loss": 1.3642, "step": 1050 }, { "epoch": 0.3392, "grad_norm": 0.3968944847583771, "learning_rate": 6.611200000000001e-06, "loss": 1.3784, "step": 1060 }, { "epoch": 0.3424, "grad_norm": 0.39363133907318115, "learning_rate": 6.5792e-06, "loss": 1.2715, "step": 1070 }, { "epoch": 0.3456, "grad_norm": 0.4664965867996216, "learning_rate": 6.547200000000001e-06, "loss": 1.3436, "step": 1080 }, { "epoch": 0.3488, "grad_norm": 0.3857831358909607, "learning_rate": 6.515200000000001e-06, "loss": 1.3084, "step": 1090 }, { "epoch": 0.352, "grad_norm": 0.41258570551872253, "learning_rate": 6.483200000000001e-06, "loss": 1.3288, "step": 1100 }, { "epoch": 0.3552, "grad_norm": 0.3971725404262543, "learning_rate": 6.451200000000001e-06, "loss": 1.3321, "step": 1110 }, { "epoch": 0.3584, "grad_norm": 0.3993317186832428, "learning_rate": 6.419200000000001e-06, "loss": 1.3385, "step": 1120 }, { "epoch": 0.3616, "grad_norm": 0.5872831344604492, "learning_rate": 6.3872000000000004e-06, "loss": 1.2817, "step": 1130 }, { "epoch": 0.3648, "grad_norm": 0.47822561860084534, "learning_rate": 6.355200000000001e-06, "loss": 1.3083, "step": 1140 }, { "epoch": 0.368, "grad_norm": 0.5206847786903381, "learning_rate": 6.323200000000001e-06, "loss": 1.3457, "step": 1150 }, { "epoch": 0.3712, "grad_norm": 0.41014567017555237, "learning_rate": 6.291200000000001e-06, "loss": 1.2687, "step": 1160 }, { "epoch": 0.3744, "grad_norm": 0.39573901891708374, "learning_rate": 6.259200000000001e-06, "loss": 1.3257, "step": 1170 }, { "epoch": 0.3776, "grad_norm": 0.40908557176589966, "learning_rate": 6.227200000000001e-06, "loss": 1.2587, "step": 1180 }, { "epoch": 0.3808, "grad_norm": 0.4308335781097412, "learning_rate": 6.1952e-06, "loss": 1.2764, "step": 1190 }, { "epoch": 0.384, "grad_norm": 0.41657981276512146, "learning_rate": 6.1632000000000006e-06, "loss": 1.3305, "step": 1200 }, { "epoch": 0.3872, "grad_norm": 0.446154922246933, "learning_rate": 6.131200000000001e-06, "loss": 1.3323, "step": 1210 }, { "epoch": 0.3904, "grad_norm": 0.43903544545173645, "learning_rate": 6.099200000000001e-06, "loss": 1.2731, "step": 1220 }, { "epoch": 0.3936, "grad_norm": 0.4204481542110443, "learning_rate": 6.067200000000001e-06, "loss": 1.2569, "step": 1230 }, { "epoch": 0.3968, "grad_norm": 0.4393060803413391, "learning_rate": 6.0352e-06, "loss": 1.3119, "step": 1240 }, { "epoch": 0.4, "grad_norm": 0.42466068267822266, "learning_rate": 6.0032e-06, "loss": 1.2106, "step": 1250 }, { "epoch": 0.4032, "grad_norm": 0.40182891488075256, "learning_rate": 5.9712000000000005e-06, "loss": 1.2566, "step": 1260 }, { "epoch": 0.4064, "grad_norm": 0.3702845275402069, "learning_rate": 5.939200000000001e-06, "loss": 1.3344, "step": 1270 }, { "epoch": 0.4096, "grad_norm": 0.4409834146499634, "learning_rate": 5.907200000000001e-06, "loss": 1.2553, "step": 1280 }, { "epoch": 0.4128, "grad_norm": 0.5070372223854065, "learning_rate": 5.875200000000001e-06, "loss": 1.2901, "step": 1290 }, { "epoch": 0.416, "grad_norm": 0.44239479303359985, "learning_rate": 5.8432e-06, "loss": 1.2086, "step": 1300 }, { "epoch": 0.4192, "grad_norm": 0.5466510653495789, "learning_rate": 5.8112e-06, "loss": 1.2959, "step": 1310 }, { "epoch": 0.4224, "grad_norm": 0.5056144595146179, "learning_rate": 5.7792000000000005e-06, "loss": 1.3353, "step": 1320 }, { "epoch": 0.4256, "grad_norm": 0.42606833577156067, "learning_rate": 5.747200000000001e-06, "loss": 1.3108, "step": 1330 }, { "epoch": 0.4288, "grad_norm": 0.41976213455200195, "learning_rate": 5.715200000000001e-06, "loss": 1.3248, "step": 1340 }, { "epoch": 0.432, "grad_norm": 0.48559048771858215, "learning_rate": 5.683200000000001e-06, "loss": 1.2686, "step": 1350 }, { "epoch": 0.4352, "grad_norm": 0.47761228680610657, "learning_rate": 5.6512e-06, "loss": 1.281, "step": 1360 }, { "epoch": 0.4384, "grad_norm": 0.4777953028678894, "learning_rate": 5.6192e-06, "loss": 1.2829, "step": 1370 }, { "epoch": 0.4416, "grad_norm": 0.44091978669166565, "learning_rate": 5.5872000000000005e-06, "loss": 1.3032, "step": 1380 }, { "epoch": 0.4448, "grad_norm": 0.48977166414260864, "learning_rate": 5.555200000000001e-06, "loss": 1.3418, "step": 1390 }, { "epoch": 0.448, "grad_norm": 0.6014530062675476, "learning_rate": 5.523200000000001e-06, "loss": 1.2119, "step": 1400 }, { "epoch": 0.4512, "grad_norm": 0.4750172793865204, "learning_rate": 5.491200000000001e-06, "loss": 1.3432, "step": 1410 }, { "epoch": 0.4544, "grad_norm": 0.5095167756080627, "learning_rate": 5.4592e-06, "loss": 1.3448, "step": 1420 }, { "epoch": 0.4576, "grad_norm": 0.47408685088157654, "learning_rate": 5.4272e-06, "loss": 1.3436, "step": 1430 }, { "epoch": 0.4608, "grad_norm": 0.45464885234832764, "learning_rate": 5.3952000000000005e-06, "loss": 1.1962, "step": 1440 }, { "epoch": 0.464, "grad_norm": 0.431349515914917, "learning_rate": 5.363200000000001e-06, "loss": 1.2773, "step": 1450 }, { "epoch": 0.4672, "grad_norm": 0.444397896528244, "learning_rate": 5.331200000000001e-06, "loss": 1.3163, "step": 1460 }, { "epoch": 0.4704, "grad_norm": 0.4360913038253784, "learning_rate": 5.2992e-06, "loss": 1.2759, "step": 1470 }, { "epoch": 0.4736, "grad_norm": 0.5152497887611389, "learning_rate": 5.2672e-06, "loss": 1.3225, "step": 1480 }, { "epoch": 0.4768, "grad_norm": 0.48929157853126526, "learning_rate": 5.2352e-06, "loss": 1.3213, "step": 1490 }, { "epoch": 0.48, "grad_norm": 0.4925262928009033, "learning_rate": 5.2032000000000004e-06, "loss": 1.2008, "step": 1500 }, { "epoch": 0.4832, "grad_norm": 0.46162164211273193, "learning_rate": 5.1712000000000006e-06, "loss": 1.2996, "step": 1510 }, { "epoch": 0.4864, "grad_norm": 0.4908200800418854, "learning_rate": 5.139200000000001e-06, "loss": 1.2729, "step": 1520 }, { "epoch": 0.4896, "grad_norm": 0.5178566575050354, "learning_rate": 5.1072e-06, "loss": 1.293, "step": 1530 }, { "epoch": 0.4928, "grad_norm": 0.5733951330184937, "learning_rate": 5.0752e-06, "loss": 1.3573, "step": 1540 }, { "epoch": 0.496, "grad_norm": 0.4558843672275543, "learning_rate": 5.0432e-06, "loss": 1.3445, "step": 1550 }, { "epoch": 0.4992, "grad_norm": 0.5171469449996948, "learning_rate": 5.0112e-06, "loss": 1.2293, "step": 1560 }, { "epoch": 0.5024, "grad_norm": 0.4879666864871979, "learning_rate": 4.9792000000000005e-06, "loss": 1.31, "step": 1570 }, { "epoch": 0.5056, "grad_norm": 0.4393675923347473, "learning_rate": 4.947200000000001e-06, "loss": 1.3186, "step": 1580 }, { "epoch": 0.5088, "grad_norm": 0.5072659254074097, "learning_rate": 4.915200000000001e-06, "loss": 1.2857, "step": 1590 }, { "epoch": 0.512, "grad_norm": 0.5163191556930542, "learning_rate": 4.8832e-06, "loss": 1.3401, "step": 1600 }, { "epoch": 0.5152, "grad_norm": 0.5119105577468872, "learning_rate": 4.8512e-06, "loss": 1.32, "step": 1610 }, { "epoch": 0.5184, "grad_norm": 0.5342932939529419, "learning_rate": 4.8192e-06, "loss": 1.206, "step": 1620 }, { "epoch": 0.5216, "grad_norm": 0.4517419636249542, "learning_rate": 4.7872000000000005e-06, "loss": 1.3077, "step": 1630 }, { "epoch": 0.5248, "grad_norm": 0.46141722798347473, "learning_rate": 4.755200000000001e-06, "loss": 1.2873, "step": 1640 }, { "epoch": 0.528, "grad_norm": 0.41747117042541504, "learning_rate": 4.723200000000001e-06, "loss": 1.2715, "step": 1650 }, { "epoch": 0.5312, "grad_norm": 0.48263996839523315, "learning_rate": 4.6912e-06, "loss": 1.2814, "step": 1660 }, { "epoch": 0.5344, "grad_norm": 0.4876611828804016, "learning_rate": 4.6592e-06, "loss": 1.2776, "step": 1670 }, { "epoch": 0.5376, "grad_norm": 0.46099624037742615, "learning_rate": 4.6272e-06, "loss": 1.3839, "step": 1680 }, { "epoch": 0.5408, "grad_norm": 0.46614623069763184, "learning_rate": 4.5952000000000005e-06, "loss": 1.2717, "step": 1690 }, { "epoch": 0.544, "grad_norm": 0.48747870326042175, "learning_rate": 4.563200000000001e-06, "loss": 1.2937, "step": 1700 }, { "epoch": 0.5472, "grad_norm": 0.5542135238647461, "learning_rate": 4.531200000000001e-06, "loss": 1.2622, "step": 1710 }, { "epoch": 0.5504, "grad_norm": 0.46008777618408203, "learning_rate": 4.4992e-06, "loss": 1.3188, "step": 1720 }, { "epoch": 0.5536, "grad_norm": 0.4853471517562866, "learning_rate": 4.4672e-06, "loss": 1.252, "step": 1730 }, { "epoch": 0.5568, "grad_norm": 0.44900670647621155, "learning_rate": 4.4352e-06, "loss": 1.2549, "step": 1740 }, { "epoch": 0.56, "grad_norm": 0.4973522126674652, "learning_rate": 4.4032000000000005e-06, "loss": 1.2959, "step": 1750 }, { "epoch": 0.5632, "grad_norm": 0.45412448048591614, "learning_rate": 4.371200000000001e-06, "loss": 1.2092, "step": 1760 }, { "epoch": 0.5664, "grad_norm": 0.5110604763031006, "learning_rate": 4.3392e-06, "loss": 1.3127, "step": 1770 }, { "epoch": 0.5696, "grad_norm": 0.5951307415962219, "learning_rate": 4.3072e-06, "loss": 1.2603, "step": 1780 }, { "epoch": 0.5728, "grad_norm": 0.49740588665008545, "learning_rate": 4.2752e-06, "loss": 1.2609, "step": 1790 }, { "epoch": 0.576, "grad_norm": 0.4803503155708313, "learning_rate": 4.2432e-06, "loss": 1.2287, "step": 1800 }, { "epoch": 0.5792, "grad_norm": 0.48638489842414856, "learning_rate": 4.2112000000000004e-06, "loss": 1.2245, "step": 1810 }, { "epoch": 0.5824, "grad_norm": 0.48148202896118164, "learning_rate": 4.179200000000001e-06, "loss": 1.2858, "step": 1820 }, { "epoch": 0.5856, "grad_norm": 0.5493887662887573, "learning_rate": 4.1472e-06, "loss": 1.2765, "step": 1830 }, { "epoch": 0.5888, "grad_norm": 0.45376092195510864, "learning_rate": 4.1152e-06, "loss": 1.1914, "step": 1840 }, { "epoch": 0.592, "grad_norm": 0.5095167756080627, "learning_rate": 4.0832e-06, "loss": 1.2916, "step": 1850 }, { "epoch": 0.5952, "grad_norm": 0.5425928831100464, "learning_rate": 4.0512e-06, "loss": 1.2189, "step": 1860 }, { "epoch": 0.5984, "grad_norm": 0.46790796518325806, "learning_rate": 4.0192e-06, "loss": 1.3668, "step": 1870 }, { "epoch": 0.6016, "grad_norm": 0.48903679847717285, "learning_rate": 3.9872000000000006e-06, "loss": 1.2132, "step": 1880 }, { "epoch": 0.6048, "grad_norm": 0.47461065649986267, "learning_rate": 3.9552e-06, "loss": 1.2794, "step": 1890 }, { "epoch": 0.608, "grad_norm": 0.4707651436328888, "learning_rate": 3.9232e-06, "loss": 1.3, "step": 1900 }, { "epoch": 0.6112, "grad_norm": 0.5604966878890991, "learning_rate": 3.8912e-06, "loss": 1.2272, "step": 1910 }, { "epoch": 0.6144, "grad_norm": 0.5373271107673645, "learning_rate": 3.8592e-06, "loss": 1.2522, "step": 1920 }, { "epoch": 0.6176, "grad_norm": 0.50235915184021, "learning_rate": 3.8272e-06, "loss": 1.2486, "step": 1930 }, { "epoch": 0.6208, "grad_norm": 0.4826876223087311, "learning_rate": 3.7952000000000005e-06, "loss": 1.3355, "step": 1940 }, { "epoch": 0.624, "grad_norm": 0.46976956725120544, "learning_rate": 3.7632000000000002e-06, "loss": 1.2725, "step": 1950 }, { "epoch": 0.6272, "grad_norm": 0.5186979174613953, "learning_rate": 3.7312000000000004e-06, "loss": 1.3073, "step": 1960 }, { "epoch": 0.6304, "grad_norm": 0.4939082860946655, "learning_rate": 3.6992000000000005e-06, "loss": 1.2649, "step": 1970 }, { "epoch": 0.6336, "grad_norm": 0.5091391205787659, "learning_rate": 3.6672000000000002e-06, "loss": 1.4142, "step": 1980 }, { "epoch": 0.6368, "grad_norm": 0.4665001928806305, "learning_rate": 3.6352000000000004e-06, "loss": 1.2606, "step": 1990 }, { "epoch": 0.64, "grad_norm": 0.48443859815597534, "learning_rate": 3.6032e-06, "loss": 1.1884, "step": 2000 }, { "epoch": 0.6432, "grad_norm": 0.5871022939682007, "learning_rate": 3.5712000000000002e-06, "loss": 1.3792, "step": 2010 }, { "epoch": 0.6464, "grad_norm": 0.48302605748176575, "learning_rate": 3.5392000000000004e-06, "loss": 1.262, "step": 2020 }, { "epoch": 0.6496, "grad_norm": 0.4569855034351349, "learning_rate": 3.5072e-06, "loss": 1.2587, "step": 2030 }, { "epoch": 0.6528, "grad_norm": 0.5194870829582214, "learning_rate": 3.4752e-06, "loss": 1.3056, "step": 2040 }, { "epoch": 0.656, "grad_norm": 0.4751642346382141, "learning_rate": 3.4432000000000003e-06, "loss": 1.1733, "step": 2050 }, { "epoch": 0.6592, "grad_norm": 0.5077437162399292, "learning_rate": 3.4112e-06, "loss": 1.3218, "step": 2060 }, { "epoch": 0.6624, "grad_norm": 0.49009519815444946, "learning_rate": 3.3792e-06, "loss": 1.224, "step": 2070 }, { "epoch": 0.6656, "grad_norm": 0.4634891152381897, "learning_rate": 3.3472000000000003e-06, "loss": 1.2727, "step": 2080 }, { "epoch": 0.6688, "grad_norm": 0.5274826884269714, "learning_rate": 3.3152e-06, "loss": 1.2916, "step": 2090 }, { "epoch": 0.672, "grad_norm": 0.5165941715240479, "learning_rate": 3.2832e-06, "loss": 1.2878, "step": 2100 }, { "epoch": 0.6752, "grad_norm": 0.5654541254043579, "learning_rate": 3.2512000000000003e-06, "loss": 1.2749, "step": 2110 }, { "epoch": 0.6784, "grad_norm": 0.49610668420791626, "learning_rate": 3.2192e-06, "loss": 1.2668, "step": 2120 }, { "epoch": 0.6816, "grad_norm": 0.5377901196479797, "learning_rate": 3.1872e-06, "loss": 1.2671, "step": 2130 }, { "epoch": 0.6848, "grad_norm": 0.5280618071556091, "learning_rate": 3.1552000000000003e-06, "loss": 1.2637, "step": 2140 }, { "epoch": 0.688, "grad_norm": 0.5266459584236145, "learning_rate": 3.1232e-06, "loss": 1.2604, "step": 2150 }, { "epoch": 0.6912, "grad_norm": 0.47189775109291077, "learning_rate": 3.0912e-06, "loss": 1.2546, "step": 2160 }, { "epoch": 0.6944, "grad_norm": 0.5069970488548279, "learning_rate": 3.0592000000000007e-06, "loss": 1.2538, "step": 2170 }, { "epoch": 0.6976, "grad_norm": 0.5452210903167725, "learning_rate": 3.0272e-06, "loss": 1.2896, "step": 2180 }, { "epoch": 0.7008, "grad_norm": 0.47197288274765015, "learning_rate": 2.9952e-06, "loss": 1.2104, "step": 2190 }, { "epoch": 0.704, "grad_norm": 0.5163410305976868, "learning_rate": 2.9632e-06, "loss": 1.2495, "step": 2200 }, { "epoch": 0.7072, "grad_norm": 0.4659384787082672, "learning_rate": 2.9312e-06, "loss": 1.226, "step": 2210 }, { "epoch": 0.7104, "grad_norm": 0.5424367189407349, "learning_rate": 2.8992000000000005e-06, "loss": 1.3475, "step": 2220 }, { "epoch": 0.7136, "grad_norm": 0.5033388137817383, "learning_rate": 2.8672e-06, "loss": 1.2415, "step": 2230 }, { "epoch": 0.7168, "grad_norm": 0.4847257733345032, "learning_rate": 2.8352e-06, "loss": 1.2562, "step": 2240 }, { "epoch": 0.72, "grad_norm": 0.5888292789459229, "learning_rate": 2.8032000000000005e-06, "loss": 1.3166, "step": 2250 }, { "epoch": 0.7232, "grad_norm": 0.5637612342834473, "learning_rate": 2.7712e-06, "loss": 1.2805, "step": 2260 }, { "epoch": 0.7264, "grad_norm": 0.477873831987381, "learning_rate": 2.7392000000000004e-06, "loss": 1.2804, "step": 2270 }, { "epoch": 0.7296, "grad_norm": 0.627713143825531, "learning_rate": 2.7072000000000005e-06, "loss": 1.2844, "step": 2280 }, { "epoch": 0.7328, "grad_norm": 0.5947350859642029, "learning_rate": 2.6752e-06, "loss": 1.28, "step": 2290 }, { "epoch": 0.736, "grad_norm": 0.49309098720550537, "learning_rate": 2.6432000000000004e-06, "loss": 1.353, "step": 2300 }, { "epoch": 0.7392, "grad_norm": 0.5657567381858826, "learning_rate": 2.6112000000000005e-06, "loss": 1.3422, "step": 2310 }, { "epoch": 0.7424, "grad_norm": 0.5906503200531006, "learning_rate": 2.5792000000000002e-06, "loss": 1.2691, "step": 2320 }, { "epoch": 0.7456, "grad_norm": 0.5093393325805664, "learning_rate": 2.5472000000000004e-06, "loss": 1.2689, "step": 2330 }, { "epoch": 0.7488, "grad_norm": 0.48354557156562805, "learning_rate": 2.5152000000000005e-06, "loss": 1.2062, "step": 2340 }, { "epoch": 0.752, "grad_norm": 0.6542074084281921, "learning_rate": 2.4832000000000002e-06, "loss": 1.2852, "step": 2350 }, { "epoch": 0.7552, "grad_norm": 0.5252315998077393, "learning_rate": 2.4512000000000003e-06, "loss": 1.2635, "step": 2360 }, { "epoch": 0.7584, "grad_norm": 0.48583582043647766, "learning_rate": 2.4192e-06, "loss": 1.2096, "step": 2370 }, { "epoch": 0.7616, "grad_norm": 0.49642977118492126, "learning_rate": 2.3872e-06, "loss": 1.2424, "step": 2380 }, { "epoch": 0.7648, "grad_norm": 0.6025352478027344, "learning_rate": 2.3552000000000003e-06, "loss": 1.2992, "step": 2390 }, { "epoch": 0.768, "grad_norm": 0.5461027026176453, "learning_rate": 2.3232e-06, "loss": 1.2946, "step": 2400 }, { "epoch": 0.7712, "grad_norm": 0.6130191683769226, "learning_rate": 2.2912e-06, "loss": 1.2398, "step": 2410 }, { "epoch": 0.7744, "grad_norm": 0.6468284726142883, "learning_rate": 2.2592000000000003e-06, "loss": 1.3087, "step": 2420 }, { "epoch": 0.7776, "grad_norm": 0.6268571019172668, "learning_rate": 2.2272e-06, "loss": 1.1613, "step": 2430 }, { "epoch": 0.7808, "grad_norm": 0.7104691863059998, "learning_rate": 2.1952e-06, "loss": 1.27, "step": 2440 }, { "epoch": 0.784, "grad_norm": 0.4856204688549042, "learning_rate": 2.1632000000000003e-06, "loss": 1.2731, "step": 2450 }, { "epoch": 0.7872, "grad_norm": 0.5168479681015015, "learning_rate": 2.1312e-06, "loss": 1.3437, "step": 2460 }, { "epoch": 0.7904, "grad_norm": 0.659817636013031, "learning_rate": 2.0992e-06, "loss": 1.2839, "step": 2470 }, { "epoch": 0.7936, "grad_norm": 0.5834536552429199, "learning_rate": 2.0672e-06, "loss": 1.3048, "step": 2480 }, { "epoch": 0.7968, "grad_norm": 0.4839385151863098, "learning_rate": 2.0352000000000004e-06, "loss": 1.2803, "step": 2490 }, { "epoch": 0.8, "grad_norm": 0.588320255279541, "learning_rate": 2.0032e-06, "loss": 1.2276, "step": 2500 }, { "epoch": 0.8032, "grad_norm": 0.5608358383178711, "learning_rate": 1.9712e-06, "loss": 1.3644, "step": 2510 }, { "epoch": 0.8064, "grad_norm": 0.5970802903175354, "learning_rate": 1.9392000000000004e-06, "loss": 1.2919, "step": 2520 }, { "epoch": 0.8096, "grad_norm": 0.5823186039924622, "learning_rate": 1.9072000000000001e-06, "loss": 1.3033, "step": 2530 }, { "epoch": 0.8128, "grad_norm": 0.5669010281562805, "learning_rate": 1.8752e-06, "loss": 1.3379, "step": 2540 }, { "epoch": 0.816, "grad_norm": 0.5039373636245728, "learning_rate": 1.8432000000000002e-06, "loss": 1.2282, "step": 2550 }, { "epoch": 0.8192, "grad_norm": 0.5700042843818665, "learning_rate": 1.8112000000000001e-06, "loss": 1.2615, "step": 2560 }, { "epoch": 0.8224, "grad_norm": 0.5190805196762085, "learning_rate": 1.7792e-06, "loss": 1.2593, "step": 2570 }, { "epoch": 0.8256, "grad_norm": 0.5930772423744202, "learning_rate": 1.7472e-06, "loss": 1.2265, "step": 2580 }, { "epoch": 0.8288, "grad_norm": 0.5103446245193481, "learning_rate": 1.7152000000000001e-06, "loss": 1.2012, "step": 2590 }, { "epoch": 0.832, "grad_norm": 0.534788966178894, "learning_rate": 1.6832e-06, "loss": 1.2393, "step": 2600 }, { "epoch": 0.8352, "grad_norm": 0.572394609451294, "learning_rate": 1.6512e-06, "loss": 1.2876, "step": 2610 }, { "epoch": 0.8384, "grad_norm": 0.4987950623035431, "learning_rate": 1.6192000000000003e-06, "loss": 1.2783, "step": 2620 }, { "epoch": 0.8416, "grad_norm": 0.5138176083564758, "learning_rate": 1.5872e-06, "loss": 1.2559, "step": 2630 }, { "epoch": 0.8448, "grad_norm": 0.5693644881248474, "learning_rate": 1.5552e-06, "loss": 1.2599, "step": 2640 }, { "epoch": 0.848, "grad_norm": 0.6024214029312134, "learning_rate": 1.5232000000000003e-06, "loss": 1.3064, "step": 2650 }, { "epoch": 0.8512, "grad_norm": 0.5588571429252625, "learning_rate": 1.4912000000000002e-06, "loss": 1.2977, "step": 2660 }, { "epoch": 0.8544, "grad_norm": 0.5551236867904663, "learning_rate": 1.4592000000000001e-06, "loss": 1.3121, "step": 2670 }, { "epoch": 0.8576, "grad_norm": 0.5989100933074951, "learning_rate": 1.4272000000000003e-06, "loss": 1.2795, "step": 2680 }, { "epoch": 0.8608, "grad_norm": 0.6164664626121521, "learning_rate": 1.3952000000000002e-06, "loss": 1.3366, "step": 2690 }, { "epoch": 0.864, "grad_norm": 0.6146747469902039, "learning_rate": 1.3632000000000001e-06, "loss": 1.2494, "step": 2700 }, { "epoch": 0.8672, "grad_norm": 0.6117052435874939, "learning_rate": 1.3312e-06, "loss": 1.2398, "step": 2710 }, { "epoch": 0.8704, "grad_norm": 0.4775325655937195, "learning_rate": 1.2992000000000002e-06, "loss": 1.3065, "step": 2720 }, { "epoch": 0.8736, "grad_norm": 0.6605592966079712, "learning_rate": 1.2672000000000001e-06, "loss": 1.1719, "step": 2730 }, { "epoch": 0.8768, "grad_norm": 0.48634928464889526, "learning_rate": 1.2352e-06, "loss": 1.2774, "step": 2740 }, { "epoch": 0.88, "grad_norm": 0.6096370220184326, "learning_rate": 1.2032e-06, "loss": 1.3231, "step": 2750 }, { "epoch": 0.8832, "grad_norm": 0.5880251526832581, "learning_rate": 1.1712000000000001e-06, "loss": 1.2641, "step": 2760 }, { "epoch": 0.8864, "grad_norm": 0.5116971135139465, "learning_rate": 1.1392e-06, "loss": 1.2763, "step": 2770 }, { "epoch": 0.8896, "grad_norm": 0.6191303730010986, "learning_rate": 1.1072000000000002e-06, "loss": 1.2622, "step": 2780 }, { "epoch": 0.8928, "grad_norm": 0.5492941737174988, "learning_rate": 1.0752e-06, "loss": 1.3002, "step": 2790 }, { "epoch": 0.896, "grad_norm": 0.6216818690299988, "learning_rate": 1.0432e-06, "loss": 1.3222, "step": 2800 }, { "epoch": 0.8992, "grad_norm": 0.5383599400520325, "learning_rate": 1.0112000000000002e-06, "loss": 1.292, "step": 2810 }, { "epoch": 0.9024, "grad_norm": 0.5288344025611877, "learning_rate": 9.792e-07, "loss": 1.2895, "step": 2820 }, { "epoch": 0.9056, "grad_norm": 0.5043691396713257, "learning_rate": 9.472e-07, "loss": 1.2499, "step": 2830 }, { "epoch": 0.9088, "grad_norm": 0.5582976341247559, "learning_rate": 9.152000000000001e-07, "loss": 1.2986, "step": 2840 }, { "epoch": 0.912, "grad_norm": 0.5215420126914978, "learning_rate": 8.832000000000001e-07, "loss": 1.3142, "step": 2850 }, { "epoch": 0.9152, "grad_norm": 0.5378311276435852, "learning_rate": 8.512000000000001e-07, "loss": 1.2104, "step": 2860 }, { "epoch": 0.9184, "grad_norm": 0.5053496360778809, "learning_rate": 8.192000000000001e-07, "loss": 1.3056, "step": 2870 }, { "epoch": 0.9216, "grad_norm": 0.5381192564964294, "learning_rate": 7.872000000000001e-07, "loss": 1.3055, "step": 2880 }, { "epoch": 0.9248, "grad_norm": 0.6026363968849182, "learning_rate": 7.552000000000001e-07, "loss": 1.346, "step": 2890 }, { "epoch": 0.928, "grad_norm": 0.5687581896781921, "learning_rate": 7.232e-07, "loss": 1.3244, "step": 2900 }, { "epoch": 0.9312, "grad_norm": 0.5862733125686646, "learning_rate": 6.912e-07, "loss": 1.2806, "step": 2910 }, { "epoch": 0.9344, "grad_norm": 0.47303637862205505, "learning_rate": 6.592000000000001e-07, "loss": 1.2337, "step": 2920 }, { "epoch": 0.9376, "grad_norm": 0.509482741355896, "learning_rate": 6.272e-07, "loss": 1.2466, "step": 2930 }, { "epoch": 0.9408, "grad_norm": 0.5245184302330017, "learning_rate": 5.952e-07, "loss": 1.2577, "step": 2940 }, { "epoch": 0.944, "grad_norm": 0.7082109451293945, "learning_rate": 5.632000000000001e-07, "loss": 1.2272, "step": 2950 }, { "epoch": 0.9472, "grad_norm": 0.4797827899456024, "learning_rate": 5.312000000000001e-07, "loss": 1.3238, "step": 2960 }, { "epoch": 0.9504, "grad_norm": 0.5341638326644897, "learning_rate": 4.992e-07, "loss": 1.313, "step": 2970 }, { "epoch": 0.9536, "grad_norm": 0.5286096334457397, "learning_rate": 4.672e-07, "loss": 1.2538, "step": 2980 }, { "epoch": 0.9568, "grad_norm": 0.5771506428718567, "learning_rate": 4.352000000000001e-07, "loss": 1.2869, "step": 2990 }, { "epoch": 0.96, "grad_norm": 0.5290225744247437, "learning_rate": 4.0320000000000006e-07, "loss": 1.2882, "step": 3000 } ], "logging_steps": 10, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.8776953724928e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }