| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 3125, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0032, |
| "grad_norm": 0.09734748303890228, |
| "learning_rate": 9.9712e-06, |
| "loss": 1.6245, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0064, |
| "grad_norm": 0.0981544628739357, |
| "learning_rate": 9.939200000000001e-06, |
| "loss": 1.5375, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0096, |
| "grad_norm": 0.09418202936649323, |
| "learning_rate": 9.9072e-06, |
| "loss": 1.5647, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0128, |
| "grad_norm": 0.10748359560966492, |
| "learning_rate": 9.8752e-06, |
| "loss": 1.6781, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 0.12658047676086426, |
| "learning_rate": 9.843200000000001e-06, |
| "loss": 1.5854, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0192, |
| "grad_norm": 0.1334228664636612, |
| "learning_rate": 9.8112e-06, |
| "loss": 1.5453, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.0224, |
| "grad_norm": 0.15112873911857605, |
| "learning_rate": 9.779200000000001e-06, |
| "loss": 1.5721, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.0256, |
| "grad_norm": 0.140653595328331, |
| "learning_rate": 9.7472e-06, |
| "loss": 1.5162, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.0288, |
| "grad_norm": 0.16999679803848267, |
| "learning_rate": 9.715200000000001e-06, |
| "loss": 1.5689, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 0.1928016096353531, |
| "learning_rate": 9.6832e-06, |
| "loss": 1.5845, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0352, |
| "grad_norm": 0.19378426671028137, |
| "learning_rate": 9.6512e-06, |
| "loss": 1.5386, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.0384, |
| "grad_norm": 0.24590148031711578, |
| "learning_rate": 9.619200000000001e-06, |
| "loss": 1.4133, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.0416, |
| "grad_norm": 0.23824049532413483, |
| "learning_rate": 9.5872e-06, |
| "loss": 1.4573, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.0448, |
| "grad_norm": 0.19866596162319183, |
| "learning_rate": 9.555200000000001e-06, |
| "loss": 1.4357, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 0.2909606993198395, |
| "learning_rate": 9.5232e-06, |
| "loss": 1.3924, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.0512, |
| "grad_norm": 0.48891496658325195, |
| "learning_rate": 9.4912e-06, |
| "loss": 1.4041, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.0544, |
| "grad_norm": 0.3921829164028168, |
| "learning_rate": 9.4592e-06, |
| "loss": 1.3158, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0576, |
| "grad_norm": 0.293231338262558, |
| "learning_rate": 9.4272e-06, |
| "loss": 1.4709, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.0608, |
| "grad_norm": 0.27421411871910095, |
| "learning_rate": 9.395200000000001e-06, |
| "loss": 1.4046, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 0.1971723437309265, |
| "learning_rate": 9.3632e-06, |
| "loss": 1.417, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0672, |
| "grad_norm": 0.27423539757728577, |
| "learning_rate": 9.3312e-06, |
| "loss": 1.3721, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.0704, |
| "grad_norm": 0.4509432315826416, |
| "learning_rate": 9.2992e-06, |
| "loss": 1.4333, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.0736, |
| "grad_norm": 0.3389282822608948, |
| "learning_rate": 9.2672e-06, |
| "loss": 1.352, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.0768, |
| "grad_norm": 0.2814404368400574, |
| "learning_rate": 9.235200000000001e-06, |
| "loss": 1.3682, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.2661599814891815, |
| "learning_rate": 9.2032e-06, |
| "loss": 1.3661, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.0832, |
| "grad_norm": 0.29006555676460266, |
| "learning_rate": 9.171200000000001e-06, |
| "loss": 1.299, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.0864, |
| "grad_norm": 0.2795925438404083, |
| "learning_rate": 9.1392e-06, |
| "loss": 1.3144, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.0896, |
| "grad_norm": 0.25778502225875854, |
| "learning_rate": 9.1072e-06, |
| "loss": 1.2957, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.0928, |
| "grad_norm": 0.26814839243888855, |
| "learning_rate": 9.0752e-06, |
| "loss": 1.3356, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 0.3247470557689667, |
| "learning_rate": 9.0432e-06, |
| "loss": 1.3458, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0992, |
| "grad_norm": 0.36921611428260803, |
| "learning_rate": 9.011200000000001e-06, |
| "loss": 1.3601, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.1024, |
| "grad_norm": 0.31122124195098877, |
| "learning_rate": 8.979200000000002e-06, |
| "loss": 1.3131, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.1056, |
| "grad_norm": 0.3557804822921753, |
| "learning_rate": 8.9472e-06, |
| "loss": 1.426, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.1088, |
| "grad_norm": 0.3266560137271881, |
| "learning_rate": 8.9152e-06, |
| "loss": 1.3386, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 0.3932088017463684, |
| "learning_rate": 8.8832e-06, |
| "loss": 1.3982, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.1152, |
| "grad_norm": 0.32620078325271606, |
| "learning_rate": 8.851200000000001e-06, |
| "loss": 1.3048, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.1184, |
| "grad_norm": 0.30419647693634033, |
| "learning_rate": 8.819200000000002e-06, |
| "loss": 1.3761, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.1216, |
| "grad_norm": 0.29732590913772583, |
| "learning_rate": 8.7872e-06, |
| "loss": 1.2566, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.1248, |
| "grad_norm": 0.28484678268432617, |
| "learning_rate": 8.7552e-06, |
| "loss": 1.3666, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 0.4168960154056549, |
| "learning_rate": 8.7232e-06, |
| "loss": 1.3396, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1312, |
| "grad_norm": 0.3573697507381439, |
| "learning_rate": 8.6912e-06, |
| "loss": 1.3684, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.1344, |
| "grad_norm": 0.4777122735977173, |
| "learning_rate": 8.659200000000002e-06, |
| "loss": 1.3059, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.1376, |
| "grad_norm": 0.26450300216674805, |
| "learning_rate": 8.627200000000001e-06, |
| "loss": 1.3283, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.1408, |
| "grad_norm": 0.37447720766067505, |
| "learning_rate": 8.5952e-06, |
| "loss": 1.2667, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 0.30257123708724976, |
| "learning_rate": 8.5632e-06, |
| "loss": 1.3147, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1472, |
| "grad_norm": 0.34745684266090393, |
| "learning_rate": 8.5312e-06, |
| "loss": 1.3603, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.1504, |
| "grad_norm": 0.2882753312587738, |
| "learning_rate": 8.499200000000002e-06, |
| "loss": 1.3087, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.1536, |
| "grad_norm": 0.3751160204410553, |
| "learning_rate": 8.467200000000001e-06, |
| "loss": 1.342, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.1568, |
| "grad_norm": 0.3185778260231018, |
| "learning_rate": 8.435200000000002e-06, |
| "loss": 1.35, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.2853422164916992, |
| "learning_rate": 8.4032e-06, |
| "loss": 1.3105, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1632, |
| "grad_norm": 0.3187882602214813, |
| "learning_rate": 8.3712e-06, |
| "loss": 1.2915, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.1664, |
| "grad_norm": 0.4516860842704773, |
| "learning_rate": 8.339200000000001e-06, |
| "loss": 1.3449, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.1696, |
| "grad_norm": 0.3336597681045532, |
| "learning_rate": 8.3072e-06, |
| "loss": 1.2989, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.1728, |
| "grad_norm": 0.4279087781906128, |
| "learning_rate": 8.275200000000002e-06, |
| "loss": 1.2412, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 0.4071614742279053, |
| "learning_rate": 8.243200000000001e-06, |
| "loss": 1.414, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.1792, |
| "grad_norm": 0.3194911479949951, |
| "learning_rate": 8.2112e-06, |
| "loss": 1.2762, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.1824, |
| "grad_norm": 0.3617415428161621, |
| "learning_rate": 8.179200000000001e-06, |
| "loss": 1.3225, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.1856, |
| "grad_norm": 0.3274191915988922, |
| "learning_rate": 8.1472e-06, |
| "loss": 1.3464, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.1888, |
| "grad_norm": 0.35526078939437866, |
| "learning_rate": 8.115200000000002e-06, |
| "loss": 1.315, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 0.3728134036064148, |
| "learning_rate": 8.0832e-06, |
| "loss": 1.3023, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1952, |
| "grad_norm": 0.4048090875148773, |
| "learning_rate": 8.0512e-06, |
| "loss": 1.2751, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.1984, |
| "grad_norm": 0.41539278626441956, |
| "learning_rate": 8.019200000000001e-06, |
| "loss": 1.3533, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.2016, |
| "grad_norm": 0.3269357979297638, |
| "learning_rate": 7.9872e-06, |
| "loss": 1.2709, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.2048, |
| "grad_norm": 0.3444967567920685, |
| "learning_rate": 7.955200000000001e-06, |
| "loss": 1.3119, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 0.34097886085510254, |
| "learning_rate": 7.9232e-06, |
| "loss": 1.3444, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.2112, |
| "grad_norm": 0.42459428310394287, |
| "learning_rate": 7.891200000000002e-06, |
| "loss": 1.325, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.2144, |
| "grad_norm": 0.3942951261997223, |
| "learning_rate": 7.859200000000001e-06, |
| "loss": 1.3732, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.2176, |
| "grad_norm": 0.33468231558799744, |
| "learning_rate": 7.8272e-06, |
| "loss": 1.2883, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.2208, |
| "grad_norm": 0.3964150547981262, |
| "learning_rate": 7.795200000000001e-06, |
| "loss": 1.4014, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 0.3447844386100769, |
| "learning_rate": 7.7632e-06, |
| "loss": 1.3205, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.2272, |
| "grad_norm": 0.380398154258728, |
| "learning_rate": 7.731200000000001e-06, |
| "loss": 1.2819, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.2304, |
| "grad_norm": 0.3823450207710266, |
| "learning_rate": 7.6992e-06, |
| "loss": 1.3097, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.2336, |
| "grad_norm": 0.3383599817752838, |
| "learning_rate": 7.6672e-06, |
| "loss": 1.346, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.2368, |
| "grad_norm": 0.39140060544013977, |
| "learning_rate": 7.635200000000001e-06, |
| "loss": 1.2961, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.32159295678138733, |
| "learning_rate": 7.6032e-06, |
| "loss": 1.3045, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2432, |
| "grad_norm": 0.3853408098220825, |
| "learning_rate": 7.5712000000000005e-06, |
| "loss": 1.2935, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.2464, |
| "grad_norm": 0.39150312542915344, |
| "learning_rate": 7.539200000000001e-06, |
| "loss": 1.2976, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.2496, |
| "grad_norm": 0.39306044578552246, |
| "learning_rate": 7.507200000000001e-06, |
| "loss": 1.2588, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.2528, |
| "grad_norm": 0.39256688952445984, |
| "learning_rate": 7.4752e-06, |
| "loss": 1.3252, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 0.3738512098789215, |
| "learning_rate": 7.4432e-06, |
| "loss": 1.3162, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.2592, |
| "grad_norm": 0.4799080491065979, |
| "learning_rate": 7.4112e-06, |
| "loss": 1.2993, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.2624, |
| "grad_norm": 0.4616535007953644, |
| "learning_rate": 7.3792000000000004e-06, |
| "loss": 1.3356, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.2656, |
| "grad_norm": 0.37460416555404663, |
| "learning_rate": 7.347200000000001e-06, |
| "loss": 1.2938, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.2688, |
| "grad_norm": 0.4229544997215271, |
| "learning_rate": 7.3152e-06, |
| "loss": 1.26, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.272, |
| "grad_norm": 0.5051556825637817, |
| "learning_rate": 7.2832e-06, |
| "loss": 1.2868, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2752, |
| "grad_norm": 0.3845407962799072, |
| "learning_rate": 7.2512e-06, |
| "loss": 1.3255, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.2784, |
| "grad_norm": 0.43234601616859436, |
| "learning_rate": 7.2192e-06, |
| "loss": 1.2756, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.2816, |
| "grad_norm": 0.390572190284729, |
| "learning_rate": 7.187200000000001e-06, |
| "loss": 1.3053, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.2848, |
| "grad_norm": 0.385815292596817, |
| "learning_rate": 7.155200000000001e-06, |
| "loss": 1.2608, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.288, |
| "grad_norm": 0.4778871238231659, |
| "learning_rate": 7.1232e-06, |
| "loss": 1.3109, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.2912, |
| "grad_norm": 0.3777396082878113, |
| "learning_rate": 7.0912e-06, |
| "loss": 1.2723, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.2944, |
| "grad_norm": 0.4682841897010803, |
| "learning_rate": 7.0592e-06, |
| "loss": 1.3304, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.2976, |
| "grad_norm": 0.3837222754955292, |
| "learning_rate": 7.027200000000001e-06, |
| "loss": 1.3081, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.3008, |
| "grad_norm": 0.3792935907840729, |
| "learning_rate": 6.995200000000001e-06, |
| "loss": 1.3176, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.304, |
| "grad_norm": 0.476096510887146, |
| "learning_rate": 6.963200000000001e-06, |
| "loss": 1.2764, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.3072, |
| "grad_norm": 0.4119466543197632, |
| "learning_rate": 6.9312e-06, |
| "loss": 1.3563, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.3104, |
| "grad_norm": 0.40938565135002136, |
| "learning_rate": 6.8992e-06, |
| "loss": 1.2782, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.3136, |
| "grad_norm": 0.4305261969566345, |
| "learning_rate": 6.867200000000001e-06, |
| "loss": 1.3333, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.3168, |
| "grad_norm": 0.3533143997192383, |
| "learning_rate": 6.835200000000001e-06, |
| "loss": 1.3686, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.43104642629623413, |
| "learning_rate": 6.803200000000001e-06, |
| "loss": 1.3461, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.3232, |
| "grad_norm": 0.5197634696960449, |
| "learning_rate": 6.771200000000001e-06, |
| "loss": 1.3316, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.3264, |
| "grad_norm": 0.4084891080856323, |
| "learning_rate": 6.7392e-06, |
| "loss": 1.2941, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.3296, |
| "grad_norm": 0.4634837508201599, |
| "learning_rate": 6.707200000000001e-06, |
| "loss": 1.2982, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.3328, |
| "grad_norm": 0.4361494183540344, |
| "learning_rate": 6.675200000000001e-06, |
| "loss": 1.334, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.336, |
| "grad_norm": 0.36735212802886963, |
| "learning_rate": 6.643200000000001e-06, |
| "loss": 1.3642, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3392, |
| "grad_norm": 0.3968944847583771, |
| "learning_rate": 6.611200000000001e-06, |
| "loss": 1.3784, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.3424, |
| "grad_norm": 0.39363133907318115, |
| "learning_rate": 6.5792e-06, |
| "loss": 1.2715, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.3456, |
| "grad_norm": 0.4664965867996216, |
| "learning_rate": 6.547200000000001e-06, |
| "loss": 1.3436, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.3488, |
| "grad_norm": 0.3857831358909607, |
| "learning_rate": 6.515200000000001e-06, |
| "loss": 1.3084, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.352, |
| "grad_norm": 0.41258570551872253, |
| "learning_rate": 6.483200000000001e-06, |
| "loss": 1.3288, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.3552, |
| "grad_norm": 0.3971725404262543, |
| "learning_rate": 6.451200000000001e-06, |
| "loss": 1.3321, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.3584, |
| "grad_norm": 0.3993317186832428, |
| "learning_rate": 6.419200000000001e-06, |
| "loss": 1.3385, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.3616, |
| "grad_norm": 0.5872831344604492, |
| "learning_rate": 6.3872000000000004e-06, |
| "loss": 1.2817, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.3648, |
| "grad_norm": 0.47822561860084534, |
| "learning_rate": 6.355200000000001e-06, |
| "loss": 1.3083, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.368, |
| "grad_norm": 0.5206847786903381, |
| "learning_rate": 6.323200000000001e-06, |
| "loss": 1.3457, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.3712, |
| "grad_norm": 0.41014567017555237, |
| "learning_rate": 6.291200000000001e-06, |
| "loss": 1.2687, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.3744, |
| "grad_norm": 0.39573901891708374, |
| "learning_rate": 6.259200000000001e-06, |
| "loss": 1.3257, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.3776, |
| "grad_norm": 0.40908557176589966, |
| "learning_rate": 6.227200000000001e-06, |
| "loss": 1.2587, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.3808, |
| "grad_norm": 0.4308335781097412, |
| "learning_rate": 6.1952e-06, |
| "loss": 1.2764, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 0.41657981276512146, |
| "learning_rate": 6.1632000000000006e-06, |
| "loss": 1.3305, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.3872, |
| "grad_norm": 0.446154922246933, |
| "learning_rate": 6.131200000000001e-06, |
| "loss": 1.3323, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.3904, |
| "grad_norm": 0.43903544545173645, |
| "learning_rate": 6.099200000000001e-06, |
| "loss": 1.2731, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.3936, |
| "grad_norm": 0.4204481542110443, |
| "learning_rate": 6.067200000000001e-06, |
| "loss": 1.2569, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.3968, |
| "grad_norm": 0.4393060803413391, |
| "learning_rate": 6.0352e-06, |
| "loss": 1.3119, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.42466068267822266, |
| "learning_rate": 6.0032e-06, |
| "loss": 1.2106, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.4032, |
| "grad_norm": 0.40182891488075256, |
| "learning_rate": 5.9712000000000005e-06, |
| "loss": 1.2566, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.4064, |
| "grad_norm": 0.3702845275402069, |
| "learning_rate": 5.939200000000001e-06, |
| "loss": 1.3344, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.4096, |
| "grad_norm": 0.4409834146499634, |
| "learning_rate": 5.907200000000001e-06, |
| "loss": 1.2553, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.4128, |
| "grad_norm": 0.5070372223854065, |
| "learning_rate": 5.875200000000001e-06, |
| "loss": 1.2901, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.416, |
| "grad_norm": 0.44239479303359985, |
| "learning_rate": 5.8432e-06, |
| "loss": 1.2086, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.4192, |
| "grad_norm": 0.5466510653495789, |
| "learning_rate": 5.8112e-06, |
| "loss": 1.2959, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.4224, |
| "grad_norm": 0.5056144595146179, |
| "learning_rate": 5.7792000000000005e-06, |
| "loss": 1.3353, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.4256, |
| "grad_norm": 0.42606833577156067, |
| "learning_rate": 5.747200000000001e-06, |
| "loss": 1.3108, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.4288, |
| "grad_norm": 0.41976213455200195, |
| "learning_rate": 5.715200000000001e-06, |
| "loss": 1.3248, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.432, |
| "grad_norm": 0.48559048771858215, |
| "learning_rate": 5.683200000000001e-06, |
| "loss": 1.2686, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.4352, |
| "grad_norm": 0.47761228680610657, |
| "learning_rate": 5.6512e-06, |
| "loss": 1.281, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.4384, |
| "grad_norm": 0.4777953028678894, |
| "learning_rate": 5.6192e-06, |
| "loss": 1.2829, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.4416, |
| "grad_norm": 0.44091978669166565, |
| "learning_rate": 5.5872000000000005e-06, |
| "loss": 1.3032, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.4448, |
| "grad_norm": 0.48977166414260864, |
| "learning_rate": 5.555200000000001e-06, |
| "loss": 1.3418, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.448, |
| "grad_norm": 0.6014530062675476, |
| "learning_rate": 5.523200000000001e-06, |
| "loss": 1.2119, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.4512, |
| "grad_norm": 0.4750172793865204, |
| "learning_rate": 5.491200000000001e-06, |
| "loss": 1.3432, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.4544, |
| "grad_norm": 0.5095167756080627, |
| "learning_rate": 5.4592e-06, |
| "loss": 1.3448, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.4576, |
| "grad_norm": 0.47408685088157654, |
| "learning_rate": 5.4272e-06, |
| "loss": 1.3436, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.4608, |
| "grad_norm": 0.45464885234832764, |
| "learning_rate": 5.3952000000000005e-06, |
| "loss": 1.1962, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.464, |
| "grad_norm": 0.431349515914917, |
| "learning_rate": 5.363200000000001e-06, |
| "loss": 1.2773, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4672, |
| "grad_norm": 0.444397896528244, |
| "learning_rate": 5.331200000000001e-06, |
| "loss": 1.3163, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.4704, |
| "grad_norm": 0.4360913038253784, |
| "learning_rate": 5.2992e-06, |
| "loss": 1.2759, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.4736, |
| "grad_norm": 0.5152497887611389, |
| "learning_rate": 5.2672e-06, |
| "loss": 1.3225, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.4768, |
| "grad_norm": 0.48929157853126526, |
| "learning_rate": 5.2352e-06, |
| "loss": 1.3213, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.4925262928009033, |
| "learning_rate": 5.2032000000000004e-06, |
| "loss": 1.2008, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.4832, |
| "grad_norm": 0.46162164211273193, |
| "learning_rate": 5.1712000000000006e-06, |
| "loss": 1.2996, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.4864, |
| "grad_norm": 0.4908200800418854, |
| "learning_rate": 5.139200000000001e-06, |
| "loss": 1.2729, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.4896, |
| "grad_norm": 0.5178566575050354, |
| "learning_rate": 5.1072e-06, |
| "loss": 1.293, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.4928, |
| "grad_norm": 0.5733951330184937, |
| "learning_rate": 5.0752e-06, |
| "loss": 1.3573, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.496, |
| "grad_norm": 0.4558843672275543, |
| "learning_rate": 5.0432e-06, |
| "loss": 1.3445, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.4992, |
| "grad_norm": 0.5171469449996948, |
| "learning_rate": 5.0112e-06, |
| "loss": 1.2293, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.5024, |
| "grad_norm": 0.4879666864871979, |
| "learning_rate": 4.9792000000000005e-06, |
| "loss": 1.31, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.5056, |
| "grad_norm": 0.4393675923347473, |
| "learning_rate": 4.947200000000001e-06, |
| "loss": 1.3186, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.5088, |
| "grad_norm": 0.5072659254074097, |
| "learning_rate": 4.915200000000001e-06, |
| "loss": 1.2857, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.512, |
| "grad_norm": 0.5163191556930542, |
| "learning_rate": 4.8832e-06, |
| "loss": 1.3401, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.5152, |
| "grad_norm": 0.5119105577468872, |
| "learning_rate": 4.8512e-06, |
| "loss": 1.32, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.5184, |
| "grad_norm": 0.5342932939529419, |
| "learning_rate": 4.8192e-06, |
| "loss": 1.206, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.5216, |
| "grad_norm": 0.4517419636249542, |
| "learning_rate": 4.7872000000000005e-06, |
| "loss": 1.3077, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.5248, |
| "grad_norm": 0.46141722798347473, |
| "learning_rate": 4.755200000000001e-06, |
| "loss": 1.2873, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.528, |
| "grad_norm": 0.41747117042541504, |
| "learning_rate": 4.723200000000001e-06, |
| "loss": 1.2715, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.5312, |
| "grad_norm": 0.48263996839523315, |
| "learning_rate": 4.6912e-06, |
| "loss": 1.2814, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.5344, |
| "grad_norm": 0.4876611828804016, |
| "learning_rate": 4.6592e-06, |
| "loss": 1.2776, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.5376, |
| "grad_norm": 0.46099624037742615, |
| "learning_rate": 4.6272e-06, |
| "loss": 1.3839, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.5408, |
| "grad_norm": 0.46614623069763184, |
| "learning_rate": 4.5952000000000005e-06, |
| "loss": 1.2717, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.544, |
| "grad_norm": 0.48747870326042175, |
| "learning_rate": 4.563200000000001e-06, |
| "loss": 1.2937, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5472, |
| "grad_norm": 0.5542135238647461, |
| "learning_rate": 4.531200000000001e-06, |
| "loss": 1.2622, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.5504, |
| "grad_norm": 0.46008777618408203, |
| "learning_rate": 4.4992e-06, |
| "loss": 1.3188, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.5536, |
| "grad_norm": 0.4853471517562866, |
| "learning_rate": 4.4672e-06, |
| "loss": 1.252, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.5568, |
| "grad_norm": 0.44900670647621155, |
| "learning_rate": 4.4352e-06, |
| "loss": 1.2549, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.4973522126674652, |
| "learning_rate": 4.4032000000000005e-06, |
| "loss": 1.2959, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5632, |
| "grad_norm": 0.45412448048591614, |
| "learning_rate": 4.371200000000001e-06, |
| "loss": 1.2092, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.5664, |
| "grad_norm": 0.5110604763031006, |
| "learning_rate": 4.3392e-06, |
| "loss": 1.3127, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.5696, |
| "grad_norm": 0.5951307415962219, |
| "learning_rate": 4.3072e-06, |
| "loss": 1.2603, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.5728, |
| "grad_norm": 0.49740588665008545, |
| "learning_rate": 4.2752e-06, |
| "loss": 1.2609, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.576, |
| "grad_norm": 0.4803503155708313, |
| "learning_rate": 4.2432e-06, |
| "loss": 1.2287, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5792, |
| "grad_norm": 0.48638489842414856, |
| "learning_rate": 4.2112000000000004e-06, |
| "loss": 1.2245, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.5824, |
| "grad_norm": 0.48148202896118164, |
| "learning_rate": 4.179200000000001e-06, |
| "loss": 1.2858, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.5856, |
| "grad_norm": 0.5493887662887573, |
| "learning_rate": 4.1472e-06, |
| "loss": 1.2765, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.5888, |
| "grad_norm": 0.45376092195510864, |
| "learning_rate": 4.1152e-06, |
| "loss": 1.1914, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.592, |
| "grad_norm": 0.5095167756080627, |
| "learning_rate": 4.0832e-06, |
| "loss": 1.2916, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5952, |
| "grad_norm": 0.5425928831100464, |
| "learning_rate": 4.0512e-06, |
| "loss": 1.2189, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.5984, |
| "grad_norm": 0.46790796518325806, |
| "learning_rate": 4.0192e-06, |
| "loss": 1.3668, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.6016, |
| "grad_norm": 0.48903679847717285, |
| "learning_rate": 3.9872000000000006e-06, |
| "loss": 1.2132, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.6048, |
| "grad_norm": 0.47461065649986267, |
| "learning_rate": 3.9552e-06, |
| "loss": 1.2794, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.608, |
| "grad_norm": 0.4707651436328888, |
| "learning_rate": 3.9232e-06, |
| "loss": 1.3, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.6112, |
| "grad_norm": 0.5604966878890991, |
| "learning_rate": 3.8912e-06, |
| "loss": 1.2272, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.6144, |
| "grad_norm": 0.5373271107673645, |
| "learning_rate": 3.8592e-06, |
| "loss": 1.2522, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.6176, |
| "grad_norm": 0.50235915184021, |
| "learning_rate": 3.8272e-06, |
| "loss": 1.2486, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.6208, |
| "grad_norm": 0.4826876223087311, |
| "learning_rate": 3.7952000000000005e-06, |
| "loss": 1.3355, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.624, |
| "grad_norm": 0.46976956725120544, |
| "learning_rate": 3.7632000000000002e-06, |
| "loss": 1.2725, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.6272, |
| "grad_norm": 0.5186979174613953, |
| "learning_rate": 3.7312000000000004e-06, |
| "loss": 1.3073, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.6304, |
| "grad_norm": 0.4939082860946655, |
| "learning_rate": 3.6992000000000005e-06, |
| "loss": 1.2649, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.6336, |
| "grad_norm": 0.5091391205787659, |
| "learning_rate": 3.6672000000000002e-06, |
| "loss": 1.4142, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.6368, |
| "grad_norm": 0.4665001928806305, |
| "learning_rate": 3.6352000000000004e-06, |
| "loss": 1.2606, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.48443859815597534, |
| "learning_rate": 3.6032e-06, |
| "loss": 1.1884, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.6432, |
| "grad_norm": 0.5871022939682007, |
| "learning_rate": 3.5712000000000002e-06, |
| "loss": 1.3792, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.6464, |
| "grad_norm": 0.48302605748176575, |
| "learning_rate": 3.5392000000000004e-06, |
| "loss": 1.262, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.6496, |
| "grad_norm": 0.4569855034351349, |
| "learning_rate": 3.5072e-06, |
| "loss": 1.2587, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.6528, |
| "grad_norm": 0.5194870829582214, |
| "learning_rate": 3.4752e-06, |
| "loss": 1.3056, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.656, |
| "grad_norm": 0.4751642346382141, |
| "learning_rate": 3.4432000000000003e-06, |
| "loss": 1.1733, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6592, |
| "grad_norm": 0.5077437162399292, |
| "learning_rate": 3.4112e-06, |
| "loss": 1.3218, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.6624, |
| "grad_norm": 0.49009519815444946, |
| "learning_rate": 3.3792e-06, |
| "loss": 1.224, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.6656, |
| "grad_norm": 0.4634891152381897, |
| "learning_rate": 3.3472000000000003e-06, |
| "loss": 1.2727, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.6688, |
| "grad_norm": 0.5274826884269714, |
| "learning_rate": 3.3152e-06, |
| "loss": 1.2916, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.672, |
| "grad_norm": 0.5165941715240479, |
| "learning_rate": 3.2832e-06, |
| "loss": 1.2878, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6752, |
| "grad_norm": 0.5654541254043579, |
| "learning_rate": 3.2512000000000003e-06, |
| "loss": 1.2749, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.6784, |
| "grad_norm": 0.49610668420791626, |
| "learning_rate": 3.2192e-06, |
| "loss": 1.2668, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.6816, |
| "grad_norm": 0.5377901196479797, |
| "learning_rate": 3.1872e-06, |
| "loss": 1.2671, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.6848, |
| "grad_norm": 0.5280618071556091, |
| "learning_rate": 3.1552000000000003e-06, |
| "loss": 1.2637, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.688, |
| "grad_norm": 0.5266459584236145, |
| "learning_rate": 3.1232e-06, |
| "loss": 1.2604, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6912, |
| "grad_norm": 0.47189775109291077, |
| "learning_rate": 3.0912e-06, |
| "loss": 1.2546, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.6944, |
| "grad_norm": 0.5069970488548279, |
| "learning_rate": 3.0592000000000007e-06, |
| "loss": 1.2538, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.6976, |
| "grad_norm": 0.5452210903167725, |
| "learning_rate": 3.0272e-06, |
| "loss": 1.2896, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.7008, |
| "grad_norm": 0.47197288274765015, |
| "learning_rate": 2.9952e-06, |
| "loss": 1.2104, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.704, |
| "grad_norm": 0.5163410305976868, |
| "learning_rate": 2.9632e-06, |
| "loss": 1.2495, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.7072, |
| "grad_norm": 0.4659384787082672, |
| "learning_rate": 2.9312e-06, |
| "loss": 1.226, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.7104, |
| "grad_norm": 0.5424367189407349, |
| "learning_rate": 2.8992000000000005e-06, |
| "loss": 1.3475, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.7136, |
| "grad_norm": 0.5033388137817383, |
| "learning_rate": 2.8672e-06, |
| "loss": 1.2415, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.7168, |
| "grad_norm": 0.4847257733345032, |
| "learning_rate": 2.8352e-06, |
| "loss": 1.2562, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.5888292789459229, |
| "learning_rate": 2.8032000000000005e-06, |
| "loss": 1.3166, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.7232, |
| "grad_norm": 0.5637612342834473, |
| "learning_rate": 2.7712e-06, |
| "loss": 1.2805, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.7264, |
| "grad_norm": 0.477873831987381, |
| "learning_rate": 2.7392000000000004e-06, |
| "loss": 1.2804, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.7296, |
| "grad_norm": 0.627713143825531, |
| "learning_rate": 2.7072000000000005e-06, |
| "loss": 1.2844, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.7328, |
| "grad_norm": 0.5947350859642029, |
| "learning_rate": 2.6752e-06, |
| "loss": 1.28, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.736, |
| "grad_norm": 0.49309098720550537, |
| "learning_rate": 2.6432000000000004e-06, |
| "loss": 1.353, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.7392, |
| "grad_norm": 0.5657567381858826, |
| "learning_rate": 2.6112000000000005e-06, |
| "loss": 1.3422, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.7424, |
| "grad_norm": 0.5906503200531006, |
| "learning_rate": 2.5792000000000002e-06, |
| "loss": 1.2691, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.7456, |
| "grad_norm": 0.5093393325805664, |
| "learning_rate": 2.5472000000000004e-06, |
| "loss": 1.2689, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.7488, |
| "grad_norm": 0.48354557156562805, |
| "learning_rate": 2.5152000000000005e-06, |
| "loss": 1.2062, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.752, |
| "grad_norm": 0.6542074084281921, |
| "learning_rate": 2.4832000000000002e-06, |
| "loss": 1.2852, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.7552, |
| "grad_norm": 0.5252315998077393, |
| "learning_rate": 2.4512000000000003e-06, |
| "loss": 1.2635, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.7584, |
| "grad_norm": 0.48583582043647766, |
| "learning_rate": 2.4192e-06, |
| "loss": 1.2096, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.7616, |
| "grad_norm": 0.49642977118492126, |
| "learning_rate": 2.3872e-06, |
| "loss": 1.2424, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.7648, |
| "grad_norm": 0.6025352478027344, |
| "learning_rate": 2.3552000000000003e-06, |
| "loss": 1.2992, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.768, |
| "grad_norm": 0.5461027026176453, |
| "learning_rate": 2.3232e-06, |
| "loss": 1.2946, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7712, |
| "grad_norm": 0.6130191683769226, |
| "learning_rate": 2.2912e-06, |
| "loss": 1.2398, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.7744, |
| "grad_norm": 0.6468284726142883, |
| "learning_rate": 2.2592000000000003e-06, |
| "loss": 1.3087, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.7776, |
| "grad_norm": 0.6268571019172668, |
| "learning_rate": 2.2272e-06, |
| "loss": 1.1613, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.7808, |
| "grad_norm": 0.7104691863059998, |
| "learning_rate": 2.1952e-06, |
| "loss": 1.27, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.784, |
| "grad_norm": 0.4856204688549042, |
| "learning_rate": 2.1632000000000003e-06, |
| "loss": 1.2731, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7872, |
| "grad_norm": 0.5168479681015015, |
| "learning_rate": 2.1312e-06, |
| "loss": 1.3437, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.7904, |
| "grad_norm": 0.659817636013031, |
| "learning_rate": 2.0992e-06, |
| "loss": 1.2839, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.7936, |
| "grad_norm": 0.5834536552429199, |
| "learning_rate": 2.0672e-06, |
| "loss": 1.3048, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.7968, |
| "grad_norm": 0.4839385151863098, |
| "learning_rate": 2.0352000000000004e-06, |
| "loss": 1.2803, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.588320255279541, |
| "learning_rate": 2.0032e-06, |
| "loss": 1.2276, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.8032, |
| "grad_norm": 0.5608358383178711, |
| "learning_rate": 1.9712e-06, |
| "loss": 1.3644, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.8064, |
| "grad_norm": 0.5970802903175354, |
| "learning_rate": 1.9392000000000004e-06, |
| "loss": 1.2919, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.8096, |
| "grad_norm": 0.5823186039924622, |
| "learning_rate": 1.9072000000000001e-06, |
| "loss": 1.3033, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.8128, |
| "grad_norm": 0.5669010281562805, |
| "learning_rate": 1.8752e-06, |
| "loss": 1.3379, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.816, |
| "grad_norm": 0.5039373636245728, |
| "learning_rate": 1.8432000000000002e-06, |
| "loss": 1.2282, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.8192, |
| "grad_norm": 0.5700042843818665, |
| "learning_rate": 1.8112000000000001e-06, |
| "loss": 1.2615, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.8224, |
| "grad_norm": 0.5190805196762085, |
| "learning_rate": 1.7792e-06, |
| "loss": 1.2593, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.8256, |
| "grad_norm": 0.5930772423744202, |
| "learning_rate": 1.7472e-06, |
| "loss": 1.2265, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.8288, |
| "grad_norm": 0.5103446245193481, |
| "learning_rate": 1.7152000000000001e-06, |
| "loss": 1.2012, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.832, |
| "grad_norm": 0.534788966178894, |
| "learning_rate": 1.6832e-06, |
| "loss": 1.2393, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.8352, |
| "grad_norm": 0.572394609451294, |
| "learning_rate": 1.6512e-06, |
| "loss": 1.2876, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.8384, |
| "grad_norm": 0.4987950623035431, |
| "learning_rate": 1.6192000000000003e-06, |
| "loss": 1.2783, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.8416, |
| "grad_norm": 0.5138176083564758, |
| "learning_rate": 1.5872e-06, |
| "loss": 1.2559, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.8448, |
| "grad_norm": 0.5693644881248474, |
| "learning_rate": 1.5552e-06, |
| "loss": 1.2599, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.848, |
| "grad_norm": 0.6024214029312134, |
| "learning_rate": 1.5232000000000003e-06, |
| "loss": 1.3064, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.8512, |
| "grad_norm": 0.5588571429252625, |
| "learning_rate": 1.4912000000000002e-06, |
| "loss": 1.2977, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.8544, |
| "grad_norm": 0.5551236867904663, |
| "learning_rate": 1.4592000000000001e-06, |
| "loss": 1.3121, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.8576, |
| "grad_norm": 0.5989100933074951, |
| "learning_rate": 1.4272000000000003e-06, |
| "loss": 1.2795, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.8608, |
| "grad_norm": 0.6164664626121521, |
| "learning_rate": 1.3952000000000002e-06, |
| "loss": 1.3366, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.864, |
| "grad_norm": 0.6146747469902039, |
| "learning_rate": 1.3632000000000001e-06, |
| "loss": 1.2494, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8672, |
| "grad_norm": 0.6117052435874939, |
| "learning_rate": 1.3312e-06, |
| "loss": 1.2398, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.8704, |
| "grad_norm": 0.4775325655937195, |
| "learning_rate": 1.2992000000000002e-06, |
| "loss": 1.3065, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.8736, |
| "grad_norm": 0.6605592966079712, |
| "learning_rate": 1.2672000000000001e-06, |
| "loss": 1.1719, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.8768, |
| "grad_norm": 0.48634928464889526, |
| "learning_rate": 1.2352e-06, |
| "loss": 1.2774, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.6096370220184326, |
| "learning_rate": 1.2032e-06, |
| "loss": 1.3231, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8832, |
| "grad_norm": 0.5880251526832581, |
| "learning_rate": 1.1712000000000001e-06, |
| "loss": 1.2641, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.8864, |
| "grad_norm": 0.5116971135139465, |
| "learning_rate": 1.1392e-06, |
| "loss": 1.2763, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.8896, |
| "grad_norm": 0.6191303730010986, |
| "learning_rate": 1.1072000000000002e-06, |
| "loss": 1.2622, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.8928, |
| "grad_norm": 0.5492941737174988, |
| "learning_rate": 1.0752e-06, |
| "loss": 1.3002, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.896, |
| "grad_norm": 0.6216818690299988, |
| "learning_rate": 1.0432e-06, |
| "loss": 1.3222, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8992, |
| "grad_norm": 0.5383599400520325, |
| "learning_rate": 1.0112000000000002e-06, |
| "loss": 1.292, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.9024, |
| "grad_norm": 0.5288344025611877, |
| "learning_rate": 9.792e-07, |
| "loss": 1.2895, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.9056, |
| "grad_norm": 0.5043691396713257, |
| "learning_rate": 9.472e-07, |
| "loss": 1.2499, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.9088, |
| "grad_norm": 0.5582976341247559, |
| "learning_rate": 9.152000000000001e-07, |
| "loss": 1.2986, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.912, |
| "grad_norm": 0.5215420126914978, |
| "learning_rate": 8.832000000000001e-07, |
| "loss": 1.3142, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.9152, |
| "grad_norm": 0.5378311276435852, |
| "learning_rate": 8.512000000000001e-07, |
| "loss": 1.2104, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.9184, |
| "grad_norm": 0.5053496360778809, |
| "learning_rate": 8.192000000000001e-07, |
| "loss": 1.3056, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.9216, |
| "grad_norm": 0.5381192564964294, |
| "learning_rate": 7.872000000000001e-07, |
| "loss": 1.3055, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.9248, |
| "grad_norm": 0.6026363968849182, |
| "learning_rate": 7.552000000000001e-07, |
| "loss": 1.346, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.928, |
| "grad_norm": 0.5687581896781921, |
| "learning_rate": 7.232e-07, |
| "loss": 1.3244, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.9312, |
| "grad_norm": 0.5862733125686646, |
| "learning_rate": 6.912e-07, |
| "loss": 1.2806, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.9344, |
| "grad_norm": 0.47303637862205505, |
| "learning_rate": 6.592000000000001e-07, |
| "loss": 1.2337, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.9376, |
| "grad_norm": 0.509482741355896, |
| "learning_rate": 6.272e-07, |
| "loss": 1.2466, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.9408, |
| "grad_norm": 0.5245184302330017, |
| "learning_rate": 5.952e-07, |
| "loss": 1.2577, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.944, |
| "grad_norm": 0.7082109451293945, |
| "learning_rate": 5.632000000000001e-07, |
| "loss": 1.2272, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.9472, |
| "grad_norm": 0.4797827899456024, |
| "learning_rate": 5.312000000000001e-07, |
| "loss": 1.3238, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.9504, |
| "grad_norm": 0.5341638326644897, |
| "learning_rate": 4.992e-07, |
| "loss": 1.313, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.9536, |
| "grad_norm": 0.5286096334457397, |
| "learning_rate": 4.672e-07, |
| "loss": 1.2538, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.9568, |
| "grad_norm": 0.5771506428718567, |
| "learning_rate": 4.352000000000001e-07, |
| "loss": 1.2869, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.5290225744247437, |
| "learning_rate": 4.0320000000000006e-07, |
| "loss": 1.2882, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.9632, |
| "grad_norm": 0.5034458637237549, |
| "learning_rate": 3.7120000000000004e-07, |
| "loss": 1.2333, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.9664, |
| "grad_norm": 0.5447474122047424, |
| "learning_rate": 3.392e-07, |
| "loss": 1.2834, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.9696, |
| "grad_norm": 0.685217559337616, |
| "learning_rate": 3.0720000000000005e-07, |
| "loss": 1.3295, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.9728, |
| "grad_norm": 0.5744034051895142, |
| "learning_rate": 2.7520000000000003e-07, |
| "loss": 1.3135, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.976, |
| "grad_norm": 0.7290722131729126, |
| "learning_rate": 2.432e-07, |
| "loss": 1.2872, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9792, |
| "grad_norm": 0.517284095287323, |
| "learning_rate": 2.112e-07, |
| "loss": 1.2822, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.9824, |
| "grad_norm": 0.5856100916862488, |
| "learning_rate": 1.792e-07, |
| "loss": 1.2041, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.9856, |
| "grad_norm": 0.5470112562179565, |
| "learning_rate": 1.4720000000000002e-07, |
| "loss": 1.2581, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.9888, |
| "grad_norm": 0.5641172528266907, |
| "learning_rate": 1.1520000000000001e-07, |
| "loss": 1.3559, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.992, |
| "grad_norm": 0.5377922058105469, |
| "learning_rate": 8.32e-08, |
| "loss": 1.2531, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9952, |
| "grad_norm": 0.579965353012085, |
| "learning_rate": 5.120000000000001e-08, |
| "loss": 1.3203, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.9984, |
| "grad_norm": 0.5516660809516907, |
| "learning_rate": 1.9200000000000003e-08, |
| "loss": 1.2791, |
| "step": 3120 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 3125, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.08093267968e+17, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |