| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.96, | |
| "eval_steps": 500, | |
| "global_step": 3000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0032, | |
| "grad_norm": 0.09734748303890228, | |
| "learning_rate": 9.9712e-06, | |
| "loss": 1.6245, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 0.0981544628739357, | |
| "learning_rate": 9.939200000000001e-06, | |
| "loss": 1.5375, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0096, | |
| "grad_norm": 0.09418202936649323, | |
| "learning_rate": 9.9072e-06, | |
| "loss": 1.5647, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 0.10748359560966492, | |
| "learning_rate": 9.8752e-06, | |
| "loss": 1.6781, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 0.12658047676086426, | |
| "learning_rate": 9.843200000000001e-06, | |
| "loss": 1.5854, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 0.1334228664636612, | |
| "learning_rate": 9.8112e-06, | |
| "loss": 1.5453, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0224, | |
| "grad_norm": 0.15112873911857605, | |
| "learning_rate": 9.779200000000001e-06, | |
| "loss": 1.5721, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 0.140653595328331, | |
| "learning_rate": 9.7472e-06, | |
| "loss": 1.5162, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0288, | |
| "grad_norm": 0.16999679803848267, | |
| "learning_rate": 9.715200000000001e-06, | |
| "loss": 1.5689, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.1928016096353531, | |
| "learning_rate": 9.6832e-06, | |
| "loss": 1.5845, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0352, | |
| "grad_norm": 0.19378426671028137, | |
| "learning_rate": 9.6512e-06, | |
| "loss": 1.5386, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 0.24590148031711578, | |
| "learning_rate": 9.619200000000001e-06, | |
| "loss": 1.4133, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0416, | |
| "grad_norm": 0.23824049532413483, | |
| "learning_rate": 9.5872e-06, | |
| "loss": 1.4573, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 0.19866596162319183, | |
| "learning_rate": 9.555200000000001e-06, | |
| "loss": 1.4357, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 0.2909606993198395, | |
| "learning_rate": 9.5232e-06, | |
| "loss": 1.3924, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 0.48891496658325195, | |
| "learning_rate": 9.4912e-06, | |
| "loss": 1.4041, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0544, | |
| "grad_norm": 0.3921829164028168, | |
| "learning_rate": 9.4592e-06, | |
| "loss": 1.3158, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 0.293231338262558, | |
| "learning_rate": 9.4272e-06, | |
| "loss": 1.4709, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.0608, | |
| "grad_norm": 0.27421411871910095, | |
| "learning_rate": 9.395200000000001e-06, | |
| "loss": 1.4046, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.1971723437309265, | |
| "learning_rate": 9.3632e-06, | |
| "loss": 1.417, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0672, | |
| "grad_norm": 0.27423539757728577, | |
| "learning_rate": 9.3312e-06, | |
| "loss": 1.3721, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0704, | |
| "grad_norm": 0.4509432315826416, | |
| "learning_rate": 9.2992e-06, | |
| "loss": 1.4333, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0736, | |
| "grad_norm": 0.3389282822608948, | |
| "learning_rate": 9.2672e-06, | |
| "loss": 1.352, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0768, | |
| "grad_norm": 0.2814404368400574, | |
| "learning_rate": 9.235200000000001e-06, | |
| "loss": 1.3682, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.2661599814891815, | |
| "learning_rate": 9.2032e-06, | |
| "loss": 1.3661, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0832, | |
| "grad_norm": 0.29006555676460266, | |
| "learning_rate": 9.171200000000001e-06, | |
| "loss": 1.299, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.0864, | |
| "grad_norm": 0.2795925438404083, | |
| "learning_rate": 9.1392e-06, | |
| "loss": 1.3144, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0896, | |
| "grad_norm": 0.25778502225875854, | |
| "learning_rate": 9.1072e-06, | |
| "loss": 1.2957, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.0928, | |
| "grad_norm": 0.26814839243888855, | |
| "learning_rate": 9.0752e-06, | |
| "loss": 1.3356, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.3247470557689667, | |
| "learning_rate": 9.0432e-06, | |
| "loss": 1.3458, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0992, | |
| "grad_norm": 0.36921611428260803, | |
| "learning_rate": 9.011200000000001e-06, | |
| "loss": 1.3601, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "grad_norm": 0.31122124195098877, | |
| "learning_rate": 8.979200000000002e-06, | |
| "loss": 1.3131, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1056, | |
| "grad_norm": 0.3557804822921753, | |
| "learning_rate": 8.9472e-06, | |
| "loss": 1.426, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.1088, | |
| "grad_norm": 0.3266560137271881, | |
| "learning_rate": 8.9152e-06, | |
| "loss": 1.3386, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.3932088017463684, | |
| "learning_rate": 8.8832e-06, | |
| "loss": 1.3982, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1152, | |
| "grad_norm": 0.32620078325271606, | |
| "learning_rate": 8.851200000000001e-06, | |
| "loss": 1.3048, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.1184, | |
| "grad_norm": 0.30419647693634033, | |
| "learning_rate": 8.819200000000002e-06, | |
| "loss": 1.3761, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.1216, | |
| "grad_norm": 0.29732590913772583, | |
| "learning_rate": 8.7872e-06, | |
| "loss": 1.2566, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.1248, | |
| "grad_norm": 0.28484678268432617, | |
| "learning_rate": 8.7552e-06, | |
| "loss": 1.3666, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.4168960154056549, | |
| "learning_rate": 8.7232e-06, | |
| "loss": 1.3396, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1312, | |
| "grad_norm": 0.3573697507381439, | |
| "learning_rate": 8.6912e-06, | |
| "loss": 1.3684, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.1344, | |
| "grad_norm": 0.4777122735977173, | |
| "learning_rate": 8.659200000000002e-06, | |
| "loss": 1.3059, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.1376, | |
| "grad_norm": 0.26450300216674805, | |
| "learning_rate": 8.627200000000001e-06, | |
| "loss": 1.3283, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.1408, | |
| "grad_norm": 0.37447720766067505, | |
| "learning_rate": 8.5952e-06, | |
| "loss": 1.2667, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 0.30257123708724976, | |
| "learning_rate": 8.5632e-06, | |
| "loss": 1.3147, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1472, | |
| "grad_norm": 0.34745684266090393, | |
| "learning_rate": 8.5312e-06, | |
| "loss": 1.3603, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1504, | |
| "grad_norm": 0.2882753312587738, | |
| "learning_rate": 8.499200000000002e-06, | |
| "loss": 1.3087, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "grad_norm": 0.3751160204410553, | |
| "learning_rate": 8.467200000000001e-06, | |
| "loss": 1.342, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.1568, | |
| "grad_norm": 0.3185778260231018, | |
| "learning_rate": 8.435200000000002e-06, | |
| "loss": 1.35, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.2853422164916992, | |
| "learning_rate": 8.4032e-06, | |
| "loss": 1.3105, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1632, | |
| "grad_norm": 0.3187882602214813, | |
| "learning_rate": 8.3712e-06, | |
| "loss": 1.2915, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.1664, | |
| "grad_norm": 0.4516860842704773, | |
| "learning_rate": 8.339200000000001e-06, | |
| "loss": 1.3449, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.1696, | |
| "grad_norm": 0.3336597681045532, | |
| "learning_rate": 8.3072e-06, | |
| "loss": 1.2989, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.1728, | |
| "grad_norm": 0.4279087781906128, | |
| "learning_rate": 8.275200000000002e-06, | |
| "loss": 1.2412, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.4071614742279053, | |
| "learning_rate": 8.243200000000001e-06, | |
| "loss": 1.414, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1792, | |
| "grad_norm": 0.3194911479949951, | |
| "learning_rate": 8.2112e-06, | |
| "loss": 1.2762, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.1824, | |
| "grad_norm": 0.3617415428161621, | |
| "learning_rate": 8.179200000000001e-06, | |
| "loss": 1.3225, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.1856, | |
| "grad_norm": 0.3274191915988922, | |
| "learning_rate": 8.1472e-06, | |
| "loss": 1.3464, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.1888, | |
| "grad_norm": 0.35526078939437866, | |
| "learning_rate": 8.115200000000002e-06, | |
| "loss": 1.315, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.3728134036064148, | |
| "learning_rate": 8.0832e-06, | |
| "loss": 1.3023, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1952, | |
| "grad_norm": 0.4048090875148773, | |
| "learning_rate": 8.0512e-06, | |
| "loss": 1.2751, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.1984, | |
| "grad_norm": 0.41539278626441956, | |
| "learning_rate": 8.019200000000001e-06, | |
| "loss": 1.3533, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2016, | |
| "grad_norm": 0.3269357979297638, | |
| "learning_rate": 7.9872e-06, | |
| "loss": 1.2709, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "grad_norm": 0.3444967567920685, | |
| "learning_rate": 7.955200000000001e-06, | |
| "loss": 1.3119, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 0.34097886085510254, | |
| "learning_rate": 7.9232e-06, | |
| "loss": 1.3444, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.2112, | |
| "grad_norm": 0.42459428310394287, | |
| "learning_rate": 7.891200000000002e-06, | |
| "loss": 1.325, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.2144, | |
| "grad_norm": 0.3942951261997223, | |
| "learning_rate": 7.859200000000001e-06, | |
| "loss": 1.3732, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.2176, | |
| "grad_norm": 0.33468231558799744, | |
| "learning_rate": 7.8272e-06, | |
| "loss": 1.2883, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.2208, | |
| "grad_norm": 0.3964150547981262, | |
| "learning_rate": 7.795200000000001e-06, | |
| "loss": 1.4014, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.3447844386100769, | |
| "learning_rate": 7.7632e-06, | |
| "loss": 1.3205, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2272, | |
| "grad_norm": 0.380398154258728, | |
| "learning_rate": 7.731200000000001e-06, | |
| "loss": 1.2819, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.2304, | |
| "grad_norm": 0.3823450207710266, | |
| "learning_rate": 7.6992e-06, | |
| "loss": 1.3097, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.2336, | |
| "grad_norm": 0.3383599817752838, | |
| "learning_rate": 7.6672e-06, | |
| "loss": 1.346, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.2368, | |
| "grad_norm": 0.39140060544013977, | |
| "learning_rate": 7.635200000000001e-06, | |
| "loss": 1.2961, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.32159295678138733, | |
| "learning_rate": 7.6032e-06, | |
| "loss": 1.3045, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2432, | |
| "grad_norm": 0.3853408098220825, | |
| "learning_rate": 7.5712000000000005e-06, | |
| "loss": 1.2935, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.2464, | |
| "grad_norm": 0.39150312542915344, | |
| "learning_rate": 7.539200000000001e-06, | |
| "loss": 1.2976, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.2496, | |
| "grad_norm": 0.39306044578552246, | |
| "learning_rate": 7.507200000000001e-06, | |
| "loss": 1.2588, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.2528, | |
| "grad_norm": 0.39256688952445984, | |
| "learning_rate": 7.4752e-06, | |
| "loss": 1.3252, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.3738512098789215, | |
| "learning_rate": 7.4432e-06, | |
| "loss": 1.3162, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2592, | |
| "grad_norm": 0.4799080491065979, | |
| "learning_rate": 7.4112e-06, | |
| "loss": 1.2993, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.2624, | |
| "grad_norm": 0.4616535007953644, | |
| "learning_rate": 7.3792000000000004e-06, | |
| "loss": 1.3356, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.2656, | |
| "grad_norm": 0.37460416555404663, | |
| "learning_rate": 7.347200000000001e-06, | |
| "loss": 1.2938, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.2688, | |
| "grad_norm": 0.4229544997215271, | |
| "learning_rate": 7.3152e-06, | |
| "loss": 1.26, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 0.5051556825637817, | |
| "learning_rate": 7.2832e-06, | |
| "loss": 1.2868, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2752, | |
| "grad_norm": 0.3845407962799072, | |
| "learning_rate": 7.2512e-06, | |
| "loss": 1.3255, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.2784, | |
| "grad_norm": 0.43234601616859436, | |
| "learning_rate": 7.2192e-06, | |
| "loss": 1.2756, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.2816, | |
| "grad_norm": 0.390572190284729, | |
| "learning_rate": 7.187200000000001e-06, | |
| "loss": 1.3053, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.2848, | |
| "grad_norm": 0.385815292596817, | |
| "learning_rate": 7.155200000000001e-06, | |
| "loss": 1.2608, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.4778871238231659, | |
| "learning_rate": 7.1232e-06, | |
| "loss": 1.3109, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2912, | |
| "grad_norm": 0.3777396082878113, | |
| "learning_rate": 7.0912e-06, | |
| "loss": 1.2723, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.2944, | |
| "grad_norm": 0.4682841897010803, | |
| "learning_rate": 7.0592e-06, | |
| "loss": 1.3304, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.2976, | |
| "grad_norm": 0.3837222754955292, | |
| "learning_rate": 7.027200000000001e-06, | |
| "loss": 1.3081, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.3008, | |
| "grad_norm": 0.3792935907840729, | |
| "learning_rate": 6.995200000000001e-06, | |
| "loss": 1.3176, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 0.476096510887146, | |
| "learning_rate": 6.963200000000001e-06, | |
| "loss": 1.2764, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "grad_norm": 0.4119466543197632, | |
| "learning_rate": 6.9312e-06, | |
| "loss": 1.3563, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.3104, | |
| "grad_norm": 0.40938565135002136, | |
| "learning_rate": 6.8992e-06, | |
| "loss": 1.2782, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.3136, | |
| "grad_norm": 0.4305261969566345, | |
| "learning_rate": 6.867200000000001e-06, | |
| "loss": 1.3333, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.3168, | |
| "grad_norm": 0.3533143997192383, | |
| "learning_rate": 6.835200000000001e-06, | |
| "loss": 1.3686, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.43104642629623413, | |
| "learning_rate": 6.803200000000001e-06, | |
| "loss": 1.3461, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3232, | |
| "grad_norm": 0.5197634696960449, | |
| "learning_rate": 6.771200000000001e-06, | |
| "loss": 1.3316, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.3264, | |
| "grad_norm": 0.4084891080856323, | |
| "learning_rate": 6.7392e-06, | |
| "loss": 1.2941, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.3296, | |
| "grad_norm": 0.4634837508201599, | |
| "learning_rate": 6.707200000000001e-06, | |
| "loss": 1.2982, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.3328, | |
| "grad_norm": 0.4361494183540344, | |
| "learning_rate": 6.675200000000001e-06, | |
| "loss": 1.334, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 0.36735212802886963, | |
| "learning_rate": 6.643200000000001e-06, | |
| "loss": 1.3642, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3392, | |
| "grad_norm": 0.3968944847583771, | |
| "learning_rate": 6.611200000000001e-06, | |
| "loss": 1.3784, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.3424, | |
| "grad_norm": 0.39363133907318115, | |
| "learning_rate": 6.5792e-06, | |
| "loss": 1.2715, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.3456, | |
| "grad_norm": 0.4664965867996216, | |
| "learning_rate": 6.547200000000001e-06, | |
| "loss": 1.3436, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.3488, | |
| "grad_norm": 0.3857831358909607, | |
| "learning_rate": 6.515200000000001e-06, | |
| "loss": 1.3084, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.41258570551872253, | |
| "learning_rate": 6.483200000000001e-06, | |
| "loss": 1.3288, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3552, | |
| "grad_norm": 0.3971725404262543, | |
| "learning_rate": 6.451200000000001e-06, | |
| "loss": 1.3321, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "grad_norm": 0.3993317186832428, | |
| "learning_rate": 6.419200000000001e-06, | |
| "loss": 1.3385, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.3616, | |
| "grad_norm": 0.5872831344604492, | |
| "learning_rate": 6.3872000000000004e-06, | |
| "loss": 1.2817, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.3648, | |
| "grad_norm": 0.47822561860084534, | |
| "learning_rate": 6.355200000000001e-06, | |
| "loss": 1.3083, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.5206847786903381, | |
| "learning_rate": 6.323200000000001e-06, | |
| "loss": 1.3457, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3712, | |
| "grad_norm": 0.41014567017555237, | |
| "learning_rate": 6.291200000000001e-06, | |
| "loss": 1.2687, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.3744, | |
| "grad_norm": 0.39573901891708374, | |
| "learning_rate": 6.259200000000001e-06, | |
| "loss": 1.3257, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.3776, | |
| "grad_norm": 0.40908557176589966, | |
| "learning_rate": 6.227200000000001e-06, | |
| "loss": 1.2587, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.3808, | |
| "grad_norm": 0.4308335781097412, | |
| "learning_rate": 6.1952e-06, | |
| "loss": 1.2764, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.41657981276512146, | |
| "learning_rate": 6.1632000000000006e-06, | |
| "loss": 1.3305, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3872, | |
| "grad_norm": 0.446154922246933, | |
| "learning_rate": 6.131200000000001e-06, | |
| "loss": 1.3323, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.3904, | |
| "grad_norm": 0.43903544545173645, | |
| "learning_rate": 6.099200000000001e-06, | |
| "loss": 1.2731, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.3936, | |
| "grad_norm": 0.4204481542110443, | |
| "learning_rate": 6.067200000000001e-06, | |
| "loss": 1.2569, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.3968, | |
| "grad_norm": 0.4393060803413391, | |
| "learning_rate": 6.0352e-06, | |
| "loss": 1.3119, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.42466068267822266, | |
| "learning_rate": 6.0032e-06, | |
| "loss": 1.2106, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.4032, | |
| "grad_norm": 0.40182891488075256, | |
| "learning_rate": 5.9712000000000005e-06, | |
| "loss": 1.2566, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.4064, | |
| "grad_norm": 0.3702845275402069, | |
| "learning_rate": 5.939200000000001e-06, | |
| "loss": 1.3344, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.4096, | |
| "grad_norm": 0.4409834146499634, | |
| "learning_rate": 5.907200000000001e-06, | |
| "loss": 1.2553, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.4128, | |
| "grad_norm": 0.5070372223854065, | |
| "learning_rate": 5.875200000000001e-06, | |
| "loss": 1.2901, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.44239479303359985, | |
| "learning_rate": 5.8432e-06, | |
| "loss": 1.2086, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4192, | |
| "grad_norm": 0.5466510653495789, | |
| "learning_rate": 5.8112e-06, | |
| "loss": 1.2959, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.4224, | |
| "grad_norm": 0.5056144595146179, | |
| "learning_rate": 5.7792000000000005e-06, | |
| "loss": 1.3353, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.4256, | |
| "grad_norm": 0.42606833577156067, | |
| "learning_rate": 5.747200000000001e-06, | |
| "loss": 1.3108, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.4288, | |
| "grad_norm": 0.41976213455200195, | |
| "learning_rate": 5.715200000000001e-06, | |
| "loss": 1.3248, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.48559048771858215, | |
| "learning_rate": 5.683200000000001e-06, | |
| "loss": 1.2686, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4352, | |
| "grad_norm": 0.47761228680610657, | |
| "learning_rate": 5.6512e-06, | |
| "loss": 1.281, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.4384, | |
| "grad_norm": 0.4777953028678894, | |
| "learning_rate": 5.6192e-06, | |
| "loss": 1.2829, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.4416, | |
| "grad_norm": 0.44091978669166565, | |
| "learning_rate": 5.5872000000000005e-06, | |
| "loss": 1.3032, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.4448, | |
| "grad_norm": 0.48977166414260864, | |
| "learning_rate": 5.555200000000001e-06, | |
| "loss": 1.3418, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.6014530062675476, | |
| "learning_rate": 5.523200000000001e-06, | |
| "loss": 1.2119, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4512, | |
| "grad_norm": 0.4750172793865204, | |
| "learning_rate": 5.491200000000001e-06, | |
| "loss": 1.3432, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.4544, | |
| "grad_norm": 0.5095167756080627, | |
| "learning_rate": 5.4592e-06, | |
| "loss": 1.3448, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.4576, | |
| "grad_norm": 0.47408685088157654, | |
| "learning_rate": 5.4272e-06, | |
| "loss": 1.3436, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.4608, | |
| "grad_norm": 0.45464885234832764, | |
| "learning_rate": 5.3952000000000005e-06, | |
| "loss": 1.1962, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.431349515914917, | |
| "learning_rate": 5.363200000000001e-06, | |
| "loss": 1.2773, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4672, | |
| "grad_norm": 0.444397896528244, | |
| "learning_rate": 5.331200000000001e-06, | |
| "loss": 1.3163, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.4704, | |
| "grad_norm": 0.4360913038253784, | |
| "learning_rate": 5.2992e-06, | |
| "loss": 1.2759, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.4736, | |
| "grad_norm": 0.5152497887611389, | |
| "learning_rate": 5.2672e-06, | |
| "loss": 1.3225, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.4768, | |
| "grad_norm": 0.48929157853126526, | |
| "learning_rate": 5.2352e-06, | |
| "loss": 1.3213, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.4925262928009033, | |
| "learning_rate": 5.2032000000000004e-06, | |
| "loss": 1.2008, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4832, | |
| "grad_norm": 0.46162164211273193, | |
| "learning_rate": 5.1712000000000006e-06, | |
| "loss": 1.2996, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.4864, | |
| "grad_norm": 0.4908200800418854, | |
| "learning_rate": 5.139200000000001e-06, | |
| "loss": 1.2729, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.4896, | |
| "grad_norm": 0.5178566575050354, | |
| "learning_rate": 5.1072e-06, | |
| "loss": 1.293, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.4928, | |
| "grad_norm": 0.5733951330184937, | |
| "learning_rate": 5.0752e-06, | |
| "loss": 1.3573, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 0.4558843672275543, | |
| "learning_rate": 5.0432e-06, | |
| "loss": 1.3445, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4992, | |
| "grad_norm": 0.5171469449996948, | |
| "learning_rate": 5.0112e-06, | |
| "loss": 1.2293, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.5024, | |
| "grad_norm": 0.4879666864871979, | |
| "learning_rate": 4.9792000000000005e-06, | |
| "loss": 1.31, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.5056, | |
| "grad_norm": 0.4393675923347473, | |
| "learning_rate": 4.947200000000001e-06, | |
| "loss": 1.3186, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.5088, | |
| "grad_norm": 0.5072659254074097, | |
| "learning_rate": 4.915200000000001e-06, | |
| "loss": 1.2857, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.5163191556930542, | |
| "learning_rate": 4.8832e-06, | |
| "loss": 1.3401, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5152, | |
| "grad_norm": 0.5119105577468872, | |
| "learning_rate": 4.8512e-06, | |
| "loss": 1.32, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.5184, | |
| "grad_norm": 0.5342932939529419, | |
| "learning_rate": 4.8192e-06, | |
| "loss": 1.206, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5216, | |
| "grad_norm": 0.4517419636249542, | |
| "learning_rate": 4.7872000000000005e-06, | |
| "loss": 1.3077, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.5248, | |
| "grad_norm": 0.46141722798347473, | |
| "learning_rate": 4.755200000000001e-06, | |
| "loss": 1.2873, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 0.41747117042541504, | |
| "learning_rate": 4.723200000000001e-06, | |
| "loss": 1.2715, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5312, | |
| "grad_norm": 0.48263996839523315, | |
| "learning_rate": 4.6912e-06, | |
| "loss": 1.2814, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5344, | |
| "grad_norm": 0.4876611828804016, | |
| "learning_rate": 4.6592e-06, | |
| "loss": 1.2776, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.5376, | |
| "grad_norm": 0.46099624037742615, | |
| "learning_rate": 4.6272e-06, | |
| "loss": 1.3839, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.5408, | |
| "grad_norm": 0.46614623069763184, | |
| "learning_rate": 4.5952000000000005e-06, | |
| "loss": 1.2717, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.48747870326042175, | |
| "learning_rate": 4.563200000000001e-06, | |
| "loss": 1.2937, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5472, | |
| "grad_norm": 0.5542135238647461, | |
| "learning_rate": 4.531200000000001e-06, | |
| "loss": 1.2622, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.5504, | |
| "grad_norm": 0.46008777618408203, | |
| "learning_rate": 4.4992e-06, | |
| "loss": 1.3188, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.5536, | |
| "grad_norm": 0.4853471517562866, | |
| "learning_rate": 4.4672e-06, | |
| "loss": 1.252, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.5568, | |
| "grad_norm": 0.44900670647621155, | |
| "learning_rate": 4.4352e-06, | |
| "loss": 1.2549, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.4973522126674652, | |
| "learning_rate": 4.4032000000000005e-06, | |
| "loss": 1.2959, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5632, | |
| "grad_norm": 0.45412448048591614, | |
| "learning_rate": 4.371200000000001e-06, | |
| "loss": 1.2092, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5664, | |
| "grad_norm": 0.5110604763031006, | |
| "learning_rate": 4.3392e-06, | |
| "loss": 1.3127, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.5696, | |
| "grad_norm": 0.5951307415962219, | |
| "learning_rate": 4.3072e-06, | |
| "loss": 1.2603, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.5728, | |
| "grad_norm": 0.49740588665008545, | |
| "learning_rate": 4.2752e-06, | |
| "loss": 1.2609, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.4803503155708313, | |
| "learning_rate": 4.2432e-06, | |
| "loss": 1.2287, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5792, | |
| "grad_norm": 0.48638489842414856, | |
| "learning_rate": 4.2112000000000004e-06, | |
| "loss": 1.2245, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.5824, | |
| "grad_norm": 0.48148202896118164, | |
| "learning_rate": 4.179200000000001e-06, | |
| "loss": 1.2858, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.5856, | |
| "grad_norm": 0.5493887662887573, | |
| "learning_rate": 4.1472e-06, | |
| "loss": 1.2765, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.5888, | |
| "grad_norm": 0.45376092195510864, | |
| "learning_rate": 4.1152e-06, | |
| "loss": 1.1914, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 0.5095167756080627, | |
| "learning_rate": 4.0832e-06, | |
| "loss": 1.2916, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5952, | |
| "grad_norm": 0.5425928831100464, | |
| "learning_rate": 4.0512e-06, | |
| "loss": 1.2189, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.5984, | |
| "grad_norm": 0.46790796518325806, | |
| "learning_rate": 4.0192e-06, | |
| "loss": 1.3668, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.6016, | |
| "grad_norm": 0.48903679847717285, | |
| "learning_rate": 3.9872000000000006e-06, | |
| "loss": 1.2132, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.6048, | |
| "grad_norm": 0.47461065649986267, | |
| "learning_rate": 3.9552e-06, | |
| "loss": 1.2794, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.4707651436328888, | |
| "learning_rate": 3.9232e-06, | |
| "loss": 1.3, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6112, | |
| "grad_norm": 0.5604966878890991, | |
| "learning_rate": 3.8912e-06, | |
| "loss": 1.2272, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "grad_norm": 0.5373271107673645, | |
| "learning_rate": 3.8592e-06, | |
| "loss": 1.2522, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.6176, | |
| "grad_norm": 0.50235915184021, | |
| "learning_rate": 3.8272e-06, | |
| "loss": 1.2486, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.6208, | |
| "grad_norm": 0.4826876223087311, | |
| "learning_rate": 3.7952000000000005e-06, | |
| "loss": 1.3355, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 0.46976956725120544, | |
| "learning_rate": 3.7632000000000002e-06, | |
| "loss": 1.2725, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6272, | |
| "grad_norm": 0.5186979174613953, | |
| "learning_rate": 3.7312000000000004e-06, | |
| "loss": 1.3073, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.6304, | |
| "grad_norm": 0.4939082860946655, | |
| "learning_rate": 3.6992000000000005e-06, | |
| "loss": 1.2649, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.6336, | |
| "grad_norm": 0.5091391205787659, | |
| "learning_rate": 3.6672000000000002e-06, | |
| "loss": 1.4142, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.6368, | |
| "grad_norm": 0.4665001928806305, | |
| "learning_rate": 3.6352000000000004e-06, | |
| "loss": 1.2606, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.48443859815597534, | |
| "learning_rate": 3.6032e-06, | |
| "loss": 1.1884, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6432, | |
| "grad_norm": 0.5871022939682007, | |
| "learning_rate": 3.5712000000000002e-06, | |
| "loss": 1.3792, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.6464, | |
| "grad_norm": 0.48302605748176575, | |
| "learning_rate": 3.5392000000000004e-06, | |
| "loss": 1.262, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.6496, | |
| "grad_norm": 0.4569855034351349, | |
| "learning_rate": 3.5072e-06, | |
| "loss": 1.2587, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.6528, | |
| "grad_norm": 0.5194870829582214, | |
| "learning_rate": 3.4752e-06, | |
| "loss": 1.3056, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.4751642346382141, | |
| "learning_rate": 3.4432000000000003e-06, | |
| "loss": 1.1733, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6592, | |
| "grad_norm": 0.5077437162399292, | |
| "learning_rate": 3.4112e-06, | |
| "loss": 1.3218, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.6624, | |
| "grad_norm": 0.49009519815444946, | |
| "learning_rate": 3.3792e-06, | |
| "loss": 1.224, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "grad_norm": 0.4634891152381897, | |
| "learning_rate": 3.3472000000000003e-06, | |
| "loss": 1.2727, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.6688, | |
| "grad_norm": 0.5274826884269714, | |
| "learning_rate": 3.3152e-06, | |
| "loss": 1.2916, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.5165941715240479, | |
| "learning_rate": 3.2832e-06, | |
| "loss": 1.2878, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6752, | |
| "grad_norm": 0.5654541254043579, | |
| "learning_rate": 3.2512000000000003e-06, | |
| "loss": 1.2749, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.6784, | |
| "grad_norm": 0.49610668420791626, | |
| "learning_rate": 3.2192e-06, | |
| "loss": 1.2668, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.6816, | |
| "grad_norm": 0.5377901196479797, | |
| "learning_rate": 3.1872e-06, | |
| "loss": 1.2671, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.6848, | |
| "grad_norm": 0.5280618071556091, | |
| "learning_rate": 3.1552000000000003e-06, | |
| "loss": 1.2637, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 0.5266459584236145, | |
| "learning_rate": 3.1232e-06, | |
| "loss": 1.2604, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6912, | |
| "grad_norm": 0.47189775109291077, | |
| "learning_rate": 3.0912e-06, | |
| "loss": 1.2546, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.6944, | |
| "grad_norm": 0.5069970488548279, | |
| "learning_rate": 3.0592000000000007e-06, | |
| "loss": 1.2538, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.6976, | |
| "grad_norm": 0.5452210903167725, | |
| "learning_rate": 3.0272e-06, | |
| "loss": 1.2896, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.7008, | |
| "grad_norm": 0.47197288274765015, | |
| "learning_rate": 2.9952e-06, | |
| "loss": 1.2104, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.5163410305976868, | |
| "learning_rate": 2.9632e-06, | |
| "loss": 1.2495, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7072, | |
| "grad_norm": 0.4659384787082672, | |
| "learning_rate": 2.9312e-06, | |
| "loss": 1.226, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.7104, | |
| "grad_norm": 0.5424367189407349, | |
| "learning_rate": 2.8992000000000005e-06, | |
| "loss": 1.3475, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.7136, | |
| "grad_norm": 0.5033388137817383, | |
| "learning_rate": 2.8672e-06, | |
| "loss": 1.2415, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "grad_norm": 0.4847257733345032, | |
| "learning_rate": 2.8352e-06, | |
| "loss": 1.2562, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.5888292789459229, | |
| "learning_rate": 2.8032000000000005e-06, | |
| "loss": 1.3166, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7232, | |
| "grad_norm": 0.5637612342834473, | |
| "learning_rate": 2.7712e-06, | |
| "loss": 1.2805, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.7264, | |
| "grad_norm": 0.477873831987381, | |
| "learning_rate": 2.7392000000000004e-06, | |
| "loss": 1.2804, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.7296, | |
| "grad_norm": 0.627713143825531, | |
| "learning_rate": 2.7072000000000005e-06, | |
| "loss": 1.2844, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7328, | |
| "grad_norm": 0.5947350859642029, | |
| "learning_rate": 2.6752e-06, | |
| "loss": 1.28, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 0.49309098720550537, | |
| "learning_rate": 2.6432000000000004e-06, | |
| "loss": 1.353, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7392, | |
| "grad_norm": 0.5657567381858826, | |
| "learning_rate": 2.6112000000000005e-06, | |
| "loss": 1.3422, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.7424, | |
| "grad_norm": 0.5906503200531006, | |
| "learning_rate": 2.5792000000000002e-06, | |
| "loss": 1.2691, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.7456, | |
| "grad_norm": 0.5093393325805664, | |
| "learning_rate": 2.5472000000000004e-06, | |
| "loss": 1.2689, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.7488, | |
| "grad_norm": 0.48354557156562805, | |
| "learning_rate": 2.5152000000000005e-06, | |
| "loss": 1.2062, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 0.6542074084281921, | |
| "learning_rate": 2.4832000000000002e-06, | |
| "loss": 1.2852, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7552, | |
| "grad_norm": 0.5252315998077393, | |
| "learning_rate": 2.4512000000000003e-06, | |
| "loss": 1.2635, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.7584, | |
| "grad_norm": 0.48583582043647766, | |
| "learning_rate": 2.4192e-06, | |
| "loss": 1.2096, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.7616, | |
| "grad_norm": 0.49642977118492126, | |
| "learning_rate": 2.3872e-06, | |
| "loss": 1.2424, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.7648, | |
| "grad_norm": 0.6025352478027344, | |
| "learning_rate": 2.3552000000000003e-06, | |
| "loss": 1.2992, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.5461027026176453, | |
| "learning_rate": 2.3232e-06, | |
| "loss": 1.2946, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7712, | |
| "grad_norm": 0.6130191683769226, | |
| "learning_rate": 2.2912e-06, | |
| "loss": 1.2398, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.7744, | |
| "grad_norm": 0.6468284726142883, | |
| "learning_rate": 2.2592000000000003e-06, | |
| "loss": 1.3087, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.7776, | |
| "grad_norm": 0.6268571019172668, | |
| "learning_rate": 2.2272e-06, | |
| "loss": 1.1613, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.7808, | |
| "grad_norm": 0.7104691863059998, | |
| "learning_rate": 2.1952e-06, | |
| "loss": 1.27, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 0.4856204688549042, | |
| "learning_rate": 2.1632000000000003e-06, | |
| "loss": 1.2731, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7872, | |
| "grad_norm": 0.5168479681015015, | |
| "learning_rate": 2.1312e-06, | |
| "loss": 1.3437, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.7904, | |
| "grad_norm": 0.659817636013031, | |
| "learning_rate": 2.0992e-06, | |
| "loss": 1.2839, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.7936, | |
| "grad_norm": 0.5834536552429199, | |
| "learning_rate": 2.0672e-06, | |
| "loss": 1.3048, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.7968, | |
| "grad_norm": 0.4839385151863098, | |
| "learning_rate": 2.0352000000000004e-06, | |
| "loss": 1.2803, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.588320255279541, | |
| "learning_rate": 2.0032e-06, | |
| "loss": 1.2276, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8032, | |
| "grad_norm": 0.5608358383178711, | |
| "learning_rate": 1.9712e-06, | |
| "loss": 1.3644, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.8064, | |
| "grad_norm": 0.5970802903175354, | |
| "learning_rate": 1.9392000000000004e-06, | |
| "loss": 1.2919, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.8096, | |
| "grad_norm": 0.5823186039924622, | |
| "learning_rate": 1.9072000000000001e-06, | |
| "loss": 1.3033, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.8128, | |
| "grad_norm": 0.5669010281562805, | |
| "learning_rate": 1.8752e-06, | |
| "loss": 1.3379, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 0.5039373636245728, | |
| "learning_rate": 1.8432000000000002e-06, | |
| "loss": 1.2282, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "grad_norm": 0.5700042843818665, | |
| "learning_rate": 1.8112000000000001e-06, | |
| "loss": 1.2615, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.8224, | |
| "grad_norm": 0.5190805196762085, | |
| "learning_rate": 1.7792e-06, | |
| "loss": 1.2593, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.8256, | |
| "grad_norm": 0.5930772423744202, | |
| "learning_rate": 1.7472e-06, | |
| "loss": 1.2265, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.8288, | |
| "grad_norm": 0.5103446245193481, | |
| "learning_rate": 1.7152000000000001e-06, | |
| "loss": 1.2012, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.534788966178894, | |
| "learning_rate": 1.6832e-06, | |
| "loss": 1.2393, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8352, | |
| "grad_norm": 0.572394609451294, | |
| "learning_rate": 1.6512e-06, | |
| "loss": 1.2876, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.8384, | |
| "grad_norm": 0.4987950623035431, | |
| "learning_rate": 1.6192000000000003e-06, | |
| "loss": 1.2783, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.8416, | |
| "grad_norm": 0.5138176083564758, | |
| "learning_rate": 1.5872e-06, | |
| "loss": 1.2559, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.8448, | |
| "grad_norm": 0.5693644881248474, | |
| "learning_rate": 1.5552e-06, | |
| "loss": 1.2599, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 0.6024214029312134, | |
| "learning_rate": 1.5232000000000003e-06, | |
| "loss": 1.3064, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.8512, | |
| "grad_norm": 0.5588571429252625, | |
| "learning_rate": 1.4912000000000002e-06, | |
| "loss": 1.2977, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.8544, | |
| "grad_norm": 0.5551236867904663, | |
| "learning_rate": 1.4592000000000001e-06, | |
| "loss": 1.3121, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.8576, | |
| "grad_norm": 0.5989100933074951, | |
| "learning_rate": 1.4272000000000003e-06, | |
| "loss": 1.2795, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.8608, | |
| "grad_norm": 0.6164664626121521, | |
| "learning_rate": 1.3952000000000002e-06, | |
| "loss": 1.3366, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.6146747469902039, | |
| "learning_rate": 1.3632000000000001e-06, | |
| "loss": 1.2494, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8672, | |
| "grad_norm": 0.6117052435874939, | |
| "learning_rate": 1.3312e-06, | |
| "loss": 1.2398, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "grad_norm": 0.4775325655937195, | |
| "learning_rate": 1.2992000000000002e-06, | |
| "loss": 1.3065, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.8736, | |
| "grad_norm": 0.6605592966079712, | |
| "learning_rate": 1.2672000000000001e-06, | |
| "loss": 1.1719, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.8768, | |
| "grad_norm": 0.48634928464889526, | |
| "learning_rate": 1.2352e-06, | |
| "loss": 1.2774, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.6096370220184326, | |
| "learning_rate": 1.2032e-06, | |
| "loss": 1.3231, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8832, | |
| "grad_norm": 0.5880251526832581, | |
| "learning_rate": 1.1712000000000001e-06, | |
| "loss": 1.2641, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.8864, | |
| "grad_norm": 0.5116971135139465, | |
| "learning_rate": 1.1392e-06, | |
| "loss": 1.2763, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.8896, | |
| "grad_norm": 0.6191303730010986, | |
| "learning_rate": 1.1072000000000002e-06, | |
| "loss": 1.2622, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.8928, | |
| "grad_norm": 0.5492941737174988, | |
| "learning_rate": 1.0752e-06, | |
| "loss": 1.3002, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.6216818690299988, | |
| "learning_rate": 1.0432e-06, | |
| "loss": 1.3222, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8992, | |
| "grad_norm": 0.5383599400520325, | |
| "learning_rate": 1.0112000000000002e-06, | |
| "loss": 1.292, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.9024, | |
| "grad_norm": 0.5288344025611877, | |
| "learning_rate": 9.792e-07, | |
| "loss": 1.2895, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.9056, | |
| "grad_norm": 0.5043691396713257, | |
| "learning_rate": 9.472e-07, | |
| "loss": 1.2499, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.9088, | |
| "grad_norm": 0.5582976341247559, | |
| "learning_rate": 9.152000000000001e-07, | |
| "loss": 1.2986, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 0.5215420126914978, | |
| "learning_rate": 8.832000000000001e-07, | |
| "loss": 1.3142, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9152, | |
| "grad_norm": 0.5378311276435852, | |
| "learning_rate": 8.512000000000001e-07, | |
| "loss": 1.2104, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.9184, | |
| "grad_norm": 0.5053496360778809, | |
| "learning_rate": 8.192000000000001e-07, | |
| "loss": 1.3056, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "grad_norm": 0.5381192564964294, | |
| "learning_rate": 7.872000000000001e-07, | |
| "loss": 1.3055, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.9248, | |
| "grad_norm": 0.6026363968849182, | |
| "learning_rate": 7.552000000000001e-07, | |
| "loss": 1.346, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.5687581896781921, | |
| "learning_rate": 7.232e-07, | |
| "loss": 1.3244, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.9312, | |
| "grad_norm": 0.5862733125686646, | |
| "learning_rate": 6.912e-07, | |
| "loss": 1.2806, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.9344, | |
| "grad_norm": 0.47303637862205505, | |
| "learning_rate": 6.592000000000001e-07, | |
| "loss": 1.2337, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.9376, | |
| "grad_norm": 0.509482741355896, | |
| "learning_rate": 6.272e-07, | |
| "loss": 1.2466, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.9408, | |
| "grad_norm": 0.5245184302330017, | |
| "learning_rate": 5.952e-07, | |
| "loss": 1.2577, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 0.7082109451293945, | |
| "learning_rate": 5.632000000000001e-07, | |
| "loss": 1.2272, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.9472, | |
| "grad_norm": 0.4797827899456024, | |
| "learning_rate": 5.312000000000001e-07, | |
| "loss": 1.3238, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.9504, | |
| "grad_norm": 0.5341638326644897, | |
| "learning_rate": 4.992e-07, | |
| "loss": 1.313, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.9536, | |
| "grad_norm": 0.5286096334457397, | |
| "learning_rate": 4.672e-07, | |
| "loss": 1.2538, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.9568, | |
| "grad_norm": 0.5771506428718567, | |
| "learning_rate": 4.352000000000001e-07, | |
| "loss": 1.2869, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.5290225744247437, | |
| "learning_rate": 4.0320000000000006e-07, | |
| "loss": 1.2882, | |
| "step": 3000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3125, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.8776953724928e+17, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |