| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.291005291005291, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.05291005291005291, |
| "grad_norm": 12.614516258239746, |
| "learning_rate": 9e-07, |
| "loss": 1.0027, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.10582010582010581, |
| "grad_norm": 7.134069919586182, |
| "learning_rate": 1.9e-06, |
| "loss": 0.9062, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.15873015873015872, |
| "grad_norm": 4.088052749633789, |
| "learning_rate": 2.9e-06, |
| "loss": 0.6868, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.21164021164021163, |
| "grad_norm": 1.5182710886001587, |
| "learning_rate": 3.9e-06, |
| "loss": 0.4484, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.26455026455026454, |
| "grad_norm": 1.0434691905975342, |
| "learning_rate": 4.9000000000000005e-06, |
| "loss": 0.2952, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.31746031746031744, |
| "grad_norm": 1.0803996324539185, |
| "learning_rate": 5.9e-06, |
| "loss": 0.2199, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.37037037037037035, |
| "grad_norm": 1.0268789529800415, |
| "learning_rate": 6.900000000000001e-06, |
| "loss": 0.1769, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.42328042328042326, |
| "grad_norm": 1.586670994758606, |
| "learning_rate": 7.9e-06, |
| "loss": 0.1669, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.47619047619047616, |
| "grad_norm": 1.1374748945236206, |
| "learning_rate": 8.9e-06, |
| "loss": 0.145, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.5291005291005291, |
| "grad_norm": 1.359847903251648, |
| "learning_rate": 9.900000000000002e-06, |
| "loss": 0.1334, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.582010582010582, |
| "grad_norm": 0.8251723647117615, |
| "learning_rate": 1.09e-05, |
| "loss": 0.1212, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.6349206349206349, |
| "grad_norm": 0.7170350551605225, |
| "learning_rate": 1.19e-05, |
| "loss": 0.1069, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.6878306878306878, |
| "grad_norm": 0.7338834404945374, |
| "learning_rate": 1.29e-05, |
| "loss": 0.0926, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.7407407407407407, |
| "grad_norm": 0.7547397017478943, |
| "learning_rate": 1.3900000000000002e-05, |
| "loss": 0.0854, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.7936507936507936, |
| "grad_norm": 0.7571271061897278, |
| "learning_rate": 1.49e-05, |
| "loss": 0.0805, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.8465608465608465, |
| "grad_norm": 0.6611447334289551, |
| "learning_rate": 1.59e-05, |
| "loss": 0.074, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.8994708994708994, |
| "grad_norm": 0.725141704082489, |
| "learning_rate": 1.69e-05, |
| "loss": 0.0735, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 0.7854066491127014, |
| "learning_rate": 1.79e-05, |
| "loss": 0.0657, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.0052910052910053, |
| "grad_norm": 0.689892053604126, |
| "learning_rate": 1.8900000000000002e-05, |
| "loss": 0.0576, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.0582010582010581, |
| "grad_norm": 0.4899508059024811, |
| "learning_rate": 1.9900000000000003e-05, |
| "loss": 0.0554, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 0.44909000396728516, |
| "learning_rate": 2.09e-05, |
| "loss": 0.0555, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.164021164021164, |
| "grad_norm": 0.5986359119415283, |
| "learning_rate": 2.19e-05, |
| "loss": 0.0551, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.216931216931217, |
| "grad_norm": 0.6715332269668579, |
| "learning_rate": 2.29e-05, |
| "loss": 0.0503, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.2698412698412698, |
| "grad_norm": 0.6094868779182434, |
| "learning_rate": 2.39e-05, |
| "loss": 0.0476, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.3227513227513228, |
| "grad_norm": 0.5420939922332764, |
| "learning_rate": 2.4900000000000002e-05, |
| "loss": 0.0453, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.3756613756613756, |
| "grad_norm": 0.506049394607544, |
| "learning_rate": 2.5900000000000003e-05, |
| "loss": 0.0428, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 0.42280900478363037, |
| "learning_rate": 2.6900000000000003e-05, |
| "loss": 0.0441, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.4814814814814814, |
| "grad_norm": 0.5239710211753845, |
| "learning_rate": 2.7900000000000004e-05, |
| "loss": 0.04, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.5343915343915344, |
| "grad_norm": 0.5714675784111023, |
| "learning_rate": 2.8899999999999998e-05, |
| "loss": 0.042, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.5873015873015874, |
| "grad_norm": 0.41054368019104004, |
| "learning_rate": 2.9900000000000002e-05, |
| "loss": 0.0388, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.6402116402116402, |
| "grad_norm": 0.5580154061317444, |
| "learning_rate": 3.09e-05, |
| "loss": 0.0377, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.693121693121693, |
| "grad_norm": 0.538187563419342, |
| "learning_rate": 3.19e-05, |
| "loss": 0.0388, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.746031746031746, |
| "grad_norm": 0.47221389412879944, |
| "learning_rate": 3.29e-05, |
| "loss": 0.037, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.798941798941799, |
| "grad_norm": 0.4035741686820984, |
| "learning_rate": 3.3900000000000004e-05, |
| "loss": 0.033, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.8518518518518519, |
| "grad_norm": 0.46619656682014465, |
| "learning_rate": 3.49e-05, |
| "loss": 0.032, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.9047619047619047, |
| "grad_norm": 0.4589271545410156, |
| "learning_rate": 3.59e-05, |
| "loss": 0.0338, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.9576719576719577, |
| "grad_norm": 0.428501158952713, |
| "learning_rate": 3.69e-05, |
| "loss": 0.0343, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.0105820105820107, |
| "grad_norm": 0.44803091883659363, |
| "learning_rate": 3.79e-05, |
| "loss": 0.033, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.0634920634920633, |
| "grad_norm": 0.4423496127128601, |
| "learning_rate": 3.8900000000000004e-05, |
| "loss": 0.0346, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.1164021164021163, |
| "grad_norm": 0.3414062261581421, |
| "learning_rate": 3.99e-05, |
| "loss": 0.0321, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.1693121693121693, |
| "grad_norm": 0.4166780710220337, |
| "learning_rate": 4.09e-05, |
| "loss": 0.0294, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 0.41433510184288025, |
| "learning_rate": 4.19e-05, |
| "loss": 0.0309, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.2751322751322753, |
| "grad_norm": 0.3357546627521515, |
| "learning_rate": 4.29e-05, |
| "loss": 0.0308, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.328042328042328, |
| "grad_norm": 0.3968923091888428, |
| "learning_rate": 4.39e-05, |
| "loss": 0.0333, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "grad_norm": 0.39435455203056335, |
| "learning_rate": 4.49e-05, |
| "loss": 0.0333, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.433862433862434, |
| "grad_norm": 0.4199584126472473, |
| "learning_rate": 4.5900000000000004e-05, |
| "loss": 0.0292, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.4867724867724865, |
| "grad_norm": 0.4035734534263611, |
| "learning_rate": 4.69e-05, |
| "loss": 0.0308, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.5396825396825395, |
| "grad_norm": 0.43274185061454773, |
| "learning_rate": 4.79e-05, |
| "loss": 0.0307, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.5925925925925926, |
| "grad_norm": 0.4387704133987427, |
| "learning_rate": 4.89e-05, |
| "loss": 0.0308, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.6455026455026456, |
| "grad_norm": 0.33311206102371216, |
| "learning_rate": 4.99e-05, |
| "loss": 0.0313, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.6984126984126986, |
| "grad_norm": 0.379742294549942, |
| "learning_rate": 5.0900000000000004e-05, |
| "loss": 0.0309, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.751322751322751, |
| "grad_norm": 0.47922366857528687, |
| "learning_rate": 5.19e-05, |
| "loss": 0.0286, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.804232804232804, |
| "grad_norm": 0.3975037932395935, |
| "learning_rate": 5.2900000000000005e-05, |
| "loss": 0.0256, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.3734884262084961, |
| "learning_rate": 5.390000000000001e-05, |
| "loss": 0.0286, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.91005291005291, |
| "grad_norm": 0.38052114844322205, |
| "learning_rate": 5.4900000000000006e-05, |
| "loss": 0.0273, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.962962962962963, |
| "grad_norm": 0.3779924809932709, |
| "learning_rate": 5.590000000000001e-05, |
| "loss": 0.0252, |
| "step": 560 |
| }, |
| { |
| "epoch": 3.015873015873016, |
| "grad_norm": 0.47244957089424133, |
| "learning_rate": 5.69e-05, |
| "loss": 0.027, |
| "step": 570 |
| }, |
| { |
| "epoch": 3.068783068783069, |
| "grad_norm": 0.44039979577064514, |
| "learning_rate": 5.79e-05, |
| "loss": 0.026, |
| "step": 580 |
| }, |
| { |
| "epoch": 3.121693121693122, |
| "grad_norm": 0.35815173387527466, |
| "learning_rate": 5.89e-05, |
| "loss": 0.0288, |
| "step": 590 |
| }, |
| { |
| "epoch": 3.1746031746031744, |
| "grad_norm": 0.4196639657020569, |
| "learning_rate": 5.99e-05, |
| "loss": 0.0276, |
| "step": 600 |
| }, |
| { |
| "epoch": 3.2275132275132274, |
| "grad_norm": 0.41070035099983215, |
| "learning_rate": 6.09e-05, |
| "loss": 0.0254, |
| "step": 610 |
| }, |
| { |
| "epoch": 3.2804232804232805, |
| "grad_norm": 0.377328097820282, |
| "learning_rate": 6.19e-05, |
| "loss": 0.0277, |
| "step": 620 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.3343811631202698, |
| "learning_rate": 6.29e-05, |
| "loss": 0.0278, |
| "step": 630 |
| }, |
| { |
| "epoch": 3.386243386243386, |
| "grad_norm": 0.30622223019599915, |
| "learning_rate": 6.390000000000001e-05, |
| "loss": 0.0267, |
| "step": 640 |
| }, |
| { |
| "epoch": 3.439153439153439, |
| "grad_norm": 0.29412469267845154, |
| "learning_rate": 6.49e-05, |
| "loss": 0.0267, |
| "step": 650 |
| }, |
| { |
| "epoch": 3.492063492063492, |
| "grad_norm": 0.3135109841823578, |
| "learning_rate": 6.59e-05, |
| "loss": 0.0251, |
| "step": 660 |
| }, |
| { |
| "epoch": 3.544973544973545, |
| "grad_norm": 0.314832866191864, |
| "learning_rate": 6.690000000000001e-05, |
| "loss": 0.0285, |
| "step": 670 |
| }, |
| { |
| "epoch": 3.597883597883598, |
| "grad_norm": 0.39124566316604614, |
| "learning_rate": 6.790000000000001e-05, |
| "loss": 0.0263, |
| "step": 680 |
| }, |
| { |
| "epoch": 3.6507936507936507, |
| "grad_norm": 0.37374603748321533, |
| "learning_rate": 6.89e-05, |
| "loss": 0.0238, |
| "step": 690 |
| }, |
| { |
| "epoch": 3.7037037037037037, |
| "grad_norm": 0.3832198679447174, |
| "learning_rate": 6.99e-05, |
| "loss": 0.0231, |
| "step": 700 |
| }, |
| { |
| "epoch": 3.7566137566137567, |
| "grad_norm": 0.40413761138916016, |
| "learning_rate": 7.09e-05, |
| "loss": 0.0239, |
| "step": 710 |
| }, |
| { |
| "epoch": 3.8095238095238093, |
| "grad_norm": 0.35467275977134705, |
| "learning_rate": 7.19e-05, |
| "loss": 0.025, |
| "step": 720 |
| }, |
| { |
| "epoch": 3.8624338624338623, |
| "grad_norm": 0.35146641731262207, |
| "learning_rate": 7.29e-05, |
| "loss": 0.0253, |
| "step": 730 |
| }, |
| { |
| "epoch": 3.9153439153439153, |
| "grad_norm": 0.3469861149787903, |
| "learning_rate": 7.390000000000001e-05, |
| "loss": 0.0228, |
| "step": 740 |
| }, |
| { |
| "epoch": 3.9682539682539684, |
| "grad_norm": 0.3485950529575348, |
| "learning_rate": 7.49e-05, |
| "loss": 0.0236, |
| "step": 750 |
| }, |
| { |
| "epoch": 4.021164021164021, |
| "grad_norm": 0.35884398221969604, |
| "learning_rate": 7.59e-05, |
| "loss": 0.0242, |
| "step": 760 |
| }, |
| { |
| "epoch": 4.074074074074074, |
| "grad_norm": 0.34299910068511963, |
| "learning_rate": 7.69e-05, |
| "loss": 0.0249, |
| "step": 770 |
| }, |
| { |
| "epoch": 4.1269841269841265, |
| "grad_norm": 0.24837176501750946, |
| "learning_rate": 7.790000000000001e-05, |
| "loss": 0.0252, |
| "step": 780 |
| }, |
| { |
| "epoch": 4.1798941798941796, |
| "grad_norm": 0.3547382950782776, |
| "learning_rate": 7.890000000000001e-05, |
| "loss": 0.0241, |
| "step": 790 |
| }, |
| { |
| "epoch": 4.232804232804233, |
| "grad_norm": 0.32745105028152466, |
| "learning_rate": 7.99e-05, |
| "loss": 0.0238, |
| "step": 800 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "grad_norm": 0.3326016366481781, |
| "learning_rate": 8.090000000000001e-05, |
| "loss": 0.023, |
| "step": 810 |
| }, |
| { |
| "epoch": 4.338624338624339, |
| "grad_norm": 0.3203228712081909, |
| "learning_rate": 8.19e-05, |
| "loss": 0.0239, |
| "step": 820 |
| }, |
| { |
| "epoch": 4.391534391534392, |
| "grad_norm": 0.31203749775886536, |
| "learning_rate": 8.29e-05, |
| "loss": 0.0216, |
| "step": 830 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 0.31825166940689087, |
| "learning_rate": 8.39e-05, |
| "loss": 0.0233, |
| "step": 840 |
| }, |
| { |
| "epoch": 4.497354497354498, |
| "grad_norm": 0.3035692572593689, |
| "learning_rate": 8.49e-05, |
| "loss": 0.0262, |
| "step": 850 |
| }, |
| { |
| "epoch": 4.550264550264551, |
| "grad_norm": 0.32904911041259766, |
| "learning_rate": 8.59e-05, |
| "loss": 0.0243, |
| "step": 860 |
| }, |
| { |
| "epoch": 4.603174603174603, |
| "grad_norm": 0.2948061227798462, |
| "learning_rate": 8.69e-05, |
| "loss": 0.0241, |
| "step": 870 |
| }, |
| { |
| "epoch": 4.656084656084656, |
| "grad_norm": 0.28630343079566956, |
| "learning_rate": 8.790000000000001e-05, |
| "loss": 0.0244, |
| "step": 880 |
| }, |
| { |
| "epoch": 4.708994708994709, |
| "grad_norm": 0.36151307821273804, |
| "learning_rate": 8.89e-05, |
| "loss": 0.0247, |
| "step": 890 |
| }, |
| { |
| "epoch": 4.761904761904762, |
| "grad_norm": 0.3373434543609619, |
| "learning_rate": 8.99e-05, |
| "loss": 0.0245, |
| "step": 900 |
| }, |
| { |
| "epoch": 4.814814814814815, |
| "grad_norm": 0.3014078736305237, |
| "learning_rate": 9.090000000000001e-05, |
| "loss": 0.0244, |
| "step": 910 |
| }, |
| { |
| "epoch": 4.867724867724868, |
| "grad_norm": 0.3851953148841858, |
| "learning_rate": 9.190000000000001e-05, |
| "loss": 0.0232, |
| "step": 920 |
| }, |
| { |
| "epoch": 4.920634920634921, |
| "grad_norm": 0.2808712124824524, |
| "learning_rate": 9.290000000000001e-05, |
| "loss": 0.0265, |
| "step": 930 |
| }, |
| { |
| "epoch": 4.973544973544973, |
| "grad_norm": 0.2859858274459839, |
| "learning_rate": 9.39e-05, |
| "loss": 0.0237, |
| "step": 940 |
| }, |
| { |
| "epoch": 5.026455026455026, |
| "grad_norm": 0.31913334131240845, |
| "learning_rate": 9.49e-05, |
| "loss": 0.0245, |
| "step": 950 |
| }, |
| { |
| "epoch": 5.079365079365079, |
| "grad_norm": 0.3442356288433075, |
| "learning_rate": 9.59e-05, |
| "loss": 0.0257, |
| "step": 960 |
| }, |
| { |
| "epoch": 5.132275132275132, |
| "grad_norm": 0.406459242105484, |
| "learning_rate": 9.69e-05, |
| "loss": 0.0254, |
| "step": 970 |
| }, |
| { |
| "epoch": 5.185185185185185, |
| "grad_norm": 0.34004539251327515, |
| "learning_rate": 9.790000000000001e-05, |
| "loss": 0.024, |
| "step": 980 |
| }, |
| { |
| "epoch": 5.238095238095238, |
| "grad_norm": 0.3003678321838379, |
| "learning_rate": 9.89e-05, |
| "loss": 0.0248, |
| "step": 990 |
| }, |
| { |
| "epoch": 5.291005291005291, |
| "grad_norm": 0.3703750967979431, |
| "learning_rate": 9.99e-05, |
| "loss": 0.0239, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 20000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 106, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 128, |
| "trial_name": null, |
| "trial_params": null |
| } |