{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 2040,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.024509803921568627,
      "grad_norm": 0.29983004927635193,
      "learning_rate": 0.00019950980392156864,
      "loss": 1.497,
      "step": 10
    },
    {
      "epoch": 0.049019607843137254,
      "grad_norm": 0.22434405982494354,
      "learning_rate": 0.00019901960784313727,
      "loss": 1.0531,
      "step": 20
    },
    {
      "epoch": 0.07352941176470588,
      "grad_norm": 0.1899292916059494,
      "learning_rate": 0.0001985294117647059,
      "loss": 0.9282,
      "step": 30
    },
    {
      "epoch": 0.09803921568627451,
      "grad_norm": 0.21063339710235596,
      "learning_rate": 0.00019803921568627454,
      "loss": 0.9509,
      "step": 40
    },
    {
      "epoch": 0.12254901960784313,
      "grad_norm": 0.2021014541387558,
      "learning_rate": 0.00019754901960784314,
      "loss": 0.9195,
      "step": 50
    },
    {
      "epoch": 0.14705882352941177,
      "grad_norm": 0.22847646474838257,
      "learning_rate": 0.00019705882352941177,
      "loss": 0.9314,
      "step": 60
    },
    {
      "epoch": 0.1715686274509804,
      "grad_norm": 0.24809524416923523,
      "learning_rate": 0.00019656862745098038,
      "loss": 0.8545,
      "step": 70
    },
    {
      "epoch": 0.19607843137254902,
      "grad_norm": 0.23628589510917664,
      "learning_rate": 0.000196078431372549,
      "loss": 0.8714,
      "step": 80
    },
    {
      "epoch": 0.22058823529411764,
      "grad_norm": 0.25221529603004456,
      "learning_rate": 0.00019558823529411764,
      "loss": 0.8254,
      "step": 90
    },
    {
      "epoch": 0.24509803921568626,
      "grad_norm": 0.25446635484695435,
      "learning_rate": 0.00019509803921568628,
      "loss": 0.8309,
      "step": 100
    },
    {
      "epoch": 0.2696078431372549,
      "grad_norm": 0.23954473435878754,
      "learning_rate": 0.0001946078431372549,
      "loss": 0.879,
      "step": 110
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 0.2512606382369995,
      "learning_rate": 0.00019411764705882354,
      "loss": 0.8101,
      "step": 120
    },
    {
      "epoch": 0.31862745098039214,
      "grad_norm": 0.24768000841140747,
      "learning_rate": 0.00019362745098039217,
      "loss": 0.8346,
      "step": 130
    },
    {
      "epoch": 0.3431372549019608,
      "grad_norm": 0.24737860262393951,
      "learning_rate": 0.0001931372549019608,
      "loss": 0.8123,
      "step": 140
    },
    {
      "epoch": 0.36764705882352944,
      "grad_norm": 0.2855224013328552,
      "learning_rate": 0.00019264705882352944,
      "loss": 0.7965,
      "step": 150
    },
    {
      "epoch": 0.39215686274509803,
      "grad_norm": 0.27062663435935974,
      "learning_rate": 0.00019215686274509807,
      "loss": 0.8108,
      "step": 160
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 0.2578269839286804,
      "learning_rate": 0.00019166666666666667,
      "loss": 0.809,
      "step": 170
    },
    {
      "epoch": 0.4411764705882353,
      "grad_norm": 0.2555166482925415,
      "learning_rate": 0.0001911764705882353,
      "loss": 0.7826,
      "step": 180
    },
    {
      "epoch": 0.46568627450980393,
      "grad_norm": 0.3456018567085266,
      "learning_rate": 0.00019068627450980394,
      "loss": 0.7771,
      "step": 190
    },
    {
      "epoch": 0.49019607843137253,
      "grad_norm": 0.2838132381439209,
      "learning_rate": 0.00019019607843137254,
      "loss": 0.7505,
      "step": 200
    },
    {
      "epoch": 0.5147058823529411,
      "grad_norm": 0.2752726078033447,
      "learning_rate": 0.00018970588235294117,
      "loss": 0.7402,
      "step": 210
    },
    {
      "epoch": 0.5392156862745098,
      "grad_norm": 0.26294729113578796,
      "learning_rate": 0.0001892156862745098,
      "loss": 0.7353,
      "step": 220
    },
    {
      "epoch": 0.5637254901960784,
      "grad_norm": 0.28479790687561035,
      "learning_rate": 0.00018872549019607844,
      "loss": 0.8024,
      "step": 230
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 0.2948661148548126,
      "learning_rate": 0.00018823529411764707,
      "loss": 0.7628,
      "step": 240
    },
    {
      "epoch": 0.6127450980392157,
      "grad_norm": 0.26690128445625305,
      "learning_rate": 0.0001877450980392157,
      "loss": 0.75,
      "step": 250
    },
    {
      "epoch": 0.6372549019607843,
      "grad_norm": 0.2684984803199768,
      "learning_rate": 0.00018725490196078433,
      "loss": 0.7458,
      "step": 260
    },
    {
      "epoch": 0.6617647058823529,
      "grad_norm": 0.24935846030712128,
      "learning_rate": 0.00018676470588235297,
      "loss": 0.7901,
      "step": 270
    },
    {
      "epoch": 0.6862745098039216,
      "grad_norm": 0.26486936211586,
      "learning_rate": 0.00018627450980392157,
      "loss": 0.7356,
      "step": 280
    },
    {
      "epoch": 0.7107843137254902,
      "grad_norm": 0.2598109245300293,
      "learning_rate": 0.0001857843137254902,
      "loss": 0.7749,
      "step": 290
    },
    {
      "epoch": 0.7352941176470589,
      "grad_norm": 0.29170921444892883,
      "learning_rate": 0.00018529411764705883,
      "loss": 0.7322,
      "step": 300
    },
    {
      "epoch": 0.7598039215686274,
      "grad_norm": 0.26423102617263794,
      "learning_rate": 0.00018480392156862747,
      "loss": 0.7544,
      "step": 310
    },
    {
      "epoch": 0.7843137254901961,
      "grad_norm": 0.29086926579475403,
      "learning_rate": 0.00018431372549019607,
      "loss": 0.7452,
      "step": 320
    },
    {
      "epoch": 0.8088235294117647,
      "grad_norm": 0.2983661890029907,
      "learning_rate": 0.0001838235294117647,
      "loss": 0.7263,
      "step": 330
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 0.27282992005348206,
      "learning_rate": 0.00018333333333333334,
      "loss": 0.739,
      "step": 340
    },
    {
      "epoch": 0.8578431372549019,
      "grad_norm": 0.2726730704307556,
      "learning_rate": 0.00018284313725490197,
      "loss": 0.7134,
      "step": 350
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 0.282355397939682,
      "learning_rate": 0.0001823529411764706,
      "loss": 0.7545,
      "step": 360
    },
    {
      "epoch": 0.9068627450980392,
      "grad_norm": 0.2755967080593109,
      "learning_rate": 0.00018186274509803923,
      "loss": 0.7286,
      "step": 370
    },
    {
      "epoch": 0.9313725490196079,
      "grad_norm": 0.27661100029945374,
      "learning_rate": 0.00018137254901960786,
      "loss": 0.7363,
      "step": 380
    },
    {
      "epoch": 0.9558823529411765,
      "grad_norm": 0.24934948980808258,
      "learning_rate": 0.00018088235294117647,
      "loss": 0.7397,
      "step": 390
    },
    {
      "epoch": 0.9803921568627451,
      "grad_norm": 0.28986600041389465,
      "learning_rate": 0.0001803921568627451,
      "loss": 0.722,
      "step": 400
    },
    {
      "epoch": 1.0049019607843137,
      "grad_norm": 0.24967212975025177,
      "learning_rate": 0.00017990196078431373,
      "loss": 0.7431,
      "step": 410
    },
    {
      "epoch": 1.0294117647058822,
      "grad_norm": 0.3122069537639618,
      "learning_rate": 0.00017941176470588236,
      "loss": 0.6769,
      "step": 420
    },
    {
      "epoch": 1.053921568627451,
      "grad_norm": 0.27327316999435425,
      "learning_rate": 0.000178921568627451,
      "loss": 0.6854,
      "step": 430
    },
    {
      "epoch": 1.0784313725490196,
      "grad_norm": 0.2995717227458954,
      "learning_rate": 0.00017843137254901963,
      "loss": 0.6592,
      "step": 440
    },
    {
      "epoch": 1.1029411764705883,
      "grad_norm": 0.2795763313770294,
      "learning_rate": 0.00017794117647058823,
      "loss": 0.6666,
      "step": 450
    },
    {
      "epoch": 1.1274509803921569,
      "grad_norm": 0.2783352732658386,
      "learning_rate": 0.00017745098039215687,
      "loss": 0.6626,
      "step": 460
    },
    {
      "epoch": 1.1519607843137254,
      "grad_norm": 0.29723408818244934,
      "learning_rate": 0.0001769607843137255,
      "loss": 0.6811,
      "step": 470
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 0.30281776189804077,
      "learning_rate": 0.00017647058823529413,
      "loss": 0.6673,
      "step": 480
    },
    {
      "epoch": 1.2009803921568627,
      "grad_norm": 0.32884112000465393,
      "learning_rate": 0.00017598039215686276,
      "loss": 0.7036,
      "step": 490
    },
    {
      "epoch": 1.2254901960784315,
      "grad_norm": 0.30529946088790894,
      "learning_rate": 0.00017549019607843137,
      "loss": 0.6559,
      "step": 500
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.32612255215644836,
      "learning_rate": 0.000175,
      "loss": 0.6739,
      "step": 510
    },
    {
      "epoch": 1.2745098039215685,
      "grad_norm": 0.28662651777267456,
      "learning_rate": 0.00017450980392156863,
      "loss": 0.6675,
      "step": 520
    },
    {
      "epoch": 1.2990196078431373,
      "grad_norm": 0.30719125270843506,
      "learning_rate": 0.00017401960784313726,
      "loss": 0.7133,
      "step": 530
    },
    {
      "epoch": 1.3235294117647058,
      "grad_norm": 0.29701197147369385,
      "learning_rate": 0.0001735294117647059,
      "loss": 0.6542,
      "step": 540
    },
    {
      "epoch": 1.3480392156862746,
      "grad_norm": 0.274860143661499,
      "learning_rate": 0.00017303921568627453,
      "loss": 0.7098,
      "step": 550
    },
    {
      "epoch": 1.3725490196078431,
      "grad_norm": 0.3022995889186859,
      "learning_rate": 0.00017254901960784316,
      "loss": 0.6648,
      "step": 560
    },
    {
      "epoch": 1.3970588235294117,
      "grad_norm": 0.2775422930717468,
      "learning_rate": 0.0001720588235294118,
      "loss": 0.645,
      "step": 570
    },
    {
      "epoch": 1.4215686274509804,
      "grad_norm": 0.3129810392856598,
      "learning_rate": 0.0001715686274509804,
      "loss": 0.6922,
      "step": 580
    },
    {
      "epoch": 1.446078431372549,
      "grad_norm": 0.2952588200569153,
      "learning_rate": 0.00017107843137254903,
      "loss": 0.6353,
      "step": 590
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 0.2810933589935303,
      "learning_rate": 0.00017058823529411766,
      "loss": 0.6504,
      "step": 600
    },
    {
      "epoch": 1.4950980392156863,
      "grad_norm": 0.3011665940284729,
      "learning_rate": 0.0001700980392156863,
      "loss": 0.6622,
      "step": 610
    },
    {
      "epoch": 1.5196078431372548,
      "grad_norm": 0.2970154583454132,
      "learning_rate": 0.0001696078431372549,
      "loss": 0.6587,
      "step": 620
    },
    {
      "epoch": 1.5441176470588234,
      "grad_norm": 0.2876601815223694,
      "learning_rate": 0.00016911764705882353,
      "loss": 0.6887,
      "step": 630
    },
    {
      "epoch": 1.5686274509803921,
      "grad_norm": 0.29214441776275635,
      "learning_rate": 0.00016862745098039216,
      "loss": 0.6893,
      "step": 640
    },
    {
      "epoch": 1.593137254901961,
      "grad_norm": 0.29072946310043335,
      "learning_rate": 0.0001681372549019608,
      "loss": 0.6752,
      "step": 650
    },
    {
      "epoch": 1.6176470588235294,
      "grad_norm": 0.30002114176750183,
      "learning_rate": 0.00016764705882352942,
      "loss": 0.6521,
      "step": 660
    },
    {
      "epoch": 1.642156862745098,
      "grad_norm": 0.2819446921348572,
      "learning_rate": 0.00016715686274509806,
      "loss": 0.6673,
      "step": 670
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.2847635746002197,
      "learning_rate": 0.0001666666666666667,
      "loss": 0.6597,
      "step": 680
    },
    {
      "epoch": 1.6911764705882353,
      "grad_norm": 0.28922468423843384,
      "learning_rate": 0.00016617647058823532,
      "loss": 0.6467,
      "step": 690
    },
    {
      "epoch": 1.715686274509804,
      "grad_norm": 0.29009920358657837,
      "learning_rate": 0.00016568627450980395,
      "loss": 0.6578,
      "step": 700
    },
    {
      "epoch": 1.7401960784313726,
      "grad_norm": 0.29140380024909973,
      "learning_rate": 0.00016519607843137256,
      "loss": 0.6206,
      "step": 710
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 0.3021414279937744,
      "learning_rate": 0.0001647058823529412,
      "loss": 0.6396,
      "step": 720
    },
    {
      "epoch": 1.7892156862745097,
      "grad_norm": 0.3210904002189636,
      "learning_rate": 0.0001642156862745098,
      "loss": 0.6672,
      "step": 730
    },
    {
      "epoch": 1.8137254901960784,
      "grad_norm": 0.28844767808914185,
      "learning_rate": 0.00016372549019607843,
      "loss": 0.6476,
      "step": 740
    },
    {
      "epoch": 1.8382352941176472,
      "grad_norm": 0.28307870030403137,
      "learning_rate": 0.00016323529411764706,
      "loss": 0.6587,
      "step": 750
    },
    {
      "epoch": 1.8627450980392157,
      "grad_norm": 0.2879963219165802,
      "learning_rate": 0.0001627450980392157,
      "loss": 0.6213,
      "step": 760
    },
    {
      "epoch": 1.8872549019607843,
      "grad_norm": 0.31524136662483215,
      "learning_rate": 0.00016225490196078432,
      "loss": 0.6479,
      "step": 770
    },
    {
      "epoch": 1.9117647058823528,
      "grad_norm": 0.3042920231819153,
      "learning_rate": 0.00016176470588235295,
      "loss": 0.6445,
      "step": 780
    },
    {
      "epoch": 1.9362745098039216,
      "grad_norm": 0.29315751791000366,
      "learning_rate": 0.00016127450980392159,
      "loss": 0.6675,
      "step": 790
    },
    {
      "epoch": 1.9607843137254903,
      "grad_norm": 0.31360578536987305,
      "learning_rate": 0.00016078431372549022,
      "loss": 0.6447,
      "step": 800
    },
    {
      "epoch": 1.9852941176470589,
      "grad_norm": 0.320044606924057,
      "learning_rate": 0.00016029411764705885,
      "loss": 0.6097,
      "step": 810
    },
    {
      "epoch": 2.0098039215686274,
      "grad_norm": 0.29187172651290894,
      "learning_rate": 0.00015980392156862746,
      "loss": 0.6706,
      "step": 820
    },
    {
      "epoch": 2.034313725490196,
      "grad_norm": 0.3306867778301239,
      "learning_rate": 0.0001593137254901961,
      "loss": 0.607,
      "step": 830
    },
    {
      "epoch": 2.0588235294117645,
      "grad_norm": 0.32475340366363525,
      "learning_rate": 0.0001588235294117647,
      "loss": 0.5815,
      "step": 840
    },
    {
      "epoch": 2.0833333333333335,
      "grad_norm": 0.32624176144599915,
      "learning_rate": 0.00015833333333333332,
      "loss": 0.5838,
      "step": 850
    },
    {
      "epoch": 2.107843137254902,
      "grad_norm": 0.33117762207984924,
      "learning_rate": 0.00015784313725490196,
      "loss": 0.5956,
      "step": 860
    },
    {
      "epoch": 2.1323529411764706,
      "grad_norm": 0.3294385075569153,
      "learning_rate": 0.0001573529411764706,
      "loss": 0.618,
      "step": 870
    },
    {
      "epoch": 2.156862745098039,
      "grad_norm": 0.3245352506637573,
      "learning_rate": 0.00015686274509803922,
      "loss": 0.5636,
      "step": 880
    },
    {
      "epoch": 2.1813725490196076,
      "grad_norm": 0.32185912132263184,
      "learning_rate": 0.00015637254901960785,
      "loss": 0.5796,
      "step": 890
    },
    {
      "epoch": 2.2058823529411766,
      "grad_norm": 0.34521356225013733,
      "learning_rate": 0.00015588235294117648,
      "loss": 0.5985,
      "step": 900
    },
    {
      "epoch": 2.230392156862745,
      "grad_norm": 0.33166056871414185,
      "learning_rate": 0.00015539215686274512,
      "loss": 0.5951,
      "step": 910
    },
    {
      "epoch": 2.2549019607843137,
      "grad_norm": 0.34369799494743347,
      "learning_rate": 0.00015490196078431375,
      "loss": 0.6044,
      "step": 920
    },
    {
      "epoch": 2.2794117647058822,
      "grad_norm": 0.3320542871952057,
      "learning_rate": 0.00015441176470588238,
      "loss": 0.5903,
      "step": 930
    },
    {
      "epoch": 2.303921568627451,
      "grad_norm": 0.34061846137046814,
      "learning_rate": 0.00015392156862745098,
      "loss": 0.5792,
      "step": 940
    },
    {
      "epoch": 2.3284313725490198,
      "grad_norm": 0.3528592586517334,
      "learning_rate": 0.00015343137254901962,
      "loss": 0.5867,
      "step": 950
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 0.33795326948165894,
      "learning_rate": 0.00015294117647058822,
      "loss": 0.5529,
      "step": 960
    },
    {
      "epoch": 2.377450980392157,
      "grad_norm": 0.33023008704185486,
      "learning_rate": 0.00015245098039215685,
      "loss": 0.6306,
      "step": 970
    },
    {
      "epoch": 2.4019607843137254,
      "grad_norm": 0.3331562578678131,
      "learning_rate": 0.00015196078431372549,
      "loss": 0.605,
      "step": 980
    },
    {
      "epoch": 2.426470588235294,
      "grad_norm": 0.32606229186058044,
      "learning_rate": 0.00015147058823529412,
      "loss": 0.5815,
      "step": 990
    },
    {
      "epoch": 2.450980392156863,
      "grad_norm": 0.3443799614906311,
      "learning_rate": 0.00015098039215686275,
      "loss": 0.5813,
      "step": 1000
    },
    {
      "epoch": 2.4754901960784315,
      "grad_norm": 0.3231131434440613,
      "learning_rate": 0.00015049019607843138,
      "loss": 0.5854,
      "step": 1010
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.31584519147872925,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.5808,
      "step": 1020
    },
    {
      "epoch": 2.5245098039215685,
      "grad_norm": 0.32220789790153503,
      "learning_rate": 0.00014950980392156865,
      "loss": 0.5972,
      "step": 1030
    },
    {
      "epoch": 2.549019607843137,
      "grad_norm": 0.33140599727630615,
      "learning_rate": 0.00014901960784313728,
      "loss": 0.5811,
      "step": 1040
    },
    {
      "epoch": 2.5735294117647056,
      "grad_norm": 0.32509833574295044,
      "learning_rate": 0.00014852941176470588,
      "loss": 0.5961,
      "step": 1050
    },
    {
      "epoch": 2.5980392156862746,
      "grad_norm": 0.32181331515312195,
      "learning_rate": 0.00014803921568627451,
      "loss": 0.5778,
      "step": 1060
    },
    {
      "epoch": 2.622549019607843,
      "grad_norm": 0.33203810453414917,
      "learning_rate": 0.00014754901960784315,
      "loss": 0.6196,
      "step": 1070
    },
    {
      "epoch": 2.6470588235294117,
      "grad_norm": 0.3285435140132904,
      "learning_rate": 0.00014705882352941178,
      "loss": 0.6089,
      "step": 1080
    },
    {
      "epoch": 2.6715686274509802,
      "grad_norm": 0.29427462816238403,
      "learning_rate": 0.00014656862745098038,
      "loss": 0.5616,
      "step": 1090
    },
    {
      "epoch": 2.696078431372549,
      "grad_norm": 0.36723700165748596,
      "learning_rate": 0.00014607843137254902,
      "loss": 0.5664,
      "step": 1100
    },
    {
      "epoch": 2.7205882352941178,
      "grad_norm": 0.32784491777420044,
      "learning_rate": 0.00014558823529411765,
      "loss": 0.5627,
      "step": 1110
    },
    {
      "epoch": 2.7450980392156863,
      "grad_norm": 0.33356621861457825,
      "learning_rate": 0.00014509803921568628,
      "loss": 0.5789,
      "step": 1120
    },
    {
      "epoch": 2.769607843137255,
      "grad_norm": 0.3453757166862488,
      "learning_rate": 0.0001446078431372549,
      "loss": 0.5839,
      "step": 1130
    },
    {
      "epoch": 2.7941176470588234,
      "grad_norm": 0.3244710862636566,
      "learning_rate": 0.00014411764705882354,
      "loss": 0.6138,
      "step": 1140
    },
    {
      "epoch": 2.818627450980392,
      "grad_norm": 0.36777082085609436,
      "learning_rate": 0.00014362745098039218,
      "loss": 0.5741,
      "step": 1150
    },
    {
      "epoch": 2.843137254901961,
      "grad_norm": 0.3188980221748352,
      "learning_rate": 0.00014313725490196078,
      "loss": 0.5625,
      "step": 1160
    },
    {
      "epoch": 2.8676470588235294,
      "grad_norm": 0.32817542552948,
      "learning_rate": 0.0001426470588235294,
      "loss": 0.5788,
      "step": 1170
    },
    {
      "epoch": 2.892156862745098,
      "grad_norm": 0.3249094784259796,
      "learning_rate": 0.00014215686274509804,
      "loss": 0.6042,
      "step": 1180
    },
    {
      "epoch": 2.9166666666666665,
      "grad_norm": 0.34067729115486145,
      "learning_rate": 0.00014166666666666668,
      "loss": 0.5706,
      "step": 1190
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 0.3162751793861389,
      "learning_rate": 0.0001411764705882353,
      "loss": 0.5599,
      "step": 1200
    },
    {
      "epoch": 2.965686274509804,
      "grad_norm": 0.3370027244091034,
      "learning_rate": 0.00014068627450980394,
      "loss": 0.5967,
      "step": 1210
    },
    {
      "epoch": 2.9901960784313726,
      "grad_norm": 0.3651111423969269,
      "learning_rate": 0.00014019607843137255,
      "loss": 0.5765,
      "step": 1220
    },
    {
      "epoch": 3.014705882352941,
      "grad_norm": 0.3368529975414276,
      "learning_rate": 0.00013970588235294118,
      "loss": 0.5559,
      "step": 1230
    },
    {
      "epoch": 3.0392156862745097,
      "grad_norm": 0.366672158241272,
      "learning_rate": 0.0001392156862745098,
      "loss": 0.5062,
      "step": 1240
    },
    {
      "epoch": 3.063725490196078,
      "grad_norm": 0.406984806060791,
      "learning_rate": 0.00013872549019607844,
      "loss": 0.499,
      "step": 1250
    },
    {
      "epoch": 3.088235294117647,
      "grad_norm": 0.3747471570968628,
      "learning_rate": 0.00013823529411764707,
      "loss": 0.5064,
      "step": 1260
    },
    {
      "epoch": 3.1127450980392157,
      "grad_norm": 0.3869905471801758,
      "learning_rate": 0.00013774509803921568,
      "loss": 0.5477,
      "step": 1270
    },
    {
      "epoch": 3.1372549019607843,
      "grad_norm": 0.40752360224723816,
      "learning_rate": 0.0001372549019607843,
      "loss": 0.4915,
      "step": 1280
    },
    {
      "epoch": 3.161764705882353,
      "grad_norm": 0.3453763723373413,
      "learning_rate": 0.00013676470588235294,
      "loss": 0.5037,
      "step": 1290
    },
    {
      "epoch": 3.186274509803922,
      "grad_norm": 0.3654847741127014,
      "learning_rate": 0.00013627450980392157,
      "loss": 0.5138,
      "step": 1300
    },
    {
      "epoch": 3.2107843137254903,
      "grad_norm": 0.3541817367076874,
      "learning_rate": 0.0001357843137254902,
      "loss": 0.5071,
      "step": 1310
    },
    {
      "epoch": 3.235294117647059,
      "grad_norm": 0.3879326581954956,
      "learning_rate": 0.00013529411764705884,
      "loss": 0.5058,
      "step": 1320
    },
    {
      "epoch": 3.2598039215686274,
      "grad_norm": 0.37032464146614075,
      "learning_rate": 0.00013480392156862747,
      "loss": 0.5105,
      "step": 1330
    },
    {
      "epoch": 3.284313725490196,
      "grad_norm": 0.39336174726486206,
      "learning_rate": 0.00013431372549019608,
      "loss": 0.5222,
      "step": 1340
    },
    {
      "epoch": 3.3088235294117645,
      "grad_norm": 0.36094290018081665,
      "learning_rate": 0.0001338235294117647,
      "loss": 0.5298,
      "step": 1350
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.3758329153060913,
      "learning_rate": 0.00013333333333333334,
      "loss": 0.4963,
      "step": 1360
    },
    {
      "epoch": 3.357843137254902,
      "grad_norm": 0.39011090993881226,
      "learning_rate": 0.00013284313725490197,
      "loss": 0.517,
      "step": 1370
    },
    {
      "epoch": 3.3823529411764706,
      "grad_norm": 0.37309834361076355,
      "learning_rate": 0.0001323529411764706,
      "loss": 0.5451,
      "step": 1380
    },
    {
      "epoch": 3.406862745098039,
      "grad_norm": 0.3810550272464752,
      "learning_rate": 0.0001318627450980392,
      "loss": 0.5285,
      "step": 1390
    },
    {
      "epoch": 3.431372549019608,
      "grad_norm": 0.3513835668563843,
      "learning_rate": 0.00013137254901960784,
      "loss": 0.521,
      "step": 1400
    },
    {
      "epoch": 3.4558823529411766,
      "grad_norm": 0.3686577081680298,
      "learning_rate": 0.00013088235294117647,
      "loss": 0.5392,
      "step": 1410
    },
    {
      "epoch": 3.480392156862745,
      "grad_norm": 0.35927894711494446,
      "learning_rate": 0.0001303921568627451,
      "loss": 0.5271,
      "step": 1420
    },
    {
      "epoch": 3.5049019607843137,
      "grad_norm": 0.3643144369125366,
      "learning_rate": 0.00012990196078431374,
      "loss": 0.5284,
      "step": 1430
    },
    {
      "epoch": 3.5294117647058822,
      "grad_norm": 0.40783992409706116,
      "learning_rate": 0.00012941176470588237,
      "loss": 0.5085,
      "step": 1440
    },
    {
      "epoch": 3.553921568627451,
      "grad_norm": 0.370568186044693,
      "learning_rate": 0.000128921568627451,
      "loss": 0.4918,
      "step": 1450
    },
    {
      "epoch": 3.5784313725490198,
      "grad_norm": 0.38452836871147156,
      "learning_rate": 0.00012843137254901963,
      "loss": 0.5084,
      "step": 1460
    },
    {
      "epoch": 3.6029411764705883,
      "grad_norm": 0.38461601734161377,
      "learning_rate": 0.00012794117647058824,
      "loss": 0.528,
      "step": 1470
    },
    {
      "epoch": 3.627450980392157,
      "grad_norm": 0.378743052482605,
      "learning_rate": 0.00012745098039215687,
      "loss": 0.5476,
      "step": 1480
    },
    {
      "epoch": 3.6519607843137254,
      "grad_norm": 0.3925258219242096,
      "learning_rate": 0.0001269607843137255,
      "loss": 0.5262,
      "step": 1490
    },
    {
      "epoch": 3.6764705882352944,
      "grad_norm": 0.3843972384929657,
      "learning_rate": 0.0001264705882352941,
      "loss": 0.5142,
      "step": 1500
    },
    {
      "epoch": 3.700980392156863,
      "grad_norm": 0.3720986247062683,
      "learning_rate": 0.00012598039215686274,
      "loss": 0.51,
      "step": 1510
    },
    {
      "epoch": 3.7254901960784315,
      "grad_norm": 0.38532596826553345,
      "learning_rate": 0.00012549019607843137,
      "loss": 0.5019,
      "step": 1520
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.39480453729629517,
      "learning_rate": 0.000125,
      "loss": 0.528,
      "step": 1530
    },
    {
      "epoch": 3.7745098039215685,
      "grad_norm": 0.40592941641807556,
      "learning_rate": 0.00012450980392156863,
      "loss": 0.5345,
      "step": 1540
    },
    {
      "epoch": 3.799019607843137,
      "grad_norm": 0.9979096055030823,
      "learning_rate": 0.00012401960784313727,
      "loss": 0.505,
      "step": 1550
    },
    {
      "epoch": 3.8235294117647056,
      "grad_norm": 0.38043680787086487,
      "learning_rate": 0.0001235294117647059,
      "loss": 0.5057,
      "step": 1560
    },
    {
      "epoch": 3.8480392156862746,
      "grad_norm": 0.37059956789016724,
      "learning_rate": 0.00012303921568627453,
      "loss": 0.5252,
      "step": 1570
    },
    {
      "epoch": 3.872549019607843,
      "grad_norm": 0.3633560240268707,
      "learning_rate": 0.00012254901960784316,
      "loss": 0.4987,
      "step": 1580
    },
    {
      "epoch": 3.8970588235294117,
      "grad_norm": 0.4094372093677521,
      "learning_rate": 0.00012205882352941178,
      "loss": 0.5553,
      "step": 1590
    },
    {
      "epoch": 3.9215686274509802,
      "grad_norm": 0.3960098326206207,
      "learning_rate": 0.00012156862745098039,
      "loss": 0.5435,
      "step": 1600
    },
    {
      "epoch": 3.946078431372549,
      "grad_norm": 0.3884652256965637,
      "learning_rate": 0.00012107843137254902,
      "loss": 0.5114,
      "step": 1610
    },
    {
      "epoch": 3.9705882352941178,
      "grad_norm": 0.366059809923172,
      "learning_rate": 0.00012058823529411765,
      "loss": 0.5084,
      "step": 1620
    },
    {
      "epoch": 3.9950980392156863,
      "grad_norm": 0.3853819966316223,
      "learning_rate": 0.00012009803921568628,
      "loss": 0.4987,
      "step": 1630
    },
    {
      "epoch": 4.019607843137255,
      "grad_norm": 0.3792478144168854,
      "learning_rate": 0.0001196078431372549,
      "loss": 0.4352,
      "step": 1640
    },
    {
      "epoch": 4.044117647058823,
      "grad_norm": 0.3791177570819855,
      "learning_rate": 0.00011911764705882353,
      "loss": 0.4378,
      "step": 1650
    },
    {
      "epoch": 4.068627450980392,
      "grad_norm": 0.4122741222381592,
      "learning_rate": 0.00011862745098039216,
      "loss": 0.4167,
      "step": 1660
    },
    {
      "epoch": 4.0931372549019605,
      "grad_norm": 0.41166090965270996,
      "learning_rate": 0.0001181372549019608,
      "loss": 0.4355,
      "step": 1670
    },
    {
      "epoch": 4.117647058823529,
      "grad_norm": 0.37315306067466736,
      "learning_rate": 0.00011764705882352942,
      "loss": 0.4347,
      "step": 1680
    },
    {
      "epoch": 4.142156862745098,
      "grad_norm": 0.4119997024536133,
      "learning_rate": 0.00011715686274509805,
      "loss": 0.4396,
      "step": 1690
    },
    {
      "epoch": 4.166666666666667,
      "grad_norm": 0.41811785101890564,
      "learning_rate": 0.00011666666666666668,
      "loss": 0.4374,
      "step": 1700
    },
    {
      "epoch": 4.1911764705882355,
      "grad_norm": 0.4639507532119751,
      "learning_rate": 0.00011617647058823531,
      "loss": 0.4688,
      "step": 1710
    },
    {
      "epoch": 4.215686274509804,
      "grad_norm": 0.40909409523010254,
      "learning_rate": 0.00011568627450980394,
      "loss": 0.4428,
      "step": 1720
    },
    {
      "epoch": 4.240196078431373,
      "grad_norm": 0.4439013600349426,
      "learning_rate": 0.00011519607843137255,
      "loss": 0.4478,
      "step": 1730
    },
    {
      "epoch": 4.264705882352941,
      "grad_norm": 0.4124948978424072,
      "learning_rate": 0.00011470588235294118,
      "loss": 0.4317,
      "step": 1740
    },
    {
      "epoch": 4.28921568627451,
      "grad_norm": 0.4522900879383087,
      "learning_rate": 0.0001142156862745098,
      "loss": 0.4586,
      "step": 1750
    },
    {
      "epoch": 4.313725490196078,
      "grad_norm": 0.41997721791267395,
      "learning_rate": 0.00011372549019607843,
      "loss": 0.4598,
      "step": 1760
    },
    {
      "epoch": 4.338235294117647,
      "grad_norm": 0.3984828591346741,
      "learning_rate": 0.00011323529411764706,
      "loss": 0.4482,
      "step": 1770
    },
    {
      "epoch": 4.362745098039215,
      "grad_norm": 0.41868138313293457,
      "learning_rate": 0.0001127450980392157,
      "loss": 0.4431,
      "step": 1780
    },
    {
      "epoch": 4.387254901960785,
      "grad_norm": 0.447293221950531,
      "learning_rate": 0.00011225490196078433,
      "loss": 0.4434,
      "step": 1790
    },
    {
      "epoch": 4.411764705882353,
      "grad_norm": 0.3910238742828369,
      "learning_rate": 0.00011176470588235294,
      "loss": 0.4463,
      "step": 1800
    },
    {
      "epoch": 4.436274509803922,
      "grad_norm": 0.40496933460235596,
      "learning_rate": 0.00011127450980392158,
      "loss": 0.4793,
      "step": 1810
    },
    {
      "epoch": 4.46078431372549,
      "grad_norm": 0.4138587415218353,
      "learning_rate": 0.00011078431372549021,
      "loss": 0.4417,
      "step": 1820
    },
    {
      "epoch": 4.485294117647059,
      "grad_norm": 0.4713083803653717,
      "learning_rate": 0.00011029411764705884,
      "loss": 0.454,
      "step": 1830
    },
    {
      "epoch": 4.509803921568627,
      "grad_norm": 0.45029592514038086,
      "learning_rate": 0.00010980392156862746,
      "loss": 0.4504,
      "step": 1840
    },
    {
      "epoch": 4.534313725490196,
      "grad_norm": 0.42069247364997864,
      "learning_rate": 0.00010931372549019608,
      "loss": 0.4794,
      "step": 1850
    },
    {
      "epoch": 4.5588235294117645,
      "grad_norm": 0.3917437195777893,
      "learning_rate": 0.0001088235294117647,
      "loss": 0.4502,
      "step": 1860
    },
    {
      "epoch": 4.583333333333333,
      "grad_norm": 0.417987585067749,
      "learning_rate": 0.00010833333333333333,
      "loss": 0.4579,
      "step": 1870
    },
    {
      "epoch": 4.607843137254902,
      "grad_norm": 0.4511590301990509,
      "learning_rate": 0.00010784313725490196,
      "loss": 0.4686,
      "step": 1880
    },
    {
      "epoch": 4.632352941176471,
      "grad_norm": 0.4555961787700653,
      "learning_rate": 0.00010735294117647059,
      "loss": 0.4332,
      "step": 1890
    },
    {
      "epoch": 4.6568627450980395,
      "grad_norm": 0.4159266948699951,
      "learning_rate": 0.00010686274509803922,
      "loss": 0.4721,
      "step": 1900
    },
    {
      "epoch": 4.681372549019608,
      "grad_norm": 0.4237167239189148,
      "learning_rate": 0.00010637254901960784,
      "loss": 0.4542,
      "step": 1910
    },
    {
      "epoch": 4.705882352941177,
      "grad_norm": 0.42502954602241516,
      "learning_rate": 0.00010588235294117647,
      "loss": 0.4568,
      "step": 1920
    },
    {
      "epoch": 4.730392156862745,
      "grad_norm": 0.4060966968536377,
      "learning_rate": 0.00010539215686274511,
      "loss": 0.4588,
      "step": 1930
    },
    {
      "epoch": 4.754901960784314,
      "grad_norm": 0.444376140832901,
      "learning_rate": 0.00010490196078431374,
      "loss": 0.4617,
      "step": 1940
    },
    {
      "epoch": 4.779411764705882,
      "grad_norm": 0.41235941648483276,
      "learning_rate": 0.00010441176470588237,
      "loss": 0.4655,
      "step": 1950
    },
    {
      "epoch": 4.803921568627451,
      "grad_norm": 0.42703354358673096,
      "learning_rate": 0.00010392156862745099,
      "loss": 0.4643,
      "step": 1960
    },
    {
      "epoch": 4.828431372549019,
      "grad_norm": 0.4282451272010803,
      "learning_rate": 0.00010343137254901962,
      "loss": 0.4435,
      "step": 1970
    },
    {
      "epoch": 4.852941176470588,
      "grad_norm": 0.40952056646347046,
      "learning_rate": 0.00010294117647058823,
      "loss": 0.4571,
      "step": 1980
    },
    {
      "epoch": 4.877450980392156,
      "grad_norm": 0.41904792189598083,
      "learning_rate": 0.00010245098039215686,
      "loss": 0.4361,
      "step": 1990
    },
    {
      "epoch": 4.901960784313726,
      "grad_norm": 0.4027288258075714,
      "learning_rate": 0.00010196078431372549,
      "loss": 0.4507,
      "step": 2000
    },
    {
      "epoch": 4.926470588235294,
      "grad_norm": 0.4230017364025116,
      "learning_rate": 0.00010147058823529412,
      "loss": 0.4645,
      "step": 2010
    },
    {
      "epoch": 4.950980392156863,
      "grad_norm": 0.44512614607810974,
      "learning_rate": 0.00010098039215686274,
      "loss": 0.4585,
      "step": 2020
    },
    {
      "epoch": 4.9754901960784315,
      "grad_norm": 0.4242990016937256,
      "learning_rate": 0.00010049019607843137,
      "loss": 0.4785,
      "step": 2030
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.4472719132900238,
      "learning_rate": 0.0001,
      "loss": 0.4635,
      "step": 2040
    }
  ],
  "logging_steps": 10,
  "max_steps": 4080,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.231691259706409e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}