{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 2040,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024509803921568627,
"grad_norm": 0.29983004927635193,
"learning_rate": 0.00019950980392156864,
"loss": 1.497,
"step": 10
},
{
"epoch": 0.049019607843137254,
"grad_norm": 0.22434405982494354,
"learning_rate": 0.00019901960784313727,
"loss": 1.0531,
"step": 20
},
{
"epoch": 0.07352941176470588,
"grad_norm": 0.1899292916059494,
"learning_rate": 0.0001985294117647059,
"loss": 0.9282,
"step": 30
},
{
"epoch": 0.09803921568627451,
"grad_norm": 0.21063339710235596,
"learning_rate": 0.00019803921568627454,
"loss": 0.9509,
"step": 40
},
{
"epoch": 0.12254901960784313,
"grad_norm": 0.2021014541387558,
"learning_rate": 0.00019754901960784314,
"loss": 0.9195,
"step": 50
},
{
"epoch": 0.14705882352941177,
"grad_norm": 0.22847646474838257,
"learning_rate": 0.00019705882352941177,
"loss": 0.9314,
"step": 60
},
{
"epoch": 0.1715686274509804,
"grad_norm": 0.24809524416923523,
"learning_rate": 0.00019656862745098038,
"loss": 0.8545,
"step": 70
},
{
"epoch": 0.19607843137254902,
"grad_norm": 0.23628589510917664,
"learning_rate": 0.000196078431372549,
"loss": 0.8714,
"step": 80
},
{
"epoch": 0.22058823529411764,
"grad_norm": 0.25221529603004456,
"learning_rate": 0.00019558823529411764,
"loss": 0.8254,
"step": 90
},
{
"epoch": 0.24509803921568626,
"grad_norm": 0.25446635484695435,
"learning_rate": 0.00019509803921568628,
"loss": 0.8309,
"step": 100
},
{
"epoch": 0.2696078431372549,
"grad_norm": 0.23954473435878754,
"learning_rate": 0.0001946078431372549,
"loss": 0.879,
"step": 110
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.2512606382369995,
"learning_rate": 0.00019411764705882354,
"loss": 0.8101,
"step": 120
},
{
"epoch": 0.31862745098039214,
"grad_norm": 0.24768000841140747,
"learning_rate": 0.00019362745098039217,
"loss": 0.8346,
"step": 130
},
{
"epoch": 0.3431372549019608,
"grad_norm": 0.24737860262393951,
"learning_rate": 0.0001931372549019608,
"loss": 0.8123,
"step": 140
},
{
"epoch": 0.36764705882352944,
"grad_norm": 0.2855224013328552,
"learning_rate": 0.00019264705882352944,
"loss": 0.7965,
"step": 150
},
{
"epoch": 0.39215686274509803,
"grad_norm": 0.27062663435935974,
"learning_rate": 0.00019215686274509807,
"loss": 0.8108,
"step": 160
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.2578269839286804,
"learning_rate": 0.00019166666666666667,
"loss": 0.809,
"step": 170
},
{
"epoch": 0.4411764705882353,
"grad_norm": 0.2555166482925415,
"learning_rate": 0.0001911764705882353,
"loss": 0.7826,
"step": 180
},
{
"epoch": 0.46568627450980393,
"grad_norm": 0.3456018567085266,
"learning_rate": 0.00019068627450980394,
"loss": 0.7771,
"step": 190
},
{
"epoch": 0.49019607843137253,
"grad_norm": 0.2838132381439209,
"learning_rate": 0.00019019607843137254,
"loss": 0.7505,
"step": 200
},
{
"epoch": 0.5147058823529411,
"grad_norm": 0.2752726078033447,
"learning_rate": 0.00018970588235294117,
"loss": 0.7402,
"step": 210
},
{
"epoch": 0.5392156862745098,
"grad_norm": 0.26294729113578796,
"learning_rate": 0.0001892156862745098,
"loss": 0.7353,
"step": 220
},
{
"epoch": 0.5637254901960784,
"grad_norm": 0.28479790687561035,
"learning_rate": 0.00018872549019607844,
"loss": 0.8024,
"step": 230
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.2948661148548126,
"learning_rate": 0.00018823529411764707,
"loss": 0.7628,
"step": 240
},
{
"epoch": 0.6127450980392157,
"grad_norm": 0.26690128445625305,
"learning_rate": 0.0001877450980392157,
"loss": 0.75,
"step": 250
},
{
"epoch": 0.6372549019607843,
"grad_norm": 0.2684984803199768,
"learning_rate": 0.00018725490196078433,
"loss": 0.7458,
"step": 260
},
{
"epoch": 0.6617647058823529,
"grad_norm": 0.24935846030712128,
"learning_rate": 0.00018676470588235297,
"loss": 0.7901,
"step": 270
},
{
"epoch": 0.6862745098039216,
"grad_norm": 0.26486936211586,
"learning_rate": 0.00018627450980392157,
"loss": 0.7356,
"step": 280
},
{
"epoch": 0.7107843137254902,
"grad_norm": 0.2598109245300293,
"learning_rate": 0.0001857843137254902,
"loss": 0.7749,
"step": 290
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.29170921444892883,
"learning_rate": 0.00018529411764705883,
"loss": 0.7322,
"step": 300
},
{
"epoch": 0.7598039215686274,
"grad_norm": 0.26423102617263794,
"learning_rate": 0.00018480392156862747,
"loss": 0.7544,
"step": 310
},
{
"epoch": 0.7843137254901961,
"grad_norm": 0.29086926579475403,
"learning_rate": 0.00018431372549019607,
"loss": 0.7452,
"step": 320
},
{
"epoch": 0.8088235294117647,
"grad_norm": 0.2983661890029907,
"learning_rate": 0.0001838235294117647,
"loss": 0.7263,
"step": 330
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.27282992005348206,
"learning_rate": 0.00018333333333333334,
"loss": 0.739,
"step": 340
},
{
"epoch": 0.8578431372549019,
"grad_norm": 0.2726730704307556,
"learning_rate": 0.00018284313725490197,
"loss": 0.7134,
"step": 350
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.282355397939682,
"learning_rate": 0.0001823529411764706,
"loss": 0.7545,
"step": 360
},
{
"epoch": 0.9068627450980392,
"grad_norm": 0.2755967080593109,
"learning_rate": 0.00018186274509803923,
"loss": 0.7286,
"step": 370
},
{
"epoch": 0.9313725490196079,
"grad_norm": 0.27661100029945374,
"learning_rate": 0.00018137254901960786,
"loss": 0.7363,
"step": 380
},
{
"epoch": 0.9558823529411765,
"grad_norm": 0.24934948980808258,
"learning_rate": 0.00018088235294117647,
"loss": 0.7397,
"step": 390
},
{
"epoch": 0.9803921568627451,
"grad_norm": 0.28986600041389465,
"learning_rate": 0.0001803921568627451,
"loss": 0.722,
"step": 400
},
{
"epoch": 1.0049019607843137,
"grad_norm": 0.24967212975025177,
"learning_rate": 0.00017990196078431373,
"loss": 0.7431,
"step": 410
},
{
"epoch": 1.0294117647058822,
"grad_norm": 0.3122069537639618,
"learning_rate": 0.00017941176470588236,
"loss": 0.6769,
"step": 420
},
{
"epoch": 1.053921568627451,
"grad_norm": 0.27327316999435425,
"learning_rate": 0.000178921568627451,
"loss": 0.6854,
"step": 430
},
{
"epoch": 1.0784313725490196,
"grad_norm": 0.2995717227458954,
"learning_rate": 0.00017843137254901963,
"loss": 0.6592,
"step": 440
},
{
"epoch": 1.1029411764705883,
"grad_norm": 0.2795763313770294,
"learning_rate": 0.00017794117647058823,
"loss": 0.6666,
"step": 450
},
{
"epoch": 1.1274509803921569,
"grad_norm": 0.2783352732658386,
"learning_rate": 0.00017745098039215687,
"loss": 0.6626,
"step": 460
},
{
"epoch": 1.1519607843137254,
"grad_norm": 0.29723408818244934,
"learning_rate": 0.0001769607843137255,
"loss": 0.6811,
"step": 470
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.30281776189804077,
"learning_rate": 0.00017647058823529413,
"loss": 0.6673,
"step": 480
},
{
"epoch": 1.2009803921568627,
"grad_norm": 0.32884112000465393,
"learning_rate": 0.00017598039215686276,
"loss": 0.7036,
"step": 490
},
{
"epoch": 1.2254901960784315,
"grad_norm": 0.30529946088790894,
"learning_rate": 0.00017549019607843137,
"loss": 0.6559,
"step": 500
},
{
"epoch": 1.25,
"grad_norm": 0.32612255215644836,
"learning_rate": 0.000175,
"loss": 0.6739,
"step": 510
},
{
"epoch": 1.2745098039215685,
"grad_norm": 0.28662651777267456,
"learning_rate": 0.00017450980392156863,
"loss": 0.6675,
"step": 520
},
{
"epoch": 1.2990196078431373,
"grad_norm": 0.30719125270843506,
"learning_rate": 0.00017401960784313726,
"loss": 0.7133,
"step": 530
},
{
"epoch": 1.3235294117647058,
"grad_norm": 0.29701197147369385,
"learning_rate": 0.0001735294117647059,
"loss": 0.6542,
"step": 540
},
{
"epoch": 1.3480392156862746,
"grad_norm": 0.274860143661499,
"learning_rate": 0.00017303921568627453,
"loss": 0.7098,
"step": 550
},
{
"epoch": 1.3725490196078431,
"grad_norm": 0.3022995889186859,
"learning_rate": 0.00017254901960784316,
"loss": 0.6648,
"step": 560
},
{
"epoch": 1.3970588235294117,
"grad_norm": 0.2775422930717468,
"learning_rate": 0.0001720588235294118,
"loss": 0.645,
"step": 570
},
{
"epoch": 1.4215686274509804,
"grad_norm": 0.3129810392856598,
"learning_rate": 0.0001715686274509804,
"loss": 0.6922,
"step": 580
},
{
"epoch": 1.446078431372549,
"grad_norm": 0.2952588200569153,
"learning_rate": 0.00017107843137254903,
"loss": 0.6353,
"step": 590
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.2810933589935303,
"learning_rate": 0.00017058823529411766,
"loss": 0.6504,
"step": 600
},
{
"epoch": 1.4950980392156863,
"grad_norm": 0.3011665940284729,
"learning_rate": 0.0001700980392156863,
"loss": 0.6622,
"step": 610
},
{
"epoch": 1.5196078431372548,
"grad_norm": 0.2970154583454132,
"learning_rate": 0.0001696078431372549,
"loss": 0.6587,
"step": 620
},
{
"epoch": 1.5441176470588234,
"grad_norm": 0.2876601815223694,
"learning_rate": 0.00016911764705882353,
"loss": 0.6887,
"step": 630
},
{
"epoch": 1.5686274509803921,
"grad_norm": 0.29214441776275635,
"learning_rate": 0.00016862745098039216,
"loss": 0.6893,
"step": 640
},
{
"epoch": 1.593137254901961,
"grad_norm": 0.29072946310043335,
"learning_rate": 0.0001681372549019608,
"loss": 0.6752,
"step": 650
},
{
"epoch": 1.6176470588235294,
"grad_norm": 0.30002114176750183,
"learning_rate": 0.00016764705882352942,
"loss": 0.6521,
"step": 660
},
{
"epoch": 1.642156862745098,
"grad_norm": 0.2819446921348572,
"learning_rate": 0.00016715686274509806,
"loss": 0.6673,
"step": 670
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.2847635746002197,
"learning_rate": 0.0001666666666666667,
"loss": 0.6597,
"step": 680
},
{
"epoch": 1.6911764705882353,
"grad_norm": 0.28922468423843384,
"learning_rate": 0.00016617647058823532,
"loss": 0.6467,
"step": 690
},
{
"epoch": 1.715686274509804,
"grad_norm": 0.29009920358657837,
"learning_rate": 0.00016568627450980395,
"loss": 0.6578,
"step": 700
},
{
"epoch": 1.7401960784313726,
"grad_norm": 0.29140380024909973,
"learning_rate": 0.00016519607843137256,
"loss": 0.6206,
"step": 710
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.3021414279937744,
"learning_rate": 0.0001647058823529412,
"loss": 0.6396,
"step": 720
},
{
"epoch": 1.7892156862745097,
"grad_norm": 0.3210904002189636,
"learning_rate": 0.0001642156862745098,
"loss": 0.6672,
"step": 730
},
{
"epoch": 1.8137254901960784,
"grad_norm": 0.28844767808914185,
"learning_rate": 0.00016372549019607843,
"loss": 0.6476,
"step": 740
},
{
"epoch": 1.8382352941176472,
"grad_norm": 0.28307870030403137,
"learning_rate": 0.00016323529411764706,
"loss": 0.6587,
"step": 750
},
{
"epoch": 1.8627450980392157,
"grad_norm": 0.2879963219165802,
"learning_rate": 0.0001627450980392157,
"loss": 0.6213,
"step": 760
},
{
"epoch": 1.8872549019607843,
"grad_norm": 0.31524136662483215,
"learning_rate": 0.00016225490196078432,
"loss": 0.6479,
"step": 770
},
{
"epoch": 1.9117647058823528,
"grad_norm": 0.3042920231819153,
"learning_rate": 0.00016176470588235295,
"loss": 0.6445,
"step": 780
},
{
"epoch": 1.9362745098039216,
"grad_norm": 0.29315751791000366,
"learning_rate": 0.00016127450980392159,
"loss": 0.6675,
"step": 790
},
{
"epoch": 1.9607843137254903,
"grad_norm": 0.31360578536987305,
"learning_rate": 0.00016078431372549022,
"loss": 0.6447,
"step": 800
},
{
"epoch": 1.9852941176470589,
"grad_norm": 0.320044606924057,
"learning_rate": 0.00016029411764705885,
"loss": 0.6097,
"step": 810
},
{
"epoch": 2.0098039215686274,
"grad_norm": 0.29187172651290894,
"learning_rate": 0.00015980392156862746,
"loss": 0.6706,
"step": 820
},
{
"epoch": 2.034313725490196,
"grad_norm": 0.3306867778301239,
"learning_rate": 0.0001593137254901961,
"loss": 0.607,
"step": 830
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.32475340366363525,
"learning_rate": 0.0001588235294117647,
"loss": 0.5815,
"step": 840
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.32624176144599915,
"learning_rate": 0.00015833333333333332,
"loss": 0.5838,
"step": 850
},
{
"epoch": 2.107843137254902,
"grad_norm": 0.33117762207984924,
"learning_rate": 0.00015784313725490196,
"loss": 0.5956,
"step": 860
},
{
"epoch": 2.1323529411764706,
"grad_norm": 0.3294385075569153,
"learning_rate": 0.0001573529411764706,
"loss": 0.618,
"step": 870
},
{
"epoch": 2.156862745098039,
"grad_norm": 0.3245352506637573,
"learning_rate": 0.00015686274509803922,
"loss": 0.5636,
"step": 880
},
{
"epoch": 2.1813725490196076,
"grad_norm": 0.32185912132263184,
"learning_rate": 0.00015637254901960785,
"loss": 0.5796,
"step": 890
},
{
"epoch": 2.2058823529411766,
"grad_norm": 0.34521356225013733,
"learning_rate": 0.00015588235294117648,
"loss": 0.5985,
"step": 900
},
{
"epoch": 2.230392156862745,
"grad_norm": 0.33166056871414185,
"learning_rate": 0.00015539215686274512,
"loss": 0.5951,
"step": 910
},
{
"epoch": 2.2549019607843137,
"grad_norm": 0.34369799494743347,
"learning_rate": 0.00015490196078431375,
"loss": 0.6044,
"step": 920
},
{
"epoch": 2.2794117647058822,
"grad_norm": 0.3320542871952057,
"learning_rate": 0.00015441176470588238,
"loss": 0.5903,
"step": 930
},
{
"epoch": 2.303921568627451,
"grad_norm": 0.34061846137046814,
"learning_rate": 0.00015392156862745098,
"loss": 0.5792,
"step": 940
},
{
"epoch": 2.3284313725490198,
"grad_norm": 0.3528592586517334,
"learning_rate": 0.00015343137254901962,
"loss": 0.5867,
"step": 950
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.33795326948165894,
"learning_rate": 0.00015294117647058822,
"loss": 0.5529,
"step": 960
},
{
"epoch": 2.377450980392157,
"grad_norm": 0.33023008704185486,
"learning_rate": 0.00015245098039215685,
"loss": 0.6306,
"step": 970
},
{
"epoch": 2.4019607843137254,
"grad_norm": 0.3331562578678131,
"learning_rate": 0.00015196078431372549,
"loss": 0.605,
"step": 980
},
{
"epoch": 2.426470588235294,
"grad_norm": 0.32606229186058044,
"learning_rate": 0.00015147058823529412,
"loss": 0.5815,
"step": 990
},
{
"epoch": 2.450980392156863,
"grad_norm": 0.3443799614906311,
"learning_rate": 0.00015098039215686275,
"loss": 0.5813,
"step": 1000
},
{
"epoch": 2.4754901960784315,
"grad_norm": 0.3231131434440613,
"learning_rate": 0.00015049019607843138,
"loss": 0.5854,
"step": 1010
},
{
"epoch": 2.5,
"grad_norm": 0.31584519147872925,
"learning_rate": 0.00015000000000000001,
"loss": 0.5808,
"step": 1020
},
{
"epoch": 2.5245098039215685,
"grad_norm": 0.32220789790153503,
"learning_rate": 0.00014950980392156865,
"loss": 0.5972,
"step": 1030
},
{
"epoch": 2.549019607843137,
"grad_norm": 0.33140599727630615,
"learning_rate": 0.00014901960784313728,
"loss": 0.5811,
"step": 1040
},
{
"epoch": 2.5735294117647056,
"grad_norm": 0.32509833574295044,
"learning_rate": 0.00014852941176470588,
"loss": 0.5961,
"step": 1050
},
{
"epoch": 2.5980392156862746,
"grad_norm": 0.32181331515312195,
"learning_rate": 0.00014803921568627451,
"loss": 0.5778,
"step": 1060
},
{
"epoch": 2.622549019607843,
"grad_norm": 0.33203810453414917,
"learning_rate": 0.00014754901960784315,
"loss": 0.6196,
"step": 1070
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.3285435140132904,
"learning_rate": 0.00014705882352941178,
"loss": 0.6089,
"step": 1080
},
{
"epoch": 2.6715686274509802,
"grad_norm": 0.29427462816238403,
"learning_rate": 0.00014656862745098038,
"loss": 0.5616,
"step": 1090
},
{
"epoch": 2.696078431372549,
"grad_norm": 0.36723700165748596,
"learning_rate": 0.00014607843137254902,
"loss": 0.5664,
"step": 1100
},
{
"epoch": 2.7205882352941178,
"grad_norm": 0.32784491777420044,
"learning_rate": 0.00014558823529411765,
"loss": 0.5627,
"step": 1110
},
{
"epoch": 2.7450980392156863,
"grad_norm": 0.33356621861457825,
"learning_rate": 0.00014509803921568628,
"loss": 0.5789,
"step": 1120
},
{
"epoch": 2.769607843137255,
"grad_norm": 0.3453757166862488,
"learning_rate": 0.0001446078431372549,
"loss": 0.5839,
"step": 1130
},
{
"epoch": 2.7941176470588234,
"grad_norm": 0.3244710862636566,
"learning_rate": 0.00014411764705882354,
"loss": 0.6138,
"step": 1140
},
{
"epoch": 2.818627450980392,
"grad_norm": 0.36777082085609436,
"learning_rate": 0.00014362745098039218,
"loss": 0.5741,
"step": 1150
},
{
"epoch": 2.843137254901961,
"grad_norm": 0.3188980221748352,
"learning_rate": 0.00014313725490196078,
"loss": 0.5625,
"step": 1160
},
{
"epoch": 2.8676470588235294,
"grad_norm": 0.32817542552948,
"learning_rate": 0.0001426470588235294,
"loss": 0.5788,
"step": 1170
},
{
"epoch": 2.892156862745098,
"grad_norm": 0.3249094784259796,
"learning_rate": 0.00014215686274509804,
"loss": 0.6042,
"step": 1180
},
{
"epoch": 2.9166666666666665,
"grad_norm": 0.34067729115486145,
"learning_rate": 0.00014166666666666668,
"loss": 0.5706,
"step": 1190
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.3162751793861389,
"learning_rate": 0.0001411764705882353,
"loss": 0.5599,
"step": 1200
},
{
"epoch": 2.965686274509804,
"grad_norm": 0.3370027244091034,
"learning_rate": 0.00014068627450980394,
"loss": 0.5967,
"step": 1210
},
{
"epoch": 2.9901960784313726,
"grad_norm": 0.3651111423969269,
"learning_rate": 0.00014019607843137255,
"loss": 0.5765,
"step": 1220
},
{
"epoch": 3.014705882352941,
"grad_norm": 0.3368529975414276,
"learning_rate": 0.00013970588235294118,
"loss": 0.5559,
"step": 1230
},
{
"epoch": 3.0392156862745097,
"grad_norm": 0.366672158241272,
"learning_rate": 0.0001392156862745098,
"loss": 0.5062,
"step": 1240
},
{
"epoch": 3.063725490196078,
"grad_norm": 0.406984806060791,
"learning_rate": 0.00013872549019607844,
"loss": 0.499,
"step": 1250
},
{
"epoch": 3.088235294117647,
"grad_norm": 0.3747471570968628,
"learning_rate": 0.00013823529411764707,
"loss": 0.5064,
"step": 1260
},
{
"epoch": 3.1127450980392157,
"grad_norm": 0.3869905471801758,
"learning_rate": 0.00013774509803921568,
"loss": 0.5477,
"step": 1270
},
{
"epoch": 3.1372549019607843,
"grad_norm": 0.40752360224723816,
"learning_rate": 0.0001372549019607843,
"loss": 0.4915,
"step": 1280
},
{
"epoch": 3.161764705882353,
"grad_norm": 0.3453763723373413,
"learning_rate": 0.00013676470588235294,
"loss": 0.5037,
"step": 1290
},
{
"epoch": 3.186274509803922,
"grad_norm": 0.3654847741127014,
"learning_rate": 0.00013627450980392157,
"loss": 0.5138,
"step": 1300
},
{
"epoch": 3.2107843137254903,
"grad_norm": 0.3541817367076874,
"learning_rate": 0.0001357843137254902,
"loss": 0.5071,
"step": 1310
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.3879326581954956,
"learning_rate": 0.00013529411764705884,
"loss": 0.5058,
"step": 1320
},
{
"epoch": 3.2598039215686274,
"grad_norm": 0.37032464146614075,
"learning_rate": 0.00013480392156862747,
"loss": 0.5105,
"step": 1330
},
{
"epoch": 3.284313725490196,
"grad_norm": 0.39336174726486206,
"learning_rate": 0.00013431372549019608,
"loss": 0.5222,
"step": 1340
},
{
"epoch": 3.3088235294117645,
"grad_norm": 0.36094290018081665,
"learning_rate": 0.0001338235294117647,
"loss": 0.5298,
"step": 1350
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.3758329153060913,
"learning_rate": 0.00013333333333333334,
"loss": 0.4963,
"step": 1360
},
{
"epoch": 3.357843137254902,
"grad_norm": 0.39011090993881226,
"learning_rate": 0.00013284313725490197,
"loss": 0.517,
"step": 1370
},
{
"epoch": 3.3823529411764706,
"grad_norm": 0.37309834361076355,
"learning_rate": 0.0001323529411764706,
"loss": 0.5451,
"step": 1380
},
{
"epoch": 3.406862745098039,
"grad_norm": 0.3810550272464752,
"learning_rate": 0.0001318627450980392,
"loss": 0.5285,
"step": 1390
},
{
"epoch": 3.431372549019608,
"grad_norm": 0.3513835668563843,
"learning_rate": 0.00013137254901960784,
"loss": 0.521,
"step": 1400
},
{
"epoch": 3.4558823529411766,
"grad_norm": 0.3686577081680298,
"learning_rate": 0.00013088235294117647,
"loss": 0.5392,
"step": 1410
},
{
"epoch": 3.480392156862745,
"grad_norm": 0.35927894711494446,
"learning_rate": 0.0001303921568627451,
"loss": 0.5271,
"step": 1420
},
{
"epoch": 3.5049019607843137,
"grad_norm": 0.3643144369125366,
"learning_rate": 0.00012990196078431374,
"loss": 0.5284,
"step": 1430
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.40783992409706116,
"learning_rate": 0.00012941176470588237,
"loss": 0.5085,
"step": 1440
},
{
"epoch": 3.553921568627451,
"grad_norm": 0.370568186044693,
"learning_rate": 0.000128921568627451,
"loss": 0.4918,
"step": 1450
},
{
"epoch": 3.5784313725490198,
"grad_norm": 0.38452836871147156,
"learning_rate": 0.00012843137254901963,
"loss": 0.5084,
"step": 1460
},
{
"epoch": 3.6029411764705883,
"grad_norm": 0.38461601734161377,
"learning_rate": 0.00012794117647058824,
"loss": 0.528,
"step": 1470
},
{
"epoch": 3.627450980392157,
"grad_norm": 0.378743052482605,
"learning_rate": 0.00012745098039215687,
"loss": 0.5476,
"step": 1480
},
{
"epoch": 3.6519607843137254,
"grad_norm": 0.3925258219242096,
"learning_rate": 0.0001269607843137255,
"loss": 0.5262,
"step": 1490
},
{
"epoch": 3.6764705882352944,
"grad_norm": 0.3843972384929657,
"learning_rate": 0.0001264705882352941,
"loss": 0.5142,
"step": 1500
},
{
"epoch": 3.700980392156863,
"grad_norm": 0.3720986247062683,
"learning_rate": 0.00012598039215686274,
"loss": 0.51,
"step": 1510
},
{
"epoch": 3.7254901960784315,
"grad_norm": 0.38532596826553345,
"learning_rate": 0.00012549019607843137,
"loss": 0.5019,
"step": 1520
},
{
"epoch": 3.75,
"grad_norm": 0.39480453729629517,
"learning_rate": 0.000125,
"loss": 0.528,
"step": 1530
},
{
"epoch": 3.7745098039215685,
"grad_norm": 0.40592941641807556,
"learning_rate": 0.00012450980392156863,
"loss": 0.5345,
"step": 1540
},
{
"epoch": 3.799019607843137,
"grad_norm": 0.9979096055030823,
"learning_rate": 0.00012401960784313727,
"loss": 0.505,
"step": 1550
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.38043680787086487,
"learning_rate": 0.0001235294117647059,
"loss": 0.5057,
"step": 1560
},
{
"epoch": 3.8480392156862746,
"grad_norm": 0.37059956789016724,
"learning_rate": 0.00012303921568627453,
"loss": 0.5252,
"step": 1570
},
{
"epoch": 3.872549019607843,
"grad_norm": 0.3633560240268707,
"learning_rate": 0.00012254901960784316,
"loss": 0.4987,
"step": 1580
},
{
"epoch": 3.8970588235294117,
"grad_norm": 0.4094372093677521,
"learning_rate": 0.00012205882352941178,
"loss": 0.5553,
"step": 1590
},
{
"epoch": 3.9215686274509802,
"grad_norm": 0.3960098326206207,
"learning_rate": 0.00012156862745098039,
"loss": 0.5435,
"step": 1600
},
{
"epoch": 3.946078431372549,
"grad_norm": 0.3884652256965637,
"learning_rate": 0.00012107843137254902,
"loss": 0.5114,
"step": 1610
},
{
"epoch": 3.9705882352941178,
"grad_norm": 0.366059809923172,
"learning_rate": 0.00012058823529411765,
"loss": 0.5084,
"step": 1620
},
{
"epoch": 3.9950980392156863,
"grad_norm": 0.3853819966316223,
"learning_rate": 0.00012009803921568628,
"loss": 0.4987,
"step": 1630
},
{
"epoch": 4.019607843137255,
"grad_norm": 0.3792478144168854,
"learning_rate": 0.0001196078431372549,
"loss": 0.4352,
"step": 1640
},
{
"epoch": 4.044117647058823,
"grad_norm": 0.3791177570819855,
"learning_rate": 0.00011911764705882353,
"loss": 0.4378,
"step": 1650
},
{
"epoch": 4.068627450980392,
"grad_norm": 0.4122741222381592,
"learning_rate": 0.00011862745098039216,
"loss": 0.4167,
"step": 1660
},
{
"epoch": 4.0931372549019605,
"grad_norm": 0.41166090965270996,
"learning_rate": 0.0001181372549019608,
"loss": 0.4355,
"step": 1670
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.37315306067466736,
"learning_rate": 0.00011764705882352942,
"loss": 0.4347,
"step": 1680
},
{
"epoch": 4.142156862745098,
"grad_norm": 0.4119997024536133,
"learning_rate": 0.00011715686274509805,
"loss": 0.4396,
"step": 1690
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.41811785101890564,
"learning_rate": 0.00011666666666666668,
"loss": 0.4374,
"step": 1700
},
{
"epoch": 4.1911764705882355,
"grad_norm": 0.4639507532119751,
"learning_rate": 0.00011617647058823531,
"loss": 0.4688,
"step": 1710
},
{
"epoch": 4.215686274509804,
"grad_norm": 0.40909409523010254,
"learning_rate": 0.00011568627450980394,
"loss": 0.4428,
"step": 1720
},
{
"epoch": 4.240196078431373,
"grad_norm": 0.4439013600349426,
"learning_rate": 0.00011519607843137255,
"loss": 0.4478,
"step": 1730
},
{
"epoch": 4.264705882352941,
"grad_norm": 0.4124948978424072,
"learning_rate": 0.00011470588235294118,
"loss": 0.4317,
"step": 1740
},
{
"epoch": 4.28921568627451,
"grad_norm": 0.4522900879383087,
"learning_rate": 0.0001142156862745098,
"loss": 0.4586,
"step": 1750
},
{
"epoch": 4.313725490196078,
"grad_norm": 0.41997721791267395,
"learning_rate": 0.00011372549019607843,
"loss": 0.4598,
"step": 1760
},
{
"epoch": 4.338235294117647,
"grad_norm": 0.3984828591346741,
"learning_rate": 0.00011323529411764706,
"loss": 0.4482,
"step": 1770
},
{
"epoch": 4.362745098039215,
"grad_norm": 0.41868138313293457,
"learning_rate": 0.0001127450980392157,
"loss": 0.4431,
"step": 1780
},
{
"epoch": 4.387254901960785,
"grad_norm": 0.447293221950531,
"learning_rate": 0.00011225490196078433,
"loss": 0.4434,
"step": 1790
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.3910238742828369,
"learning_rate": 0.00011176470588235294,
"loss": 0.4463,
"step": 1800
},
{
"epoch": 4.436274509803922,
"grad_norm": 0.40496933460235596,
"learning_rate": 0.00011127450980392158,
"loss": 0.4793,
"step": 1810
},
{
"epoch": 4.46078431372549,
"grad_norm": 0.4138587415218353,
"learning_rate": 0.00011078431372549021,
"loss": 0.4417,
"step": 1820
},
{
"epoch": 4.485294117647059,
"grad_norm": 0.4713083803653717,
"learning_rate": 0.00011029411764705884,
"loss": 0.454,
"step": 1830
},
{
"epoch": 4.509803921568627,
"grad_norm": 0.45029592514038086,
"learning_rate": 0.00010980392156862746,
"loss": 0.4504,
"step": 1840
},
{
"epoch": 4.534313725490196,
"grad_norm": 0.42069247364997864,
"learning_rate": 0.00010931372549019608,
"loss": 0.4794,
"step": 1850
},
{
"epoch": 4.5588235294117645,
"grad_norm": 0.3917437195777893,
"learning_rate": 0.0001088235294117647,
"loss": 0.4502,
"step": 1860
},
{
"epoch": 4.583333333333333,
"grad_norm": 0.417987585067749,
"learning_rate": 0.00010833333333333333,
"loss": 0.4579,
"step": 1870
},
{
"epoch": 4.607843137254902,
"grad_norm": 0.4511590301990509,
"learning_rate": 0.00010784313725490196,
"loss": 0.4686,
"step": 1880
},
{
"epoch": 4.632352941176471,
"grad_norm": 0.4555961787700653,
"learning_rate": 0.00010735294117647059,
"loss": 0.4332,
"step": 1890
},
{
"epoch": 4.6568627450980395,
"grad_norm": 0.4159266948699951,
"learning_rate": 0.00010686274509803922,
"loss": 0.4721,
"step": 1900
},
{
"epoch": 4.681372549019608,
"grad_norm": 0.4237167239189148,
"learning_rate": 0.00010637254901960784,
"loss": 0.4542,
"step": 1910
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.42502954602241516,
"learning_rate": 0.00010588235294117647,
"loss": 0.4568,
"step": 1920
},
{
"epoch": 4.730392156862745,
"grad_norm": 0.4060966968536377,
"learning_rate": 0.00010539215686274511,
"loss": 0.4588,
"step": 1930
},
{
"epoch": 4.754901960784314,
"grad_norm": 0.444376140832901,
"learning_rate": 0.00010490196078431374,
"loss": 0.4617,
"step": 1940
},
{
"epoch": 4.779411764705882,
"grad_norm": 0.41235941648483276,
"learning_rate": 0.00010441176470588237,
"loss": 0.4655,
"step": 1950
},
{
"epoch": 4.803921568627451,
"grad_norm": 0.42703354358673096,
"learning_rate": 0.00010392156862745099,
"loss": 0.4643,
"step": 1960
},
{
"epoch": 4.828431372549019,
"grad_norm": 0.4282451272010803,
"learning_rate": 0.00010343137254901962,
"loss": 0.4435,
"step": 1970
},
{
"epoch": 4.852941176470588,
"grad_norm": 0.40952056646347046,
"learning_rate": 0.00010294117647058823,
"loss": 0.4571,
"step": 1980
},
{
"epoch": 4.877450980392156,
"grad_norm": 0.41904792189598083,
"learning_rate": 0.00010245098039215686,
"loss": 0.4361,
"step": 1990
},
{
"epoch": 4.901960784313726,
"grad_norm": 0.4027288258075714,
"learning_rate": 0.00010196078431372549,
"loss": 0.4507,
"step": 2000
},
{
"epoch": 4.926470588235294,
"grad_norm": 0.4230017364025116,
"learning_rate": 0.00010147058823529412,
"loss": 0.4645,
"step": 2010
},
{
"epoch": 4.950980392156863,
"grad_norm": 0.44512614607810974,
"learning_rate": 0.00010098039215686274,
"loss": 0.4585,
"step": 2020
},
{
"epoch": 4.9754901960784315,
"grad_norm": 0.4242990016937256,
"learning_rate": 0.00010049019607843137,
"loss": 0.4785,
"step": 2030
},
{
"epoch": 5.0,
"grad_norm": 0.4472719132900238,
"learning_rate": 0.0001,
"loss": 0.4635,
"step": 2040
}
],
"logging_steps": 10,
"max_steps": 4080,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.231691259706409e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}