step_cot / trainer_state.json
salmannyu's picture
Upload folder using huggingface_hub
fa1352e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 5907,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005078720162519045,
"grad_norm": 8.11971664428711,
"learning_rate": 7.614213197969544e-08,
"loss": 0.755,
"step": 10
},
{
"epoch": 0.01015744032503809,
"grad_norm": 7.887208938598633,
"learning_rate": 1.607445008460237e-07,
"loss": 0.7436,
"step": 20
},
{
"epoch": 0.015236160487557136,
"grad_norm": 6.621643543243408,
"learning_rate": 2.4534686971235194e-07,
"loss": 0.7286,
"step": 30
},
{
"epoch": 0.02031488065007618,
"grad_norm": 5.338628768920898,
"learning_rate": 3.2994923857868026e-07,
"loss": 0.6637,
"step": 40
},
{
"epoch": 0.025393600812595226,
"grad_norm": 3.3166372776031494,
"learning_rate": 4.1455160744500853e-07,
"loss": 0.5849,
"step": 50
},
{
"epoch": 0.03047232097511427,
"grad_norm": 2.2426822185516357,
"learning_rate": 4.991539763113367e-07,
"loss": 0.5074,
"step": 60
},
{
"epoch": 0.03555104113763331,
"grad_norm": 1.4896293878555298,
"learning_rate": 5.83756345177665e-07,
"loss": 0.4381,
"step": 70
},
{
"epoch": 0.04062976130015236,
"grad_norm": 1.3547300100326538,
"learning_rate": 6.683587140439933e-07,
"loss": 0.3912,
"step": 80
},
{
"epoch": 0.0457084814626714,
"grad_norm": 1.2349778413772583,
"learning_rate": 7.529610829103214e-07,
"loss": 0.3717,
"step": 90
},
{
"epoch": 0.05078720162519045,
"grad_norm": 1.228826880455017,
"learning_rate": 8.375634517766498e-07,
"loss": 0.3763,
"step": 100
},
{
"epoch": 0.055865921787709494,
"grad_norm": 1.9437307119369507,
"learning_rate": 9.22165820642978e-07,
"loss": 0.3648,
"step": 110
},
{
"epoch": 0.06094464195022854,
"grad_norm": 1.1562706232070923,
"learning_rate": 1.0067681895093063e-06,
"loss": 0.3593,
"step": 120
},
{
"epoch": 0.06602336211274759,
"grad_norm": 1.2007368803024292,
"learning_rate": 1.0913705583756345e-06,
"loss": 0.3619,
"step": 130
},
{
"epoch": 0.07110208227526663,
"grad_norm": 1.1364222764968872,
"learning_rate": 1.1759729272419628e-06,
"loss": 0.3492,
"step": 140
},
{
"epoch": 0.07618080243778567,
"grad_norm": 1.2867300510406494,
"learning_rate": 1.2605752961082913e-06,
"loss": 0.359,
"step": 150
},
{
"epoch": 0.08125952260030472,
"grad_norm": 1.230358600616455,
"learning_rate": 1.3451776649746193e-06,
"loss": 0.3396,
"step": 160
},
{
"epoch": 0.08633824276282377,
"grad_norm": 1.1166342496871948,
"learning_rate": 1.4297800338409476e-06,
"loss": 0.3412,
"step": 170
},
{
"epoch": 0.0914169629253428,
"grad_norm": 1.1211029291152954,
"learning_rate": 1.5143824027072759e-06,
"loss": 0.3436,
"step": 180
},
{
"epoch": 0.09649568308786186,
"grad_norm": 1.0837715864181519,
"learning_rate": 1.5989847715736043e-06,
"loss": 0.3399,
"step": 190
},
{
"epoch": 0.1015744032503809,
"grad_norm": 1.1223688125610352,
"learning_rate": 1.6835871404399324e-06,
"loss": 0.3465,
"step": 200
},
{
"epoch": 0.10665312341289995,
"grad_norm": 1.1082217693328857,
"learning_rate": 1.7681895093062607e-06,
"loss": 0.3422,
"step": 210
},
{
"epoch": 0.11173184357541899,
"grad_norm": 1.1912903785705566,
"learning_rate": 1.852791878172589e-06,
"loss": 0.3336,
"step": 220
},
{
"epoch": 0.11681056373793804,
"grad_norm": 1.1215198040008545,
"learning_rate": 1.937394247038917e-06,
"loss": 0.3293,
"step": 230
},
{
"epoch": 0.12188928390045708,
"grad_norm": 1.1722122430801392,
"learning_rate": 2.0219966159052453e-06,
"loss": 0.3376,
"step": 240
},
{
"epoch": 0.12696800406297612,
"grad_norm": 1.1333527565002441,
"learning_rate": 2.1065989847715737e-06,
"loss": 0.342,
"step": 250
},
{
"epoch": 0.13204672422549518,
"grad_norm": 1.0237016677856445,
"learning_rate": 2.1912013536379022e-06,
"loss": 0.3322,
"step": 260
},
{
"epoch": 0.13712544438801422,
"grad_norm": 1.061033844947815,
"learning_rate": 2.2758037225042303e-06,
"loss": 0.3311,
"step": 270
},
{
"epoch": 0.14220416455053325,
"grad_norm": 1.1101399660110474,
"learning_rate": 2.3604060913705588e-06,
"loss": 0.3426,
"step": 280
},
{
"epoch": 0.14728288471305231,
"grad_norm": 1.1787445545196533,
"learning_rate": 2.445008460236887e-06,
"loss": 0.321,
"step": 290
},
{
"epoch": 0.15236160487557135,
"grad_norm": 1.0249688625335693,
"learning_rate": 2.5296108291032153e-06,
"loss": 0.328,
"step": 300
},
{
"epoch": 0.1574403250380904,
"grad_norm": 1.058312177658081,
"learning_rate": 2.6142131979695434e-06,
"loss": 0.3299,
"step": 310
},
{
"epoch": 0.16251904520060945,
"grad_norm": 1.0902348756790161,
"learning_rate": 2.698815566835872e-06,
"loss": 0.3192,
"step": 320
},
{
"epoch": 0.16759776536312848,
"grad_norm": 1.0917754173278809,
"learning_rate": 2.7834179357022e-06,
"loss": 0.3211,
"step": 330
},
{
"epoch": 0.17267648552564754,
"grad_norm": 1.0080076456069946,
"learning_rate": 2.8680203045685284e-06,
"loss": 0.3299,
"step": 340
},
{
"epoch": 0.17775520568816658,
"grad_norm": 1.2910374402999878,
"learning_rate": 2.952622673434856e-06,
"loss": 0.3286,
"step": 350
},
{
"epoch": 0.1828339258506856,
"grad_norm": 1.014559268951416,
"learning_rate": 3.0372250423011845e-06,
"loss": 0.3174,
"step": 360
},
{
"epoch": 0.18791264601320468,
"grad_norm": 1.0668742656707764,
"learning_rate": 3.121827411167513e-06,
"loss": 0.327,
"step": 370
},
{
"epoch": 0.1929913661757237,
"grad_norm": 1.0306109189987183,
"learning_rate": 3.206429780033841e-06,
"loss": 0.3266,
"step": 380
},
{
"epoch": 0.19807008633824277,
"grad_norm": 1.1934700012207031,
"learning_rate": 3.2910321489001695e-06,
"loss": 0.3288,
"step": 390
},
{
"epoch": 0.2031488065007618,
"grad_norm": 1.1355713605880737,
"learning_rate": 3.375634517766498e-06,
"loss": 0.3318,
"step": 400
},
{
"epoch": 0.20822752666328084,
"grad_norm": 1.0128185749053955,
"learning_rate": 3.460236886632826e-06,
"loss": 0.3153,
"step": 410
},
{
"epoch": 0.2133062468257999,
"grad_norm": 1.1666514873504639,
"learning_rate": 3.5448392554991545e-06,
"loss": 0.3249,
"step": 420
},
{
"epoch": 0.21838496698831894,
"grad_norm": 1.1061981916427612,
"learning_rate": 3.629441624365482e-06,
"loss": 0.3218,
"step": 430
},
{
"epoch": 0.22346368715083798,
"grad_norm": 1.1475132703781128,
"learning_rate": 3.7140439932318106e-06,
"loss": 0.3338,
"step": 440
},
{
"epoch": 0.22854240731335704,
"grad_norm": 1.1342549324035645,
"learning_rate": 3.798646362098139e-06,
"loss": 0.3251,
"step": 450
},
{
"epoch": 0.23362112747587607,
"grad_norm": 1.0099126100540161,
"learning_rate": 3.883248730964467e-06,
"loss": 0.3251,
"step": 460
},
{
"epoch": 0.23869984763839514,
"grad_norm": 1.112540602684021,
"learning_rate": 3.967851099830796e-06,
"loss": 0.3234,
"step": 470
},
{
"epoch": 0.24377856780091417,
"grad_norm": 1.0405648946762085,
"learning_rate": 4.052453468697124e-06,
"loss": 0.3255,
"step": 480
},
{
"epoch": 0.2488572879634332,
"grad_norm": 1.0701533555984497,
"learning_rate": 4.137055837563453e-06,
"loss": 0.3214,
"step": 490
},
{
"epoch": 0.25393600812595224,
"grad_norm": 1.2194687128067017,
"learning_rate": 4.22165820642978e-06,
"loss": 0.3221,
"step": 500
},
{
"epoch": 0.25393600812595224,
"eval_loss": 0.3276064395904541,
"eval_runtime": 121.7232,
"eval_samples_per_second": 41.077,
"eval_steps_per_second": 2.571,
"step": 500
},
{
"epoch": 0.25901472828847133,
"grad_norm": 1.0545361042022705,
"learning_rate": 4.306260575296109e-06,
"loss": 0.3102,
"step": 510
},
{
"epoch": 0.26409344845099036,
"grad_norm": 0.9262936115264893,
"learning_rate": 4.390862944162436e-06,
"loss": 0.3044,
"step": 520
},
{
"epoch": 0.2691721686135094,
"grad_norm": 1.0422135591506958,
"learning_rate": 4.475465313028765e-06,
"loss": 0.3178,
"step": 530
},
{
"epoch": 0.27425088877602843,
"grad_norm": 1.108899474143982,
"learning_rate": 4.560067681895093e-06,
"loss": 0.3224,
"step": 540
},
{
"epoch": 0.27932960893854747,
"grad_norm": 0.9598031044006348,
"learning_rate": 4.644670050761422e-06,
"loss": 0.3232,
"step": 550
},
{
"epoch": 0.2844083291010665,
"grad_norm": 1.0063800811767578,
"learning_rate": 4.72927241962775e-06,
"loss": 0.3224,
"step": 560
},
{
"epoch": 0.2894870492635856,
"grad_norm": 1.0170040130615234,
"learning_rate": 4.813874788494079e-06,
"loss": 0.3204,
"step": 570
},
{
"epoch": 0.29456576942610463,
"grad_norm": 1.1103383302688599,
"learning_rate": 4.898477157360406e-06,
"loss": 0.3249,
"step": 580
},
{
"epoch": 0.29964448958862366,
"grad_norm": 1.101906657218933,
"learning_rate": 4.983079526226735e-06,
"loss": 0.3185,
"step": 590
},
{
"epoch": 0.3047232097511427,
"grad_norm": 0.9216155409812927,
"learning_rate": 4.999972060477541e-06,
"loss": 0.3197,
"step": 600
},
{
"epoch": 0.30980192991366173,
"grad_norm": 1.0119545459747314,
"learning_rate": 4.999858557237848e-06,
"loss": 0.316,
"step": 610
},
{
"epoch": 0.3148806500761808,
"grad_norm": 0.9301111698150635,
"learning_rate": 4.999657748021748e-06,
"loss": 0.3256,
"step": 620
},
{
"epoch": 0.31995937023869986,
"grad_norm": 0.944416344165802,
"learning_rate": 4.999369639842375e-06,
"loss": 0.3253,
"step": 630
},
{
"epoch": 0.3250380904012189,
"grad_norm": 0.856289803981781,
"learning_rate": 4.998994242761724e-06,
"loss": 0.3142,
"step": 640
},
{
"epoch": 0.33011681056373793,
"grad_norm": 1.0146931409835815,
"learning_rate": 4.998531569890301e-06,
"loss": 0.3117,
"step": 650
},
{
"epoch": 0.33519553072625696,
"grad_norm": 0.859649121761322,
"learning_rate": 4.997981637386663e-06,
"loss": 0.3089,
"step": 660
},
{
"epoch": 0.34027425088877605,
"grad_norm": 0.9393402338027954,
"learning_rate": 4.997344464456854e-06,
"loss": 0.3121,
"step": 670
},
{
"epoch": 0.3453529710512951,
"grad_norm": 0.9901228547096252,
"learning_rate": 4.9966200733537345e-06,
"loss": 0.3268,
"step": 680
},
{
"epoch": 0.3504316912138141,
"grad_norm": 0.9233267307281494,
"learning_rate": 4.995808489376206e-06,
"loss": 0.3214,
"step": 690
},
{
"epoch": 0.35551041137633316,
"grad_norm": 0.8530528545379639,
"learning_rate": 4.9949097408683235e-06,
"loss": 0.3148,
"step": 700
},
{
"epoch": 0.3605891315388522,
"grad_norm": 0.9197198748588562,
"learning_rate": 4.99392385921831e-06,
"loss": 0.3108,
"step": 710
},
{
"epoch": 0.3656678517013712,
"grad_norm": 0.9738614559173584,
"learning_rate": 4.992850878857458e-06,
"loss": 0.3189,
"step": 720
},
{
"epoch": 0.3707465718638903,
"grad_norm": 0.9086189270019531,
"learning_rate": 4.991690837258926e-06,
"loss": 0.3157,
"step": 730
},
{
"epoch": 0.37582529202640935,
"grad_norm": 1.1314260959625244,
"learning_rate": 4.990443774936432e-06,
"loss": 0.3295,
"step": 740
},
{
"epoch": 0.3809040121889284,
"grad_norm": 0.9313626885414124,
"learning_rate": 4.989109735442838e-06,
"loss": 0.3223,
"step": 750
},
{
"epoch": 0.3859827323514474,
"grad_norm": 0.894349217414856,
"learning_rate": 4.987688765368628e-06,
"loss": 0.3068,
"step": 760
},
{
"epoch": 0.39106145251396646,
"grad_norm": 0.8493692874908447,
"learning_rate": 4.986180914340281e-06,
"loss": 0.3145,
"step": 770
},
{
"epoch": 0.39614017267648555,
"grad_norm": 0.935171365737915,
"learning_rate": 4.9845862350185405e-06,
"loss": 0.3155,
"step": 780
},
{
"epoch": 0.4012188928390046,
"grad_norm": 0.8560822010040283,
"learning_rate": 4.98290478309657e-06,
"loss": 0.3274,
"step": 790
},
{
"epoch": 0.4062976130015236,
"grad_norm": 0.867138147354126,
"learning_rate": 4.981136617298012e-06,
"loss": 0.314,
"step": 800
},
{
"epoch": 0.41137633316404265,
"grad_norm": 0.8818840384483337,
"learning_rate": 4.97928179937494e-06,
"loss": 0.3182,
"step": 810
},
{
"epoch": 0.4164550533265617,
"grad_norm": 0.9063979387283325,
"learning_rate": 4.977340394105692e-06,
"loss": 0.3152,
"step": 820
},
{
"epoch": 0.4215337734890808,
"grad_norm": 0.8890411257743835,
"learning_rate": 4.975312469292618e-06,
"loss": 0.3084,
"step": 830
},
{
"epoch": 0.4266124936515998,
"grad_norm": 0.8582976460456848,
"learning_rate": 4.973198095759708e-06,
"loss": 0.3054,
"step": 840
},
{
"epoch": 0.43169121381411885,
"grad_norm": 0.8314648270606995,
"learning_rate": 4.970997347350117e-06,
"loss": 0.3067,
"step": 850
},
{
"epoch": 0.4367699339766379,
"grad_norm": 0.9010292887687683,
"learning_rate": 4.96871030092359e-06,
"loss": 0.3112,
"step": 860
},
{
"epoch": 0.4418486541391569,
"grad_norm": 0.8275406956672668,
"learning_rate": 4.966337036353775e-06,
"loss": 0.3075,
"step": 870
},
{
"epoch": 0.44692737430167595,
"grad_norm": 1.2174943685531616,
"learning_rate": 4.963877636525431e-06,
"loss": 0.3166,
"step": 880
},
{
"epoch": 0.45200609446419504,
"grad_norm": 0.8684174418449402,
"learning_rate": 4.961332187331541e-06,
"loss": 0.3221,
"step": 890
},
{
"epoch": 0.4570848146267141,
"grad_norm": 0.864565372467041,
"learning_rate": 4.958700777670306e-06,
"loss": 0.3168,
"step": 900
},
{
"epoch": 0.4621635347892331,
"grad_norm": 0.8785614967346191,
"learning_rate": 4.955983499442039e-06,
"loss": 0.3168,
"step": 910
},
{
"epoch": 0.46724225495175215,
"grad_norm": 0.7927045226097107,
"learning_rate": 4.953180447545965e-06,
"loss": 0.3175,
"step": 920
},
{
"epoch": 0.4723209751142712,
"grad_norm": 0.9102469086647034,
"learning_rate": 4.950291719876891e-06,
"loss": 0.3217,
"step": 930
},
{
"epoch": 0.47739969527679027,
"grad_norm": 0.8130275011062622,
"learning_rate": 4.947317417321803e-06,
"loss": 0.2997,
"step": 940
},
{
"epoch": 0.4824784154393093,
"grad_norm": 0.8968609571456909,
"learning_rate": 4.944257643756333e-06,
"loss": 0.3148,
"step": 950
},
{
"epoch": 0.48755713560182834,
"grad_norm": 0.9034120440483093,
"learning_rate": 4.941112506041135e-06,
"loss": 0.3132,
"step": 960
},
{
"epoch": 0.4926358557643474,
"grad_norm": 0.8752701282501221,
"learning_rate": 4.93788211401815e-06,
"loss": 0.3141,
"step": 970
},
{
"epoch": 0.4977145759268664,
"grad_norm": 0.9108573794364929,
"learning_rate": 4.9345665805067735e-06,
"loss": 0.3175,
"step": 980
},
{
"epoch": 0.5027932960893855,
"grad_norm": 0.9702094197273254,
"learning_rate": 4.931166021299914e-06,
"loss": 0.3055,
"step": 990
},
{
"epoch": 0.5078720162519045,
"grad_norm": 0.9001919031143188,
"learning_rate": 4.927680555159946e-06,
"loss": 0.3095,
"step": 1000
},
{
"epoch": 0.5078720162519045,
"eval_loss": 0.3185386657714844,
"eval_runtime": 121.8144,
"eval_samples_per_second": 41.046,
"eval_steps_per_second": 2.569,
"step": 1000
},
{
"epoch": 0.5129507364144236,
"grad_norm": 0.9391583800315857,
"learning_rate": 4.924110303814567e-06,
"loss": 0.3165,
"step": 1010
},
{
"epoch": 0.5180294565769427,
"grad_norm": 0.8731343150138855,
"learning_rate": 4.920455391952543e-06,
"loss": 0.303,
"step": 1020
},
{
"epoch": 0.5231081767394616,
"grad_norm": 0.811320424079895,
"learning_rate": 4.916715947219356e-06,
"loss": 0.3015,
"step": 1030
},
{
"epoch": 0.5281868969019807,
"grad_norm": 0.9213032126426697,
"learning_rate": 4.912892100212744e-06,
"loss": 0.3054,
"step": 1040
},
{
"epoch": 0.5332656170644997,
"grad_norm": 0.8103631734848022,
"learning_rate": 4.908983984478141e-06,
"loss": 0.2985,
"step": 1050
},
{
"epoch": 0.5383443372270188,
"grad_norm": 0.8164056539535522,
"learning_rate": 4.9049917365040135e-06,
"loss": 0.3138,
"step": 1060
},
{
"epoch": 0.5434230573895379,
"grad_norm": 0.7794240713119507,
"learning_rate": 4.900915495717092e-06,
"loss": 0.3073,
"step": 1070
},
{
"epoch": 0.5485017775520569,
"grad_norm": 0.8127440810203552,
"learning_rate": 4.896755404477505e-06,
"loss": 0.3047,
"step": 1080
},
{
"epoch": 0.553580497714576,
"grad_norm": 0.8375741243362427,
"learning_rate": 4.892511608073804e-06,
"loss": 0.3178,
"step": 1090
},
{
"epoch": 0.5586592178770949,
"grad_norm": 0.836986243724823,
"learning_rate": 4.888184254717886e-06,
"loss": 0.314,
"step": 1100
},
{
"epoch": 0.563737938039614,
"grad_norm": 0.8443421125411987,
"learning_rate": 4.88377349553983e-06,
"loss": 0.3187,
"step": 1110
},
{
"epoch": 0.568816658202133,
"grad_norm": 0.8671994805335999,
"learning_rate": 4.879279484582603e-06,
"loss": 0.3152,
"step": 1120
},
{
"epoch": 0.5738953783646521,
"grad_norm": 0.7974778413772583,
"learning_rate": 4.874702378796694e-06,
"loss": 0.3064,
"step": 1130
},
{
"epoch": 0.5789740985271712,
"grad_norm": 0.8144264221191406,
"learning_rate": 4.870042338034618e-06,
"loss": 0.3083,
"step": 1140
},
{
"epoch": 0.5840528186896902,
"grad_norm": 0.8496512770652771,
"learning_rate": 4.8652995250453515e-06,
"loss": 0.3011,
"step": 1150
},
{
"epoch": 0.5891315388522093,
"grad_norm": 0.8523489832878113,
"learning_rate": 4.86047410546863e-06,
"loss": 0.3105,
"step": 1160
},
{
"epoch": 0.5942102590147282,
"grad_norm": 0.8697237968444824,
"learning_rate": 4.855566247829177e-06,
"loss": 0.3118,
"step": 1170
},
{
"epoch": 0.5992889791772473,
"grad_norm": 0.758233368396759,
"learning_rate": 4.85057612353081e-06,
"loss": 0.3057,
"step": 1180
},
{
"epoch": 0.6043676993397664,
"grad_norm": 1.0396537780761719,
"learning_rate": 4.845503906850461e-06,
"loss": 0.3081,
"step": 1190
},
{
"epoch": 0.6094464195022854,
"grad_norm": 0.8582219481468201,
"learning_rate": 4.840349774932081e-06,
"loss": 0.3101,
"step": 1200
},
{
"epoch": 0.6145251396648045,
"grad_norm": 0.8096279501914978,
"learning_rate": 4.835113907780464e-06,
"loss": 0.3162,
"step": 1210
},
{
"epoch": 0.6196038598273235,
"grad_norm": 0.7163273692131042,
"learning_rate": 4.829796488254954e-06,
"loss": 0.3104,
"step": 1220
},
{
"epoch": 0.6246825799898426,
"grad_norm": 0.8141240477561951,
"learning_rate": 4.824397702063058e-06,
"loss": 0.3225,
"step": 1230
},
{
"epoch": 0.6297613001523616,
"grad_norm": 0.7636848092079163,
"learning_rate": 4.8189177377539635e-06,
"loss": 0.3172,
"step": 1240
},
{
"epoch": 0.6348400203148806,
"grad_norm": 0.8381786942481995,
"learning_rate": 4.8133567867119525e-06,
"loss": 0.3091,
"step": 1250
},
{
"epoch": 0.6399187404773997,
"grad_norm": 0.7415141463279724,
"learning_rate": 4.8077150431497175e-06,
"loss": 0.3119,
"step": 1260
},
{
"epoch": 0.6449974606399187,
"grad_norm": 0.7708622813224792,
"learning_rate": 4.801992704101578e-06,
"loss": 0.3121,
"step": 1270
},
{
"epoch": 0.6500761808024378,
"grad_norm": 0.8236015439033508,
"learning_rate": 4.796189969416601e-06,
"loss": 0.3042,
"step": 1280
},
{
"epoch": 0.6551549009649569,
"grad_norm": 0.7215218544006348,
"learning_rate": 4.790307041751617e-06,
"loss": 0.3031,
"step": 1290
},
{
"epoch": 0.6602336211274759,
"grad_norm": 0.7796043753623962,
"learning_rate": 4.78434412656415e-06,
"loss": 0.305,
"step": 1300
},
{
"epoch": 0.665312341289995,
"grad_norm": 0.8012691140174866,
"learning_rate": 4.778301432105234e-06,
"loss": 0.3066,
"step": 1310
},
{
"epoch": 0.6703910614525139,
"grad_norm": 0.8139703273773193,
"learning_rate": 4.772179169412146e-06,
"loss": 0.3023,
"step": 1320
},
{
"epoch": 0.675469781615033,
"grad_norm": 0.7603716850280762,
"learning_rate": 4.765977552301031e-06,
"loss": 0.3093,
"step": 1330
},
{
"epoch": 0.6805485017775521,
"grad_norm": 0.8556414842605591,
"learning_rate": 4.759696797359438e-06,
"loss": 0.3084,
"step": 1340
},
{
"epoch": 0.6856272219400711,
"grad_norm": 0.7557649612426758,
"learning_rate": 4.753337123938754e-06,
"loss": 0.3057,
"step": 1350
},
{
"epoch": 0.6907059421025902,
"grad_norm": 0.6622769832611084,
"learning_rate": 4.746898754146545e-06,
"loss": 0.3034,
"step": 1360
},
{
"epoch": 0.6957846622651092,
"grad_norm": 0.8777470588684082,
"learning_rate": 4.740381912838797e-06,
"loss": 0.2972,
"step": 1370
},
{
"epoch": 0.7008633824276282,
"grad_norm": 0.9032386541366577,
"learning_rate": 4.733786827612064e-06,
"loss": 0.3075,
"step": 1380
},
{
"epoch": 0.7059421025901473,
"grad_norm": 0.7780275940895081,
"learning_rate": 4.72711372879552e-06,
"loss": 0.3057,
"step": 1390
},
{
"epoch": 0.7110208227526663,
"grad_norm": 0.8000150918960571,
"learning_rate": 4.720362849442912e-06,
"loss": 0.3118,
"step": 1400
},
{
"epoch": 0.7160995429151854,
"grad_norm": 0.7798091173171997,
"learning_rate": 4.713534425324426e-06,
"loss": 0.3049,
"step": 1410
},
{
"epoch": 0.7211782630777044,
"grad_norm": 0.7654507160186768,
"learning_rate": 4.706628694918448e-06,
"loss": 0.3107,
"step": 1420
},
{
"epoch": 0.7262569832402235,
"grad_norm": 0.846693217754364,
"learning_rate": 4.699645899403238e-06,
"loss": 0.3074,
"step": 1430
},
{
"epoch": 0.7313357034027425,
"grad_norm": 0.8081271648406982,
"learning_rate": 4.692586282648504e-06,
"loss": 0.3082,
"step": 1440
},
{
"epoch": 0.7364144235652615,
"grad_norm": 0.7759858965873718,
"learning_rate": 4.685450091206893e-06,
"loss": 0.3127,
"step": 1450
},
{
"epoch": 0.7414931437277806,
"grad_norm": 0.8160701990127563,
"learning_rate": 4.678237574305364e-06,
"loss": 0.3018,
"step": 1460
},
{
"epoch": 0.7465718638902996,
"grad_norm": 0.8192775845527649,
"learning_rate": 4.670948983836505e-06,
"loss": 0.3024,
"step": 1470
},
{
"epoch": 0.7516505840528187,
"grad_norm": 0.8106697797775269,
"learning_rate": 4.66358457434972e-06,
"loss": 0.3167,
"step": 1480
},
{
"epoch": 0.7567293042153377,
"grad_norm": 0.7517253160476685,
"learning_rate": 4.6561446030423435e-06,
"loss": 0.3064,
"step": 1490
},
{
"epoch": 0.7618080243778568,
"grad_norm": 0.7888948321342468,
"learning_rate": 4.648629329750662e-06,
"loss": 0.308,
"step": 1500
},
{
"epoch": 0.7618080243778568,
"eval_loss": 0.3137528896331787,
"eval_runtime": 121.613,
"eval_samples_per_second": 41.114,
"eval_steps_per_second": 2.574,
"step": 1500
},
{
"epoch": 0.7668867445403759,
"grad_norm": 0.7494853138923645,
"learning_rate": 4.641039016940832e-06,
"loss": 0.3086,
"step": 1510
},
{
"epoch": 0.7719654647028948,
"grad_norm": 0.7426589131355286,
"learning_rate": 4.6333739296997205e-06,
"loss": 0.3,
"step": 1520
},
{
"epoch": 0.7770441848654139,
"grad_norm": 0.7976534366607666,
"learning_rate": 4.625634335725644e-06,
"loss": 0.3134,
"step": 1530
},
{
"epoch": 0.7821229050279329,
"grad_norm": 0.8214184045791626,
"learning_rate": 4.617820505319018e-06,
"loss": 0.3076,
"step": 1540
},
{
"epoch": 0.787201625190452,
"grad_norm": 0.8541253209114075,
"learning_rate": 4.609932711372921e-06,
"loss": 0.3141,
"step": 1550
},
{
"epoch": 0.7922803453529711,
"grad_norm": 0.75464928150177,
"learning_rate": 4.601971229363558e-06,
"loss": 0.3053,
"step": 1560
},
{
"epoch": 0.7973590655154901,
"grad_norm": 0.8184385895729065,
"learning_rate": 4.593936337340645e-06,
"loss": 0.3123,
"step": 1570
},
{
"epoch": 0.8024377856780092,
"grad_norm": 0.7774415612220764,
"learning_rate": 4.5858283159176955e-06,
"loss": 0.2999,
"step": 1580
},
{
"epoch": 0.8075165058405281,
"grad_norm": 0.9587996602058411,
"learning_rate": 4.57764744826222e-06,
"loss": 0.3077,
"step": 1590
},
{
"epoch": 0.8125952260030472,
"grad_norm": 0.9347293972969055,
"learning_rate": 4.569394020085841e-06,
"loss": 0.3104,
"step": 1600
},
{
"epoch": 0.8176739461655663,
"grad_norm": 0.7690241932868958,
"learning_rate": 4.561068319634307e-06,
"loss": 0.2998,
"step": 1610
},
{
"epoch": 0.8227526663280853,
"grad_norm": 0.783155083656311,
"learning_rate": 4.552670637677432e-06,
"loss": 0.3011,
"step": 1620
},
{
"epoch": 0.8278313864906044,
"grad_norm": 0.8160974979400635,
"learning_rate": 4.544201267498939e-06,
"loss": 0.3042,
"step": 1630
},
{
"epoch": 0.8329101066531234,
"grad_norm": 0.9004737734794617,
"learning_rate": 4.535660504886215e-06,
"loss": 0.3079,
"step": 1640
},
{
"epoch": 0.8379888268156425,
"grad_norm": 0.7750893235206604,
"learning_rate": 4.527048648119986e-06,
"loss": 0.3002,
"step": 1650
},
{
"epoch": 0.8430675469781616,
"grad_norm": 0.7503623962402344,
"learning_rate": 4.5183659979638905e-06,
"loss": 0.3117,
"step": 1660
},
{
"epoch": 0.8481462671406805,
"grad_norm": 0.8173331618309021,
"learning_rate": 4.509612857653987e-06,
"loss": 0.3079,
"step": 1670
},
{
"epoch": 0.8532249873031996,
"grad_norm": 0.7421526312828064,
"learning_rate": 4.500789532888154e-06,
"loss": 0.2998,
"step": 1680
},
{
"epoch": 0.8583037074657186,
"grad_norm": 0.7622323632240295,
"learning_rate": 4.49189633181542e-06,
"loss": 0.3003,
"step": 1690
},
{
"epoch": 0.8633824276282377,
"grad_norm": 0.730026364326477,
"learning_rate": 4.482933565025198e-06,
"loss": 0.3118,
"step": 1700
},
{
"epoch": 0.8684611477907568,
"grad_norm": 0.7820921540260315,
"learning_rate": 4.47390154553644e-06,
"loss": 0.3056,
"step": 1710
},
{
"epoch": 0.8735398679532758,
"grad_norm": 0.8582718372344971,
"learning_rate": 4.4648005887867064e-06,
"loss": 0.2969,
"step": 1720
},
{
"epoch": 0.8786185881157949,
"grad_norm": 0.7791563272476196,
"learning_rate": 4.455631012621143e-06,
"loss": 0.3068,
"step": 1730
},
{
"epoch": 0.8836973082783138,
"grad_norm": 0.7581758499145508,
"learning_rate": 4.4463931372813914e-06,
"loss": 0.304,
"step": 1740
},
{
"epoch": 0.8887760284408329,
"grad_norm": 0.766441285610199,
"learning_rate": 4.4370872853943936e-06,
"loss": 0.298,
"step": 1750
},
{
"epoch": 0.8938547486033519,
"grad_norm": 0.7395172119140625,
"learning_rate": 4.427713781961132e-06,
"loss": 0.2996,
"step": 1760
},
{
"epoch": 0.898933468765871,
"grad_norm": 0.7233405709266663,
"learning_rate": 4.4182729543452765e-06,
"loss": 0.2929,
"step": 1770
},
{
"epoch": 0.9040121889283901,
"grad_norm": 0.7392520904541016,
"learning_rate": 4.408765132261749e-06,
"loss": 0.3088,
"step": 1780
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.7640084028244019,
"learning_rate": 4.399190647765213e-06,
"loss": 0.3059,
"step": 1790
},
{
"epoch": 0.9141696292534282,
"grad_norm": 0.7984718084335327,
"learning_rate": 4.389549835238473e-06,
"loss": 0.3088,
"step": 1800
},
{
"epoch": 0.9192483494159471,
"grad_norm": 0.7300999164581299,
"learning_rate": 4.379843031380801e-06,
"loss": 0.3025,
"step": 1810
},
{
"epoch": 0.9243270695784662,
"grad_norm": 0.7906836867332458,
"learning_rate": 4.370070575196172e-06,
"loss": 0.3056,
"step": 1820
},
{
"epoch": 0.9294057897409853,
"grad_norm": 0.7191804051399231,
"learning_rate": 4.360232807981426e-06,
"loss": 0.3001,
"step": 1830
},
{
"epoch": 0.9344845099035043,
"grad_norm": 0.7592332363128662,
"learning_rate": 4.350330073314351e-06,
"loss": 0.3101,
"step": 1840
},
{
"epoch": 0.9395632300660234,
"grad_norm": 0.8039364218711853,
"learning_rate": 4.340362717041682e-06,
"loss": 0.3055,
"step": 1850
},
{
"epoch": 0.9446419502285424,
"grad_norm": 0.7507144212722778,
"learning_rate": 4.3303310872670226e-06,
"loss": 0.3018,
"step": 1860
},
{
"epoch": 0.9497206703910615,
"grad_norm": 0.9044272303581238,
"learning_rate": 4.320235534338685e-06,
"loss": 0.2943,
"step": 1870
},
{
"epoch": 0.9547993905535805,
"grad_norm": 0.7523107528686523,
"learning_rate": 4.310076410837463e-06,
"loss": 0.3026,
"step": 1880
},
{
"epoch": 0.9598781107160995,
"grad_norm": 0.955448567867279,
"learning_rate": 4.299854071564307e-06,
"loss": 0.2926,
"step": 1890
},
{
"epoch": 0.9649568308786186,
"grad_norm": 0.7745742797851562,
"learning_rate": 4.289568873527941e-06,
"loss": 0.304,
"step": 1900
},
{
"epoch": 0.9700355510411376,
"grad_norm": 0.7563455104827881,
"learning_rate": 4.279221175932389e-06,
"loss": 0.297,
"step": 1910
},
{
"epoch": 0.9751142712036567,
"grad_norm": 0.8183065056800842,
"learning_rate": 4.268811340164436e-06,
"loss": 0.2986,
"step": 1920
},
{
"epoch": 0.9801929913661758,
"grad_norm": 0.7718958258628845,
"learning_rate": 4.258339729781e-06,
"loss": 0.2957,
"step": 1930
},
{
"epoch": 0.9852717115286947,
"grad_norm": 0.7502573728561401,
"learning_rate": 4.24780671049644e-06,
"loss": 0.3023,
"step": 1940
},
{
"epoch": 0.9903504316912138,
"grad_norm": 0.8469391465187073,
"learning_rate": 4.237212650169783e-06,
"loss": 0.3078,
"step": 1950
},
{
"epoch": 0.9954291518537328,
"grad_norm": 0.8061825633049011,
"learning_rate": 4.226557918791872e-06,
"loss": 0.3002,
"step": 1960
},
{
"epoch": 1.000507872016252,
"grad_norm": 0.7248758673667908,
"learning_rate": 4.215842888472452e-06,
"loss": 0.3125,
"step": 1970
},
{
"epoch": 1.005586592178771,
"grad_norm": 0.7721697092056274,
"learning_rate": 4.205067933427169e-06,
"loss": 0.2544,
"step": 1980
},
{
"epoch": 1.01066531234129,
"grad_norm": 0.7651776075363159,
"learning_rate": 4.194233429964501e-06,
"loss": 0.2535,
"step": 1990
},
{
"epoch": 1.015744032503809,
"grad_norm": 0.7415822148323059,
"learning_rate": 4.183339756472617e-06,
"loss": 0.2496,
"step": 2000
},
{
"epoch": 1.015744032503809,
"eval_loss": 0.31630080938339233,
"eval_runtime": 121.7314,
"eval_samples_per_second": 41.074,
"eval_steps_per_second": 2.571,
"step": 2000
},
{
"epoch": 1.020822752666328,
"grad_norm": 0.792204737663269,
"learning_rate": 4.172387293406164e-06,
"loss": 0.2465,
"step": 2010
},
{
"epoch": 1.0259014728288471,
"grad_norm": 0.7987692356109619,
"learning_rate": 4.161376423272974e-06,
"loss": 0.2479,
"step": 2020
},
{
"epoch": 1.0309801929913662,
"grad_norm": 0.7764890789985657,
"learning_rate": 4.150307530620714e-06,
"loss": 0.25,
"step": 2030
},
{
"epoch": 1.0360589131538853,
"grad_norm": 0.7811829447746277,
"learning_rate": 4.139181002023445e-06,
"loss": 0.2518,
"step": 2040
},
{
"epoch": 1.0411376333164042,
"grad_norm": 0.7891380190849304,
"learning_rate": 4.1279972260681286e-06,
"loss": 0.2521,
"step": 2050
},
{
"epoch": 1.0462163534789233,
"grad_norm": 0.831947922706604,
"learning_rate": 4.1167565933410575e-06,
"loss": 0.2529,
"step": 2060
},
{
"epoch": 1.0512950736414424,
"grad_norm": 0.8048622012138367,
"learning_rate": 4.105459496414207e-06,
"loss": 0.2508,
"step": 2070
},
{
"epoch": 1.0563737938039615,
"grad_norm": 0.7546584606170654,
"learning_rate": 4.094106329831531e-06,
"loss": 0.247,
"step": 2080
},
{
"epoch": 1.0614525139664805,
"grad_norm": 0.7589654922485352,
"learning_rate": 4.08269749009518e-06,
"loss": 0.2466,
"step": 2090
},
{
"epoch": 1.0665312341289994,
"grad_norm": 0.7780548334121704,
"learning_rate": 4.0712333756516535e-06,
"loss": 0.2482,
"step": 2100
},
{
"epoch": 1.0716099542915185,
"grad_norm": 0.8401651978492737,
"learning_rate": 4.059714386877886e-06,
"loss": 0.2578,
"step": 2110
},
{
"epoch": 1.0766886744540376,
"grad_norm": 0.7654274702072144,
"learning_rate": 4.048140926067262e-06,
"loss": 0.2568,
"step": 2120
},
{
"epoch": 1.0817673946165567,
"grad_norm": 0.8236129283905029,
"learning_rate": 4.036513397415571e-06,
"loss": 0.2496,
"step": 2130
},
{
"epoch": 1.0868461147790756,
"grad_norm": 0.8232077360153198,
"learning_rate": 4.024832207006883e-06,
"loss": 0.2489,
"step": 2140
},
{
"epoch": 1.0919248349415946,
"grad_norm": 0.7585301399230957,
"learning_rate": 4.013097762799372e-06,
"loss": 0.2503,
"step": 2150
},
{
"epoch": 1.0970035551041137,
"grad_norm": 0.7547110319137573,
"learning_rate": 4.001310474611069e-06,
"loss": 0.2489,
"step": 2160
},
{
"epoch": 1.1020822752666328,
"grad_norm": 0.8154515027999878,
"learning_rate": 3.989470754105546e-06,
"loss": 0.2516,
"step": 2170
},
{
"epoch": 1.107160995429152,
"grad_norm": 0.8041960000991821,
"learning_rate": 3.9775790147775425e-06,
"loss": 0.2455,
"step": 2180
},
{
"epoch": 1.112239715591671,
"grad_norm": 0.7653600573539734,
"learning_rate": 3.96563567193852e-06,
"loss": 0.2539,
"step": 2190
},
{
"epoch": 1.1173184357541899,
"grad_norm": 0.8358815312385559,
"learning_rate": 3.953641142702161e-06,
"loss": 0.2473,
"step": 2200
},
{
"epoch": 1.122397155916709,
"grad_norm": 0.8611774444580078,
"learning_rate": 3.941595845969799e-06,
"loss": 0.2522,
"step": 2210
},
{
"epoch": 1.127475876079228,
"grad_norm": 0.734160840511322,
"learning_rate": 3.929500202415793e-06,
"loss": 0.2469,
"step": 2220
},
{
"epoch": 1.1325545962417471,
"grad_norm": 0.7705450057983398,
"learning_rate": 3.917354634472831e-06,
"loss": 0.2475,
"step": 2230
},
{
"epoch": 1.137633316404266,
"grad_norm": 0.7762371897697449,
"learning_rate": 3.9051595663171795e-06,
"loss": 0.2519,
"step": 2240
},
{
"epoch": 1.142712036566785,
"grad_norm": 0.8073663115501404,
"learning_rate": 3.892915423853866e-06,
"loss": 0.2417,
"step": 2250
},
{
"epoch": 1.1477907567293042,
"grad_norm": 0.8323131203651428,
"learning_rate": 3.880622634701812e-06,
"loss": 0.2489,
"step": 2260
},
{
"epoch": 1.1528694768918233,
"grad_norm": 0.8708091974258423,
"learning_rate": 3.868281628178888e-06,
"loss": 0.2548,
"step": 2270
},
{
"epoch": 1.1579481970543424,
"grad_norm": 0.7699044346809387,
"learning_rate": 3.855892835286931e-06,
"loss": 0.2467,
"step": 2280
},
{
"epoch": 1.1630269172168615,
"grad_norm": 0.718021035194397,
"learning_rate": 3.843456688696683e-06,
"loss": 0.2486,
"step": 2290
},
{
"epoch": 1.1681056373793803,
"grad_norm": 0.7651357054710388,
"learning_rate": 3.830973622732686e-06,
"loss": 0.248,
"step": 2300
},
{
"epoch": 1.1731843575418994,
"grad_norm": 0.8027908802032471,
"learning_rate": 3.818444073358108e-06,
"loss": 0.2352,
"step": 2310
},
{
"epoch": 1.1782630777044185,
"grad_norm": 0.7923874855041504,
"learning_rate": 3.8058684781595277e-06,
"loss": 0.2434,
"step": 2320
},
{
"epoch": 1.1833417978669376,
"grad_norm": 0.8118336200714111,
"learning_rate": 3.793247276331636e-06,
"loss": 0.2488,
"step": 2330
},
{
"epoch": 1.1884205180294565,
"grad_norm": 0.7456130385398865,
"learning_rate": 3.780580908661915e-06,
"loss": 0.248,
"step": 2340
},
{
"epoch": 1.1934992381919756,
"grad_norm": 0.7925790548324585,
"learning_rate": 3.7678698175152286e-06,
"loss": 0.2529,
"step": 2350
},
{
"epoch": 1.1985779583544947,
"grad_norm": 0.7958409190177917,
"learning_rate": 3.7551144468183824e-06,
"loss": 0.2526,
"step": 2360
},
{
"epoch": 1.2036566785170137,
"grad_norm": 0.7938746809959412,
"learning_rate": 3.7423152420446185e-06,
"loss": 0.2478,
"step": 2370
},
{
"epoch": 1.2087353986795328,
"grad_norm": 0.8479053378105164,
"learning_rate": 3.729472650198054e-06,
"loss": 0.2434,
"step": 2380
},
{
"epoch": 1.213814118842052,
"grad_norm": 0.7347251176834106,
"learning_rate": 3.716587119798074e-06,
"loss": 0.2401,
"step": 2390
},
{
"epoch": 1.2188928390045708,
"grad_norm": 0.7714645266532898,
"learning_rate": 3.703659100863664e-06,
"loss": 0.2511,
"step": 2400
},
{
"epoch": 1.2239715591670899,
"grad_norm": 0.777991533279419,
"learning_rate": 3.690689044897695e-06,
"loss": 0.2509,
"step": 2410
},
{
"epoch": 1.229050279329609,
"grad_norm": 0.7387746572494507,
"learning_rate": 3.6776774048711558e-06,
"loss": 0.2466,
"step": 2420
},
{
"epoch": 1.234128999492128,
"grad_norm": 0.740714967250824,
"learning_rate": 3.66462463520733e-06,
"loss": 0.244,
"step": 2430
},
{
"epoch": 1.239207719654647,
"grad_norm": 0.7498502731323242,
"learning_rate": 3.6515311917659302e-06,
"loss": 0.2411,
"step": 2440
},
{
"epoch": 1.244286439817166,
"grad_norm": 0.7903018593788147,
"learning_rate": 3.6383975318271724e-06,
"loss": 0.2556,
"step": 2450
},
{
"epoch": 1.2493651599796851,
"grad_norm": 0.8446292281150818,
"learning_rate": 3.6252241140758103e-06,
"loss": 0.243,
"step": 2460
},
{
"epoch": 1.2544438801422042,
"grad_norm": 0.7503316402435303,
"learning_rate": 3.6120113985851134e-06,
"loss": 0.2522,
"step": 2470
},
{
"epoch": 1.2595226003047233,
"grad_norm": 0.7319806218147278,
"learning_rate": 3.5987598468007993e-06,
"loss": 0.2475,
"step": 2480
},
{
"epoch": 1.2646013204672424,
"grad_norm": 0.7854649424552917,
"learning_rate": 3.585469921524919e-06,
"loss": 0.2534,
"step": 2490
},
{
"epoch": 1.2696800406297613,
"grad_norm": 0.7887899279594421,
"learning_rate": 3.5721420868996943e-06,
"loss": 0.248,
"step": 2500
},
{
"epoch": 1.2696800406297613,
"eval_loss": 0.3149872124195099,
"eval_runtime": 121.8702,
"eval_samples_per_second": 41.027,
"eval_steps_per_second": 2.568,
"step": 2500
},
{
"epoch": 1.2747587607922803,
"grad_norm": 0.7797321677207947,
"learning_rate": 3.5587768083913037e-06,
"loss": 0.251,
"step": 2510
},
{
"epoch": 1.2798374809547994,
"grad_norm": 0.84684157371521,
"learning_rate": 3.545374552773635e-06,
"loss": 0.2514,
"step": 2520
},
{
"epoch": 1.2849162011173183,
"grad_norm": 0.8210347294807434,
"learning_rate": 3.5319357881119733e-06,
"loss": 0.2451,
"step": 2530
},
{
"epoch": 1.2899949212798374,
"grad_norm": 0.8470488786697388,
"learning_rate": 3.518460983746661e-06,
"loss": 0.248,
"step": 2540
},
{
"epoch": 1.2950736414423565,
"grad_norm": 0.7454930543899536,
"learning_rate": 3.5049506102767037e-06,
"loss": 0.2414,
"step": 2550
},
{
"epoch": 1.3001523616048756,
"grad_norm": 0.713578462600708,
"learning_rate": 3.4914051395433363e-06,
"loss": 0.2482,
"step": 2560
},
{
"epoch": 1.3052310817673947,
"grad_norm": 0.7766876816749573,
"learning_rate": 3.477825044613543e-06,
"loss": 0.2501,
"step": 2570
},
{
"epoch": 1.3103098019299138,
"grad_norm": 0.7383909821510315,
"learning_rate": 3.464210799763536e-06,
"loss": 0.247,
"step": 2580
},
{
"epoch": 1.3153885220924328,
"grad_norm": 0.7781063318252563,
"learning_rate": 3.450562880462191e-06,
"loss": 0.2479,
"step": 2590
},
{
"epoch": 1.3204672422549517,
"grad_norm": 0.7478165030479431,
"learning_rate": 3.436881763354444e-06,
"loss": 0.2502,
"step": 2600
},
{
"epoch": 1.3255459624174708,
"grad_norm": 0.7556635737419128,
"learning_rate": 3.4231679262446426e-06,
"loss": 0.2517,
"step": 2610
},
{
"epoch": 1.33062468257999,
"grad_norm": 0.7706420421600342,
"learning_rate": 3.4094218480798608e-06,
"loss": 0.2448,
"step": 2620
},
{
"epoch": 1.3357034027425088,
"grad_norm": 0.8588224053382874,
"learning_rate": 3.3956440089331687e-06,
"loss": 0.2431,
"step": 2630
},
{
"epoch": 1.3407821229050279,
"grad_norm": 0.7747892141342163,
"learning_rate": 3.3818348899868707e-06,
"loss": 0.2517,
"step": 2640
},
{
"epoch": 1.345860843067547,
"grad_norm": 0.8341949582099915,
"learning_rate": 3.3679949735156974e-06,
"loss": 0.2478,
"step": 2650
},
{
"epoch": 1.350939563230066,
"grad_norm": 0.7596986889839172,
"learning_rate": 3.354124742869965e-06,
"loss": 0.2451,
"step": 2660
},
{
"epoch": 1.3560182833925851,
"grad_norm": 0.8195773363113403,
"learning_rate": 3.3402246824586897e-06,
"loss": 0.2448,
"step": 2670
},
{
"epoch": 1.3610970035551042,
"grad_norm": 0.7367848753929138,
"learning_rate": 3.3262952777326775e-06,
"loss": 0.2503,
"step": 2680
},
{
"epoch": 1.366175723717623,
"grad_norm": 0.8999025821685791,
"learning_rate": 3.3123370151675615e-06,
"loss": 0.2537,
"step": 2690
},
{
"epoch": 1.3712544438801422,
"grad_norm": 0.7917847037315369,
"learning_rate": 3.2983503822468214e-06,
"loss": 0.2496,
"step": 2700
},
{
"epoch": 1.3763331640426613,
"grad_norm": 0.7561963200569153,
"learning_rate": 3.28433586744475e-06,
"loss": 0.2489,
"step": 2710
},
{
"epoch": 1.3814118842051804,
"grad_norm": 0.8509207963943481,
"learning_rate": 3.2702939602093988e-06,
"loss": 0.2448,
"step": 2720
},
{
"epoch": 1.3864906043676992,
"grad_norm": 0.8005545139312744,
"learning_rate": 3.2562251509454813e-06,
"loss": 0.2578,
"step": 2730
},
{
"epoch": 1.3915693245302183,
"grad_norm": 0.8283247351646423,
"learning_rate": 3.2421299309972485e-06,
"loss": 0.2489,
"step": 2740
},
{
"epoch": 1.3966480446927374,
"grad_norm": 0.7954180836677551,
"learning_rate": 3.2280087926313288e-06,
"loss": 0.248,
"step": 2750
},
{
"epoch": 1.4017267648552565,
"grad_norm": 0.8017697930335999,
"learning_rate": 3.2138622290195325e-06,
"loss": 0.2535,
"step": 2760
},
{
"epoch": 1.4068054850177756,
"grad_norm": 0.7989761829376221,
"learning_rate": 3.1996907342216318e-06,
"loss": 0.2434,
"step": 2770
},
{
"epoch": 1.4118842051802947,
"grad_norm": 0.8122096061706543,
"learning_rate": 3.1854948031681044e-06,
"loss": 0.2518,
"step": 2780
},
{
"epoch": 1.4169629253428135,
"grad_norm": 0.8307642936706543,
"learning_rate": 3.1712749316428487e-06,
"loss": 0.2433,
"step": 2790
},
{
"epoch": 1.4220416455053326,
"grad_norm": 0.7779731154441833,
"learning_rate": 3.157031616265871e-06,
"loss": 0.2543,
"step": 2800
},
{
"epoch": 1.4271203656678517,
"grad_norm": 0.7799838185310364,
"learning_rate": 3.1427653544759352e-06,
"loss": 0.247,
"step": 2810
},
{
"epoch": 1.4321990858303708,
"grad_norm": 0.7921550869941711,
"learning_rate": 3.1284766445131975e-06,
"loss": 0.2485,
"step": 2820
},
{
"epoch": 1.4372778059928897,
"grad_norm": 0.7986315488815308,
"learning_rate": 3.114165985401801e-06,
"loss": 0.2469,
"step": 2830
},
{
"epoch": 1.4423565261554088,
"grad_norm": 0.848434329032898,
"learning_rate": 3.09983387693245e-06,
"loss": 0.2492,
"step": 2840
},
{
"epoch": 1.4474352463179279,
"grad_norm": 0.7870414853096008,
"learning_rate": 3.085480819644951e-06,
"loss": 0.2417,
"step": 2850
},
{
"epoch": 1.452513966480447,
"grad_norm": 0.8498828411102295,
"learning_rate": 3.0711073148107395e-06,
"loss": 0.2533,
"step": 2860
},
{
"epoch": 1.457592686642966,
"grad_norm": 0.7977587580680847,
"learning_rate": 3.056713864415363e-06,
"loss": 0.2483,
"step": 2870
},
{
"epoch": 1.4626714068054851,
"grad_norm": 0.7781681418418884,
"learning_rate": 3.0423009711409614e-06,
"loss": 0.2502,
"step": 2880
},
{
"epoch": 1.467750126968004,
"grad_norm": 0.8061252236366272,
"learning_rate": 3.0278691383486992e-06,
"loss": 0.2491,
"step": 2890
},
{
"epoch": 1.472828847130523,
"grad_norm": 0.7913114428520203,
"learning_rate": 3.013418870061194e-06,
"loss": 0.2461,
"step": 2900
},
{
"epoch": 1.4779075672930422,
"grad_norm": 0.7775682806968689,
"learning_rate": 2.9989506709449123e-06,
"loss": 0.2498,
"step": 2910
},
{
"epoch": 1.4829862874555613,
"grad_norm": 0.77854323387146,
"learning_rate": 2.984465046292541e-06,
"loss": 0.2473,
"step": 2920
},
{
"epoch": 1.4880650076180801,
"grad_norm": 0.8510044813156128,
"learning_rate": 2.9699625020053457e-06,
"loss": 0.2443,
"step": 2930
},
{
"epoch": 1.4931437277805992,
"grad_norm": 0.823668897151947,
"learning_rate": 2.9554435445754976e-06,
"loss": 0.2386,
"step": 2940
},
{
"epoch": 1.4982224479431183,
"grad_norm": 0.7475908994674683,
"learning_rate": 2.9409086810683858e-06,
"loss": 0.2503,
"step": 2950
},
{
"epoch": 1.5033011681056374,
"grad_norm": 0.8057491183280945,
"learning_rate": 2.926358419104911e-06,
"loss": 0.2421,
"step": 2960
},
{
"epoch": 1.5083798882681565,
"grad_norm": 0.8842071890830994,
"learning_rate": 2.9117932668437542e-06,
"loss": 0.2515,
"step": 2970
},
{
"epoch": 1.5134586084306756,
"grad_norm": 0.7616868615150452,
"learning_rate": 2.8972137329636324e-06,
"loss": 0.2486,
"step": 2980
},
{
"epoch": 1.5185373285931947,
"grad_norm": 0.7204374074935913,
"learning_rate": 2.8826203266455276e-06,
"loss": 0.2406,
"step": 2990
},
{
"epoch": 1.5236160487557135,
"grad_norm": 0.801526665687561,
"learning_rate": 2.868013557554911e-06,
"loss": 0.2526,
"step": 3000
},
{
"epoch": 1.5236160487557135,
"eval_loss": 0.3125091791152954,
"eval_runtime": 121.6711,
"eval_samples_per_second": 41.094,
"eval_steps_per_second": 2.573,
"step": 3000
},
{
"epoch": 1.5286947689182326,
"grad_norm": 0.7821494936943054,
"learning_rate": 2.8533939358239405e-06,
"loss": 0.2447,
"step": 3010
},
{
"epoch": 1.5337734890807515,
"grad_norm": 0.7913290858268738,
"learning_rate": 2.838761972033643e-06,
"loss": 0.2508,
"step": 3020
},
{
"epoch": 1.5388522092432706,
"grad_norm": 1.1276675462722778,
"learning_rate": 2.824118177196083e-06,
"loss": 0.2481,
"step": 3030
},
{
"epoch": 1.5439309294057897,
"grad_norm": 0.7831458449363708,
"learning_rate": 2.8094630627365193e-06,
"loss": 0.2525,
"step": 3040
},
{
"epoch": 1.5490096495683088,
"grad_norm": 0.8312116861343384,
"learning_rate": 2.7947971404755392e-06,
"loss": 0.2503,
"step": 3050
},
{
"epoch": 1.5540883697308279,
"grad_norm": 0.7501511573791504,
"learning_rate": 2.7801209226111874e-06,
"loss": 0.2508,
"step": 3060
},
{
"epoch": 1.559167089893347,
"grad_norm": 0.8042319416999817,
"learning_rate": 2.765434921701075e-06,
"loss": 0.2435,
"step": 3070
},
{
"epoch": 1.564245810055866,
"grad_norm": 0.8284822702407837,
"learning_rate": 2.7507396506444805e-06,
"loss": 0.2468,
"step": 3080
},
{
"epoch": 1.569324530218385,
"grad_norm": 0.7171116471290588,
"learning_rate": 2.7360356226644342e-06,
"loss": 0.2473,
"step": 3090
},
{
"epoch": 1.574403250380904,
"grad_norm": 0.775512158870697,
"learning_rate": 2.721323351289799e-06,
"loss": 0.2468,
"step": 3100
},
{
"epoch": 1.579481970543423,
"grad_norm": 0.7975888252258301,
"learning_rate": 2.7066033503373323e-06,
"loss": 0.243,
"step": 3110
},
{
"epoch": 1.584560690705942,
"grad_norm": 0.8064377903938293,
"learning_rate": 2.6918761338937427e-06,
"loss": 0.2566,
"step": 3120
},
{
"epoch": 1.589639410868461,
"grad_norm": 0.8264314532279968,
"learning_rate": 2.677142216297733e-06,
"loss": 0.2518,
"step": 3130
},
{
"epoch": 1.5947181310309801,
"grad_norm": 0.8090101480484009,
"learning_rate": 2.6624021121220415e-06,
"loss": 0.2537,
"step": 3140
},
{
"epoch": 1.5997968511934992,
"grad_norm": 0.8687019944190979,
"learning_rate": 2.647656336155469e-06,
"loss": 0.248,
"step": 3150
},
{
"epoch": 1.6048755713560183,
"grad_norm": 0.75754714012146,
"learning_rate": 2.6329054033848994e-06,
"loss": 0.2435,
"step": 3160
},
{
"epoch": 1.6099542915185374,
"grad_norm": 0.7850366234779358,
"learning_rate": 2.6181498289773145e-06,
"loss": 0.2473,
"step": 3170
},
{
"epoch": 1.6150330116810565,
"grad_norm": 0.8010424375534058,
"learning_rate": 2.603390128261802e-06,
"loss": 0.2466,
"step": 3180
},
{
"epoch": 1.6201117318435754,
"grad_norm": 0.8364565968513489,
"learning_rate": 2.5886268167115597e-06,
"loss": 0.2463,
"step": 3190
},
{
"epoch": 1.6251904520060945,
"grad_norm": 0.7993068099021912,
"learning_rate": 2.5738604099258908e-06,
"loss": 0.2414,
"step": 3200
},
{
"epoch": 1.6302691721686136,
"grad_norm": 0.8057124018669128,
"learning_rate": 2.559091423612196e-06,
"loss": 0.2573,
"step": 3210
},
{
"epoch": 1.6353478923311324,
"grad_norm": 0.810699462890625,
"learning_rate": 2.5443203735679682e-06,
"loss": 0.2468,
"step": 3220
},
{
"epoch": 1.6404266124936515,
"grad_norm": 0.8280355334281921,
"learning_rate": 2.52954777566277e-06,
"loss": 0.2475,
"step": 3230
},
{
"epoch": 1.6455053326561706,
"grad_norm": 0.8114942312240601,
"learning_rate": 2.5147741458202266e-06,
"loss": 0.2528,
"step": 3240
},
{
"epoch": 1.6505840528186897,
"grad_norm": 0.7734003663063049,
"learning_rate": 2.5e-06,
"loss": 0.252,
"step": 3250
},
{
"epoch": 1.6556627729812088,
"grad_norm": 0.7834712862968445,
"learning_rate": 2.485225854179774e-06,
"loss": 0.2529,
"step": 3260
},
{
"epoch": 1.6607414931437279,
"grad_norm": 0.8254967331886292,
"learning_rate": 2.47045222433723e-06,
"loss": 0.2548,
"step": 3270
},
{
"epoch": 1.665820213306247,
"grad_norm": 0.7638046145439148,
"learning_rate": 2.455679626432032e-06,
"loss": 0.2472,
"step": 3280
},
{
"epoch": 1.6708989334687658,
"grad_norm": 0.7961717247962952,
"learning_rate": 2.4409085763878043e-06,
"loss": 0.2437,
"step": 3290
},
{
"epoch": 1.675977653631285,
"grad_norm": 0.8181645274162292,
"learning_rate": 2.426139590074111e-06,
"loss": 0.2418,
"step": 3300
},
{
"epoch": 1.681056373793804,
"grad_norm": 0.751830518245697,
"learning_rate": 2.4113731832884407e-06,
"loss": 0.239,
"step": 3310
},
{
"epoch": 1.6861350939563229,
"grad_norm": 0.7490795254707336,
"learning_rate": 2.396609871738199e-06,
"loss": 0.2474,
"step": 3320
},
{
"epoch": 1.691213814118842,
"grad_norm": 0.7789638042449951,
"learning_rate": 2.3818501710226867e-06,
"loss": 0.2502,
"step": 3330
},
{
"epoch": 1.696292534281361,
"grad_norm": 0.7721952795982361,
"learning_rate": 2.3670945966151014e-06,
"loss": 0.2571,
"step": 3340
},
{
"epoch": 1.7013712544438802,
"grad_norm": 0.8575606346130371,
"learning_rate": 2.3523436638445312e-06,
"loss": 0.2527,
"step": 3350
},
{
"epoch": 1.7064499746063992,
"grad_norm": 0.814814567565918,
"learning_rate": 2.3375978878779593e-06,
"loss": 0.2465,
"step": 3360
},
{
"epoch": 1.7115286947689183,
"grad_norm": 0.821048378944397,
"learning_rate": 2.322857783702268e-06,
"loss": 0.2515,
"step": 3370
},
{
"epoch": 1.7166074149314374,
"grad_norm": 0.8528531193733215,
"learning_rate": 2.3081238661062585e-06,
"loss": 0.253,
"step": 3380
},
{
"epoch": 1.7216861350939563,
"grad_norm": 0.7616355419158936,
"learning_rate": 2.2933966496626677e-06,
"loss": 0.2549,
"step": 3390
},
{
"epoch": 1.7267648552564754,
"grad_norm": 0.7662220001220703,
"learning_rate": 2.2786766487102014e-06,
"loss": 0.2565,
"step": 3400
},
{
"epoch": 1.7318435754189943,
"grad_norm": 0.730667769908905,
"learning_rate": 2.2639643773355666e-06,
"loss": 0.2405,
"step": 3410
},
{
"epoch": 1.7369222955815133,
"grad_norm": 0.7821247577667236,
"learning_rate": 2.2492603493555208e-06,
"loss": 0.2472,
"step": 3420
},
{
"epoch": 1.7420010157440324,
"grad_norm": 0.7954862117767334,
"learning_rate": 2.234565078298925e-06,
"loss": 0.2538,
"step": 3430
},
{
"epoch": 1.7470797359065515,
"grad_norm": 0.810747504234314,
"learning_rate": 2.219879077388813e-06,
"loss": 0.2542,
"step": 3440
},
{
"epoch": 1.7521584560690706,
"grad_norm": 0.8034137487411499,
"learning_rate": 2.2052028595244616e-06,
"loss": 0.254,
"step": 3450
},
{
"epoch": 1.7572371762315897,
"grad_norm": 0.826793909072876,
"learning_rate": 2.190536937263482e-06,
"loss": 0.2431,
"step": 3460
},
{
"epoch": 1.7623158963941088,
"grad_norm": 0.719495415687561,
"learning_rate": 2.175881822803917e-06,
"loss": 0.2467,
"step": 3470
},
{
"epoch": 1.7673946165566279,
"grad_norm": 0.7717339396476746,
"learning_rate": 2.1612380279663576e-06,
"loss": 0.2414,
"step": 3480
},
{
"epoch": 1.7724733367191468,
"grad_norm": 0.7720147371292114,
"learning_rate": 2.14660606417606e-06,
"loss": 0.2479,
"step": 3490
},
{
"epoch": 1.7775520568816658,
"grad_norm": 0.735954999923706,
"learning_rate": 2.1319864424450894e-06,
"loss": 0.254,
"step": 3500
},
{
"epoch": 1.7775520568816658,
"eval_loss": 0.30940133333206177,
"eval_runtime": 121.7572,
"eval_samples_per_second": 41.065,
"eval_steps_per_second": 2.571,
"step": 3500
},
{
"epoch": 1.7826307770441847,
"grad_norm": 0.8783974647521973,
"learning_rate": 2.117379673354473e-06,
"loss": 0.2461,
"step": 3510
},
{
"epoch": 1.7877094972067038,
"grad_norm": 0.683204174041748,
"learning_rate": 2.1027862670363685e-06,
"loss": 0.2475,
"step": 3520
},
{
"epoch": 1.792788217369223,
"grad_norm": 0.7428295612335205,
"learning_rate": 2.088206733156246e-06,
"loss": 0.2346,
"step": 3530
},
{
"epoch": 1.797866937531742,
"grad_norm": 0.754948079586029,
"learning_rate": 2.0736415808950898e-06,
"loss": 0.255,
"step": 3540
},
{
"epoch": 1.802945657694261,
"grad_norm": 0.754705548286438,
"learning_rate": 2.059091318931615e-06,
"loss": 0.2466,
"step": 3550
},
{
"epoch": 1.8080243778567802,
"grad_norm": 0.8315344452857971,
"learning_rate": 2.0445564554245033e-06,
"loss": 0.2431,
"step": 3560
},
{
"epoch": 1.8131030980192993,
"grad_norm": 0.7972525954246521,
"learning_rate": 2.030037497994655e-06,
"loss": 0.2556,
"step": 3570
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.8291880488395691,
"learning_rate": 2.0155349537074598e-06,
"loss": 0.2376,
"step": 3580
},
{
"epoch": 1.8232605383443372,
"grad_norm": 0.7485976815223694,
"learning_rate": 2.001049329055088e-06,
"loss": 0.2416,
"step": 3590
},
{
"epoch": 1.8283392585068563,
"grad_norm": 0.8340455293655396,
"learning_rate": 1.9865811299388062e-06,
"loss": 0.2447,
"step": 3600
},
{
"epoch": 1.8334179786693752,
"grad_norm": 0.8124203681945801,
"learning_rate": 1.972130861651302e-06,
"loss": 0.2475,
"step": 3610
},
{
"epoch": 1.8384966988318943,
"grad_norm": 0.8198763132095337,
"learning_rate": 1.95769902885904e-06,
"loss": 0.25,
"step": 3620
},
{
"epoch": 1.8435754189944134,
"grad_norm": 0.8861445188522339,
"learning_rate": 1.943286135584637e-06,
"loss": 0.245,
"step": 3630
},
{
"epoch": 1.8486541391569324,
"grad_norm": 0.8078780174255371,
"learning_rate": 1.9288926851892614e-06,
"loss": 0.2424,
"step": 3640
},
{
"epoch": 1.8537328593194515,
"grad_norm": 0.785547137260437,
"learning_rate": 1.9145191803550493e-06,
"loss": 0.2367,
"step": 3650
},
{
"epoch": 1.8588115794819706,
"grad_norm": 0.8039028644561768,
"learning_rate": 1.9001661230675516e-06,
"loss": 0.2434,
"step": 3660
},
{
"epoch": 1.8638902996444897,
"grad_norm": 0.8308824896812439,
"learning_rate": 1.8858340145981994e-06,
"loss": 0.2475,
"step": 3670
},
{
"epoch": 1.8689690198070086,
"grad_norm": 0.7723254561424255,
"learning_rate": 1.8715233554868035e-06,
"loss": 0.2409,
"step": 3680
},
{
"epoch": 1.8740477399695277,
"grad_norm": 0.8033102750778198,
"learning_rate": 1.8572346455240656e-06,
"loss": 0.2415,
"step": 3690
},
{
"epoch": 1.8791264601320468,
"grad_norm": 0.7221494913101196,
"learning_rate": 1.8429683837341306e-06,
"loss": 0.2499,
"step": 3700
},
{
"epoch": 1.8842051802945656,
"grad_norm": 0.8179773688316345,
"learning_rate": 1.828725068357151e-06,
"loss": 0.2499,
"step": 3710
},
{
"epoch": 1.8892839004570847,
"grad_norm": 0.730506420135498,
"learning_rate": 1.8145051968318966e-06,
"loss": 0.2431,
"step": 3720
},
{
"epoch": 1.8943626206196038,
"grad_norm": 0.8109707832336426,
"learning_rate": 1.800309265778369e-06,
"loss": 0.2417,
"step": 3730
},
{
"epoch": 1.899441340782123,
"grad_norm": 0.9620540738105774,
"learning_rate": 1.7861377709804687e-06,
"loss": 0.2538,
"step": 3740
},
{
"epoch": 1.904520060944642,
"grad_norm": 0.7754760980606079,
"learning_rate": 1.7719912073686712e-06,
"loss": 0.2414,
"step": 3750
},
{
"epoch": 1.909598781107161,
"grad_norm": 0.79917311668396,
"learning_rate": 1.7578700690027517e-06,
"loss": 0.2409,
"step": 3760
},
{
"epoch": 1.9146775012696802,
"grad_norm": 0.7494426965713501,
"learning_rate": 1.7437748490545191e-06,
"loss": 0.2414,
"step": 3770
},
{
"epoch": 1.919756221432199,
"grad_norm": 0.7564458847045898,
"learning_rate": 1.7297060397906023e-06,
"loss": 0.2424,
"step": 3780
},
{
"epoch": 1.9248349415947181,
"grad_norm": 0.7982223033905029,
"learning_rate": 1.7156641325552503e-06,
"loss": 0.2521,
"step": 3790
},
{
"epoch": 1.9299136617572372,
"grad_norm": 0.7981540560722351,
"learning_rate": 1.7016496177531792e-06,
"loss": 0.2399,
"step": 3800
},
{
"epoch": 1.934992381919756,
"grad_norm": 0.7966643571853638,
"learning_rate": 1.6876629848324391e-06,
"loss": 0.2336,
"step": 3810
},
{
"epoch": 1.9400711020822752,
"grad_norm": 0.7969587445259094,
"learning_rate": 1.6737047222673235e-06,
"loss": 0.25,
"step": 3820
},
{
"epoch": 1.9451498222447943,
"grad_norm": 0.7210260629653931,
"learning_rate": 1.6597753175413103e-06,
"loss": 0.2516,
"step": 3830
},
{
"epoch": 1.9502285424073134,
"grad_norm": 0.7629299163818359,
"learning_rate": 1.6458752571300358e-06,
"loss": 0.2422,
"step": 3840
},
{
"epoch": 1.9553072625698324,
"grad_norm": 1.0255619287490845,
"learning_rate": 1.632005026484303e-06,
"loss": 0.2467,
"step": 3850
},
{
"epoch": 1.9603859827323515,
"grad_norm": 0.8044396042823792,
"learning_rate": 1.6181651100131302e-06,
"loss": 0.2478,
"step": 3860
},
{
"epoch": 1.9654647028948706,
"grad_norm": 0.8026655912399292,
"learning_rate": 1.6043559910668315e-06,
"loss": 0.2466,
"step": 3870
},
{
"epoch": 1.9705434230573895,
"grad_norm": 0.7410418391227722,
"learning_rate": 1.5905781519201398e-06,
"loss": 0.2401,
"step": 3880
},
{
"epoch": 1.9756221432199086,
"grad_norm": 0.8294776082038879,
"learning_rate": 1.576832073755358e-06,
"loss": 0.2548,
"step": 3890
},
{
"epoch": 1.9807008633824277,
"grad_norm": 0.7692272067070007,
"learning_rate": 1.5631182366455566e-06,
"loss": 0.2394,
"step": 3900
},
{
"epoch": 1.9857795835449465,
"grad_norm": 0.8169528841972351,
"learning_rate": 1.54943711953781e-06,
"loss": 0.2396,
"step": 3910
},
{
"epoch": 1.9908583037074656,
"grad_norm": 0.8071213960647583,
"learning_rate": 1.5357892002364649e-06,
"loss": 0.2435,
"step": 3920
},
{
"epoch": 1.9959370238699847,
"grad_norm": 0.8226195573806763,
"learning_rate": 1.5221749553864578e-06,
"loss": 0.2416,
"step": 3930
},
{
"epoch": 2.001015744032504,
"grad_norm": 0.7779680490493774,
"learning_rate": 1.5085948604566647e-06,
"loss": 0.2302,
"step": 3940
},
{
"epoch": 2.006094464195023,
"grad_norm": 0.793228268623352,
"learning_rate": 1.4950493897232967e-06,
"loss": 0.1941,
"step": 3950
},
{
"epoch": 2.011173184357542,
"grad_norm": 0.8707436919212341,
"learning_rate": 1.4815390162533397e-06,
"loss": 0.1885,
"step": 3960
},
{
"epoch": 2.016251904520061,
"grad_norm": 0.887116014957428,
"learning_rate": 1.4680642118880275e-06,
"loss": 0.1911,
"step": 3970
},
{
"epoch": 2.02133062468258,
"grad_norm": 0.8474360108375549,
"learning_rate": 1.454625447226366e-06,
"loss": 0.1874,
"step": 3980
},
{
"epoch": 2.026409344845099,
"grad_norm": 0.8413044214248657,
"learning_rate": 1.441223191608696e-06,
"loss": 0.1862,
"step": 3990
},
{
"epoch": 2.031488065007618,
"grad_norm": 0.8266652822494507,
"learning_rate": 1.4278579131003067e-06,
"loss": 0.1867,
"step": 4000
},
{
"epoch": 2.031488065007618,
"eval_loss": 0.33027777075767517,
"eval_runtime": 121.5959,
"eval_samples_per_second": 41.12,
"eval_steps_per_second": 2.574,
"step": 4000
},
{
"epoch": 2.036566785170137,
"grad_norm": 0.8005456924438477,
"learning_rate": 1.414530078475082e-06,
"loss": 0.1844,
"step": 4010
},
{
"epoch": 2.041645505332656,
"grad_norm": 0.8141132593154907,
"learning_rate": 1.4012401531992013e-06,
"loss": 0.1806,
"step": 4020
},
{
"epoch": 2.046724225495175,
"grad_norm": 0.8092536330223083,
"learning_rate": 1.3879886014148864e-06,
"loss": 0.1834,
"step": 4030
},
{
"epoch": 2.0518029456576943,
"grad_norm": 0.829396665096283,
"learning_rate": 1.3747758859241896e-06,
"loss": 0.1832,
"step": 4040
},
{
"epoch": 2.0568816658202134,
"grad_norm": 0.7657766342163086,
"learning_rate": 1.3616024681728278e-06,
"loss": 0.1751,
"step": 4050
},
{
"epoch": 2.0619603859827325,
"grad_norm": 0.8640854954719543,
"learning_rate": 1.3484688082340708e-06,
"loss": 0.1824,
"step": 4060
},
{
"epoch": 2.0670391061452515,
"grad_norm": 0.8462369441986084,
"learning_rate": 1.3353753647926701e-06,
"loss": 0.187,
"step": 4070
},
{
"epoch": 2.0721178263077706,
"grad_norm": 0.7753511667251587,
"learning_rate": 1.3223225951288449e-06,
"loss": 0.1772,
"step": 4080
},
{
"epoch": 2.0771965464702893,
"grad_norm": 0.936180830001831,
"learning_rate": 1.3093109551023058e-06,
"loss": 0.182,
"step": 4090
},
{
"epoch": 2.0822752666328084,
"grad_norm": 0.8638271689414978,
"learning_rate": 1.2963408991363374e-06,
"loss": 0.1814,
"step": 4100
},
{
"epoch": 2.0873539867953275,
"grad_norm": 0.8632501363754272,
"learning_rate": 1.283412880201927e-06,
"loss": 0.1816,
"step": 4110
},
{
"epoch": 2.0924327069578466,
"grad_norm": 0.8283603191375732,
"learning_rate": 1.270527349801946e-06,
"loss": 0.1826,
"step": 4120
},
{
"epoch": 2.0975114271203656,
"grad_norm": 0.8857740163803101,
"learning_rate": 1.2576847579553826e-06,
"loss": 0.1872,
"step": 4130
},
{
"epoch": 2.1025901472828847,
"grad_norm": 0.7950591444969177,
"learning_rate": 1.2448855531816184e-06,
"loss": 0.1815,
"step": 4140
},
{
"epoch": 2.107668867445404,
"grad_norm": 0.9257445931434631,
"learning_rate": 1.232130182484772e-06,
"loss": 0.1841,
"step": 4150
},
{
"epoch": 2.112747587607923,
"grad_norm": 0.9456517696380615,
"learning_rate": 1.2194190913380858e-06,
"loss": 0.1844,
"step": 4160
},
{
"epoch": 2.117826307770442,
"grad_norm": 0.9262843728065491,
"learning_rate": 1.206752723668364e-06,
"loss": 0.1833,
"step": 4170
},
{
"epoch": 2.122905027932961,
"grad_norm": 0.8877847194671631,
"learning_rate": 1.194131521840474e-06,
"loss": 0.1845,
"step": 4180
},
{
"epoch": 2.1279837480954797,
"grad_norm": 0.9533064365386963,
"learning_rate": 1.181555926641892e-06,
"loss": 0.1871,
"step": 4190
},
{
"epoch": 2.133062468257999,
"grad_norm": 0.8590644598007202,
"learning_rate": 1.1690263772673158e-06,
"loss": 0.1872,
"step": 4200
},
{
"epoch": 2.138141188420518,
"grad_norm": 0.8177663087844849,
"learning_rate": 1.1565433113033176e-06,
"loss": 0.1821,
"step": 4210
},
{
"epoch": 2.143219908583037,
"grad_norm": 0.8513428568840027,
"learning_rate": 1.14410716471307e-06,
"loss": 0.1796,
"step": 4220
},
{
"epoch": 2.148298628745556,
"grad_norm": 0.7657913565635681,
"learning_rate": 1.131718371821112e-06,
"loss": 0.184,
"step": 4230
},
{
"epoch": 2.153377348908075,
"grad_norm": 0.8900587558746338,
"learning_rate": 1.119377365298189e-06,
"loss": 0.1851,
"step": 4240
},
{
"epoch": 2.1584560690705943,
"grad_norm": 0.8556007742881775,
"learning_rate": 1.1070845761461347e-06,
"loss": 0.1861,
"step": 4250
},
{
"epoch": 2.1635347892331134,
"grad_norm": 0.8326412439346313,
"learning_rate": 1.0948404336828222e-06,
"loss": 0.1816,
"step": 4260
},
{
"epoch": 2.1686135093956325,
"grad_norm": 0.8394993543624878,
"learning_rate": 1.0826453655271693e-06,
"loss": 0.1884,
"step": 4270
},
{
"epoch": 2.173692229558151,
"grad_norm": 0.9060940742492676,
"learning_rate": 1.0704997975842075e-06,
"loss": 0.1851,
"step": 4280
},
{
"epoch": 2.17877094972067,
"grad_norm": 0.816910445690155,
"learning_rate": 1.0584041540302009e-06,
"loss": 0.185,
"step": 4290
},
{
"epoch": 2.1838496698831893,
"grad_norm": 0.8141408562660217,
"learning_rate": 1.0463588572978399e-06,
"loss": 0.1874,
"step": 4300
},
{
"epoch": 2.1889283900457084,
"grad_norm": 0.8770137429237366,
"learning_rate": 1.0343643280614798e-06,
"loss": 0.1812,
"step": 4310
},
{
"epoch": 2.1940071102082275,
"grad_norm": 0.8542727828025818,
"learning_rate": 1.0224209852224573e-06,
"loss": 0.1858,
"step": 4320
},
{
"epoch": 2.1990858303707466,
"grad_norm": 0.7976948618888855,
"learning_rate": 1.010529245894454e-06,
"loss": 0.1782,
"step": 4330
},
{
"epoch": 2.2041645505332657,
"grad_norm": 0.8359585404396057,
"learning_rate": 9.986895253889322e-07,
"loss": 0.1865,
"step": 4340
},
{
"epoch": 2.2092432706957847,
"grad_norm": 0.8453836441040039,
"learning_rate": 9.86902237200629e-07,
"loss": 0.1836,
"step": 4350
},
{
"epoch": 2.214321990858304,
"grad_norm": 0.9079688191413879,
"learning_rate": 9.751677929931189e-07,
"loss": 0.1879,
"step": 4360
},
{
"epoch": 2.219400711020823,
"grad_norm": 0.9282172918319702,
"learning_rate": 9.634866025844306e-07,
"loss": 0.1838,
"step": 4370
},
{
"epoch": 2.224479431183342,
"grad_norm": 0.9397227168083191,
"learning_rate": 9.518590739327382e-07,
"loss": 0.1846,
"step": 4380
},
{
"epoch": 2.2295581513458607,
"grad_norm": 0.817206084728241,
"learning_rate": 9.402856131221144e-07,
"loss": 0.1826,
"step": 4390
},
{
"epoch": 2.2346368715083798,
"grad_norm": 0.8512512445449829,
"learning_rate": 9.287666243483473e-07,
"loss": 0.1813,
"step": 4400
},
{
"epoch": 2.239715591670899,
"grad_norm": 0.8650760054588318,
"learning_rate": 9.17302509904821e-07,
"loss": 0.1823,
"step": 4410
},
{
"epoch": 2.244794311833418,
"grad_norm": 0.8736887574195862,
"learning_rate": 9.058936701684698e-07,
"loss": 0.188,
"step": 4420
},
{
"epoch": 2.249873031995937,
"grad_norm": 0.8562535047531128,
"learning_rate": 8.945405035857932e-07,
"loss": 0.1864,
"step": 4430
},
{
"epoch": 2.254951752158456,
"grad_norm": 0.9099084734916687,
"learning_rate": 8.832434066589432e-07,
"loss": 0.189,
"step": 4440
},
{
"epoch": 2.260030472320975,
"grad_norm": 0.9459706544876099,
"learning_rate": 8.720027739318724e-07,
"loss": 0.1851,
"step": 4450
},
{
"epoch": 2.2651091924834943,
"grad_norm": 0.9423843026161194,
"learning_rate": 8.608189979765563e-07,
"loss": 0.1854,
"step": 4460
},
{
"epoch": 2.2701879126460134,
"grad_norm": 0.9056732654571533,
"learning_rate": 8.496924693792872e-07,
"loss": 0.1869,
"step": 4470
},
{
"epoch": 2.275266632808532,
"grad_norm": 0.8627443909645081,
"learning_rate": 8.386235767270256e-07,
"loss": 0.1873,
"step": 4480
},
{
"epoch": 2.280345352971051,
"grad_norm": 0.9132104516029358,
"learning_rate": 8.27612706593837e-07,
"loss": 0.1859,
"step": 4490
},
{
"epoch": 2.28542407313357,
"grad_norm": 0.878332257270813,
"learning_rate": 8.166602435273832e-07,
"loss": 0.1863,
"step": 4500
},
{
"epoch": 2.28542407313357,
"eval_loss": 0.3316808044910431,
"eval_runtime": 121.6909,
"eval_samples_per_second": 41.088,
"eval_steps_per_second": 2.572,
"step": 4500
},
{
"epoch": 2.2905027932960893,
"grad_norm": 0.9403025507926941,
"learning_rate": 8.057665700354999e-07,
"loss": 0.1839,
"step": 4510
},
{
"epoch": 2.2955815134586084,
"grad_norm": 0.8784217834472656,
"learning_rate": 7.949320665728319e-07,
"loss": 0.1876,
"step": 4520
},
{
"epoch": 2.3006602336211275,
"grad_norm": 1.0212981700897217,
"learning_rate": 7.841571115275487e-07,
"loss": 0.1804,
"step": 4530
},
{
"epoch": 2.3057389537836466,
"grad_norm": 0.850695013999939,
"learning_rate": 7.734420812081283e-07,
"loss": 0.1821,
"step": 4540
},
{
"epoch": 2.3108176739461657,
"grad_norm": 0.9093451499938965,
"learning_rate": 7.62787349830218e-07,
"loss": 0.1867,
"step": 4550
},
{
"epoch": 2.3158963941086848,
"grad_norm": 1.0111429691314697,
"learning_rate": 7.521932895035605e-07,
"loss": 0.1847,
"step": 4560
},
{
"epoch": 2.320975114271204,
"grad_norm": 1.0417455434799194,
"learning_rate": 7.416602702190004e-07,
"loss": 0.1804,
"step": 4570
},
{
"epoch": 2.326053834433723,
"grad_norm": 0.8404303193092346,
"learning_rate": 7.311886598355642e-07,
"loss": 0.1832,
"step": 4580
},
{
"epoch": 2.3311325545962416,
"grad_norm": 0.8522669672966003,
"learning_rate": 7.207788240676108e-07,
"loss": 0.1847,
"step": 4590
},
{
"epoch": 2.3362112747587607,
"grad_norm": 0.7680218815803528,
"learning_rate": 7.104311264720598e-07,
"loss": 0.1843,
"step": 4600
},
{
"epoch": 2.3412899949212798,
"grad_norm": 0.8408559560775757,
"learning_rate": 7.001459284356938e-07,
"loss": 0.1816,
"step": 4610
},
{
"epoch": 2.346368715083799,
"grad_norm": 0.8629699945449829,
"learning_rate": 6.899235891625372e-07,
"loss": 0.1845,
"step": 4620
},
{
"epoch": 2.351447435246318,
"grad_norm": 0.8334382176399231,
"learning_rate": 6.79764465661315e-07,
"loss": 0.1828,
"step": 4630
},
{
"epoch": 2.356526155408837,
"grad_norm": 0.8202574849128723,
"learning_rate": 6.696689127329792e-07,
"loss": 0.184,
"step": 4640
},
{
"epoch": 2.361604875571356,
"grad_norm": 0.901674747467041,
"learning_rate": 6.596372829583184e-07,
"loss": 0.1855,
"step": 4650
},
{
"epoch": 2.366683595733875,
"grad_norm": 0.8691708445549011,
"learning_rate": 6.496699266856493e-07,
"loss": 0.188,
"step": 4660
},
{
"epoch": 2.3717623158963943,
"grad_norm": 1.0007835626602173,
"learning_rate": 6.397671920185738e-07,
"loss": 0.1808,
"step": 4670
},
{
"epoch": 2.376841036058913,
"grad_norm": 0.8889086842536926,
"learning_rate": 6.299294248038281e-07,
"loss": 0.1828,
"step": 4680
},
{
"epoch": 2.381919756221432,
"grad_norm": 0.9149643182754517,
"learning_rate": 6.201569686191988e-07,
"loss": 0.1861,
"step": 4690
},
{
"epoch": 2.386998476383951,
"grad_norm": 0.8830495476722717,
"learning_rate": 6.104501647615265e-07,
"loss": 0.1919,
"step": 4700
},
{
"epoch": 2.39207719654647,
"grad_norm": 0.886160671710968,
"learning_rate": 6.00809352234788e-07,
"loss": 0.1847,
"step": 4710
},
{
"epoch": 2.3971559167089893,
"grad_norm": 0.9045748114585876,
"learning_rate": 5.912348677382523e-07,
"loss": 0.1865,
"step": 4720
},
{
"epoch": 2.4022346368715084,
"grad_norm": 0.9505787491798401,
"learning_rate": 5.81727045654725e-07,
"loss": 0.1873,
"step": 4730
},
{
"epoch": 2.4073133570340275,
"grad_norm": 0.9555074572563171,
"learning_rate": 5.722862180388683e-07,
"loss": 0.1823,
"step": 4740
},
{
"epoch": 2.4123920771965466,
"grad_norm": 0.8942949175834656,
"learning_rate": 5.629127146056062e-07,
"loss": 0.1804,
"step": 4750
},
{
"epoch": 2.4174707973590657,
"grad_norm": 0.883151113986969,
"learning_rate": 5.536068627186089e-07,
"loss": 0.184,
"step": 4760
},
{
"epoch": 2.4225495175215848,
"grad_norm": 0.8733017444610596,
"learning_rate": 5.443689873788572e-07,
"loss": 0.1855,
"step": 4770
},
{
"epoch": 2.427628237684104,
"grad_norm": 0.8522325158119202,
"learning_rate": 5.351994112132944e-07,
"loss": 0.1845,
"step": 4780
},
{
"epoch": 2.4327069578466225,
"grad_norm": 0.8934102058410645,
"learning_rate": 5.260984544635603e-07,
"loss": 0.1869,
"step": 4790
},
{
"epoch": 2.4377856780091416,
"grad_norm": 0.9026458263397217,
"learning_rate": 5.170664349748031e-07,
"loss": 0.1836,
"step": 4800
},
{
"epoch": 2.4428643981716607,
"grad_norm": 0.8497732877731323,
"learning_rate": 5.081036681845813e-07,
"loss": 0.1896,
"step": 4810
},
{
"epoch": 2.4479431183341798,
"grad_norm": 0.9134292006492615,
"learning_rate": 4.99210467111847e-07,
"loss": 0.1869,
"step": 4820
},
{
"epoch": 2.453021838496699,
"grad_norm": 0.8175578117370605,
"learning_rate": 4.903871423460141e-07,
"loss": 0.1845,
"step": 4830
},
{
"epoch": 2.458100558659218,
"grad_norm": 0.7790173292160034,
"learning_rate": 4.816340020361096e-07,
"loss": 0.1766,
"step": 4840
},
{
"epoch": 2.463179278821737,
"grad_norm": 0.869907557964325,
"learning_rate": 4.7295135188001465e-07,
"loss": 0.1829,
"step": 4850
},
{
"epoch": 2.468257998984256,
"grad_norm": 0.7368974089622498,
"learning_rate": 4.6433949511378417e-07,
"loss": 0.1861,
"step": 4860
},
{
"epoch": 2.4733367191467748,
"grad_norm": 0.9023917317390442,
"learning_rate": 4.557987325010613e-07,
"loss": 0.1865,
"step": 4870
},
{
"epoch": 2.478415439309294,
"grad_norm": 0.8636237382888794,
"learning_rate": 4.4732936232256855e-07,
"loss": 0.1812,
"step": 4880
},
{
"epoch": 2.483494159471813,
"grad_norm": 0.8983198404312134,
"learning_rate": 4.389316803656943e-07,
"loss": 0.1918,
"step": 4890
},
{
"epoch": 2.488572879634332,
"grad_norm": 0.8668248057365417,
"learning_rate": 4.3060597991415987e-07,
"loss": 0.178,
"step": 4900
},
{
"epoch": 2.493651599796851,
"grad_norm": 0.9295836091041565,
"learning_rate": 4.223525517377805e-07,
"loss": 0.1864,
"step": 4910
},
{
"epoch": 2.4987303199593702,
"grad_norm": 0.8381646871566772,
"learning_rate": 4.1417168408230596e-07,
"loss": 0.1849,
"step": 4920
},
{
"epoch": 2.5038090401218893,
"grad_norm": 0.9747976660728455,
"learning_rate": 4.060636626593556e-07,
"loss": 0.1816,
"step": 4930
},
{
"epoch": 2.5088877602844084,
"grad_norm": 0.933236837387085,
"learning_rate": 3.9802877063644193e-07,
"loss": 0.1833,
"step": 4940
},
{
"epoch": 2.5139664804469275,
"grad_norm": 0.8036865592002869,
"learning_rate": 3.9006728862707925e-07,
"loss": 0.1781,
"step": 4950
},
{
"epoch": 2.5190452006094466,
"grad_norm": 0.8587754368782043,
"learning_rate": 3.8217949468098205e-07,
"loss": 0.1813,
"step": 4960
},
{
"epoch": 2.5241239207719657,
"grad_norm": 0.958289384841919,
"learning_rate": 3.7436566427435675e-07,
"loss": 0.1915,
"step": 4970
},
{
"epoch": 2.5292026409344848,
"grad_norm": 1.040694236755371,
"learning_rate": 3.6662607030028e-07,
"loss": 0.1843,
"step": 4980
},
{
"epoch": 2.5342813610970034,
"grad_norm": 0.8534585237503052,
"learning_rate": 3.589609830591692e-07,
"loss": 0.1877,
"step": 4990
},
{
"epoch": 2.5393600812595225,
"grad_norm": 0.8737279772758484,
"learning_rate": 3.513706702493394e-07,
"loss": 0.1825,
"step": 5000
},
{
"epoch": 2.5393600812595225,
"eval_loss": 0.3309638500213623,
"eval_runtime": 121.7997,
"eval_samples_per_second": 41.051,
"eval_steps_per_second": 2.57,
"step": 5000
},
{
"epoch": 2.5444388014220416,
"grad_norm": 0.8709063529968262,
"learning_rate": 3.438553969576569e-07,
"loss": 0.1831,
"step": 5010
},
{
"epoch": 2.5495175215845607,
"grad_norm": 0.8241310715675354,
"learning_rate": 3.364154256502808e-07,
"loss": 0.184,
"step": 5020
},
{
"epoch": 2.5545962417470798,
"grad_norm": 0.8842706680297852,
"learning_rate": 3.2905101616349497e-07,
"loss": 0.1854,
"step": 5030
},
{
"epoch": 2.559674961909599,
"grad_norm": 0.9000037908554077,
"learning_rate": 3.217624256946361e-07,
"loss": 0.1832,
"step": 5040
},
{
"epoch": 2.564753682072118,
"grad_norm": 0.8701291680335999,
"learning_rate": 3.1454990879310866e-07,
"loss": 0.1827,
"step": 5050
},
{
"epoch": 2.5698324022346366,
"grad_norm": 0.8461666703224182,
"learning_rate": 3.0741371735149544e-07,
"loss": 0.1825,
"step": 5060
},
{
"epoch": 2.5749111223971557,
"grad_norm": 0.8629067540168762,
"learning_rate": 3.003541005967628e-07,
"loss": 0.1785,
"step": 5070
},
{
"epoch": 2.579989842559675,
"grad_norm": 0.9108079075813293,
"learning_rate": 2.9337130508155287e-07,
"loss": 0.1784,
"step": 5080
},
{
"epoch": 2.585068562722194,
"grad_norm": 0.8795527219772339,
"learning_rate": 2.8646557467557514e-07,
"loss": 0.1831,
"step": 5090
},
{
"epoch": 2.590147282884713,
"grad_norm": 0.871895432472229,
"learning_rate": 2.796371505570888e-07,
"loss": 0.1865,
"step": 5100
},
{
"epoch": 2.595226003047232,
"grad_norm": 0.8901582956314087,
"learning_rate": 2.728862712044811e-07,
"loss": 0.1794,
"step": 5110
},
{
"epoch": 2.600304723209751,
"grad_norm": 0.8318222165107727,
"learning_rate": 2.662131723879366e-07,
"loss": 0.1859,
"step": 5120
},
{
"epoch": 2.6053834433722702,
"grad_norm": 0.8986064791679382,
"learning_rate": 2.5961808716120364e-07,
"loss": 0.1797,
"step": 5130
},
{
"epoch": 2.6104621635347893,
"grad_norm": 0.9209254384040833,
"learning_rate": 2.531012458534551e-07,
"loss": 0.1812,
"step": 5140
},
{
"epoch": 2.6155408836973084,
"grad_norm": 0.9384124279022217,
"learning_rate": 2.466628760612463e-07,
"loss": 0.1874,
"step": 5150
},
{
"epoch": 2.6206196038598275,
"grad_norm": 0.8080311417579651,
"learning_rate": 2.40303202640563e-07,
"loss": 0.1774,
"step": 5160
},
{
"epoch": 2.6256983240223466,
"grad_norm": 0.8578788042068481,
"learning_rate": 2.3402244769896998e-07,
"loss": 0.1843,
"step": 5170
},
{
"epoch": 2.6307770441848657,
"grad_norm": 0.9375886917114258,
"learning_rate": 2.2782083058785458e-07,
"loss": 0.1829,
"step": 5180
},
{
"epoch": 2.6358557643473843,
"grad_norm": 0.819479763507843,
"learning_rate": 2.216985678947664e-07,
"loss": 0.1821,
"step": 5190
},
{
"epoch": 2.6409344845099034,
"grad_norm": 0.899229109287262,
"learning_rate": 2.156558734358505e-07,
"loss": 0.1774,
"step": 5200
},
{
"epoch": 2.6460132046724225,
"grad_norm": 0.9143205881118774,
"learning_rate": 2.0969295824838336e-07,
"loss": 0.1852,
"step": 5210
},
{
"epoch": 2.6510919248349416,
"grad_norm": 0.8006069660186768,
"learning_rate": 2.0381003058339982e-07,
"loss": 0.1832,
"step": 5220
},
{
"epoch": 2.6561706449974607,
"grad_norm": 0.9519758820533752,
"learning_rate": 1.9800729589842222e-07,
"loss": 0.1763,
"step": 5230
},
{
"epoch": 2.66124936515998,
"grad_norm": 0.8859177827835083,
"learning_rate": 1.92284956850283e-07,
"loss": 0.1815,
"step": 5240
},
{
"epoch": 2.666328085322499,
"grad_norm": 0.9211258888244629,
"learning_rate": 1.866432132880483e-07,
"loss": 0.1834,
"step": 5250
},
{
"epoch": 2.6714068054850175,
"grad_norm": 0.993823766708374,
"learning_rate": 1.8108226224603732e-07,
"loss": 0.1804,
"step": 5260
},
{
"epoch": 2.6764855256475366,
"grad_norm": 0.840509295463562,
"learning_rate": 1.7560229793694288e-07,
"loss": 0.1828,
"step": 5270
},
{
"epoch": 2.6815642458100557,
"grad_norm": 0.9131551384925842,
"learning_rate": 1.702035117450468e-07,
"loss": 0.1901,
"step": 5280
},
{
"epoch": 2.686642965972575,
"grad_norm": 0.8990755677223206,
"learning_rate": 1.6488609221953612e-07,
"loss": 0.1797,
"step": 5290
},
{
"epoch": 2.691721686135094,
"grad_norm": 0.8473220467567444,
"learning_rate": 1.596502250679194e-07,
"loss": 0.1799,
"step": 5300
},
{
"epoch": 2.696800406297613,
"grad_norm": 0.8115700483322144,
"learning_rate": 1.5449609314954012e-07,
"loss": 0.1849,
"step": 5310
},
{
"epoch": 2.701879126460132,
"grad_norm": 0.9483342170715332,
"learning_rate": 1.494238764691902e-07,
"loss": 0.181,
"step": 5320
},
{
"epoch": 2.706957846622651,
"grad_norm": 0.8683326840400696,
"learning_rate": 1.444337521708236e-07,
"loss": 0.1806,
"step": 5330
},
{
"epoch": 2.7120365667851702,
"grad_norm": 0.9478078484535217,
"learning_rate": 1.3952589453137017e-07,
"loss": 0.1803,
"step": 5340
},
{
"epoch": 2.7171152869476893,
"grad_norm": 0.8961120843887329,
"learning_rate": 1.3470047495464905e-07,
"loss": 0.1779,
"step": 5350
},
{
"epoch": 2.7221940071102084,
"grad_norm": 0.8967982530593872,
"learning_rate": 1.2995766196538194e-07,
"loss": 0.179,
"step": 5360
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.9021346569061279,
"learning_rate": 1.252976212033072e-07,
"loss": 0.1862,
"step": 5370
},
{
"epoch": 2.732351447435246,
"grad_norm": 0.840735912322998,
"learning_rate": 1.2072051541739682e-07,
"loss": 0.171,
"step": 5380
},
{
"epoch": 2.7374301675977653,
"grad_norm": 0.9057764410972595,
"learning_rate": 1.1622650446017042e-07,
"loss": 0.1785,
"step": 5390
},
{
"epoch": 2.7425088877602843,
"grad_norm": 0.8543335795402527,
"learning_rate": 1.118157452821142e-07,
"loss": 0.1764,
"step": 5400
},
{
"epoch": 2.7475876079228034,
"grad_norm": 0.8573223352432251,
"learning_rate": 1.0748839192619764e-07,
"loss": 0.1809,
"step": 5410
},
{
"epoch": 2.7526663280853225,
"grad_norm": 0.8379035592079163,
"learning_rate": 1.0324459552249505e-07,
"loss": 0.1804,
"step": 5420
},
{
"epoch": 2.7577450482478416,
"grad_norm": 0.9201931953430176,
"learning_rate": 9.908450428290806e-08,
"loss": 0.1837,
"step": 5430
},
{
"epoch": 2.7628237684103607,
"grad_norm": 0.853122889995575,
"learning_rate": 9.500826349598729e-08,
"loss": 0.1779,
"step": 5440
},
{
"epoch": 2.76790248857288,
"grad_norm": 0.8565370440483093,
"learning_rate": 9.101601552185951e-08,
"loss": 0.1853,
"step": 5450
},
{
"epoch": 2.7729812087353984,
"grad_norm": 0.9906251430511475,
"learning_rate": 8.710789978725653e-08,
"loss": 0.1817,
"step": 5460
},
{
"epoch": 2.7780599288979175,
"grad_norm": 0.9657715559005737,
"learning_rate": 8.328405278064417e-08,
"loss": 0.1843,
"step": 5470
},
{
"epoch": 2.7831386490604366,
"grad_norm": 0.8711429834365845,
"learning_rate": 7.954460804745712e-08,
"loss": 0.1811,
"step": 5480
},
{
"epoch": 2.7882173692229557,
"grad_norm": 0.8879536986351013,
"learning_rate": 7.588969618543357e-08,
"loss": 0.1873,
"step": 5490
},
{
"epoch": 2.793296089385475,
"grad_norm": 0.8269737958908081,
"learning_rate": 7.231944484005437e-08,
"loss": 0.1787,
"step": 5500
},
{
"epoch": 2.793296089385475,
"eval_loss": 0.3309679627418518,
"eval_runtime": 121.7053,
"eval_samples_per_second": 41.083,
"eval_steps_per_second": 2.572,
"step": 5500
},
{
"epoch": 2.798374809547994,
"grad_norm": 0.9365456700325012,
"learning_rate": 6.883397870008662e-08,
"loss": 0.1831,
"step": 5510
},
{
"epoch": 2.803453529710513,
"grad_norm": 0.8533931374549866,
"learning_rate": 6.543341949322657e-08,
"loss": 0.1851,
"step": 5520
},
{
"epoch": 2.808532249873032,
"grad_norm": 0.9121244549751282,
"learning_rate": 6.211788598185081e-08,
"loss": 0.182,
"step": 5530
},
{
"epoch": 2.813610970035551,
"grad_norm": 0.8693413734436035,
"learning_rate": 5.8887493958866004e-08,
"loss": 0.1809,
"step": 5540
},
{
"epoch": 2.8186896901980703,
"grad_norm": 0.8991943001747131,
"learning_rate": 5.574235624366764e-08,
"loss": 0.1875,
"step": 5550
},
{
"epoch": 2.8237684103605893,
"grad_norm": 0.9877569079399109,
"learning_rate": 5.2682582678197644e-08,
"loss": 0.1777,
"step": 5560
},
{
"epoch": 2.8288471305231084,
"grad_norm": 0.9565731883049011,
"learning_rate": 4.970828012310969e-08,
"loss": 0.1856,
"step": 5570
},
{
"epoch": 2.833925850685627,
"grad_norm": 0.8290345668792725,
"learning_rate": 4.681955245403602e-08,
"loss": 0.1872,
"step": 5580
},
{
"epoch": 2.839004570848146,
"grad_norm": 0.7747939825057983,
"learning_rate": 4.401650055796042e-08,
"loss": 0.1811,
"step": 5590
},
{
"epoch": 2.8440832910106653,
"grad_norm": 0.8827708959579468,
"learning_rate": 4.1299222329694574e-08,
"loss": 0.177,
"step": 5600
},
{
"epoch": 2.8491620111731844,
"grad_norm": 0.8102378249168396,
"learning_rate": 3.8667812668459204e-08,
"loss": 0.1863,
"step": 5610
},
{
"epoch": 2.8542407313357034,
"grad_norm": 0.8598487377166748,
"learning_rate": 3.612236347456943e-08,
"loss": 0.1799,
"step": 5620
},
{
"epoch": 2.8593194514982225,
"grad_norm": 0.9280297160148621,
"learning_rate": 3.366296364622629e-08,
"loss": 0.1836,
"step": 5630
},
{
"epoch": 2.8643981716607416,
"grad_norm": 0.8947204947471619,
"learning_rate": 3.128969907641027e-08,
"loss": 0.1813,
"step": 5640
},
{
"epoch": 2.8694768918232603,
"grad_norm": 0.9226625561714172,
"learning_rate": 2.9002652649882945e-08,
"loss": 0.1851,
"step": 5650
},
{
"epoch": 2.8745556119857794,
"grad_norm": 0.8815863728523254,
"learning_rate": 2.6801904240292275e-08,
"loss": 0.1795,
"step": 5660
},
{
"epoch": 2.8796343321482984,
"grad_norm": 0.9666081070899963,
"learning_rate": 2.4687530707381836e-08,
"loss": 0.1883,
"step": 5670
},
{
"epoch": 2.8847130523108175,
"grad_norm": 0.8660172820091248,
"learning_rate": 2.265960589430821e-08,
"loss": 0.1755,
"step": 5680
},
{
"epoch": 2.8897917724733366,
"grad_norm": 0.9310330152511597,
"learning_rate": 2.0718200625060302e-08,
"loss": 0.1806,
"step": 5690
},
{
"epoch": 2.8948704926358557,
"grad_norm": 0.90314781665802,
"learning_rate": 1.8863382701987675e-08,
"loss": 0.1838,
"step": 5700
},
{
"epoch": 2.899949212798375,
"grad_norm": 0.8870557546615601,
"learning_rate": 1.70952169034308e-08,
"loss": 0.1822,
"step": 5710
},
{
"epoch": 2.905027932960894,
"grad_norm": 0.8747410178184509,
"learning_rate": 1.5413764981460354e-08,
"loss": 0.189,
"step": 5720
},
{
"epoch": 2.910106653123413,
"grad_norm": 0.8954981565475464,
"learning_rate": 1.3819085659719233e-08,
"loss": 0.1802,
"step": 5730
},
{
"epoch": 2.915185373285932,
"grad_norm": 0.9273566007614136,
"learning_rate": 1.2311234631372514e-08,
"loss": 0.185,
"step": 5740
},
{
"epoch": 2.920264093448451,
"grad_norm": 0.8598524928092957,
"learning_rate": 1.0890264557162356e-08,
"loss": 0.1859,
"step": 5750
},
{
"epoch": 2.9253428136109703,
"grad_norm": 0.8601447939872742,
"learning_rate": 9.556225063568347e-09,
"loss": 0.1822,
"step": 5760
},
{
"epoch": 2.9304215337734894,
"grad_norm": 0.8048866391181946,
"learning_rate": 8.309162741074461e-09,
"loss": 0.1837,
"step": 5770
},
{
"epoch": 2.935500253936008,
"grad_norm": 0.9375097155570984,
"learning_rate": 7.149121142542292e-09,
"loss": 0.1833,
"step": 5780
},
{
"epoch": 2.940578974098527,
"grad_norm": 0.8771663904190063,
"learning_rate": 6.076140781690054e-09,
"loss": 0.1856,
"step": 5790
},
{
"epoch": 2.945657694261046,
"grad_norm": 0.9071695804595947,
"learning_rate": 5.090259131676767e-09,
"loss": 0.1853,
"step": 5800
},
{
"epoch": 2.9507364144235653,
"grad_norm": 0.9588919878005981,
"learning_rate": 4.191510623794414e-09,
"loss": 0.1823,
"step": 5810
},
{
"epoch": 2.9558151345860844,
"grad_norm": 0.8780667185783386,
"learning_rate": 3.379926646265852e-09,
"loss": 0.1791,
"step": 5820
},
{
"epoch": 2.9608938547486034,
"grad_norm": 0.7855400443077087,
"learning_rate": 2.6555355431465145e-09,
"loss": 0.1831,
"step": 5830
},
{
"epoch": 2.9659725749111225,
"grad_norm": 0.8734691143035889,
"learning_rate": 2.0183626133374325e-09,
"loss": 0.1836,
"step": 5840
},
{
"epoch": 2.971051295073641,
"grad_norm": 0.8363956212997437,
"learning_rate": 1.4684301096992704e-09,
"loss": 0.1823,
"step": 5850
},
{
"epoch": 2.9761300152361603,
"grad_norm": 0.8143458962440491,
"learning_rate": 1.0057572382765613e-09,
"loss": 0.1829,
"step": 5860
},
{
"epoch": 2.9812087353986794,
"grad_norm": 0.8893367648124695,
"learning_rate": 6.303601576257423e-10,
"loss": 0.1855,
"step": 5870
},
{
"epoch": 2.9862874555611985,
"grad_norm": 0.8227106332778931,
"learning_rate": 3.4225197825227264e-10,
"loss": 0.1824,
"step": 5880
},
{
"epoch": 2.9913661757237175,
"grad_norm": 0.8717892169952393,
"learning_rate": 1.4144276215211085e-10,
"loss": 0.187,
"step": 5890
},
{
"epoch": 2.9964448958862366,
"grad_norm": 0.9425582885742188,
"learning_rate": 2.793952245921938e-11,
"loss": 0.1806,
"step": 5900
},
{
"epoch": 3.0,
"step": 5907,
"total_flos": 3.03001910027265e+19,
"train_loss": 0.25274969475188996,
"train_runtime": 28024.1449,
"train_samples_per_second": 6.744,
"train_steps_per_second": 0.211
}
],
"logging_steps": 10,
"max_steps": 5907,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.03001910027265e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}