{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.631578947368421, "eval_steps": 369, "global_step": 1107, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005705320211096848, "grad_norm": 83.0, "learning_rate": 0.0, "loss": 3.648493528366089, "step": 1 }, { "epoch": 0.0011410640422193695, "grad_norm": 84.5, "learning_rate": 1.3513513513513515e-06, "loss": 3.7405500411987305, "step": 2 }, { "epoch": 0.0017115960633290544, "grad_norm": 74.0, "learning_rate": 2.702702702702703e-06, "loss": 3.510922431945801, "step": 3 }, { "epoch": 0.002282128084438739, "grad_norm": 74.0, "learning_rate": 4.0540540540540545e-06, "loss": 3.477842330932617, "step": 4 }, { "epoch": 0.002852660105548424, "grad_norm": 48.5, "learning_rate": 5.405405405405406e-06, "loss": 3.2050325870513916, "step": 5 }, { "epoch": 0.0034231921266581087, "grad_norm": 35.0, "learning_rate": 6.7567567567567575e-06, "loss": 2.9610347747802734, "step": 6 }, { "epoch": 0.003993724147767793, "grad_norm": 25.75, "learning_rate": 8.108108108108109e-06, "loss": 2.8089160919189453, "step": 7 }, { "epoch": 0.004564256168877478, "grad_norm": 15.25, "learning_rate": 9.45945945945946e-06, "loss": 2.672607183456421, "step": 8 }, { "epoch": 0.005134788189987163, "grad_norm": 10.125, "learning_rate": 1.0810810810810812e-05, "loss": 2.4392411708831787, "step": 9 }, { "epoch": 0.005705320211096848, "grad_norm": 8.875, "learning_rate": 1.2162162162162164e-05, "loss": 2.4409432411193848, "step": 10 }, { "epoch": 0.006275852232206533, "grad_norm": 7.40625, "learning_rate": 1.3513513513513515e-05, "loss": 2.3299427032470703, "step": 11 }, { "epoch": 0.0068463842533162175, "grad_norm": 6.6875, "learning_rate": 1.4864864864864867e-05, "loss": 2.2852554321289062, "step": 12 }, { "epoch": 0.007416916274425902, "grad_norm": 6.5625, "learning_rate": 1.6216216216216218e-05, "loss": 2.2712786197662354, "step": 13 }, { "epoch": 0.007987448295535587, "grad_norm": 7.1875, "learning_rate": 1.756756756756757e-05, "loss": 2.2143714427948, "step": 14 }, { "epoch": 0.008557980316645272, "grad_norm": 7.0, "learning_rate": 1.891891891891892e-05, "loss": 2.0812437534332275, "step": 15 }, { "epoch": 0.009128512337754956, "grad_norm": 6.28125, "learning_rate": 2.0270270270270273e-05, "loss": 2.068169355392456, "step": 16 }, { "epoch": 0.009699044358864642, "grad_norm": 5.75, "learning_rate": 2.1621621621621624e-05, "loss": 1.8387004137039185, "step": 17 }, { "epoch": 0.010269576379974325, "grad_norm": 4.3125, "learning_rate": 2.2972972972972976e-05, "loss": 1.7710001468658447, "step": 18 }, { "epoch": 0.01084010840108401, "grad_norm": 4.25, "learning_rate": 2.4324324324324327e-05, "loss": 1.7796661853790283, "step": 19 }, { "epoch": 0.011410640422193696, "grad_norm": 3.5, "learning_rate": 2.5675675675675675e-05, "loss": 1.6957234144210815, "step": 20 }, { "epoch": 0.01198117244330338, "grad_norm": 3.21875, "learning_rate": 2.702702702702703e-05, "loss": 1.7516167163848877, "step": 21 }, { "epoch": 0.012551704464413066, "grad_norm": 2.78125, "learning_rate": 2.8378378378378378e-05, "loss": 1.6087043285369873, "step": 22 }, { "epoch": 0.01312223648552275, "grad_norm": 2.34375, "learning_rate": 2.9729729729729733e-05, "loss": 1.5943574905395508, "step": 23 }, { "epoch": 0.013692768506632435, "grad_norm": 2.140625, "learning_rate": 3.108108108108108e-05, "loss": 1.599621295928955, "step": 24 }, { "epoch": 0.014263300527742119, "grad_norm": 2.234375, "learning_rate": 3.2432432432432436e-05, "loss": 1.6016688346862793, "step": 25 }, { "epoch": 0.014833832548851804, "grad_norm": 1.9609375, "learning_rate": 3.3783783783783784e-05, "loss": 1.5124552249908447, "step": 26 }, { "epoch": 0.01540436456996149, "grad_norm": 1.9765625, "learning_rate": 3.513513513513514e-05, "loss": 1.5520291328430176, "step": 27 }, { "epoch": 0.015974896591071173, "grad_norm": 1.90625, "learning_rate": 3.648648648648649e-05, "loss": 1.4819629192352295, "step": 28 }, { "epoch": 0.01654542861218086, "grad_norm": 2.0, "learning_rate": 3.783783783783784e-05, "loss": 1.5304462909698486, "step": 29 }, { "epoch": 0.017115960633290545, "grad_norm": 1.7734375, "learning_rate": 3.918918918918919e-05, "loss": 1.4461307525634766, "step": 30 }, { "epoch": 0.017686492654400227, "grad_norm": 1.8125, "learning_rate": 4.0540540540540545e-05, "loss": 1.4548516273498535, "step": 31 }, { "epoch": 0.018257024675509912, "grad_norm": 1.6875, "learning_rate": 4.189189189189189e-05, "loss": 1.435849905014038, "step": 32 }, { "epoch": 0.018827556696619598, "grad_norm": 1.578125, "learning_rate": 4.324324324324325e-05, "loss": 1.4789021015167236, "step": 33 }, { "epoch": 0.019398088717729283, "grad_norm": 1.578125, "learning_rate": 4.4594594594594596e-05, "loss": 1.3856297731399536, "step": 34 }, { "epoch": 0.01996862073883897, "grad_norm": 1.65625, "learning_rate": 4.594594594594595e-05, "loss": 1.5028152465820312, "step": 35 }, { "epoch": 0.02053915275994865, "grad_norm": 1.46875, "learning_rate": 4.72972972972973e-05, "loss": 1.4294812679290771, "step": 36 }, { "epoch": 0.021109684781058336, "grad_norm": 1.65625, "learning_rate": 4.8648648648648654e-05, "loss": 1.3971917629241943, "step": 37 }, { "epoch": 0.02168021680216802, "grad_norm": 1.515625, "learning_rate": 5e-05, "loss": 1.3995487689971924, "step": 38 }, { "epoch": 0.022250748823277707, "grad_norm": 1.703125, "learning_rate": 4.9972283813747225e-05, "loss": 1.4693856239318848, "step": 39 }, { "epoch": 0.022821280844387393, "grad_norm": 1.5703125, "learning_rate": 4.994456762749446e-05, "loss": 1.4715073108673096, "step": 40 }, { "epoch": 0.023391812865497075, "grad_norm": 1.703125, "learning_rate": 4.9916851441241684e-05, "loss": 1.490320086479187, "step": 41 }, { "epoch": 0.02396234488660676, "grad_norm": 1.5, "learning_rate": 4.9889135254988913e-05, "loss": 1.3657546043395996, "step": 42 }, { "epoch": 0.024532876907716446, "grad_norm": 1.7109375, "learning_rate": 4.986141906873614e-05, "loss": 1.4324053525924683, "step": 43 }, { "epoch": 0.02510340892882613, "grad_norm": 1.7890625, "learning_rate": 4.983370288248337e-05, "loss": 1.3849389553070068, "step": 44 }, { "epoch": 0.025673940949935817, "grad_norm": 1.453125, "learning_rate": 4.98059866962306e-05, "loss": 1.425079345703125, "step": 45 }, { "epoch": 0.0262444729710455, "grad_norm": 1.484375, "learning_rate": 4.977827050997783e-05, "loss": 1.4127968549728394, "step": 46 }, { "epoch": 0.026815004992155184, "grad_norm": 1.5390625, "learning_rate": 4.9750554323725054e-05, "loss": 1.429938793182373, "step": 47 }, { "epoch": 0.02738553701326487, "grad_norm": 1.4140625, "learning_rate": 4.972283813747229e-05, "loss": 1.4178887605667114, "step": 48 }, { "epoch": 0.027956069034374555, "grad_norm": 1.46875, "learning_rate": 4.969512195121951e-05, "loss": 1.4397588968276978, "step": 49 }, { "epoch": 0.028526601055484237, "grad_norm": 1.453125, "learning_rate": 4.966740576496674e-05, "loss": 1.3697854280471802, "step": 50 }, { "epoch": 0.029097133076593923, "grad_norm": 1.3984375, "learning_rate": 4.963968957871397e-05, "loss": 1.3517988920211792, "step": 51 }, { "epoch": 0.02966766509770361, "grad_norm": 1.3984375, "learning_rate": 4.9611973392461195e-05, "loss": 1.4193122386932373, "step": 52 }, { "epoch": 0.030238197118813294, "grad_norm": 1.34375, "learning_rate": 4.958425720620843e-05, "loss": 1.37640380859375, "step": 53 }, { "epoch": 0.03080872913992298, "grad_norm": 1.40625, "learning_rate": 4.9556541019955654e-05, "loss": 1.336474895477295, "step": 54 }, { "epoch": 0.031379261161032665, "grad_norm": 1.515625, "learning_rate": 4.952882483370288e-05, "loss": 1.4701391458511353, "step": 55 }, { "epoch": 0.03194979318214235, "grad_norm": 1.4765625, "learning_rate": 4.950110864745011e-05, "loss": 1.3760974407196045, "step": 56 }, { "epoch": 0.032520325203252036, "grad_norm": 1.4609375, "learning_rate": 4.947339246119734e-05, "loss": 1.3897124528884888, "step": 57 }, { "epoch": 0.03309085722436172, "grad_norm": 1.578125, "learning_rate": 4.944567627494457e-05, "loss": 1.4239261150360107, "step": 58 }, { "epoch": 0.0336613892454714, "grad_norm": 1.53125, "learning_rate": 4.94179600886918e-05, "loss": 1.3669216632843018, "step": 59 }, { "epoch": 0.03423192126658109, "grad_norm": 1.3515625, "learning_rate": 4.9390243902439024e-05, "loss": 1.346958041191101, "step": 60 }, { "epoch": 0.03480245328769077, "grad_norm": 1.5234375, "learning_rate": 4.936252771618626e-05, "loss": 1.4235575199127197, "step": 61 }, { "epoch": 0.03537298530880045, "grad_norm": 1.3359375, "learning_rate": 4.933481152993348e-05, "loss": 1.3075377941131592, "step": 62 }, { "epoch": 0.03594351732991014, "grad_norm": 1.3203125, "learning_rate": 4.930709534368071e-05, "loss": 1.3214820623397827, "step": 63 }, { "epoch": 0.036514049351019824, "grad_norm": 1.3515625, "learning_rate": 4.927937915742794e-05, "loss": 1.39829421043396, "step": 64 }, { "epoch": 0.03708458137212951, "grad_norm": 1.3671875, "learning_rate": 4.9251662971175164e-05, "loss": 1.3523836135864258, "step": 65 }, { "epoch": 0.037655113393239195, "grad_norm": 1.3125, "learning_rate": 4.92239467849224e-05, "loss": 1.3268153667449951, "step": 66 }, { "epoch": 0.03822564541434888, "grad_norm": 1.28125, "learning_rate": 4.919623059866962e-05, "loss": 1.3205022811889648, "step": 67 }, { "epoch": 0.038796177435458566, "grad_norm": 1.2734375, "learning_rate": 4.916851441241685e-05, "loss": 1.2956037521362305, "step": 68 }, { "epoch": 0.03936670945656825, "grad_norm": 1.375, "learning_rate": 4.914079822616408e-05, "loss": 1.3702654838562012, "step": 69 }, { "epoch": 0.03993724147767794, "grad_norm": 1.296875, "learning_rate": 4.911308203991131e-05, "loss": 1.388296127319336, "step": 70 }, { "epoch": 0.04050777349878762, "grad_norm": 1.5078125, "learning_rate": 4.908536585365854e-05, "loss": 1.4403045177459717, "step": 71 }, { "epoch": 0.0410783055198973, "grad_norm": 1.25, "learning_rate": 4.905764966740577e-05, "loss": 1.3626902103424072, "step": 72 }, { "epoch": 0.04164883754100699, "grad_norm": 1.34375, "learning_rate": 4.902993348115299e-05, "loss": 1.382088303565979, "step": 73 }, { "epoch": 0.04221936956211667, "grad_norm": 1.2578125, "learning_rate": 4.900221729490023e-05, "loss": 1.3237360715866089, "step": 74 }, { "epoch": 0.04278990158322636, "grad_norm": 1.296875, "learning_rate": 4.897450110864745e-05, "loss": 1.319187879562378, "step": 75 }, { "epoch": 0.04336043360433604, "grad_norm": 1.3828125, "learning_rate": 4.894678492239468e-05, "loss": 1.3707743883132935, "step": 76 }, { "epoch": 0.043930965625445725, "grad_norm": 1.2578125, "learning_rate": 4.891906873614191e-05, "loss": 1.3658738136291504, "step": 77 }, { "epoch": 0.044501497646555414, "grad_norm": 1.265625, "learning_rate": 4.8891352549889134e-05, "loss": 1.3247051239013672, "step": 78 }, { "epoch": 0.045072029667665096, "grad_norm": 1.421875, "learning_rate": 4.886363636363637e-05, "loss": 1.3614035844802856, "step": 79 }, { "epoch": 0.045642561688774785, "grad_norm": 1.2734375, "learning_rate": 4.883592017738359e-05, "loss": 1.2589421272277832, "step": 80 }, { "epoch": 0.04621309370988447, "grad_norm": 1.28125, "learning_rate": 4.880820399113082e-05, "loss": 1.3525424003601074, "step": 81 }, { "epoch": 0.04678362573099415, "grad_norm": 1.2578125, "learning_rate": 4.878048780487805e-05, "loss": 1.2903777360916138, "step": 82 }, { "epoch": 0.04735415775210384, "grad_norm": 1.328125, "learning_rate": 4.875277161862528e-05, "loss": 1.3538789749145508, "step": 83 }, { "epoch": 0.04792468977321352, "grad_norm": 1.3046875, "learning_rate": 4.872505543237251e-05, "loss": 1.3419591188430786, "step": 84 }, { "epoch": 0.04849522179432321, "grad_norm": 1.28125, "learning_rate": 4.869733924611974e-05, "loss": 1.3367938995361328, "step": 85 }, { "epoch": 0.04906575381543289, "grad_norm": 1.3046875, "learning_rate": 4.866962305986696e-05, "loss": 1.2979538440704346, "step": 86 }, { "epoch": 0.049636285836542574, "grad_norm": 1.25, "learning_rate": 4.864190687361419e-05, "loss": 1.348291039466858, "step": 87 }, { "epoch": 0.05020681785765226, "grad_norm": 1.3125, "learning_rate": 4.861419068736142e-05, "loss": 1.3377124071121216, "step": 88 }, { "epoch": 0.050777349878761945, "grad_norm": 1.328125, "learning_rate": 4.8586474501108644e-05, "loss": 1.3180426359176636, "step": 89 }, { "epoch": 0.051347881899871634, "grad_norm": 1.2109375, "learning_rate": 4.855875831485588e-05, "loss": 1.3215968608856201, "step": 90 }, { "epoch": 0.051918413920981316, "grad_norm": 1.2421875, "learning_rate": 4.85310421286031e-05, "loss": 1.3354041576385498, "step": 91 }, { "epoch": 0.052488945942091, "grad_norm": 1.28125, "learning_rate": 4.850332594235034e-05, "loss": 1.3552148342132568, "step": 92 }, { "epoch": 0.05305947796320069, "grad_norm": 1.2421875, "learning_rate": 4.847560975609756e-05, "loss": 1.2916048765182495, "step": 93 }, { "epoch": 0.05363000998431037, "grad_norm": 1.3125, "learning_rate": 4.844789356984479e-05, "loss": 1.3131537437438965, "step": 94 }, { "epoch": 0.05420054200542006, "grad_norm": 1.296875, "learning_rate": 4.842017738359202e-05, "loss": 1.2902660369873047, "step": 95 }, { "epoch": 0.05477107402652974, "grad_norm": 1.28125, "learning_rate": 4.839246119733925e-05, "loss": 1.3799315690994263, "step": 96 }, { "epoch": 0.05534160604763942, "grad_norm": 1.3515625, "learning_rate": 4.836474501108647e-05, "loss": 1.3607311248779297, "step": 97 }, { "epoch": 0.05591213806874911, "grad_norm": 1.2421875, "learning_rate": 4.833702882483371e-05, "loss": 1.3038060665130615, "step": 98 }, { "epoch": 0.05648267008985879, "grad_norm": 1.234375, "learning_rate": 4.830931263858093e-05, "loss": 1.318457841873169, "step": 99 }, { "epoch": 0.057053202110968475, "grad_norm": 1.2890625, "learning_rate": 4.828159645232816e-05, "loss": 1.3159422874450684, "step": 100 }, { "epoch": 0.057623734132078164, "grad_norm": 1.25, "learning_rate": 4.825388026607539e-05, "loss": 1.3275076150894165, "step": 101 }, { "epoch": 0.058194266153187846, "grad_norm": 1.1796875, "learning_rate": 4.8226164079822614e-05, "loss": 1.2983460426330566, "step": 102 }, { "epoch": 0.058764798174297535, "grad_norm": 1.171875, "learning_rate": 4.819844789356985e-05, "loss": 1.3114261627197266, "step": 103 }, { "epoch": 0.05933533019540722, "grad_norm": 1.1796875, "learning_rate": 4.817073170731707e-05, "loss": 1.266122817993164, "step": 104 }, { "epoch": 0.0599058622165169, "grad_norm": 1.234375, "learning_rate": 4.81430155210643e-05, "loss": 1.3662368059158325, "step": 105 }, { "epoch": 0.06047639423762659, "grad_norm": 1.296875, "learning_rate": 4.811529933481153e-05, "loss": 1.3158059120178223, "step": 106 }, { "epoch": 0.06104692625873627, "grad_norm": 1.234375, "learning_rate": 4.808758314855876e-05, "loss": 1.3571752309799194, "step": 107 }, { "epoch": 0.06161745827984596, "grad_norm": 1.34375, "learning_rate": 4.805986696230599e-05, "loss": 1.3249101638793945, "step": 108 }, { "epoch": 0.06218799030095564, "grad_norm": 1.21875, "learning_rate": 4.803215077605322e-05, "loss": 1.3386337757110596, "step": 109 }, { "epoch": 0.06275852232206533, "grad_norm": 1.2421875, "learning_rate": 4.800443458980044e-05, "loss": 1.2874070405960083, "step": 110 }, { "epoch": 0.06332905434317501, "grad_norm": 1.28125, "learning_rate": 4.797671840354768e-05, "loss": 1.3232687711715698, "step": 111 }, { "epoch": 0.0638995863642847, "grad_norm": 1.203125, "learning_rate": 4.79490022172949e-05, "loss": 1.3370904922485352, "step": 112 }, { "epoch": 0.06447011838539438, "grad_norm": 1.3203125, "learning_rate": 4.792128603104213e-05, "loss": 1.3211901187896729, "step": 113 }, { "epoch": 0.06504065040650407, "grad_norm": 1.2578125, "learning_rate": 4.789356984478936e-05, "loss": 1.3841608762741089, "step": 114 }, { "epoch": 0.06561118242761375, "grad_norm": 1.2890625, "learning_rate": 4.786585365853658e-05, "loss": 1.4017915725708008, "step": 115 }, { "epoch": 0.06618171444872344, "grad_norm": 1.4140625, "learning_rate": 4.783813747228382e-05, "loss": 1.4110525846481323, "step": 116 }, { "epoch": 0.06675224646983312, "grad_norm": 1.2734375, "learning_rate": 4.781042128603104e-05, "loss": 1.2671241760253906, "step": 117 }, { "epoch": 0.0673227784909428, "grad_norm": 1.21875, "learning_rate": 4.778270509977827e-05, "loss": 1.2970881462097168, "step": 118 }, { "epoch": 0.06789331051205248, "grad_norm": 1.3515625, "learning_rate": 4.77549889135255e-05, "loss": 1.2626357078552246, "step": 119 }, { "epoch": 0.06846384253316218, "grad_norm": 1.3203125, "learning_rate": 4.772727272727273e-05, "loss": 1.2779147624969482, "step": 120 }, { "epoch": 0.06903437455427186, "grad_norm": 1.1796875, "learning_rate": 4.769955654101996e-05, "loss": 1.308679461479187, "step": 121 }, { "epoch": 0.06960490657538154, "grad_norm": 1.234375, "learning_rate": 4.767184035476719e-05, "loss": 1.299755573272705, "step": 122 }, { "epoch": 0.07017543859649122, "grad_norm": 1.2578125, "learning_rate": 4.764412416851441e-05, "loss": 1.3637490272521973, "step": 123 }, { "epoch": 0.0707459706176009, "grad_norm": 1.21875, "learning_rate": 4.761640798226164e-05, "loss": 1.3058216571807861, "step": 124 }, { "epoch": 0.0713165026387106, "grad_norm": 1.2734375, "learning_rate": 4.758869179600887e-05, "loss": 1.3146748542785645, "step": 125 }, { "epoch": 0.07188703465982028, "grad_norm": 1.2109375, "learning_rate": 4.75609756097561e-05, "loss": 1.2844371795654297, "step": 126 }, { "epoch": 0.07245756668092997, "grad_norm": 1.25, "learning_rate": 4.753325942350333e-05, "loss": 1.3195525407791138, "step": 127 }, { "epoch": 0.07302809870203965, "grad_norm": 1.1953125, "learning_rate": 4.750554323725055e-05, "loss": 1.3399118185043335, "step": 128 }, { "epoch": 0.07359863072314933, "grad_norm": 1.15625, "learning_rate": 4.747782705099779e-05, "loss": 1.2919648885726929, "step": 129 }, { "epoch": 0.07416916274425903, "grad_norm": 1.2421875, "learning_rate": 4.745011086474501e-05, "loss": 1.277235507965088, "step": 130 }, { "epoch": 0.07473969476536871, "grad_norm": 1.25, "learning_rate": 4.742239467849224e-05, "loss": 1.3034231662750244, "step": 131 }, { "epoch": 0.07531022678647839, "grad_norm": 1.109375, "learning_rate": 4.739467849223947e-05, "loss": 1.2368437051773071, "step": 132 }, { "epoch": 0.07588075880758807, "grad_norm": 1.2265625, "learning_rate": 4.73669623059867e-05, "loss": 1.3728649616241455, "step": 133 }, { "epoch": 0.07645129082869775, "grad_norm": 1.109375, "learning_rate": 4.733924611973393e-05, "loss": 1.2506084442138672, "step": 134 }, { "epoch": 0.07702182284980745, "grad_norm": 1.2109375, "learning_rate": 4.731152993348116e-05, "loss": 1.2813055515289307, "step": 135 }, { "epoch": 0.07759235487091713, "grad_norm": 1.203125, "learning_rate": 4.728381374722838e-05, "loss": 1.2894189357757568, "step": 136 }, { "epoch": 0.07816288689202681, "grad_norm": 1.203125, "learning_rate": 4.725609756097561e-05, "loss": 1.3396642208099365, "step": 137 }, { "epoch": 0.0787334189131365, "grad_norm": 1.359375, "learning_rate": 4.722838137472284e-05, "loss": 1.3043787479400635, "step": 138 }, { "epoch": 0.07930395093424618, "grad_norm": 1.203125, "learning_rate": 4.720066518847007e-05, "loss": 1.308459997177124, "step": 139 }, { "epoch": 0.07987448295535587, "grad_norm": 1.21875, "learning_rate": 4.71729490022173e-05, "loss": 1.3281002044677734, "step": 140 }, { "epoch": 0.08044501497646556, "grad_norm": 1.171875, "learning_rate": 4.714523281596452e-05, "loss": 1.3146984577178955, "step": 141 }, { "epoch": 0.08101554699757524, "grad_norm": 1.203125, "learning_rate": 4.711751662971176e-05, "loss": 1.3078755140304565, "step": 142 }, { "epoch": 0.08158607901868492, "grad_norm": 1.21875, "learning_rate": 4.708980044345898e-05, "loss": 1.3129773139953613, "step": 143 }, { "epoch": 0.0821566110397946, "grad_norm": 1.171875, "learning_rate": 4.706208425720621e-05, "loss": 1.2827129364013672, "step": 144 }, { "epoch": 0.0827271430609043, "grad_norm": 1.171875, "learning_rate": 4.703436807095344e-05, "loss": 1.3232603073120117, "step": 145 }, { "epoch": 0.08329767508201398, "grad_norm": 1.1796875, "learning_rate": 4.700665188470067e-05, "loss": 1.220211386680603, "step": 146 }, { "epoch": 0.08386820710312366, "grad_norm": 1.2421875, "learning_rate": 4.69789356984479e-05, "loss": 1.3406665325164795, "step": 147 }, { "epoch": 0.08443873912423334, "grad_norm": 1.1875, "learning_rate": 4.695121951219512e-05, "loss": 1.2698848247528076, "step": 148 }, { "epoch": 0.08500927114534303, "grad_norm": 1.1171875, "learning_rate": 4.692350332594235e-05, "loss": 1.3016014099121094, "step": 149 }, { "epoch": 0.08557980316645272, "grad_norm": 1.1484375, "learning_rate": 4.689578713968958e-05, "loss": 1.2674150466918945, "step": 150 }, { "epoch": 0.0861503351875624, "grad_norm": 1.234375, "learning_rate": 4.686807095343681e-05, "loss": 1.316935420036316, "step": 151 }, { "epoch": 0.08672086720867209, "grad_norm": 1.125, "learning_rate": 4.684035476718403e-05, "loss": 1.263155221939087, "step": 152 }, { "epoch": 0.08729139922978177, "grad_norm": 1.1875, "learning_rate": 4.681263858093127e-05, "loss": 1.30006742477417, "step": 153 }, { "epoch": 0.08786193125089145, "grad_norm": 1.296875, "learning_rate": 4.678492239467849e-05, "loss": 1.3325148820877075, "step": 154 }, { "epoch": 0.08843246327200115, "grad_norm": 1.3125, "learning_rate": 4.675720620842573e-05, "loss": 1.2306278944015503, "step": 155 }, { "epoch": 0.08900299529311083, "grad_norm": 1.234375, "learning_rate": 4.672949002217295e-05, "loss": 1.3476486206054688, "step": 156 }, { "epoch": 0.08957352731422051, "grad_norm": 1.1953125, "learning_rate": 4.670177383592018e-05, "loss": 1.2401833534240723, "step": 157 }, { "epoch": 0.09014405933533019, "grad_norm": 1.296875, "learning_rate": 4.667405764966741e-05, "loss": 1.3140380382537842, "step": 158 }, { "epoch": 0.09071459135643987, "grad_norm": 1.1875, "learning_rate": 4.664634146341464e-05, "loss": 1.29231595993042, "step": 159 }, { "epoch": 0.09128512337754957, "grad_norm": 1.15625, "learning_rate": 4.661862527716186e-05, "loss": 1.2908031940460205, "step": 160 }, { "epoch": 0.09185565539865925, "grad_norm": 1.140625, "learning_rate": 4.659090909090909e-05, "loss": 1.259028434753418, "step": 161 }, { "epoch": 0.09242618741976893, "grad_norm": 1.203125, "learning_rate": 4.656319290465632e-05, "loss": 1.2758322954177856, "step": 162 }, { "epoch": 0.09299671944087862, "grad_norm": 1.1640625, "learning_rate": 4.653547671840355e-05, "loss": 1.2392590045928955, "step": 163 }, { "epoch": 0.0935672514619883, "grad_norm": 1.2421875, "learning_rate": 4.650776053215078e-05, "loss": 1.3232059478759766, "step": 164 }, { "epoch": 0.094137783483098, "grad_norm": 1.1328125, "learning_rate": 4.6480044345898e-05, "loss": 1.3052716255187988, "step": 165 }, { "epoch": 0.09470831550420768, "grad_norm": 1.15625, "learning_rate": 4.645232815964524e-05, "loss": 1.2643868923187256, "step": 166 }, { "epoch": 0.09527884752531736, "grad_norm": 1.1796875, "learning_rate": 4.642461197339246e-05, "loss": 1.3158135414123535, "step": 167 }, { "epoch": 0.09584937954642704, "grad_norm": 1.1484375, "learning_rate": 4.639689578713969e-05, "loss": 1.2975637912750244, "step": 168 }, { "epoch": 0.09641991156753672, "grad_norm": 1.09375, "learning_rate": 4.636917960088692e-05, "loss": 1.202270269393921, "step": 169 }, { "epoch": 0.09699044358864642, "grad_norm": 1.1015625, "learning_rate": 4.634146341463415e-05, "loss": 1.1989184617996216, "step": 170 }, { "epoch": 0.0975609756097561, "grad_norm": 1.15625, "learning_rate": 4.631374722838138e-05, "loss": 1.325451374053955, "step": 171 }, { "epoch": 0.09813150763086578, "grad_norm": 1.1484375, "learning_rate": 4.628603104212861e-05, "loss": 1.3150224685668945, "step": 172 }, { "epoch": 0.09870203965197547, "grad_norm": 1.1484375, "learning_rate": 4.625831485587583e-05, "loss": 1.2864487171173096, "step": 173 }, { "epoch": 0.09927257167308515, "grad_norm": 1.140625, "learning_rate": 4.623059866962306e-05, "loss": 1.3033939599990845, "step": 174 }, { "epoch": 0.09984310369419484, "grad_norm": 1.109375, "learning_rate": 4.620288248337029e-05, "loss": 1.2654147148132324, "step": 175 }, { "epoch": 0.10041363571530453, "grad_norm": 1.125, "learning_rate": 4.617516629711752e-05, "loss": 1.2905241250991821, "step": 176 }, { "epoch": 0.10098416773641421, "grad_norm": 1.1484375, "learning_rate": 4.614745011086475e-05, "loss": 1.2881019115447998, "step": 177 }, { "epoch": 0.10155469975752389, "grad_norm": 1.15625, "learning_rate": 4.611973392461197e-05, "loss": 1.3300973176956177, "step": 178 }, { "epoch": 0.10212523177863357, "grad_norm": 1.1484375, "learning_rate": 4.609201773835921e-05, "loss": 1.3166918754577637, "step": 179 }, { "epoch": 0.10269576379974327, "grad_norm": 1.078125, "learning_rate": 4.606430155210643e-05, "loss": 1.2149487733840942, "step": 180 }, { "epoch": 0.10326629582085295, "grad_norm": 1.1171875, "learning_rate": 4.603658536585366e-05, "loss": 1.284995198249817, "step": 181 }, { "epoch": 0.10383682784196263, "grad_norm": 1.171875, "learning_rate": 4.600886917960089e-05, "loss": 1.3197823762893677, "step": 182 }, { "epoch": 0.10440735986307231, "grad_norm": 1.09375, "learning_rate": 4.598115299334812e-05, "loss": 1.2414249181747437, "step": 183 }, { "epoch": 0.104977891884182, "grad_norm": 1.1953125, "learning_rate": 4.595343680709535e-05, "loss": 1.2936391830444336, "step": 184 }, { "epoch": 0.10554842390529169, "grad_norm": 1.203125, "learning_rate": 4.592572062084257e-05, "loss": 1.2889211177825928, "step": 185 }, { "epoch": 0.10611895592640137, "grad_norm": 1.1640625, "learning_rate": 4.58980044345898e-05, "loss": 1.2958948612213135, "step": 186 }, { "epoch": 0.10668948794751106, "grad_norm": 1.15625, "learning_rate": 4.587028824833703e-05, "loss": 1.3174210786819458, "step": 187 }, { "epoch": 0.10726001996862074, "grad_norm": 1.09375, "learning_rate": 4.584257206208426e-05, "loss": 1.3083107471466064, "step": 188 }, { "epoch": 0.10783055198973042, "grad_norm": 1.1015625, "learning_rate": 4.581485587583149e-05, "loss": 1.2460663318634033, "step": 189 }, { "epoch": 0.10840108401084012, "grad_norm": 1.109375, "learning_rate": 4.578713968957872e-05, "loss": 1.262696623802185, "step": 190 }, { "epoch": 0.1089716160319498, "grad_norm": 1.1796875, "learning_rate": 4.575942350332594e-05, "loss": 1.290346384048462, "step": 191 }, { "epoch": 0.10954214805305948, "grad_norm": 1.15625, "learning_rate": 4.573170731707318e-05, "loss": 1.2630096673965454, "step": 192 }, { "epoch": 0.11011268007416916, "grad_norm": 1.1015625, "learning_rate": 4.57039911308204e-05, "loss": 1.2521231174468994, "step": 193 }, { "epoch": 0.11068321209527884, "grad_norm": 1.1484375, "learning_rate": 4.567627494456763e-05, "loss": 1.2671630382537842, "step": 194 }, { "epoch": 0.11125374411638853, "grad_norm": 1.171875, "learning_rate": 4.564855875831486e-05, "loss": 1.3561689853668213, "step": 195 }, { "epoch": 0.11182427613749822, "grad_norm": 1.1484375, "learning_rate": 4.562084257206209e-05, "loss": 1.2499645948410034, "step": 196 }, { "epoch": 0.1123948081586079, "grad_norm": 1.15625, "learning_rate": 4.559312638580932e-05, "loss": 1.2348875999450684, "step": 197 }, { "epoch": 0.11296534017971759, "grad_norm": 1.1640625, "learning_rate": 4.556541019955654e-05, "loss": 1.322629690170288, "step": 198 }, { "epoch": 0.11353587220082727, "grad_norm": 1.171875, "learning_rate": 4.553769401330377e-05, "loss": 1.2846410274505615, "step": 199 }, { "epoch": 0.11410640422193695, "grad_norm": 1.1640625, "learning_rate": 4.5509977827051e-05, "loss": 1.311292052268982, "step": 200 }, { "epoch": 0.11467693624304665, "grad_norm": 1.140625, "learning_rate": 4.548226164079823e-05, "loss": 1.2933259010314941, "step": 201 }, { "epoch": 0.11524746826415633, "grad_norm": 1.21875, "learning_rate": 4.545454545454546e-05, "loss": 1.3615764379501343, "step": 202 }, { "epoch": 0.11581800028526601, "grad_norm": 1.1328125, "learning_rate": 4.542682926829269e-05, "loss": 1.187692403793335, "step": 203 }, { "epoch": 0.11638853230637569, "grad_norm": 1.1796875, "learning_rate": 4.539911308203991e-05, "loss": 1.2587438821792603, "step": 204 }, { "epoch": 0.11695906432748537, "grad_norm": 1.0546875, "learning_rate": 4.537139689578715e-05, "loss": 1.2154557704925537, "step": 205 }, { "epoch": 0.11752959634859507, "grad_norm": 1.09375, "learning_rate": 4.534368070953437e-05, "loss": 1.2670985460281372, "step": 206 }, { "epoch": 0.11810012836970475, "grad_norm": 1.171875, "learning_rate": 4.53159645232816e-05, "loss": 1.292269229888916, "step": 207 }, { "epoch": 0.11867066039081443, "grad_norm": 1.21875, "learning_rate": 4.528824833702883e-05, "loss": 1.2353066205978394, "step": 208 }, { "epoch": 0.11924119241192412, "grad_norm": 1.125, "learning_rate": 4.526053215077605e-05, "loss": 1.2745922803878784, "step": 209 }, { "epoch": 0.1198117244330338, "grad_norm": 1.109375, "learning_rate": 4.523281596452328e-05, "loss": 1.2637782096862793, "step": 210 }, { "epoch": 0.1203822564541435, "grad_norm": 1.125, "learning_rate": 4.520509977827051e-05, "loss": 1.2595422267913818, "step": 211 }, { "epoch": 0.12095278847525318, "grad_norm": 1.0859375, "learning_rate": 4.517738359201774e-05, "loss": 1.2515778541564941, "step": 212 }, { "epoch": 0.12152332049636286, "grad_norm": 1.1953125, "learning_rate": 4.514966740576497e-05, "loss": 1.2258851528167725, "step": 213 }, { "epoch": 0.12209385251747254, "grad_norm": 1.140625, "learning_rate": 4.51219512195122e-05, "loss": 1.2595672607421875, "step": 214 }, { "epoch": 0.12266438453858222, "grad_norm": 1.125, "learning_rate": 4.509423503325942e-05, "loss": 1.2574856281280518, "step": 215 }, { "epoch": 0.12323491655969192, "grad_norm": 1.21875, "learning_rate": 4.506651884700666e-05, "loss": 1.2860839366912842, "step": 216 }, { "epoch": 0.1238054485808016, "grad_norm": 1.1875, "learning_rate": 4.503880266075388e-05, "loss": 1.2748535871505737, "step": 217 }, { "epoch": 0.12437598060191128, "grad_norm": 1.1640625, "learning_rate": 4.501108647450111e-05, "loss": 1.2630361318588257, "step": 218 }, { "epoch": 0.12494651262302096, "grad_norm": 1.1328125, "learning_rate": 4.498337028824834e-05, "loss": 1.2100318670272827, "step": 219 }, { "epoch": 0.12551704464413066, "grad_norm": 1.1328125, "learning_rate": 4.495565410199557e-05, "loss": 1.279637098312378, "step": 220 }, { "epoch": 0.12608757666524034, "grad_norm": 1.09375, "learning_rate": 4.49279379157428e-05, "loss": 1.241306185722351, "step": 221 }, { "epoch": 0.12665810868635002, "grad_norm": 1.125, "learning_rate": 4.490022172949002e-05, "loss": 1.2467423677444458, "step": 222 }, { "epoch": 0.1272286407074597, "grad_norm": 1.1640625, "learning_rate": 4.487250554323725e-05, "loss": 1.2398571968078613, "step": 223 }, { "epoch": 0.1277991727285694, "grad_norm": 1.1171875, "learning_rate": 4.484478935698448e-05, "loss": 1.298073410987854, "step": 224 }, { "epoch": 0.12836970474967907, "grad_norm": 1.1640625, "learning_rate": 4.481707317073171e-05, "loss": 1.3275305032730103, "step": 225 }, { "epoch": 0.12894023677078875, "grad_norm": 1.0859375, "learning_rate": 4.478935698447894e-05, "loss": 1.2483649253845215, "step": 226 }, { "epoch": 0.12951076879189843, "grad_norm": 1.1484375, "learning_rate": 4.476164079822617e-05, "loss": 1.322462797164917, "step": 227 }, { "epoch": 0.13008130081300814, "grad_norm": 1.140625, "learning_rate": 4.473392461197339e-05, "loss": 1.2100863456726074, "step": 228 }, { "epoch": 0.13065183283411783, "grad_norm": 1.1640625, "learning_rate": 4.470620842572063e-05, "loss": 1.249301552772522, "step": 229 }, { "epoch": 0.1312223648552275, "grad_norm": 1.125, "learning_rate": 4.467849223946785e-05, "loss": 1.2208349704742432, "step": 230 }, { "epoch": 0.1317928968763372, "grad_norm": 1.171875, "learning_rate": 4.465077605321508e-05, "loss": 1.2686306238174438, "step": 231 }, { "epoch": 0.13236342889744687, "grad_norm": 1.1796875, "learning_rate": 4.462305986696231e-05, "loss": 1.2922316789627075, "step": 232 }, { "epoch": 0.13293396091855655, "grad_norm": 1.171875, "learning_rate": 4.459534368070954e-05, "loss": 1.2734718322753906, "step": 233 }, { "epoch": 0.13350449293966624, "grad_norm": 1.125, "learning_rate": 4.456762749445677e-05, "loss": 1.2748900651931763, "step": 234 }, { "epoch": 0.13407502496077592, "grad_norm": 1.125, "learning_rate": 4.453991130820399e-05, "loss": 1.2857415676116943, "step": 235 }, { "epoch": 0.1346455569818856, "grad_norm": 1.1640625, "learning_rate": 4.451219512195122e-05, "loss": 1.2689714431762695, "step": 236 }, { "epoch": 0.13521608900299528, "grad_norm": 1.1171875, "learning_rate": 4.448447893569845e-05, "loss": 1.248453140258789, "step": 237 }, { "epoch": 0.13578662102410496, "grad_norm": 1.1484375, "learning_rate": 4.445676274944568e-05, "loss": 1.2693870067596436, "step": 238 }, { "epoch": 0.13635715304521467, "grad_norm": 1.109375, "learning_rate": 4.442904656319291e-05, "loss": 1.2767329216003418, "step": 239 }, { "epoch": 0.13692768506632436, "grad_norm": 1.109375, "learning_rate": 4.440133037694014e-05, "loss": 1.2598170042037964, "step": 240 }, { "epoch": 0.13749821708743404, "grad_norm": 1.140625, "learning_rate": 4.437361419068736e-05, "loss": 1.2850111722946167, "step": 241 }, { "epoch": 0.13806874910854372, "grad_norm": 1.0625, "learning_rate": 4.4345898004434597e-05, "loss": 1.2005095481872559, "step": 242 }, { "epoch": 0.1386392811296534, "grad_norm": 1.09375, "learning_rate": 4.431818181818182e-05, "loss": 1.2896265983581543, "step": 243 }, { "epoch": 0.13920981315076308, "grad_norm": 1.1328125, "learning_rate": 4.429046563192905e-05, "loss": 1.3427916765213013, "step": 244 }, { "epoch": 0.13978034517187277, "grad_norm": 1.1484375, "learning_rate": 4.426274944567628e-05, "loss": 1.2719500064849854, "step": 245 }, { "epoch": 0.14035087719298245, "grad_norm": 1.1640625, "learning_rate": 4.42350332594235e-05, "loss": 1.2944797277450562, "step": 246 }, { "epoch": 0.14092140921409213, "grad_norm": 1.1328125, "learning_rate": 4.420731707317074e-05, "loss": 1.3022198677062988, "step": 247 }, { "epoch": 0.1414919412352018, "grad_norm": 1.2109375, "learning_rate": 4.417960088691796e-05, "loss": 1.286307454109192, "step": 248 }, { "epoch": 0.14206247325631152, "grad_norm": 1.328125, "learning_rate": 4.415188470066519e-05, "loss": 1.3540141582489014, "step": 249 }, { "epoch": 0.1426330052774212, "grad_norm": 1.0859375, "learning_rate": 4.412416851441242e-05, "loss": 1.2702994346618652, "step": 250 }, { "epoch": 0.1432035372985309, "grad_norm": 1.140625, "learning_rate": 4.409645232815965e-05, "loss": 1.2684781551361084, "step": 251 }, { "epoch": 0.14377406931964057, "grad_norm": 1.1484375, "learning_rate": 4.406873614190688e-05, "loss": 1.1907923221588135, "step": 252 }, { "epoch": 0.14434460134075025, "grad_norm": 1.1328125, "learning_rate": 4.404101995565411e-05, "loss": 1.2790608406066895, "step": 253 }, { "epoch": 0.14491513336185993, "grad_norm": 1.1015625, "learning_rate": 4.401330376940133e-05, "loss": 1.2878901958465576, "step": 254 }, { "epoch": 0.14548566538296961, "grad_norm": 1.09375, "learning_rate": 4.3985587583148566e-05, "loss": 1.2305991649627686, "step": 255 }, { "epoch": 0.1460561974040793, "grad_norm": 1.1796875, "learning_rate": 4.395787139689579e-05, "loss": 1.3150757551193237, "step": 256 }, { "epoch": 0.14662672942518898, "grad_norm": 1.0390625, "learning_rate": 4.393015521064302e-05, "loss": 1.213336706161499, "step": 257 }, { "epoch": 0.14719726144629866, "grad_norm": 1.0625, "learning_rate": 4.390243902439025e-05, "loss": 1.2233829498291016, "step": 258 }, { "epoch": 0.14776779346740837, "grad_norm": 1.078125, "learning_rate": 4.387472283813747e-05, "loss": 1.1772549152374268, "step": 259 }, { "epoch": 0.14833832548851805, "grad_norm": 1.171875, "learning_rate": 4.3847006651884707e-05, "loss": 1.3097314834594727, "step": 260 }, { "epoch": 0.14890885750962773, "grad_norm": 1.0703125, "learning_rate": 4.381929046563193e-05, "loss": 1.3049172163009644, "step": 261 }, { "epoch": 0.14947938953073742, "grad_norm": 1.125, "learning_rate": 4.379157427937916e-05, "loss": 1.3094444274902344, "step": 262 }, { "epoch": 0.1500499215518471, "grad_norm": 1.125, "learning_rate": 4.376385809312639e-05, "loss": 1.3298535346984863, "step": 263 }, { "epoch": 0.15062045357295678, "grad_norm": 1.0234375, "learning_rate": 4.373614190687362e-05, "loss": 1.2394543886184692, "step": 264 }, { "epoch": 0.15119098559406646, "grad_norm": 1.1640625, "learning_rate": 4.370842572062084e-05, "loss": 1.2180919647216797, "step": 265 }, { "epoch": 0.15176151761517614, "grad_norm": 1.140625, "learning_rate": 4.3680709534368077e-05, "loss": 1.2652344703674316, "step": 266 }, { "epoch": 0.15233204963628583, "grad_norm": 1.09375, "learning_rate": 4.36529933481153e-05, "loss": 1.2816247940063477, "step": 267 }, { "epoch": 0.1529025816573955, "grad_norm": 1.046875, "learning_rate": 4.3625277161862536e-05, "loss": 1.2074222564697266, "step": 268 }, { "epoch": 0.15347311367850522, "grad_norm": 1.0859375, "learning_rate": 4.359756097560976e-05, "loss": 1.2124351263046265, "step": 269 }, { "epoch": 0.1540436456996149, "grad_norm": 1.0859375, "learning_rate": 4.356984478935698e-05, "loss": 1.187751293182373, "step": 270 }, { "epoch": 0.15461417772072458, "grad_norm": 1.046875, "learning_rate": 4.354212860310422e-05, "loss": 1.1458532810211182, "step": 271 }, { "epoch": 0.15518470974183426, "grad_norm": 1.1171875, "learning_rate": 4.351441241685144e-05, "loss": 1.229477882385254, "step": 272 }, { "epoch": 0.15575524176294395, "grad_norm": 1.2265625, "learning_rate": 4.348669623059867e-05, "loss": 1.2863445281982422, "step": 273 }, { "epoch": 0.15632577378405363, "grad_norm": 1.0703125, "learning_rate": 4.34589800443459e-05, "loss": 1.226841688156128, "step": 274 }, { "epoch": 0.1568963058051633, "grad_norm": 1.1640625, "learning_rate": 4.343126385809313e-05, "loss": 1.2147347927093506, "step": 275 }, { "epoch": 0.157466837826273, "grad_norm": 1.0546875, "learning_rate": 4.340354767184036e-05, "loss": 1.2533400058746338, "step": 276 }, { "epoch": 0.15803736984738267, "grad_norm": 1.0859375, "learning_rate": 4.337583148558759e-05, "loss": 1.2199838161468506, "step": 277 }, { "epoch": 0.15860790186849236, "grad_norm": 1.0546875, "learning_rate": 4.334811529933481e-05, "loss": 1.196079969406128, "step": 278 }, { "epoch": 0.15917843388960207, "grad_norm": 1.140625, "learning_rate": 4.3320399113082046e-05, "loss": 1.2512052059173584, "step": 279 }, { "epoch": 0.15974896591071175, "grad_norm": 1.1171875, "learning_rate": 4.329268292682927e-05, "loss": 1.2729978561401367, "step": 280 }, { "epoch": 0.16031949793182143, "grad_norm": 1.171875, "learning_rate": 4.32649667405765e-05, "loss": 1.2414803504943848, "step": 281 }, { "epoch": 0.1608900299529311, "grad_norm": 1.125, "learning_rate": 4.323725055432373e-05, "loss": 1.2329685688018799, "step": 282 }, { "epoch": 0.1614605619740408, "grad_norm": 1.0703125, "learning_rate": 4.320953436807095e-05, "loss": 1.2458125352859497, "step": 283 }, { "epoch": 0.16203109399515048, "grad_norm": 1.09375, "learning_rate": 4.318181818181819e-05, "loss": 1.2762466669082642, "step": 284 }, { "epoch": 0.16260162601626016, "grad_norm": 1.1484375, "learning_rate": 4.315410199556541e-05, "loss": 1.2883433103561401, "step": 285 }, { "epoch": 0.16317215803736984, "grad_norm": 1.109375, "learning_rate": 4.312638580931264e-05, "loss": 1.261974811553955, "step": 286 }, { "epoch": 0.16374269005847952, "grad_norm": 1.0625, "learning_rate": 4.309866962305987e-05, "loss": 1.2657639980316162, "step": 287 }, { "epoch": 0.1643132220795892, "grad_norm": 1.1328125, "learning_rate": 4.30709534368071e-05, "loss": 1.295043706893921, "step": 288 }, { "epoch": 0.16488375410069891, "grad_norm": 1.09375, "learning_rate": 4.304323725055433e-05, "loss": 1.2336839437484741, "step": 289 }, { "epoch": 0.1654542861218086, "grad_norm": 1.09375, "learning_rate": 4.301552106430156e-05, "loss": 1.264127492904663, "step": 290 }, { "epoch": 0.16602481814291828, "grad_norm": 1.078125, "learning_rate": 4.298780487804878e-05, "loss": 1.2246544361114502, "step": 291 }, { "epoch": 0.16659535016402796, "grad_norm": 1.0546875, "learning_rate": 4.2960088691796016e-05, "loss": 1.2040233612060547, "step": 292 }, { "epoch": 0.16716588218513764, "grad_norm": 1.09375, "learning_rate": 4.293237250554324e-05, "loss": 1.2784225940704346, "step": 293 }, { "epoch": 0.16773641420624733, "grad_norm": 1.109375, "learning_rate": 4.290465631929047e-05, "loss": 1.3152185678482056, "step": 294 }, { "epoch": 0.168306946227357, "grad_norm": 1.1015625, "learning_rate": 4.28769401330377e-05, "loss": 1.2193617820739746, "step": 295 }, { "epoch": 0.1688774782484667, "grad_norm": 1.1015625, "learning_rate": 4.284922394678492e-05, "loss": 1.2813901901245117, "step": 296 }, { "epoch": 0.16944801026957637, "grad_norm": 1.09375, "learning_rate": 4.2821507760532156e-05, "loss": 1.205044150352478, "step": 297 }, { "epoch": 0.17001854229068605, "grad_norm": 1.1015625, "learning_rate": 4.279379157427938e-05, "loss": 1.2626889944076538, "step": 298 }, { "epoch": 0.17058907431179576, "grad_norm": 1.1484375, "learning_rate": 4.276607538802661e-05, "loss": 1.2680320739746094, "step": 299 }, { "epoch": 0.17115960633290545, "grad_norm": 1.0859375, "learning_rate": 4.273835920177384e-05, "loss": 1.2155548334121704, "step": 300 }, { "epoch": 0.17173013835401513, "grad_norm": 1.1875, "learning_rate": 4.271064301552107e-05, "loss": 1.2199232578277588, "step": 301 }, { "epoch": 0.1723006703751248, "grad_norm": 1.1171875, "learning_rate": 4.26829268292683e-05, "loss": 1.2747461795806885, "step": 302 }, { "epoch": 0.1728712023962345, "grad_norm": 1.0546875, "learning_rate": 4.2655210643015526e-05, "loss": 1.235656976699829, "step": 303 }, { "epoch": 0.17344173441734417, "grad_norm": 1.1875, "learning_rate": 4.262749445676275e-05, "loss": 1.3054769039154053, "step": 304 }, { "epoch": 0.17401226643845386, "grad_norm": 1.125, "learning_rate": 4.2599778270509985e-05, "loss": 1.2325561046600342, "step": 305 }, { "epoch": 0.17458279845956354, "grad_norm": 1.0546875, "learning_rate": 4.257206208425721e-05, "loss": 1.1963461637496948, "step": 306 }, { "epoch": 0.17515333048067322, "grad_norm": 1.09375, "learning_rate": 4.254434589800444e-05, "loss": 1.2029732465744019, "step": 307 }, { "epoch": 0.1757238625017829, "grad_norm": 1.0703125, "learning_rate": 4.251662971175167e-05, "loss": 1.289282202720642, "step": 308 }, { "epoch": 0.17629439452289258, "grad_norm": 1.1640625, "learning_rate": 4.248891352549889e-05, "loss": 1.2570784091949463, "step": 309 }, { "epoch": 0.1768649265440023, "grad_norm": 1.0703125, "learning_rate": 4.2461197339246126e-05, "loss": 1.1787132024765015, "step": 310 }, { "epoch": 0.17743545856511198, "grad_norm": 1.1328125, "learning_rate": 4.243348115299335e-05, "loss": 1.2079870700836182, "step": 311 }, { "epoch": 0.17800599058622166, "grad_norm": 1.1640625, "learning_rate": 4.240576496674058e-05, "loss": 1.2776343822479248, "step": 312 }, { "epoch": 0.17857652260733134, "grad_norm": 1.0625, "learning_rate": 4.237804878048781e-05, "loss": 1.1856639385223389, "step": 313 }, { "epoch": 0.17914705462844102, "grad_norm": 1.1328125, "learning_rate": 4.235033259423504e-05, "loss": 1.268944501876831, "step": 314 }, { "epoch": 0.1797175866495507, "grad_norm": 1.140625, "learning_rate": 4.2322616407982266e-05, "loss": 1.2755537033081055, "step": 315 }, { "epoch": 0.18028811867066039, "grad_norm": 1.0703125, "learning_rate": 4.2294900221729496e-05, "loss": 1.274179458618164, "step": 316 }, { "epoch": 0.18085865069177007, "grad_norm": 1.1015625, "learning_rate": 4.226718403547672e-05, "loss": 1.2530457973480225, "step": 317 }, { "epoch": 0.18142918271287975, "grad_norm": 1.1015625, "learning_rate": 4.2239467849223955e-05, "loss": 1.1844085454940796, "step": 318 }, { "epoch": 0.18199971473398943, "grad_norm": 1.1328125, "learning_rate": 4.221175166297118e-05, "loss": 1.3111554384231567, "step": 319 }, { "epoch": 0.18257024675509914, "grad_norm": 1.1328125, "learning_rate": 4.21840354767184e-05, "loss": 1.2178188562393188, "step": 320 }, { "epoch": 0.18314077877620882, "grad_norm": 1.1015625, "learning_rate": 4.2156319290465636e-05, "loss": 1.2369928359985352, "step": 321 }, { "epoch": 0.1837113107973185, "grad_norm": 1.0625, "learning_rate": 4.212860310421286e-05, "loss": 1.1851946115493774, "step": 322 }, { "epoch": 0.1842818428184282, "grad_norm": 1.09375, "learning_rate": 4.210088691796009e-05, "loss": 1.2697205543518066, "step": 323 }, { "epoch": 0.18485237483953787, "grad_norm": 1.078125, "learning_rate": 4.207317073170732e-05, "loss": 1.2498860359191895, "step": 324 }, { "epoch": 0.18542290686064755, "grad_norm": 1.109375, "learning_rate": 4.204545454545455e-05, "loss": 1.2507086992263794, "step": 325 }, { "epoch": 0.18599343888175723, "grad_norm": 1.03125, "learning_rate": 4.201773835920178e-05, "loss": 1.2160149812698364, "step": 326 }, { "epoch": 0.18656397090286692, "grad_norm": 1.078125, "learning_rate": 4.1990022172949006e-05, "loss": 1.238983392715454, "step": 327 }, { "epoch": 0.1871345029239766, "grad_norm": 1.03125, "learning_rate": 4.196230598669623e-05, "loss": 1.2306344509124756, "step": 328 }, { "epoch": 0.18770503494508628, "grad_norm": 1.1015625, "learning_rate": 4.1934589800443465e-05, "loss": 1.27529776096344, "step": 329 }, { "epoch": 0.188275566966196, "grad_norm": 1.0859375, "learning_rate": 4.190687361419069e-05, "loss": 1.2787272930145264, "step": 330 }, { "epoch": 0.18884609898730567, "grad_norm": 1.0546875, "learning_rate": 4.187915742793792e-05, "loss": 1.2454849481582642, "step": 331 }, { "epoch": 0.18941663100841535, "grad_norm": 1.0078125, "learning_rate": 4.185144124168515e-05, "loss": 1.2060352563858032, "step": 332 }, { "epoch": 0.18998716302952504, "grad_norm": 1.078125, "learning_rate": 4.182372505543237e-05, "loss": 1.2341554164886475, "step": 333 }, { "epoch": 0.19055769505063472, "grad_norm": 1.1015625, "learning_rate": 4.1796008869179606e-05, "loss": 1.2774791717529297, "step": 334 }, { "epoch": 0.1911282270717444, "grad_norm": 1.0546875, "learning_rate": 4.176829268292683e-05, "loss": 1.2547677755355835, "step": 335 }, { "epoch": 0.19169875909285408, "grad_norm": 1.1015625, "learning_rate": 4.174057649667406e-05, "loss": 1.286057472229004, "step": 336 }, { "epoch": 0.19226929111396376, "grad_norm": 1.1015625, "learning_rate": 4.171286031042129e-05, "loss": 1.2891746759414673, "step": 337 }, { "epoch": 0.19283982313507345, "grad_norm": 1.046875, "learning_rate": 4.168514412416852e-05, "loss": 1.2376006841659546, "step": 338 }, { "epoch": 0.19341035515618313, "grad_norm": 1.0703125, "learning_rate": 4.1657427937915746e-05, "loss": 1.2672202587127686, "step": 339 }, { "epoch": 0.19398088717729284, "grad_norm": 1.1015625, "learning_rate": 4.1629711751662976e-05, "loss": 1.2037293910980225, "step": 340 }, { "epoch": 0.19455141919840252, "grad_norm": 1.0859375, "learning_rate": 4.16019955654102e-05, "loss": 1.218858003616333, "step": 341 }, { "epoch": 0.1951219512195122, "grad_norm": 1.0859375, "learning_rate": 4.1574279379157435e-05, "loss": 1.2183986902236938, "step": 342 }, { "epoch": 0.19569248324062188, "grad_norm": 1.1171875, "learning_rate": 4.154656319290466e-05, "loss": 1.2573124170303345, "step": 343 }, { "epoch": 0.19626301526173157, "grad_norm": 1.125, "learning_rate": 4.151884700665189e-05, "loss": 1.21070396900177, "step": 344 }, { "epoch": 0.19683354728284125, "grad_norm": 1.09375, "learning_rate": 4.1491130820399116e-05, "loss": 1.286003589630127, "step": 345 }, { "epoch": 0.19740407930395093, "grad_norm": 1.1015625, "learning_rate": 4.146341463414634e-05, "loss": 1.2600152492523193, "step": 346 }, { "epoch": 0.1979746113250606, "grad_norm": 1.203125, "learning_rate": 4.1435698447893575e-05, "loss": 1.2338290214538574, "step": 347 }, { "epoch": 0.1985451433461703, "grad_norm": 1.15625, "learning_rate": 4.14079822616408e-05, "loss": 1.2722115516662598, "step": 348 }, { "epoch": 0.19911567536727998, "grad_norm": 1.0859375, "learning_rate": 4.138026607538803e-05, "loss": 1.1988334655761719, "step": 349 }, { "epoch": 0.19968620738838969, "grad_norm": 1.0859375, "learning_rate": 4.135254988913526e-05, "loss": 1.2339057922363281, "step": 350 }, { "epoch": 0.20025673940949937, "grad_norm": 1.125, "learning_rate": 4.1324833702882486e-05, "loss": 1.2363622188568115, "step": 351 }, { "epoch": 0.20082727143060905, "grad_norm": 1.0859375, "learning_rate": 4.1297117516629716e-05, "loss": 1.2658472061157227, "step": 352 }, { "epoch": 0.20139780345171873, "grad_norm": 1.0546875, "learning_rate": 4.1269401330376945e-05, "loss": 1.2181835174560547, "step": 353 }, { "epoch": 0.20196833547282841, "grad_norm": 1.1171875, "learning_rate": 4.124168514412417e-05, "loss": 1.2710312604904175, "step": 354 }, { "epoch": 0.2025388674939381, "grad_norm": 1.0625, "learning_rate": 4.12139689578714e-05, "loss": 1.176246166229248, "step": 355 }, { "epoch": 0.20310939951504778, "grad_norm": 1.0625, "learning_rate": 4.118625277161863e-05, "loss": 1.24937903881073, "step": 356 }, { "epoch": 0.20367993153615746, "grad_norm": 1.0703125, "learning_rate": 4.1158536585365856e-05, "loss": 1.2401498556137085, "step": 357 }, { "epoch": 0.20425046355726714, "grad_norm": 1.046875, "learning_rate": 4.1130820399113086e-05, "loss": 1.2015979290008545, "step": 358 }, { "epoch": 0.20482099557837682, "grad_norm": 1.1484375, "learning_rate": 4.110310421286031e-05, "loss": 1.2495380640029907, "step": 359 }, { "epoch": 0.20539152759948653, "grad_norm": 1.0703125, "learning_rate": 4.1075388026607545e-05, "loss": 1.2646973133087158, "step": 360 }, { "epoch": 0.20596205962059622, "grad_norm": 1.046875, "learning_rate": 4.104767184035477e-05, "loss": 1.2007383108139038, "step": 361 }, { "epoch": 0.2065325916417059, "grad_norm": 1.046875, "learning_rate": 4.1019955654102e-05, "loss": 1.226219892501831, "step": 362 }, { "epoch": 0.20710312366281558, "grad_norm": 1.0703125, "learning_rate": 4.0992239467849226e-05, "loss": 1.306444525718689, "step": 363 }, { "epoch": 0.20767365568392526, "grad_norm": 1.0625, "learning_rate": 4.0964523281596456e-05, "loss": 1.2141070365905762, "step": 364 }, { "epoch": 0.20824418770503494, "grad_norm": 1.0390625, "learning_rate": 4.0936807095343685e-05, "loss": 1.2149772644042969, "step": 365 }, { "epoch": 0.20881471972614463, "grad_norm": 1.0703125, "learning_rate": 4.0909090909090915e-05, "loss": 1.2671623229980469, "step": 366 }, { "epoch": 0.2093852517472543, "grad_norm": 1.0546875, "learning_rate": 4.088137472283814e-05, "loss": 1.2434954643249512, "step": 367 }, { "epoch": 0.209955783768364, "grad_norm": 1.03125, "learning_rate": 4.085365853658537e-05, "loss": 1.2326661348342896, "step": 368 }, { "epoch": 0.21052631578947367, "grad_norm": 1.0390625, "learning_rate": 4.0825942350332596e-05, "loss": 1.2969672679901123, "step": 369 }, { "epoch": 0.21052631578947367, "eval_loss": 1.238897681236267, "eval_runtime": 80.0789, "eval_samples_per_second": 11.938, "eval_steps_per_second": 2.985, "step": 369 }, { "epoch": 0.21109684781058338, "grad_norm": 1.0078125, "learning_rate": 4.0798226164079826e-05, "loss": 1.203234076499939, "step": 370 }, { "epoch": 0.21166737983169306, "grad_norm": 1.0390625, "learning_rate": 4.0770509977827055e-05, "loss": 1.2333259582519531, "step": 371 }, { "epoch": 0.21223791185280275, "grad_norm": 1.046875, "learning_rate": 4.074279379157428e-05, "loss": 1.2060984373092651, "step": 372 }, { "epoch": 0.21280844387391243, "grad_norm": 1.0234375, "learning_rate": 4.0715077605321514e-05, "loss": 1.1909129619598389, "step": 373 }, { "epoch": 0.2133789758950221, "grad_norm": 1.046875, "learning_rate": 4.068736141906874e-05, "loss": 1.2396963834762573, "step": 374 }, { "epoch": 0.2139495079161318, "grad_norm": 1.0078125, "learning_rate": 4.0659645232815966e-05, "loss": 1.1830250024795532, "step": 375 }, { "epoch": 0.21452003993724147, "grad_norm": 1.03125, "learning_rate": 4.0631929046563196e-05, "loss": 1.207044005393982, "step": 376 }, { "epoch": 0.21509057195835116, "grad_norm": 1.203125, "learning_rate": 4.0604212860310425e-05, "loss": 1.2795757055282593, "step": 377 }, { "epoch": 0.21566110397946084, "grad_norm": 1.0546875, "learning_rate": 4.057649667405765e-05, "loss": 1.2492969036102295, "step": 378 }, { "epoch": 0.21623163600057052, "grad_norm": 1.0546875, "learning_rate": 4.0548780487804884e-05, "loss": 1.3094936609268188, "step": 379 }, { "epoch": 0.21680216802168023, "grad_norm": 1.0390625, "learning_rate": 4.052106430155211e-05, "loss": 1.2260823249816895, "step": 380 }, { "epoch": 0.2173727000427899, "grad_norm": 1.078125, "learning_rate": 4.0493348115299336e-05, "loss": 1.2405587434768677, "step": 381 }, { "epoch": 0.2179432320638996, "grad_norm": 1.09375, "learning_rate": 4.0465631929046566e-05, "loss": 1.1963216066360474, "step": 382 }, { "epoch": 0.21851376408500928, "grad_norm": 1.1484375, "learning_rate": 4.043791574279379e-05, "loss": 1.2458081245422363, "step": 383 }, { "epoch": 0.21908429610611896, "grad_norm": 1.0859375, "learning_rate": 4.0410199556541025e-05, "loss": 1.1974573135375977, "step": 384 }, { "epoch": 0.21965482812722864, "grad_norm": 1.0390625, "learning_rate": 4.038248337028825e-05, "loss": 1.2237815856933594, "step": 385 }, { "epoch": 0.22022536014833832, "grad_norm": 1.03125, "learning_rate": 4.035476718403548e-05, "loss": 1.2369771003723145, "step": 386 }, { "epoch": 0.220795892169448, "grad_norm": 1.0859375, "learning_rate": 4.0327050997782706e-05, "loss": 1.2545832395553589, "step": 387 }, { "epoch": 0.2213664241905577, "grad_norm": 1.03125, "learning_rate": 4.0299334811529936e-05, "loss": 1.2126426696777344, "step": 388 }, { "epoch": 0.22193695621166737, "grad_norm": 1.0625, "learning_rate": 4.0271618625277165e-05, "loss": 1.2321901321411133, "step": 389 }, { "epoch": 0.22250748823277705, "grad_norm": 1.03125, "learning_rate": 4.0243902439024395e-05, "loss": 1.2315490245819092, "step": 390 }, { "epoch": 0.22307802025388676, "grad_norm": 1.0390625, "learning_rate": 4.021618625277162e-05, "loss": 1.1859689950942993, "step": 391 }, { "epoch": 0.22364855227499644, "grad_norm": 1.03125, "learning_rate": 4.018847006651885e-05, "loss": 1.2416760921478271, "step": 392 }, { "epoch": 0.22421908429610612, "grad_norm": 1.1015625, "learning_rate": 4.0160753880266076e-05, "loss": 1.3080382347106934, "step": 393 }, { "epoch": 0.2247896163172158, "grad_norm": 1.0625, "learning_rate": 4.0133037694013306e-05, "loss": 1.2275526523590088, "step": 394 }, { "epoch": 0.2253601483383255, "grad_norm": 1.03125, "learning_rate": 4.0105321507760535e-05, "loss": 1.2734044790267944, "step": 395 }, { "epoch": 0.22593068035943517, "grad_norm": 1.09375, "learning_rate": 4.007760532150776e-05, "loss": 1.2480955123901367, "step": 396 }, { "epoch": 0.22650121238054485, "grad_norm": 1.078125, "learning_rate": 4.0049889135254994e-05, "loss": 1.2629410028457642, "step": 397 }, { "epoch": 0.22707174440165453, "grad_norm": 1.046875, "learning_rate": 4.002217294900222e-05, "loss": 1.190090537071228, "step": 398 }, { "epoch": 0.22764227642276422, "grad_norm": 1.0859375, "learning_rate": 3.9994456762749446e-05, "loss": 1.2843146324157715, "step": 399 }, { "epoch": 0.2282128084438739, "grad_norm": 1.0546875, "learning_rate": 3.9966740576496676e-05, "loss": 1.2836047410964966, "step": 400 }, { "epoch": 0.2287833404649836, "grad_norm": 1.0234375, "learning_rate": 3.9939024390243905e-05, "loss": 1.1873021125793457, "step": 401 }, { "epoch": 0.2293538724860933, "grad_norm": 1.03125, "learning_rate": 3.9911308203991135e-05, "loss": 1.228004813194275, "step": 402 }, { "epoch": 0.22992440450720297, "grad_norm": 1.0078125, "learning_rate": 3.9883592017738364e-05, "loss": 1.2318588495254517, "step": 403 }, { "epoch": 0.23049493652831265, "grad_norm": 1.046875, "learning_rate": 3.985587583148559e-05, "loss": 1.218421220779419, "step": 404 }, { "epoch": 0.23106546854942234, "grad_norm": 1.0625, "learning_rate": 3.9828159645232816e-05, "loss": 1.3068960905075073, "step": 405 }, { "epoch": 0.23163600057053202, "grad_norm": 1.0234375, "learning_rate": 3.9800443458980046e-05, "loss": 1.2189011573791504, "step": 406 }, { "epoch": 0.2322065325916417, "grad_norm": 1.046875, "learning_rate": 3.9772727272727275e-05, "loss": 1.2019367218017578, "step": 407 }, { "epoch": 0.23277706461275138, "grad_norm": 1.046875, "learning_rate": 3.9745011086474505e-05, "loss": 1.2285387516021729, "step": 408 }, { "epoch": 0.23334759663386107, "grad_norm": 1.015625, "learning_rate": 3.971729490022173e-05, "loss": 1.1963067054748535, "step": 409 }, { "epoch": 0.23391812865497075, "grad_norm": 1.078125, "learning_rate": 3.9689578713968964e-05, "loss": 1.3005050420761108, "step": 410 }, { "epoch": 0.23448866067608046, "grad_norm": 1.0234375, "learning_rate": 3.9661862527716186e-05, "loss": 1.2429478168487549, "step": 411 }, { "epoch": 0.23505919269719014, "grad_norm": 1.046875, "learning_rate": 3.9634146341463416e-05, "loss": 1.2445229291915894, "step": 412 }, { "epoch": 0.23562972471829982, "grad_norm": 1.046875, "learning_rate": 3.9606430155210645e-05, "loss": 1.2569499015808105, "step": 413 }, { "epoch": 0.2362002567394095, "grad_norm": 1.0, "learning_rate": 3.9578713968957875e-05, "loss": 1.232776165008545, "step": 414 }, { "epoch": 0.23677078876051919, "grad_norm": 1.03125, "learning_rate": 3.9550997782705104e-05, "loss": 1.2104380130767822, "step": 415 }, { "epoch": 0.23734132078162887, "grad_norm": 1.1015625, "learning_rate": 3.952328159645233e-05, "loss": 1.2908308506011963, "step": 416 }, { "epoch": 0.23791185280273855, "grad_norm": 1.0625, "learning_rate": 3.9495565410199557e-05, "loss": 1.1678047180175781, "step": 417 }, { "epoch": 0.23848238482384823, "grad_norm": 1.0859375, "learning_rate": 3.9467849223946786e-05, "loss": 1.310725212097168, "step": 418 }, { "epoch": 0.2390529168449579, "grad_norm": 1.0546875, "learning_rate": 3.9440133037694015e-05, "loss": 1.2618491649627686, "step": 419 }, { "epoch": 0.2396234488660676, "grad_norm": 1.0234375, "learning_rate": 3.9412416851441245e-05, "loss": 1.1795238256454468, "step": 420 }, { "epoch": 0.2401939808871773, "grad_norm": 1.0546875, "learning_rate": 3.9384700665188474e-05, "loss": 1.2187573909759521, "step": 421 }, { "epoch": 0.240764512908287, "grad_norm": 1.03125, "learning_rate": 3.93569844789357e-05, "loss": 1.2171461582183838, "step": 422 }, { "epoch": 0.24133504492939667, "grad_norm": 1.046875, "learning_rate": 3.932926829268293e-05, "loss": 1.2295634746551514, "step": 423 }, { "epoch": 0.24190557695050635, "grad_norm": 1.0859375, "learning_rate": 3.9301552106430156e-05, "loss": 1.2483271360397339, "step": 424 }, { "epoch": 0.24247610897161603, "grad_norm": 1.0546875, "learning_rate": 3.9273835920177385e-05, "loss": 1.1881691217422485, "step": 425 }, { "epoch": 0.24304664099272572, "grad_norm": 1.03125, "learning_rate": 3.9246119733924615e-05, "loss": 1.1997624635696411, "step": 426 }, { "epoch": 0.2436171730138354, "grad_norm": 1.1015625, "learning_rate": 3.9218403547671844e-05, "loss": 1.2510207891464233, "step": 427 }, { "epoch": 0.24418770503494508, "grad_norm": 1.046875, "learning_rate": 3.9190687361419074e-05, "loss": 1.2188156843185425, "step": 428 }, { "epoch": 0.24475823705605476, "grad_norm": 1.0234375, "learning_rate": 3.9162971175166297e-05, "loss": 1.228477954864502, "step": 429 }, { "epoch": 0.24532876907716444, "grad_norm": 1.1171875, "learning_rate": 3.9135254988913526e-05, "loss": 1.3039709329605103, "step": 430 }, { "epoch": 0.24589930109827415, "grad_norm": 1.0703125, "learning_rate": 3.9107538802660755e-05, "loss": 1.2193942070007324, "step": 431 }, { "epoch": 0.24646983311938384, "grad_norm": 1.0859375, "learning_rate": 3.9079822616407985e-05, "loss": 1.2380352020263672, "step": 432 }, { "epoch": 0.24704036514049352, "grad_norm": 1.046875, "learning_rate": 3.905210643015521e-05, "loss": 1.1670141220092773, "step": 433 }, { "epoch": 0.2476108971616032, "grad_norm": 1.0625, "learning_rate": 3.9024390243902444e-05, "loss": 1.2406682968139648, "step": 434 }, { "epoch": 0.24818142918271288, "grad_norm": 1.0625, "learning_rate": 3.8996674057649667e-05, "loss": 1.200782060623169, "step": 435 }, { "epoch": 0.24875196120382256, "grad_norm": 1.046875, "learning_rate": 3.89689578713969e-05, "loss": 1.1442952156066895, "step": 436 }, { "epoch": 0.24932249322493225, "grad_norm": 1.03125, "learning_rate": 3.8941241685144125e-05, "loss": 1.15338134765625, "step": 437 }, { "epoch": 0.24989302524604193, "grad_norm": 0.98828125, "learning_rate": 3.8913525498891355e-05, "loss": 1.1609077453613281, "step": 438 }, { "epoch": 0.25046355726715164, "grad_norm": 1.1015625, "learning_rate": 3.8885809312638584e-05, "loss": 1.257835030555725, "step": 439 }, { "epoch": 0.2510340892882613, "grad_norm": 1.0859375, "learning_rate": 3.8858093126385814e-05, "loss": 1.2244375944137573, "step": 440 }, { "epoch": 0.251604621309371, "grad_norm": 1.0859375, "learning_rate": 3.8830376940133037e-05, "loss": 1.2138961553573608, "step": 441 }, { "epoch": 0.2521751533304807, "grad_norm": 1.0546875, "learning_rate": 3.8802660753880266e-05, "loss": 1.240128755569458, "step": 442 }, { "epoch": 0.25274568535159037, "grad_norm": 1.0, "learning_rate": 3.8774944567627496e-05, "loss": 1.2070982456207275, "step": 443 }, { "epoch": 0.25331621737270005, "grad_norm": 1.0546875, "learning_rate": 3.8747228381374725e-05, "loss": 1.2733830213546753, "step": 444 }, { "epoch": 0.25388674939380973, "grad_norm": 1.046875, "learning_rate": 3.8719512195121954e-05, "loss": 1.1820507049560547, "step": 445 }, { "epoch": 0.2544572814149194, "grad_norm": 1.0078125, "learning_rate": 3.869179600886918e-05, "loss": 1.196885108947754, "step": 446 }, { "epoch": 0.2550278134360291, "grad_norm": 1.03125, "learning_rate": 3.866407982261641e-05, "loss": 1.1905972957611084, "step": 447 }, { "epoch": 0.2555983454571388, "grad_norm": 1.09375, "learning_rate": 3.8636363636363636e-05, "loss": 1.2579684257507324, "step": 448 }, { "epoch": 0.25616887747824846, "grad_norm": 1.0, "learning_rate": 3.8608647450110866e-05, "loss": 1.1727596521377563, "step": 449 }, { "epoch": 0.25673940949935814, "grad_norm": 1.015625, "learning_rate": 3.8580931263858095e-05, "loss": 1.1504234075546265, "step": 450 }, { "epoch": 0.2573099415204678, "grad_norm": 1.015625, "learning_rate": 3.8553215077605324e-05, "loss": 1.1405715942382812, "step": 451 }, { "epoch": 0.2578804735415775, "grad_norm": 0.98828125, "learning_rate": 3.8525498891352554e-05, "loss": 1.220837116241455, "step": 452 }, { "epoch": 0.2584510055626872, "grad_norm": 1.0234375, "learning_rate": 3.8497782705099777e-05, "loss": 1.1962711811065674, "step": 453 }, { "epoch": 0.25902153758379687, "grad_norm": 1.015625, "learning_rate": 3.8470066518847006e-05, "loss": 1.1877164840698242, "step": 454 }, { "epoch": 0.25959206960490655, "grad_norm": 1.046875, "learning_rate": 3.8442350332594236e-05, "loss": 1.2504132986068726, "step": 455 }, { "epoch": 0.2601626016260163, "grad_norm": 1.015625, "learning_rate": 3.8414634146341465e-05, "loss": 1.1902315616607666, "step": 456 }, { "epoch": 0.26073313364712597, "grad_norm": 1.03125, "learning_rate": 3.8386917960088694e-05, "loss": 1.2856203317642212, "step": 457 }, { "epoch": 0.26130366566823565, "grad_norm": 1.0625, "learning_rate": 3.8359201773835924e-05, "loss": 1.2528060674667358, "step": 458 }, { "epoch": 0.26187419768934533, "grad_norm": 1.078125, "learning_rate": 3.833148558758315e-05, "loss": 1.1831871271133423, "step": 459 }, { "epoch": 0.262444729710455, "grad_norm": 1.015625, "learning_rate": 3.830376940133038e-05, "loss": 1.1781988143920898, "step": 460 }, { "epoch": 0.2630152617315647, "grad_norm": 1.015625, "learning_rate": 3.8276053215077606e-05, "loss": 1.193709373474121, "step": 461 }, { "epoch": 0.2635857937526744, "grad_norm": 1.078125, "learning_rate": 3.8248337028824835e-05, "loss": 1.1997225284576416, "step": 462 }, { "epoch": 0.26415632577378406, "grad_norm": 1.0078125, "learning_rate": 3.8220620842572064e-05, "loss": 1.159136176109314, "step": 463 }, { "epoch": 0.26472685779489374, "grad_norm": 1.015625, "learning_rate": 3.8192904656319294e-05, "loss": 1.242883324623108, "step": 464 }, { "epoch": 0.2652973898160034, "grad_norm": 1.0703125, "learning_rate": 3.8165188470066523e-05, "loss": 1.2907770872116089, "step": 465 }, { "epoch": 0.2658679218371131, "grad_norm": 1.09375, "learning_rate": 3.8137472283813746e-05, "loss": 1.2596560716629028, "step": 466 }, { "epoch": 0.2664384538582228, "grad_norm": 1.1171875, "learning_rate": 3.8109756097560976e-05, "loss": 1.2509888410568237, "step": 467 }, { "epoch": 0.26700898587933247, "grad_norm": 0.97265625, "learning_rate": 3.8082039911308205e-05, "loss": 1.2029120922088623, "step": 468 }, { "epoch": 0.26757951790044215, "grad_norm": 1.015625, "learning_rate": 3.8054323725055435e-05, "loss": 1.210568904876709, "step": 469 }, { "epoch": 0.26815004992155184, "grad_norm": 0.9921875, "learning_rate": 3.8026607538802664e-05, "loss": 1.1661216020584106, "step": 470 }, { "epoch": 0.2687205819426615, "grad_norm": 1.03125, "learning_rate": 3.7998891352549893e-05, "loss": 1.229252576828003, "step": 471 }, { "epoch": 0.2692911139637712, "grad_norm": 1.0546875, "learning_rate": 3.7971175166297116e-05, "loss": 1.209242343902588, "step": 472 }, { "epoch": 0.2698616459848809, "grad_norm": 1.0390625, "learning_rate": 3.794345898004435e-05, "loss": 1.2709503173828125, "step": 473 }, { "epoch": 0.27043217800599056, "grad_norm": 1.078125, "learning_rate": 3.7915742793791575e-05, "loss": 1.2316001653671265, "step": 474 }, { "epoch": 0.27100271002710025, "grad_norm": 1.03125, "learning_rate": 3.7888026607538805e-05, "loss": 1.2138065099716187, "step": 475 }, { "epoch": 0.27157324204820993, "grad_norm": 1.0, "learning_rate": 3.7860310421286034e-05, "loss": 1.1936984062194824, "step": 476 }, { "epoch": 0.27214377406931967, "grad_norm": 1.015625, "learning_rate": 3.783259423503326e-05, "loss": 1.2338573932647705, "step": 477 }, { "epoch": 0.27271430609042935, "grad_norm": 1.046875, "learning_rate": 3.780487804878049e-05, "loss": 1.2421263456344604, "step": 478 }, { "epoch": 0.27328483811153903, "grad_norm": 1.03125, "learning_rate": 3.7777161862527716e-05, "loss": 1.2414464950561523, "step": 479 }, { "epoch": 0.2738553701326487, "grad_norm": 0.99609375, "learning_rate": 3.7749445676274945e-05, "loss": 1.2261340618133545, "step": 480 }, { "epoch": 0.2744259021537584, "grad_norm": 1.0234375, "learning_rate": 3.7721729490022175e-05, "loss": 1.208221435546875, "step": 481 }, { "epoch": 0.2749964341748681, "grad_norm": 1.0234375, "learning_rate": 3.7694013303769404e-05, "loss": 1.2820276021957397, "step": 482 }, { "epoch": 0.27556696619597776, "grad_norm": 1.0703125, "learning_rate": 3.7666297117516633e-05, "loss": 1.262161374092102, "step": 483 }, { "epoch": 0.27613749821708744, "grad_norm": 0.9921875, "learning_rate": 3.763858093126386e-05, "loss": 1.2242916822433472, "step": 484 }, { "epoch": 0.2767080302381971, "grad_norm": 1.0078125, "learning_rate": 3.7610864745011086e-05, "loss": 1.1797833442687988, "step": 485 }, { "epoch": 0.2772785622593068, "grad_norm": 1.0546875, "learning_rate": 3.758314855875832e-05, "loss": 1.2725660800933838, "step": 486 }, { "epoch": 0.2778490942804165, "grad_norm": 1.015625, "learning_rate": 3.7555432372505545e-05, "loss": 1.195313572883606, "step": 487 }, { "epoch": 0.27841962630152617, "grad_norm": 0.94921875, "learning_rate": 3.7527716186252774e-05, "loss": 1.1661468744277954, "step": 488 }, { "epoch": 0.27899015832263585, "grad_norm": 1.0390625, "learning_rate": 3.7500000000000003e-05, "loss": 1.2072978019714355, "step": 489 }, { "epoch": 0.27956069034374553, "grad_norm": 0.98828125, "learning_rate": 3.7472283813747226e-05, "loss": 1.203414797782898, "step": 490 }, { "epoch": 0.2801312223648552, "grad_norm": 1.0390625, "learning_rate": 3.7444567627494456e-05, "loss": 1.2426180839538574, "step": 491 }, { "epoch": 0.2807017543859649, "grad_norm": 1.015625, "learning_rate": 3.7416851441241685e-05, "loss": 1.232536792755127, "step": 492 }, { "epoch": 0.2812722864070746, "grad_norm": 1.03125, "learning_rate": 3.7389135254988915e-05, "loss": 1.266850471496582, "step": 493 }, { "epoch": 0.28184281842818426, "grad_norm": 1.046875, "learning_rate": 3.7361419068736144e-05, "loss": 1.2585172653198242, "step": 494 }, { "epoch": 0.28241335044929394, "grad_norm": 0.99609375, "learning_rate": 3.7333702882483374e-05, "loss": 1.2028322219848633, "step": 495 }, { "epoch": 0.2829838824704036, "grad_norm": 1.0546875, "learning_rate": 3.7305986696230596e-05, "loss": 1.2268320322036743, "step": 496 }, { "epoch": 0.28355441449151336, "grad_norm": 1.03125, "learning_rate": 3.727827050997783e-05, "loss": 1.2339527606964111, "step": 497 }, { "epoch": 0.28412494651262304, "grad_norm": 1.046875, "learning_rate": 3.7250554323725055e-05, "loss": 1.2072274684906006, "step": 498 }, { "epoch": 0.2846954785337327, "grad_norm": 0.99609375, "learning_rate": 3.7222838137472285e-05, "loss": 1.235311508178711, "step": 499 }, { "epoch": 0.2852660105548424, "grad_norm": 1.0390625, "learning_rate": 3.7195121951219514e-05, "loss": 1.2435599565505981, "step": 500 }, { "epoch": 0.2858365425759521, "grad_norm": 1.0390625, "learning_rate": 3.7167405764966744e-05, "loss": 1.2234078645706177, "step": 501 }, { "epoch": 0.2864070745970618, "grad_norm": 1.0703125, "learning_rate": 3.713968957871397e-05, "loss": 1.2654131650924683, "step": 502 }, { "epoch": 0.28697760661817145, "grad_norm": 1.0625, "learning_rate": 3.7111973392461196e-05, "loss": 1.226614236831665, "step": 503 }, { "epoch": 0.28754813863928114, "grad_norm": 1.109375, "learning_rate": 3.7084257206208425e-05, "loss": 1.2334555387496948, "step": 504 }, { "epoch": 0.2881186706603908, "grad_norm": 1.0546875, "learning_rate": 3.7056541019955655e-05, "loss": 1.2169506549835205, "step": 505 }, { "epoch": 0.2886892026815005, "grad_norm": 1.078125, "learning_rate": 3.7028824833702884e-05, "loss": 1.2664920091629028, "step": 506 }, { "epoch": 0.2892597347026102, "grad_norm": 1.2109375, "learning_rate": 3.7001108647450114e-05, "loss": 1.2238786220550537, "step": 507 }, { "epoch": 0.28983026672371986, "grad_norm": 1.03125, "learning_rate": 3.697339246119734e-05, "loss": 1.179901361465454, "step": 508 }, { "epoch": 0.29040079874482955, "grad_norm": 1.078125, "learning_rate": 3.6945676274944566e-05, "loss": 1.2527443170547485, "step": 509 }, { "epoch": 0.29097133076593923, "grad_norm": 1.0234375, "learning_rate": 3.69179600886918e-05, "loss": 1.2478464841842651, "step": 510 }, { "epoch": 0.2915418627870489, "grad_norm": 1.1015625, "learning_rate": 3.6890243902439025e-05, "loss": 1.2006577253341675, "step": 511 }, { "epoch": 0.2921123948081586, "grad_norm": 1.1015625, "learning_rate": 3.6862527716186254e-05, "loss": 1.283043384552002, "step": 512 }, { "epoch": 0.2926829268292683, "grad_norm": 1.0234375, "learning_rate": 3.6834811529933484e-05, "loss": 1.223816156387329, "step": 513 }, { "epoch": 0.29325345885037796, "grad_norm": 1.1171875, "learning_rate": 3.6807095343680706e-05, "loss": 1.2357165813446045, "step": 514 }, { "epoch": 0.29382399087148764, "grad_norm": 1.078125, "learning_rate": 3.677937915742794e-05, "loss": 1.2494802474975586, "step": 515 }, { "epoch": 0.2943945228925973, "grad_norm": 1.046875, "learning_rate": 3.6751662971175165e-05, "loss": 1.2093576192855835, "step": 516 }, { "epoch": 0.29496505491370706, "grad_norm": 1.0234375, "learning_rate": 3.6723946784922395e-05, "loss": 1.192871332168579, "step": 517 }, { "epoch": 0.29553558693481674, "grad_norm": 0.93359375, "learning_rate": 3.6696230598669624e-05, "loss": 1.1430253982543945, "step": 518 }, { "epoch": 0.2961061189559264, "grad_norm": 1.03125, "learning_rate": 3.6668514412416854e-05, "loss": 1.2123762369155884, "step": 519 }, { "epoch": 0.2966766509770361, "grad_norm": 1.0703125, "learning_rate": 3.664079822616408e-05, "loss": 1.2201260328292847, "step": 520 }, { "epoch": 0.2972471829981458, "grad_norm": 1.0546875, "learning_rate": 3.661308203991131e-05, "loss": 1.1812068223953247, "step": 521 }, { "epoch": 0.29781771501925547, "grad_norm": 1.078125, "learning_rate": 3.6585365853658535e-05, "loss": 1.2447538375854492, "step": 522 }, { "epoch": 0.29838824704036515, "grad_norm": 1.0625, "learning_rate": 3.655764966740577e-05, "loss": 1.2636268138885498, "step": 523 }, { "epoch": 0.29895877906147483, "grad_norm": 1.0546875, "learning_rate": 3.6529933481152994e-05, "loss": 1.2320729494094849, "step": 524 }, { "epoch": 0.2995293110825845, "grad_norm": 1.0859375, "learning_rate": 3.6502217294900224e-05, "loss": 1.2655476331710815, "step": 525 }, { "epoch": 0.3000998431036942, "grad_norm": 1.0390625, "learning_rate": 3.647450110864745e-05, "loss": 1.2109198570251465, "step": 526 }, { "epoch": 0.3006703751248039, "grad_norm": 1.046875, "learning_rate": 3.6446784922394676e-05, "loss": 1.2380175590515137, "step": 527 }, { "epoch": 0.30124090714591356, "grad_norm": 1.0234375, "learning_rate": 3.641906873614191e-05, "loss": 1.2023993730545044, "step": 528 }, { "epoch": 0.30181143916702324, "grad_norm": 1.0234375, "learning_rate": 3.6391352549889135e-05, "loss": 1.239518404006958, "step": 529 }, { "epoch": 0.3023819711881329, "grad_norm": 0.9921875, "learning_rate": 3.6363636363636364e-05, "loss": 1.2405352592468262, "step": 530 }, { "epoch": 0.3029525032092426, "grad_norm": 1.0390625, "learning_rate": 3.6335920177383594e-05, "loss": 1.269554853439331, "step": 531 }, { "epoch": 0.3035230352303523, "grad_norm": 1.0078125, "learning_rate": 3.630820399113082e-05, "loss": 1.256522297859192, "step": 532 }, { "epoch": 0.30409356725146197, "grad_norm": 0.99609375, "learning_rate": 3.628048780487805e-05, "loss": 1.2245392799377441, "step": 533 }, { "epoch": 0.30466409927257165, "grad_norm": 0.99609375, "learning_rate": 3.625277161862528e-05, "loss": 1.2256156206130981, "step": 534 }, { "epoch": 0.30523463129368134, "grad_norm": 1.078125, "learning_rate": 3.6225055432372505e-05, "loss": 1.2551851272583008, "step": 535 }, { "epoch": 0.305805163314791, "grad_norm": 1.0234375, "learning_rate": 3.619733924611974e-05, "loss": 1.1682400703430176, "step": 536 }, { "epoch": 0.30637569533590076, "grad_norm": 1.0703125, "learning_rate": 3.6169623059866964e-05, "loss": 1.2278921604156494, "step": 537 }, { "epoch": 0.30694622735701044, "grad_norm": 1.03125, "learning_rate": 3.6141906873614186e-05, "loss": 1.2167140245437622, "step": 538 }, { "epoch": 0.3075167593781201, "grad_norm": 0.99609375, "learning_rate": 3.611419068736142e-05, "loss": 1.2471628189086914, "step": 539 }, { "epoch": 0.3080872913992298, "grad_norm": 1.0546875, "learning_rate": 3.6086474501108645e-05, "loss": 1.2300347089767456, "step": 540 }, { "epoch": 0.3086578234203395, "grad_norm": 1.015625, "learning_rate": 3.605875831485588e-05, "loss": 1.1582870483398438, "step": 541 }, { "epoch": 0.30922835544144917, "grad_norm": 1.03125, "learning_rate": 3.6031042128603104e-05, "loss": 1.2606914043426514, "step": 542 }, { "epoch": 0.30979888746255885, "grad_norm": 1.0, "learning_rate": 3.6003325942350334e-05, "loss": 1.2054803371429443, "step": 543 }, { "epoch": 0.31036941948366853, "grad_norm": 1.0078125, "learning_rate": 3.597560975609756e-05, "loss": 1.1797690391540527, "step": 544 }, { "epoch": 0.3109399515047782, "grad_norm": 1.03125, "learning_rate": 3.594789356984479e-05, "loss": 1.1780451536178589, "step": 545 }, { "epoch": 0.3115104835258879, "grad_norm": 1.0546875, "learning_rate": 3.5920177383592015e-05, "loss": 1.2812529802322388, "step": 546 }, { "epoch": 0.3120810155469976, "grad_norm": 1.0625, "learning_rate": 3.589246119733925e-05, "loss": 1.3007402420043945, "step": 547 }, { "epoch": 0.31265154756810726, "grad_norm": 0.9921875, "learning_rate": 3.5864745011086474e-05, "loss": 1.1987743377685547, "step": 548 }, { "epoch": 0.31322207958921694, "grad_norm": 1.078125, "learning_rate": 3.583702882483371e-05, "loss": 1.2217564582824707, "step": 549 }, { "epoch": 0.3137926116103266, "grad_norm": 0.99609375, "learning_rate": 3.580931263858093e-05, "loss": 1.211827039718628, "step": 550 }, { "epoch": 0.3143631436314363, "grad_norm": 1.03125, "learning_rate": 3.5781596452328156e-05, "loss": 1.2164710760116577, "step": 551 }, { "epoch": 0.314933675652546, "grad_norm": 0.98828125, "learning_rate": 3.575388026607539e-05, "loss": 1.2393014430999756, "step": 552 }, { "epoch": 0.31550420767365567, "grad_norm": 0.96875, "learning_rate": 3.5726164079822615e-05, "loss": 1.1759617328643799, "step": 553 }, { "epoch": 0.31607473969476535, "grad_norm": 1.015625, "learning_rate": 3.5698447893569844e-05, "loss": 1.2184211015701294, "step": 554 }, { "epoch": 0.31664527171587503, "grad_norm": 1.046875, "learning_rate": 3.5670731707317074e-05, "loss": 1.248216152191162, "step": 555 }, { "epoch": 0.3172158037369847, "grad_norm": 1.0078125, "learning_rate": 3.56430155210643e-05, "loss": 1.2055684328079224, "step": 556 }, { "epoch": 0.3177863357580944, "grad_norm": 1.0546875, "learning_rate": 3.561529933481153e-05, "loss": 1.19916832447052, "step": 557 }, { "epoch": 0.31835686777920413, "grad_norm": 0.95703125, "learning_rate": 3.558758314855876e-05, "loss": 1.151750087738037, "step": 558 }, { "epoch": 0.3189273998003138, "grad_norm": 0.99609375, "learning_rate": 3.5559866962305985e-05, "loss": 1.254964828491211, "step": 559 }, { "epoch": 0.3194979318214235, "grad_norm": 1.0546875, "learning_rate": 3.553215077605322e-05, "loss": 1.251706600189209, "step": 560 }, { "epoch": 0.3200684638425332, "grad_norm": 1.2265625, "learning_rate": 3.5504434589800444e-05, "loss": 1.1918596029281616, "step": 561 }, { "epoch": 0.32063899586364286, "grad_norm": 1.0625, "learning_rate": 3.547671840354767e-05, "loss": 1.2538777589797974, "step": 562 }, { "epoch": 0.32120952788475254, "grad_norm": 1.078125, "learning_rate": 3.54490022172949e-05, "loss": 1.227068543434143, "step": 563 }, { "epoch": 0.3217800599058622, "grad_norm": 1.015625, "learning_rate": 3.5421286031042125e-05, "loss": 1.1811244487762451, "step": 564 }, { "epoch": 0.3223505919269719, "grad_norm": 1.015625, "learning_rate": 3.539356984478936e-05, "loss": 1.162517786026001, "step": 565 }, { "epoch": 0.3229211239480816, "grad_norm": 0.9921875, "learning_rate": 3.5365853658536584e-05, "loss": 1.1981290578842163, "step": 566 }, { "epoch": 0.32349165596919127, "grad_norm": 0.9765625, "learning_rate": 3.5338137472283814e-05, "loss": 1.1930001974105835, "step": 567 }, { "epoch": 0.32406218799030095, "grad_norm": 1.0546875, "learning_rate": 3.531042128603104e-05, "loss": 1.2397738695144653, "step": 568 }, { "epoch": 0.32463272001141064, "grad_norm": 1.078125, "learning_rate": 3.528270509977827e-05, "loss": 1.273198127746582, "step": 569 }, { "epoch": 0.3252032520325203, "grad_norm": 0.96875, "learning_rate": 3.52549889135255e-05, "loss": 1.1873741149902344, "step": 570 }, { "epoch": 0.32577378405363, "grad_norm": 0.9921875, "learning_rate": 3.522727272727273e-05, "loss": 1.2132840156555176, "step": 571 }, { "epoch": 0.3263443160747397, "grad_norm": 0.984375, "learning_rate": 3.5199556541019954e-05, "loss": 1.1881725788116455, "step": 572 }, { "epoch": 0.32691484809584936, "grad_norm": 1.0546875, "learning_rate": 3.517184035476719e-05, "loss": 1.2296414375305176, "step": 573 }, { "epoch": 0.32748538011695905, "grad_norm": 0.98828125, "learning_rate": 3.514412416851441e-05, "loss": 1.2116769552230835, "step": 574 }, { "epoch": 0.32805591213806873, "grad_norm": 1.0, "learning_rate": 3.511640798226164e-05, "loss": 1.194542646408081, "step": 575 }, { "epoch": 0.3286264441591784, "grad_norm": 1.03125, "learning_rate": 3.508869179600887e-05, "loss": 1.2189078330993652, "step": 576 }, { "epoch": 0.3291969761802881, "grad_norm": 0.96484375, "learning_rate": 3.5060975609756095e-05, "loss": 1.1380560398101807, "step": 577 }, { "epoch": 0.32976750820139783, "grad_norm": 1.015625, "learning_rate": 3.503325942350333e-05, "loss": 1.1995842456817627, "step": 578 }, { "epoch": 0.3303380402225075, "grad_norm": 1.0625, "learning_rate": 3.5005543237250554e-05, "loss": 1.254304051399231, "step": 579 }, { "epoch": 0.3309085722436172, "grad_norm": 1.0, "learning_rate": 3.497782705099778e-05, "loss": 1.201616883277893, "step": 580 }, { "epoch": 0.3314791042647269, "grad_norm": 0.98828125, "learning_rate": 3.495011086474501e-05, "loss": 1.1772336959838867, "step": 581 }, { "epoch": 0.33204963628583656, "grad_norm": 1.0234375, "learning_rate": 3.492239467849224e-05, "loss": 1.1937668323516846, "step": 582 }, { "epoch": 0.33262016830694624, "grad_norm": 0.9765625, "learning_rate": 3.489467849223947e-05, "loss": 1.186886191368103, "step": 583 }, { "epoch": 0.3331907003280559, "grad_norm": 1.0390625, "learning_rate": 3.48669623059867e-05, "loss": 1.2187786102294922, "step": 584 }, { "epoch": 0.3337612323491656, "grad_norm": 1.0234375, "learning_rate": 3.4839246119733924e-05, "loss": 1.1842401027679443, "step": 585 }, { "epoch": 0.3343317643702753, "grad_norm": 1.0, "learning_rate": 3.481152993348116e-05, "loss": 1.1953545808792114, "step": 586 }, { "epoch": 0.33490229639138497, "grad_norm": 1.0078125, "learning_rate": 3.478381374722838e-05, "loss": 1.1909786462783813, "step": 587 }, { "epoch": 0.33547282841249465, "grad_norm": 0.96484375, "learning_rate": 3.475609756097561e-05, "loss": 1.201062798500061, "step": 588 }, { "epoch": 0.33604336043360433, "grad_norm": 0.9921875, "learning_rate": 3.472838137472284e-05, "loss": 1.2262158393859863, "step": 589 }, { "epoch": 0.336613892454714, "grad_norm": 1.0390625, "learning_rate": 3.4700665188470064e-05, "loss": 1.255564570426941, "step": 590 }, { "epoch": 0.3371844244758237, "grad_norm": 0.984375, "learning_rate": 3.46729490022173e-05, "loss": 1.1916460990905762, "step": 591 }, { "epoch": 0.3377549564969334, "grad_norm": 1.03125, "learning_rate": 3.464523281596452e-05, "loss": 1.1728994846343994, "step": 592 }, { "epoch": 0.33832548851804306, "grad_norm": 1.0234375, "learning_rate": 3.461751662971175e-05, "loss": 1.2145668268203735, "step": 593 }, { "epoch": 0.33889602053915274, "grad_norm": 0.9921875, "learning_rate": 3.458980044345898e-05, "loss": 1.2174324989318848, "step": 594 }, { "epoch": 0.3394665525602624, "grad_norm": 1.0390625, "learning_rate": 3.456208425720621e-05, "loss": 1.1968474388122559, "step": 595 }, { "epoch": 0.3400370845813721, "grad_norm": 1.0078125, "learning_rate": 3.453436807095344e-05, "loss": 1.1793067455291748, "step": 596 }, { "epoch": 0.3406076166024818, "grad_norm": 1.03125, "learning_rate": 3.450665188470067e-05, "loss": 1.2109010219573975, "step": 597 }, { "epoch": 0.3411781486235915, "grad_norm": 1.0546875, "learning_rate": 3.447893569844789e-05, "loss": 1.2412149906158447, "step": 598 }, { "epoch": 0.3417486806447012, "grad_norm": 1.0078125, "learning_rate": 3.445121951219512e-05, "loss": 1.1886482238769531, "step": 599 }, { "epoch": 0.3423192126658109, "grad_norm": 1.015625, "learning_rate": 3.442350332594235e-05, "loss": 1.1711212396621704, "step": 600 }, { "epoch": 0.3428897446869206, "grad_norm": 0.98046875, "learning_rate": 3.4395787139689575e-05, "loss": 1.1890015602111816, "step": 601 }, { "epoch": 0.34346027670803025, "grad_norm": 0.95703125, "learning_rate": 3.436807095343681e-05, "loss": 1.1860285997390747, "step": 602 }, { "epoch": 0.34403080872913994, "grad_norm": 0.98046875, "learning_rate": 3.4340354767184034e-05, "loss": 1.2001878023147583, "step": 603 }, { "epoch": 0.3446013407502496, "grad_norm": 1.0234375, "learning_rate": 3.431263858093127e-05, "loss": 1.1815104484558105, "step": 604 }, { "epoch": 0.3451718727713593, "grad_norm": 1.015625, "learning_rate": 3.428492239467849e-05, "loss": 1.1652307510375977, "step": 605 }, { "epoch": 0.345742404792469, "grad_norm": 1.0234375, "learning_rate": 3.425720620842572e-05, "loss": 1.1888481378555298, "step": 606 }, { "epoch": 0.34631293681357866, "grad_norm": 1.0390625, "learning_rate": 3.422949002217295e-05, "loss": 1.2198981046676636, "step": 607 }, { "epoch": 0.34688346883468835, "grad_norm": 1.03125, "learning_rate": 3.420177383592018e-05, "loss": 1.2088303565979004, "step": 608 }, { "epoch": 0.34745400085579803, "grad_norm": 1.046875, "learning_rate": 3.4174057649667404e-05, "loss": 1.2638548612594604, "step": 609 }, { "epoch": 0.3480245328769077, "grad_norm": 1.0234375, "learning_rate": 3.414634146341464e-05, "loss": 1.2314380407333374, "step": 610 }, { "epoch": 0.3485950648980174, "grad_norm": 1.0, "learning_rate": 3.411862527716186e-05, "loss": 1.1847796440124512, "step": 611 }, { "epoch": 0.3491655969191271, "grad_norm": 1.015625, "learning_rate": 3.409090909090909e-05, "loss": 1.1967138051986694, "step": 612 }, { "epoch": 0.34973612894023676, "grad_norm": 1.0546875, "learning_rate": 3.406319290465632e-05, "loss": 1.1948060989379883, "step": 613 }, { "epoch": 0.35030666096134644, "grad_norm": 1.0078125, "learning_rate": 3.4035476718403544e-05, "loss": 1.248701810836792, "step": 614 }, { "epoch": 0.3508771929824561, "grad_norm": 0.98046875, "learning_rate": 3.400776053215078e-05, "loss": 1.2076679468154907, "step": 615 }, { "epoch": 0.3514477250035658, "grad_norm": 1.0234375, "learning_rate": 3.3980044345898e-05, "loss": 1.20987868309021, "step": 616 }, { "epoch": 0.3520182570246755, "grad_norm": 0.9921875, "learning_rate": 3.395232815964523e-05, "loss": 1.1548939943313599, "step": 617 }, { "epoch": 0.35258878904578517, "grad_norm": 1.03125, "learning_rate": 3.392461197339246e-05, "loss": 1.2160520553588867, "step": 618 }, { "epoch": 0.3531593210668949, "grad_norm": 1.03125, "learning_rate": 3.389689578713969e-05, "loss": 1.2215287685394287, "step": 619 }, { "epoch": 0.3537298530880046, "grad_norm": 1.015625, "learning_rate": 3.386917960088692e-05, "loss": 1.2433137893676758, "step": 620 }, { "epoch": 0.35430038510911427, "grad_norm": 1.015625, "learning_rate": 3.384146341463415e-05, "loss": 1.2307751178741455, "step": 621 }, { "epoch": 0.35487091713022395, "grad_norm": 0.99609375, "learning_rate": 3.381374722838137e-05, "loss": 1.1872355937957764, "step": 622 }, { "epoch": 0.35544144915133363, "grad_norm": 1.03125, "learning_rate": 3.378603104212861e-05, "loss": 1.200265645980835, "step": 623 }, { "epoch": 0.3560119811724433, "grad_norm": 1.0546875, "learning_rate": 3.375831485587583e-05, "loss": 1.3020355701446533, "step": 624 }, { "epoch": 0.356582513193553, "grad_norm": 1.03125, "learning_rate": 3.373059866962306e-05, "loss": 1.1976819038391113, "step": 625 }, { "epoch": 0.3571530452146627, "grad_norm": 1.0234375, "learning_rate": 3.370288248337029e-05, "loss": 1.1945629119873047, "step": 626 }, { "epoch": 0.35772357723577236, "grad_norm": 0.9921875, "learning_rate": 3.3675166297117514e-05, "loss": 1.2189013957977295, "step": 627 }, { "epoch": 0.35829410925688204, "grad_norm": 0.9765625, "learning_rate": 3.364745011086475e-05, "loss": 1.2139533758163452, "step": 628 }, { "epoch": 0.3588646412779917, "grad_norm": 0.96484375, "learning_rate": 3.361973392461197e-05, "loss": 1.1832334995269775, "step": 629 }, { "epoch": 0.3594351732991014, "grad_norm": 1.0078125, "learning_rate": 3.35920177383592e-05, "loss": 1.1789777278900146, "step": 630 }, { "epoch": 0.3600057053202111, "grad_norm": 0.97265625, "learning_rate": 3.356430155210643e-05, "loss": 1.1401221752166748, "step": 631 }, { "epoch": 0.36057623734132077, "grad_norm": 1.015625, "learning_rate": 3.353658536585366e-05, "loss": 1.2332661151885986, "step": 632 }, { "epoch": 0.36114676936243045, "grad_norm": 0.9609375, "learning_rate": 3.350886917960089e-05, "loss": 1.1867516040802002, "step": 633 }, { "epoch": 0.36171730138354014, "grad_norm": 1.0078125, "learning_rate": 3.348115299334812e-05, "loss": 1.2486271858215332, "step": 634 }, { "epoch": 0.3622878334046498, "grad_norm": 1.0390625, "learning_rate": 3.345343680709534e-05, "loss": 1.1644282341003418, "step": 635 }, { "epoch": 0.3628583654257595, "grad_norm": 0.98828125, "learning_rate": 3.342572062084257e-05, "loss": 1.1926931142807007, "step": 636 }, { "epoch": 0.3634288974468692, "grad_norm": 0.98046875, "learning_rate": 3.33980044345898e-05, "loss": 1.2337167263031006, "step": 637 }, { "epoch": 0.36399942946797886, "grad_norm": 1.0078125, "learning_rate": 3.337028824833703e-05, "loss": 1.2726258039474487, "step": 638 }, { "epoch": 0.3645699614890886, "grad_norm": 1.046875, "learning_rate": 3.334257206208426e-05, "loss": 1.229848861694336, "step": 639 }, { "epoch": 0.3651404935101983, "grad_norm": 0.94921875, "learning_rate": 3.3314855875831483e-05, "loss": 1.1424199342727661, "step": 640 }, { "epoch": 0.36571102553130796, "grad_norm": 1.0234375, "learning_rate": 3.328713968957872e-05, "loss": 1.2158143520355225, "step": 641 }, { "epoch": 0.36628155755241765, "grad_norm": 1.0234375, "learning_rate": 3.325942350332594e-05, "loss": 1.213433027267456, "step": 642 }, { "epoch": 0.36685208957352733, "grad_norm": 1.0234375, "learning_rate": 3.323170731707317e-05, "loss": 1.1552369594573975, "step": 643 }, { "epoch": 0.367422621594637, "grad_norm": 0.9765625, "learning_rate": 3.32039911308204e-05, "loss": 1.1470410823822021, "step": 644 }, { "epoch": 0.3679931536157467, "grad_norm": 1.0234375, "learning_rate": 3.317627494456763e-05, "loss": 1.227137804031372, "step": 645 }, { "epoch": 0.3685636856368564, "grad_norm": 0.98046875, "learning_rate": 3.314855875831486e-05, "loss": 1.1736478805541992, "step": 646 }, { "epoch": 0.36913421765796606, "grad_norm": 1.0078125, "learning_rate": 3.312084257206209e-05, "loss": 1.2192144393920898, "step": 647 }, { "epoch": 0.36970474967907574, "grad_norm": 0.97265625, "learning_rate": 3.309312638580931e-05, "loss": 1.1780518293380737, "step": 648 }, { "epoch": 0.3702752817001854, "grad_norm": 1.015625, "learning_rate": 3.306541019955654e-05, "loss": 1.2205878496170044, "step": 649 }, { "epoch": 0.3708458137212951, "grad_norm": 0.99609375, "learning_rate": 3.303769401330377e-05, "loss": 1.2226086854934692, "step": 650 }, { "epoch": 0.3714163457424048, "grad_norm": 1.0078125, "learning_rate": 3.3009977827051e-05, "loss": 1.1905219554901123, "step": 651 }, { "epoch": 0.37198687776351447, "grad_norm": 1.015625, "learning_rate": 3.298226164079823e-05, "loss": 1.1790423393249512, "step": 652 }, { "epoch": 0.37255740978462415, "grad_norm": 0.9609375, "learning_rate": 3.295454545454545e-05, "loss": 1.1909444332122803, "step": 653 }, { "epoch": 0.37312794180573383, "grad_norm": 0.98828125, "learning_rate": 3.292682926829269e-05, "loss": 1.2416154146194458, "step": 654 }, { "epoch": 0.3736984738268435, "grad_norm": 1.0234375, "learning_rate": 3.289911308203991e-05, "loss": 1.2464513778686523, "step": 655 }, { "epoch": 0.3742690058479532, "grad_norm": 1.015625, "learning_rate": 3.287139689578714e-05, "loss": 1.239952802658081, "step": 656 }, { "epoch": 0.3748395378690629, "grad_norm": 0.99609375, "learning_rate": 3.284368070953437e-05, "loss": 1.2005925178527832, "step": 657 }, { "epoch": 0.37541006989017256, "grad_norm": 1.0078125, "learning_rate": 3.28159645232816e-05, "loss": 1.2646636962890625, "step": 658 }, { "epoch": 0.3759806019112823, "grad_norm": 0.98828125, "learning_rate": 3.278824833702882e-05, "loss": 1.203331470489502, "step": 659 }, { "epoch": 0.376551133932392, "grad_norm": 0.97265625, "learning_rate": 3.276053215077605e-05, "loss": 1.1849339008331299, "step": 660 }, { "epoch": 0.37712166595350166, "grad_norm": 1.0078125, "learning_rate": 3.273281596452328e-05, "loss": 1.2010148763656616, "step": 661 }, { "epoch": 0.37769219797461134, "grad_norm": 0.9921875, "learning_rate": 3.270509977827051e-05, "loss": 1.2384660243988037, "step": 662 }, { "epoch": 0.378262729995721, "grad_norm": 0.984375, "learning_rate": 3.267738359201774e-05, "loss": 1.2244110107421875, "step": 663 }, { "epoch": 0.3788332620168307, "grad_norm": 1.015625, "learning_rate": 3.2649667405764963e-05, "loss": 1.2706053256988525, "step": 664 }, { "epoch": 0.3794037940379404, "grad_norm": 0.98828125, "learning_rate": 3.26219512195122e-05, "loss": 1.2451549768447876, "step": 665 }, { "epoch": 0.37997432605905007, "grad_norm": 1.015625, "learning_rate": 3.259423503325942e-05, "loss": 1.2653909921646118, "step": 666 }, { "epoch": 0.38054485808015975, "grad_norm": 1.0078125, "learning_rate": 3.256651884700665e-05, "loss": 1.2227097749710083, "step": 667 }, { "epoch": 0.38111539010126944, "grad_norm": 0.98828125, "learning_rate": 3.253880266075388e-05, "loss": 1.2289211750030518, "step": 668 }, { "epoch": 0.3816859221223791, "grad_norm": 1.0078125, "learning_rate": 3.251108647450111e-05, "loss": 1.2068843841552734, "step": 669 }, { "epoch": 0.3822564541434888, "grad_norm": 0.9453125, "learning_rate": 3.248337028824834e-05, "loss": 1.166361689567566, "step": 670 }, { "epoch": 0.3828269861645985, "grad_norm": 0.9609375, "learning_rate": 3.245565410199557e-05, "loss": 1.220710277557373, "step": 671 }, { "epoch": 0.38339751818570816, "grad_norm": 0.984375, "learning_rate": 3.242793791574279e-05, "loss": 1.1663460731506348, "step": 672 }, { "epoch": 0.38396805020681785, "grad_norm": 1.0078125, "learning_rate": 3.240022172949002e-05, "loss": 1.1803617477416992, "step": 673 }, { "epoch": 0.38453858222792753, "grad_norm": 0.9921875, "learning_rate": 3.237250554323725e-05, "loss": 1.1342628002166748, "step": 674 }, { "epoch": 0.3851091142490372, "grad_norm": 0.9921875, "learning_rate": 3.234478935698448e-05, "loss": 1.2325470447540283, "step": 675 }, { "epoch": 0.3856796462701469, "grad_norm": 1.0078125, "learning_rate": 3.231707317073171e-05, "loss": 1.1941877603530884, "step": 676 }, { "epoch": 0.3862501782912566, "grad_norm": 1.0, "learning_rate": 3.228935698447893e-05, "loss": 1.1775301694869995, "step": 677 }, { "epoch": 0.38682071031236626, "grad_norm": 1.0546875, "learning_rate": 3.226164079822617e-05, "loss": 1.248462438583374, "step": 678 }, { "epoch": 0.387391242333476, "grad_norm": 1.03125, "learning_rate": 3.223392461197339e-05, "loss": 1.2440953254699707, "step": 679 }, { "epoch": 0.3879617743545857, "grad_norm": 0.98046875, "learning_rate": 3.220620842572062e-05, "loss": 1.1706881523132324, "step": 680 }, { "epoch": 0.38853230637569536, "grad_norm": 1.0390625, "learning_rate": 3.217849223946785e-05, "loss": 1.227694034576416, "step": 681 }, { "epoch": 0.38910283839680504, "grad_norm": 1.0234375, "learning_rate": 3.215077605321508e-05, "loss": 1.2553303241729736, "step": 682 }, { "epoch": 0.3896733704179147, "grad_norm": 0.94140625, "learning_rate": 3.212305986696231e-05, "loss": 1.1399942636489868, "step": 683 }, { "epoch": 0.3902439024390244, "grad_norm": 0.98828125, "learning_rate": 3.209534368070954e-05, "loss": 1.2082273960113525, "step": 684 }, { "epoch": 0.3908144344601341, "grad_norm": 1.0625, "learning_rate": 3.206762749445676e-05, "loss": 1.2403631210327148, "step": 685 }, { "epoch": 0.39138496648124377, "grad_norm": 1.03125, "learning_rate": 3.203991130820399e-05, "loss": 1.1668493747711182, "step": 686 }, { "epoch": 0.39195549850235345, "grad_norm": 1.0, "learning_rate": 3.201219512195122e-05, "loss": 1.1642647981643677, "step": 687 }, { "epoch": 0.39252603052346313, "grad_norm": 0.94921875, "learning_rate": 3.198447893569845e-05, "loss": 1.169840693473816, "step": 688 }, { "epoch": 0.3930965625445728, "grad_norm": 1.0625, "learning_rate": 3.195676274944568e-05, "loss": 1.1918284893035889, "step": 689 }, { "epoch": 0.3936670945656825, "grad_norm": 1.0546875, "learning_rate": 3.19290465631929e-05, "loss": 1.2486236095428467, "step": 690 }, { "epoch": 0.3942376265867922, "grad_norm": 1.0, "learning_rate": 3.190133037694014e-05, "loss": 1.212164044380188, "step": 691 }, { "epoch": 0.39480815860790186, "grad_norm": 1.0, "learning_rate": 3.187361419068736e-05, "loss": 1.2184773683547974, "step": 692 }, { "epoch": 0.39537869062901154, "grad_norm": 1.0625, "learning_rate": 3.184589800443459e-05, "loss": 1.2665815353393555, "step": 693 }, { "epoch": 0.3959492226501212, "grad_norm": 1.0625, "learning_rate": 3.181818181818182e-05, "loss": 1.1956299543380737, "step": 694 }, { "epoch": 0.3965197546712309, "grad_norm": 0.98828125, "learning_rate": 3.179046563192905e-05, "loss": 1.1868462562561035, "step": 695 }, { "epoch": 0.3970902866923406, "grad_norm": 1.0234375, "learning_rate": 3.176274944567628e-05, "loss": 1.2558304071426392, "step": 696 }, { "epoch": 0.39766081871345027, "grad_norm": 1.0078125, "learning_rate": 3.17350332594235e-05, "loss": 1.2197167873382568, "step": 697 }, { "epoch": 0.39823135073455995, "grad_norm": 1.0390625, "learning_rate": 3.170731707317073e-05, "loss": 1.2546510696411133, "step": 698 }, { "epoch": 0.39880188275566963, "grad_norm": 1.078125, "learning_rate": 3.167960088691796e-05, "loss": 1.2634811401367188, "step": 699 }, { "epoch": 0.39937241477677937, "grad_norm": 0.953125, "learning_rate": 3.165188470066519e-05, "loss": 1.1409438848495483, "step": 700 }, { "epoch": 0.39994294679788905, "grad_norm": 1.0, "learning_rate": 3.162416851441242e-05, "loss": 1.167540431022644, "step": 701 }, { "epoch": 0.40051347881899874, "grad_norm": 0.98828125, "learning_rate": 3.159645232815965e-05, "loss": 1.2233819961547852, "step": 702 }, { "epoch": 0.4010840108401084, "grad_norm": 1.0625, "learning_rate": 3.156873614190687e-05, "loss": 1.2183570861816406, "step": 703 }, { "epoch": 0.4016545428612181, "grad_norm": 1.0234375, "learning_rate": 3.154101995565411e-05, "loss": 1.2039064168930054, "step": 704 }, { "epoch": 0.4022250748823278, "grad_norm": 1.0234375, "learning_rate": 3.151330376940133e-05, "loss": 1.2583222389221191, "step": 705 }, { "epoch": 0.40279560690343746, "grad_norm": 0.9765625, "learning_rate": 3.148558758314856e-05, "loss": 1.2133885622024536, "step": 706 }, { "epoch": 0.40336613892454715, "grad_norm": 0.99609375, "learning_rate": 3.145787139689579e-05, "loss": 1.2497689723968506, "step": 707 }, { "epoch": 0.40393667094565683, "grad_norm": 1.015625, "learning_rate": 3.143015521064302e-05, "loss": 1.1765098571777344, "step": 708 }, { "epoch": 0.4045072029667665, "grad_norm": 0.9765625, "learning_rate": 3.140243902439025e-05, "loss": 1.1668319702148438, "step": 709 }, { "epoch": 0.4050777349878762, "grad_norm": 1.0, "learning_rate": 3.137472283813747e-05, "loss": 1.1545255184173584, "step": 710 }, { "epoch": 0.4056482670089859, "grad_norm": 1.0, "learning_rate": 3.13470066518847e-05, "loss": 1.2044893503189087, "step": 711 }, { "epoch": 0.40621879903009556, "grad_norm": 0.99609375, "learning_rate": 3.131929046563193e-05, "loss": 1.2121517658233643, "step": 712 }, { "epoch": 0.40678933105120524, "grad_norm": 1.0390625, "learning_rate": 3.129157427937916e-05, "loss": 1.276052713394165, "step": 713 }, { "epoch": 0.4073598630723149, "grad_norm": 1.015625, "learning_rate": 3.126385809312638e-05, "loss": 1.1800833940505981, "step": 714 }, { "epoch": 0.4079303950934246, "grad_norm": 0.984375, "learning_rate": 3.123614190687362e-05, "loss": 1.1513339281082153, "step": 715 }, { "epoch": 0.4085009271145343, "grad_norm": 0.98046875, "learning_rate": 3.120842572062084e-05, "loss": 1.2298616170883179, "step": 716 }, { "epoch": 0.40907145913564397, "grad_norm": 0.9765625, "learning_rate": 3.118070953436808e-05, "loss": 1.1709084510803223, "step": 717 }, { "epoch": 0.40964199115675365, "grad_norm": 0.98828125, "learning_rate": 3.11529933481153e-05, "loss": 1.1676058769226074, "step": 718 }, { "epoch": 0.41021252317786333, "grad_norm": 0.98828125, "learning_rate": 3.112527716186253e-05, "loss": 1.2025721073150635, "step": 719 }, { "epoch": 0.41078305519897307, "grad_norm": 1.0390625, "learning_rate": 3.109756097560976e-05, "loss": 1.2218658924102783, "step": 720 }, { "epoch": 0.41135358722008275, "grad_norm": 0.96875, "learning_rate": 3.106984478935698e-05, "loss": 1.1744896173477173, "step": 721 }, { "epoch": 0.41192411924119243, "grad_norm": 0.94921875, "learning_rate": 3.104212860310421e-05, "loss": 1.1989339590072632, "step": 722 }, { "epoch": 0.4124946512623021, "grad_norm": 0.9765625, "learning_rate": 3.101441241685144e-05, "loss": 1.2189137935638428, "step": 723 }, { "epoch": 0.4130651832834118, "grad_norm": 0.9921875, "learning_rate": 3.098669623059867e-05, "loss": 1.2155076265335083, "step": 724 }, { "epoch": 0.4136357153045215, "grad_norm": 0.9921875, "learning_rate": 3.09589800443459e-05, "loss": 1.1465799808502197, "step": 725 }, { "epoch": 0.41420624732563116, "grad_norm": 0.98828125, "learning_rate": 3.093126385809313e-05, "loss": 1.2145007848739624, "step": 726 }, { "epoch": 0.41477677934674084, "grad_norm": 0.984375, "learning_rate": 3.090354767184035e-05, "loss": 1.2057294845581055, "step": 727 }, { "epoch": 0.4153473113678505, "grad_norm": 0.9921875, "learning_rate": 3.087583148558759e-05, "loss": 1.2041752338409424, "step": 728 }, { "epoch": 0.4159178433889602, "grad_norm": 0.9765625, "learning_rate": 3.084811529933481e-05, "loss": 1.1989641189575195, "step": 729 }, { "epoch": 0.4164883754100699, "grad_norm": 0.9453125, "learning_rate": 3.082039911308204e-05, "loss": 1.188431739807129, "step": 730 }, { "epoch": 0.41705890743117957, "grad_norm": 0.96875, "learning_rate": 3.079268292682927e-05, "loss": 1.1488507986068726, "step": 731 }, { "epoch": 0.41762943945228925, "grad_norm": 1.015625, "learning_rate": 3.07649667405765e-05, "loss": 1.2174850702285767, "step": 732 }, { "epoch": 0.41819997147339893, "grad_norm": 0.96875, "learning_rate": 3.073725055432373e-05, "loss": 1.2141880989074707, "step": 733 }, { "epoch": 0.4187705034945086, "grad_norm": 1.03125, "learning_rate": 3.070953436807095e-05, "loss": 1.2875535488128662, "step": 734 }, { "epoch": 0.4193410355156183, "grad_norm": 0.984375, "learning_rate": 3.068181818181818e-05, "loss": 1.168579339981079, "step": 735 }, { "epoch": 0.419911567536728, "grad_norm": 0.96875, "learning_rate": 3.065410199556541e-05, "loss": 1.1168636083602905, "step": 736 }, { "epoch": 0.42048209955783766, "grad_norm": 0.984375, "learning_rate": 3.062638580931264e-05, "loss": 1.1600708961486816, "step": 737 }, { "epoch": 0.42105263157894735, "grad_norm": 0.9765625, "learning_rate": 3.059866962305987e-05, "loss": 1.1832588911056519, "step": 738 }, { "epoch": 0.42105263157894735, "eval_loss": 1.1941628456115723, "eval_runtime": 80.1253, "eval_samples_per_second": 11.931, "eval_steps_per_second": 2.983, "step": 738 }, { "epoch": 0.421623163600057, "grad_norm": 0.98828125, "learning_rate": 3.05709534368071e-05, "loss": 1.193061351776123, "step": 739 }, { "epoch": 0.42219369562116676, "grad_norm": 1.03125, "learning_rate": 3.054323725055432e-05, "loss": 1.1793735027313232, "step": 740 }, { "epoch": 0.42276422764227645, "grad_norm": 0.95703125, "learning_rate": 3.0515521064301554e-05, "loss": 1.1607141494750977, "step": 741 }, { "epoch": 0.42333475966338613, "grad_norm": 0.99609375, "learning_rate": 3.048780487804878e-05, "loss": 1.1790132522583008, "step": 742 }, { "epoch": 0.4239052916844958, "grad_norm": 0.98046875, "learning_rate": 3.0460088691796013e-05, "loss": 1.155259132385254, "step": 743 }, { "epoch": 0.4244758237056055, "grad_norm": 0.9609375, "learning_rate": 3.043237250554324e-05, "loss": 1.1134623289108276, "step": 744 }, { "epoch": 0.4250463557267152, "grad_norm": 0.93359375, "learning_rate": 3.0404656319290465e-05, "loss": 1.198337435722351, "step": 745 }, { "epoch": 0.42561688774782486, "grad_norm": 0.9921875, "learning_rate": 3.0376940133037695e-05, "loss": 1.1744345426559448, "step": 746 }, { "epoch": 0.42618741976893454, "grad_norm": 0.984375, "learning_rate": 3.034922394678492e-05, "loss": 1.1646068096160889, "step": 747 }, { "epoch": 0.4267579517900442, "grad_norm": 0.984375, "learning_rate": 3.0321507760532154e-05, "loss": 1.1827648878097534, "step": 748 }, { "epoch": 0.4273284838111539, "grad_norm": 0.953125, "learning_rate": 3.029379157427938e-05, "loss": 1.1942888498306274, "step": 749 }, { "epoch": 0.4278990158322636, "grad_norm": 0.9765625, "learning_rate": 3.026607538802661e-05, "loss": 1.1896655559539795, "step": 750 }, { "epoch": 0.42846954785337327, "grad_norm": 0.98046875, "learning_rate": 3.0238359201773835e-05, "loss": 1.197471022605896, "step": 751 }, { "epoch": 0.42904007987448295, "grad_norm": 1.0078125, "learning_rate": 3.021064301552107e-05, "loss": 1.1281297206878662, "step": 752 }, { "epoch": 0.42961061189559263, "grad_norm": 0.99609375, "learning_rate": 3.0182926829268294e-05, "loss": 1.1960434913635254, "step": 753 }, { "epoch": 0.4301811439167023, "grad_norm": 0.95703125, "learning_rate": 3.0155210643015524e-05, "loss": 1.1772822141647339, "step": 754 }, { "epoch": 0.430751675937812, "grad_norm": 0.98046875, "learning_rate": 3.012749445676275e-05, "loss": 1.2077326774597168, "step": 755 }, { "epoch": 0.4313222079589217, "grad_norm": 1.0, "learning_rate": 3.0099778270509983e-05, "loss": 1.216168999671936, "step": 756 }, { "epoch": 0.43189273998003136, "grad_norm": 0.97265625, "learning_rate": 3.007206208425721e-05, "loss": 1.1528898477554321, "step": 757 }, { "epoch": 0.43246327200114104, "grad_norm": 1.0, "learning_rate": 3.0044345898004435e-05, "loss": 1.1724753379821777, "step": 758 }, { "epoch": 0.4330338040222507, "grad_norm": 0.96484375, "learning_rate": 3.0016629711751664e-05, "loss": 1.1700730323791504, "step": 759 }, { "epoch": 0.43360433604336046, "grad_norm": 0.9609375, "learning_rate": 2.998891352549889e-05, "loss": 1.1328129768371582, "step": 760 }, { "epoch": 0.43417486806447014, "grad_norm": 0.9765625, "learning_rate": 2.9961197339246123e-05, "loss": 1.191325306892395, "step": 761 }, { "epoch": 0.4347454000855798, "grad_norm": 0.97265625, "learning_rate": 2.993348115299335e-05, "loss": 1.160369873046875, "step": 762 }, { "epoch": 0.4353159321066895, "grad_norm": 0.96484375, "learning_rate": 2.990576496674058e-05, "loss": 1.196010947227478, "step": 763 }, { "epoch": 0.4358864641277992, "grad_norm": 0.96875, "learning_rate": 2.9878048780487805e-05, "loss": 1.1497125625610352, "step": 764 }, { "epoch": 0.43645699614890887, "grad_norm": 1.0078125, "learning_rate": 2.9850332594235038e-05, "loss": 1.152623176574707, "step": 765 }, { "epoch": 0.43702752817001855, "grad_norm": 1.015625, "learning_rate": 2.9822616407982264e-05, "loss": 1.1713566780090332, "step": 766 }, { "epoch": 0.43759806019112824, "grad_norm": 1.1640625, "learning_rate": 2.9794900221729493e-05, "loss": 1.263333797454834, "step": 767 }, { "epoch": 0.4381685922122379, "grad_norm": 0.96875, "learning_rate": 2.976718403547672e-05, "loss": 1.144421935081482, "step": 768 }, { "epoch": 0.4387391242333476, "grad_norm": 0.953125, "learning_rate": 2.9739467849223952e-05, "loss": 1.2290055751800537, "step": 769 }, { "epoch": 0.4393096562544573, "grad_norm": 0.9921875, "learning_rate": 2.971175166297118e-05, "loss": 1.1050488948822021, "step": 770 }, { "epoch": 0.43988018827556696, "grad_norm": 0.9765625, "learning_rate": 2.96840354767184e-05, "loss": 1.2218358516693115, "step": 771 }, { "epoch": 0.44045072029667665, "grad_norm": 0.94921875, "learning_rate": 2.9656319290465634e-05, "loss": 1.1308021545410156, "step": 772 }, { "epoch": 0.4410212523177863, "grad_norm": 0.99609375, "learning_rate": 2.962860310421286e-05, "loss": 1.2299238443374634, "step": 773 }, { "epoch": 0.441591784338896, "grad_norm": 0.98046875, "learning_rate": 2.960088691796009e-05, "loss": 1.1389673948287964, "step": 774 }, { "epoch": 0.4421623163600057, "grad_norm": 1.0078125, "learning_rate": 2.9573170731707316e-05, "loss": 1.2660845518112183, "step": 775 }, { "epoch": 0.4427328483811154, "grad_norm": 0.96484375, "learning_rate": 2.954545454545455e-05, "loss": 1.099113941192627, "step": 776 }, { "epoch": 0.44330338040222506, "grad_norm": 0.953125, "learning_rate": 2.9517738359201774e-05, "loss": 1.2134381532669067, "step": 777 }, { "epoch": 0.44387391242333474, "grad_norm": 0.96875, "learning_rate": 2.9490022172949004e-05, "loss": 1.1754953861236572, "step": 778 }, { "epoch": 0.4444444444444444, "grad_norm": 0.98046875, "learning_rate": 2.946230598669623e-05, "loss": 1.1886742115020752, "step": 779 }, { "epoch": 0.4450149764655541, "grad_norm": 0.953125, "learning_rate": 2.9434589800443463e-05, "loss": 1.192276954650879, "step": 780 }, { "epoch": 0.44558550848666384, "grad_norm": 1.0078125, "learning_rate": 2.940687361419069e-05, "loss": 1.2006890773773193, "step": 781 }, { "epoch": 0.4461560405077735, "grad_norm": 0.98828125, "learning_rate": 2.9379157427937915e-05, "loss": 1.1819924116134644, "step": 782 }, { "epoch": 0.4467265725288832, "grad_norm": 0.9453125, "learning_rate": 2.9351441241685145e-05, "loss": 1.1743961572647095, "step": 783 }, { "epoch": 0.4472971045499929, "grad_norm": 0.98046875, "learning_rate": 2.932372505543237e-05, "loss": 1.2021007537841797, "step": 784 }, { "epoch": 0.44786763657110257, "grad_norm": 1.0, "learning_rate": 2.9296008869179603e-05, "loss": 1.2032489776611328, "step": 785 }, { "epoch": 0.44843816859221225, "grad_norm": 1.015625, "learning_rate": 2.926829268292683e-05, "loss": 1.1912821531295776, "step": 786 }, { "epoch": 0.44900870061332193, "grad_norm": 0.9609375, "learning_rate": 2.924057649667406e-05, "loss": 1.184190034866333, "step": 787 }, { "epoch": 0.4495792326344316, "grad_norm": 1.015625, "learning_rate": 2.9212860310421285e-05, "loss": 1.272563099861145, "step": 788 }, { "epoch": 0.4501497646555413, "grad_norm": 0.98046875, "learning_rate": 2.9185144124168518e-05, "loss": 1.2212070226669312, "step": 789 }, { "epoch": 0.450720296676651, "grad_norm": 1.03125, "learning_rate": 2.9157427937915744e-05, "loss": 1.1937004327774048, "step": 790 }, { "epoch": 0.45129082869776066, "grad_norm": 1.0390625, "learning_rate": 2.9129711751662973e-05, "loss": 1.1712844371795654, "step": 791 }, { "epoch": 0.45186136071887034, "grad_norm": 0.9609375, "learning_rate": 2.91019955654102e-05, "loss": 1.1701891422271729, "step": 792 }, { "epoch": 0.45243189273998, "grad_norm": 1.015625, "learning_rate": 2.9074279379157432e-05, "loss": 1.2575602531433105, "step": 793 }, { "epoch": 0.4530024247610897, "grad_norm": 0.9765625, "learning_rate": 2.904656319290466e-05, "loss": 1.1968649625778198, "step": 794 }, { "epoch": 0.4535729567821994, "grad_norm": 0.97265625, "learning_rate": 2.9018847006651885e-05, "loss": 1.205810546875, "step": 795 }, { "epoch": 0.45414348880330907, "grad_norm": 0.96875, "learning_rate": 2.8991130820399114e-05, "loss": 1.1697238683700562, "step": 796 }, { "epoch": 0.45471402082441875, "grad_norm": 1.0703125, "learning_rate": 2.896341463414634e-05, "loss": 1.27318274974823, "step": 797 }, { "epoch": 0.45528455284552843, "grad_norm": 1.015625, "learning_rate": 2.8935698447893573e-05, "loss": 1.2104084491729736, "step": 798 }, { "epoch": 0.4558550848666381, "grad_norm": 1.0234375, "learning_rate": 2.89079822616408e-05, "loss": 1.2579401731491089, "step": 799 }, { "epoch": 0.4564256168877478, "grad_norm": 0.97265625, "learning_rate": 2.888026607538803e-05, "loss": 1.1750009059906006, "step": 800 }, { "epoch": 0.45699614890885754, "grad_norm": 1.03125, "learning_rate": 2.8852549889135255e-05, "loss": 1.1911466121673584, "step": 801 }, { "epoch": 0.4575666809299672, "grad_norm": 0.96875, "learning_rate": 2.8824833702882487e-05, "loss": 1.0935354232788086, "step": 802 }, { "epoch": 0.4581372129510769, "grad_norm": 0.9453125, "learning_rate": 2.8797117516629713e-05, "loss": 1.1621028184890747, "step": 803 }, { "epoch": 0.4587077449721866, "grad_norm": 0.98828125, "learning_rate": 2.8769401330376943e-05, "loss": 1.1952382326126099, "step": 804 }, { "epoch": 0.45927827699329626, "grad_norm": 0.984375, "learning_rate": 2.874168514412417e-05, "loss": 1.2074031829833984, "step": 805 }, { "epoch": 0.45984880901440595, "grad_norm": 0.94921875, "learning_rate": 2.8713968957871395e-05, "loss": 1.191246509552002, "step": 806 }, { "epoch": 0.46041934103551563, "grad_norm": 0.921875, "learning_rate": 2.8686252771618628e-05, "loss": 1.2298707962036133, "step": 807 }, { "epoch": 0.4609898730566253, "grad_norm": 1.015625, "learning_rate": 2.8658536585365854e-05, "loss": 1.2514528036117554, "step": 808 }, { "epoch": 0.461560405077735, "grad_norm": 0.96484375, "learning_rate": 2.8630820399113084e-05, "loss": 1.2710151672363281, "step": 809 }, { "epoch": 0.4621309370988447, "grad_norm": 0.93359375, "learning_rate": 2.860310421286031e-05, "loss": 1.1337497234344482, "step": 810 }, { "epoch": 0.46270146911995436, "grad_norm": 0.96875, "learning_rate": 2.8575388026607542e-05, "loss": 1.1267883777618408, "step": 811 }, { "epoch": 0.46327200114106404, "grad_norm": 0.9609375, "learning_rate": 2.854767184035477e-05, "loss": 1.1755304336547852, "step": 812 }, { "epoch": 0.4638425331621737, "grad_norm": 0.96875, "learning_rate": 2.8519955654101998e-05, "loss": 1.1366599798202515, "step": 813 }, { "epoch": 0.4644130651832834, "grad_norm": 1.0234375, "learning_rate": 2.8492239467849224e-05, "loss": 1.2038339376449585, "step": 814 }, { "epoch": 0.4649835972043931, "grad_norm": 0.96875, "learning_rate": 2.8464523281596457e-05, "loss": 1.2154085636138916, "step": 815 }, { "epoch": 0.46555412922550277, "grad_norm": 1.0078125, "learning_rate": 2.8436807095343683e-05, "loss": 1.1818276643753052, "step": 816 }, { "epoch": 0.46612466124661245, "grad_norm": 1.0078125, "learning_rate": 2.8409090909090912e-05, "loss": 1.2436468601226807, "step": 817 }, { "epoch": 0.46669519326772213, "grad_norm": 0.953125, "learning_rate": 2.838137472283814e-05, "loss": 1.1363047361373901, "step": 818 }, { "epoch": 0.4672657252888318, "grad_norm": 0.984375, "learning_rate": 2.8353658536585365e-05, "loss": 1.1960558891296387, "step": 819 }, { "epoch": 0.4678362573099415, "grad_norm": 0.96875, "learning_rate": 2.8325942350332597e-05, "loss": 1.171709418296814, "step": 820 }, { "epoch": 0.46840678933105123, "grad_norm": 0.953125, "learning_rate": 2.8298226164079824e-05, "loss": 1.1537501811981201, "step": 821 }, { "epoch": 0.4689773213521609, "grad_norm": 0.9609375, "learning_rate": 2.8270509977827053e-05, "loss": 1.1839423179626465, "step": 822 }, { "epoch": 0.4695478533732706, "grad_norm": 0.98046875, "learning_rate": 2.824279379157428e-05, "loss": 1.1610156297683716, "step": 823 }, { "epoch": 0.4701183853943803, "grad_norm": 0.984375, "learning_rate": 2.8215077605321512e-05, "loss": 1.1708459854125977, "step": 824 }, { "epoch": 0.47068891741548996, "grad_norm": 1.015625, "learning_rate": 2.8187361419068735e-05, "loss": 1.251354455947876, "step": 825 }, { "epoch": 0.47125944943659964, "grad_norm": 0.984375, "learning_rate": 2.8159645232815967e-05, "loss": 1.2049927711486816, "step": 826 }, { "epoch": 0.4718299814577093, "grad_norm": 0.99609375, "learning_rate": 2.8131929046563194e-05, "loss": 1.230988621711731, "step": 827 }, { "epoch": 0.472400513478819, "grad_norm": 0.96484375, "learning_rate": 2.8104212860310426e-05, "loss": 1.1739616394042969, "step": 828 }, { "epoch": 0.4729710454999287, "grad_norm": 0.99609375, "learning_rate": 2.807649667405765e-05, "loss": 1.1999741792678833, "step": 829 }, { "epoch": 0.47354157752103837, "grad_norm": 1.0, "learning_rate": 2.8048780487804882e-05, "loss": 1.2062275409698486, "step": 830 }, { "epoch": 0.47411210954214805, "grad_norm": 1.0078125, "learning_rate": 2.8021064301552108e-05, "loss": 1.1344287395477295, "step": 831 }, { "epoch": 0.47468264156325773, "grad_norm": 0.96484375, "learning_rate": 2.7993348115299334e-05, "loss": 1.2056477069854736, "step": 832 }, { "epoch": 0.4752531735843674, "grad_norm": 1.015625, "learning_rate": 2.7965631929046564e-05, "loss": 1.1727713346481323, "step": 833 }, { "epoch": 0.4758237056054771, "grad_norm": 0.99609375, "learning_rate": 2.793791574279379e-05, "loss": 1.2081948518753052, "step": 834 }, { "epoch": 0.4763942376265868, "grad_norm": 0.98046875, "learning_rate": 2.7910199556541023e-05, "loss": 1.255791187286377, "step": 835 }, { "epoch": 0.47696476964769646, "grad_norm": 0.9921875, "learning_rate": 2.788248337028825e-05, "loss": 1.1889286041259766, "step": 836 }, { "epoch": 0.47753530166880614, "grad_norm": 0.9921875, "learning_rate": 2.7854767184035478e-05, "loss": 1.241337776184082, "step": 837 }, { "epoch": 0.4781058336899158, "grad_norm": 0.98828125, "learning_rate": 2.7827050997782704e-05, "loss": 1.2144089937210083, "step": 838 }, { "epoch": 0.4786763657110255, "grad_norm": 0.95703125, "learning_rate": 2.7799334811529937e-05, "loss": 1.1527715921401978, "step": 839 }, { "epoch": 0.4792468977321352, "grad_norm": 0.96875, "learning_rate": 2.7771618625277163e-05, "loss": 1.181959629058838, "step": 840 }, { "epoch": 0.47981742975324493, "grad_norm": 0.9921875, "learning_rate": 2.7743902439024393e-05, "loss": 1.1999069452285767, "step": 841 }, { "epoch": 0.4803879617743546, "grad_norm": 0.984375, "learning_rate": 2.771618625277162e-05, "loss": 1.2098867893218994, "step": 842 }, { "epoch": 0.4809584937954643, "grad_norm": 0.9765625, "learning_rate": 2.7688470066518845e-05, "loss": 1.1860891580581665, "step": 843 }, { "epoch": 0.481529025816574, "grad_norm": 0.9609375, "learning_rate": 2.7660753880266078e-05, "loss": 1.1108654737472534, "step": 844 }, { "epoch": 0.48209955783768366, "grad_norm": 0.953125, "learning_rate": 2.7633037694013304e-05, "loss": 1.2157371044158936, "step": 845 }, { "epoch": 0.48267008985879334, "grad_norm": 0.96875, "learning_rate": 2.7605321507760533e-05, "loss": 1.2216970920562744, "step": 846 }, { "epoch": 0.483240621879903, "grad_norm": 0.98046875, "learning_rate": 2.757760532150776e-05, "loss": 1.1434253454208374, "step": 847 }, { "epoch": 0.4838111539010127, "grad_norm": 0.99609375, "learning_rate": 2.7549889135254992e-05, "loss": 1.1241540908813477, "step": 848 }, { "epoch": 0.4843816859221224, "grad_norm": 0.9921875, "learning_rate": 2.7522172949002218e-05, "loss": 1.186653971672058, "step": 849 }, { "epoch": 0.48495221794323207, "grad_norm": 1.015625, "learning_rate": 2.7494456762749448e-05, "loss": 1.2525804042816162, "step": 850 }, { "epoch": 0.48552274996434175, "grad_norm": 0.98046875, "learning_rate": 2.7466740576496674e-05, "loss": 1.1987820863723755, "step": 851 }, { "epoch": 0.48609328198545143, "grad_norm": 0.96875, "learning_rate": 2.7439024390243906e-05, "loss": 1.2217812538146973, "step": 852 }, { "epoch": 0.4866638140065611, "grad_norm": 0.97265625, "learning_rate": 2.7411308203991133e-05, "loss": 1.201343297958374, "step": 853 }, { "epoch": 0.4872343460276708, "grad_norm": 0.953125, "learning_rate": 2.7383592017738362e-05, "loss": 1.1668754816055298, "step": 854 }, { "epoch": 0.4878048780487805, "grad_norm": 0.96875, "learning_rate": 2.7355875831485588e-05, "loss": 1.1264851093292236, "step": 855 }, { "epoch": 0.48837541006989016, "grad_norm": 0.9921875, "learning_rate": 2.7328159645232814e-05, "loss": 1.202168345451355, "step": 856 }, { "epoch": 0.48894594209099984, "grad_norm": 1.015625, "learning_rate": 2.7300443458980047e-05, "loss": 1.2231934070587158, "step": 857 }, { "epoch": 0.4895164741121095, "grad_norm": 1.0078125, "learning_rate": 2.7272727272727273e-05, "loss": 1.1511149406433105, "step": 858 }, { "epoch": 0.4900870061332192, "grad_norm": 1.0, "learning_rate": 2.7245011086474503e-05, "loss": 1.1898903846740723, "step": 859 }, { "epoch": 0.4906575381543289, "grad_norm": 1.0, "learning_rate": 2.721729490022173e-05, "loss": 1.1848946809768677, "step": 860 }, { "epoch": 0.49122807017543857, "grad_norm": 0.97265625, "learning_rate": 2.718957871396896e-05, "loss": 1.1898174285888672, "step": 861 }, { "epoch": 0.4917986021965483, "grad_norm": 1.0234375, "learning_rate": 2.7161862527716188e-05, "loss": 1.2187345027923584, "step": 862 }, { "epoch": 0.492369134217658, "grad_norm": 0.9765625, "learning_rate": 2.7134146341463417e-05, "loss": 1.1753157377243042, "step": 863 }, { "epoch": 0.49293966623876767, "grad_norm": 1.0078125, "learning_rate": 2.7106430155210643e-05, "loss": 1.2812843322753906, "step": 864 }, { "epoch": 0.49351019825987735, "grad_norm": 0.9921875, "learning_rate": 2.7078713968957876e-05, "loss": 1.2476832866668701, "step": 865 }, { "epoch": 0.49408073028098703, "grad_norm": 0.95703125, "learning_rate": 2.7050997782705102e-05, "loss": 1.1763570308685303, "step": 866 }, { "epoch": 0.4946512623020967, "grad_norm": 0.9609375, "learning_rate": 2.7023281596452328e-05, "loss": 1.159504771232605, "step": 867 }, { "epoch": 0.4952217943232064, "grad_norm": 0.94140625, "learning_rate": 2.6995565410199558e-05, "loss": 1.2344439029693604, "step": 868 }, { "epoch": 0.4957923263443161, "grad_norm": 0.98046875, "learning_rate": 2.6967849223946784e-05, "loss": 1.2668113708496094, "step": 869 }, { "epoch": 0.49636285836542576, "grad_norm": 0.96875, "learning_rate": 2.6940133037694017e-05, "loss": 1.2388842105865479, "step": 870 }, { "epoch": 0.49693339038653545, "grad_norm": 1.0, "learning_rate": 2.6912416851441243e-05, "loss": 1.197232723236084, "step": 871 }, { "epoch": 0.4975039224076451, "grad_norm": 0.98046875, "learning_rate": 2.6884700665188472e-05, "loss": 1.1960959434509277, "step": 872 }, { "epoch": 0.4980744544287548, "grad_norm": 0.99609375, "learning_rate": 2.6856984478935698e-05, "loss": 1.222888469696045, "step": 873 }, { "epoch": 0.4986449864498645, "grad_norm": 0.98828125, "learning_rate": 2.682926829268293e-05, "loss": 1.239640474319458, "step": 874 }, { "epoch": 0.4992155184709742, "grad_norm": 0.953125, "learning_rate": 2.6801552106430157e-05, "loss": 1.1557681560516357, "step": 875 }, { "epoch": 0.49978605049208386, "grad_norm": 1.0, "learning_rate": 2.6773835920177387e-05, "loss": 1.1697707176208496, "step": 876 }, { "epoch": 0.5003565825131936, "grad_norm": 1.0234375, "learning_rate": 2.6746119733924613e-05, "loss": 1.2065680027008057, "step": 877 }, { "epoch": 0.5009271145343033, "grad_norm": 0.9921875, "learning_rate": 2.6718403547671845e-05, "loss": 1.2194795608520508, "step": 878 }, { "epoch": 0.501497646555413, "grad_norm": 0.9609375, "learning_rate": 2.669068736141907e-05, "loss": 1.1722071170806885, "step": 879 }, { "epoch": 0.5020681785765226, "grad_norm": 0.99609375, "learning_rate": 2.6662971175166294e-05, "loss": 1.1860017776489258, "step": 880 }, { "epoch": 0.5026387105976323, "grad_norm": 0.9921875, "learning_rate": 2.6635254988913527e-05, "loss": 1.173937439918518, "step": 881 }, { "epoch": 0.503209242618742, "grad_norm": 1.0234375, "learning_rate": 2.6607538802660753e-05, "loss": 1.1348332166671753, "step": 882 }, { "epoch": 0.5037797746398517, "grad_norm": 0.97265625, "learning_rate": 2.6579822616407986e-05, "loss": 1.205221176147461, "step": 883 }, { "epoch": 0.5043503066609614, "grad_norm": 0.95703125, "learning_rate": 2.655210643015521e-05, "loss": 1.1510381698608398, "step": 884 }, { "epoch": 0.504920838682071, "grad_norm": 0.9921875, "learning_rate": 2.652439024390244e-05, "loss": 1.194382905960083, "step": 885 }, { "epoch": 0.5054913707031807, "grad_norm": 1.015625, "learning_rate": 2.6496674057649668e-05, "loss": 1.2697436809539795, "step": 886 }, { "epoch": 0.5060619027242904, "grad_norm": 0.9609375, "learning_rate": 2.64689578713969e-05, "loss": 1.1560388803482056, "step": 887 }, { "epoch": 0.5066324347454001, "grad_norm": 0.984375, "learning_rate": 2.6441241685144123e-05, "loss": 1.2498875856399536, "step": 888 }, { "epoch": 0.5072029667665098, "grad_norm": 0.9609375, "learning_rate": 2.6413525498891356e-05, "loss": 1.1706441640853882, "step": 889 }, { "epoch": 0.5077734987876195, "grad_norm": 0.99609375, "learning_rate": 2.6385809312638582e-05, "loss": 1.1960177421569824, "step": 890 }, { "epoch": 0.5083440308087291, "grad_norm": 1.0, "learning_rate": 2.6358093126385815e-05, "loss": 1.1732114553451538, "step": 891 }, { "epoch": 0.5089145628298388, "grad_norm": 0.984375, "learning_rate": 2.6330376940133038e-05, "loss": 1.1812173128128052, "step": 892 }, { "epoch": 0.5094850948509485, "grad_norm": 1.0, "learning_rate": 2.6302660753880264e-05, "loss": 1.243033528327942, "step": 893 }, { "epoch": 0.5100556268720582, "grad_norm": 0.984375, "learning_rate": 2.6274944567627497e-05, "loss": 1.1132174730300903, "step": 894 }, { "epoch": 0.5106261588931679, "grad_norm": 0.9296875, "learning_rate": 2.6247228381374723e-05, "loss": 1.129286289215088, "step": 895 }, { "epoch": 0.5111966909142776, "grad_norm": 1.0078125, "learning_rate": 2.6219512195121952e-05, "loss": 1.1969499588012695, "step": 896 }, { "epoch": 0.5117672229353872, "grad_norm": 0.9921875, "learning_rate": 2.6191796008869178e-05, "loss": 1.1295521259307861, "step": 897 }, { "epoch": 0.5123377549564969, "grad_norm": 1.0390625, "learning_rate": 2.616407982261641e-05, "loss": 1.1657040119171143, "step": 898 }, { "epoch": 0.5129082869776066, "grad_norm": 0.953125, "learning_rate": 2.6136363636363637e-05, "loss": 1.182844638824463, "step": 899 }, { "epoch": 0.5134788189987163, "grad_norm": 0.92578125, "learning_rate": 2.6108647450110867e-05, "loss": 1.11708664894104, "step": 900 }, { "epoch": 0.514049351019826, "grad_norm": 0.953125, "learning_rate": 2.6080931263858093e-05, "loss": 1.1282655000686646, "step": 901 }, { "epoch": 0.5146198830409356, "grad_norm": 0.98046875, "learning_rate": 2.6053215077605326e-05, "loss": 1.1830154657363892, "step": 902 }, { "epoch": 0.5151904150620453, "grad_norm": 0.9765625, "learning_rate": 2.602549889135255e-05, "loss": 1.1873393058776855, "step": 903 }, { "epoch": 0.515760947083155, "grad_norm": 0.953125, "learning_rate": 2.5997782705099778e-05, "loss": 1.1280049085617065, "step": 904 }, { "epoch": 0.5163314791042647, "grad_norm": 0.96875, "learning_rate": 2.5970066518847007e-05, "loss": 1.1866214275360107, "step": 905 }, { "epoch": 0.5169020111253744, "grad_norm": 0.9296875, "learning_rate": 2.5942350332594233e-05, "loss": 1.132464051246643, "step": 906 }, { "epoch": 0.517472543146484, "grad_norm": 0.9921875, "learning_rate": 2.5914634146341466e-05, "loss": 1.2057054042816162, "step": 907 }, { "epoch": 0.5180430751675937, "grad_norm": 0.96875, "learning_rate": 2.5886917960088692e-05, "loss": 1.1725504398345947, "step": 908 }, { "epoch": 0.5186136071887034, "grad_norm": 1.0078125, "learning_rate": 2.585920177383592e-05, "loss": 1.2105215787887573, "step": 909 }, { "epoch": 0.5191841392098131, "grad_norm": 0.9375, "learning_rate": 2.5831485587583148e-05, "loss": 1.126555323600769, "step": 910 }, { "epoch": 0.5197546712309228, "grad_norm": 0.953125, "learning_rate": 2.580376940133038e-05, "loss": 1.117220401763916, "step": 911 }, { "epoch": 0.5203252032520326, "grad_norm": 0.98828125, "learning_rate": 2.5776053215077607e-05, "loss": 1.1578710079193115, "step": 912 }, { "epoch": 0.5208957352731423, "grad_norm": 0.97265625, "learning_rate": 2.5748337028824836e-05, "loss": 1.1631922721862793, "step": 913 }, { "epoch": 0.5214662672942519, "grad_norm": 0.9921875, "learning_rate": 2.5720620842572062e-05, "loss": 1.2013893127441406, "step": 914 }, { "epoch": 0.5220367993153616, "grad_norm": 1.0, "learning_rate": 2.5692904656319295e-05, "loss": 1.159932017326355, "step": 915 }, { "epoch": 0.5226073313364713, "grad_norm": 0.9453125, "learning_rate": 2.566518847006652e-05, "loss": 1.1213711500167847, "step": 916 }, { "epoch": 0.523177863357581, "grad_norm": 1.0, "learning_rate": 2.5637472283813747e-05, "loss": 1.2035624980926514, "step": 917 }, { "epoch": 0.5237483953786907, "grad_norm": 0.921875, "learning_rate": 2.5609756097560977e-05, "loss": 1.100569725036621, "step": 918 }, { "epoch": 0.5243189273998003, "grad_norm": 0.99609375, "learning_rate": 2.5582039911308203e-05, "loss": 1.1802055835723877, "step": 919 }, { "epoch": 0.52488945942091, "grad_norm": 0.9453125, "learning_rate": 2.5554323725055436e-05, "loss": 1.2129563093185425, "step": 920 }, { "epoch": 0.5254599914420197, "grad_norm": 0.984375, "learning_rate": 2.552660753880266e-05, "loss": 1.2040753364562988, "step": 921 }, { "epoch": 0.5260305234631294, "grad_norm": 0.9921875, "learning_rate": 2.549889135254989e-05, "loss": 1.1266067028045654, "step": 922 }, { "epoch": 0.5266010554842391, "grad_norm": 1.0078125, "learning_rate": 2.5471175166297117e-05, "loss": 1.1967592239379883, "step": 923 }, { "epoch": 0.5271715875053488, "grad_norm": 0.97265625, "learning_rate": 2.544345898004435e-05, "loss": 1.1658574342727661, "step": 924 }, { "epoch": 0.5277421195264584, "grad_norm": 0.96875, "learning_rate": 2.5415742793791576e-05, "loss": 1.1974247694015503, "step": 925 }, { "epoch": 0.5283126515475681, "grad_norm": 0.96484375, "learning_rate": 2.5388026607538806e-05, "loss": 1.175785779953003, "step": 926 }, { "epoch": 0.5288831835686778, "grad_norm": 0.98828125, "learning_rate": 2.5360310421286032e-05, "loss": 1.2295399904251099, "step": 927 }, { "epoch": 0.5294537155897875, "grad_norm": 1.015625, "learning_rate": 2.5332594235033258e-05, "loss": 1.1797332763671875, "step": 928 }, { "epoch": 0.5300242476108972, "grad_norm": 0.9375, "learning_rate": 2.530487804878049e-05, "loss": 1.1036921739578247, "step": 929 }, { "epoch": 0.5305947796320069, "grad_norm": 1.0, "learning_rate": 2.5277161862527717e-05, "loss": 1.1661919355392456, "step": 930 }, { "epoch": 0.5311653116531165, "grad_norm": 1.015625, "learning_rate": 2.5249445676274946e-05, "loss": 1.220758318901062, "step": 931 }, { "epoch": 0.5317358436742262, "grad_norm": 1.015625, "learning_rate": 2.5221729490022172e-05, "loss": 1.2072967290878296, "step": 932 }, { "epoch": 0.5323063756953359, "grad_norm": 0.98046875, "learning_rate": 2.5194013303769405e-05, "loss": 1.211767315864563, "step": 933 }, { "epoch": 0.5328769077164456, "grad_norm": 1.0, "learning_rate": 2.516629711751663e-05, "loss": 1.196463942527771, "step": 934 }, { "epoch": 0.5334474397375553, "grad_norm": 0.96484375, "learning_rate": 2.513858093126386e-05, "loss": 1.1342837810516357, "step": 935 }, { "epoch": 0.5340179717586649, "grad_norm": 0.9765625, "learning_rate": 2.5110864745011087e-05, "loss": 1.155871868133545, "step": 936 }, { "epoch": 0.5345885037797746, "grad_norm": 1.0, "learning_rate": 2.508314855875832e-05, "loss": 1.1863211393356323, "step": 937 }, { "epoch": 0.5351590358008843, "grad_norm": 0.96484375, "learning_rate": 2.5055432372505546e-05, "loss": 1.1399109363555908, "step": 938 }, { "epoch": 0.535729567821994, "grad_norm": 0.96875, "learning_rate": 2.5027716186252775e-05, "loss": 1.148442268371582, "step": 939 }, { "epoch": 0.5363000998431037, "grad_norm": 1.0234375, "learning_rate": 2.5e-05, "loss": 1.2298827171325684, "step": 940 }, { "epoch": 0.5368706318642134, "grad_norm": 0.953125, "learning_rate": 2.497228381374723e-05, "loss": 1.1379940509796143, "step": 941 }, { "epoch": 0.537441163885323, "grad_norm": 0.9453125, "learning_rate": 2.4944567627494457e-05, "loss": 1.1394915580749512, "step": 942 }, { "epoch": 0.5380116959064327, "grad_norm": 0.9921875, "learning_rate": 2.4916851441241686e-05, "loss": 1.180498480796814, "step": 943 }, { "epoch": 0.5385822279275424, "grad_norm": 1.0546875, "learning_rate": 2.4889135254988916e-05, "loss": 1.2175443172454834, "step": 944 }, { "epoch": 0.5391527599486521, "grad_norm": 0.98828125, "learning_rate": 2.4861419068736145e-05, "loss": 1.1404181718826294, "step": 945 }, { "epoch": 0.5397232919697618, "grad_norm": 0.9765625, "learning_rate": 2.483370288248337e-05, "loss": 1.1929075717926025, "step": 946 }, { "epoch": 0.5402938239908714, "grad_norm": 0.96484375, "learning_rate": 2.4805986696230597e-05, "loss": 1.1470379829406738, "step": 947 }, { "epoch": 0.5408643560119811, "grad_norm": 1.0, "learning_rate": 2.4778270509977827e-05, "loss": 1.1692397594451904, "step": 948 }, { "epoch": 0.5414348880330908, "grad_norm": 1.0078125, "learning_rate": 2.4750554323725056e-05, "loss": 1.2243307828903198, "step": 949 }, { "epoch": 0.5420054200542005, "grad_norm": 0.99609375, "learning_rate": 2.4722838137472286e-05, "loss": 1.1853331327438354, "step": 950 }, { "epoch": 0.5425759520753102, "grad_norm": 1.015625, "learning_rate": 2.4695121951219512e-05, "loss": 1.2312514781951904, "step": 951 }, { "epoch": 0.5431464840964199, "grad_norm": 0.953125, "learning_rate": 2.466740576496674e-05, "loss": 1.1487960815429688, "step": 952 }, { "epoch": 0.5437170161175297, "grad_norm": 0.96875, "learning_rate": 2.463968957871397e-05, "loss": 1.1434435844421387, "step": 953 }, { "epoch": 0.5442875481386393, "grad_norm": 0.97265625, "learning_rate": 2.46119733924612e-05, "loss": 1.2065646648406982, "step": 954 }, { "epoch": 0.544858080159749, "grad_norm": 0.96875, "learning_rate": 2.4584257206208426e-05, "loss": 1.1631767749786377, "step": 955 }, { "epoch": 0.5454286121808587, "grad_norm": 0.94921875, "learning_rate": 2.4556541019955656e-05, "loss": 1.19287109375, "step": 956 }, { "epoch": 0.5459991442019684, "grad_norm": 0.98828125, "learning_rate": 2.4528824833702885e-05, "loss": 1.183131456375122, "step": 957 }, { "epoch": 0.5465696762230781, "grad_norm": 0.953125, "learning_rate": 2.4501108647450115e-05, "loss": 1.1865886449813843, "step": 958 }, { "epoch": 0.5471402082441877, "grad_norm": 0.98046875, "learning_rate": 2.447339246119734e-05, "loss": 1.1511285305023193, "step": 959 }, { "epoch": 0.5477107402652974, "grad_norm": 0.96484375, "learning_rate": 2.4445676274944567e-05, "loss": 1.1591591835021973, "step": 960 }, { "epoch": 0.5482812722864071, "grad_norm": 0.9375, "learning_rate": 2.4417960088691796e-05, "loss": 1.1885075569152832, "step": 961 }, { "epoch": 0.5488518043075168, "grad_norm": 0.97265625, "learning_rate": 2.4390243902439026e-05, "loss": 1.1785187721252441, "step": 962 }, { "epoch": 0.5494223363286265, "grad_norm": 0.96484375, "learning_rate": 2.4362527716186255e-05, "loss": 1.1689701080322266, "step": 963 }, { "epoch": 0.5499928683497362, "grad_norm": 0.94921875, "learning_rate": 2.433481152993348e-05, "loss": 1.1543480157852173, "step": 964 }, { "epoch": 0.5505634003708458, "grad_norm": 1.0078125, "learning_rate": 2.430709534368071e-05, "loss": 1.196134328842163, "step": 965 }, { "epoch": 0.5511339323919555, "grad_norm": 0.98828125, "learning_rate": 2.427937915742794e-05, "loss": 1.2235426902770996, "step": 966 }, { "epoch": 0.5517044644130652, "grad_norm": 0.96484375, "learning_rate": 2.425166297117517e-05, "loss": 1.2253239154815674, "step": 967 }, { "epoch": 0.5522749964341749, "grad_norm": 0.953125, "learning_rate": 2.4223946784922396e-05, "loss": 1.1899304389953613, "step": 968 }, { "epoch": 0.5528455284552846, "grad_norm": 1.0234375, "learning_rate": 2.4196230598669625e-05, "loss": 1.1620666980743408, "step": 969 }, { "epoch": 0.5534160604763942, "grad_norm": 0.9765625, "learning_rate": 2.4168514412416855e-05, "loss": 1.1896693706512451, "step": 970 }, { "epoch": 0.5539865924975039, "grad_norm": 0.921875, "learning_rate": 2.414079822616408e-05, "loss": 1.1168513298034668, "step": 971 }, { "epoch": 0.5545571245186136, "grad_norm": 0.9453125, "learning_rate": 2.4113082039911307e-05, "loss": 1.1533100605010986, "step": 972 }, { "epoch": 0.5551276565397233, "grad_norm": 0.953125, "learning_rate": 2.4085365853658536e-05, "loss": 1.11790132522583, "step": 973 }, { "epoch": 0.555698188560833, "grad_norm": 0.96875, "learning_rate": 2.4057649667405766e-05, "loss": 1.1832971572875977, "step": 974 }, { "epoch": 0.5562687205819427, "grad_norm": 0.9375, "learning_rate": 2.4029933481152995e-05, "loss": 1.136374592781067, "step": 975 }, { "epoch": 0.5568392526030523, "grad_norm": 0.94140625, "learning_rate": 2.400221729490022e-05, "loss": 1.13529634475708, "step": 976 }, { "epoch": 0.557409784624162, "grad_norm": 0.94140625, "learning_rate": 2.397450110864745e-05, "loss": 1.152282476425171, "step": 977 }, { "epoch": 0.5579803166452717, "grad_norm": 0.9375, "learning_rate": 2.394678492239468e-05, "loss": 1.1445283889770508, "step": 978 }, { "epoch": 0.5585508486663814, "grad_norm": 0.96875, "learning_rate": 2.391906873614191e-05, "loss": 1.1682907342910767, "step": 979 }, { "epoch": 0.5591213806874911, "grad_norm": 0.96484375, "learning_rate": 2.3891352549889136e-05, "loss": 1.2181129455566406, "step": 980 }, { "epoch": 0.5596919127086007, "grad_norm": 0.94140625, "learning_rate": 2.3863636363636365e-05, "loss": 1.1683390140533447, "step": 981 }, { "epoch": 0.5602624447297104, "grad_norm": 0.9609375, "learning_rate": 2.3835920177383595e-05, "loss": 1.1526210308074951, "step": 982 }, { "epoch": 0.5608329767508201, "grad_norm": 0.94140625, "learning_rate": 2.380820399113082e-05, "loss": 1.1839709281921387, "step": 983 }, { "epoch": 0.5614035087719298, "grad_norm": 0.96875, "learning_rate": 2.378048780487805e-05, "loss": 1.171961784362793, "step": 984 }, { "epoch": 0.5619740407930395, "grad_norm": 0.96484375, "learning_rate": 2.3752771618625276e-05, "loss": 1.1404699087142944, "step": 985 }, { "epoch": 0.5625445728141492, "grad_norm": 0.9375, "learning_rate": 2.3725055432372506e-05, "loss": 1.1446641683578491, "step": 986 }, { "epoch": 0.5631151048352588, "grad_norm": 0.9375, "learning_rate": 2.3697339246119735e-05, "loss": 1.1063508987426758, "step": 987 }, { "epoch": 0.5636856368563685, "grad_norm": 0.9921875, "learning_rate": 2.3669623059866965e-05, "loss": 1.1023223400115967, "step": 988 }, { "epoch": 0.5642561688774782, "grad_norm": 0.9453125, "learning_rate": 2.364190687361419e-05, "loss": 1.157923698425293, "step": 989 }, { "epoch": 0.5648267008985879, "grad_norm": 0.9609375, "learning_rate": 2.361419068736142e-05, "loss": 1.1578837633132935, "step": 990 }, { "epoch": 0.5653972329196976, "grad_norm": 0.94140625, "learning_rate": 2.358647450110865e-05, "loss": 1.110813856124878, "step": 991 }, { "epoch": 0.5659677649408072, "grad_norm": 0.91796875, "learning_rate": 2.355875831485588e-05, "loss": 1.1383073329925537, "step": 992 }, { "epoch": 0.566538296961917, "grad_norm": 0.94921875, "learning_rate": 2.3531042128603105e-05, "loss": 1.1709469556808472, "step": 993 }, { "epoch": 0.5671088289830267, "grad_norm": 0.984375, "learning_rate": 2.3503325942350335e-05, "loss": 1.1664437055587769, "step": 994 }, { "epoch": 0.5676793610041364, "grad_norm": 0.953125, "learning_rate": 2.347560975609756e-05, "loss": 1.1766831874847412, "step": 995 }, { "epoch": 0.5682498930252461, "grad_norm": 0.92578125, "learning_rate": 2.344789356984479e-05, "loss": 1.1888954639434814, "step": 996 }, { "epoch": 0.5688204250463558, "grad_norm": 1.0078125, "learning_rate": 2.3420177383592016e-05, "loss": 1.1901835203170776, "step": 997 }, { "epoch": 0.5693909570674655, "grad_norm": 0.9140625, "learning_rate": 2.3392461197339246e-05, "loss": 1.13261079788208, "step": 998 }, { "epoch": 0.5699614890885751, "grad_norm": 0.99609375, "learning_rate": 2.3364745011086475e-05, "loss": 1.2113161087036133, "step": 999 }, { "epoch": 0.5705320211096848, "grad_norm": 0.9609375, "learning_rate": 2.3337028824833705e-05, "loss": 1.1643033027648926, "step": 1000 }, { "epoch": 0.5711025531307945, "grad_norm": 1.0, "learning_rate": 2.330931263858093e-05, "loss": 1.2085559368133545, "step": 1001 }, { "epoch": 0.5716730851519042, "grad_norm": 0.97265625, "learning_rate": 2.328159645232816e-05, "loss": 1.1837122440338135, "step": 1002 }, { "epoch": 0.5722436171730139, "grad_norm": 1.03125, "learning_rate": 2.325388026607539e-05, "loss": 1.2685991525650024, "step": 1003 }, { "epoch": 0.5728141491941235, "grad_norm": 0.95703125, "learning_rate": 2.322616407982262e-05, "loss": 1.1660895347595215, "step": 1004 }, { "epoch": 0.5733846812152332, "grad_norm": 1.0, "learning_rate": 2.3198447893569845e-05, "loss": 1.1840052604675293, "step": 1005 }, { "epoch": 0.5739552132363429, "grad_norm": 0.9296875, "learning_rate": 2.3170731707317075e-05, "loss": 1.1665326356887817, "step": 1006 }, { "epoch": 0.5745257452574526, "grad_norm": 0.99609375, "learning_rate": 2.3143015521064304e-05, "loss": 1.1994144916534424, "step": 1007 }, { "epoch": 0.5750962772785623, "grad_norm": 0.9921875, "learning_rate": 2.311529933481153e-05, "loss": 1.1023156642913818, "step": 1008 }, { "epoch": 0.575666809299672, "grad_norm": 0.9453125, "learning_rate": 2.308758314855876e-05, "loss": 1.2176637649536133, "step": 1009 }, { "epoch": 0.5762373413207816, "grad_norm": 1.0390625, "learning_rate": 2.3059866962305986e-05, "loss": 1.2663724422454834, "step": 1010 }, { "epoch": 0.5768078733418913, "grad_norm": 1.0, "learning_rate": 2.3032150776053215e-05, "loss": 1.1681220531463623, "step": 1011 }, { "epoch": 0.577378405363001, "grad_norm": 1.0078125, "learning_rate": 2.3004434589800445e-05, "loss": 1.221947431564331, "step": 1012 }, { "epoch": 0.5779489373841107, "grad_norm": 0.9921875, "learning_rate": 2.2976718403547674e-05, "loss": 1.1309971809387207, "step": 1013 }, { "epoch": 0.5785194694052204, "grad_norm": 0.98828125, "learning_rate": 2.29490022172949e-05, "loss": 1.1859217882156372, "step": 1014 }, { "epoch": 0.57909000142633, "grad_norm": 1.0, "learning_rate": 2.292128603104213e-05, "loss": 1.1979272365570068, "step": 1015 }, { "epoch": 0.5796605334474397, "grad_norm": 1.0, "learning_rate": 2.289356984478936e-05, "loss": 1.1865754127502441, "step": 1016 }, { "epoch": 0.5802310654685494, "grad_norm": 0.95703125, "learning_rate": 2.286585365853659e-05, "loss": 1.1868486404418945, "step": 1017 }, { "epoch": 0.5808015974896591, "grad_norm": 0.921875, "learning_rate": 2.2838137472283815e-05, "loss": 1.129669427871704, "step": 1018 }, { "epoch": 0.5813721295107688, "grad_norm": 0.9375, "learning_rate": 2.2810421286031044e-05, "loss": 1.1734843254089355, "step": 1019 }, { "epoch": 0.5819426615318785, "grad_norm": 1.0390625, "learning_rate": 2.278270509977827e-05, "loss": 1.2343952655792236, "step": 1020 }, { "epoch": 0.5825131935529881, "grad_norm": 0.96484375, "learning_rate": 2.27549889135255e-05, "loss": 1.21380615234375, "step": 1021 }, { "epoch": 0.5830837255740978, "grad_norm": 0.9296875, "learning_rate": 2.272727272727273e-05, "loss": 1.1312305927276611, "step": 1022 }, { "epoch": 0.5836542575952075, "grad_norm": 0.9609375, "learning_rate": 2.2699556541019955e-05, "loss": 1.1510472297668457, "step": 1023 }, { "epoch": 0.5842247896163172, "grad_norm": 1.0, "learning_rate": 2.2671840354767185e-05, "loss": 1.1997393369674683, "step": 1024 }, { "epoch": 0.5847953216374269, "grad_norm": 0.953125, "learning_rate": 2.2644124168514414e-05, "loss": 1.1844977140426636, "step": 1025 }, { "epoch": 0.5853658536585366, "grad_norm": 0.95703125, "learning_rate": 2.261640798226164e-05, "loss": 1.1642664670944214, "step": 1026 }, { "epoch": 0.5859363856796462, "grad_norm": 0.95703125, "learning_rate": 2.258869179600887e-05, "loss": 1.1929872035980225, "step": 1027 }, { "epoch": 0.5865069177007559, "grad_norm": 0.98046875, "learning_rate": 2.25609756097561e-05, "loss": 1.2264790534973145, "step": 1028 }, { "epoch": 0.5870774497218656, "grad_norm": 0.96484375, "learning_rate": 2.253325942350333e-05, "loss": 1.208320140838623, "step": 1029 }, { "epoch": 0.5876479817429753, "grad_norm": 0.9140625, "learning_rate": 2.2505543237250555e-05, "loss": 1.1017545461654663, "step": 1030 }, { "epoch": 0.588218513764085, "grad_norm": 0.91015625, "learning_rate": 2.2477827050997784e-05, "loss": 1.0866947174072266, "step": 1031 }, { "epoch": 0.5887890457851946, "grad_norm": 0.94140625, "learning_rate": 2.245011086474501e-05, "loss": 1.134414553642273, "step": 1032 }, { "epoch": 0.5893595778063043, "grad_norm": 0.9609375, "learning_rate": 2.242239467849224e-05, "loss": 1.1386680603027344, "step": 1033 }, { "epoch": 0.5899301098274141, "grad_norm": 0.93359375, "learning_rate": 2.239467849223947e-05, "loss": 1.098857045173645, "step": 1034 }, { "epoch": 0.5905006418485238, "grad_norm": 0.98046875, "learning_rate": 2.2366962305986695e-05, "loss": 1.1710071563720703, "step": 1035 }, { "epoch": 0.5910711738696335, "grad_norm": 0.90234375, "learning_rate": 2.2339246119733925e-05, "loss": 1.1196489334106445, "step": 1036 }, { "epoch": 0.5916417058907432, "grad_norm": 0.94140625, "learning_rate": 2.2311529933481154e-05, "loss": 1.132148265838623, "step": 1037 }, { "epoch": 0.5922122379118528, "grad_norm": 0.9453125, "learning_rate": 2.2283813747228384e-05, "loss": 1.1694618463516235, "step": 1038 }, { "epoch": 0.5927827699329625, "grad_norm": 0.94140625, "learning_rate": 2.225609756097561e-05, "loss": 1.141546607017517, "step": 1039 }, { "epoch": 0.5933533019540722, "grad_norm": 1.015625, "learning_rate": 2.222838137472284e-05, "loss": 1.214141607284546, "step": 1040 }, { "epoch": 0.5939238339751819, "grad_norm": 0.9375, "learning_rate": 2.220066518847007e-05, "loss": 1.142057180404663, "step": 1041 }, { "epoch": 0.5944943659962916, "grad_norm": 0.9609375, "learning_rate": 2.2172949002217298e-05, "loss": 1.1707711219787598, "step": 1042 }, { "epoch": 0.5950648980174013, "grad_norm": 0.91796875, "learning_rate": 2.2145232815964524e-05, "loss": 1.164795994758606, "step": 1043 }, { "epoch": 0.5956354300385109, "grad_norm": 0.97265625, "learning_rate": 2.211751662971175e-05, "loss": 1.1659691333770752, "step": 1044 }, { "epoch": 0.5962059620596206, "grad_norm": 0.94921875, "learning_rate": 2.208980044345898e-05, "loss": 1.1294951438903809, "step": 1045 }, { "epoch": 0.5967764940807303, "grad_norm": 0.96875, "learning_rate": 2.206208425720621e-05, "loss": 1.1925092935562134, "step": 1046 }, { "epoch": 0.59734702610184, "grad_norm": 0.93359375, "learning_rate": 2.203436807095344e-05, "loss": 1.1600418090820312, "step": 1047 }, { "epoch": 0.5979175581229497, "grad_norm": 0.98046875, "learning_rate": 2.2006651884700665e-05, "loss": 1.157020092010498, "step": 1048 }, { "epoch": 0.5984880901440593, "grad_norm": 1.0234375, "learning_rate": 2.1978935698447894e-05, "loss": 1.1589795351028442, "step": 1049 }, { "epoch": 0.599058622165169, "grad_norm": 0.9453125, "learning_rate": 2.1951219512195124e-05, "loss": 1.1546876430511475, "step": 1050 }, { "epoch": 0.5996291541862787, "grad_norm": 0.94140625, "learning_rate": 2.1923503325942353e-05, "loss": 1.1549787521362305, "step": 1051 }, { "epoch": 0.6001996862073884, "grad_norm": 0.9921875, "learning_rate": 2.189578713968958e-05, "loss": 1.1518681049346924, "step": 1052 }, { "epoch": 0.6007702182284981, "grad_norm": 0.96484375, "learning_rate": 2.186807095343681e-05, "loss": 1.1609306335449219, "step": 1053 }, { "epoch": 0.6013407502496078, "grad_norm": 0.97265625, "learning_rate": 2.1840354767184038e-05, "loss": 1.1526927947998047, "step": 1054 }, { "epoch": 0.6019112822707174, "grad_norm": 0.98046875, "learning_rate": 2.1812638580931268e-05, "loss": 1.2030518054962158, "step": 1055 }, { "epoch": 0.6024818142918271, "grad_norm": 0.94921875, "learning_rate": 2.178492239467849e-05, "loss": 1.087314248085022, "step": 1056 }, { "epoch": 0.6030523463129368, "grad_norm": 0.93359375, "learning_rate": 2.175720620842572e-05, "loss": 1.120784044265747, "step": 1057 }, { "epoch": 0.6036228783340465, "grad_norm": 0.921875, "learning_rate": 2.172949002217295e-05, "loss": 1.0867156982421875, "step": 1058 }, { "epoch": 0.6041934103551562, "grad_norm": 0.96484375, "learning_rate": 2.170177383592018e-05, "loss": 1.2083582878112793, "step": 1059 }, { "epoch": 0.6047639423762659, "grad_norm": 0.94921875, "learning_rate": 2.1674057649667405e-05, "loss": 1.1944574117660522, "step": 1060 }, { "epoch": 0.6053344743973755, "grad_norm": 0.92578125, "learning_rate": 2.1646341463414634e-05, "loss": 1.118787169456482, "step": 1061 }, { "epoch": 0.6059050064184852, "grad_norm": 0.94921875, "learning_rate": 2.1618625277161864e-05, "loss": 1.1591801643371582, "step": 1062 }, { "epoch": 0.6064755384395949, "grad_norm": 0.95703125, "learning_rate": 2.1590909090909093e-05, "loss": 1.1802964210510254, "step": 1063 }, { "epoch": 0.6070460704607046, "grad_norm": 0.97265625, "learning_rate": 2.156319290465632e-05, "loss": 1.1993342638015747, "step": 1064 }, { "epoch": 0.6076166024818143, "grad_norm": 0.96484375, "learning_rate": 2.153547671840355e-05, "loss": 1.2244541645050049, "step": 1065 }, { "epoch": 0.6081871345029239, "grad_norm": 0.9375, "learning_rate": 2.150776053215078e-05, "loss": 1.1696969270706177, "step": 1066 }, { "epoch": 0.6087576665240336, "grad_norm": 0.9609375, "learning_rate": 2.1480044345898008e-05, "loss": 1.204698085784912, "step": 1067 }, { "epoch": 0.6093281985451433, "grad_norm": 0.96875, "learning_rate": 2.1452328159645234e-05, "loss": 1.167772650718689, "step": 1068 }, { "epoch": 0.609898730566253, "grad_norm": 0.93359375, "learning_rate": 2.142461197339246e-05, "loss": 1.1064563989639282, "step": 1069 }, { "epoch": 0.6104692625873627, "grad_norm": 0.9296875, "learning_rate": 2.139689578713969e-05, "loss": 1.1095709800720215, "step": 1070 }, { "epoch": 0.6110397946084724, "grad_norm": 0.953125, "learning_rate": 2.136917960088692e-05, "loss": 1.1526896953582764, "step": 1071 }, { "epoch": 0.611610326629582, "grad_norm": 0.98828125, "learning_rate": 2.134146341463415e-05, "loss": 1.1842620372772217, "step": 1072 }, { "epoch": 0.6121808586506917, "grad_norm": 0.96484375, "learning_rate": 2.1313747228381374e-05, "loss": 1.1854032278060913, "step": 1073 }, { "epoch": 0.6127513906718015, "grad_norm": 0.94140625, "learning_rate": 2.1286031042128604e-05, "loss": 1.1536649465560913, "step": 1074 }, { "epoch": 0.6133219226929112, "grad_norm": 0.99609375, "learning_rate": 2.1258314855875833e-05, "loss": 1.162165641784668, "step": 1075 }, { "epoch": 0.6138924547140209, "grad_norm": 0.95703125, "learning_rate": 2.1230598669623063e-05, "loss": 1.1589579582214355, "step": 1076 }, { "epoch": 0.6144629867351306, "grad_norm": 0.99609375, "learning_rate": 2.120288248337029e-05, "loss": 1.2380765676498413, "step": 1077 }, { "epoch": 0.6150335187562402, "grad_norm": 0.9921875, "learning_rate": 2.117516629711752e-05, "loss": 1.1789859533309937, "step": 1078 }, { "epoch": 0.6156040507773499, "grad_norm": 0.92578125, "learning_rate": 2.1147450110864748e-05, "loss": 1.1379293203353882, "step": 1079 }, { "epoch": 0.6161745827984596, "grad_norm": 0.984375, "learning_rate": 2.1119733924611977e-05, "loss": 1.176946759223938, "step": 1080 }, { "epoch": 0.6167451148195693, "grad_norm": 0.98046875, "learning_rate": 2.10920177383592e-05, "loss": 1.232793927192688, "step": 1081 }, { "epoch": 0.617315646840679, "grad_norm": 0.94140625, "learning_rate": 2.106430155210643e-05, "loss": 1.1333751678466797, "step": 1082 }, { "epoch": 0.6178861788617886, "grad_norm": 0.98046875, "learning_rate": 2.103658536585366e-05, "loss": 1.1847493648529053, "step": 1083 }, { "epoch": 0.6184567108828983, "grad_norm": 0.98828125, "learning_rate": 2.100886917960089e-05, "loss": 1.1365629434585571, "step": 1084 }, { "epoch": 0.619027242904008, "grad_norm": 0.9609375, "learning_rate": 2.0981152993348114e-05, "loss": 1.1531561613082886, "step": 1085 }, { "epoch": 0.6195977749251177, "grad_norm": 0.9765625, "learning_rate": 2.0953436807095344e-05, "loss": 1.1419352293014526, "step": 1086 }, { "epoch": 0.6201683069462274, "grad_norm": 0.95703125, "learning_rate": 2.0925720620842573e-05, "loss": 1.2071990966796875, "step": 1087 }, { "epoch": 0.6207388389673371, "grad_norm": 1.0078125, "learning_rate": 2.0898004434589803e-05, "loss": 1.146884799003601, "step": 1088 }, { "epoch": 0.6213093709884467, "grad_norm": 1.0, "learning_rate": 2.087028824833703e-05, "loss": 1.1956453323364258, "step": 1089 }, { "epoch": 0.6218799030095564, "grad_norm": 0.97265625, "learning_rate": 2.084257206208426e-05, "loss": 1.182574987411499, "step": 1090 }, { "epoch": 0.6224504350306661, "grad_norm": 0.9765625, "learning_rate": 2.0814855875831488e-05, "loss": 1.1805145740509033, "step": 1091 }, { "epoch": 0.6230209670517758, "grad_norm": 0.96484375, "learning_rate": 2.0787139689578717e-05, "loss": 1.173978567123413, "step": 1092 }, { "epoch": 0.6235914990728855, "grad_norm": 0.9375, "learning_rate": 2.0759423503325943e-05, "loss": 1.1732361316680908, "step": 1093 }, { "epoch": 0.6241620310939952, "grad_norm": 0.94921875, "learning_rate": 2.073170731707317e-05, "loss": 1.1978164911270142, "step": 1094 }, { "epoch": 0.6247325631151048, "grad_norm": 0.96484375, "learning_rate": 2.07039911308204e-05, "loss": 1.161289930343628, "step": 1095 }, { "epoch": 0.6253030951362145, "grad_norm": 0.953125, "learning_rate": 2.067627494456763e-05, "loss": 1.1583458185195923, "step": 1096 }, { "epoch": 0.6258736271573242, "grad_norm": 0.9765625, "learning_rate": 2.0648558758314858e-05, "loss": 1.1835911273956299, "step": 1097 }, { "epoch": 0.6264441591784339, "grad_norm": 0.9921875, "learning_rate": 2.0620842572062084e-05, "loss": 1.1692794561386108, "step": 1098 }, { "epoch": 0.6270146911995436, "grad_norm": 0.97265625, "learning_rate": 2.0593126385809313e-05, "loss": 1.1748257875442505, "step": 1099 }, { "epoch": 0.6275852232206532, "grad_norm": 0.9765625, "learning_rate": 2.0565410199556543e-05, "loss": 1.172876238822937, "step": 1100 }, { "epoch": 0.6281557552417629, "grad_norm": 1.0078125, "learning_rate": 2.0537694013303772e-05, "loss": 1.1829420328140259, "step": 1101 }, { "epoch": 0.6287262872628726, "grad_norm": 0.9375, "learning_rate": 2.0509977827051e-05, "loss": 1.163160800933838, "step": 1102 }, { "epoch": 0.6292968192839823, "grad_norm": 0.96484375, "learning_rate": 2.0482261640798228e-05, "loss": 1.144565463066101, "step": 1103 }, { "epoch": 0.629867351305092, "grad_norm": 0.953125, "learning_rate": 2.0454545454545457e-05, "loss": 1.1199369430541992, "step": 1104 }, { "epoch": 0.6304378833262017, "grad_norm": 0.9765625, "learning_rate": 2.0426829268292683e-05, "loss": 1.1951239109039307, "step": 1105 }, { "epoch": 0.6310084153473113, "grad_norm": 0.96484375, "learning_rate": 2.0399113082039913e-05, "loss": 1.1440958976745605, "step": 1106 }, { "epoch": 0.631578947368421, "grad_norm": 0.94140625, "learning_rate": 2.037139689578714e-05, "loss": 1.1329402923583984, "step": 1107 }, { "epoch": 0.631578947368421, "eval_loss": 1.1687453985214233, "eval_runtime": 80.1565, "eval_samples_per_second": 11.927, "eval_steps_per_second": 2.982, "step": 1107 } ], "logging_steps": 1, "max_steps": 1841, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 369, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.75350724523733e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }