{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.954168967421314, "eval_steps": 500, "global_step": 280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0176697956929873, "grad_norm": 5.875631617147988, "learning_rate": 2.8571428571428573e-06, "loss": 0.8112, "step": 1 }, { "epoch": 0.0353395913859746, "grad_norm": 5.961953835603671, "learning_rate": 5.7142857142857145e-06, "loss": 0.8194, "step": 2 }, { "epoch": 0.0530093870789619, "grad_norm": 5.5057251196001635, "learning_rate": 8.571428571428571e-06, "loss": 0.796, "step": 3 }, { "epoch": 0.0706791827719492, "grad_norm": 2.504760468622455, "learning_rate": 1.1428571428571429e-05, "loss": 0.7065, "step": 4 }, { "epoch": 0.0883489784649365, "grad_norm": 3.925627650334603, "learning_rate": 1.4285714285714287e-05, "loss": 0.7078, "step": 5 }, { "epoch": 0.1060187741579238, "grad_norm": 4.140394334047557, "learning_rate": 1.7142857142857142e-05, "loss": 0.6955, "step": 6 }, { "epoch": 0.1236885698509111, "grad_norm": 4.544574066739854, "learning_rate": 2e-05, "loss": 0.6606, "step": 7 }, { "epoch": 0.1413583655438984, "grad_norm": 2.8858399260590226, "learning_rate": 2.2857142857142858e-05, "loss": 0.6356, "step": 8 }, { "epoch": 0.1590281612368857, "grad_norm": 2.810080725674096, "learning_rate": 2.5714285714285718e-05, "loss": 0.6119, "step": 9 }, { "epoch": 0.176697956929873, "grad_norm": 2.3552991906071616, "learning_rate": 2.8571428571428574e-05, "loss": 0.6021, "step": 10 }, { "epoch": 0.1943677526228603, "grad_norm": 1.2910105034140027, "learning_rate": 3.142857142857143e-05, "loss": 0.5694, "step": 11 }, { "epoch": 0.2120375483158476, "grad_norm": 1.6332855505381296, "learning_rate": 3.4285714285714284e-05, "loss": 0.554, "step": 12 }, { "epoch": 0.22970734400883489, "grad_norm": 1.2595348729805071, "learning_rate": 3.714285714285715e-05, "loss": 0.5432, "step": 13 }, { "epoch": 0.2473771397018222, "grad_norm": 1.3472823656550243, "learning_rate": 4e-05, "loss": 0.536, "step": 14 }, { "epoch": 0.2650469353948095, "grad_norm": 0.9873839365603844, "learning_rate": 4.2857142857142856e-05, "loss": 0.5393, "step": 15 }, { "epoch": 0.2827167310877968, "grad_norm": 1.1377505855165275, "learning_rate": 4.5714285714285716e-05, "loss": 0.5293, "step": 16 }, { "epoch": 0.3003865267807841, "grad_norm": 1.0613233081748863, "learning_rate": 4.857142857142857e-05, "loss": 0.5129, "step": 17 }, { "epoch": 0.3180563224737714, "grad_norm": 1.1184007657565689, "learning_rate": 5.1428571428571436e-05, "loss": 0.5091, "step": 18 }, { "epoch": 0.3357261181667587, "grad_norm": 1.1596525894157543, "learning_rate": 5.4285714285714295e-05, "loss": 0.5141, "step": 19 }, { "epoch": 0.353395913859746, "grad_norm": 1.1056524487969162, "learning_rate": 5.714285714285715e-05, "loss": 0.5065, "step": 20 }, { "epoch": 0.37106570955273327, "grad_norm": 1.2246588779747853, "learning_rate": 6.000000000000001e-05, "loss": 0.5001, "step": 21 }, { "epoch": 0.3887355052457206, "grad_norm": 0.9005160048442259, "learning_rate": 6.285714285714286e-05, "loss": 0.4948, "step": 22 }, { "epoch": 0.4064053009387079, "grad_norm": 1.468175523379714, "learning_rate": 6.571428571428571e-05, "loss": 0.5029, "step": 23 }, { "epoch": 0.4240750966316952, "grad_norm": 0.7100895840287704, "learning_rate": 6.857142857142857e-05, "loss": 0.4901, "step": 24 }, { "epoch": 0.4417448923246825, "grad_norm": 1.4524384718122851, "learning_rate": 7.142857142857143e-05, "loss": 0.5015, "step": 25 }, { "epoch": 0.45941468801766977, "grad_norm": 3.29851781812619, "learning_rate": 7.42857142857143e-05, "loss": 0.4866, "step": 26 }, { "epoch": 0.4770844837106571, "grad_norm": 1.6338876773578732, "learning_rate": 7.714285714285715e-05, "loss": 0.5027, "step": 27 }, { "epoch": 0.4947542794036444, "grad_norm": 1.1095768120544467, "learning_rate": 8e-05, "loss": 0.4881, "step": 28 }, { "epoch": 0.5124240750966317, "grad_norm": 0.9195853861546358, "learning_rate": 7.9996891699239e-05, "loss": 0.4766, "step": 29 }, { "epoch": 0.530093870789619, "grad_norm": 1.893948627853531, "learning_rate": 7.998756728003266e-05, "loss": 0.5003, "step": 30 }, { "epoch": 0.5477636664826063, "grad_norm": 1.3966468358301396, "learning_rate": 7.997202819153595e-05, "loss": 0.4748, "step": 31 }, { "epoch": 0.5654334621755936, "grad_norm": 1.173904127551551, "learning_rate": 7.99502768487569e-05, "loss": 0.4728, "step": 32 }, { "epoch": 0.5831032578685809, "grad_norm": 1.6511214276120525, "learning_rate": 7.992231663218129e-05, "loss": 0.4727, "step": 33 }, { "epoch": 0.6007730535615682, "grad_norm": 0.8130185719483476, "learning_rate": 7.988815188724721e-05, "loss": 0.4696, "step": 34 }, { "epoch": 0.6184428492545555, "grad_norm": 1.2456661967993063, "learning_rate": 7.984778792366983e-05, "loss": 0.4711, "step": 35 }, { "epoch": 0.6361126449475428, "grad_norm": 1.0570393538806437, "learning_rate": 7.980123101461606e-05, "loss": 0.463, "step": 36 }, { "epoch": 0.6537824406405301, "grad_norm": 0.9901050412456828, "learning_rate": 7.974848839572971e-05, "loss": 0.4578, "step": 37 }, { "epoch": 0.6714522363335174, "grad_norm": 0.8705213139045455, "learning_rate": 7.96895682640069e-05, "loss": 0.4645, "step": 38 }, { "epoch": 0.6891220320265047, "grad_norm": 0.7854045575641022, "learning_rate": 7.962447977652211e-05, "loss": 0.4647, "step": 39 }, { "epoch": 0.706791827719492, "grad_norm": 0.8451669852541023, "learning_rate": 7.955323304900514e-05, "loss": 0.4609, "step": 40 }, { "epoch": 0.7244616234124793, "grad_norm": 1.0240484592422439, "learning_rate": 7.947583915426885e-05, "loss": 0.4583, "step": 41 }, { "epoch": 0.7421314191054665, "grad_norm": 0.9325468461529605, "learning_rate": 7.939231012048833e-05, "loss": 0.4599, "step": 42 }, { "epoch": 0.7598012147984539, "grad_norm": 0.8208173672541784, "learning_rate": 7.930265892933154e-05, "loss": 0.4488, "step": 43 }, { "epoch": 0.7774710104914412, "grad_norm": 0.8066370895813, "learning_rate": 7.920689951394175e-05, "loss": 0.4599, "step": 44 }, { "epoch": 0.7951408061844285, "grad_norm": 0.5967750986819955, "learning_rate": 7.91050467567722e-05, "loss": 0.4531, "step": 45 }, { "epoch": 0.8128106018774158, "grad_norm": 0.6644148535772102, "learning_rate": 7.899711648727294e-05, "loss": 0.4503, "step": 46 }, { "epoch": 0.830480397570403, "grad_norm": 0.7344655144114142, "learning_rate": 7.888312547943099e-05, "loss": 0.4509, "step": 47 }, { "epoch": 0.8481501932633904, "grad_norm": 3.462692478412094, "learning_rate": 7.876309144916312e-05, "loss": 0.4933, "step": 48 }, { "epoch": 0.8658199889563777, "grad_norm": 1.0900503309702443, "learning_rate": 7.863703305156273e-05, "loss": 0.4673, "step": 49 }, { "epoch": 0.883489784649365, "grad_norm": 1.372735585333971, "learning_rate": 7.850496987800048e-05, "loss": 0.45, "step": 50 }, { "epoch": 0.9011595803423523, "grad_norm": 0.9943280611059013, "learning_rate": 7.836692245307951e-05, "loss": 0.4619, "step": 51 }, { "epoch": 0.9188293760353395, "grad_norm": 1.1230540437524632, "learning_rate": 7.822291223144564e-05, "loss": 0.4602, "step": 52 }, { "epoch": 0.9364991717283269, "grad_norm": 0.9358182024156444, "learning_rate": 7.80729615944529e-05, "loss": 0.4627, "step": 53 }, { "epoch": 0.9541689674213142, "grad_norm": 0.6965038938335383, "learning_rate": 7.791709384668528e-05, "loss": 0.4377, "step": 54 }, { "epoch": 0.9718387631143015, "grad_norm": 0.8794762258340597, "learning_rate": 7.775533321233471e-05, "loss": 0.4416, "step": 55 }, { "epoch": 0.9895085588072888, "grad_norm": 0.5702192945738279, "learning_rate": 7.758770483143634e-05, "loss": 0.4381, "step": 56 }, { "epoch": 1.0088348978464936, "grad_norm": 1.0158837364149667, "learning_rate": 7.741423475596136e-05, "loss": 0.6602, "step": 57 }, { "epoch": 1.026504693539481, "grad_norm": 0.7661664733655509, "learning_rate": 7.723494994576818e-05, "loss": 0.4224, "step": 58 }, { "epoch": 1.0441744892324683, "grad_norm": 0.6984964000427009, "learning_rate": 7.704987826441235e-05, "loss": 0.423, "step": 59 }, { "epoch": 1.0618442849254555, "grad_norm": 0.543414847614615, "learning_rate": 7.685904847481631e-05, "loss": 0.4214, "step": 60 }, { "epoch": 1.079514080618443, "grad_norm": 0.6586728820833049, "learning_rate": 7.666249023479905e-05, "loss": 0.4232, "step": 61 }, { "epoch": 1.09718387631143, "grad_norm": 0.48664555730901193, "learning_rate": 7.646023409246694e-05, "loss": 0.4184, "step": 62 }, { "epoch": 1.1148536720044175, "grad_norm": 0.45691517952976585, "learning_rate": 7.625231148146601e-05, "loss": 0.4116, "step": 63 }, { "epoch": 1.1325234676974048, "grad_norm": 0.44580349839113315, "learning_rate": 7.603875471609677e-05, "loss": 0.4148, "step": 64 }, { "epoch": 1.150193263390392, "grad_norm": 0.38551843328636426, "learning_rate": 7.581959698629204e-05, "loss": 0.4081, "step": 65 }, { "epoch": 1.1678630590833794, "grad_norm": 0.4359324964542397, "learning_rate": 7.559487235245875e-05, "loss": 0.4151, "step": 66 }, { "epoch": 1.1855328547763666, "grad_norm": 0.41530730049997977, "learning_rate": 7.536461574018439e-05, "loss": 0.4116, "step": 67 }, { "epoch": 1.203202650469354, "grad_norm": 0.38292190540156723, "learning_rate": 7.512886293480914e-05, "loss": 0.4099, "step": 68 }, { "epoch": 1.2208724461623413, "grad_norm": 0.4050268821480764, "learning_rate": 7.488765057586422e-05, "loss": 0.4059, "step": 69 }, { "epoch": 1.2385422418553285, "grad_norm": 0.573303636137278, "learning_rate": 7.464101615137756e-05, "loss": 0.4137, "step": 70 }, { "epoch": 1.256212037548316, "grad_norm": 0.6806327106102632, "learning_rate": 7.438899799204762e-05, "loss": 0.412, "step": 71 }, { "epoch": 1.273881833241303, "grad_norm": 0.5165313145422143, "learning_rate": 7.413163526528623e-05, "loss": 0.4078, "step": 72 }, { "epoch": 1.2915516289342905, "grad_norm": 0.5532520513811989, "learning_rate": 7.386896796913137e-05, "loss": 0.4026, "step": 73 }, { "epoch": 1.3092214246272778, "grad_norm": 0.6170140080440525, "learning_rate": 7.360103692603087e-05, "loss": 0.4025, "step": 74 }, { "epoch": 1.326891220320265, "grad_norm": 0.42873518437379604, "learning_rate": 7.332788377649796e-05, "loss": 0.4052, "step": 75 }, { "epoch": 1.3445610160132524, "grad_norm": 0.449652816843054, "learning_rate": 7.30495509726398e-05, "loss": 0.4089, "step": 76 }, { "epoch": 1.3622308117062396, "grad_norm": 0.4609411200515682, "learning_rate": 7.276608177155968e-05, "loss": 0.409, "step": 77 }, { "epoch": 1.379900607399227, "grad_norm": 0.37783779994964356, "learning_rate": 7.247752022863428e-05, "loss": 0.411, "step": 78 }, { "epoch": 1.3975704030922143, "grad_norm": 0.47167800258784437, "learning_rate": 7.218391119066674e-05, "loss": 0.4006, "step": 79 }, { "epoch": 1.4152401987852015, "grad_norm": 0.5209290267234993, "learning_rate": 7.188530028891691e-05, "loss": 0.3961, "step": 80 }, { "epoch": 1.432909994478189, "grad_norm": 0.35540053272228744, "learning_rate": 7.158173393200942e-05, "loss": 0.3999, "step": 81 }, { "epoch": 1.450579790171176, "grad_norm": 0.2822473247891192, "learning_rate": 7.12732592987212e-05, "loss": 0.4029, "step": 82 }, { "epoch": 1.4682495858641635, "grad_norm": 0.44251535710813034, "learning_rate": 7.09599243306491e-05, "loss": 0.411, "step": 83 }, { "epoch": 1.4859193815571508, "grad_norm": 0.46257597741002365, "learning_rate": 7.064177772475912e-05, "loss": 0.3997, "step": 84 }, { "epoch": 1.503589177250138, "grad_norm": 0.3804354174312414, "learning_rate": 7.031886892581813e-05, "loss": 0.3984, "step": 85 }, { "epoch": 1.5212589729431254, "grad_norm": 0.2667610882166938, "learning_rate": 6.999124811870938e-05, "loss": 0.3986, "step": 86 }, { "epoch": 1.5389287686361126, "grad_norm": 0.30942594899982945, "learning_rate": 6.965896622063307e-05, "loss": 0.4055, "step": 87 }, { "epoch": 1.5565985643291, "grad_norm": 0.3550745842897038, "learning_rate": 6.932207487319305e-05, "loss": 0.408, "step": 88 }, { "epoch": 1.5742683600220873, "grad_norm": 0.3569377139350627, "learning_rate": 6.898062643437091e-05, "loss": 0.3961, "step": 89 }, { "epoch": 1.5919381557150745, "grad_norm": 0.33698677021351336, "learning_rate": 6.863467397038874e-05, "loss": 0.3927, "step": 90 }, { "epoch": 1.609607951408062, "grad_norm": 0.4283132299621792, "learning_rate": 6.828427124746191e-05, "loss": 0.3962, "step": 91 }, { "epoch": 1.627277747101049, "grad_norm": 0.4800585612046361, "learning_rate": 6.792947272344292e-05, "loss": 0.4024, "step": 92 }, { "epoch": 1.6449475427940365, "grad_norm": 0.4106952334802103, "learning_rate": 6.757033353935788e-05, "loss": 0.3983, "step": 93 }, { "epoch": 1.6626173384870238, "grad_norm": 0.44863543828572716, "learning_rate": 6.720690951083678e-05, "loss": 0.3983, "step": 94 }, { "epoch": 1.680287134180011, "grad_norm": 0.5972683621698959, "learning_rate": 6.68392571194388e-05, "loss": 0.3952, "step": 95 }, { "epoch": 1.6979569298729982, "grad_norm": 0.6864704632849609, "learning_rate": 6.646743350387438e-05, "loss": 0.4052, "step": 96 }, { "epoch": 1.7156267255659856, "grad_norm": 0.5997402895023315, "learning_rate": 6.609149645112485e-05, "loss": 0.3977, "step": 97 }, { "epoch": 1.733296521258973, "grad_norm": 0.42336994036744263, "learning_rate": 6.571150438746157e-05, "loss": 0.3985, "step": 98 }, { "epoch": 1.7509663169519603, "grad_norm": 0.416677515975007, "learning_rate": 6.532751636936561e-05, "loss": 0.4043, "step": 99 }, { "epoch": 1.7686361126449475, "grad_norm": 0.5095146036807348, "learning_rate": 6.493959207434934e-05, "loss": 0.3931, "step": 100 }, { "epoch": 1.7863059083379347, "grad_norm": 0.45946367582524933, "learning_rate": 6.45477917916819e-05, "loss": 0.3972, "step": 101 }, { "epoch": 1.8039757040309221, "grad_norm": 0.3713407116922932, "learning_rate": 6.41521764130191e-05, "loss": 0.4044, "step": 102 }, { "epoch": 1.8216454997239095, "grad_norm": 0.40017896468093417, "learning_rate": 6.375280742294007e-05, "loss": 0.398, "step": 103 }, { "epoch": 1.8393152954168968, "grad_norm": 0.5022523013468211, "learning_rate": 6.334974688939161e-05, "loss": 0.3963, "step": 104 }, { "epoch": 1.856985091109884, "grad_norm": 0.3934847620868829, "learning_rate": 6.294305745404185e-05, "loss": 0.3884, "step": 105 }, { "epoch": 1.8746548868028712, "grad_norm": 0.3453314877586019, "learning_rate": 6.253280232254489e-05, "loss": 0.3899, "step": 106 }, { "epoch": 1.8923246824958586, "grad_norm": 0.4479654044472846, "learning_rate": 6.211904525471758e-05, "loss": 0.3938, "step": 107 }, { "epoch": 1.909994478188846, "grad_norm": 0.39352474590110836, "learning_rate": 6.170185055463039e-05, "loss": 0.3915, "step": 108 }, { "epoch": 1.9276642738818333, "grad_norm": 0.2714956264200266, "learning_rate": 6.128128306061347e-05, "loss": 0.3899, "step": 109 }, { "epoch": 1.9453340695748205, "grad_norm": 0.31091747939470954, "learning_rate": 6.0857408135179926e-05, "loss": 0.3893, "step": 110 }, { "epoch": 1.9630038652678077, "grad_norm": 0.3575276621136432, "learning_rate": 6.0430291654867435e-05, "loss": 0.3913, "step": 111 }, { "epoch": 1.9806736609607951, "grad_norm": 0.2508689498362062, "learning_rate": 6.000000000000001e-05, "loss": 0.3972, "step": 112 }, { "epoch": 1.9983434566537825, "grad_norm": 0.5605963723377234, "learning_rate": 5.9566600044371584e-05, "loss": 0.5878, "step": 113 }, { "epoch": 2.017669795692987, "grad_norm": 0.7102289660217768, "learning_rate": 5.913015914485274e-05, "loss": 0.3661, "step": 114 }, { "epoch": 2.0353395913859744, "grad_norm": 0.3944445954033844, "learning_rate": 5.869074513092249e-05, "loss": 0.373, "step": 115 }, { "epoch": 2.053009387078962, "grad_norm": 0.48382052666433606, "learning_rate": 5.824842629412653e-05, "loss": 0.3739, "step": 116 }, { "epoch": 2.0706791827719493, "grad_norm": 0.5806563133800278, "learning_rate": 5.7803271377463695e-05, "loss": 0.3672, "step": 117 }, { "epoch": 2.0883489784649365, "grad_norm": 0.6502257019855456, "learning_rate": 5.735534956470233e-05, "loss": 0.3644, "step": 118 }, { "epoch": 2.1060187741579237, "grad_norm": 0.8472824581196564, "learning_rate": 5.6904730469627985e-05, "loss": 0.3709, "step": 119 }, { "epoch": 2.123688569850911, "grad_norm": 0.6853246906791319, "learning_rate": 5.645148412522447e-05, "loss": 0.3645, "step": 120 }, { "epoch": 2.1413583655438986, "grad_norm": 0.4107921656466385, "learning_rate": 5.5995680972789634e-05, "loss": 0.3662, "step": 121 }, { "epoch": 2.159028161236886, "grad_norm": 0.37871427512746525, "learning_rate": 5.5537391850987795e-05, "loss": 0.3614, "step": 122 }, { "epoch": 2.176697956929873, "grad_norm": 0.5083596297218169, "learning_rate": 5.507668798484021e-05, "loss": 0.3645, "step": 123 }, { "epoch": 2.19436775262286, "grad_norm": 0.4631791629644736, "learning_rate": 5.461364097465581e-05, "loss": 0.3651, "step": 124 }, { "epoch": 2.2120375483158474, "grad_norm": 0.3234371144780778, "learning_rate": 5.414832278490326e-05, "loss": 0.3604, "step": 125 }, { "epoch": 2.229707344008835, "grad_norm": 0.352276909118395, "learning_rate": 5.368080573302676e-05, "loss": 0.3662, "step": 126 }, { "epoch": 2.2473771397018223, "grad_norm": 0.41914234346119467, "learning_rate": 5.321116247820669e-05, "loss": 0.3603, "step": 127 }, { "epoch": 2.2650469353948095, "grad_norm": 0.28573640352352386, "learning_rate": 5.2739466010067385e-05, "loss": 0.3556, "step": 128 }, { "epoch": 2.2827167310877967, "grad_norm": 0.25492365093010394, "learning_rate": 5.226578963733338e-05, "loss": 0.363, "step": 129 }, { "epoch": 2.300386526780784, "grad_norm": 0.2927182277484276, "learning_rate": 5.179020697643618e-05, "loss": 0.3636, "step": 130 }, { "epoch": 2.3180563224737716, "grad_norm": 0.2580774273705453, "learning_rate": 5.13127919400731e-05, "loss": 0.3632, "step": 131 }, { "epoch": 2.335726118166759, "grad_norm": 0.22474459932551596, "learning_rate": 5.0833618725720214e-05, "loss": 0.3614, "step": 132 }, { "epoch": 2.353395913859746, "grad_norm": 0.25586289471702456, "learning_rate": 5.0352761804100835e-05, "loss": 0.36, "step": 133 }, { "epoch": 2.371065709552733, "grad_norm": 0.24042001746342648, "learning_rate": 4.987029590761174e-05, "loss": 0.3667, "step": 134 }, { "epoch": 2.3887355052457204, "grad_norm": 0.24039897426531276, "learning_rate": 4.9386296018708614e-05, "loss": 0.3673, "step": 135 }, { "epoch": 2.406405300938708, "grad_norm": 0.27314874775426934, "learning_rate": 4.890083735825258e-05, "loss": 0.3619, "step": 136 }, { "epoch": 2.4240750966316953, "grad_norm": 0.23246378924197766, "learning_rate": 4.841399537381984e-05, "loss": 0.3623, "step": 137 }, { "epoch": 2.4417448923246825, "grad_norm": 0.191508019981753, "learning_rate": 4.792584572797591e-05, "loss": 0.3633, "step": 138 }, { "epoch": 2.4594146880176697, "grad_norm": 0.22416894070833618, "learning_rate": 4.743646428651659e-05, "loss": 0.3584, "step": 139 }, { "epoch": 2.477084483710657, "grad_norm": 0.20230146319655629, "learning_rate": 4.694592710667723e-05, "loss": 0.3615, "step": 140 }, { "epoch": 2.4947542794036446, "grad_norm": 0.1822985257854695, "learning_rate": 4.645431042531227e-05, "loss": 0.363, "step": 141 }, { "epoch": 2.512424075096632, "grad_norm": 0.20736102263198034, "learning_rate": 4.5961690647046974e-05, "loss": 0.3586, "step": 142 }, { "epoch": 2.530093870789619, "grad_norm": 0.17577697599602832, "learning_rate": 4.546814433240294e-05, "loss": 0.3598, "step": 143 }, { "epoch": 2.547763666482606, "grad_norm": 0.20267553551988982, "learning_rate": 4.4973748185899416e-05, "loss": 0.3595, "step": 144 }, { "epoch": 2.5654334621755934, "grad_norm": 0.16672091257518545, "learning_rate": 4.4478579044132314e-05, "loss": 0.3591, "step": 145 }, { "epoch": 2.583103257868581, "grad_norm": 0.18713978691301508, "learning_rate": 4.398271386383267e-05, "loss": 0.3588, "step": 146 }, { "epoch": 2.6007730535615683, "grad_norm": 0.1544729610927051, "learning_rate": 4.348622970990634e-05, "loss": 0.3535, "step": 147 }, { "epoch": 2.6184428492545555, "grad_norm": 0.16023976528470063, "learning_rate": 4.298920374345698e-05, "loss": 0.3624, "step": 148 }, { "epoch": 2.6361126449475427, "grad_norm": 0.1952298963833661, "learning_rate": 4.249171320979409e-05, "loss": 0.3592, "step": 149 }, { "epoch": 2.65378244064053, "grad_norm": 0.17353667541371376, "learning_rate": 4.199383542642789e-05, "loss": 0.3655, "step": 150 }, { "epoch": 2.6714522363335176, "grad_norm": 0.194516804675962, "learning_rate": 4.149564777105304e-05, "loss": 0.3565, "step": 151 }, { "epoch": 2.689122032026505, "grad_norm": 0.1758217981986949, "learning_rate": 4.0997227669522924e-05, "loss": 0.3666, "step": 152 }, { "epoch": 2.706791827719492, "grad_norm": 0.16067174619876137, "learning_rate": 4.0498652583816606e-05, "loss": 0.3592, "step": 153 }, { "epoch": 2.724461623412479, "grad_norm": 0.14593999093364027, "learning_rate": 4e-05, "loss": 0.3561, "step": 154 }, { "epoch": 2.7421314191054664, "grad_norm": 0.1603549305082767, "learning_rate": 3.95013474161834e-05, "loss": 0.3588, "step": 155 }, { "epoch": 2.759801214798454, "grad_norm": 0.11543165975506586, "learning_rate": 3.9002772330477096e-05, "loss": 0.3613, "step": 156 }, { "epoch": 2.7774710104914413, "grad_norm": 0.16445493416381396, "learning_rate": 3.850435222894698e-05, "loss": 0.3607, "step": 157 }, { "epoch": 2.7951408061844285, "grad_norm": 0.1414422759760679, "learning_rate": 3.800616457357211e-05, "loss": 0.36, "step": 158 }, { "epoch": 2.8128106018774157, "grad_norm": 0.13046409193282807, "learning_rate": 3.7508286790205916e-05, "loss": 0.35, "step": 159 }, { "epoch": 2.830480397570403, "grad_norm": 0.1596088175716807, "learning_rate": 3.7010796256543034e-05, "loss": 0.3639, "step": 160 }, { "epoch": 2.8481501932633906, "grad_norm": 0.1263702590709236, "learning_rate": 3.6513770290093674e-05, "loss": 0.3592, "step": 161 }, { "epoch": 2.865819988956378, "grad_norm": 0.14761623625344572, "learning_rate": 3.601728613616734e-05, "loss": 0.3609, "step": 162 }, { "epoch": 2.883489784649365, "grad_norm": 0.1457836294562342, "learning_rate": 3.552142095586769e-05, "loss": 0.3515, "step": 163 }, { "epoch": 2.901159580342352, "grad_norm": 0.11727253956988348, "learning_rate": 3.5026251814100604e-05, "loss": 0.3611, "step": 164 }, { "epoch": 2.9188293760353394, "grad_norm": 0.13457982400211851, "learning_rate": 3.453185566759707e-05, "loss": 0.3536, "step": 165 }, { "epoch": 2.936499171728327, "grad_norm": 0.13998361556088473, "learning_rate": 3.403830935295302e-05, "loss": 0.3608, "step": 166 }, { "epoch": 2.9541689674213143, "grad_norm": 0.14067124850466778, "learning_rate": 3.3545689574687734e-05, "loss": 0.3706, "step": 167 }, { "epoch": 2.9718387631143015, "grad_norm": 0.1318011050837946, "learning_rate": 3.305407289332279e-05, "loss": 0.3544, "step": 168 }, { "epoch": 2.9895085588072887, "grad_norm": 0.14546432839266002, "learning_rate": 3.256353571348342e-05, "loss": 0.3709, "step": 169 }, { "epoch": 3.008834897846494, "grad_norm": 0.19583526414298022, "learning_rate": 3.207415427202411e-05, "loss": 0.527, "step": 170 }, { "epoch": 3.026504693539481, "grad_norm": 0.1877498962594719, "learning_rate": 3.1586004626180175e-05, "loss": 0.3322, "step": 171 }, { "epoch": 3.0441744892324683, "grad_norm": 0.18186432632351485, "learning_rate": 3.109916264174743e-05, "loss": 0.3366, "step": 172 }, { "epoch": 3.0618442849254555, "grad_norm": 0.19885800612643872, "learning_rate": 3.0613703981291406e-05, "loss": 0.3324, "step": 173 }, { "epoch": 3.0795140806184427, "grad_norm": 0.2089734413211629, "learning_rate": 3.0129704092388253e-05, "loss": 0.3339, "step": 174 }, { "epoch": 3.0971838763114303, "grad_norm": 0.17173171928937206, "learning_rate": 2.9647238195899168e-05, "loss": 0.3335, "step": 175 }, { "epoch": 3.1148536720044175, "grad_norm": 0.2123048979948462, "learning_rate": 2.9166381274279803e-05, "loss": 0.3333, "step": 176 }, { "epoch": 3.1325234676974048, "grad_norm": 0.16806112420590733, "learning_rate": 2.8687208059926904e-05, "loss": 0.3353, "step": 177 }, { "epoch": 3.150193263390392, "grad_norm": 0.17559998407873006, "learning_rate": 2.8209793023563833e-05, "loss": 0.3304, "step": 178 }, { "epoch": 3.167863059083379, "grad_norm": 0.1478401374423511, "learning_rate": 2.7734210362666637e-05, "loss": 0.3301, "step": 179 }, { "epoch": 3.185532854776367, "grad_norm": 0.15148759846687243, "learning_rate": 2.7260533989932628e-05, "loss": 0.3332, "step": 180 }, { "epoch": 3.203202650469354, "grad_norm": 0.13080567876743235, "learning_rate": 2.678883752179333e-05, "loss": 0.3296, "step": 181 }, { "epoch": 3.2208724461623413, "grad_norm": 0.14032258509798645, "learning_rate": 2.6319194266973256e-05, "loss": 0.3272, "step": 182 }, { "epoch": 3.2385422418553285, "grad_norm": 0.1162785431430188, "learning_rate": 2.5851677215096745e-05, "loss": 0.3316, "step": 183 }, { "epoch": 3.2562120375483157, "grad_norm": 0.13212366326804822, "learning_rate": 2.53863590253442e-05, "loss": 0.3357, "step": 184 }, { "epoch": 3.2738818332413033, "grad_norm": 0.11293978492936926, "learning_rate": 2.4923312015159794e-05, "loss": 0.3301, "step": 185 }, { "epoch": 3.2915516289342905, "grad_norm": 0.11627210040478692, "learning_rate": 2.4462608149012215e-05, "loss": 0.3372, "step": 186 }, { "epoch": 3.3092214246272778, "grad_norm": 0.1116155121257842, "learning_rate": 2.400431902721037e-05, "loss": 0.332, "step": 187 }, { "epoch": 3.326891220320265, "grad_norm": 0.10499190887903717, "learning_rate": 2.3548515874775547e-05, "loss": 0.3258, "step": 188 }, { "epoch": 3.344561016013252, "grad_norm": 0.11985499625363151, "learning_rate": 2.3095269530372032e-05, "loss": 0.3356, "step": 189 }, { "epoch": 3.36223081170624, "grad_norm": 0.10946277088364416, "learning_rate": 2.264465043529768e-05, "loss": 0.3339, "step": 190 }, { "epoch": 3.379900607399227, "grad_norm": 0.10946281010542962, "learning_rate": 2.2196728622536304e-05, "loss": 0.3324, "step": 191 }, { "epoch": 3.3975704030922143, "grad_norm": 0.10470866039213844, "learning_rate": 2.175157370587348e-05, "loss": 0.3333, "step": 192 }, { "epoch": 3.4152401987852015, "grad_norm": 0.11496566123138528, "learning_rate": 2.130925486907752e-05, "loss": 0.3299, "step": 193 }, { "epoch": 3.4329099944781887, "grad_norm": 0.10141893666492717, "learning_rate": 2.0869840855147286e-05, "loss": 0.3415, "step": 194 }, { "epoch": 3.4505797901711763, "grad_norm": 0.11362446781863374, "learning_rate": 2.0433399955628443e-05, "loss": 0.3325, "step": 195 }, { "epoch": 3.4682495858641635, "grad_norm": 0.09706953644786295, "learning_rate": 2.0000000000000012e-05, "loss": 0.3385, "step": 196 }, { "epoch": 3.4859193815571508, "grad_norm": 0.10343474750084429, "learning_rate": 1.956970834513259e-05, "loss": 0.3324, "step": 197 }, { "epoch": 3.503589177250138, "grad_norm": 0.1115997799981753, "learning_rate": 1.914259186482008e-05, "loss": 0.3304, "step": 198 }, { "epoch": 3.5212589729431256, "grad_norm": 0.100665947226005, "learning_rate": 1.8718716939386543e-05, "loss": 0.341, "step": 199 }, { "epoch": 3.5389287686361124, "grad_norm": 0.1112389145071717, "learning_rate": 1.829814944536963e-05, "loss": 0.3311, "step": 200 }, { "epoch": 3.5565985643291, "grad_norm": 0.10508913738753005, "learning_rate": 1.7880954745282425e-05, "loss": 0.3262, "step": 201 }, { "epoch": 3.5742683600220873, "grad_norm": 0.10715189168423515, "learning_rate": 1.7467197677455118e-05, "loss": 0.3387, "step": 202 }, { "epoch": 3.5919381557150745, "grad_norm": 0.12309528662432599, "learning_rate": 1.7056942545958167e-05, "loss": 0.3272, "step": 203 }, { "epoch": 3.609607951408062, "grad_norm": 0.09503224614910906, "learning_rate": 1.6650253110608415e-05, "loss": 0.3361, "step": 204 }, { "epoch": 3.627277747101049, "grad_norm": 0.10336176601010666, "learning_rate": 1.6247192577059943e-05, "loss": 0.3394, "step": 205 }, { "epoch": 3.6449475427940365, "grad_norm": 0.10325570712107657, "learning_rate": 1.5847823586980897e-05, "loss": 0.3329, "step": 206 }, { "epoch": 3.6626173384870238, "grad_norm": 0.09392755935325038, "learning_rate": 1.545220820831811e-05, "loss": 0.3273, "step": 207 }, { "epoch": 3.680287134180011, "grad_norm": 0.09365769242973894, "learning_rate": 1.5060407925650662e-05, "loss": 0.3366, "step": 208 }, { "epoch": 3.697956929872998, "grad_norm": 0.11814878497030305, "learning_rate": 1.4672483630634414e-05, "loss": 0.3365, "step": 209 }, { "epoch": 3.7156267255659854, "grad_norm": 0.08867405697122034, "learning_rate": 1.4288495612538427e-05, "loss": 0.3344, "step": 210 }, { "epoch": 3.733296521258973, "grad_norm": 0.09623647972155307, "learning_rate": 1.3908503548875167e-05, "loss": 0.334, "step": 211 }, { "epoch": 3.7509663169519603, "grad_norm": 0.10214752499383144, "learning_rate": 1.3532566496125634e-05, "loss": 0.3319, "step": 212 }, { "epoch": 3.7686361126449475, "grad_norm": 0.09305403874654943, "learning_rate": 1.3160742880561204e-05, "loss": 0.3327, "step": 213 }, { "epoch": 3.7863059083379347, "grad_norm": 0.09492686150258188, "learning_rate": 1.2793090489163218e-05, "loss": 0.3276, "step": 214 }, { "epoch": 3.803975704030922, "grad_norm": 0.08712346665776856, "learning_rate": 1.242966646064212e-05, "loss": 0.3378, "step": 215 }, { "epoch": 3.8216454997239095, "grad_norm": 0.09302902241662848, "learning_rate": 1.2070527276557092e-05, "loss": 0.3276, "step": 216 }, { "epoch": 3.8393152954168968, "grad_norm": 0.09984664869036139, "learning_rate": 1.1715728752538103e-05, "loss": 0.335, "step": 217 }, { "epoch": 3.856985091109884, "grad_norm": 0.08136111333958188, "learning_rate": 1.1365326029611263e-05, "loss": 0.325, "step": 218 }, { "epoch": 3.874654886802871, "grad_norm": 0.09067392795608281, "learning_rate": 1.1019373565629094e-05, "loss": 0.3326, "step": 219 }, { "epoch": 3.8923246824958584, "grad_norm": 0.1151467415131904, "learning_rate": 1.0677925126806956e-05, "loss": 0.3338, "step": 220 }, { "epoch": 3.909994478188846, "grad_norm": 0.08540355002710472, "learning_rate": 1.0341033779366931e-05, "loss": 0.3281, "step": 221 }, { "epoch": 3.9276642738818333, "grad_norm": 0.08890862244357016, "learning_rate": 1.0008751881290628e-05, "loss": 0.3279, "step": 222 }, { "epoch": 3.9453340695748205, "grad_norm": 0.08886465646079035, "learning_rate": 9.681131074181876e-06, "loss": 0.3331, "step": 223 }, { "epoch": 3.9630038652678077, "grad_norm": 0.08382860075616222, "learning_rate": 9.358222275240884e-06, "loss": 0.3301, "step": 224 }, { "epoch": 3.980673660960795, "grad_norm": 0.08493360832822018, "learning_rate": 9.040075669350905e-06, "loss": 0.3321, "step": 225 }, { "epoch": 3.9983434566537825, "grad_norm": 0.12681148128353786, "learning_rate": 8.72674070127881e-06, "loss": 0.4956, "step": 226 }, { "epoch": 4.017669795692988, "grad_norm": 0.11963433162616376, "learning_rate": 8.418266067990588e-06, "loss": 0.3171, "step": 227 }, { "epoch": 4.035339591385974, "grad_norm": 0.11337795603837922, "learning_rate": 8.114699711083113e-06, "loss": 0.3207, "step": 228 }, { "epoch": 4.053009387078962, "grad_norm": 0.08872533010955261, "learning_rate": 7.816088809333266e-06, "loss": 0.3165, "step": 229 }, { "epoch": 4.070679182771949, "grad_norm": 0.09241216784587171, "learning_rate": 7.52247977136574e-06, "loss": 0.328, "step": 230 }, { "epoch": 4.0883489784649365, "grad_norm": 0.09282854796647831, "learning_rate": 7.233918228440324e-06, "loss": 0.3119, "step": 231 }, { "epoch": 4.106018774157924, "grad_norm": 0.10157632018918776, "learning_rate": 6.950449027360213e-06, "loss": 0.3182, "step": 232 }, { "epoch": 4.123688569850911, "grad_norm": 0.10277617383975274, "learning_rate": 6.6721162235020476e-06, "loss": 0.319, "step": 233 }, { "epoch": 4.141358365543899, "grad_norm": 0.10056534251093886, "learning_rate": 6.398963073969144e-06, "loss": 0.3171, "step": 234 }, { "epoch": 4.159028161236885, "grad_norm": 0.09505805141388292, "learning_rate": 6.1310320308686354e-06, "loss": 0.3147, "step": 235 }, { "epoch": 4.176697956929873, "grad_norm": 0.09268654061240998, "learning_rate": 5.868364734713776e-06, "loss": 0.3191, "step": 236 }, { "epoch": 4.194367752622861, "grad_norm": 0.08706624460622792, "learning_rate": 5.611002007952389e-06, "loss": 0.3208, "step": 237 }, { "epoch": 4.212037548315847, "grad_norm": 0.08951679698719879, "learning_rate": 5.358983848622452e-06, "loss": 0.3172, "step": 238 }, { "epoch": 4.229707344008835, "grad_norm": 0.09131830865477111, "learning_rate": 5.112349424135788e-06, "loss": 0.3164, "step": 239 }, { "epoch": 4.247377139701822, "grad_norm": 0.08750009357915652, "learning_rate": 4.871137065190854e-06, "loss": 0.3106, "step": 240 }, { "epoch": 4.2650469353948095, "grad_norm": 0.0850783896741063, "learning_rate": 4.635384259815614e-06, "loss": 0.3169, "step": 241 }, { "epoch": 4.282716731087797, "grad_norm": 0.08475820034880008, "learning_rate": 4.405127647541259e-06, "loss": 0.3196, "step": 242 }, { "epoch": 4.300386526780784, "grad_norm": 0.08682717964250186, "learning_rate": 4.180403013707963e-06, "loss": 0.3109, "step": 243 }, { "epoch": 4.318056322473772, "grad_norm": 0.08602559336503131, "learning_rate": 3.961245283903239e-06, "loss": 0.3118, "step": 244 }, { "epoch": 4.335726118166758, "grad_norm": 0.08629377402093397, "learning_rate": 3.747688518534003e-06, "loss": 0.3153, "step": 245 }, { "epoch": 4.353395913859746, "grad_norm": 0.08163055098829672, "learning_rate": 3.5397659075330748e-06, "loss": 0.3139, "step": 246 }, { "epoch": 4.371065709552734, "grad_norm": 0.07572785159509317, "learning_rate": 3.3375097652009526e-06, "loss": 0.313, "step": 247 }, { "epoch": 4.38873550524572, "grad_norm": 0.08030300979420142, "learning_rate": 3.140951525183691e-06, "loss": 0.3154, "step": 248 }, { "epoch": 4.406405300938708, "grad_norm": 0.07708287661350902, "learning_rate": 2.950121735587654e-06, "loss": 0.3168, "step": 249 }, { "epoch": 4.424075096631695, "grad_norm": 0.09203966729547491, "learning_rate": 2.765050054231835e-06, "loss": 0.314, "step": 250 }, { "epoch": 4.4417448923246825, "grad_norm": 0.08041597052637767, "learning_rate": 2.5857652440386404e-06, "loss": 0.3197, "step": 251 }, { "epoch": 4.45941468801767, "grad_norm": 0.08037954632735851, "learning_rate": 2.4122951685636674e-06, "loss": 0.3185, "step": 252 }, { "epoch": 4.477084483710657, "grad_norm": 0.07294469295727943, "learning_rate": 2.244666787665297e-06, "loss": 0.3198, "step": 253 }, { "epoch": 4.494754279403645, "grad_norm": 0.07372400441204935, "learning_rate": 2.0829061533147322e-06, "loss": 0.3125, "step": 254 }, { "epoch": 4.512424075096631, "grad_norm": 0.07696683693078588, "learning_rate": 1.927038405547106e-06, "loss": 0.3153, "step": 255 }, { "epoch": 4.530093870789619, "grad_norm": 0.07761898190749474, "learning_rate": 1.7770877685543687e-06, "loss": 0.3164, "step": 256 }, { "epoch": 4.547763666482607, "grad_norm": 0.07405297885815232, "learning_rate": 1.6330775469204895e-06, "loss": 0.3165, "step": 257 }, { "epoch": 4.565433462175593, "grad_norm": 0.07194679469449752, "learning_rate": 1.495030121999519e-06, "loss": 0.3174, "step": 258 }, { "epoch": 4.583103257868581, "grad_norm": 0.07372588870054934, "learning_rate": 1.3629669484372722e-06, "loss": 0.3125, "step": 259 }, { "epoch": 4.600773053561568, "grad_norm": 0.0736484855706964, "learning_rate": 1.2369085508368862e-06, "loss": 0.3117, "step": 260 }, { "epoch": 4.6184428492545555, "grad_norm": 0.06929860862646713, "learning_rate": 1.1168745205690202e-06, "loss": 0.3188, "step": 261 }, { "epoch": 4.636112644947543, "grad_norm": 0.06998294296555901, "learning_rate": 1.0028835127270553e-06, "loss": 0.3111, "step": 262 }, { "epoch": 4.65378244064053, "grad_norm": 0.07010415717148563, "learning_rate": 8.949532432278185e-07, "loss": 0.3157, "step": 263 }, { "epoch": 4.671452236333518, "grad_norm": 0.07068983473178043, "learning_rate": 7.93100486058247e-07, "loss": 0.322, "step": 264 }, { "epoch": 4.689122032026504, "grad_norm": 0.07415661278084447, "learning_rate": 6.973410706684691e-07, "loss": 0.3122, "step": 265 }, { "epoch": 4.706791827719492, "grad_norm": 0.07147977962991126, "learning_rate": 6.076898795116792e-07, "loss": 0.3162, "step": 266 }, { "epoch": 4.72446162341248, "grad_norm": 0.07009164551218153, "learning_rate": 5.241608457311565e-07, "loss": 0.3174, "step": 267 }, { "epoch": 4.742131419105466, "grad_norm": 0.07130206559582739, "learning_rate": 4.467669509948591e-07, "loss": 0.3114, "step": 268 }, { "epoch": 4.759801214798454, "grad_norm": 0.06774782729285217, "learning_rate": 3.7552022347788766e-07, "loss": 0.3138, "step": 269 }, { "epoch": 4.777471010491441, "grad_norm": 0.07138164733421887, "learning_rate": 3.104317359931175e-07, "loss": 0.3209, "step": 270 }, { "epoch": 4.7951408061844285, "grad_norm": 0.06931798887191189, "learning_rate": 2.5151160427029584e-07, "loss": 0.3171, "step": 271 }, { "epoch": 4.812810601877416, "grad_norm": 0.07008995274999451, "learning_rate": 1.9876898538394362e-07, "loss": 0.311, "step": 272 }, { "epoch": 4.830480397570403, "grad_norm": 0.06835397987857686, "learning_rate": 1.522120763301782e-07, "loss": 0.323, "step": 273 }, { "epoch": 4.848150193263391, "grad_norm": 0.06982710466662823, "learning_rate": 1.1184811275279483e-07, "loss": 0.3209, "step": 274 }, { "epoch": 4.865819988956377, "grad_norm": 0.06719415454501948, "learning_rate": 7.76833678187261e-08, "loss": 0.3125, "step": 275 }, { "epoch": 4.883489784649365, "grad_norm": 0.06985964768247895, "learning_rate": 4.9723151243106225e-08, "loss": 0.3192, "step": 276 }, { "epoch": 4.901159580342353, "grad_norm": 0.06760152861997361, "learning_rate": 2.797180846405567e-08, "loss": 0.3176, "step": 277 }, { "epoch": 4.918829376035339, "grad_norm": 0.07336231991253046, "learning_rate": 1.2432719967350182e-08, "loss": 0.3201, "step": 278 }, { "epoch": 4.936499171728327, "grad_norm": 0.06901399516812774, "learning_rate": 3.108300761005545e-09, "loss": 0.3193, "step": 279 }, { "epoch": 4.954168967421314, "grad_norm": 0.06958592985440362, "learning_rate": 0.0, "loss": 0.3178, "step": 280 }, { "epoch": 4.954168967421314, "step": 280, "total_flos": 7.445251410192499e+18, "train_loss": 0.3906520079289164, "train_runtime": 65492.582, "train_samples_per_second": 2.212, "train_steps_per_second": 0.004 } ], "logging_steps": 1, "max_steps": 280, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.445251410192499e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }