{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9904153354632586, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009584664536741214, "grad_norm": 5.816953152720709, "learning_rate": 1.25e-06, "loss": 0.8809, "step": 1 }, { "epoch": 0.019169329073482427, "grad_norm": 5.964463857967548, "learning_rate": 2.5e-06, "loss": 0.8476, "step": 2 }, { "epoch": 0.02875399361022364, "grad_norm": 5.85622862207096, "learning_rate": 3.7500000000000005e-06, "loss": 0.8749, "step": 3 }, { "epoch": 0.038338658146964855, "grad_norm": 5.365349431954843, "learning_rate": 5e-06, "loss": 0.8422, "step": 4 }, { "epoch": 0.04792332268370607, "grad_norm": 4.006201698306361, "learning_rate": 6.25e-06, "loss": 0.8005, "step": 5 }, { "epoch": 0.05750798722044728, "grad_norm": 2.122270635740227, "learning_rate": 7.500000000000001e-06, "loss": 0.7687, "step": 6 }, { "epoch": 0.0670926517571885, "grad_norm": 3.975962985256971, "learning_rate": 8.750000000000001e-06, "loss": 0.7686, "step": 7 }, { "epoch": 0.07667731629392971, "grad_norm": 4.3633197716567595, "learning_rate": 1e-05, "loss": 0.761, "step": 8 }, { "epoch": 0.08626198083067092, "grad_norm": 4.812484653337378, "learning_rate": 1.125e-05, "loss": 0.7451, "step": 9 }, { "epoch": 0.09584664536741214, "grad_norm": 4.996167212561038, "learning_rate": 1.25e-05, "loss": 0.7657, "step": 10 }, { "epoch": 0.10543130990415335, "grad_norm": 3.391593689539008, "learning_rate": 1.375e-05, "loss": 0.7293, "step": 11 }, { "epoch": 0.11501597444089456, "grad_norm": 2.024258637124186, "learning_rate": 1.5000000000000002e-05, "loss": 0.6753, "step": 12 }, { "epoch": 0.12460063897763578, "grad_norm": 2.368340767997839, "learning_rate": 1.6250000000000002e-05, "loss": 0.675, "step": 13 }, { "epoch": 0.134185303514377, "grad_norm": 1.9034533687196296, "learning_rate": 1.7500000000000002e-05, "loss": 0.6346, "step": 14 }, { "epoch": 0.14376996805111822, "grad_norm": 1.511274358606332, "learning_rate": 1.8750000000000002e-05, "loss": 0.6395, "step": 15 }, { "epoch": 0.15335463258785942, "grad_norm": 1.5258008797446443, "learning_rate": 2e-05, "loss": 0.6605, "step": 16 }, { "epoch": 0.16293929712460065, "grad_norm": 1.4701453015485093, "learning_rate": 2.125e-05, "loss": 0.6345, "step": 17 }, { "epoch": 0.17252396166134185, "grad_norm": 1.139297657192997, "learning_rate": 2.25e-05, "loss": 0.6313, "step": 18 }, { "epoch": 0.18210862619808307, "grad_norm": 1.0628460283930738, "learning_rate": 2.375e-05, "loss": 0.6093, "step": 19 }, { "epoch": 0.19169329073482427, "grad_norm": 1.0582580403121424, "learning_rate": 2.5e-05, "loss": 0.5867, "step": 20 }, { "epoch": 0.2012779552715655, "grad_norm": 0.8977556913012904, "learning_rate": 2.625e-05, "loss": 0.5876, "step": 21 }, { "epoch": 0.2108626198083067, "grad_norm": 1.3235677545982394, "learning_rate": 2.75e-05, "loss": 0.5909, "step": 22 }, { "epoch": 0.22044728434504793, "grad_norm": 0.8869847698482135, "learning_rate": 2.875e-05, "loss": 0.5739, "step": 23 }, { "epoch": 0.23003194888178913, "grad_norm": 1.1759129421868764, "learning_rate": 3.0000000000000004e-05, "loss": 0.588, "step": 24 }, { "epoch": 0.23961661341853036, "grad_norm": 0.9909103854124853, "learning_rate": 3.125e-05, "loss": 0.5675, "step": 25 }, { "epoch": 0.24920127795527156, "grad_norm": 1.0713203550156574, "learning_rate": 3.2500000000000004e-05, "loss": 0.5492, "step": 26 }, { "epoch": 0.25878594249201275, "grad_norm": 0.9586606498055252, "learning_rate": 3.375e-05, "loss": 0.5539, "step": 27 }, { "epoch": 0.268370607028754, "grad_norm": 0.9213875490205139, "learning_rate": 3.5000000000000004e-05, "loss": 0.5319, "step": 28 }, { "epoch": 0.2779552715654952, "grad_norm": 1.0198534952792746, "learning_rate": 3.625e-05, "loss": 0.5722, "step": 29 }, { "epoch": 0.28753993610223644, "grad_norm": 0.9853545050808614, "learning_rate": 3.7500000000000003e-05, "loss": 0.5514, "step": 30 }, { "epoch": 0.2971246006389776, "grad_norm": 0.9771316358011265, "learning_rate": 3.875e-05, "loss": 0.548, "step": 31 }, { "epoch": 0.30670926517571884, "grad_norm": 1.070148811782629, "learning_rate": 4e-05, "loss": 0.5776, "step": 32 }, { "epoch": 0.31629392971246006, "grad_norm": 0.8966401903811619, "learning_rate": 3.9998741135094016e-05, "loss": 0.5444, "step": 33 }, { "epoch": 0.3258785942492013, "grad_norm": 0.893875729745141, "learning_rate": 3.999496469885014e-05, "loss": 0.5704, "step": 34 }, { "epoch": 0.3354632587859425, "grad_norm": 1.072867766600089, "learning_rate": 3.998867116667067e-05, "loss": 0.5477, "step": 35 }, { "epoch": 0.3450479233226837, "grad_norm": 0.9635968083722172, "learning_rate": 3.9979861330826295e-05, "loss": 0.5534, "step": 36 }, { "epoch": 0.3546325878594249, "grad_norm": 0.9899242418062558, "learning_rate": 3.996853630035634e-05, "loss": 0.5655, "step": 37 }, { "epoch": 0.36421725239616615, "grad_norm": 0.9130531711455715, "learning_rate": 3.995469750092912e-05, "loss": 0.5817, "step": 38 }, { "epoch": 0.3738019169329074, "grad_norm": 0.9387442528138782, "learning_rate": 3.9938346674662565e-05, "loss": 0.5044, "step": 39 }, { "epoch": 0.38338658146964855, "grad_norm": 1.0142174519180858, "learning_rate": 3.991948587990479e-05, "loss": 0.5359, "step": 40 }, { "epoch": 0.3929712460063898, "grad_norm": 0.8959196832437484, "learning_rate": 3.989811749097505e-05, "loss": 0.5537, "step": 41 }, { "epoch": 0.402555910543131, "grad_norm": 0.9987977928627828, "learning_rate": 3.9874244197864856e-05, "loss": 0.5782, "step": 42 }, { "epoch": 0.41214057507987223, "grad_norm": 0.8845338538227933, "learning_rate": 3.984786900589929e-05, "loss": 0.5415, "step": 43 }, { "epoch": 0.4217252396166134, "grad_norm": 1.0047800925961548, "learning_rate": 3.98189952353587e-05, "loss": 0.5542, "step": 44 }, { "epoch": 0.43130990415335463, "grad_norm": 0.8022173762000696, "learning_rate": 3.9787626521060736e-05, "loss": 0.5483, "step": 45 }, { "epoch": 0.44089456869009586, "grad_norm": 0.7883000207976826, "learning_rate": 3.9753766811902756e-05, "loss": 0.5473, "step": 46 }, { "epoch": 0.4504792332268371, "grad_norm": 0.8434281567738428, "learning_rate": 3.971742037036472e-05, "loss": 0.5634, "step": 47 }, { "epoch": 0.46006389776357826, "grad_norm": 1.0277496566918978, "learning_rate": 3.96785917719726e-05, "loss": 0.558, "step": 48 }, { "epoch": 0.4696485623003195, "grad_norm": 0.99005810315587, "learning_rate": 3.9637285904722376e-05, "loss": 0.5508, "step": 49 }, { "epoch": 0.4792332268370607, "grad_norm": 1.0782113201945098, "learning_rate": 3.9593507968464714e-05, "loss": 0.5478, "step": 50 }, { "epoch": 0.48881789137380194, "grad_norm": 0.721872938484921, "learning_rate": 3.9547263474250385e-05, "loss": 0.5366, "step": 51 }, { "epoch": 0.4984025559105431, "grad_norm": 1.2142224090633027, "learning_rate": 3.949855824363647e-05, "loss": 0.5544, "step": 52 }, { "epoch": 0.5079872204472844, "grad_norm": 0.9185255274236219, "learning_rate": 3.9447398407953536e-05, "loss": 0.4976, "step": 53 }, { "epoch": 0.5175718849840255, "grad_norm": 1.0876007980497202, "learning_rate": 3.939379040753374e-05, "loss": 0.5374, "step": 54 }, { "epoch": 0.5271565495207667, "grad_norm": 1.2327514959394714, "learning_rate": 3.933774099090013e-05, "loss": 0.5517, "step": 55 }, { "epoch": 0.536741214057508, "grad_norm": 0.6905831120274069, "learning_rate": 3.927925721391707e-05, "loss": 0.5114, "step": 56 }, { "epoch": 0.5463258785942492, "grad_norm": 1.3327650152930188, "learning_rate": 3.9218346438901996e-05, "loss": 0.5556, "step": 57 }, { "epoch": 0.5559105431309904, "grad_norm": 0.7267801369686653, "learning_rate": 3.9155016333698615e-05, "loss": 0.5446, "step": 58 }, { "epoch": 0.5654952076677316, "grad_norm": 1.1332074133198213, "learning_rate": 3.908927487071162e-05, "loss": 0.5297, "step": 59 }, { "epoch": 0.5750798722044729, "grad_norm": 1.1446700448302367, "learning_rate": 3.9021130325903076e-05, "loss": 0.5283, "step": 60 }, { "epoch": 0.5846645367412141, "grad_norm": 0.9428641512869639, "learning_rate": 3.895059127775058e-05, "loss": 0.5276, "step": 61 }, { "epoch": 0.5942492012779552, "grad_norm": 0.984662057057416, "learning_rate": 3.8877666606167354e-05, "loss": 0.5053, "step": 62 }, { "epoch": 0.6038338658146964, "grad_norm": 0.9200152881682313, "learning_rate": 3.880236549138438e-05, "loss": 0.5019, "step": 63 }, { "epoch": 0.6134185303514377, "grad_norm": 0.9735224575622512, "learning_rate": 3.872469741279475e-05, "loss": 0.5027, "step": 64 }, { "epoch": 0.6230031948881789, "grad_norm": 0.8601738218870937, "learning_rate": 3.8644672147760286e-05, "loss": 0.5373, "step": 65 }, { "epoch": 0.6325878594249201, "grad_norm": 1.0690694771471052, "learning_rate": 3.856229977038078e-05, "loss": 0.5141, "step": 66 }, { "epoch": 0.6421725239616614, "grad_norm": 0.639345682465469, "learning_rate": 3.8477590650225735e-05, "loss": 0.5237, "step": 67 }, { "epoch": 0.6517571884984026, "grad_norm": 0.9825658908653957, "learning_rate": 3.839055545102902e-05, "loss": 0.5218, "step": 68 }, { "epoch": 0.6613418530351438, "grad_norm": 0.6640592278410514, "learning_rate": 3.83012051293464e-05, "loss": 0.5106, "step": 69 }, { "epoch": 0.670926517571885, "grad_norm": 0.9942824925976919, "learning_rate": 3.8209550933176324e-05, "loss": 0.5371, "step": 70 }, { "epoch": 0.6805111821086262, "grad_norm": 0.8321580496371637, "learning_rate": 3.8115604400543885e-05, "loss": 0.5285, "step": 71 }, { "epoch": 0.6900958466453674, "grad_norm": 0.6720920559775677, "learning_rate": 3.801937735804838e-05, "loss": 0.5226, "step": 72 }, { "epoch": 0.6996805111821086, "grad_norm": 0.869173718218821, "learning_rate": 3.792088191937451e-05, "loss": 0.5575, "step": 73 }, { "epoch": 0.7092651757188498, "grad_norm": 0.9252932692964674, "learning_rate": 3.782013048376736e-05, "loss": 0.5331, "step": 74 }, { "epoch": 0.7188498402555911, "grad_norm": 0.7617188628088486, "learning_rate": 3.77171357344716e-05, "loss": 0.4955, "step": 75 }, { "epoch": 0.7284345047923323, "grad_norm": 0.8768762141880961, "learning_rate": 3.761191063713476e-05, "loss": 0.5319, "step": 76 }, { "epoch": 0.7380191693290735, "grad_norm": 0.5856671731846674, "learning_rate": 3.7504468438175076e-05, "loss": 0.5286, "step": 77 }, { "epoch": 0.7476038338658147, "grad_norm": 0.7357152016449541, "learning_rate": 3.7394822663113915e-05, "loss": 0.5202, "step": 78 }, { "epoch": 0.7571884984025559, "grad_norm": 0.8421603022757149, "learning_rate": 3.72829871148731e-05, "loss": 0.5094, "step": 79 }, { "epoch": 0.7667731629392971, "grad_norm": 0.7711330057528273, "learning_rate": 3.716897587203733e-05, "loss": 0.5067, "step": 80 }, { "epoch": 0.7763578274760383, "grad_norm": 0.8061539763692872, "learning_rate": 3.705280328708185e-05, "loss": 0.5052, "step": 81 }, { "epoch": 0.7859424920127795, "grad_norm": 0.9052464984501254, "learning_rate": 3.6934483984565684e-05, "loss": 0.5396, "step": 82 }, { "epoch": 0.7955271565495208, "grad_norm": 0.7613495063825277, "learning_rate": 3.681403285929061e-05, "loss": 0.5372, "step": 83 }, { "epoch": 0.805111821086262, "grad_norm": 0.9112267984327458, "learning_rate": 3.669146507442606e-05, "loss": 0.5022, "step": 84 }, { "epoch": 0.8146964856230032, "grad_norm": 0.776862619807027, "learning_rate": 3.6566796059600334e-05, "loss": 0.4943, "step": 85 }, { "epoch": 0.8242811501597445, "grad_norm": 0.6857530550183694, "learning_rate": 3.644004150895821e-05, "loss": 0.5137, "step": 86 }, { "epoch": 0.8338658146964856, "grad_norm": 1.0001214073388247, "learning_rate": 3.631121737918521e-05, "loss": 0.5211, "step": 87 }, { "epoch": 0.8434504792332268, "grad_norm": 0.614100200634229, "learning_rate": 3.6180339887498953e-05, "loss": 0.5426, "step": 88 }, { "epoch": 0.853035143769968, "grad_norm": 0.7416467757559503, "learning_rate": 3.6047425509607566e-05, "loss": 0.4952, "step": 89 }, { "epoch": 0.8626198083067093, "grad_norm": 0.8216759125475441, "learning_rate": 3.591249097763562e-05, "loss": 0.5101, "step": 90 }, { "epoch": 0.8722044728434505, "grad_norm": 0.8783521934817947, "learning_rate": 3.5775553278017824e-05, "loss": 0.5298, "step": 91 }, { "epoch": 0.8817891373801917, "grad_norm": 1.0305280265606676, "learning_rate": 3.56366296493606e-05, "loss": 0.509, "step": 92 }, { "epoch": 0.8913738019169329, "grad_norm": 0.9394240012360305, "learning_rate": 3.5495737580272024e-05, "loss": 0.5078, "step": 93 }, { "epoch": 0.9009584664536742, "grad_norm": 0.9309348675119059, "learning_rate": 3.535289480716023e-05, "loss": 0.5207, "step": 94 }, { "epoch": 0.9105431309904153, "grad_norm": 0.9046584824077331, "learning_rate": 3.520811931200063e-05, "loss": 0.5141, "step": 95 }, { "epoch": 0.9201277955271565, "grad_norm": 0.9752162620950977, "learning_rate": 3.5061429320072225e-05, "loss": 0.509, "step": 96 }, { "epoch": 0.9297124600638977, "grad_norm": 0.6999864028990369, "learning_rate": 3.4912843297663315e-05, "loss": 0.5009, "step": 97 }, { "epoch": 0.939297124600639, "grad_norm": 0.8545396872964545, "learning_rate": 3.476237994974682e-05, "loss": 0.5094, "step": 98 }, { "epoch": 0.9488817891373802, "grad_norm": 0.7634132561961192, "learning_rate": 3.4610058217625554e-05, "loss": 0.5254, "step": 99 }, { "epoch": 0.9584664536741214, "grad_norm": 0.7968935260475134, "learning_rate": 3.4455897276547836e-05, "loss": 0.5103, "step": 100 }, { "epoch": 0.9680511182108626, "grad_norm": 0.8779383622683993, "learning_rate": 3.429991653329351e-05, "loss": 0.4962, "step": 101 }, { "epoch": 0.9776357827476039, "grad_norm": 0.8873245389104287, "learning_rate": 3.4142135623730954e-05, "loss": 0.4886, "step": 102 }, { "epoch": 0.987220447284345, "grad_norm": 0.6369979342430059, "learning_rate": 3.398257441034515e-05, "loss": 0.5153, "step": 103 }, { "epoch": 0.9968051118210862, "grad_norm": 0.7723810447745028, "learning_rate": 3.38212529797373e-05, "loss": 0.5153, "step": 104 }, { "epoch": 1.0063897763578276, "grad_norm": 1.3732822336657626, "learning_rate": 3.365819164009614e-05, "loss": 0.8678, "step": 105 }, { "epoch": 1.0159744408945688, "grad_norm": 0.6491143685994553, "learning_rate": 3.349341091864149e-05, "loss": 0.3831, "step": 106 }, { "epoch": 1.0255591054313098, "grad_norm": 0.7068223705318214, "learning_rate": 3.3326931559040084e-05, "loss": 0.4093, "step": 107 }, { "epoch": 1.035143769968051, "grad_norm": 0.7448924108254381, "learning_rate": 3.315877451879426e-05, "loss": 0.4151, "step": 108 }, { "epoch": 1.0447284345047922, "grad_norm": 0.6408577512276367, "learning_rate": 3.298896096660367e-05, "loss": 0.4591, "step": 109 }, { "epoch": 1.0543130990415335, "grad_norm": 0.7245321395554075, "learning_rate": 3.2817512279700486e-05, "loss": 0.3995, "step": 110 }, { "epoch": 1.0638977635782747, "grad_norm": 0.8050047056671462, "learning_rate": 3.26444500411582e-05, "loss": 0.4517, "step": 111 }, { "epoch": 1.073482428115016, "grad_norm": 0.5692172276488292, "learning_rate": 3.246979603717467e-05, "loss": 0.3451, "step": 112 }, { "epoch": 1.0830670926517572, "grad_norm": 0.7143343812753972, "learning_rate": 3.2293572254329546e-05, "loss": 0.4343, "step": 113 }, { "epoch": 1.0926517571884984, "grad_norm": 0.621683888292599, "learning_rate": 3.21158008768164e-05, "loss": 0.4192, "step": 114 }, { "epoch": 1.1022364217252396, "grad_norm": 0.7045370668340967, "learning_rate": 3.1936504283650076e-05, "loss": 0.4369, "step": 115 }, { "epoch": 1.1118210862619808, "grad_norm": 0.689049088523779, "learning_rate": 3.1755705045849465e-05, "loss": 0.3995, "step": 116 }, { "epoch": 1.121405750798722, "grad_norm": 0.6154467041297189, "learning_rate": 3.157342592359612e-05, "loss": 0.4309, "step": 117 }, { "epoch": 1.1309904153354633, "grad_norm": 0.6721872707368349, "learning_rate": 3.138968986336904e-05, "loss": 0.3839, "step": 118 }, { "epoch": 1.1405750798722045, "grad_norm": 0.6919770496482566, "learning_rate": 3.1204519995056056e-05, "loss": 0.4536, "step": 119 }, { "epoch": 1.1501597444089458, "grad_norm": 0.548860031455983, "learning_rate": 3.101793962904205e-05, "loss": 0.3669, "step": 120 }, { "epoch": 1.159744408945687, "grad_norm": 0.662490642467861, "learning_rate": 3.082997225327452e-05, "loss": 0.4413, "step": 121 }, { "epoch": 1.1693290734824282, "grad_norm": 0.6083191080476268, "learning_rate": 3.064064153030673e-05, "loss": 0.3873, "step": 122 }, { "epoch": 1.1789137380191694, "grad_norm": 0.6537310951440493, "learning_rate": 3.0449971294318977e-05, "loss": 0.4173, "step": 123 }, { "epoch": 1.1884984025559104, "grad_norm": 0.6521406167992244, "learning_rate": 3.0257985548118127e-05, "loss": 0.4252, "step": 124 }, { "epoch": 1.1980830670926517, "grad_norm": 0.6149682224088401, "learning_rate": 3.0064708460116007e-05, "loss": 0.4375, "step": 125 }, { "epoch": 1.207667731629393, "grad_norm": 0.5660735007468374, "learning_rate": 2.987016436128694e-05, "loss": 0.4002, "step": 126 }, { "epoch": 1.2172523961661341, "grad_norm": 0.6860797663501914, "learning_rate": 2.9674377742104798e-05, "loss": 0.4501, "step": 127 }, { "epoch": 1.2268370607028753, "grad_norm": 0.5579864217961026, "learning_rate": 2.9477373249459974e-05, "loss": 0.3881, "step": 128 }, { "epoch": 1.2364217252396166, "grad_norm": 0.7520213328572914, "learning_rate": 2.9279175683556684e-05, "loss": 0.4309, "step": 129 }, { "epoch": 1.2460063897763578, "grad_norm": 0.5836183639678406, "learning_rate": 2.9079809994790937e-05, "loss": 0.4493, "step": 130 }, { "epoch": 1.255591054313099, "grad_norm": 0.5454961635732396, "learning_rate": 2.8879301280609645e-05, "loss": 0.4248, "step": 131 }, { "epoch": 1.2651757188498403, "grad_norm": 0.5814578805359331, "learning_rate": 2.8677674782351164e-05, "loss": 0.4133, "step": 132 }, { "epoch": 1.2747603833865815, "grad_norm": 0.529954247172676, "learning_rate": 2.8474955882067776e-05, "loss": 0.4165, "step": 133 }, { "epoch": 1.2843450479233227, "grad_norm": 0.5428271229133367, "learning_rate": 2.8271170099330415e-05, "loss": 0.4439, "step": 134 }, { "epoch": 1.293929712460064, "grad_norm": 0.6847831612932209, "learning_rate": 2.8066343088016105e-05, "loss": 0.441, "step": 135 }, { "epoch": 1.3035143769968052, "grad_norm": 0.49210190834226275, "learning_rate": 2.7860500633078475e-05, "loss": 0.4165, "step": 136 }, { "epoch": 1.3130990415335464, "grad_norm": 0.6253827417860478, "learning_rate": 2.7653668647301797e-05, "loss": 0.4177, "step": 137 }, { "epoch": 1.3226837060702876, "grad_norm": 0.5902351365403176, "learning_rate": 2.7445873168038906e-05, "loss": 0.418, "step": 138 }, { "epoch": 1.3322683706070286, "grad_norm": 0.5490945288774098, "learning_rate": 2.7237140353933445e-05, "loss": 0.4352, "step": 139 }, { "epoch": 1.34185303514377, "grad_norm": 0.6497645956557252, "learning_rate": 2.7027496481626858e-05, "loss": 0.442, "step": 140 }, { "epoch": 1.351437699680511, "grad_norm": 0.542052646410243, "learning_rate": 2.68169679424505e-05, "loss": 0.3954, "step": 141 }, { "epoch": 1.3610223642172525, "grad_norm": 0.6469294947936336, "learning_rate": 2.6605581239103347e-05, "loss": 0.4324, "step": 142 }, { "epoch": 1.3706070287539935, "grad_norm": 0.6114843722995497, "learning_rate": 2.6393362982315632e-05, "loss": 0.4086, "step": 143 }, { "epoch": 1.3801916932907348, "grad_norm": 0.5302481010581994, "learning_rate": 2.618033988749895e-05, "loss": 0.3797, "step": 144 }, { "epoch": 1.389776357827476, "grad_norm": 0.6290020988684742, "learning_rate": 2.5966538771383124e-05, "loss": 0.4616, "step": 145 }, { "epoch": 1.3993610223642172, "grad_norm": 0.4520856565609043, "learning_rate": 2.5751986548640345e-05, "loss": 0.3691, "step": 146 }, { "epoch": 1.4089456869009584, "grad_norm": 0.6229237189143199, "learning_rate": 2.5536710228496986e-05, "loss": 0.4428, "step": 147 }, { "epoch": 1.4185303514376997, "grad_norm": 0.5985663011286853, "learning_rate": 2.5320736911333503e-05, "loss": 0.4101, "step": 148 }, { "epoch": 1.428115015974441, "grad_norm": 0.5602348014424229, "learning_rate": 2.5104093785272854e-05, "loss": 0.3799, "step": 149 }, { "epoch": 1.4376996805111821, "grad_norm": 0.5489525751898331, "learning_rate": 2.4886808122757882e-05, "loss": 0.4414, "step": 150 }, { "epoch": 1.4472843450479234, "grad_norm": 0.624773105379871, "learning_rate": 2.4668907277118114e-05, "loss": 0.381, "step": 151 }, { "epoch": 1.4568690095846646, "grad_norm": 0.5426555057346395, "learning_rate": 2.445041867912629e-05, "loss": 0.4539, "step": 152 }, { "epoch": 1.4664536741214058, "grad_norm": 0.55536633578182, "learning_rate": 2.423136983354526e-05, "loss": 0.3802, "step": 153 }, { "epoch": 1.476038338658147, "grad_norm": 0.4416725668124843, "learning_rate": 2.401178831566546e-05, "loss": 0.3748, "step": 154 }, { "epoch": 1.4856230031948883, "grad_norm": 0.6230029207177754, "learning_rate": 2.379170176783357e-05, "loss": 0.4455, "step": 155 }, { "epoch": 1.4952076677316293, "grad_norm": 0.42530667418850676, "learning_rate": 2.3571137895972735e-05, "loss": 0.3917, "step": 156 }, { "epoch": 1.5047923322683707, "grad_norm": 0.5190777897196971, "learning_rate": 2.335012446609473e-05, "loss": 0.3944, "step": 157 }, { "epoch": 1.5143769968051117, "grad_norm": 0.46663489738662295, "learning_rate": 2.312868930080462e-05, "loss": 0.4337, "step": 158 }, { "epoch": 1.5239616613418532, "grad_norm": 0.4797855409160743, "learning_rate": 2.2906860275798257e-05, "loss": 0.3447, "step": 159 }, { "epoch": 1.5335463258785942, "grad_norm": 0.48961810463783534, "learning_rate": 2.2684665316353112e-05, "loss": 0.4036, "step": 160 }, { "epoch": 1.5431309904153354, "grad_norm": 0.5015403228340246, "learning_rate": 2.246213239381286e-05, "loss": 0.4494, "step": 161 }, { "epoch": 1.5527156549520766, "grad_norm": 0.49826419920046305, "learning_rate": 2.2239289522066157e-05, "loss": 0.3662, "step": 162 }, { "epoch": 1.5623003194888179, "grad_norm": 0.5982999217769431, "learning_rate": 2.201616475402009e-05, "loss": 0.4716, "step": 163 }, { "epoch": 1.571884984025559, "grad_norm": 0.4282708816156415, "learning_rate": 2.179278617806867e-05, "loss": 0.3435, "step": 164 }, { "epoch": 1.5814696485623003, "grad_norm": 0.4734658108408669, "learning_rate": 2.1569181914556904e-05, "loss": 0.4212, "step": 165 }, { "epoch": 1.5910543130990416, "grad_norm": 0.49859752482578384, "learning_rate": 2.1345380112240796e-05, "loss": 0.425, "step": 166 }, { "epoch": 1.6006389776357828, "grad_norm": 0.47157462374026493, "learning_rate": 2.1121408944743838e-05, "loss": 0.4507, "step": 167 }, { "epoch": 1.610223642172524, "grad_norm": 0.4248405831748351, "learning_rate": 2.08972966070103e-05, "loss": 0.3602, "step": 168 }, { "epoch": 1.619808306709265, "grad_norm": 0.5006569891631065, "learning_rate": 2.0673071311755885e-05, "loss": 0.4462, "step": 169 }, { "epoch": 1.6293929712460065, "grad_norm": 0.43452414054342864, "learning_rate": 2.0448761285916103e-05, "loss": 0.3398, "step": 170 }, { "epoch": 1.6389776357827475, "grad_norm": 0.5593870865885154, "learning_rate": 2.022439476709292e-05, "loss": 0.4578, "step": 171 }, { "epoch": 1.648562300319489, "grad_norm": 0.48660923787373483, "learning_rate": 2e-05, "loss": 0.402, "step": 172 }, { "epoch": 1.65814696485623, "grad_norm": 0.47042892474320597, "learning_rate": 1.9775605232907085e-05, "loss": 0.3732, "step": 173 }, { "epoch": 1.6677316293929714, "grad_norm": 0.4637788148192955, "learning_rate": 1.9551238714083903e-05, "loss": 0.4164, "step": 174 }, { "epoch": 1.6773162939297124, "grad_norm": 0.5061846620195124, "learning_rate": 1.932692868824413e-05, "loss": 0.375, "step": 175 }, { "epoch": 1.6869009584664538, "grad_norm": 0.4389361045425349, "learning_rate": 1.910270339298971e-05, "loss": 0.4262, "step": 176 }, { "epoch": 1.6964856230031948, "grad_norm": 0.4781379704434354, "learning_rate": 1.8878591055256165e-05, "loss": 0.4094, "step": 177 }, { "epoch": 1.706070287539936, "grad_norm": 0.4719818469093975, "learning_rate": 1.8654619887759207e-05, "loss": 0.4398, "step": 178 }, { "epoch": 1.7156549520766773, "grad_norm": 0.4401976778891567, "learning_rate": 1.8430818085443106e-05, "loss": 0.3973, "step": 179 }, { "epoch": 1.7252396166134185, "grad_norm": 0.5070123943848236, "learning_rate": 1.8207213821931332e-05, "loss": 0.4068, "step": 180 }, { "epoch": 1.7348242811501597, "grad_norm": 0.4165356789504617, "learning_rate": 1.7983835245979914e-05, "loss": 0.4368, "step": 181 }, { "epoch": 1.744408945686901, "grad_norm": 0.5166587940461813, "learning_rate": 1.7760710477933846e-05, "loss": 0.4043, "step": 182 }, { "epoch": 1.7539936102236422, "grad_norm": 0.48269705466364915, "learning_rate": 1.7537867606187145e-05, "loss": 0.4118, "step": 183 }, { "epoch": 1.7635782747603834, "grad_norm": 0.47756180477333937, "learning_rate": 1.7315334683646898e-05, "loss": 0.42, "step": 184 }, { "epoch": 1.7731629392971247, "grad_norm": 0.4765695567575237, "learning_rate": 1.7093139724201753e-05, "loss": 0.4012, "step": 185 }, { "epoch": 1.7827476038338657, "grad_norm": 0.4681108878215689, "learning_rate": 1.687131069919538e-05, "loss": 0.41, "step": 186 }, { "epoch": 1.792332268370607, "grad_norm": 0.46037561395697485, "learning_rate": 1.6649875533905276e-05, "loss": 0.4043, "step": 187 }, { "epoch": 1.8019169329073481, "grad_norm": 0.419707165268871, "learning_rate": 1.642886210402727e-05, "loss": 0.3914, "step": 188 }, { "epoch": 1.8115015974440896, "grad_norm": 0.4619831244311854, "learning_rate": 1.620829823216643e-05, "loss": 0.4228, "step": 189 }, { "epoch": 1.8210862619808306, "grad_norm": 0.4361667434307363, "learning_rate": 1.5988211684334548e-05, "loss": 0.4047, "step": 190 }, { "epoch": 1.830670926517572, "grad_norm": 0.46079446208057, "learning_rate": 1.5768630166454746e-05, "loss": 0.4362, "step": 191 }, { "epoch": 1.840255591054313, "grad_norm": 0.4550050932615957, "learning_rate": 1.5549581320873715e-05, "loss": 0.4118, "step": 192 }, { "epoch": 1.8498402555910545, "grad_norm": 0.39293704953570213, "learning_rate": 1.53310927228819e-05, "loss": 0.407, "step": 193 }, { "epoch": 1.8594249201277955, "grad_norm": 0.4352517300933002, "learning_rate": 1.5113191877242116e-05, "loss": 0.3869, "step": 194 }, { "epoch": 1.8690095846645367, "grad_norm": 0.41513949179810855, "learning_rate": 1.4895906214727149e-05, "loss": 0.3755, "step": 195 }, { "epoch": 1.878594249201278, "grad_norm": 0.422581641804126, "learning_rate": 1.46792630886665e-05, "loss": 0.3707, "step": 196 }, { "epoch": 1.8881789137380192, "grad_norm": 0.4315158978443658, "learning_rate": 1.4463289771503015e-05, "loss": 0.4135, "step": 197 }, { "epoch": 1.8977635782747604, "grad_norm": 0.4439035469190941, "learning_rate": 1.4248013451359657e-05, "loss": 0.4299, "step": 198 }, { "epoch": 1.9073482428115016, "grad_norm": 0.38114332649556365, "learning_rate": 1.403346122861688e-05, "loss": 0.3986, "step": 199 }, { "epoch": 1.9169329073482428, "grad_norm": 0.4562893320122979, "learning_rate": 1.3819660112501054e-05, "loss": 0.4447, "step": 200 }, { "epoch": 1.926517571884984, "grad_norm": 0.41109636995050963, "learning_rate": 1.3606637017684375e-05, "loss": 0.4231, "step": 201 }, { "epoch": 1.9361022364217253, "grad_norm": 0.4108514256627659, "learning_rate": 1.3394418760896665e-05, "loss": 0.3919, "step": 202 }, { "epoch": 1.9456869009584663, "grad_norm": 0.4403482836797348, "learning_rate": 1.31830320575495e-05, "loss": 0.4004, "step": 203 }, { "epoch": 1.9552715654952078, "grad_norm": 0.46058376777126453, "learning_rate": 1.2972503518373145e-05, "loss": 0.4271, "step": 204 }, { "epoch": 1.9648562300319488, "grad_norm": 0.4549563896458377, "learning_rate": 1.2762859646066561e-05, "loss": 0.4155, "step": 205 }, { "epoch": 1.9744408945686902, "grad_norm": 0.47102947639065273, "learning_rate": 1.2554126831961097e-05, "loss": 0.3947, "step": 206 }, { "epoch": 1.9840255591054312, "grad_norm": 0.43255866772480006, "learning_rate": 1.2346331352698206e-05, "loss": 0.3965, "step": 207 }, { "epoch": 1.9936102236421727, "grad_norm": 0.6055961125329625, "learning_rate": 1.213949936692153e-05, "loss": 0.4308, "step": 208 }, { "epoch": 2.0031948881789137, "grad_norm": 0.8201258360584602, "learning_rate": 1.1933656911983901e-05, "loss": 0.5536, "step": 209 }, { "epoch": 2.012779552715655, "grad_norm": 0.5116622937539458, "learning_rate": 1.1728829900669592e-05, "loss": 0.3256, "step": 210 }, { "epoch": 2.022364217252396, "grad_norm": 0.6977084341791524, "learning_rate": 1.1525044117932227e-05, "loss": 0.3275, "step": 211 }, { "epoch": 2.0319488817891376, "grad_norm": 0.639393679219998, "learning_rate": 1.132232521764884e-05, "loss": 0.3009, "step": 212 }, { "epoch": 2.0415335463258786, "grad_norm": 0.4890609363134351, "learning_rate": 1.1120698719390362e-05, "loss": 0.3387, "step": 213 }, { "epoch": 2.0511182108626196, "grad_norm": 0.5588000853620018, "learning_rate": 1.0920190005209066e-05, "loss": 0.3105, "step": 214 }, { "epoch": 2.060702875399361, "grad_norm": 0.5036006567654044, "learning_rate": 1.0720824316443321e-05, "loss": 0.2901, "step": 215 }, { "epoch": 2.070287539936102, "grad_norm": 0.5915822002117069, "learning_rate": 1.0522626750540029e-05, "loss": 0.3493, "step": 216 }, { "epoch": 2.0798722044728435, "grad_norm": 0.4214007118584772, "learning_rate": 1.0325622257895205e-05, "loss": 0.2985, "step": 217 }, { "epoch": 2.0894568690095845, "grad_norm": 0.5295101270711986, "learning_rate": 1.0129835638713064e-05, "loss": 0.3166, "step": 218 }, { "epoch": 2.099041533546326, "grad_norm": 0.4906090405908181, "learning_rate": 9.935291539884e-06, "loss": 0.2843, "step": 219 }, { "epoch": 2.108626198083067, "grad_norm": 0.3938053903767187, "learning_rate": 9.74201445188188e-06, "loss": 0.3163, "step": 220 }, { "epoch": 2.1182108626198084, "grad_norm": 0.5012038895473048, "learning_rate": 9.550028705681024e-06, "loss": 0.3222, "step": 221 }, { "epoch": 2.1277955271565494, "grad_norm": 0.4321669775385389, "learning_rate": 9.359358469693272e-06, "loss": 0.2909, "step": 222 }, { "epoch": 2.137380191693291, "grad_norm": 0.4117178329226722, "learning_rate": 9.170027746725487e-06, "loss": 0.3146, "step": 223 }, { "epoch": 2.146964856230032, "grad_norm": 0.3793388916643685, "learning_rate": 8.982060370957953e-06, "loss": 0.3017, "step": 224 }, { "epoch": 2.1565495207667733, "grad_norm": 0.4535610427107228, "learning_rate": 8.795480004943946e-06, "loss": 0.3456, "step": 225 }, { "epoch": 2.1661341853035143, "grad_norm": 0.37244536401873557, "learning_rate": 8.610310136630962e-06, "loss": 0.2761, "step": 226 }, { "epoch": 2.1757188498402558, "grad_norm": 0.3905670548801518, "learning_rate": 8.426574076403887e-06, "loss": 0.302, "step": 227 }, { "epoch": 2.1853035143769968, "grad_norm": 0.4359210822644894, "learning_rate": 8.24429495415054e-06, "loss": 0.338, "step": 228 }, { "epoch": 2.194888178913738, "grad_norm": 0.35920435170508774, "learning_rate": 8.063495716349929e-06, "loss": 0.3081, "step": 229 }, { "epoch": 2.2044728434504792, "grad_norm": 0.37621369347346134, "learning_rate": 7.884199123183604e-06, "loss": 0.3419, "step": 230 }, { "epoch": 2.2140575079872207, "grad_norm": 0.3458946393058899, "learning_rate": 7.706427745670458e-06, "loss": 0.2749, "step": 231 }, { "epoch": 2.2236421725239617, "grad_norm": 0.4279444858711213, "learning_rate": 7.530203962825331e-06, "loss": 0.3365, "step": 232 }, { "epoch": 2.2332268370607027, "grad_norm": 0.3940413928415214, "learning_rate": 7.355549958841808e-06, "loss": 0.3282, "step": 233 }, { "epoch": 2.242811501597444, "grad_norm": 0.39687410854343524, "learning_rate": 7.182487720299518e-06, "loss": 0.2943, "step": 234 }, { "epoch": 2.252396166134185, "grad_norm": 0.42285032823633234, "learning_rate": 7.01103903339633e-06, "loss": 0.3252, "step": 235 }, { "epoch": 2.2619808306709266, "grad_norm": 0.3730435843457758, "learning_rate": 6.841225481205749e-06, "loss": 0.2755, "step": 236 }, { "epoch": 2.2715654952076676, "grad_norm": 0.35400532023912695, "learning_rate": 6.6730684409599225e-06, "loss": 0.2962, "step": 237 }, { "epoch": 2.281150159744409, "grad_norm": 0.39975466800088416, "learning_rate": 6.5065890813585145e-06, "loss": 0.2951, "step": 238 }, { "epoch": 2.29073482428115, "grad_norm": 0.4291346122689877, "learning_rate": 6.3418083599038624e-06, "loss": 0.317, "step": 239 }, { "epoch": 2.3003194888178915, "grad_norm": 0.3209273222551871, "learning_rate": 6.178747020262708e-06, "loss": 0.2881, "step": 240 }, { "epoch": 2.3099041533546325, "grad_norm": 0.3305467433463152, "learning_rate": 6.017425589654853e-06, "loss": 0.2595, "step": 241 }, { "epoch": 2.319488817891374, "grad_norm": 0.38111354545103865, "learning_rate": 5.857864376269051e-06, "loss": 0.326, "step": 242 }, { "epoch": 2.329073482428115, "grad_norm": 0.30011533883033215, "learning_rate": 5.700083466706494e-06, "loss": 0.2744, "step": 243 }, { "epoch": 2.3386581469648564, "grad_norm": 0.3813259748607558, "learning_rate": 5.544102723452171e-06, "loss": 0.3105, "step": 244 }, { "epoch": 2.3482428115015974, "grad_norm": 0.35123991131982374, "learning_rate": 5.38994178237445e-06, "loss": 0.316, "step": 245 }, { "epoch": 2.357827476038339, "grad_norm": 0.32456708820534624, "learning_rate": 5.237620050253189e-06, "loss": 0.3226, "step": 246 }, { "epoch": 2.36741214057508, "grad_norm": 0.2976838087469898, "learning_rate": 5.087156702336689e-06, "loss": 0.2614, "step": 247 }, { "epoch": 2.376996805111821, "grad_norm": 0.36552843906148524, "learning_rate": 4.938570679927784e-06, "loss": 0.3123, "step": 248 }, { "epoch": 2.3865814696485623, "grad_norm": 0.35095503173458703, "learning_rate": 4.791880687999382e-06, "loss": 0.3123, "step": 249 }, { "epoch": 2.3961661341853033, "grad_norm": 0.30625640312787195, "learning_rate": 4.647105192839778e-06, "loss": 0.3176, "step": 250 }, { "epoch": 2.405750798722045, "grad_norm": 0.341761533375559, "learning_rate": 4.504262419727983e-06, "loss": 0.3144, "step": 251 }, { "epoch": 2.415335463258786, "grad_norm": 0.3533342249857026, "learning_rate": 4.363370350639405e-06, "loss": 0.34, "step": 252 }, { "epoch": 2.4249201277955272, "grad_norm": 0.34883281084308354, "learning_rate": 4.2244467219821806e-06, "loss": 0.2955, "step": 253 }, { "epoch": 2.4345047923322682, "grad_norm": 0.32463619347820316, "learning_rate": 4.087509022364382e-06, "loss": 0.283, "step": 254 }, { "epoch": 2.4440894568690097, "grad_norm": 0.33770338798840704, "learning_rate": 3.952574490392443e-06, "loss": 0.3177, "step": 255 }, { "epoch": 2.4536741214057507, "grad_norm": 0.3110029435092563, "learning_rate": 3.819660112501053e-06, "loss": 0.302, "step": 256 }, { "epoch": 2.463258785942492, "grad_norm": 0.30800590517974463, "learning_rate": 3.6887826208147968e-06, "loss": 0.302, "step": 257 }, { "epoch": 2.472843450479233, "grad_norm": 0.3232681352037157, "learning_rate": 3.5599584910418037e-06, "loss": 0.2646, "step": 258 }, { "epoch": 2.4824281150159746, "grad_norm": 0.36421296660907726, "learning_rate": 3.433203940399672e-06, "loss": 0.3084, "step": 259 }, { "epoch": 2.4920127795527156, "grad_norm": 0.3831030312811903, "learning_rate": 3.3085349255739475e-06, "loss": 0.3469, "step": 260 }, { "epoch": 2.501597444089457, "grad_norm": 0.3325867758576896, "learning_rate": 3.1859671407093984e-06, "loss": 0.3093, "step": 261 }, { "epoch": 2.511182108626198, "grad_norm": 0.337706654419708, "learning_rate": 3.0655160154343177e-06, "loss": 0.2979, "step": 262 }, { "epoch": 2.520766773162939, "grad_norm": 0.32967438884893885, "learning_rate": 2.947196712918157e-06, "loss": 0.3047, "step": 263 }, { "epoch": 2.5303514376996805, "grad_norm": 0.3233433151169985, "learning_rate": 2.8310241279626784e-06, "loss": 0.2969, "step": 264 }, { "epoch": 2.539936102236422, "grad_norm": 0.31301087906685726, "learning_rate": 2.7170128851269084e-06, "loss": 0.2981, "step": 265 }, { "epoch": 2.549520766773163, "grad_norm": 0.32786149056284464, "learning_rate": 2.6051773368860935e-06, "loss": 0.3308, "step": 266 }, { "epoch": 2.559105431309904, "grad_norm": 0.31611139290200363, "learning_rate": 2.4955315618249263e-06, "loss": 0.3134, "step": 267 }, { "epoch": 2.5686900958466454, "grad_norm": 0.3202278737140233, "learning_rate": 2.38808936286524e-06, "loss": 0.2988, "step": 268 }, { "epoch": 2.5782747603833864, "grad_norm": 0.3187169467092785, "learning_rate": 2.2828642655284038e-06, "loss": 0.275, "step": 269 }, { "epoch": 2.587859424920128, "grad_norm": 0.31150312167717303, "learning_rate": 2.1798695162326444e-06, "loss": 0.2913, "step": 270 }, { "epoch": 2.597444089456869, "grad_norm": 0.3199602869029221, "learning_rate": 2.0791180806254975e-06, "loss": 0.3142, "step": 271 }, { "epoch": 2.6070287539936103, "grad_norm": 0.2859711870729773, "learning_rate": 1.9806226419516195e-06, "loss": 0.2994, "step": 272 }, { "epoch": 2.6166134185303513, "grad_norm": 0.31034616192733694, "learning_rate": 1.8843955994561191e-06, "loss": 0.3301, "step": 273 }, { "epoch": 2.626198083067093, "grad_norm": 0.2911594390655468, "learning_rate": 1.790449066823683e-06, "loss": 0.3136, "step": 274 }, { "epoch": 2.635782747603834, "grad_norm": 0.31846174945632705, "learning_rate": 1.6987948706536038e-06, "loss": 0.3265, "step": 275 }, { "epoch": 2.6453674121405752, "grad_norm": 0.3091371745510634, "learning_rate": 1.6094445489709886e-06, "loss": 0.3116, "step": 276 }, { "epoch": 2.6549520766773163, "grad_norm": 0.28744483162457096, "learning_rate": 1.5224093497742654e-06, "loss": 0.3081, "step": 277 }, { "epoch": 2.6645367412140573, "grad_norm": 0.3010104381158104, "learning_rate": 1.4377002296192233e-06, "loss": 0.2914, "step": 278 }, { "epoch": 2.6741214057507987, "grad_norm": 0.3184703269379148, "learning_rate": 1.3553278522397162e-06, "loss": 0.3398, "step": 279 }, { "epoch": 2.68370607028754, "grad_norm": 0.3012745996881748, "learning_rate": 1.275302587205256e-06, "loss": 0.2895, "step": 280 }, { "epoch": 2.693290734824281, "grad_norm": 0.32944896697723586, "learning_rate": 1.1976345086156193e-06, "loss": 0.3205, "step": 281 }, { "epoch": 2.702875399361022, "grad_norm": 0.30329250730790236, "learning_rate": 1.1223333938326486e-06, "loss": 0.307, "step": 282 }, { "epoch": 2.7124600638977636, "grad_norm": 0.31477266554231703, "learning_rate": 1.0494087222494253e-06, "loss": 0.2977, "step": 283 }, { "epoch": 2.722044728434505, "grad_norm": 0.27218751575717026, "learning_rate": 9.788696740969295e-07, "loss": 0.2727, "step": 284 }, { "epoch": 2.731629392971246, "grad_norm": 0.2957347661302385, "learning_rate": 9.107251292883856e-07, "loss": 0.3241, "step": 285 }, { "epoch": 2.741214057507987, "grad_norm": 0.2754457767022288, "learning_rate": 8.44983666301391e-07, "loss": 0.2888, "step": 286 }, { "epoch": 2.7507987220447285, "grad_norm": 0.3009061607925657, "learning_rate": 7.816535610980103e-07, "loss": 0.3236, "step": 287 }, { "epoch": 2.7603833865814695, "grad_norm": 0.3042692164321764, "learning_rate": 7.207427860829352e-07, "loss": 0.3293, "step": 288 }, { "epoch": 2.769968051118211, "grad_norm": 0.3084699080559463, "learning_rate": 6.622590090998727e-07, "loss": 0.2896, "step": 289 }, { "epoch": 2.779552715654952, "grad_norm": 0.2874746995613232, "learning_rate": 6.062095924662625e-07, "loss": 0.2977, "step": 290 }, { "epoch": 2.7891373801916934, "grad_norm": 0.30432287110598966, "learning_rate": 5.526015920464689e-07, "loss": 0.3527, "step": 291 }, { "epoch": 2.7987220447284344, "grad_norm": 0.2752619163287045, "learning_rate": 5.014417563635276e-07, "loss": 0.2575, "step": 292 }, { "epoch": 2.8083067092651754, "grad_norm": 0.3000832642958543, "learning_rate": 4.5273652574961745e-07, "loss": 0.3026, "step": 293 }, { "epoch": 2.817891373801917, "grad_norm": 0.28459336557371145, "learning_rate": 4.064920315352905e-07, "loss": 0.2953, "step": 294 }, { "epoch": 2.8274760383386583, "grad_norm": 0.3403049604495616, "learning_rate": 3.62714095277632e-07, "loss": 0.3698, "step": 295 }, { "epoch": 2.8370607028753994, "grad_norm": 0.28503876136131184, "learning_rate": 3.214082280274067e-07, "loss": 0.2709, "step": 296 }, { "epoch": 2.8466453674121404, "grad_norm": 0.299085461383954, "learning_rate": 2.825796296352823e-07, "loss": 0.3268, "step": 297 }, { "epoch": 2.856230031948882, "grad_norm": 0.3016256794104278, "learning_rate": 2.462331880972468e-07, "loss": 0.3032, "step": 298 }, { "epoch": 2.8658146964856233, "grad_norm": 0.45115419737664014, "learning_rate": 2.123734789392673e-07, "loss": 0.3539, "step": 299 }, { "epoch": 2.8753993610223643, "grad_norm": 0.2661772363746791, "learning_rate": 1.81004764641306e-07, "loss": 0.2696, "step": 300 }, { "epoch": 2.8849840255591053, "grad_norm": 0.2901415907606709, "learning_rate": 1.5213099410071873e-07, "loss": 0.3199, "step": 301 }, { "epoch": 2.8945686900958467, "grad_norm": 0.29953435371037257, "learning_rate": 1.2575580213514792e-07, "loss": 0.3403, "step": 302 }, { "epoch": 2.9041533546325877, "grad_norm": 0.28555862760239925, "learning_rate": 1.0188250902495312e-07, "loss": 0.3211, "step": 303 }, { "epoch": 2.913738019169329, "grad_norm": 0.3026928327893977, "learning_rate": 8.051412009521864e-08, "loss": 0.3422, "step": 304 }, { "epoch": 2.92332268370607, "grad_norm": 0.27586171376383506, "learning_rate": 6.165332533744072e-08, "loss": 0.2635, "step": 305 }, { "epoch": 2.9329073482428116, "grad_norm": 0.2797989784483068, "learning_rate": 4.530249907087836e-08, "loss": 0.2942, "step": 306 }, { "epoch": 2.9424920127795526, "grad_norm": 0.32210333279055925, "learning_rate": 3.146369964366791e-08, "loss": 0.3829, "step": 307 }, { "epoch": 2.952076677316294, "grad_norm": 0.27055679198155164, "learning_rate": 2.0138669173708213e-08, "loss": 0.2838, "step": 308 }, { "epoch": 2.961661341853035, "grad_norm": 0.28816307097851795, "learning_rate": 1.1328833329333767e-08, "loss": 0.3031, "step": 309 }, { "epoch": 2.9712460063897765, "grad_norm": 0.28899005094993563, "learning_rate": 5.0353011498693875e-09, "loss": 0.3185, "step": 310 }, { "epoch": 2.9808306709265175, "grad_norm": 0.26574618706072956, "learning_rate": 1.2588649059885883e-09, "loss": 0.2968, "step": 311 }, { "epoch": 2.9904153354632586, "grad_norm": 0.28718668323400676, "learning_rate": 0.0, "loss": 0.3009, "step": 312 }, { "epoch": 2.9904153354632586, "step": 312, "total_flos": 4.4115474684667494e+17, "train_loss": 0.43267787887881964, "train_runtime": 10181.5896, "train_samples_per_second": 2.946, "train_steps_per_second": 0.031 } ], "logging_steps": 1.0, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.4115474684667494e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }