{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.942630185348632, "eval_steps": 500, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01412180052956752, "grad_norm": 5.979931009348192, "learning_rate": 2.285714285714286e-06, "loss": 0.8154, "step": 1 }, { "epoch": 0.02824360105913504, "grad_norm": 6.047301062181586, "learning_rate": 4.571428571428572e-06, "loss": 0.8243, "step": 2 }, { "epoch": 0.04236540158870256, "grad_norm": 5.589397434568418, "learning_rate": 6.857142857142858e-06, "loss": 0.8057, "step": 3 }, { "epoch": 0.05648720211827008, "grad_norm": 4.017126644109988, "learning_rate": 9.142857142857144e-06, "loss": 0.7602, "step": 4 }, { "epoch": 0.0706090026478376, "grad_norm": 2.1627060531795967, "learning_rate": 1.1428571428571429e-05, "loss": 0.7197, "step": 5 }, { "epoch": 0.08473080317740513, "grad_norm": 4.667143281538081, "learning_rate": 1.3714285714285716e-05, "loss": 0.7374, "step": 6 }, { "epoch": 0.09885260370697264, "grad_norm": 6.545492179001986, "learning_rate": 1.6000000000000003e-05, "loss": 0.7325, "step": 7 }, { "epoch": 0.11297440423654016, "grad_norm": 7.281711055430636, "learning_rate": 1.8285714285714288e-05, "loss": 0.7502, "step": 8 }, { "epoch": 0.12709620476610767, "grad_norm": 4.392674048119666, "learning_rate": 2.057142857142857e-05, "loss": 0.7051, "step": 9 }, { "epoch": 0.1412180052956752, "grad_norm": 2.9074563359884973, "learning_rate": 2.2857142857142858e-05, "loss": 0.6593, "step": 10 }, { "epoch": 0.1553398058252427, "grad_norm": 2.205323794009288, "learning_rate": 2.5142857142857143e-05, "loss": 0.6276, "step": 11 }, { "epoch": 0.16946160635481025, "grad_norm": 1.4150033586173336, "learning_rate": 2.742857142857143e-05, "loss": 0.6013, "step": 12 }, { "epoch": 0.18358340688437777, "grad_norm": 1.3251831618492345, "learning_rate": 2.9714285714285717e-05, "loss": 0.5859, "step": 13 }, { "epoch": 0.1977052074139453, "grad_norm": 3.1805546649918544, "learning_rate": 3.2000000000000005e-05, "loss": 0.5824, "step": 14 }, { "epoch": 0.2118270079435128, "grad_norm": 1.4989971887780031, "learning_rate": 3.4285714285714284e-05, "loss": 0.5708, "step": 15 }, { "epoch": 0.22594880847308033, "grad_norm": 1.203788524759605, "learning_rate": 3.6571428571428576e-05, "loss": 0.5563, "step": 16 }, { "epoch": 0.24007060900264784, "grad_norm": 1.3578810581727971, "learning_rate": 3.885714285714286e-05, "loss": 0.5566, "step": 17 }, { "epoch": 0.25419240953221534, "grad_norm": 0.8074275719021523, "learning_rate": 4.114285714285714e-05, "loss": 0.5507, "step": 18 }, { "epoch": 0.26831421006178285, "grad_norm": 1.1227844753364196, "learning_rate": 4.342857142857143e-05, "loss": 0.5418, "step": 19 }, { "epoch": 0.2824360105913504, "grad_norm": 0.8760389990351023, "learning_rate": 4.5714285714285716e-05, "loss": 0.5333, "step": 20 }, { "epoch": 0.2965578111209179, "grad_norm": 1.1881496989284341, "learning_rate": 4.8e-05, "loss": 0.5305, "step": 21 }, { "epoch": 0.3106796116504854, "grad_norm": 0.9681459485298871, "learning_rate": 5.0285714285714286e-05, "loss": 0.5189, "step": 22 }, { "epoch": 0.324801412180053, "grad_norm": 1.4344211614758422, "learning_rate": 5.257142857142858e-05, "loss": 0.5198, "step": 23 }, { "epoch": 0.3389232127096205, "grad_norm": 0.8492140987790524, "learning_rate": 5.485714285714286e-05, "loss": 0.5102, "step": 24 }, { "epoch": 0.353045013239188, "grad_norm": 1.2351656998878342, "learning_rate": 5.714285714285715e-05, "loss": 0.5098, "step": 25 }, { "epoch": 0.36716681376875554, "grad_norm": 0.898578091846962, "learning_rate": 5.9428571428571434e-05, "loss": 0.5015, "step": 26 }, { "epoch": 0.38128861429832306, "grad_norm": 1.5089247050740433, "learning_rate": 6.171428571428573e-05, "loss": 0.507, "step": 27 }, { "epoch": 0.3954104148278906, "grad_norm": 0.9864208925736987, "learning_rate": 6.400000000000001e-05, "loss": 0.5034, "step": 28 }, { "epoch": 0.4095322153574581, "grad_norm": 1.0206985743120736, "learning_rate": 6.62857142857143e-05, "loss": 0.4983, "step": 29 }, { "epoch": 0.4236540158870256, "grad_norm": 1.4229934179471342, "learning_rate": 6.857142857142857e-05, "loss": 0.508, "step": 30 }, { "epoch": 0.43777581641659313, "grad_norm": 0.9625460430091453, "learning_rate": 7.085714285714287e-05, "loss": 0.5016, "step": 31 }, { "epoch": 0.45189761694616065, "grad_norm": 1.1144628190568628, "learning_rate": 7.314285714285715e-05, "loss": 0.4939, "step": 32 }, { "epoch": 0.46601941747572817, "grad_norm": 0.9463549200929555, "learning_rate": 7.542857142857144e-05, "loss": 0.4924, "step": 33 }, { "epoch": 0.4801412180052957, "grad_norm": 0.8892598203382347, "learning_rate": 7.771428571428572e-05, "loss": 0.4902, "step": 34 }, { "epoch": 0.4942630185348632, "grad_norm": 0.9413854750510515, "learning_rate": 8e-05, "loss": 0.4852, "step": 35 }, { "epoch": 0.5083848190644307, "grad_norm": 0.9034630826375731, "learning_rate": 7.999801067823773e-05, "loss": 0.4853, "step": 36 }, { "epoch": 0.5225066195939982, "grad_norm": 1.2269858722315412, "learning_rate": 7.999204291082095e-05, "loss": 0.4764, "step": 37 }, { "epoch": 0.5366284201235657, "grad_norm": 0.9045227868888749, "learning_rate": 7.998209729134014e-05, "loss": 0.4809, "step": 38 }, { "epoch": 0.5507502206531333, "grad_norm": 1.2176978127037603, "learning_rate": 7.996817480904718e-05, "loss": 0.4719, "step": 39 }, { "epoch": 0.5648720211827007, "grad_norm": 1.2333419409501036, "learning_rate": 7.99502768487569e-05, "loss": 0.477, "step": 40 }, { "epoch": 0.5789938217122683, "grad_norm": 0.7483281237491166, "learning_rate": 7.99284051907094e-05, "loss": 0.4724, "step": 41 }, { "epoch": 0.5931156222418358, "grad_norm": 0.6151558817864059, "learning_rate": 7.990256201039297e-05, "loss": 0.4662, "step": 42 }, { "epoch": 0.6072374227714034, "grad_norm": 0.6814568571856022, "learning_rate": 7.987274987832764e-05, "loss": 0.4621, "step": 43 }, { "epoch": 0.6213592233009708, "grad_norm": 0.9240497143419791, "learning_rate": 7.983897175980957e-05, "loss": 0.4665, "step": 44 }, { "epoch": 0.6354810238305384, "grad_norm": 1.2034986971304784, "learning_rate": 7.980123101461606e-05, "loss": 0.4761, "step": 45 }, { "epoch": 0.649602824360106, "grad_norm": 0.8192788227089312, "learning_rate": 7.975953139667141e-05, "loss": 0.4652, "step": 46 }, { "epoch": 0.6637246248896734, "grad_norm": 0.7683306980890072, "learning_rate": 7.97138770536735e-05, "loss": 0.4619, "step": 47 }, { "epoch": 0.677846425419241, "grad_norm": 0.7567015766907312, "learning_rate": 7.966427252668121e-05, "loss": 0.4638, "step": 48 }, { "epoch": 0.6919682259488085, "grad_norm": 0.6846820764750615, "learning_rate": 7.961072274966282e-05, "loss": 0.4527, "step": 49 }, { "epoch": 0.706090026478376, "grad_norm": 0.7395598100512276, "learning_rate": 7.955323304900514e-05, "loss": 0.4571, "step": 50 }, { "epoch": 0.7202118270079435, "grad_norm": 0.5351232158771764, "learning_rate": 7.949180914298383e-05, "loss": 0.4496, "step": 51 }, { "epoch": 0.7343336275375111, "grad_norm": 0.6324424558337066, "learning_rate": 7.942645714119452e-05, "loss": 0.4593, "step": 52 }, { "epoch": 0.7484554280670785, "grad_norm": 0.547964739600884, "learning_rate": 7.93571835439452e-05, "loss": 0.4502, "step": 53 }, { "epoch": 0.7625772285966461, "grad_norm": 0.7115536296101671, "learning_rate": 7.928399524160956e-05, "loss": 0.447, "step": 54 }, { "epoch": 0.7766990291262136, "grad_norm": 0.742782814289987, "learning_rate": 7.920689951394175e-05, "loss": 0.4461, "step": 55 }, { "epoch": 0.7908208296557812, "grad_norm": 0.6862659469941464, "learning_rate": 7.912590402935223e-05, "loss": 0.4473, "step": 56 }, { "epoch": 0.8049426301853486, "grad_norm": 0.6235041641613883, "learning_rate": 7.904101684414498e-05, "loss": 0.4472, "step": 57 }, { "epoch": 0.8190644307149162, "grad_norm": 0.44600795869954046, "learning_rate": 7.895224640171625e-05, "loss": 0.4442, "step": 58 }, { "epoch": 0.8331862312444837, "grad_norm": 0.48251979778530707, "learning_rate": 7.88596015317147e-05, "loss": 0.4449, "step": 59 }, { "epoch": 0.8473080317740512, "grad_norm": 0.4787558150068957, "learning_rate": 7.876309144916312e-05, "loss": 0.4433, "step": 60 }, { "epoch": 0.8614298323036187, "grad_norm": 0.41827598666685606, "learning_rate": 7.86627257535419e-05, "loss": 0.4401, "step": 61 }, { "epoch": 0.8755516328331863, "grad_norm": 0.4724100749619687, "learning_rate": 7.855851442783414e-05, "loss": 0.4374, "step": 62 }, { "epoch": 0.8896734333627537, "grad_norm": 0.6571994588226032, "learning_rate": 7.845046783753276e-05, "loss": 0.4409, "step": 63 }, { "epoch": 0.9037952338923213, "grad_norm": 0.9369294338435781, "learning_rate": 7.833859672960943e-05, "loss": 0.4407, "step": 64 }, { "epoch": 0.9179170344218888, "grad_norm": 1.1601560447987704, "learning_rate": 7.822291223144564e-05, "loss": 0.4602, "step": 65 }, { "epoch": 0.9320388349514563, "grad_norm": 0.6934703654331164, "learning_rate": 7.810342584972585e-05, "loss": 0.4369, "step": 66 }, { "epoch": 0.9461606354810238, "grad_norm": 0.5791439547503463, "learning_rate": 7.798014946929306e-05, "loss": 0.4356, "step": 67 }, { "epoch": 0.9602824360105914, "grad_norm": 0.8373041828808443, "learning_rate": 7.785309535196657e-05, "loss": 0.4504, "step": 68 }, { "epoch": 0.9744042365401588, "grad_norm": 0.6796500376958069, "learning_rate": 7.772227613532242e-05, "loss": 0.4392, "step": 69 }, { "epoch": 0.9885260370697264, "grad_norm": 0.6686880597044009, "learning_rate": 7.758770483143634e-05, "loss": 0.4474, "step": 70 }, { "epoch": 1.002647837599294, "grad_norm": 0.6901488338737102, "learning_rate": 7.74493948255895e-05, "loss": 0.5108, "step": 71 }, { "epoch": 1.0167696381288613, "grad_norm": 0.7139924415191212, "learning_rate": 7.730735987493711e-05, "loss": 0.4227, "step": 72 }, { "epoch": 1.030891438658429, "grad_norm": 0.7623382444431029, "learning_rate": 7.71616141071401e-05, "loss": 0.419, "step": 73 }, { "epoch": 1.0450132391879965, "grad_norm": 0.8179708530719029, "learning_rate": 7.701217201895987e-05, "loss": 0.4182, "step": 74 }, { "epoch": 1.059135039717564, "grad_norm": 0.6036364923611257, "learning_rate": 7.685904847481631e-05, "loss": 0.4147, "step": 75 }, { "epoch": 1.0732568402471314, "grad_norm": 0.5415944966587694, "learning_rate": 7.670225870530936e-05, "loss": 0.4192, "step": 76 }, { "epoch": 1.087378640776699, "grad_norm": 0.548496642769106, "learning_rate": 7.654181830570404e-05, "loss": 0.4193, "step": 77 }, { "epoch": 1.1015004413062666, "grad_norm": 0.4357435844414465, "learning_rate": 7.637774323437929e-05, "loss": 0.4126, "step": 78 }, { "epoch": 1.1156222418358341, "grad_norm": 0.5890851003105865, "learning_rate": 7.62100498112406e-05, "loss": 0.4193, "step": 79 }, { "epoch": 1.1297440423654015, "grad_norm": 0.5417176133106055, "learning_rate": 7.603875471609677e-05, "loss": 0.4069, "step": 80 }, { "epoch": 1.143865842894969, "grad_norm": 0.5234067170715418, "learning_rate": 7.586387498700084e-05, "loss": 0.4187, "step": 81 }, { "epoch": 1.1579876434245366, "grad_norm": 0.4795761329002007, "learning_rate": 7.568542801855535e-05, "loss": 0.4101, "step": 82 }, { "epoch": 1.1721094439541042, "grad_norm": 0.510485207368403, "learning_rate": 7.550343156018217e-05, "loss": 0.4074, "step": 83 }, { "epoch": 1.1862312444836718, "grad_norm": 0.5160993194955293, "learning_rate": 7.531790371435709e-05, "loss": 0.4105, "step": 84 }, { "epoch": 1.2003530450132391, "grad_norm": 0.6272135654421417, "learning_rate": 7.512886293480914e-05, "loss": 0.4131, "step": 85 }, { "epoch": 1.2144748455428067, "grad_norm": 0.7144516241332823, "learning_rate": 7.49363280246852e-05, "loss": 0.4123, "step": 86 }, { "epoch": 1.2285966460723743, "grad_norm": 1.0197175196301183, "learning_rate": 7.474031813467956e-05, "loss": 0.4199, "step": 87 }, { "epoch": 1.2427184466019416, "grad_norm": 0.9885970877399597, "learning_rate": 7.454085276112925e-05, "loss": 0.4152, "step": 88 }, { "epoch": 1.2568402471315092, "grad_norm": 0.5518795345815659, "learning_rate": 7.433795174407465e-05, "loss": 0.4064, "step": 89 }, { "epoch": 1.2709620476610768, "grad_norm": 0.42697954065556326, "learning_rate": 7.413163526528623e-05, "loss": 0.409, "step": 90 }, { "epoch": 1.2850838481906444, "grad_norm": 0.698380780251885, "learning_rate": 7.392192384625704e-05, "loss": 0.4054, "step": 91 }, { "epoch": 1.299205648720212, "grad_norm": 0.6703174317830842, "learning_rate": 7.370883834616157e-05, "loss": 0.4099, "step": 92 }, { "epoch": 1.3133274492497793, "grad_norm": 0.3951173073488556, "learning_rate": 7.349239995978095e-05, "loss": 0.4084, "step": 93 }, { "epoch": 1.3274492497793469, "grad_norm": 0.43174109319559356, "learning_rate": 7.327263021539478e-05, "loss": 0.4048, "step": 94 }, { "epoch": 1.3415710503089144, "grad_norm": 0.5360712514545947, "learning_rate": 7.30495509726398e-05, "loss": 0.4068, "step": 95 }, { "epoch": 1.3556928508384818, "grad_norm": 0.42774436448586106, "learning_rate": 7.282318442033567e-05, "loss": 0.4034, "step": 96 }, { "epoch": 1.3698146513680494, "grad_norm": 0.5210499488927217, "learning_rate": 7.259355307427781e-05, "loss": 0.4078, "step": 97 }, { "epoch": 1.383936451897617, "grad_norm": 0.7093148406292331, "learning_rate": 7.236067977499791e-05, "loss": 0.4084, "step": 98 }, { "epoch": 1.3980582524271845, "grad_norm": 0.8196300420238753, "learning_rate": 7.212458768549208e-05, "loss": 0.4069, "step": 99 }, { "epoch": 1.412180052956752, "grad_norm": 0.9973540383790642, "learning_rate": 7.188530028891691e-05, "loss": 0.4047, "step": 100 }, { "epoch": 1.4263018534863194, "grad_norm": 1.0704815886198962, "learning_rate": 7.164284138625367e-05, "loss": 0.4075, "step": 101 }, { "epoch": 1.440423654015887, "grad_norm": 0.5848553932345868, "learning_rate": 7.13972350939409e-05, "loss": 0.4036, "step": 102 }, { "epoch": 1.4545454545454546, "grad_norm": 0.37289550464762866, "learning_rate": 7.114850584147577e-05, "loss": 0.4068, "step": 103 }, { "epoch": 1.468667255075022, "grad_norm": 0.6651429035225815, "learning_rate": 7.089667836898399e-05, "loss": 0.4053, "step": 104 }, { "epoch": 1.4827890556045895, "grad_norm": 0.6931193008736451, "learning_rate": 7.064177772475912e-05, "loss": 0.4002, "step": 105 }, { "epoch": 1.496910856134157, "grad_norm": 0.3938085941153356, "learning_rate": 7.038382926277113e-05, "loss": 0.4013, "step": 106 }, { "epoch": 1.5110326566637247, "grad_norm": 0.410899316731272, "learning_rate": 7.012285864014445e-05, "loss": 0.404, "step": 107 }, { "epoch": 1.5251544571932922, "grad_norm": 0.5933306150673846, "learning_rate": 6.985889181460602e-05, "loss": 0.3992, "step": 108 }, { "epoch": 1.5392762577228596, "grad_norm": 0.47465582200581674, "learning_rate": 6.959195504190337e-05, "loss": 0.4022, "step": 109 }, { "epoch": 1.5533980582524272, "grad_norm": 0.29047076547162964, "learning_rate": 6.932207487319305e-05, "loss": 0.3933, "step": 110 }, { "epoch": 1.5675198587819947, "grad_norm": 0.3955673661524972, "learning_rate": 6.904927815239972e-05, "loss": 0.4014, "step": 111 }, { "epoch": 1.581641659311562, "grad_norm": 0.4729958849916794, "learning_rate": 6.877359201354606e-05, "loss": 0.4014, "step": 112 }, { "epoch": 1.5957634598411299, "grad_norm": 0.3117935062342313, "learning_rate": 6.84950438780538e-05, "loss": 0.4021, "step": 113 }, { "epoch": 1.6098852603706972, "grad_norm": 0.29707807435124145, "learning_rate": 6.821366145201636e-05, "loss": 0.4003, "step": 114 }, { "epoch": 1.6240070609002648, "grad_norm": 0.43753418225532925, "learning_rate": 6.792947272344292e-05, "loss": 0.3992, "step": 115 }, { "epoch": 1.6381288614298324, "grad_norm": 0.27791625901461003, "learning_rate": 6.76425059594746e-05, "loss": 0.3982, "step": 116 }, { "epoch": 1.6522506619593997, "grad_norm": 0.2525501356536547, "learning_rate": 6.73527897035728e-05, "loss": 0.4053, "step": 117 }, { "epoch": 1.6663724624889673, "grad_norm": 0.3669470139311434, "learning_rate": 6.706035277268022e-05, "loss": 0.4024, "step": 118 }, { "epoch": 1.6804942630185349, "grad_norm": 0.30825865476024705, "learning_rate": 6.676522425435433e-05, "loss": 0.3945, "step": 119 }, { "epoch": 1.6946160635481022, "grad_norm": 0.28018108144253323, "learning_rate": 6.646743350387438e-05, "loss": 0.3984, "step": 120 }, { "epoch": 1.70873786407767, "grad_norm": 0.30987982118204843, "learning_rate": 6.616701014132138e-05, "loss": 0.4021, "step": 121 }, { "epoch": 1.7228596646072374, "grad_norm": 0.3633571089136772, "learning_rate": 6.586398404863198e-05, "loss": 0.4026, "step": 122 }, { "epoch": 1.736981465136805, "grad_norm": 0.36013913213670684, "learning_rate": 6.555838536662624e-05, "loss": 0.3925, "step": 123 }, { "epoch": 1.7511032656663725, "grad_norm": 0.36709693358558493, "learning_rate": 6.525024449200956e-05, "loss": 0.3976, "step": 124 }, { "epoch": 1.7652250661959399, "grad_norm": 0.44695442666677676, "learning_rate": 6.493959207434934e-05, "loss": 0.3982, "step": 125 }, { "epoch": 1.7793468667255075, "grad_norm": 0.4500722428050271, "learning_rate": 6.462645901302633e-05, "loss": 0.3947, "step": 126 }, { "epoch": 1.793468667255075, "grad_norm": 0.39821702121821073, "learning_rate": 6.431087645416121e-05, "loss": 0.4015, "step": 127 }, { "epoch": 1.8075904677846424, "grad_norm": 0.42798393839154475, "learning_rate": 6.399287578751656e-05, "loss": 0.3959, "step": 128 }, { "epoch": 1.8217122683142102, "grad_norm": 0.4978207058435827, "learning_rate": 6.367248864337471e-05, "loss": 0.3975, "step": 129 }, { "epoch": 1.8358340688437775, "grad_norm": 0.4727933665511357, "learning_rate": 6.334974688939161e-05, "loss": 0.3961, "step": 130 }, { "epoch": 1.849955869373345, "grad_norm": 0.30157997491072186, "learning_rate": 6.302468262742695e-05, "loss": 0.3923, "step": 131 }, { "epoch": 1.8640776699029127, "grad_norm": 0.30111696128126747, "learning_rate": 6.269732819035128e-05, "loss": 0.3895, "step": 132 }, { "epoch": 1.87819947043248, "grad_norm": 0.33866239621320493, "learning_rate": 6.236771613882987e-05, "loss": 0.3933, "step": 133 }, { "epoch": 1.8923212709620476, "grad_norm": 0.2963866045397337, "learning_rate": 6.20358792580841e-05, "loss": 0.3865, "step": 134 }, { "epoch": 1.9064430714916152, "grad_norm": 0.2821832464959724, "learning_rate": 6.170185055463039e-05, "loss": 0.3985, "step": 135 }, { "epoch": 1.9205648720211828, "grad_norm": 0.26513081199542754, "learning_rate": 6.136566325299715e-05, "loss": 0.3972, "step": 136 }, { "epoch": 1.9346866725507503, "grad_norm": 0.25040847849987535, "learning_rate": 6.102735079242019e-05, "loss": 0.398, "step": 137 }, { "epoch": 1.9488084730803177, "grad_norm": 0.303971008854815, "learning_rate": 6.068694682351651e-05, "loss": 0.3957, "step": 138 }, { "epoch": 1.9629302736098853, "grad_norm": 0.2610849344447032, "learning_rate": 6.0344485204937274e-05, "loss": 0.3953, "step": 139 }, { "epoch": 1.9770520741394528, "grad_norm": 0.24540141466965165, "learning_rate": 6.000000000000001e-05, "loss": 0.3955, "step": 140 }, { "epoch": 1.9911738746690202, "grad_norm": 0.2807585102662493, "learning_rate": 5.965352547330046e-05, "loss": 0.4096, "step": 141 }, { "epoch": 2.005295675198588, "grad_norm": 0.4269953277008037, "learning_rate": 5.930509608730444e-05, "loss": 0.4441, "step": 142 }, { "epoch": 2.0194174757281553, "grad_norm": 0.5672907609303462, "learning_rate": 5.895474649891995e-05, "loss": 0.3728, "step": 143 }, { "epoch": 2.0335392762577227, "grad_norm": 0.7266748405757633, "learning_rate": 5.860251155605003e-05, "loss": 0.3745, "step": 144 }, { "epoch": 2.0476610767872905, "grad_norm": 1.0069160934332146, "learning_rate": 5.824842629412653e-05, "loss": 0.3832, "step": 145 }, { "epoch": 2.061782877316858, "grad_norm": 0.9424187541004289, "learning_rate": 5.7892525932625305e-05, "loss": 0.3779, "step": 146 }, { "epoch": 2.0759046778464256, "grad_norm": 0.42574191446629944, "learning_rate": 5.75348458715631e-05, "loss": 0.3718, "step": 147 }, { "epoch": 2.090026478375993, "grad_norm": 0.7408316783846461, "learning_rate": 5.7175421687976374e-05, "loss": 0.3699, "step": 148 }, { "epoch": 2.1041482789055603, "grad_norm": 0.6750908749341442, "learning_rate": 5.681428913238263e-05, "loss": 0.367, "step": 149 }, { "epoch": 2.118270079435128, "grad_norm": 0.486610272879909, "learning_rate": 5.645148412522447e-05, "loss": 0.3752, "step": 150 }, { "epoch": 2.1323918799646955, "grad_norm": 0.5306866815139071, "learning_rate": 5.60870427532967e-05, "loss": 0.3657, "step": 151 }, { "epoch": 2.146513680494263, "grad_norm": 0.4884339447717486, "learning_rate": 5.572100126615695e-05, "loss": 0.3701, "step": 152 }, { "epoch": 2.1606354810238306, "grad_norm": 0.39396923901380754, "learning_rate": 5.535339607252003e-05, "loss": 0.364, "step": 153 }, { "epoch": 2.174757281553398, "grad_norm": 0.3784748162116266, "learning_rate": 5.4984263736636494e-05, "loss": 0.3641, "step": 154 }, { "epoch": 2.1888790820829658, "grad_norm": 0.38537106208995364, "learning_rate": 5.461364097465581e-05, "loss": 0.3634, "step": 155 }, { "epoch": 2.203000882612533, "grad_norm": 0.33639666599879814, "learning_rate": 5.424156465097428e-05, "loss": 0.3676, "step": 156 }, { "epoch": 2.2171226831421005, "grad_norm": 0.3286791724075738, "learning_rate": 5.38680717745683e-05, "loss": 0.3649, "step": 157 }, { "epoch": 2.2312444836716683, "grad_norm": 0.3129994921836922, "learning_rate": 5.349319949531321e-05, "loss": 0.3646, "step": 158 }, { "epoch": 2.2453662842012356, "grad_norm": 0.3031016329231297, "learning_rate": 5.3116985100288185e-05, "loss": 0.3682, "step": 159 }, { "epoch": 2.259488084730803, "grad_norm": 0.27541410223019297, "learning_rate": 5.2739466010067385e-05, "loss": 0.3606, "step": 160 }, { "epoch": 2.2736098852603708, "grad_norm": 0.36257443661095795, "learning_rate": 5.23606797749979e-05, "loss": 0.3638, "step": 161 }, { "epoch": 2.287731685789938, "grad_norm": 0.2567579985831816, "learning_rate": 5.1980664071464776e-05, "loss": 0.3667, "step": 162 }, { "epoch": 2.301853486319506, "grad_norm": 0.31190867351244567, "learning_rate": 5.159945669814345e-05, "loss": 0.3696, "step": 163 }, { "epoch": 2.3159752868490733, "grad_norm": 0.2937382011800516, "learning_rate": 5.121709557224011e-05, "loss": 0.3606, "step": 164 }, { "epoch": 2.3300970873786406, "grad_norm": 0.2256249267158452, "learning_rate": 5.0833618725720214e-05, "loss": 0.365, "step": 165 }, { "epoch": 2.3442188879082084, "grad_norm": 0.298331814145165, "learning_rate": 5.044906430152554e-05, "loss": 0.3667, "step": 166 }, { "epoch": 2.358340688437776, "grad_norm": 0.18895739371171252, "learning_rate": 5.006347054978035e-05, "loss": 0.3699, "step": 167 }, { "epoch": 2.3724624889673436, "grad_norm": 0.25034317840687215, "learning_rate": 4.967687582398671e-05, "loss": 0.3587, "step": 168 }, { "epoch": 2.386584289496911, "grad_norm": 0.17907966208059622, "learning_rate": 4.9289318577209706e-05, "loss": 0.3636, "step": 169 }, { "epoch": 2.4007060900264783, "grad_norm": 0.21210095036882018, "learning_rate": 4.890083735825258e-05, "loss": 0.3605, "step": 170 }, { "epoch": 2.414827890556046, "grad_norm": 0.16489305774518265, "learning_rate": 4.851147080782249e-05, "loss": 0.3648, "step": 171 }, { "epoch": 2.4289496910856134, "grad_norm": 0.19143993377462817, "learning_rate": 4.812125765468705e-05, "loss": 0.3606, "step": 172 }, { "epoch": 2.443071491615181, "grad_norm": 0.17804983590295367, "learning_rate": 4.773023671182213e-05, "loss": 0.3637, "step": 173 }, { "epoch": 2.4571932921447486, "grad_norm": 0.16707259472270428, "learning_rate": 4.73384468725513e-05, "loss": 0.3636, "step": 174 }, { "epoch": 2.471315092674316, "grad_norm": 0.17481885632199456, "learning_rate": 4.694592710667723e-05, "loss": 0.3645, "step": 175 }, { "epoch": 2.4854368932038833, "grad_norm": 0.1681053608116463, "learning_rate": 4.6552716456605514e-05, "loss": 0.3605, "step": 176 }, { "epoch": 2.499558693733451, "grad_norm": 0.14964611415536702, "learning_rate": 4.615885403346134e-05, "loss": 0.3562, "step": 177 }, { "epoch": 2.5136804942630184, "grad_norm": 0.14164675176141614, "learning_rate": 4.576437901319921e-05, "loss": 0.3636, "step": 178 }, { "epoch": 2.5278022947925862, "grad_norm": 0.16548274190466053, "learning_rate": 4.5369330632706223e-05, "loss": 0.3648, "step": 179 }, { "epoch": 2.5419240953221536, "grad_norm": 0.15269683467677936, "learning_rate": 4.4973748185899416e-05, "loss": 0.3612, "step": 180 }, { "epoch": 2.556045895851721, "grad_norm": 0.16869434151649507, "learning_rate": 4.457767101981728e-05, "loss": 0.3677, "step": 181 }, { "epoch": 2.5701676963812887, "grad_norm": 0.13337265767063033, "learning_rate": 4.418113853070614e-05, "loss": 0.3626, "step": 182 }, { "epoch": 2.584289496910856, "grad_norm": 0.14682144236789746, "learning_rate": 4.378419016010149e-05, "loss": 0.364, "step": 183 }, { "epoch": 2.598411297440424, "grad_norm": 0.150937900490833, "learning_rate": 4.338686539090493e-05, "loss": 0.3615, "step": 184 }, { "epoch": 2.6125330979699912, "grad_norm": 0.1341377364551312, "learning_rate": 4.298920374345698e-05, "loss": 0.3596, "step": 185 }, { "epoch": 2.6266548984995586, "grad_norm": 0.15572962430762588, "learning_rate": 4.259124477160607e-05, "loss": 0.3625, "step": 186 }, { "epoch": 2.6407766990291264, "grad_norm": 0.1475404012486826, "learning_rate": 4.219302805877441e-05, "loss": 0.3617, "step": 187 }, { "epoch": 2.6548984995586937, "grad_norm": 0.1781262720167099, "learning_rate": 4.17945932140206e-05, "loss": 0.3666, "step": 188 }, { "epoch": 2.6690203000882615, "grad_norm": 0.13824587532461255, "learning_rate": 4.139597986810005e-05, "loss": 0.3629, "step": 189 }, { "epoch": 2.683142100617829, "grad_norm": 0.15963593698467365, "learning_rate": 4.0997227669522924e-05, "loss": 0.3628, "step": 190 }, { "epoch": 2.6972639011473962, "grad_norm": 0.12511715922220792, "learning_rate": 4.059837628061055e-05, "loss": 0.3638, "step": 191 }, { "epoch": 2.7113857016769636, "grad_norm": 0.15752313446706914, "learning_rate": 4.019946537355033e-05, "loss": 0.3614, "step": 192 }, { "epoch": 2.7255075022065314, "grad_norm": 0.13647413322377422, "learning_rate": 3.9800534626449683e-05, "loss": 0.3634, "step": 193 }, { "epoch": 2.7396293027360987, "grad_norm": 0.13525074863232164, "learning_rate": 3.940162371938947e-05, "loss": 0.3587, "step": 194 }, { "epoch": 2.7537511032656665, "grad_norm": 0.13297285710552217, "learning_rate": 3.9002772330477096e-05, "loss": 0.3599, "step": 195 }, { "epoch": 2.767872903795234, "grad_norm": 0.14225004712058384, "learning_rate": 3.860402013189998e-05, "loss": 0.3575, "step": 196 }, { "epoch": 2.7819947043248012, "grad_norm": 0.13373630438071715, "learning_rate": 3.820540678597942e-05, "loss": 0.3648, "step": 197 }, { "epoch": 2.796116504854369, "grad_norm": 0.12615478953418785, "learning_rate": 3.78069719412256e-05, "loss": 0.3609, "step": 198 }, { "epoch": 2.8102383053839364, "grad_norm": 0.12669967225071216, "learning_rate": 3.740875522839393e-05, "loss": 0.3608, "step": 199 }, { "epoch": 2.824360105913504, "grad_norm": 0.13635382545910668, "learning_rate": 3.7010796256543034e-05, "loss": 0.3549, "step": 200 }, { "epoch": 2.8384819064430715, "grad_norm": 0.11546629160995592, "learning_rate": 3.661313460909507e-05, "loss": 0.3593, "step": 201 }, { "epoch": 2.852603706972639, "grad_norm": 0.12139128794186867, "learning_rate": 3.621580983989852e-05, "loss": 0.3608, "step": 202 }, { "epoch": 2.8667255075022067, "grad_norm": 0.12319344865206981, "learning_rate": 3.581886146929387e-05, "loss": 0.3605, "step": 203 }, { "epoch": 2.880847308031774, "grad_norm": 0.14742473593815408, "learning_rate": 3.542232898018273e-05, "loss": 0.3582, "step": 204 }, { "epoch": 2.894969108561342, "grad_norm": 0.11086460953888361, "learning_rate": 3.5026251814100604e-05, "loss": 0.359, "step": 205 }, { "epoch": 2.909090909090909, "grad_norm": 0.13533789741325936, "learning_rate": 3.4630669367293797e-05, "loss": 0.3562, "step": 206 }, { "epoch": 2.9232127096204765, "grad_norm": 0.11573276006772669, "learning_rate": 3.4235620986800806e-05, "loss": 0.3641, "step": 207 }, { "epoch": 2.937334510150044, "grad_norm": 0.12838446326005826, "learning_rate": 3.384114596653866e-05, "loss": 0.361, "step": 208 }, { "epoch": 2.9514563106796117, "grad_norm": 0.12304575149956651, "learning_rate": 3.344728354339449e-05, "loss": 0.3586, "step": 209 }, { "epoch": 2.965578111209179, "grad_norm": 0.12773291501034634, "learning_rate": 3.305407289332279e-05, "loss": 0.3559, "step": 210 }, { "epoch": 2.979699911738747, "grad_norm": 0.16335068209235123, "learning_rate": 3.266155312744871e-05, "loss": 0.3631, "step": 211 }, { "epoch": 2.993821712268314, "grad_norm": 0.1186978138033666, "learning_rate": 3.226976328817788e-05, "loss": 0.3927, "step": 212 }, { "epoch": 3.0079435127978815, "grad_norm": 0.16211984652497452, "learning_rate": 3.187874234531296e-05, "loss": 0.3822, "step": 213 }, { "epoch": 3.0220653133274493, "grad_norm": 0.14214772364476422, "learning_rate": 3.1488529192177526e-05, "loss": 0.3393, "step": 214 }, { "epoch": 3.0361871138570167, "grad_norm": 0.13255124874063956, "learning_rate": 3.109916264174743e-05, "loss": 0.3373, "step": 215 }, { "epoch": 3.0503089143865845, "grad_norm": 0.16606000923059963, "learning_rate": 3.071068142279031e-05, "loss": 0.3371, "step": 216 }, { "epoch": 3.064430714916152, "grad_norm": 0.14657630327267304, "learning_rate": 3.0323124176013297e-05, "loss": 0.3355, "step": 217 }, { "epoch": 3.078552515445719, "grad_norm": 0.1341605905929287, "learning_rate": 2.993652945021966e-05, "loss": 0.3377, "step": 218 }, { "epoch": 3.092674315975287, "grad_norm": 0.14490108611743277, "learning_rate": 2.955093569847447e-05, "loss": 0.3366, "step": 219 }, { "epoch": 3.1067961165048543, "grad_norm": 0.13919821523407064, "learning_rate": 2.9166381274279803e-05, "loss": 0.3312, "step": 220 }, { "epoch": 3.120917917034422, "grad_norm": 0.16300975058477254, "learning_rate": 2.8782904427759898e-05, "loss": 0.3311, "step": 221 }, { "epoch": 3.1350397175639895, "grad_norm": 0.1183225077661534, "learning_rate": 2.8400543301856553e-05, "loss": 0.3282, "step": 222 }, { "epoch": 3.149161518093557, "grad_norm": 0.14092204872317698, "learning_rate": 2.8019335928535234e-05, "loss": 0.3297, "step": 223 }, { "epoch": 3.1632833186231246, "grad_norm": 0.1282390396455681, "learning_rate": 2.7639320225002108e-05, "loss": 0.327, "step": 224 }, { "epoch": 3.177405119152692, "grad_norm": 0.12936573725572997, "learning_rate": 2.7260533989932628e-05, "loss": 0.3346, "step": 225 }, { "epoch": 3.1915269196822593, "grad_norm": 0.11727309920196596, "learning_rate": 2.688301489971183e-05, "loss": 0.3271, "step": 226 }, { "epoch": 3.205648720211827, "grad_norm": 0.12274146196879084, "learning_rate": 2.6506800504686806e-05, "loss": 0.328, "step": 227 }, { "epoch": 3.2197705207413945, "grad_norm": 0.11029811005681434, "learning_rate": 2.6131928225431713e-05, "loss": 0.33, "step": 228 }, { "epoch": 3.233892321270962, "grad_norm": 0.12463320131443856, "learning_rate": 2.575843534902573e-05, "loss": 0.3358, "step": 229 }, { "epoch": 3.2480141218005296, "grad_norm": 0.11256203223325899, "learning_rate": 2.53863590253442e-05, "loss": 0.3364, "step": 230 }, { "epoch": 3.262135922330097, "grad_norm": 0.10841743259905046, "learning_rate": 2.501573626336352e-05, "loss": 0.3337, "step": 231 }, { "epoch": 3.2762577228596648, "grad_norm": 0.11593566286716334, "learning_rate": 2.464660392747999e-05, "loss": 0.3301, "step": 232 }, { "epoch": 3.290379523389232, "grad_norm": 0.10969283000201786, "learning_rate": 2.427899873384306e-05, "loss": 0.332, "step": 233 }, { "epoch": 3.3045013239187995, "grad_norm": 0.12033857141829916, "learning_rate": 2.3912957246703305e-05, "loss": 0.3377, "step": 234 }, { "epoch": 3.3186231244483673, "grad_norm": 0.10210001952439796, "learning_rate": 2.3548515874775547e-05, "loss": 0.3297, "step": 235 }, { "epoch": 3.3327449249779346, "grad_norm": 0.12241287674636975, "learning_rate": 2.3185710867617387e-05, "loss": 0.3361, "step": 236 }, { "epoch": 3.3468667255075024, "grad_norm": 0.10969299118083352, "learning_rate": 2.2824578312023632e-05, "loss": 0.3322, "step": 237 }, { "epoch": 3.3609885260370698, "grad_norm": 0.12151530040465547, "learning_rate": 2.24651541284369e-05, "loss": 0.3361, "step": 238 }, { "epoch": 3.375110326566637, "grad_norm": 0.10631863902215113, "learning_rate": 2.210747406737469e-05, "loss": 0.3344, "step": 239 }, { "epoch": 3.389232127096205, "grad_norm": 0.11983276963310185, "learning_rate": 2.175157370587348e-05, "loss": 0.3324, "step": 240 }, { "epoch": 3.4033539276257723, "grad_norm": 0.10203118790788067, "learning_rate": 2.1397488443949985e-05, "loss": 0.3366, "step": 241 }, { "epoch": 3.4174757281553396, "grad_norm": 0.11460733945580791, "learning_rate": 2.1045253501080058e-05, "loss": 0.3335, "step": 242 }, { "epoch": 3.4315975286849074, "grad_norm": 0.10361959122829918, "learning_rate": 2.0694903912695574e-05, "loss": 0.3342, "step": 243 }, { "epoch": 3.4457193292144748, "grad_norm": 0.10602009006473866, "learning_rate": 2.0346474526699552e-05, "loss": 0.3343, "step": 244 }, { "epoch": 3.459841129744042, "grad_norm": 0.0981614565374733, "learning_rate": 2.0000000000000012e-05, "loss": 0.3342, "step": 245 }, { "epoch": 3.47396293027361, "grad_norm": 0.10563881070295801, "learning_rate": 1.9655514795062746e-05, "loss": 0.3317, "step": 246 }, { "epoch": 3.4880847308031773, "grad_norm": 0.0982393867459211, "learning_rate": 1.931305317648349e-05, "loss": 0.336, "step": 247 }, { "epoch": 3.502206531332745, "grad_norm": 0.10341107342114168, "learning_rate": 1.897264920757981e-05, "loss": 0.3329, "step": 248 }, { "epoch": 3.5163283318623124, "grad_norm": 0.1009205150822494, "learning_rate": 1.8634336747002853e-05, "loss": 0.3363, "step": 249 }, { "epoch": 3.5304501323918798, "grad_norm": 0.09562831286129422, "learning_rate": 1.829814944536963e-05, "loss": 0.3366, "step": 250 }, { "epoch": 3.5445719329214476, "grad_norm": 0.10055162803558056, "learning_rate": 1.7964120741915905e-05, "loss": 0.3359, "step": 251 }, { "epoch": 3.558693733451015, "grad_norm": 0.10362087580690618, "learning_rate": 1.7632283861170135e-05, "loss": 0.33, "step": 252 }, { "epoch": 3.5728155339805827, "grad_norm": 0.09578324331311534, "learning_rate": 1.7302671809648735e-05, "loss": 0.3336, "step": 253 }, { "epoch": 3.58693733451015, "grad_norm": 0.1021943484963981, "learning_rate": 1.6975317372573066e-05, "loss": 0.334, "step": 254 }, { "epoch": 3.6010591350397174, "grad_norm": 0.10104477227737499, "learning_rate": 1.6650253110608415e-05, "loss": 0.3352, "step": 255 }, { "epoch": 3.615180935569285, "grad_norm": 0.09719144111824624, "learning_rate": 1.6327511356625302e-05, "loss": 0.3339, "step": 256 }, { "epoch": 3.6293027360988526, "grad_norm": 0.10082549447043057, "learning_rate": 1.6007124212483453e-05, "loss": 0.3303, "step": 257 }, { "epoch": 3.6434245366284204, "grad_norm": 0.09855344501708733, "learning_rate": 1.5689123545838804e-05, "loss": 0.3319, "step": 258 }, { "epoch": 3.6575463371579877, "grad_norm": 0.10038693196972406, "learning_rate": 1.537354098697367e-05, "loss": 0.3285, "step": 259 }, { "epoch": 3.671668137687555, "grad_norm": 0.10993218050906065, "learning_rate": 1.5060407925650662e-05, "loss": 0.3346, "step": 260 }, { "epoch": 3.6857899382171224, "grad_norm": 0.09881058692426582, "learning_rate": 1.4749755507990449e-05, "loss": 0.3265, "step": 261 }, { "epoch": 3.69991173874669, "grad_norm": 0.11110424733317653, "learning_rate": 1.4441614633373773e-05, "loss": 0.3367, "step": 262 }, { "epoch": 3.7140335392762576, "grad_norm": 0.09507466207790345, "learning_rate": 1.413601595136802e-05, "loss": 0.335, "step": 263 }, { "epoch": 3.7281553398058254, "grad_norm": 0.10341229060389236, "learning_rate": 1.383298985867863e-05, "loss": 0.3324, "step": 264 }, { "epoch": 3.7422771403353927, "grad_norm": 0.09734360531860331, "learning_rate": 1.3532566496125634e-05, "loss": 0.3313, "step": 265 }, { "epoch": 3.75639894086496, "grad_norm": 0.09174570798780135, "learning_rate": 1.3234775745645684e-05, "loss": 0.3351, "step": 266 }, { "epoch": 3.770520741394528, "grad_norm": 0.10147835781586892, "learning_rate": 1.2939647227319791e-05, "loss": 0.3353, "step": 267 }, { "epoch": 3.784642541924095, "grad_norm": 0.09808246222031777, "learning_rate": 1.2647210296427197e-05, "loss": 0.3323, "step": 268 }, { "epoch": 3.798764342453663, "grad_norm": 0.09735163985861015, "learning_rate": 1.2357494040525416e-05, "loss": 0.3391, "step": 269 }, { "epoch": 3.8128861429832304, "grad_norm": 0.08930562493255255, "learning_rate": 1.2070527276557092e-05, "loss": 0.3327, "step": 270 }, { "epoch": 3.8270079435127977, "grad_norm": 0.09744814905553326, "learning_rate": 1.178633854798365e-05, "loss": 0.33, "step": 271 }, { "epoch": 3.8411297440423655, "grad_norm": 0.09183836496663382, "learning_rate": 1.1504956121946216e-05, "loss": 0.3317, "step": 272 }, { "epoch": 3.855251544571933, "grad_norm": 0.08801876422756064, "learning_rate": 1.1226407986453963e-05, "loss": 0.3294, "step": 273 }, { "epoch": 3.8693733451015007, "grad_norm": 0.08798928229950856, "learning_rate": 1.0950721847600282e-05, "loss": 0.3282, "step": 274 }, { "epoch": 3.883495145631068, "grad_norm": 0.09000845113363774, "learning_rate": 1.0677925126806956e-05, "loss": 0.335, "step": 275 }, { "epoch": 3.8976169461606354, "grad_norm": 0.09609952332604478, "learning_rate": 1.040804495809665e-05, "loss": 0.3352, "step": 276 }, { "epoch": 3.911738746690203, "grad_norm": 0.09426777621829556, "learning_rate": 1.0141108185393995e-05, "loss": 0.3307, "step": 277 }, { "epoch": 3.9258605472197705, "grad_norm": 0.08749576305220681, "learning_rate": 9.877141359855567e-06, "loss": 0.3316, "step": 278 }, { "epoch": 3.9399823477493383, "grad_norm": 0.08573388419725536, "learning_rate": 9.616170737228882e-06, "loss": 0.3301, "step": 279 }, { "epoch": 3.9541041482789057, "grad_norm": 0.08677743094561904, "learning_rate": 9.358222275240884e-06, "loss": 0.3309, "step": 280 }, { "epoch": 3.968225948808473, "grad_norm": 0.08456912932018501, "learning_rate": 9.103321631016024e-06, "loss": 0.3294, "step": 281 }, { "epoch": 3.9823477493380404, "grad_norm": 0.0892840459688823, "learning_rate": 8.851494158524242e-06, "loss": 0.3299, "step": 282 }, { "epoch": 3.996469549867608, "grad_norm": 0.09785834932292316, "learning_rate": 8.602764906059109e-06, "loss": 0.3734, "step": 283 }, { "epoch": 4.010591350397176, "grad_norm": 0.1159182382828669, "learning_rate": 8.35715861374636e-06, "loss": 0.3432, "step": 284 }, { "epoch": 4.024713150926743, "grad_norm": 0.11348869033645836, "learning_rate": 8.114699711083113e-06, "loss": 0.3187, "step": 285 }, { "epoch": 4.038834951456311, "grad_norm": 0.09626843456466473, "learning_rate": 7.875412314507942e-06, "loss": 0.3213, "step": 286 }, { "epoch": 4.052956751985878, "grad_norm": 0.0918806636447836, "learning_rate": 7.639320225002106e-06, "loss": 0.3169, "step": 287 }, { "epoch": 4.067078552515445, "grad_norm": 0.09514043448978982, "learning_rate": 7.406446925722211e-06, "loss": 0.3148, "step": 288 }, { "epoch": 4.081200353045014, "grad_norm": 0.10508295602012874, "learning_rate": 7.176815579664343e-06, "loss": 0.3132, "step": 289 }, { "epoch": 4.095322153574581, "grad_norm": 0.10091079365331981, "learning_rate": 6.950449027360213e-06, "loss": 0.3175, "step": 290 }, { "epoch": 4.109443954104148, "grad_norm": 0.0973346460822993, "learning_rate": 6.7273697846052515e-06, "loss": 0.3184, "step": 291 }, { "epoch": 4.123565754633716, "grad_norm": 0.09115379235697503, "learning_rate": 6.507600040219073e-06, "loss": 0.3164, "step": 292 }, { "epoch": 4.137687555163283, "grad_norm": 0.08901902718597547, "learning_rate": 6.291161653838434e-06, "loss": 0.3177, "step": 293 }, { "epoch": 4.151809355692851, "grad_norm": 0.09132299423316595, "learning_rate": 6.078076153742962e-06, "loss": 0.3131, "step": 294 }, { "epoch": 4.165931156222419, "grad_norm": 0.09543903005749907, "learning_rate": 5.868364734713776e-06, "loss": 0.3142, "step": 295 }, { "epoch": 4.180052956751986, "grad_norm": 0.09061531269851537, "learning_rate": 5.662048255925357e-06, "loss": 0.3204, "step": 296 }, { "epoch": 4.194174757281553, "grad_norm": 0.08551951038992002, "learning_rate": 5.459147238870768e-06, "loss": 0.3158, "step": 297 }, { "epoch": 4.208296557811121, "grad_norm": 0.08387425510980595, "learning_rate": 5.259681865320447e-06, "loss": 0.3194, "step": 298 }, { "epoch": 4.222418358340688, "grad_norm": 0.0901228464398898, "learning_rate": 5.063671975314814e-06, "loss": 0.3163, "step": 299 }, { "epoch": 4.236540158870256, "grad_norm": 0.08691256583540367, "learning_rate": 4.871137065190854e-06, "loss": 0.315, "step": 300 }, { "epoch": 4.250661959399824, "grad_norm": 0.0878527835574059, "learning_rate": 4.6820962856429205e-06, "loss": 0.3176, "step": 301 }, { "epoch": 4.264783759929391, "grad_norm": 0.0840437037057203, "learning_rate": 4.496568439817836e-06, "loss": 0.322, "step": 302 }, { "epoch": 4.278905560458958, "grad_norm": 0.08904988122589128, "learning_rate": 4.314571981444666e-06, "loss": 0.311, "step": 303 }, { "epoch": 4.293027360988526, "grad_norm": 0.08120215219780037, "learning_rate": 4.136125012999168e-06, "loss": 0.3203, "step": 304 }, { "epoch": 4.307149161518094, "grad_norm": 0.08522052695009742, "learning_rate": 3.961245283903239e-06, "loss": 0.3161, "step": 305 }, { "epoch": 4.321270962047661, "grad_norm": 0.08319753808748938, "learning_rate": 3.7899501887594102e-06, "loss": 0.315, "step": 306 }, { "epoch": 4.335392762577229, "grad_norm": 0.08198211403858394, "learning_rate": 3.622256765620713e-06, "loss": 0.3165, "step": 307 }, { "epoch": 4.349514563106796, "grad_norm": 0.07827444542073485, "learning_rate": 3.458181694295961e-06, "loss": 0.3114, "step": 308 }, { "epoch": 4.363636363636363, "grad_norm": 0.07827005931051699, "learning_rate": 3.297741294690644e-06, "loss": 0.3125, "step": 309 }, { "epoch": 4.3777581641659316, "grad_norm": 0.07833274350751808, "learning_rate": 3.140951525183691e-06, "loss": 0.3156, "step": 310 }, { "epoch": 4.391879964695499, "grad_norm": 0.08055700180528477, "learning_rate": 2.987827981040132e-06, "loss": 0.3144, "step": 311 }, { "epoch": 4.406001765225066, "grad_norm": 0.0799614180245514, "learning_rate": 2.8383858928598963e-06, "loss": 0.3157, "step": 312 }, { "epoch": 4.420123565754634, "grad_norm": 0.0722165779006397, "learning_rate": 2.692640125062895e-06, "loss": 0.3116, "step": 313 }, { "epoch": 4.434245366284201, "grad_norm": 0.07776220076295337, "learning_rate": 2.550605174410512e-06, "loss": 0.3206, "step": 314 }, { "epoch": 4.448367166813769, "grad_norm": 0.07577160557474086, "learning_rate": 2.4122951685636674e-06, "loss": 0.3119, "step": 315 }, { "epoch": 4.4624889673433366, "grad_norm": 0.07292199486310709, "learning_rate": 2.2777238646775768e-06, "loss": 0.314, "step": 316 }, { "epoch": 4.476610767872904, "grad_norm": 0.07321270589774292, "learning_rate": 2.14690464803343e-06, "loss": 0.3116, "step": 317 }, { "epoch": 4.490732568402471, "grad_norm": 0.07971761444372055, "learning_rate": 2.0198505307069462e-06, "loss": 0.3162, "step": 318 }, { "epoch": 4.504854368932039, "grad_norm": 0.0823725656624792, "learning_rate": 1.896574150274151e-06, "loss": 0.318, "step": 319 }, { "epoch": 4.518976169461606, "grad_norm": 0.07311612247681858, "learning_rate": 1.7770877685543687e-06, "loss": 0.3146, "step": 320 }, { "epoch": 4.533097969991174, "grad_norm": 0.0754285797360244, "learning_rate": 1.6614032703905714e-06, "loss": 0.3188, "step": 321 }, { "epoch": 4.5472197705207416, "grad_norm": 0.07192329712907819, "learning_rate": 1.5495321624672443e-06, "loss": 0.3117, "step": 322 }, { "epoch": 4.561341571050309, "grad_norm": 0.07683729191513318, "learning_rate": 1.4414855721658705e-06, "loss": 0.3179, "step": 323 }, { "epoch": 4.575463371579876, "grad_norm": 0.07466087193345237, "learning_rate": 1.3372742464581134e-06, "loss": 0.3169, "step": 324 }, { "epoch": 4.589585172109444, "grad_norm": 0.07472750780066512, "learning_rate": 1.2369085508368862e-06, "loss": 0.313, "step": 325 }, { "epoch": 4.603706972639012, "grad_norm": 0.07567268942020543, "learning_rate": 1.1403984682852998e-06, "loss": 0.3162, "step": 326 }, { "epoch": 4.617828773168579, "grad_norm": 0.07193466653913613, "learning_rate": 1.0477535982837473e-06, "loss": 0.3169, "step": 327 }, { "epoch": 4.631950573698147, "grad_norm": 0.07310364397796111, "learning_rate": 9.589831558550222e-07, "loss": 0.3147, "step": 328 }, { "epoch": 4.646072374227714, "grad_norm": 0.07226831665121733, "learning_rate": 8.740959706477725e-07, "loss": 0.3155, "step": 329 }, { "epoch": 4.660194174757281, "grad_norm": 0.07380784680617208, "learning_rate": 7.93100486058247e-07, "loss": 0.3172, "step": 330 }, { "epoch": 4.674315975286849, "grad_norm": 0.07265097137199653, "learning_rate": 7.160047583904473e-07, "loss": 0.3123, "step": 331 }, { "epoch": 4.688437775816417, "grad_norm": 0.07526606061681983, "learning_rate": 6.428164560548134e-07, "loss": 0.3126, "step": 332 }, { "epoch": 4.702559576345984, "grad_norm": 0.07096951660387449, "learning_rate": 5.735428588054825e-07, "loss": 0.3091, "step": 333 }, { "epoch": 4.716681376875552, "grad_norm": 0.07491929428893927, "learning_rate": 5.081908570161753e-07, "loss": 0.3168, "step": 334 }, { "epoch": 4.730803177405119, "grad_norm": 0.07068035565889964, "learning_rate": 4.467669509948591e-07, "loss": 0.3168, "step": 335 }, { "epoch": 4.744924977934687, "grad_norm": 0.07006153238881019, "learning_rate": 3.8927725033718553e-07, "loss": 0.3096, "step": 336 }, { "epoch": 4.7590467784642545, "grad_norm": 0.07031296479074185, "learning_rate": 3.3572747331878984e-07, "loss": 0.3127, "step": 337 }, { "epoch": 4.773168578993822, "grad_norm": 0.07086156685048181, "learning_rate": 2.8612294632650586e-07, "loss": 0.3165, "step": 338 }, { "epoch": 4.787290379523389, "grad_norm": 0.07041702874195928, "learning_rate": 2.404686033285897e-07, "loss": 0.3211, "step": 339 }, { "epoch": 4.801412180052957, "grad_norm": 0.07111545002538634, "learning_rate": 1.9876898538394362e-07, "loss": 0.3139, "step": 340 }, { "epoch": 4.815533980582524, "grad_norm": 0.06964445264833816, "learning_rate": 1.6102824019043728e-07, "loss": 0.3119, "step": 341 }, { "epoch": 4.829655781112092, "grad_norm": 0.07185826317569316, "learning_rate": 1.2725012167236207e-07, "loss": 0.3189, "step": 342 }, { "epoch": 4.8437775816416595, "grad_norm": 0.07175971991165786, "learning_rate": 9.74379896070321e-08, "loss": 0.3144, "step": 343 }, { "epoch": 4.857899382171227, "grad_norm": 0.07027377563502572, "learning_rate": 7.159480929059381e-08, "loss": 0.3208, "step": 344 }, { "epoch": 4.872021182700794, "grad_norm": 0.07130198834034268, "learning_rate": 4.9723151243106225e-08, "loss": 0.3164, "step": 345 }, { "epoch": 4.886142983230362, "grad_norm": 0.07512577557190175, "learning_rate": 3.1825190952829986e-08, "loss": 0.3183, "step": 346 }, { "epoch": 4.90026478375993, "grad_norm": 0.0718819094759202, "learning_rate": 1.7902708659867096e-08, "loss": 0.3185, "step": 347 }, { "epoch": 4.914386584289497, "grad_norm": 0.0706893833001464, "learning_rate": 7.957089179058131e-09, "loss": 0.3142, "step": 348 }, { "epoch": 4.9285083848190645, "grad_norm": 0.07170028442056126, "learning_rate": 1.9893217622790616e-09, "loss": 0.3181, "step": 349 }, { "epoch": 4.942630185348632, "grad_norm": 0.07142066838497432, "learning_rate": 0.0, "loss": 0.313, "step": 350 }, { "epoch": 4.942630185348632, "step": 350, "total_flos": 9.306564393200255e+18, "train_loss": 0.0, "train_runtime": 1.9909, "train_samples_per_second": 91040.986, "train_steps_per_second": 175.798 } ], "logging_steps": 1, "max_steps": 350, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.306564393200255e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }