{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.953191489361702, "eval_steps": 500, "global_step": 440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011347517730496455, "grad_norm": 28.768582165086425, "learning_rate": 4.999936276068748e-05, "loss": 2.5462, "num_input_tokens_seen": 262144, "step": 1 }, { "epoch": 0.02269503546099291, "grad_norm": 93.43249139527468, "learning_rate": 4.9997451075235834e-05, "loss": 4.367, "num_input_tokens_seen": 524288, "step": 2 }, { "epoch": 0.03404255319148936, "grad_norm": 27.45199589426381, "learning_rate": 4.999426504110115e-05, "loss": 3.7754, "num_input_tokens_seen": 786432, "step": 3 }, { "epoch": 0.04539007092198582, "grad_norm": 21.466687229475028, "learning_rate": 4.9989804820704735e-05, "loss": 3.4412, "num_input_tokens_seen": 1048576, "step": 4 }, { "epoch": 0.05673758865248227, "grad_norm": 8.511408964141404, "learning_rate": 4.99840706414248e-05, "loss": 2.5369, "num_input_tokens_seen": 1310720, "step": 5 }, { "epoch": 0.06808510638297872, "grad_norm": 17.110148299969282, "learning_rate": 4.9977062795584893e-05, "loss": 2.531, "num_input_tokens_seen": 1572864, "step": 6 }, { "epoch": 0.07943262411347518, "grad_norm": 8.528722034783323, "learning_rate": 4.9968781640439026e-05, "loss": 2.2971, "num_input_tokens_seen": 1835008, "step": 7 }, { "epoch": 0.09078014184397164, "grad_norm": 7.852506050761374, "learning_rate": 4.995922759815339e-05, "loss": 2.2175, "num_input_tokens_seen": 2097152, "step": 8 }, { "epoch": 0.10212765957446808, "grad_norm": 5.024862487166255, "learning_rate": 4.9948401155784904e-05, "loss": 2.0184, "num_input_tokens_seen": 2359296, "step": 9 }, { "epoch": 0.11347517730496454, "grad_norm": 5.07110237293465, "learning_rate": 4.993630286525634e-05, "loss": 1.97, "num_input_tokens_seen": 2621440, "step": 10 }, { "epoch": 0.12482269503546099, "grad_norm": 3.902962877759486, "learning_rate": 4.99229333433282e-05, "loss": 1.9201, "num_input_tokens_seen": 2883584, "step": 11 }, { "epoch": 0.13617021276595745, "grad_norm": 3.6676315365476855, "learning_rate": 4.9908293271567286e-05, "loss": 1.8835, "num_input_tokens_seen": 3145728, "step": 12 }, { "epoch": 0.1475177304964539, "grad_norm": 2.929600242853849, "learning_rate": 4.9892383396311934e-05, "loss": 1.808, "num_input_tokens_seen": 3407872, "step": 13 }, { "epoch": 0.15886524822695036, "grad_norm": 2.5508693562661717, "learning_rate": 4.987520452863399e-05, "loss": 1.7956, "num_input_tokens_seen": 3670016, "step": 14 }, { "epoch": 0.1702127659574468, "grad_norm": 2.2262214099883053, "learning_rate": 4.985675754429744e-05, "loss": 1.781, "num_input_tokens_seen": 3932160, "step": 15 }, { "epoch": 0.18156028368794327, "grad_norm": 2.8994102519435656, "learning_rate": 4.9837043383713753e-05, "loss": 1.7132, "num_input_tokens_seen": 4194304, "step": 16 }, { "epoch": 0.19290780141843972, "grad_norm": 3.2652620723767845, "learning_rate": 4.981606305189401e-05, "loss": 1.7316, "num_input_tokens_seen": 4456448, "step": 17 }, { "epoch": 0.20425531914893616, "grad_norm": 2.588037121879393, "learning_rate": 4.979381761839757e-05, "loss": 1.7198, "num_input_tokens_seen": 4718592, "step": 18 }, { "epoch": 0.21560283687943263, "grad_norm": 1.9459781834170677, "learning_rate": 4.9770308217277614e-05, "loss": 1.6976, "num_input_tokens_seen": 4980736, "step": 19 }, { "epoch": 0.22695035460992907, "grad_norm": 4.291046579259317, "learning_rate": 4.9745536047023324e-05, "loss": 1.7159, "num_input_tokens_seen": 5242880, "step": 20 }, { "epoch": 0.23829787234042554, "grad_norm": 1.9625030923747173, "learning_rate": 4.971950237049874e-05, "loss": 1.6871, "num_input_tokens_seen": 5505024, "step": 21 }, { "epoch": 0.24964539007092199, "grad_norm": 3.677910756835627, "learning_rate": 4.9692208514878444e-05, "loss": 1.641, "num_input_tokens_seen": 5767168, "step": 22 }, { "epoch": 0.26099290780141843, "grad_norm": 2.2017196904664247, "learning_rate": 4.966365587157986e-05, "loss": 1.6336, "num_input_tokens_seen": 6029312, "step": 23 }, { "epoch": 0.2723404255319149, "grad_norm": 2.9082791522352958, "learning_rate": 4.963384589619233e-05, "loss": 1.6184, "num_input_tokens_seen": 6291456, "step": 24 }, { "epoch": 0.28368794326241137, "grad_norm": 2.306973394969553, "learning_rate": 4.96027801084029e-05, "loss": 1.6175, "num_input_tokens_seen": 6553600, "step": 25 }, { "epoch": 0.2950354609929078, "grad_norm": 1.8276771179473579, "learning_rate": 4.957046009191889e-05, "loss": 1.6145, "num_input_tokens_seen": 6815744, "step": 26 }, { "epoch": 0.30638297872340425, "grad_norm": 2.7024583313160213, "learning_rate": 4.95368874943871e-05, "loss": 1.5905, "num_input_tokens_seen": 7077888, "step": 27 }, { "epoch": 0.3177304964539007, "grad_norm": 1.8227465911208784, "learning_rate": 4.9502064027309836e-05, "loss": 1.5847, "num_input_tokens_seen": 7340032, "step": 28 }, { "epoch": 0.32907801418439714, "grad_norm": 2.189585770318853, "learning_rate": 4.946599146595769e-05, "loss": 1.5862, "num_input_tokens_seen": 7602176, "step": 29 }, { "epoch": 0.3404255319148936, "grad_norm": 1.6963880678360608, "learning_rate": 4.942867164927899e-05, "loss": 1.5856, "num_input_tokens_seen": 7864320, "step": 30 }, { "epoch": 0.3517730496453901, "grad_norm": 1.884475186213032, "learning_rate": 4.9390106479806085e-05, "loss": 1.5462, "num_input_tokens_seen": 8126464, "step": 31 }, { "epoch": 0.36312056737588655, "grad_norm": 1.7633790709929305, "learning_rate": 4.935029792355834e-05, "loss": 1.5519, "num_input_tokens_seen": 8388608, "step": 32 }, { "epoch": 0.37446808510638296, "grad_norm": 1.6293655236791516, "learning_rate": 4.9309248009941914e-05, "loss": 1.5426, "num_input_tokens_seen": 8650752, "step": 33 }, { "epoch": 0.38581560283687943, "grad_norm": 1.6363171652251638, "learning_rate": 4.9266958831646315e-05, "loss": 1.5179, "num_input_tokens_seen": 8912896, "step": 34 }, { "epoch": 0.3971631205673759, "grad_norm": 1.76404697988344, "learning_rate": 4.922343254453768e-05, "loss": 1.5046, "num_input_tokens_seen": 9175040, "step": 35 }, { "epoch": 0.4085106382978723, "grad_norm": 1.8922889495300352, "learning_rate": 4.917867136754893e-05, "loss": 1.5147, "num_input_tokens_seen": 9437184, "step": 36 }, { "epoch": 0.4198581560283688, "grad_norm": 1.4943749144982619, "learning_rate": 4.913267758256658e-05, "loss": 1.5326, "num_input_tokens_seen": 9699328, "step": 37 }, { "epoch": 0.43120567375886526, "grad_norm": 1.652782449680942, "learning_rate": 4.9085453534314476e-05, "loss": 1.5253, "num_input_tokens_seen": 9961472, "step": 38 }, { "epoch": 0.4425531914893617, "grad_norm": 1.8558129403608419, "learning_rate": 4.9037001630234215e-05, "loss": 1.5003, "num_input_tokens_seen": 10223616, "step": 39 }, { "epoch": 0.45390070921985815, "grad_norm": 1.664990274441562, "learning_rate": 4.898732434036244e-05, "loss": 1.5021, "num_input_tokens_seen": 10485760, "step": 40 }, { "epoch": 0.4652482269503546, "grad_norm": 1.7198271557244196, "learning_rate": 4.893642419720491e-05, "loss": 1.4748, "num_input_tokens_seen": 10747904, "step": 41 }, { "epoch": 0.4765957446808511, "grad_norm": 1.7242786140938589, "learning_rate": 4.888430379560742e-05, "loss": 1.5064, "num_input_tokens_seen": 11010048, "step": 42 }, { "epoch": 0.4879432624113475, "grad_norm": 1.289994987213828, "learning_rate": 4.883096579262346e-05, "loss": 1.4787, "num_input_tokens_seen": 11272192, "step": 43 }, { "epoch": 0.49929078014184397, "grad_norm": 2.0636727883365693, "learning_rate": 4.877641290737884e-05, "loss": 1.5032, "num_input_tokens_seen": 11534336, "step": 44 }, { "epoch": 0.5106382978723404, "grad_norm": 1.50384393546133, "learning_rate": 4.872064792093299e-05, "loss": 1.5017, "num_input_tokens_seen": 11796480, "step": 45 }, { "epoch": 0.5219858156028369, "grad_norm": 1.7274141300795698, "learning_rate": 4.866367367613725e-05, "loss": 1.4899, "num_input_tokens_seen": 12058624, "step": 46 }, { "epoch": 0.5333333333333333, "grad_norm": 1.4776150452075232, "learning_rate": 4.86054930774899e-05, "loss": 1.4501, "num_input_tokens_seen": 12320768, "step": 47 }, { "epoch": 0.5446808510638298, "grad_norm": 2.0772123001061296, "learning_rate": 4.854610909098812e-05, "loss": 1.4729, "num_input_tokens_seen": 12582912, "step": 48 }, { "epoch": 0.5560283687943263, "grad_norm": 1.4784518476625488, "learning_rate": 4.848552474397676e-05, "loss": 1.4639, "num_input_tokens_seen": 12845056, "step": 49 }, { "epoch": 0.5673758865248227, "grad_norm": 1.5172983637361528, "learning_rate": 4.842374312499405e-05, "loss": 1.4626, "num_input_tokens_seen": 13107200, "step": 50 }, { "epoch": 0.5787234042553191, "grad_norm": 1.590645940567932, "learning_rate": 4.836076738361408e-05, "loss": 1.4767, "num_input_tokens_seen": 13369344, "step": 51 }, { "epoch": 0.5900709219858156, "grad_norm": 1.5086587054410023, "learning_rate": 4.829660073028631e-05, "loss": 1.453, "num_input_tokens_seen": 13631488, "step": 52 }, { "epoch": 0.601418439716312, "grad_norm": 1.4196104282840498, "learning_rate": 4.823124643617187e-05, "loss": 1.4406, "num_input_tokens_seen": 13893632, "step": 53 }, { "epoch": 0.6127659574468085, "grad_norm": 1.5140910517728237, "learning_rate": 4.8164707832976783e-05, "loss": 1.4498, "num_input_tokens_seen": 14155776, "step": 54 }, { "epoch": 0.624113475177305, "grad_norm": 1.3210319443750393, "learning_rate": 4.8096988312782174e-05, "loss": 1.4288, "num_input_tokens_seen": 14417920, "step": 55 }, { "epoch": 0.6354609929078014, "grad_norm": 1.5120891443494813, "learning_rate": 4.802809132787125e-05, "loss": 1.4267, "num_input_tokens_seen": 14680064, "step": 56 }, { "epoch": 0.6468085106382979, "grad_norm": 1.7385310877259594, "learning_rate": 4.7958020390553426e-05, "loss": 1.4775, "num_input_tokens_seen": 14942208, "step": 57 }, { "epoch": 0.6581560283687943, "grad_norm": 1.3858014810326225, "learning_rate": 4.7886779072985156e-05, "loss": 1.4387, "num_input_tokens_seen": 15204352, "step": 58 }, { "epoch": 0.6695035460992907, "grad_norm": 1.8605931025096338, "learning_rate": 4.78143710069879e-05, "loss": 1.4093, "num_input_tokens_seen": 15466496, "step": 59 }, { "epoch": 0.6808510638297872, "grad_norm": 1.3831101615966328, "learning_rate": 4.774079988386296e-05, "loss": 1.421, "num_input_tokens_seen": 15728640, "step": 60 }, { "epoch": 0.6921985815602837, "grad_norm": 1.6322196763337964, "learning_rate": 4.766606945420329e-05, "loss": 1.4411, "num_input_tokens_seen": 15990784, "step": 61 }, { "epoch": 0.7035460992907802, "grad_norm": 1.6072671722268586, "learning_rate": 4.759018352770229e-05, "loss": 1.4283, "num_input_tokens_seen": 16252928, "step": 62 }, { "epoch": 0.7148936170212766, "grad_norm": 1.2535166201622518, "learning_rate": 4.751314597295963e-05, "loss": 1.4526, "num_input_tokens_seen": 16515072, "step": 63 }, { "epoch": 0.7262411347517731, "grad_norm": 1.5543521846201784, "learning_rate": 4.743496071728396e-05, "loss": 1.4148, "num_input_tokens_seen": 16777216, "step": 64 }, { "epoch": 0.7375886524822695, "grad_norm": 1.3594873685564324, "learning_rate": 4.735563174649278e-05, "loss": 1.3976, "num_input_tokens_seen": 17039360, "step": 65 }, { "epoch": 0.7489361702127659, "grad_norm": 1.3662132975871213, "learning_rate": 4.72751631047092e-05, "loss": 1.4137, "num_input_tokens_seen": 17301504, "step": 66 }, { "epoch": 0.7602836879432624, "grad_norm": 1.2729255728645377, "learning_rate": 4.719355889415576e-05, "loss": 1.3951, "num_input_tokens_seen": 17563648, "step": 67 }, { "epoch": 0.7716312056737589, "grad_norm": 1.2598405597617752, "learning_rate": 4.711082327494536e-05, "loss": 1.4049, "num_input_tokens_seen": 17825792, "step": 68 }, { "epoch": 0.7829787234042553, "grad_norm": 1.336014117151041, "learning_rate": 4.7026960464869116e-05, "loss": 1.4167, "num_input_tokens_seen": 18087936, "step": 69 }, { "epoch": 0.7943262411347518, "grad_norm": 1.4872379534302926, "learning_rate": 4.6941974739181395e-05, "loss": 1.4048, "num_input_tokens_seen": 18350080, "step": 70 }, { "epoch": 0.8056737588652483, "grad_norm": 1.2920324415831732, "learning_rate": 4.6855870430381816e-05, "loss": 1.4083, "num_input_tokens_seen": 18612224, "step": 71 }, { "epoch": 0.8170212765957446, "grad_norm": 1.6143627607763888, "learning_rate": 4.6768651927994434e-05, "loss": 1.3906, "num_input_tokens_seen": 18874368, "step": 72 }, { "epoch": 0.8283687943262411, "grad_norm": 1.2568838039190413, "learning_rate": 4.668032367834392e-05, "loss": 1.3973, "num_input_tokens_seen": 19136512, "step": 73 }, { "epoch": 0.8397163120567376, "grad_norm": 1.5547346669542705, "learning_rate": 4.6590890184328925e-05, "loss": 1.3918, "num_input_tokens_seen": 19398656, "step": 74 }, { "epoch": 0.851063829787234, "grad_norm": 1.4356818846930628, "learning_rate": 4.6500356005192514e-05, "loss": 1.3819, "num_input_tokens_seen": 19660800, "step": 75 }, { "epoch": 0.8624113475177305, "grad_norm": 1.3284920385369607, "learning_rate": 4.640872575628973e-05, "loss": 1.3933, "num_input_tokens_seen": 19922944, "step": 76 }, { "epoch": 0.873758865248227, "grad_norm": 1.107701469460885, "learning_rate": 4.6316004108852305e-05, "loss": 1.4081, "num_input_tokens_seen": 20185088, "step": 77 }, { "epoch": 0.8851063829787233, "grad_norm": 1.2529354895940894, "learning_rate": 4.622219578975057e-05, "loss": 1.3801, "num_input_tokens_seen": 20447232, "step": 78 }, { "epoch": 0.8964539007092198, "grad_norm": 1.3380392380500745, "learning_rate": 4.6127305581252414e-05, "loss": 1.3655, "num_input_tokens_seen": 20709376, "step": 79 }, { "epoch": 0.9078014184397163, "grad_norm": 1.2047088627238154, "learning_rate": 4.6031338320779534e-05, "loss": 1.3896, "num_input_tokens_seen": 20971520, "step": 80 }, { "epoch": 0.9191489361702128, "grad_norm": 1.4991837815250253, "learning_rate": 4.593429890066082e-05, "loss": 1.405, "num_input_tokens_seen": 21233664, "step": 81 }, { "epoch": 0.9304964539007092, "grad_norm": 1.1414547231387788, "learning_rate": 4.583619226788294e-05, "loss": 1.3843, "num_input_tokens_seen": 21495808, "step": 82 }, { "epoch": 0.9418439716312057, "grad_norm": 1.4695936511434455, "learning_rate": 4.573702342383816e-05, "loss": 1.3698, "num_input_tokens_seen": 21757952, "step": 83 }, { "epoch": 0.9531914893617022, "grad_norm": 1.2845207314880724, "learning_rate": 4.563679742406935e-05, "loss": 1.3806, "num_input_tokens_seen": 22020096, "step": 84 }, { "epoch": 0.9645390070921985, "grad_norm": 1.4475613066375035, "learning_rate": 4.5535519378012295e-05, "loss": 1.3715, "num_input_tokens_seen": 22282240, "step": 85 }, { "epoch": 0.975886524822695, "grad_norm": 1.4999411758550487, "learning_rate": 4.543319444873517e-05, "loss": 1.3718, "num_input_tokens_seen": 22544384, "step": 86 }, { "epoch": 0.9872340425531915, "grad_norm": 1.261663898483524, "learning_rate": 4.532982785267541e-05, "loss": 1.3564, "num_input_tokens_seen": 22806528, "step": 87 }, { "epoch": 0.9985815602836879, "grad_norm": 1.3464933652070896, "learning_rate": 4.522542485937369e-05, "loss": 1.3744, "num_input_tokens_seen": 23068672, "step": 88 }, { "epoch": 1.0, "grad_norm": 1.3464933652070896, "learning_rate": 4.511999079120534e-05, "loss": 1.2363, "num_input_tokens_seen": 23101440, "step": 89 }, { "epoch": 1.0113475177304965, "grad_norm": 3.3795576075508627, "learning_rate": 4.5013531023109014e-05, "loss": 1.0013, "num_input_tokens_seen": 23363584, "step": 90 }, { "epoch": 1.022695035460993, "grad_norm": 1.98501369471366, "learning_rate": 4.4906050982312664e-05, "loss": 1.0396, "num_input_tokens_seen": 23625728, "step": 91 }, { "epoch": 1.0340425531914894, "grad_norm": 1.7550611786324055, "learning_rate": 4.479755614805688e-05, "loss": 0.967, "num_input_tokens_seen": 23887872, "step": 92 }, { "epoch": 1.0453900709219859, "grad_norm": 2.3047442852698063, "learning_rate": 4.4688052051315545e-05, "loss": 1.0097, "num_input_tokens_seen": 24150016, "step": 93 }, { "epoch": 1.0567375886524824, "grad_norm": 1.4568858521127028, "learning_rate": 4.457754427451389e-05, "loss": 0.9699, "num_input_tokens_seen": 24412160, "step": 94 }, { "epoch": 1.0680851063829788, "grad_norm": 1.9193925592577146, "learning_rate": 4.446603845124388e-05, "loss": 0.9548, "num_input_tokens_seen": 24674304, "step": 95 }, { "epoch": 1.0794326241134753, "grad_norm": 1.3493703965773043, "learning_rate": 4.4353540265977064e-05, "loss": 0.9504, "num_input_tokens_seen": 24936448, "step": 96 }, { "epoch": 1.0907801418439715, "grad_norm": 1.7615744346425224, "learning_rate": 4.4240055453774734e-05, "loss": 0.9807, "num_input_tokens_seen": 25198592, "step": 97 }, { "epoch": 1.102127659574468, "grad_norm": 1.8062764758660264, "learning_rate": 4.412558979999558e-05, "loss": 0.9363, "num_input_tokens_seen": 25460736, "step": 98 }, { "epoch": 1.1134751773049645, "grad_norm": 1.7426430146262821, "learning_rate": 4.401014914000078e-05, "loss": 0.968, "num_input_tokens_seen": 25722880, "step": 99 }, { "epoch": 1.124822695035461, "grad_norm": 1.5389141278391227, "learning_rate": 4.389373935885646e-05, "loss": 0.9428, "num_input_tokens_seen": 25985024, "step": 100 }, { "epoch": 1.1361702127659574, "grad_norm": 1.4116557578228721, "learning_rate": 4.3776366391033746e-05, "loss": 0.921, "num_input_tokens_seen": 26247168, "step": 101 }, { "epoch": 1.147517730496454, "grad_norm": 1.4555202749322727, "learning_rate": 4.365803622010618e-05, "loss": 0.9368, "num_input_tokens_seen": 26509312, "step": 102 }, { "epoch": 1.1588652482269504, "grad_norm": 1.51393536895009, "learning_rate": 4.35387548784447e-05, "loss": 0.9424, "num_input_tokens_seen": 26771456, "step": 103 }, { "epoch": 1.1702127659574468, "grad_norm": 1.3497959200668166, "learning_rate": 4.341852844691012e-05, "loss": 0.9395, "num_input_tokens_seen": 27033600, "step": 104 }, { "epoch": 1.1815602836879433, "grad_norm": 1.963720316682781, "learning_rate": 4.329736305454314e-05, "loss": 0.9575, "num_input_tokens_seen": 27295744, "step": 105 }, { "epoch": 1.1929078014184398, "grad_norm": 1.5398810468770388, "learning_rate": 4.3175264878251845e-05, "loss": 0.9087, "num_input_tokens_seen": 27557888, "step": 106 }, { "epoch": 1.2042553191489362, "grad_norm": 2.249980638617661, "learning_rate": 4.305224014249688e-05, "loss": 0.9613, "num_input_tokens_seen": 27820032, "step": 107 }, { "epoch": 1.2156028368794327, "grad_norm": 1.7994413618778806, "learning_rate": 4.292829511897409e-05, "loss": 0.9336, "num_input_tokens_seen": 28082176, "step": 108 }, { "epoch": 1.226950354609929, "grad_norm": 2.1172062063526695, "learning_rate": 4.280343612629479e-05, "loss": 0.9444, "num_input_tokens_seen": 28344320, "step": 109 }, { "epoch": 1.2382978723404254, "grad_norm": 1.689025623361111, "learning_rate": 4.267766952966369e-05, "loss": 0.9054, "num_input_tokens_seen": 28606464, "step": 110 }, { "epoch": 1.249645390070922, "grad_norm": 1.7237793371868935, "learning_rate": 4.255100174055434e-05, "loss": 0.9391, "num_input_tokens_seen": 28868608, "step": 111 }, { "epoch": 1.2609929078014184, "grad_norm": 1.5807920095324717, "learning_rate": 4.242343921638234e-05, "loss": 0.9232, "num_input_tokens_seen": 29130752, "step": 112 }, { "epoch": 1.2723404255319148, "grad_norm": 1.5455548794679541, "learning_rate": 4.22949884601761e-05, "loss": 0.9383, "num_input_tokens_seen": 29392896, "step": 113 }, { "epoch": 1.2836879432624113, "grad_norm": 1.505150180088299, "learning_rate": 4.2165656020245336e-05, "loss": 0.921, "num_input_tokens_seen": 29655040, "step": 114 }, { "epoch": 1.2950354609929078, "grad_norm": 1.7329578957192628, "learning_rate": 4.2035448489847284e-05, "loss": 0.9525, "num_input_tokens_seen": 29917184, "step": 115 }, { "epoch": 1.3063829787234043, "grad_norm": 1.4120446620663845, "learning_rate": 4.1904372506850484e-05, "loss": 0.8843, "num_input_tokens_seen": 30179328, "step": 116 }, { "epoch": 1.3177304964539007, "grad_norm": 1.440521563342665, "learning_rate": 4.1772434753396504e-05, "loss": 0.9295, "num_input_tokens_seen": 30441472, "step": 117 }, { "epoch": 1.3290780141843972, "grad_norm": 1.2751014560476446, "learning_rate": 4.1639641955559205e-05, "loss": 0.965, "num_input_tokens_seen": 30703616, "step": 118 }, { "epoch": 1.3404255319148937, "grad_norm": 1.353326348573935, "learning_rate": 4.1506000883001875e-05, "loss": 0.9193, "num_input_tokens_seen": 30965760, "step": 119 }, { "epoch": 1.3517730496453901, "grad_norm": 1.3458043729465723, "learning_rate": 4.137151834863213e-05, "loss": 0.9418, "num_input_tokens_seen": 31227904, "step": 120 }, { "epoch": 1.3631205673758866, "grad_norm": 1.3301133192597863, "learning_rate": 4.123620120825459e-05, "loss": 0.9139, "num_input_tokens_seen": 31490048, "step": 121 }, { "epoch": 1.374468085106383, "grad_norm": 1.519558123717531, "learning_rate": 4.1100056360221384e-05, "loss": 0.9378, "num_input_tokens_seen": 31752192, "step": 122 }, { "epoch": 1.3858156028368795, "grad_norm": 1.2225151449916911, "learning_rate": 4.096309074508046e-05, "loss": 0.9334, "num_input_tokens_seen": 32014336, "step": 123 }, { "epoch": 1.397163120567376, "grad_norm": 1.3505855040891415, "learning_rate": 4.082531134522176e-05, "loss": 0.9227, "num_input_tokens_seen": 32276480, "step": 124 }, { "epoch": 1.4085106382978723, "grad_norm": 1.1984405892159349, "learning_rate": 4.06867251845213e-05, "loss": 0.9232, "num_input_tokens_seen": 32538624, "step": 125 }, { "epoch": 1.4198581560283687, "grad_norm": 1.2105780592446167, "learning_rate": 4.054733932798306e-05, "loss": 0.9126, "num_input_tokens_seen": 32800768, "step": 126 }, { "epoch": 1.4312056737588652, "grad_norm": 1.4082252271919538, "learning_rate": 4.0407160881378824e-05, "loss": 0.9026, "num_input_tokens_seen": 33062912, "step": 127 }, { "epoch": 1.4425531914893617, "grad_norm": 1.242226249471717, "learning_rate": 4.0266196990885955e-05, "loss": 0.9329, "num_input_tokens_seen": 33325056, "step": 128 }, { "epoch": 1.4539007092198581, "grad_norm": 1.2320327101088704, "learning_rate": 4.012445484272307e-05, "loss": 0.9055, "num_input_tokens_seen": 33587200, "step": 129 }, { "epoch": 1.4652482269503546, "grad_norm": 1.097364604214534, "learning_rate": 3.9981941662783674e-05, "loss": 0.9505, "num_input_tokens_seen": 33849344, "step": 130 }, { "epoch": 1.476595744680851, "grad_norm": 1.287506918091485, "learning_rate": 3.9838664716267855e-05, "loss": 0.9205, "num_input_tokens_seen": 34111488, "step": 131 }, { "epoch": 1.4879432624113476, "grad_norm": 1.1710856488092862, "learning_rate": 3.969463130731183e-05, "loss": 0.9757, "num_input_tokens_seen": 34373632, "step": 132 }, { "epoch": 1.499290780141844, "grad_norm": 1.1761537048031911, "learning_rate": 3.954984877861565e-05, "loss": 0.9375, "num_input_tokens_seen": 34635776, "step": 133 }, { "epoch": 1.5106382978723403, "grad_norm": 1.211479891065261, "learning_rate": 3.9404324511068825e-05, "loss": 0.9631, "num_input_tokens_seen": 34897920, "step": 134 }, { "epoch": 1.5219858156028367, "grad_norm": 1.1227811129578062, "learning_rate": 3.92580659233741e-05, "loss": 0.9413, "num_input_tokens_seen": 35160064, "step": 135 }, { "epoch": 1.5333333333333332, "grad_norm": 1.2565270122423726, "learning_rate": 3.911108047166924e-05, "loss": 0.9436, "num_input_tokens_seen": 35422208, "step": 136 }, { "epoch": 1.5446808510638297, "grad_norm": 1.194511509764168, "learning_rate": 3.8963375649146866e-05, "loss": 0.9537, "num_input_tokens_seen": 35684352, "step": 137 }, { "epoch": 1.5560283687943262, "grad_norm": 1.147953601769395, "learning_rate": 3.881495898567257e-05, "loss": 0.9279, "num_input_tokens_seen": 35946496, "step": 138 }, { "epoch": 1.5673758865248226, "grad_norm": 1.128582567310139, "learning_rate": 3.866583804740095e-05, "loss": 0.9053, "num_input_tokens_seen": 36208640, "step": 139 }, { "epoch": 1.578723404255319, "grad_norm": 1.1883091515233541, "learning_rate": 3.851602043638994e-05, "loss": 0.9499, "num_input_tokens_seen": 36470784, "step": 140 }, { "epoch": 1.5900709219858156, "grad_norm": 1.154451792017842, "learning_rate": 3.8365513790213265e-05, "loss": 0.9262, "num_input_tokens_seen": 36732928, "step": 141 }, { "epoch": 1.601418439716312, "grad_norm": 1.2185944157116932, "learning_rate": 3.821432578157105e-05, "loss": 0.934, "num_input_tokens_seen": 36995072, "step": 142 }, { "epoch": 1.6127659574468085, "grad_norm": 1.1779364162194832, "learning_rate": 3.8062464117898724e-05, "loss": 0.9588, "num_input_tokens_seen": 37257216, "step": 143 }, { "epoch": 1.624113475177305, "grad_norm": 1.1736568320305454, "learning_rate": 3.790993654097405e-05, "loss": 0.9616, "num_input_tokens_seen": 37519360, "step": 144 }, { "epoch": 1.6354609929078014, "grad_norm": 1.174721701784409, "learning_rate": 3.77567508265225e-05, "loss": 0.9551, "num_input_tokens_seen": 37781504, "step": 145 }, { "epoch": 1.646808510638298, "grad_norm": 1.1583059430503744, "learning_rate": 3.76029147838208e-05, "loss": 0.9636, "num_input_tokens_seen": 38043648, "step": 146 }, { "epoch": 1.6581560283687944, "grad_norm": 1.2781768836563048, "learning_rate": 3.74484362552989e-05, "loss": 0.949, "num_input_tokens_seen": 38305792, "step": 147 }, { "epoch": 1.6695035460992909, "grad_norm": 1.2187569053893257, "learning_rate": 3.72933231161401e-05, "loss": 0.9325, "num_input_tokens_seen": 38567936, "step": 148 }, { "epoch": 1.6808510638297873, "grad_norm": 1.1127467597849947, "learning_rate": 3.713758327387961e-05, "loss": 0.9175, "num_input_tokens_seen": 38830080, "step": 149 }, { "epoch": 1.6921985815602838, "grad_norm": 1.5466609004385776, "learning_rate": 3.6981224668001424e-05, "loss": 0.9579, "num_input_tokens_seen": 39092224, "step": 150 }, { "epoch": 1.7035460992907803, "grad_norm": 1.2711234448756215, "learning_rate": 3.682425526953359e-05, "loss": 0.9568, "num_input_tokens_seen": 39354368, "step": 151 }, { "epoch": 1.7148936170212767, "grad_norm": 1.2561675729769082, "learning_rate": 3.6666683080641846e-05, "loss": 0.931, "num_input_tokens_seen": 39616512, "step": 152 }, { "epoch": 1.7262411347517732, "grad_norm": 1.2872779270009815, "learning_rate": 3.6508516134221635e-05, "loss": 0.9456, "num_input_tokens_seen": 39878656, "step": 153 }, { "epoch": 1.7375886524822695, "grad_norm": 1.211380322340917, "learning_rate": 3.634976249348867e-05, "loss": 0.9525, "num_input_tokens_seen": 40140800, "step": 154 }, { "epoch": 1.748936170212766, "grad_norm": 1.262195787306148, "learning_rate": 3.619043025156782e-05, "loss": 0.9254, "num_input_tokens_seen": 40402944, "step": 155 }, { "epoch": 1.7602836879432624, "grad_norm": 1.158471503672054, "learning_rate": 3.603052753108053e-05, "loss": 0.9594, "num_input_tokens_seen": 40665088, "step": 156 }, { "epoch": 1.7716312056737589, "grad_norm": 1.2179519684336035, "learning_rate": 3.58700624837308e-05, "loss": 0.9565, "num_input_tokens_seen": 40927232, "step": 157 }, { "epoch": 1.7829787234042553, "grad_norm": 1.2186733946981332, "learning_rate": 3.5709043289889536e-05, "loss": 0.957, "num_input_tokens_seen": 41189376, "step": 158 }, { "epoch": 1.7943262411347518, "grad_norm": 1.1317921418808867, "learning_rate": 3.554747815817756e-05, "loss": 0.9698, "num_input_tokens_seen": 41451520, "step": 159 }, { "epoch": 1.8056737588652483, "grad_norm": 1.2084681311643604, "learning_rate": 3.5385375325047166e-05, "loss": 0.9534, "num_input_tokens_seen": 41713664, "step": 160 }, { "epoch": 1.8170212765957445, "grad_norm": 1.1480526352440725, "learning_rate": 3.522274305436217e-05, "loss": 0.9458, "num_input_tokens_seen": 41975808, "step": 161 }, { "epoch": 1.828368794326241, "grad_norm": 1.158020278288251, "learning_rate": 3.50595896369767e-05, "loss": 0.9299, "num_input_tokens_seen": 42237952, "step": 162 }, { "epoch": 1.8397163120567375, "grad_norm": 1.2652775197204456, "learning_rate": 3.4895923390312466e-05, "loss": 0.9702, "num_input_tokens_seen": 42500096, "step": 163 }, { "epoch": 1.851063829787234, "grad_norm": 1.083067001347801, "learning_rate": 3.4731752657934794e-05, "loss": 0.9805, "num_input_tokens_seen": 42762240, "step": 164 }, { "epoch": 1.8624113475177304, "grad_norm": 1.3218549445020937, "learning_rate": 3.456708580912725e-05, "loss": 0.9199, "num_input_tokens_seen": 43024384, "step": 165 }, { "epoch": 1.8737588652482269, "grad_norm": 1.1602075511901062, "learning_rate": 3.4401931238464994e-05, "loss": 0.9702, "num_input_tokens_seen": 43286528, "step": 166 }, { "epoch": 1.8851063829787233, "grad_norm": 1.0903691972788288, "learning_rate": 3.423629736538685e-05, "loss": 0.9444, "num_input_tokens_seen": 43548672, "step": 167 }, { "epoch": 1.8964539007092198, "grad_norm": 1.3066277139158358, "learning_rate": 3.4070192633766025e-05, "loss": 0.9559, "num_input_tokens_seen": 43810816, "step": 168 }, { "epoch": 1.9078014184397163, "grad_norm": 1.1068620390544621, "learning_rate": 3.390362551147974e-05, "loss": 0.9547, "num_input_tokens_seen": 44072960, "step": 169 }, { "epoch": 1.9191489361702128, "grad_norm": 1.2160218176405404, "learning_rate": 3.3736604489977466e-05, "loss": 0.9774, "num_input_tokens_seen": 44335104, "step": 170 }, { "epoch": 1.9304964539007092, "grad_norm": 1.1169244406424832, "learning_rate": 3.356913808384807e-05, "loss": 0.9485, "num_input_tokens_seen": 44597248, "step": 171 }, { "epoch": 1.9418439716312057, "grad_norm": 1.1440632925298737, "learning_rate": 3.3401234830385756e-05, "loss": 0.9556, "num_input_tokens_seen": 44859392, "step": 172 }, { "epoch": 1.9531914893617022, "grad_norm": 1.156402005245823, "learning_rate": 3.323290328915483e-05, "loss": 0.9635, "num_input_tokens_seen": 45121536, "step": 173 }, { "epoch": 1.9645390070921986, "grad_norm": 1.0465098069882341, "learning_rate": 3.306415204155335e-05, "loss": 0.9327, "num_input_tokens_seen": 45383680, "step": 174 }, { "epoch": 1.9758865248226951, "grad_norm": 1.097782670172884, "learning_rate": 3.2894989690375626e-05, "loss": 0.9274, "num_input_tokens_seen": 45645824, "step": 175 }, { "epoch": 1.9872340425531916, "grad_norm": 1.138066760772727, "learning_rate": 3.272542485937369e-05, "loss": 0.9402, "num_input_tokens_seen": 45907968, "step": 176 }, { "epoch": 1.998581560283688, "grad_norm": 1.1812784275655155, "learning_rate": 3.255546619281765e-05, "loss": 0.9699, "num_input_tokens_seen": 46170112, "step": 177 }, { "epoch": 2.0, "grad_norm": 1.1812784275655155, "learning_rate": 3.2385122355055005e-05, "loss": 0.7485, "num_input_tokens_seen": 46202880, "step": 178 }, { "epoch": 2.0113475177304965, "grad_norm": 3.8292467760018236, "learning_rate": 3.221440203006897e-05, "loss": 0.5128, "num_input_tokens_seen": 46465024, "step": 179 }, { "epoch": 2.022695035460993, "grad_norm": 2.6270275589892638, "learning_rate": 3.2043313921035743e-05, "loss": 0.494, "num_input_tokens_seen": 46727168, "step": 180 }, { "epoch": 2.0340425531914894, "grad_norm": 1.9013960211342629, "learning_rate": 3.1871866749880846e-05, "loss": 0.4649, "num_input_tokens_seen": 46989312, "step": 181 }, { "epoch": 2.045390070921986, "grad_norm": 2.6940464488513314, "learning_rate": 3.170006925683448e-05, "loss": 0.4609, "num_input_tokens_seen": 47251456, "step": 182 }, { "epoch": 2.0567375886524824, "grad_norm": 2.9136774444822446, "learning_rate": 3.152793019998594e-05, "loss": 0.4303, "num_input_tokens_seen": 47513600, "step": 183 }, { "epoch": 2.068085106382979, "grad_norm": 2.284711351155428, "learning_rate": 3.135545835483718e-05, "loss": 0.4343, "num_input_tokens_seen": 47775744, "step": 184 }, { "epoch": 2.0794326241134753, "grad_norm": 1.8761627155008236, "learning_rate": 3.118266251385539e-05, "loss": 0.4224, "num_input_tokens_seen": 48037888, "step": 185 }, { "epoch": 2.0907801418439718, "grad_norm": 1.537845244765071, "learning_rate": 3.100955148602481e-05, "loss": 0.4173, "num_input_tokens_seen": 48300032, "step": 186 }, { "epoch": 2.1021276595744682, "grad_norm": 1.4350770359924392, "learning_rate": 3.083613409639764e-05, "loss": 0.3962, "num_input_tokens_seen": 48562176, "step": 187 }, { "epoch": 2.1134751773049647, "grad_norm": 1.3504256900149412, "learning_rate": 3.0662419185644115e-05, "loss": 0.393, "num_input_tokens_seen": 48824320, "step": 188 }, { "epoch": 2.124822695035461, "grad_norm": 1.4687686815955256, "learning_rate": 3.0488415609601862e-05, "loss": 0.3826, "num_input_tokens_seen": 49086464, "step": 189 }, { "epoch": 2.1361702127659576, "grad_norm": 1.275174193399539, "learning_rate": 3.0314132238824415e-05, "loss": 0.3961, "num_input_tokens_seen": 49348608, "step": 190 }, { "epoch": 2.147517730496454, "grad_norm": 1.43179460079197, "learning_rate": 3.013957795812902e-05, "loss": 0.3759, "num_input_tokens_seen": 49610752, "step": 191 }, { "epoch": 2.1588652482269506, "grad_norm": 1.4035121217192592, "learning_rate": 2.996476166614364e-05, "loss": 0.3716, "num_input_tokens_seen": 49872896, "step": 192 }, { "epoch": 2.1702127659574466, "grad_norm": 1.39291598371889, "learning_rate": 2.9789692274853388e-05, "loss": 0.3875, "num_input_tokens_seen": 50135040, "step": 193 }, { "epoch": 2.181560283687943, "grad_norm": 1.630578340810045, "learning_rate": 2.9614378709146133e-05, "loss": 0.374, "num_input_tokens_seen": 50397184, "step": 194 }, { "epoch": 2.1929078014184396, "grad_norm": 1.3814469271709833, "learning_rate": 2.943882990635759e-05, "loss": 0.3694, "num_input_tokens_seen": 50659328, "step": 195 }, { "epoch": 2.204255319148936, "grad_norm": 1.6114694228119848, "learning_rate": 2.92630548158156e-05, "loss": 0.3901, "num_input_tokens_seen": 50921472, "step": 196 }, { "epoch": 2.2156028368794325, "grad_norm": 5.245754079879191, "learning_rate": 2.9087062398384e-05, "loss": 0.3825, "num_input_tokens_seen": 51183616, "step": 197 }, { "epoch": 2.226950354609929, "grad_norm": 2.250670427725288, "learning_rate": 2.8910861626005776e-05, "loss": 0.4064, "num_input_tokens_seen": 51445760, "step": 198 }, { "epoch": 2.2382978723404254, "grad_norm": 1.7095359356479407, "learning_rate": 2.873446148124563e-05, "loss": 0.4096, "num_input_tokens_seen": 51707904, "step": 199 }, { "epoch": 2.249645390070922, "grad_norm": 1.4355808796008729, "learning_rate": 2.8557870956832132e-05, "loss": 0.3814, "num_input_tokens_seen": 51970048, "step": 200 }, { "epoch": 2.2609929078014184, "grad_norm": 1.73510400599197, "learning_rate": 2.8381099055199222e-05, "loss": 0.3932, "num_input_tokens_seen": 52232192, "step": 201 }, { "epoch": 2.272340425531915, "grad_norm": 1.555638605050248, "learning_rate": 2.8204154788027325e-05, "loss": 0.3613, "num_input_tokens_seen": 52494336, "step": 202 }, { "epoch": 2.2836879432624113, "grad_norm": 1.1987449143902036, "learning_rate": 2.8027047175783873e-05, "loss": 0.359, "num_input_tokens_seen": 52756480, "step": 203 }, { "epoch": 2.295035460992908, "grad_norm": 1.442324830425997, "learning_rate": 2.7849785247263515e-05, "loss": 0.3897, "num_input_tokens_seen": 53018624, "step": 204 }, { "epoch": 2.3063829787234043, "grad_norm": 1.3866321638394548, "learning_rate": 2.767237803912783e-05, "loss": 0.3945, "num_input_tokens_seen": 53280768, "step": 205 }, { "epoch": 2.3177304964539007, "grad_norm": 1.2184999424862926, "learning_rate": 2.7494834595444568e-05, "loss": 0.3682, "num_input_tokens_seen": 53542912, "step": 206 }, { "epoch": 2.329078014184397, "grad_norm": 1.2180997743390845, "learning_rate": 2.731716396722672e-05, "loss": 0.3686, "num_input_tokens_seen": 53805056, "step": 207 }, { "epoch": 2.3404255319148937, "grad_norm": 1.315597212375225, "learning_rate": 2.7139375211970996e-05, "loss": 0.3683, "num_input_tokens_seen": 54067200, "step": 208 }, { "epoch": 2.35177304964539, "grad_norm": 1.1389971788403916, "learning_rate": 2.6961477393196126e-05, "loss": 0.3656, "num_input_tokens_seen": 54329344, "step": 209 }, { "epoch": 2.3631205673758866, "grad_norm": 1.1423683445861843, "learning_rate": 2.6783479579980807e-05, "loss": 0.3659, "num_input_tokens_seen": 54591488, "step": 210 }, { "epoch": 2.374468085106383, "grad_norm": 1.1906990298985376, "learning_rate": 2.6605390846501377e-05, "loss": 0.3581, "num_input_tokens_seen": 54853632, "step": 211 }, { "epoch": 2.3858156028368795, "grad_norm": 1.3437643319906911, "learning_rate": 2.6427220271569203e-05, "loss": 0.3693, "num_input_tokens_seen": 55115776, "step": 212 }, { "epoch": 2.397163120567376, "grad_norm": 1.3294483159523118, "learning_rate": 2.624897693816785e-05, "loss": 0.3746, "num_input_tokens_seen": 55377920, "step": 213 }, { "epoch": 2.4085106382978725, "grad_norm": 1.1245200003757012, "learning_rate": 2.6070669932990067e-05, "loss": 0.3741, "num_input_tokens_seen": 55640064, "step": 214 }, { "epoch": 2.419858156028369, "grad_norm": 1.2751337825743647, "learning_rate": 2.5892308345974515e-05, "loss": 0.3583, "num_input_tokens_seen": 55902208, "step": 215 }, { "epoch": 2.4312056737588654, "grad_norm": 1.1580659329331486, "learning_rate": 2.5713901269842404e-05, "loss": 0.3622, "num_input_tokens_seen": 56164352, "step": 216 }, { "epoch": 2.4425531914893615, "grad_norm": 1.1800640589939102, "learning_rate": 2.5535457799633955e-05, "loss": 0.3588, "num_input_tokens_seen": 56426496, "step": 217 }, { "epoch": 2.453900709219858, "grad_norm": 1.198033947606563, "learning_rate": 2.5356987032244683e-05, "loss": 0.3614, "num_input_tokens_seen": 56688640, "step": 218 }, { "epoch": 2.4652482269503544, "grad_norm": 1.1820243301759021, "learning_rate": 2.5178498065961736e-05, "loss": 0.3727, "num_input_tokens_seen": 56950784, "step": 219 }, { "epoch": 2.476595744680851, "grad_norm": 1.1419500525476782, "learning_rate": 2.5e-05, "loss": 0.3595, "num_input_tokens_seen": 57212928, "step": 220 }, { "epoch": 2.4879432624113473, "grad_norm": 1.1601188236644033, "learning_rate": 2.4821501934038266e-05, "loss": 0.3475, "num_input_tokens_seen": 57475072, "step": 221 }, { "epoch": 2.499290780141844, "grad_norm": 1.2274900948257617, "learning_rate": 2.4643012967755326e-05, "loss": 0.373, "num_input_tokens_seen": 57737216, "step": 222 }, { "epoch": 2.5106382978723403, "grad_norm": 1.2801139230605143, "learning_rate": 2.446454220036605e-05, "loss": 0.3709, "num_input_tokens_seen": 57999360, "step": 223 }, { "epoch": 2.5219858156028367, "grad_norm": 1.175617548960192, "learning_rate": 2.42860987301576e-05, "loss": 0.3616, "num_input_tokens_seen": 58261504, "step": 224 }, { "epoch": 2.533333333333333, "grad_norm": 1.2552798520658928, "learning_rate": 2.410769165402549e-05, "loss": 0.3873, "num_input_tokens_seen": 58523648, "step": 225 }, { "epoch": 2.5446808510638297, "grad_norm": 1.1904955099860972, "learning_rate": 2.3929330067009942e-05, "loss": 0.3739, "num_input_tokens_seen": 58785792, "step": 226 }, { "epoch": 2.556028368794326, "grad_norm": 1.096367614278807, "learning_rate": 2.3751023061832158e-05, "loss": 0.3567, "num_input_tokens_seen": 59047936, "step": 227 }, { "epoch": 2.5673758865248226, "grad_norm": 1.1935334517672878, "learning_rate": 2.35727797284308e-05, "loss": 0.3902, "num_input_tokens_seen": 59310080, "step": 228 }, { "epoch": 2.578723404255319, "grad_norm": 1.18472305830408, "learning_rate": 2.339460915349862e-05, "loss": 0.3814, "num_input_tokens_seen": 59572224, "step": 229 }, { "epoch": 2.5900709219858156, "grad_norm": 1.171242395850753, "learning_rate": 2.3216520420019195e-05, "loss": 0.3786, "num_input_tokens_seen": 59834368, "step": 230 }, { "epoch": 2.601418439716312, "grad_norm": 1.0900508856776798, "learning_rate": 2.303852260680388e-05, "loss": 0.3773, "num_input_tokens_seen": 60096512, "step": 231 }, { "epoch": 2.6127659574468085, "grad_norm": 1.2090568930739582, "learning_rate": 2.2860624788029013e-05, "loss": 0.3892, "num_input_tokens_seen": 60358656, "step": 232 }, { "epoch": 2.624113475177305, "grad_norm": 1.187407278266127, "learning_rate": 2.268283603277328e-05, "loss": 0.3784, "num_input_tokens_seen": 60620800, "step": 233 }, { "epoch": 2.6354609929078014, "grad_norm": 1.1154631653760168, "learning_rate": 2.250516540455543e-05, "loss": 0.3617, "num_input_tokens_seen": 60882944, "step": 234 }, { "epoch": 2.646808510638298, "grad_norm": 1.1730027507450338, "learning_rate": 2.2327621960872187e-05, "loss": 0.3681, "num_input_tokens_seen": 61145088, "step": 235 }, { "epoch": 2.6581560283687944, "grad_norm": 1.0970354831582407, "learning_rate": 2.2150214752736488e-05, "loss": 0.3684, "num_input_tokens_seen": 61407232, "step": 236 }, { "epoch": 2.669503546099291, "grad_norm": 1.0380524878431867, "learning_rate": 2.197295282421613e-05, "loss": 0.3544, "num_input_tokens_seen": 61669376, "step": 237 }, { "epoch": 2.6808510638297873, "grad_norm": 1.1922728093817796, "learning_rate": 2.179584521197268e-05, "loss": 0.3669, "num_input_tokens_seen": 61931520, "step": 238 }, { "epoch": 2.692198581560284, "grad_norm": 1.1115431834278655, "learning_rate": 2.1618900944800777e-05, "loss": 0.3786, "num_input_tokens_seen": 62193664, "step": 239 }, { "epoch": 2.7035460992907803, "grad_norm": 1.1319193354917367, "learning_rate": 2.1442129043167874e-05, "loss": 0.3669, "num_input_tokens_seen": 62455808, "step": 240 }, { "epoch": 2.7148936170212767, "grad_norm": 1.1615463220718218, "learning_rate": 2.1265538518754374e-05, "loss": 0.3665, "num_input_tokens_seen": 62717952, "step": 241 }, { "epoch": 2.726241134751773, "grad_norm": 1.1434386890748394, "learning_rate": 2.1089138373994223e-05, "loss": 0.392, "num_input_tokens_seen": 62980096, "step": 242 }, { "epoch": 2.7375886524822697, "grad_norm": 1.1369786895076892, "learning_rate": 2.0912937601616005e-05, "loss": 0.372, "num_input_tokens_seen": 63242240, "step": 243 }, { "epoch": 2.748936170212766, "grad_norm": 1.0632983215067038, "learning_rate": 2.0736945184184405e-05, "loss": 0.3648, "num_input_tokens_seen": 63504384, "step": 244 }, { "epoch": 2.7602836879432626, "grad_norm": 1.1587019300349575, "learning_rate": 2.0561170093642423e-05, "loss": 0.3875, "num_input_tokens_seen": 63766528, "step": 245 }, { "epoch": 2.771631205673759, "grad_norm": 1.1459361371000099, "learning_rate": 2.038562129085387e-05, "loss": 0.3734, "num_input_tokens_seen": 64028672, "step": 246 }, { "epoch": 2.7829787234042556, "grad_norm": 1.1468262297024947, "learning_rate": 2.0210307725146615e-05, "loss": 0.3593, "num_input_tokens_seen": 64290816, "step": 247 }, { "epoch": 2.794326241134752, "grad_norm": 1.1534392716010342, "learning_rate": 2.003523833385637e-05, "loss": 0.3844, "num_input_tokens_seen": 64552960, "step": 248 }, { "epoch": 2.8056737588652485, "grad_norm": 1.1768077334945029, "learning_rate": 1.9860422041870987e-05, "loss": 0.3801, "num_input_tokens_seen": 64815104, "step": 249 }, { "epoch": 2.8170212765957445, "grad_norm": 1.1272963366670457, "learning_rate": 1.9685867761175584e-05, "loss": 0.3849, "num_input_tokens_seen": 65077248, "step": 250 }, { "epoch": 2.828368794326241, "grad_norm": 1.1169732679286002, "learning_rate": 1.9511584390398147e-05, "loss": 0.3739, "num_input_tokens_seen": 65339392, "step": 251 }, { "epoch": 2.8397163120567375, "grad_norm": 1.1016941662820015, "learning_rate": 1.9337580814355888e-05, "loss": 0.366, "num_input_tokens_seen": 65601536, "step": 252 }, { "epoch": 2.851063829787234, "grad_norm": 1.1758369662868582, "learning_rate": 1.9163865903602374e-05, "loss": 0.3629, "num_input_tokens_seen": 65863680, "step": 253 }, { "epoch": 2.8624113475177304, "grad_norm": 1.1242921844137204, "learning_rate": 1.899044851397519e-05, "loss": 0.3814, "num_input_tokens_seen": 66125824, "step": 254 }, { "epoch": 2.873758865248227, "grad_norm": 1.1614961100727705, "learning_rate": 1.881733748614461e-05, "loss": 0.3775, "num_input_tokens_seen": 66387968, "step": 255 }, { "epoch": 2.8851063829787233, "grad_norm": 1.1581204676106365, "learning_rate": 1.8644541645162834e-05, "loss": 0.3783, "num_input_tokens_seen": 66650112, "step": 256 }, { "epoch": 2.89645390070922, "grad_norm": 1.1120579438295783, "learning_rate": 1.8472069800014068e-05, "loss": 0.35, "num_input_tokens_seen": 66912256, "step": 257 }, { "epoch": 2.9078014184397163, "grad_norm": 1.1756299409660191, "learning_rate": 1.8299930743165535e-05, "loss": 0.3742, "num_input_tokens_seen": 67174400, "step": 258 }, { "epoch": 2.9191489361702128, "grad_norm": 1.1402458104536393, "learning_rate": 1.8128133250119157e-05, "loss": 0.3652, "num_input_tokens_seen": 67436544, "step": 259 }, { "epoch": 2.9304964539007092, "grad_norm": 1.1433045669706552, "learning_rate": 1.795668607896426e-05, "loss": 0.3832, "num_input_tokens_seen": 67698688, "step": 260 }, { "epoch": 2.9418439716312057, "grad_norm": 1.1856499993733003, "learning_rate": 1.778559796993104e-05, "loss": 0.3807, "num_input_tokens_seen": 67960832, "step": 261 }, { "epoch": 2.953191489361702, "grad_norm": 1.1271393587454326, "learning_rate": 1.7614877644945e-05, "loss": 0.3799, "num_input_tokens_seen": 68222976, "step": 262 }, { "epoch": 2.9645390070921986, "grad_norm": 1.0826436831731212, "learning_rate": 1.7444533807182357e-05, "loss": 0.3412, "num_input_tokens_seen": 68485120, "step": 263 }, { "epoch": 2.975886524822695, "grad_norm": 1.131271727043019, "learning_rate": 1.7274575140626318e-05, "loss": 0.3666, "num_input_tokens_seen": 68747264, "step": 264 }, { "epoch": 2.9872340425531916, "grad_norm": 1.0929232699820757, "learning_rate": 1.710501030962438e-05, "loss": 0.3787, "num_input_tokens_seen": 69009408, "step": 265 }, { "epoch": 2.998581560283688, "grad_norm": 1.0625348413281066, "learning_rate": 1.6935847958446657e-05, "loss": 0.3623, "num_input_tokens_seen": 69271552, "step": 266 }, { "epoch": 3.0, "grad_norm": 1.0625348413281066, "learning_rate": 1.6767096710845174e-05, "loss": 0.2925, "num_input_tokens_seen": 69304320, "step": 267 }, { "epoch": 3.0113475177304965, "grad_norm": 2.86130141515677, "learning_rate": 1.6598765169614243e-05, "loss": 0.1334, "num_input_tokens_seen": 69566464, "step": 268 }, { "epoch": 3.022695035460993, "grad_norm": 1.5425933485480858, "learning_rate": 1.643086191615194e-05, "loss": 0.1248, "num_input_tokens_seen": 69828608, "step": 269 }, { "epoch": 3.0340425531914894, "grad_norm": 1.2430694797405355, "learning_rate": 1.6263395510022543e-05, "loss": 0.1212, "num_input_tokens_seen": 70090752, "step": 270 }, { "epoch": 3.045390070921986, "grad_norm": 0.9436822278912739, "learning_rate": 1.6096374488520265e-05, "loss": 0.1026, "num_input_tokens_seen": 70352896, "step": 271 }, { "epoch": 3.0567375886524824, "grad_norm": 1.0262534287871352, "learning_rate": 1.5929807366233977e-05, "loss": 0.1147, "num_input_tokens_seen": 70615040, "step": 272 }, { "epoch": 3.068085106382979, "grad_norm": 1.41916462482431, "learning_rate": 1.5763702634613152e-05, "loss": 0.1112, "num_input_tokens_seen": 70877184, "step": 273 }, { "epoch": 3.0794326241134753, "grad_norm": 1.6458131533253877, "learning_rate": 1.559806876153501e-05, "loss": 0.1142, "num_input_tokens_seen": 71139328, "step": 274 }, { "epoch": 3.0907801418439718, "grad_norm": 1.4067415425269045, "learning_rate": 1.5432914190872757e-05, "loss": 0.1094, "num_input_tokens_seen": 71401472, "step": 275 }, { "epoch": 3.1021276595744682, "grad_norm": 1.2565976210206782, "learning_rate": 1.5268247342065215e-05, "loss": 0.1024, "num_input_tokens_seen": 71663616, "step": 276 }, { "epoch": 3.1134751773049647, "grad_norm": 1.0843813224286079, "learning_rate": 1.5104076609687545e-05, "loss": 0.1023, "num_input_tokens_seen": 71925760, "step": 277 }, { "epoch": 3.124822695035461, "grad_norm": 0.9477884459564624, "learning_rate": 1.4940410363023306e-05, "loss": 0.0978, "num_input_tokens_seen": 72187904, "step": 278 }, { "epoch": 3.1361702127659576, "grad_norm": 0.822005236524651, "learning_rate": 1.4777256945637834e-05, "loss": 0.099, "num_input_tokens_seen": 72450048, "step": 279 }, { "epoch": 3.147517730496454, "grad_norm": 0.8023754053464645, "learning_rate": 1.4614624674952842e-05, "loss": 0.0956, "num_input_tokens_seen": 72712192, "step": 280 }, { "epoch": 3.1588652482269506, "grad_norm": 0.8464902902144091, "learning_rate": 1.4452521841822436e-05, "loss": 0.0998, "num_input_tokens_seen": 72974336, "step": 281 }, { "epoch": 3.1702127659574466, "grad_norm": 0.7987684436490388, "learning_rate": 1.4290956710110475e-05, "loss": 0.0945, "num_input_tokens_seen": 73236480, "step": 282 }, { "epoch": 3.181560283687943, "grad_norm": 0.8094393208784639, "learning_rate": 1.4129937516269203e-05, "loss": 0.0988, "num_input_tokens_seen": 73498624, "step": 283 }, { "epoch": 3.1929078014184396, "grad_norm": 0.8777616390752891, "learning_rate": 1.3969472468919461e-05, "loss": 0.0979, "num_input_tokens_seen": 73760768, "step": 284 }, { "epoch": 3.204255319148936, "grad_norm": 0.8153492833919085, "learning_rate": 1.3809569748432189e-05, "loss": 0.0937, "num_input_tokens_seen": 74022912, "step": 285 }, { "epoch": 3.2156028368794325, "grad_norm": 0.8075245623239392, "learning_rate": 1.3650237506511331e-05, "loss": 0.0895, "num_input_tokens_seen": 74285056, "step": 286 }, { "epoch": 3.226950354609929, "grad_norm": 0.8173043815021659, "learning_rate": 1.3491483865778365e-05, "loss": 0.0924, "num_input_tokens_seen": 74547200, "step": 287 }, { "epoch": 3.2382978723404254, "grad_norm": 0.9317108513266112, "learning_rate": 1.3333316919358157e-05, "loss": 0.0987, "num_input_tokens_seen": 74809344, "step": 288 }, { "epoch": 3.249645390070922, "grad_norm": 0.8288147543181388, "learning_rate": 1.3175744730466408e-05, "loss": 0.0841, "num_input_tokens_seen": 75071488, "step": 289 }, { "epoch": 3.2609929078014184, "grad_norm": 0.8070883717373725, "learning_rate": 1.301877533199859e-05, "loss": 0.0931, "num_input_tokens_seen": 75333632, "step": 290 }, { "epoch": 3.272340425531915, "grad_norm": 0.8688265252758833, "learning_rate": 1.2862416726120396e-05, "loss": 0.0918, "num_input_tokens_seen": 75595776, "step": 291 }, { "epoch": 3.2836879432624113, "grad_norm": 0.7969272205965748, "learning_rate": 1.2706676883859903e-05, "loss": 0.0893, "num_input_tokens_seen": 75857920, "step": 292 }, { "epoch": 3.295035460992908, "grad_norm": 0.7533077115551139, "learning_rate": 1.2551563744701109e-05, "loss": 0.0886, "num_input_tokens_seen": 76120064, "step": 293 }, { "epoch": 3.3063829787234043, "grad_norm": 0.7785634970662512, "learning_rate": 1.2397085216179208e-05, "loss": 0.0883, "num_input_tokens_seen": 76382208, "step": 294 }, { "epoch": 3.3177304964539007, "grad_norm": 0.7903142511464022, "learning_rate": 1.2243249173477513e-05, "loss": 0.0882, "num_input_tokens_seen": 76644352, "step": 295 }, { "epoch": 3.329078014184397, "grad_norm": 0.7742080002169154, "learning_rate": 1.2090063459025955e-05, "loss": 0.0885, "num_input_tokens_seen": 76906496, "step": 296 }, { "epoch": 3.3404255319148937, "grad_norm": 0.8054772194334612, "learning_rate": 1.1937535882101281e-05, "loss": 0.0933, "num_input_tokens_seen": 77168640, "step": 297 }, { "epoch": 3.35177304964539, "grad_norm": 0.7675259582762006, "learning_rate": 1.1785674218428952e-05, "loss": 0.0943, "num_input_tokens_seen": 77430784, "step": 298 }, { "epoch": 3.3631205673758866, "grad_norm": 0.7414624503341706, "learning_rate": 1.163448620978674e-05, "loss": 0.0883, "num_input_tokens_seen": 77692928, "step": 299 }, { "epoch": 3.374468085106383, "grad_norm": 0.8192053000270058, "learning_rate": 1.148397956361007e-05, "loss": 0.0945, "num_input_tokens_seen": 77955072, "step": 300 }, { "epoch": 3.3858156028368795, "grad_norm": 0.7961744887050209, "learning_rate": 1.1334161952599054e-05, "loss": 0.0859, "num_input_tokens_seen": 78217216, "step": 301 }, { "epoch": 3.397163120567376, "grad_norm": 0.806567989025415, "learning_rate": 1.1185041014327433e-05, "loss": 0.0982, "num_input_tokens_seen": 78479360, "step": 302 }, { "epoch": 3.4085106382978725, "grad_norm": 0.7627938171296575, "learning_rate": 1.1036624350853145e-05, "loss": 0.0891, "num_input_tokens_seen": 78741504, "step": 303 }, { "epoch": 3.419858156028369, "grad_norm": 0.7764856826754996, "learning_rate": 1.0888919528330777e-05, "loss": 0.0905, "num_input_tokens_seen": 79003648, "step": 304 }, { "epoch": 3.4312056737588654, "grad_norm": 0.8011164595440228, "learning_rate": 1.0741934076625895e-05, "loss": 0.0891, "num_input_tokens_seen": 79265792, "step": 305 }, { "epoch": 3.4425531914893615, "grad_norm": 0.7764472737957309, "learning_rate": 1.059567548893118e-05, "loss": 0.0869, "num_input_tokens_seen": 79527936, "step": 306 }, { "epoch": 3.453900709219858, "grad_norm": 0.7201240824906335, "learning_rate": 1.0450151221384358e-05, "loss": 0.09, "num_input_tokens_seen": 79790080, "step": 307 }, { "epoch": 3.4652482269503544, "grad_norm": 0.8109600636209235, "learning_rate": 1.0305368692688174e-05, "loss": 0.0876, "num_input_tokens_seen": 80052224, "step": 308 }, { "epoch": 3.476595744680851, "grad_norm": 0.7559387558842937, "learning_rate": 1.016133528373215e-05, "loss": 0.0875, "num_input_tokens_seen": 80314368, "step": 309 }, { "epoch": 3.4879432624113473, "grad_norm": 0.7260621903497292, "learning_rate": 1.0018058337216327e-05, "loss": 0.079, "num_input_tokens_seen": 80576512, "step": 310 }, { "epoch": 3.499290780141844, "grad_norm": 0.7591840412491047, "learning_rate": 9.875545157276939e-06, "loss": 0.0793, "num_input_tokens_seen": 80838656, "step": 311 }, { "epoch": 3.5106382978723403, "grad_norm": 0.741059488771299, "learning_rate": 9.733803009114045e-06, "loss": 0.0881, "num_input_tokens_seen": 81100800, "step": 312 }, { "epoch": 3.5219858156028367, "grad_norm": 0.7374917809789198, "learning_rate": 9.592839118621187e-06, "loss": 0.0909, "num_input_tokens_seen": 81362944, "step": 313 }, { "epoch": 3.533333333333333, "grad_norm": 0.7828847705307154, "learning_rate": 9.452660672016949e-06, "loss": 0.0808, "num_input_tokens_seen": 81625088, "step": 314 }, { "epoch": 3.5446808510638297, "grad_norm": 0.7578842723626861, "learning_rate": 9.313274815478698e-06, "loss": 0.0853, "num_input_tokens_seen": 81887232, "step": 315 }, { "epoch": 3.556028368794326, "grad_norm": 0.8012195295633705, "learning_rate": 9.174688654778243e-06, "loss": 0.0913, "num_input_tokens_seen": 82149376, "step": 316 }, { "epoch": 3.5673758865248226, "grad_norm": 0.7505651055808276, "learning_rate": 9.036909254919549e-06, "loss": 0.0851, "num_input_tokens_seen": 82411520, "step": 317 }, { "epoch": 3.578723404255319, "grad_norm": 0.7563671980529104, "learning_rate": 8.899943639778619e-06, "loss": 0.0898, "num_input_tokens_seen": 82673664, "step": 318 }, { "epoch": 3.5900709219858156, "grad_norm": 0.7466202200881212, "learning_rate": 8.763798791745411e-06, "loss": 0.0808, "num_input_tokens_seen": 82935808, "step": 319 }, { "epoch": 3.601418439716312, "grad_norm": 0.6880453348725938, "learning_rate": 8.628481651367876e-06, "loss": 0.084, "num_input_tokens_seen": 83197952, "step": 320 }, { "epoch": 3.6127659574468085, "grad_norm": 0.7694297499307822, "learning_rate": 8.49399911699814e-06, "loss": 0.0957, "num_input_tokens_seen": 83460096, "step": 321 }, { "epoch": 3.624113475177305, "grad_norm": 0.7252376317705156, "learning_rate": 8.360358044440797e-06, "loss": 0.0893, "num_input_tokens_seen": 83722240, "step": 322 }, { "epoch": 3.6354609929078014, "grad_norm": 0.7725027392222746, "learning_rate": 8.227565246603493e-06, "loss": 0.0804, "num_input_tokens_seen": 83984384, "step": 323 }, { "epoch": 3.646808510638298, "grad_norm": 0.6952306274806225, "learning_rate": 8.09562749314952e-06, "loss": 0.0868, "num_input_tokens_seen": 84246528, "step": 324 }, { "epoch": 3.6581560283687944, "grad_norm": 0.7076094401023963, "learning_rate": 7.96455151015272e-06, "loss": 0.089, "num_input_tokens_seen": 84508672, "step": 325 }, { "epoch": 3.669503546099291, "grad_norm": 0.7549486979904751, "learning_rate": 7.83434397975466e-06, "loss": 0.0866, "num_input_tokens_seen": 84770816, "step": 326 }, { "epoch": 3.6808510638297873, "grad_norm": 0.7136449420637923, "learning_rate": 7.705011539823911e-06, "loss": 0.0815, "num_input_tokens_seen": 85032960, "step": 327 }, { "epoch": 3.692198581560284, "grad_norm": 0.7048331650984001, "learning_rate": 7.576560783617668e-06, "loss": 0.0818, "num_input_tokens_seen": 85295104, "step": 328 }, { "epoch": 3.7035460992907803, "grad_norm": 0.7432477427581144, "learning_rate": 7.448998259445664e-06, "loss": 0.0857, "num_input_tokens_seen": 85557248, "step": 329 }, { "epoch": 3.7148936170212767, "grad_norm": 0.7596413190844352, "learning_rate": 7.3223304703363135e-06, "loss": 0.0768, "num_input_tokens_seen": 85819392, "step": 330 }, { "epoch": 3.726241134751773, "grad_norm": 0.743937151653106, "learning_rate": 7.196563873705209e-06, "loss": 0.0924, "num_input_tokens_seen": 86081536, "step": 331 }, { "epoch": 3.7375886524822697, "grad_norm": 0.754620960814025, "learning_rate": 7.071704881025915e-06, "loss": 0.0864, "num_input_tokens_seen": 86343680, "step": 332 }, { "epoch": 3.748936170212766, "grad_norm": 0.7675088057052156, "learning_rate": 6.947759857503119e-06, "loss": 0.0887, "num_input_tokens_seen": 86605824, "step": 333 }, { "epoch": 3.7602836879432626, "grad_norm": 0.7509318055805201, "learning_rate": 6.824735121748163e-06, "loss": 0.083, "num_input_tokens_seen": 86867968, "step": 334 }, { "epoch": 3.771631205673759, "grad_norm": 0.7079671243094418, "learning_rate": 6.70263694545687e-06, "loss": 0.0783, "num_input_tokens_seen": 87130112, "step": 335 }, { "epoch": 3.7829787234042556, "grad_norm": 0.6902777214012117, "learning_rate": 6.5814715530898745e-06, "loss": 0.077, "num_input_tokens_seen": 87392256, "step": 336 }, { "epoch": 3.794326241134752, "grad_norm": 0.736131108315663, "learning_rate": 6.461245121555307e-06, "loss": 0.0832, "num_input_tokens_seen": 87654400, "step": 337 }, { "epoch": 3.8056737588652485, "grad_norm": 0.7602848818372718, "learning_rate": 6.341963779893828e-06, "loss": 0.0815, "num_input_tokens_seen": 87916544, "step": 338 }, { "epoch": 3.8170212765957445, "grad_norm": 0.7035246279871188, "learning_rate": 6.223633608966254e-06, "loss": 0.0802, "num_input_tokens_seen": 88178688, "step": 339 }, { "epoch": 3.828368794326241, "grad_norm": 0.7007067705436694, "learning_rate": 6.106260641143546e-06, "loss": 0.0866, "num_input_tokens_seen": 88440832, "step": 340 }, { "epoch": 3.8397163120567375, "grad_norm": 0.7399965287319934, "learning_rate": 5.989850859999227e-06, "loss": 0.0768, "num_input_tokens_seen": 88702976, "step": 341 }, { "epoch": 3.851063829787234, "grad_norm": 0.7007319108851067, "learning_rate": 5.874410200004421e-06, "loss": 0.0744, "num_input_tokens_seen": 88965120, "step": 342 }, { "epoch": 3.8624113475177304, "grad_norm": 0.6845592652085333, "learning_rate": 5.759944546225271e-06, "loss": 0.081, "num_input_tokens_seen": 89227264, "step": 343 }, { "epoch": 3.873758865248227, "grad_norm": 0.68562597454897, "learning_rate": 5.646459734022938e-06, "loss": 0.0708, "num_input_tokens_seen": 89489408, "step": 344 }, { "epoch": 3.8851063829787233, "grad_norm": 0.7125683393543343, "learning_rate": 5.533961548756128e-06, "loss": 0.078, "num_input_tokens_seen": 89751552, "step": 345 }, { "epoch": 3.89645390070922, "grad_norm": 0.7549793531707268, "learning_rate": 5.422455725486114e-06, "loss": 0.0878, "num_input_tokens_seen": 90013696, "step": 346 }, { "epoch": 3.9078014184397163, "grad_norm": 0.743147041969246, "learning_rate": 5.311947948684457e-06, "loss": 0.0825, "num_input_tokens_seen": 90275840, "step": 347 }, { "epoch": 3.9191489361702128, "grad_norm": 0.7265981456361329, "learning_rate": 5.202443851943126e-06, "loss": 0.0818, "num_input_tokens_seen": 90537984, "step": 348 }, { "epoch": 3.9304964539007092, "grad_norm": 0.7077026530723873, "learning_rate": 5.093949017687341e-06, "loss": 0.081, "num_input_tokens_seen": 90800128, "step": 349 }, { "epoch": 3.9418439716312057, "grad_norm": 0.7036326076512418, "learning_rate": 4.986468976890993e-06, "loss": 0.0747, "num_input_tokens_seen": 91062272, "step": 350 }, { "epoch": 3.953191489361702, "grad_norm": 0.6905130968128987, "learning_rate": 4.880009208794667e-06, "loss": 0.0811, "num_input_tokens_seen": 91324416, "step": 351 }, { "epoch": 3.9645390070921986, "grad_norm": 0.7073702264699631, "learning_rate": 4.7745751406263165e-06, "loss": 0.0749, "num_input_tokens_seen": 91586560, "step": 352 }, { "epoch": 3.975886524822695, "grad_norm": 0.6905489552770284, "learning_rate": 4.670172147324592e-06, "loss": 0.0787, "num_input_tokens_seen": 91848704, "step": 353 }, { "epoch": 3.9872340425531916, "grad_norm": 0.7261963446135224, "learning_rate": 4.566805551264827e-06, "loss": 0.0811, "num_input_tokens_seen": 92110848, "step": 354 }, { "epoch": 3.998581560283688, "grad_norm": 0.7198472258747808, "learning_rate": 4.4644806219877184e-06, "loss": 0.0738, "num_input_tokens_seen": 92372992, "step": 355 }, { "epoch": 4.0, "grad_norm": 0.7198472258747808, "learning_rate": 4.36320257593065e-06, "loss": 0.0466, "num_input_tokens_seen": 92405760, "step": 356 }, { "epoch": 4.0113475177304965, "grad_norm": 1.4688330189672048, "learning_rate": 4.262976576161842e-06, "loss": 0.0249, "num_input_tokens_seen": 92667904, "step": 357 }, { "epoch": 4.022695035460993, "grad_norm": 0.5330531371566788, "learning_rate": 4.1638077321170646e-06, "loss": 0.0231, "num_input_tokens_seen": 92930048, "step": 358 }, { "epoch": 4.034042553191489, "grad_norm": 0.4922654159647282, "learning_rate": 4.0657010993391865e-06, "loss": 0.0222, "num_input_tokens_seen": 93192192, "step": 359 }, { "epoch": 4.045390070921986, "grad_norm": 0.4561531097868261, "learning_rate": 3.968661679220468e-06, "loss": 0.0197, "num_input_tokens_seen": 93454336, "step": 360 }, { "epoch": 4.056737588652482, "grad_norm": 0.42335290314314167, "learning_rate": 3.872694418747594e-06, "loss": 0.0189, "num_input_tokens_seen": 93716480, "step": 361 }, { "epoch": 4.068085106382979, "grad_norm": 0.4008499835789085, "learning_rate": 3.777804210249436e-06, "loss": 0.0176, "num_input_tokens_seen": 93978624, "step": 362 }, { "epoch": 4.079432624113475, "grad_norm": 0.38452603039597705, "learning_rate": 3.6839958911476957e-06, "loss": 0.0198, "num_input_tokens_seen": 94240768, "step": 363 }, { "epoch": 4.090780141843972, "grad_norm": 0.3766839792161632, "learning_rate": 3.591274243710277e-06, "loss": 0.0177, "num_input_tokens_seen": 94502912, "step": 364 }, { "epoch": 4.102127659574468, "grad_norm": 0.3464025018288397, "learning_rate": 3.499643994807486e-06, "loss": 0.0165, "num_input_tokens_seen": 94765056, "step": 365 }, { "epoch": 4.113475177304965, "grad_norm": 0.3369150140378179, "learning_rate": 3.4091098156710744e-06, "loss": 0.0161, "num_input_tokens_seen": 95027200, "step": 366 }, { "epoch": 4.124822695035461, "grad_norm": 0.31335039715603397, "learning_rate": 3.319676321656082e-06, "loss": 0.0156, "num_input_tokens_seen": 95289344, "step": 367 }, { "epoch": 4.136170212765958, "grad_norm": 0.37158481677020094, "learning_rate": 3.2313480720055745e-06, "loss": 0.0165, "num_input_tokens_seen": 95551488, "step": 368 }, { "epoch": 4.147517730496454, "grad_norm": 0.37465306599622433, "learning_rate": 3.1441295696181897e-06, "loss": 0.0152, "num_input_tokens_seen": 95813632, "step": 369 }, { "epoch": 4.158865248226951, "grad_norm": 0.35876073758096233, "learning_rate": 3.058025260818609e-06, "loss": 0.0149, "num_input_tokens_seen": 96075776, "step": 370 }, { "epoch": 4.170212765957447, "grad_norm": 0.3949875998401012, "learning_rate": 2.9730395351308866e-06, "loss": 0.0153, "num_input_tokens_seen": 96337920, "step": 371 }, { "epoch": 4.1815602836879435, "grad_norm": 0.3828061015559776, "learning_rate": 2.889176725054643e-06, "loss": 0.0173, "num_input_tokens_seen": 96600064, "step": 372 }, { "epoch": 4.19290780141844, "grad_norm": 0.45575162609289305, "learning_rate": 2.80644110584424e-06, "loss": 0.0148, "num_input_tokens_seen": 96862208, "step": 373 }, { "epoch": 4.2042553191489365, "grad_norm": 0.42419973216850165, "learning_rate": 2.7248368952908053e-06, "loss": 0.0165, "num_input_tokens_seen": 97124352, "step": 374 }, { "epoch": 4.215602836879433, "grad_norm": 0.42022424947578696, "learning_rate": 2.6443682535072177e-06, "loss": 0.0154, "num_input_tokens_seen": 97386496, "step": 375 }, { "epoch": 4.226950354609929, "grad_norm": 0.4092868425580408, "learning_rate": 2.565039282716045e-06, "loss": 0.0149, "num_input_tokens_seen": 97648640, "step": 376 }, { "epoch": 4.238297872340426, "grad_norm": 0.403775041864487, "learning_rate": 2.486854027040375e-06, "loss": 0.0143, "num_input_tokens_seen": 97910784, "step": 377 }, { "epoch": 4.249645390070922, "grad_norm": 0.400415704748068, "learning_rate": 2.4098164722977073e-06, "loss": 0.016, "num_input_tokens_seen": 98172928, "step": 378 }, { "epoch": 4.260992907801419, "grad_norm": 0.3970826955886615, "learning_rate": 2.333930545796717e-06, "loss": 0.0159, "num_input_tokens_seen": 98435072, "step": 379 }, { "epoch": 4.272340425531915, "grad_norm": 0.4227943452768279, "learning_rate": 2.2592001161370392e-06, "loss": 0.0162, "num_input_tokens_seen": 98697216, "step": 380 }, { "epoch": 4.283687943262412, "grad_norm": 0.38818903700312996, "learning_rate": 2.185628993012101e-06, "loss": 0.0144, "num_input_tokens_seen": 98959360, "step": 381 }, { "epoch": 4.295035460992908, "grad_norm": 0.37517565866002883, "learning_rate": 2.11322092701485e-06, "loss": 0.0147, "num_input_tokens_seen": 99221504, "step": 382 }, { "epoch": 4.306382978723404, "grad_norm": 0.4122413600819348, "learning_rate": 2.0419796094465788e-06, "loss": 0.0149, "num_input_tokens_seen": 99483648, "step": 383 }, { "epoch": 4.317730496453901, "grad_norm": 0.3844299207768967, "learning_rate": 1.97190867212875e-06, "loss": 0.0163, "num_input_tokens_seen": 99745792, "step": 384 }, { "epoch": 4.329078014184397, "grad_norm": 0.39386937733627553, "learning_rate": 1.9030116872178316e-06, "loss": 0.0147, "num_input_tokens_seen": 100007936, "step": 385 }, { "epoch": 4.340425531914893, "grad_norm": 0.338450589586265, "learning_rate": 1.8352921670232143e-06, "loss": 0.0142, "num_input_tokens_seen": 100270080, "step": 386 }, { "epoch": 4.35177304964539, "grad_norm": 0.35418453635661823, "learning_rate": 1.768753563828135e-06, "loss": 0.0148, "num_input_tokens_seen": 100532224, "step": 387 }, { "epoch": 4.363120567375886, "grad_norm": 0.35934554094848764, "learning_rate": 1.703399269713693e-06, "loss": 0.0148, "num_input_tokens_seen": 100794368, "step": 388 }, { "epoch": 4.374468085106383, "grad_norm": 0.34543051645814415, "learning_rate": 1.6392326163859273e-06, "loss": 0.0158, "num_input_tokens_seen": 101056512, "step": 389 }, { "epoch": 4.385815602836879, "grad_norm": 0.3413445651057825, "learning_rate": 1.5762568750059604e-06, "loss": 0.0149, "num_input_tokens_seen": 101318656, "step": 390 }, { "epoch": 4.397163120567376, "grad_norm": 0.352456974681825, "learning_rate": 1.5144752560232372e-06, "loss": 0.0159, "num_input_tokens_seen": 101580800, "step": 391 }, { "epoch": 4.408510638297872, "grad_norm": 0.341100842734161, "learning_rate": 1.4538909090118846e-06, "loss": 0.0137, "num_input_tokens_seen": 101842944, "step": 392 }, { "epoch": 4.4198581560283685, "grad_norm": 0.35870504615922777, "learning_rate": 1.3945069225101026e-06, "loss": 0.0151, "num_input_tokens_seen": 102105088, "step": 393 }, { "epoch": 4.431205673758865, "grad_norm": 0.35676458850601245, "learning_rate": 1.3363263238627493e-06, "loss": 0.0166, "num_input_tokens_seen": 102367232, "step": 394 }, { "epoch": 4.4425531914893615, "grad_norm": 0.3756541331179786, "learning_rate": 1.2793520790670116e-06, "loss": 0.0152, "num_input_tokens_seen": 102629376, "step": 395 }, { "epoch": 4.453900709219858, "grad_norm": 0.3445790534591682, "learning_rate": 1.2235870926211619e-06, "loss": 0.0144, "num_input_tokens_seen": 102891520, "step": 396 }, { "epoch": 4.465248226950354, "grad_norm": 0.34515720116215964, "learning_rate": 1.1690342073765375e-06, "loss": 0.0144, "num_input_tokens_seen": 103153664, "step": 397 }, { "epoch": 4.476595744680851, "grad_norm": 0.345726417626317, "learning_rate": 1.1156962043925828e-06, "loss": 0.0158, "num_input_tokens_seen": 103415808, "step": 398 }, { "epoch": 4.487943262411347, "grad_norm": 0.3542601574103813, "learning_rate": 1.0635758027950888e-06, "loss": 0.0133, "num_input_tokens_seen": 103677952, "step": 399 }, { "epoch": 4.499290780141844, "grad_norm": 0.32899237428693795, "learning_rate": 1.0126756596375686e-06, "loss": 0.016, "num_input_tokens_seen": 103940096, "step": 400 }, { "epoch": 4.51063829787234, "grad_norm": 0.3481316804818486, "learning_rate": 9.629983697657886e-07, "loss": 0.0165, "num_input_tokens_seen": 104202240, "step": 401 }, { "epoch": 4.521985815602837, "grad_norm": 0.3233545153466053, "learning_rate": 9.145464656855257e-07, "loss": 0.0134, "num_input_tokens_seen": 104464384, "step": 402 }, { "epoch": 4.533333333333333, "grad_norm": 0.33628106449340045, "learning_rate": 8.673224174334221e-07, "loss": 0.0132, "num_input_tokens_seen": 104726528, "step": 403 }, { "epoch": 4.54468085106383, "grad_norm": 0.3399953670308035, "learning_rate": 8.213286324510738e-07, "loss": 0.0156, "num_input_tokens_seen": 104988672, "step": 404 }, { "epoch": 4.556028368794326, "grad_norm": 0.3850226318275653, "learning_rate": 7.765674554623181e-07, "loss": 0.0165, "num_input_tokens_seen": 105250816, "step": 405 }, { "epoch": 4.567375886524823, "grad_norm": 0.353247574363122, "learning_rate": 7.330411683536876e-07, "loss": 0.0144, "num_input_tokens_seen": 105512960, "step": 406 }, { "epoch": 4.578723404255319, "grad_norm": 0.32706270694141454, "learning_rate": 6.907519900580861e-07, "loss": 0.0153, "num_input_tokens_seen": 105775104, "step": 407 }, { "epoch": 4.590070921985816, "grad_norm": 0.37578666168375224, "learning_rate": 6.497020764416633e-07, "loss": 0.0156, "num_input_tokens_seen": 106037248, "step": 408 }, { "epoch": 4.601418439716312, "grad_norm": 0.3185935927438266, "learning_rate": 6.098935201939187e-07, "loss": 0.0136, "num_input_tokens_seen": 106299392, "step": 409 }, { "epoch": 4.6127659574468085, "grad_norm": 0.3198588451035788, "learning_rate": 5.713283507210148e-07, "loss": 0.0139, "num_input_tokens_seen": 106561536, "step": 410 }, { "epoch": 4.624113475177305, "grad_norm": 0.3127004347421057, "learning_rate": 5.340085340423129e-07, "loss": 0.0144, "num_input_tokens_seen": 106823680, "step": 411 }, { "epoch": 4.6354609929078014, "grad_norm": 0.3412415661804973, "learning_rate": 4.979359726901639e-07, "loss": 0.0131, "num_input_tokens_seen": 107085824, "step": 412 }, { "epoch": 4.646808510638298, "grad_norm": 0.3048113188870906, "learning_rate": 4.63112505612906e-07, "loss": 0.0131, "num_input_tokens_seen": 107347968, "step": 413 }, { "epoch": 4.658156028368794, "grad_norm": 0.3005056677101423, "learning_rate": 4.2953990808111135e-07, "loss": 0.015, "num_input_tokens_seen": 107610112, "step": 414 }, { "epoch": 4.669503546099291, "grad_norm": 0.3504689469184949, "learning_rate": 3.972198915970976e-07, "loss": 0.0152, "num_input_tokens_seen": 107872256, "step": 415 }, { "epoch": 4.680851063829787, "grad_norm": 0.38892190993563536, "learning_rate": 3.6615410380767544e-07, "loss": 0.0153, "num_input_tokens_seen": 108134400, "step": 416 }, { "epoch": 4.692198581560284, "grad_norm": 0.3234118453449877, "learning_rate": 3.3634412842014353e-07, "loss": 0.0139, "num_input_tokens_seen": 108396544, "step": 417 }, { "epoch": 4.70354609929078, "grad_norm": 0.33351975045706705, "learning_rate": 3.077914851215585e-07, "loss": 0.0139, "num_input_tokens_seen": 108658688, "step": 418 }, { "epoch": 4.714893617021277, "grad_norm": 0.3243353662442738, "learning_rate": 2.804976295012612e-07, "loss": 0.014, "num_input_tokens_seen": 108920832, "step": 419 }, { "epoch": 4.726241134751773, "grad_norm": 0.3191756927282505, "learning_rate": 2.544639529766829e-07, "loss": 0.0137, "num_input_tokens_seen": 109182976, "step": 420 }, { "epoch": 4.73758865248227, "grad_norm": 0.34240534777520126, "learning_rate": 2.2969178272238545e-07, "loss": 0.0139, "num_input_tokens_seen": 109445120, "step": 421 }, { "epoch": 4.748936170212766, "grad_norm": 0.2956411840988351, "learning_rate": 2.061823816024322e-07, "loss": 0.0132, "num_input_tokens_seen": 109707264, "step": 422 }, { "epoch": 4.760283687943263, "grad_norm": 0.36482955125245486, "learning_rate": 1.8393694810599493e-07, "loss": 0.016, "num_input_tokens_seen": 109969408, "step": 423 }, { "epoch": 4.771631205673759, "grad_norm": 0.3332106952260904, "learning_rate": 1.6295661628624447e-07, "loss": 0.0136, "num_input_tokens_seen": 110231552, "step": 424 }, { "epoch": 4.782978723404256, "grad_norm": 0.3411172110927968, "learning_rate": 1.4324245570256633e-07, "loss": 0.0169, "num_input_tokens_seen": 110493696, "step": 425 }, { "epoch": 4.794326241134752, "grad_norm": 0.34024588053935717, "learning_rate": 1.2479547136600989e-07, "loss": 0.0133, "num_input_tokens_seen": 110755840, "step": 426 }, { "epoch": 4.8056737588652485, "grad_norm": 0.34496629497126063, "learning_rate": 1.0761660368806548e-07, "loss": 0.0144, "num_input_tokens_seen": 111017984, "step": 427 }, { "epoch": 4.817021276595745, "grad_norm": 0.33516309634270675, "learning_rate": 9.170672843271666e-08, "loss": 0.0153, "num_input_tokens_seen": 111280128, "step": 428 }, { "epoch": 4.828368794326241, "grad_norm": 0.35890460501548294, "learning_rate": 7.706665667180091e-08, "loss": 0.0127, "num_input_tokens_seen": 111542272, "step": 429 }, { "epoch": 4.839716312056738, "grad_norm": 0.3167706249520597, "learning_rate": 6.369713474366212e-08, "loss": 0.0139, "num_input_tokens_seen": 111804416, "step": 430 }, { "epoch": 4.851063829787234, "grad_norm": 0.34919750392535776, "learning_rate": 5.159884421509498e-08, "loss": 0.0142, "num_input_tokens_seen": 112066560, "step": 431 }, { "epoch": 4.862411347517731, "grad_norm": 0.322002789312833, "learning_rate": 4.07724018466088e-08, "loss": 0.0156, "num_input_tokens_seen": 112328704, "step": 432 }, { "epoch": 4.873758865248227, "grad_norm": 0.35090105881839767, "learning_rate": 3.1218359560974966e-08, "loss": 0.0154, "num_input_tokens_seen": 112590848, "step": 433 }, { "epoch": 4.885106382978723, "grad_norm": 0.39964488827914374, "learning_rate": 2.2937204415107717e-08, "loss": 0.0151, "num_input_tokens_seen": 112852992, "step": 434 }, { "epoch": 4.89645390070922, "grad_norm": 0.3076123689331371, "learning_rate": 1.5929358575206275e-08, "loss": 0.0144, "num_input_tokens_seen": 113115136, "step": 435 }, { "epoch": 4.907801418439716, "grad_norm": 0.34810902058971566, "learning_rate": 1.0195179295269252e-08, "loss": 0.0146, "num_input_tokens_seen": 113377280, "step": 436 }, { "epoch": 4.919148936170213, "grad_norm": 0.3213963089203548, "learning_rate": 5.7349588988481194e-09, "loss": 0.0125, "num_input_tokens_seen": 113639424, "step": 437 }, { "epoch": 4.930496453900709, "grad_norm": 0.32788714163055016, "learning_rate": 2.5489247641674596e-09, "loss": 0.0129, "num_input_tokens_seen": 113901568, "step": 438 }, { "epoch": 4.941843971631206, "grad_norm": 0.36471489051146844, "learning_rate": 6.372393125203546e-10, "loss": 0.0148, "num_input_tokens_seen": 114163712, "step": 439 }, { "epoch": 4.953191489361702, "grad_norm": 0.3392645067497023, "learning_rate": 0.0, "loss": 0.0161, "num_input_tokens_seen": 114425856, "step": 440 }, { "epoch": 4.953191489361702, "num_input_tokens_seen": 114425856, "step": 440, "total_flos": 182736094494720.0, "train_loss": 0.6214238642672585, "train_runtime": 10972.3782, "train_samples_per_second": 10.27, "train_steps_per_second": 0.04 } ], "logging_steps": 1, "max_steps": 440, "num_input_tokens_seen": 114425856, "num_train_epochs": 5, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 182736094494720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }