{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 453,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.033112582781456956,
      "grad_norm": 2.2606041431427,
      "learning_rate": 4.998497170031657e-05,
      "loss": 0.6997,
      "num_input_tokens_seen": 4912,
      "step": 5
    },
    {
      "epoch": 0.06622516556291391,
      "grad_norm": 2.3543715476989746,
      "learning_rate": 4.9939904869249616e-05,
      "loss": 0.3509,
      "num_input_tokens_seen": 9936,
      "step": 10
    },
    {
      "epoch": 0.09933774834437085,
      "grad_norm": 1.801797866821289,
      "learning_rate": 4.9864853689026556e-05,
      "loss": 0.2853,
      "num_input_tokens_seen": 14864,
      "step": 15
    },
    {
      "epoch": 0.13245033112582782,
      "grad_norm": 1.3575855493545532,
      "learning_rate": 4.975990839097764e-05,
      "loss": 0.2456,
      "num_input_tokens_seen": 19824,
      "step": 20
    },
    {
      "epoch": 0.16556291390728478,
      "grad_norm": 2.5929694175720215,
      "learning_rate": 4.9625195147054034e-05,
      "loss": 0.2084,
      "num_input_tokens_seen": 24480,
      "step": 25
    },
    {
      "epoch": 0.1986754966887417,
      "grad_norm": 0.8744802474975586,
      "learning_rate": 4.9460875918135804e-05,
      "loss": 0.1626,
      "num_input_tokens_seen": 29296,
      "step": 30
    },
    {
      "epoch": 0.23178807947019867,
      "grad_norm": 2.857471466064453,
      "learning_rate": 4.9267148259312224e-05,
      "loss": 0.1405,
      "num_input_tokens_seen": 33936,
      "step": 35
    },
    {
      "epoch": 0.26490066225165565,
      "grad_norm": 1.7763206958770752,
      "learning_rate": 4.9044245082368415e-05,
      "loss": 0.1182,
      "num_input_tokens_seen": 39056,
      "step": 40
    },
    {
      "epoch": 0.2980132450331126,
      "grad_norm": 2.78945255279541,
      "learning_rate": 4.879243437576383e-05,
      "loss": 0.1285,
      "num_input_tokens_seen": 43520,
      "step": 45
    },
    {
      "epoch": 0.33112582781456956,
      "grad_norm": 1.1319329738616943,
      "learning_rate": 4.8512018882439475e-05,
      "loss": 0.0906,
      "num_input_tokens_seen": 48656,
      "step": 50
    },
    {
      "epoch": 0.36423841059602646,
      "grad_norm": 0.8884170055389404,
      "learning_rate": 4.820333573584091e-05,
      "loss": 0.054,
      "num_input_tokens_seen": 53600,
      "step": 55
    },
    {
      "epoch": 0.3973509933774834,
      "grad_norm": 2.0057179927825928,
      "learning_rate": 4.786675605459487e-05,
      "loss": 0.073,
      "num_input_tokens_seen": 58384,
      "step": 60
    },
    {
      "epoch": 0.4304635761589404,
      "grad_norm": 1.3076105117797852,
      "learning_rate": 4.7502684496326746e-05,
      "loss": 0.0524,
      "num_input_tokens_seen": 63152,
      "step": 65
    },
    {
      "epoch": 0.46357615894039733,
      "grad_norm": 1.074171543121338,
      "learning_rate": 4.711155877115523e-05,
      "loss": 0.0534,
      "num_input_tokens_seen": 67888,
      "step": 70
    },
    {
      "epoch": 0.4966887417218543,
      "grad_norm": 1.634122371673584,
      "learning_rate": 4.669384911544927e-05,
      "loss": 0.0759,
      "num_input_tokens_seen": 72480,
      "step": 75
    },
    {
      "epoch": 0.5298013245033113,
      "grad_norm": 1.0059384107589722,
      "learning_rate": 4.625005772647979e-05,
      "loss": 0.0766,
      "num_input_tokens_seen": 77120,
      "step": 80
    },
    {
      "epoch": 0.5629139072847682,
      "grad_norm": 1.3958277702331543,
      "learning_rate": 4.578071815864602e-05,
      "loss": 0.0383,
      "num_input_tokens_seen": 81584,
      "step": 85
    },
    {
      "epoch": 0.5960264900662252,
      "grad_norm": 2.721231698989868,
      "learning_rate": 4.528639468200226e-05,
      "loss": 0.0393,
      "num_input_tokens_seen": 86416,
      "step": 90
    },
    {
      "epoch": 0.6291390728476821,
      "grad_norm": 1.4776098728179932,
      "learning_rate": 4.476768160385632e-05,
      "loss": 0.0589,
      "num_input_tokens_seen": 91248,
      "step": 95
    },
    {
      "epoch": 0.6622516556291391,
      "grad_norm": 1.1105955839157104,
      "learning_rate": 4.4225202554255227e-05,
      "loss": 0.0351,
      "num_input_tokens_seen": 95936,
      "step": 100
    },
    {
      "epoch": 0.695364238410596,
      "grad_norm": 1.1301405429840088,
      "learning_rate": 4.3659609736217344e-05,
      "loss": 0.0417,
      "num_input_tokens_seen": 100704,
      "step": 105
    },
    {
      "epoch": 0.7284768211920529,
      "grad_norm": 1.869552493095398,
      "learning_rate": 4.3071583141612135e-05,
      "loss": 0.0437,
      "num_input_tokens_seen": 105376,
      "step": 110
    },
    {
      "epoch": 0.7615894039735099,
      "grad_norm": 1.0497004985809326,
      "learning_rate": 4.2461829733630435e-05,
      "loss": 0.0498,
      "num_input_tokens_seen": 110208,
      "step": 115
    },
    {
      "epoch": 0.7947019867549668,
      "grad_norm": 0.4084171652793884,
      "learning_rate": 4.1831082596828106e-05,
      "loss": 0.0239,
      "num_input_tokens_seen": 114704,
      "step": 120
    },
    {
      "epoch": 0.8278145695364238,
      "grad_norm": 0.4082948565483093,
      "learning_rate": 4.118010005576485e-05,
      "loss": 0.0228,
      "num_input_tokens_seen": 119744,
      "step": 125
    },
    {
      "epoch": 0.8609271523178808,
      "grad_norm": 0.246117502450943,
      "learning_rate": 4.050966476329793e-05,
      "loss": 0.039,
      "num_input_tokens_seen": 124736,
      "step": 130
    },
    {
      "epoch": 0.8940397350993378,
      "grad_norm": 0.5470453500747681,
      "learning_rate": 3.9820582759626825e-05,
      "loss": 0.05,
      "num_input_tokens_seen": 129552,
      "step": 135
    },
    {
      "epoch": 0.9271523178807947,
      "grad_norm": 0.6097426414489746,
      "learning_rate": 3.911368250322014e-05,
      "loss": 0.02,
      "num_input_tokens_seen": 134400,
      "step": 140
    },
    {
      "epoch": 0.9602649006622517,
      "grad_norm": 0.4274413585662842,
      "learning_rate": 3.8389813874789856e-05,
      "loss": 0.0159,
      "num_input_tokens_seen": 139424,
      "step": 145
    },
    {
      "epoch": 0.9933774834437086,
      "grad_norm": 0.1949557512998581,
      "learning_rate": 3.764984715551032e-05,
      "loss": 0.017,
      "num_input_tokens_seen": 144368,
      "step": 150
    },
    {
      "epoch": 1.0264900662251655,
      "grad_norm": 0.9226788878440857,
      "learning_rate": 3.6894671980710574e-05,
      "loss": 0.0298,
      "num_input_tokens_seen": 149040,
      "step": 155
    },
    {
      "epoch": 1.0596026490066226,
      "grad_norm": 1.5730527639389038,
      "learning_rate": 3.612519627029787e-05,
      "loss": 0.0357,
      "num_input_tokens_seen": 153712,
      "step": 160
    },
    {
      "epoch": 1.0927152317880795,
      "grad_norm": 0.5750863552093506,
      "learning_rate": 3.534234513719821e-05,
      "loss": 0.0185,
      "num_input_tokens_seen": 158640,
      "step": 165
    },
    {
      "epoch": 1.1258278145695364,
      "grad_norm": 0.4966561198234558,
      "learning_rate": 3.4547059775126445e-05,
      "loss": 0.0339,
      "num_input_tokens_seen": 163552,
      "step": 170
    },
    {
      "epoch": 1.1589403973509933,
      "grad_norm": 0.6601276993751526,
      "learning_rate": 3.3740296327022984e-05,
      "loss": 0.0224,
      "num_input_tokens_seen": 168352,
      "step": 175
    },
    {
      "epoch": 1.1920529801324504,
      "grad_norm": 1.2051963806152344,
      "learning_rate": 3.292302473551757e-05,
      "loss": 0.0192,
      "num_input_tokens_seen": 173312,
      "step": 180
    },
    {
      "epoch": 1.2251655629139073,
      "grad_norm": 0.48197436332702637,
      "learning_rate": 3.20962275768022e-05,
      "loss": 0.0275,
      "num_input_tokens_seen": 178160,
      "step": 185
    },
    {
      "epoch": 1.2582781456953642,
      "grad_norm": 0.3365241289138794,
      "learning_rate": 3.126089887931515e-05,
      "loss": 0.0086,
      "num_input_tokens_seen": 182944,
      "step": 190
    },
    {
      "epoch": 1.2913907284768211,
      "grad_norm": 0.5984787940979004,
      "learning_rate": 3.0418042928656414e-05,
      "loss": 0.0166,
      "num_input_tokens_seen": 187920,
      "step": 195
    },
    {
      "epoch": 1.3245033112582782,
      "grad_norm": 0.3351554274559021,
      "learning_rate": 2.9568673060171326e-05,
      "loss": 0.0167,
      "num_input_tokens_seen": 192992,
      "step": 200
    },
    {
      "epoch": 1.3576158940397351,
      "grad_norm": 0.21267688274383545,
      "learning_rate": 2.8713810440653926e-05,
      "loss": 0.0137,
      "num_input_tokens_seen": 197616,
      "step": 205
    },
    {
      "epoch": 1.390728476821192,
      "grad_norm": 0.17382262647151947,
      "learning_rate": 2.7854482840634965e-05,
      "loss": 0.0142,
      "num_input_tokens_seen": 202400,
      "step": 210
    },
    {
      "epoch": 1.423841059602649,
      "grad_norm": 0.552691638469696,
      "learning_rate": 2.6991723398730383e-05,
      "loss": 0.0136,
      "num_input_tokens_seen": 207216,
      "step": 215
    },
    {
      "epoch": 1.4569536423841059,
      "grad_norm": 0.4015306532382965,
      "learning_rate": 2.6126569379535985e-05,
      "loss": 0.0184,
      "num_input_tokens_seen": 211744,
      "step": 220
    },
    {
      "epoch": 1.490066225165563,
      "grad_norm": 0.20611760020256042,
      "learning_rate": 2.526006092656161e-05,
      "loss": 0.0098,
      "num_input_tokens_seen": 216608,
      "step": 225
    },
    {
      "epoch": 1.5231788079470199,
      "grad_norm": 0.3923095762729645,
      "learning_rate": 2.4393239811704e-05,
      "loss": 0.012,
      "num_input_tokens_seen": 221552,
      "step": 230
    },
    {
      "epoch": 1.5562913907284768,
      "grad_norm": 0.2283228635787964,
      "learning_rate": 2.3527148182762054e-05,
      "loss": 0.0106,
      "num_input_tokens_seen": 226272,
      "step": 235
    },
    {
      "epoch": 1.589403973509934,
      "grad_norm": 1.2066081762313843,
      "learning_rate": 2.2662827310499995e-05,
      "loss": 0.0128,
      "num_input_tokens_seen": 231072,
      "step": 240
    },
    {
      "epoch": 1.6225165562913908,
      "grad_norm": 0.39357009530067444,
      "learning_rate": 2.1801316336765126e-05,
      "loss": 0.0139,
      "num_input_tokens_seen": 235728,
      "step": 245
    },
    {
      "epoch": 1.6556291390728477,
      "grad_norm": 0.2383226603269577,
      "learning_rate": 2.0943651025164932e-05,
      "loss": 0.0084,
      "num_input_tokens_seen": 240560,
      "step": 250
    },
    {
      "epoch": 1.6887417218543046,
      "grad_norm": 1.2794650793075562,
      "learning_rate": 2.0090862515805898e-05,
      "loss": 0.0143,
      "num_input_tokens_seen": 245408,
      "step": 255
    },
    {
      "epoch": 1.7218543046357615,
      "grad_norm": 0.5334519147872925,
      "learning_rate": 1.9243976085590824e-05,
      "loss": 0.011,
      "num_input_tokens_seen": 250400,
      "step": 260
    },
    {
      "epoch": 1.7549668874172184,
      "grad_norm": 0.32287150621414185,
      "learning_rate": 1.840400991556541e-05,
      "loss": 0.0127,
      "num_input_tokens_seen": 255216,
      "step": 265
    },
    {
      "epoch": 1.7880794701986755,
      "grad_norm": 0.7233603596687317,
      "learning_rate": 1.7571973866795815e-05,
      "loss": 0.0127,
      "num_input_tokens_seen": 260080,
      "step": 270
    },
    {
      "epoch": 1.8211920529801324,
      "grad_norm": 0.2672172784805298,
      "learning_rate": 1.6748868266249114e-05,
      "loss": 0.0122,
      "num_input_tokens_seen": 264848,
      "step": 275
    },
    {
      "epoch": 1.8543046357615895,
      "grad_norm": 0.32350099086761475,
      "learning_rate": 1.5935682704136183e-05,
      "loss": 0.0169,
      "num_input_tokens_seen": 269776,
      "step": 280
    },
    {
      "epoch": 1.8874172185430464,
      "grad_norm": 0.26331964135169983,
      "learning_rate": 1.5133394844163093e-05,
      "loss": 0.0242,
      "num_input_tokens_seen": 274752,
      "step": 285
    },
    {
      "epoch": 1.9205298013245033,
      "grad_norm": 0.25610876083374023,
      "learning_rate": 1.4342969248121185e-05,
      "loss": 0.0079,
      "num_input_tokens_seen": 279440,
      "step": 290
    },
    {
      "epoch": 1.9536423841059603,
      "grad_norm": 1.2221907377243042,
      "learning_rate": 1.3565356216229268e-05,
      "loss": 0.0194,
      "num_input_tokens_seen": 284288,
      "step": 295
    },
    {
      "epoch": 1.9867549668874172,
      "grad_norm": 0.46571576595306396,
      "learning_rate": 1.2801490644621789e-05,
      "loss": 0.0133,
      "num_input_tokens_seen": 289216,
      "step": 300
    },
    {
      "epoch": 2.019867549668874,
      "grad_norm": 0.06845066696405411,
      "learning_rate": 1.2052290901357025e-05,
      "loss": 0.0049,
      "num_input_tokens_seen": 293992,
      "step": 305
    },
    {
      "epoch": 2.052980132450331,
      "grad_norm": 0.3654076159000397,
      "learning_rate": 1.1318657722296097e-05,
      "loss": 0.0121,
      "num_input_tokens_seen": 299048,
      "step": 310
    },
    {
      "epoch": 2.0860927152317883,
      "grad_norm": 0.1736595779657364,
      "learning_rate": 1.0601473128180855e-05,
      "loss": 0.0102,
      "num_input_tokens_seen": 303768,
      "step": 315
    },
    {
      "epoch": 2.119205298013245,
      "grad_norm": 0.15334352850914001,
      "learning_rate": 9.90159936421197e-06,
      "loss": 0.0072,
      "num_input_tokens_seen": 308872,
      "step": 320
    },
    {
      "epoch": 2.152317880794702,
      "grad_norm": 0.15292225778102875,
      "learning_rate": 9.219877863402682e-06,
      "loss": 0.0066,
      "num_input_tokens_seen": 313800,
      "step": 325
    },
    {
      "epoch": 2.185430463576159,
      "grad_norm": 0.12052378058433533,
      "learning_rate": 8.55712823495419e-06,
      "loss": 0.0119,
      "num_input_tokens_seen": 318536,
      "step": 330
    },
    {
      "epoch": 2.218543046357616,
      "grad_norm": 0.16140712797641754,
      "learning_rate": 7.91414727886898e-06,
      "loss": 0.0057,
      "num_input_tokens_seen": 323512,
      "step": 335
    },
    {
      "epoch": 2.251655629139073,
      "grad_norm": 0.49767032265663147,
      "learning_rate": 7.291708027986988e-06,
      "loss": 0.0136,
      "num_input_tokens_seen": 328552,
      "step": 340
    },
    {
      "epoch": 2.2847682119205297,
      "grad_norm": 0.10487841814756393,
      "learning_rate": 6.690558818595943e-06,
      "loss": 0.0078,
      "num_input_tokens_seen": 333272,
      "step": 345
    },
    {
      "epoch": 2.3178807947019866,
      "grad_norm": 0.2247195988893509,
      "learning_rate": 6.111422390733715e-06,
      "loss": 0.0141,
      "num_input_tokens_seen": 337816,
      "step": 350
    },
    {
      "epoch": 2.3509933774834435,
      "grad_norm": 0.15797987580299377,
      "learning_rate": 5.55499501926394e-06,
      "loss": 0.014,
      "num_input_tokens_seen": 342792,
      "step": 355
    },
    {
      "epoch": 2.384105960264901,
      "grad_norm": 0.33236005902290344,
      "learning_rate": 5.02194567676986e-06,
      "loss": 0.0045,
      "num_input_tokens_seen": 347288,
      "step": 360
    },
    {
      "epoch": 2.4172185430463577,
      "grad_norm": 0.3886609375476837,
      "learning_rate": 4.51291522927268e-06,
      "loss": 0.0062,
      "num_input_tokens_seen": 352088,
      "step": 365
    },
    {
      "epoch": 2.4503311258278146,
      "grad_norm": 0.3580506145954132,
      "learning_rate": 4.028515665741439e-06,
      "loss": 0.0028,
      "num_input_tokens_seen": 357096,
      "step": 370
    },
    {
      "epoch": 2.4834437086092715,
      "grad_norm": 0.045303523540496826,
      "learning_rate": 3.5693293623207086e-06,
      "loss": 0.0022,
      "num_input_tokens_seen": 361928,
      "step": 375
    },
    {
      "epoch": 2.5165562913907285,
      "grad_norm": 0.3307873010635376,
      "learning_rate": 3.135908382160771e-06,
      "loss": 0.0061,
      "num_input_tokens_seen": 366632,
      "step": 380
    },
    {
      "epoch": 2.5496688741721854,
      "grad_norm": 0.31569671630859375,
      "learning_rate": 2.728773811691923e-06,
      "loss": 0.0077,
      "num_input_tokens_seen": 371352,
      "step": 385
    },
    {
      "epoch": 2.5827814569536423,
      "grad_norm": 0.25220000743865967,
      "learning_rate": 2.348415134141102e-06,
      "loss": 0.0053,
      "num_input_tokens_seen": 375976,
      "step": 390
    },
    {
      "epoch": 2.6158940397350996,
      "grad_norm": 0.12679828703403473,
      "learning_rate": 1.995289641043768e-06,
      "loss": 0.0021,
      "num_input_tokens_seen": 381016,
      "step": 395
    },
    {
      "epoch": 2.6490066225165565,
      "grad_norm": 0.2858022153377533,
      "learning_rate": 1.6698218824588164e-06,
      "loss": 0.0063,
      "num_input_tokens_seen": 385688,
      "step": 400
    },
    {
      "epoch": 2.6821192052980134,
      "grad_norm": 0.3448869287967682,
      "learning_rate": 1.3724031565473112e-06,
      "loss": 0.0052,
      "num_input_tokens_seen": 390584,
      "step": 405
    },
    {
      "epoch": 2.7152317880794703,
      "grad_norm": 0.26856377720832825,
      "learning_rate": 1.1033910391288065e-06,
      "loss": 0.0051,
      "num_input_tokens_seen": 395576,
      "step": 410
    },
    {
      "epoch": 2.748344370860927,
      "grad_norm": 0.3648998737335205,
      "learning_rate": 8.631089537808307e-07,
      "loss": 0.0084,
      "num_input_tokens_seen": 400328,
      "step": 415
    },
    {
      "epoch": 2.781456953642384,
      "grad_norm": 1.5257205963134766,
      "learning_rate": 6.51845782998356e-07,
      "loss": 0.0122,
      "num_input_tokens_seen": 405192,
      "step": 420
    },
    {
      "epoch": 2.814569536423841,
      "grad_norm": 0.4184490144252777,
      "learning_rate": 4.698555208807853e-07,
      "loss": 0.006,
      "num_input_tokens_seen": 409912,
      "step": 425
    },
    {
      "epoch": 2.847682119205298,
      "grad_norm": 0.39113691449165344,
      "learning_rate": 3.1735696776400703e-07,
      "loss": 0.0044,
      "num_input_tokens_seen": 414792,
      "step": 430
    },
    {
      "epoch": 2.880794701986755,
      "grad_norm": 0.49995261430740356,
      "learning_rate": 1.9453346716462317e-07,
      "loss": 0.0091,
      "num_input_tokens_seen": 419752,
      "step": 435
    },
    {
      "epoch": 2.9139072847682117,
      "grad_norm": 0.45024681091308594,
      "learning_rate": 1.0153268535264827e-07,
      "loss": 0.005,
      "num_input_tokens_seen": 424808,
      "step": 440
    },
    {
      "epoch": 2.9470198675496686,
      "grad_norm": 0.11354193091392517,
      "learning_rate": 3.846643381766879e-08,
      "loss": 0.0051,
      "num_input_tokens_seen": 429704,
      "step": 445
    },
    {
      "epoch": 2.980132450331126,
      "grad_norm": 0.07164571434259415,
      "learning_rate": 5.41053484192644e-09,
      "loss": 0.0049,
      "num_input_tokens_seen": 434536,
      "step": 450
    },
    {
      "epoch": 3.0,
      "num_input_tokens_seen": 437216,
      "step": 453,
      "total_flos": 1.8708380691726336e+16,
      "train_loss": 0.044493223137528556,
      "train_runtime": 988.3228,
      "train_samples_per_second": 7.331,
      "train_steps_per_second": 0.458
    }
  ],
  "logging_steps": 5,
  "max_steps": 453,
  "num_input_tokens_seen": 437216,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8708380691726336e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}