{ "best_metric": null, "best_model_checkpoint": null, "epoch": 50.0, "eval_steps": 500, "global_step": 367750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.11480995267629623, "learning_rate": 9.800000000000001e-06, "loss": 0.2501, "step": 7355 }, { "epoch": 1.0, "eval_loss": 0.42542216181755066, "eval_runtime": 128.5308, "eval_samples_per_second": 457.774, "eval_steps_per_second": 7.158, "step": 7355 }, { "epoch": 2.0, "grad_norm": 0.158855602145195, "learning_rate": 9.600000000000001e-06, "loss": 0.2332, "step": 14710 }, { "epoch": 2.0, "eval_loss": 0.41126397252082825, "eval_runtime": 128.8707, "eval_samples_per_second": 456.566, "eval_steps_per_second": 7.139, "step": 14710 }, { "epoch": 3.0, "grad_norm": 0.20708617568016052, "learning_rate": 9.4e-06, "loss": 0.2295, "step": 22065 }, { "epoch": 3.0, "eval_loss": 0.40615084767341614, "eval_runtime": 128.7304, "eval_samples_per_second": 457.064, "eval_steps_per_second": 7.147, "step": 22065 }, { "epoch": 4.0, "grad_norm": 0.2054029405117035, "learning_rate": 9.200000000000002e-06, "loss": 0.2273, "step": 29420 }, { "epoch": 4.0, "eval_loss": 0.40087950229644775, "eval_runtime": 128.7273, "eval_samples_per_second": 457.075, "eval_steps_per_second": 7.147, "step": 29420 }, { "epoch": 5.0, "grad_norm": 0.19840490818023682, "learning_rate": 9e-06, "loss": 0.2256, "step": 36775 }, { "epoch": 5.0, "eval_loss": 0.3977925777435303, "eval_runtime": 128.707, "eval_samples_per_second": 457.147, "eval_steps_per_second": 7.148, "step": 36775 }, { "epoch": 6.0, "grad_norm": 0.25789105892181396, "learning_rate": 8.8e-06, "loss": 0.2243, "step": 44130 }, { "epoch": 6.0, "eval_loss": 0.3958837389945984, "eval_runtime": 128.6907, "eval_samples_per_second": 457.205, "eval_steps_per_second": 7.149, "step": 44130 }, { "epoch": 7.0, "grad_norm": 0.21235878765583038, "learning_rate": 8.6e-06, "loss": 0.2231, "step": 51485 }, { "epoch": 7.0, "eval_loss": 0.39352869987487793, "eval_runtime": 128.701, "eval_samples_per_second": 457.168, "eval_steps_per_second": 7.148, "step": 51485 }, { "epoch": 8.0, "grad_norm": 0.1889820694923401, "learning_rate": 8.400000000000001e-06, "loss": 0.2221, "step": 58840 }, { "epoch": 8.0, "eval_loss": 0.3912597596645355, "eval_runtime": 128.7122, "eval_samples_per_second": 457.128, "eval_steps_per_second": 7.148, "step": 58840 }, { "epoch": 9.0, "grad_norm": 0.22390136122703552, "learning_rate": 8.2e-06, "loss": 0.2212, "step": 66195 }, { "epoch": 9.0, "eval_loss": 0.39093491435050964, "eval_runtime": 128.7173, "eval_samples_per_second": 457.11, "eval_steps_per_second": 7.147, "step": 66195 }, { "epoch": 10.0, "grad_norm": 0.1813807338476181, "learning_rate": 8.000000000000001e-06, "loss": 0.2205, "step": 73550 }, { "epoch": 10.0, "eval_loss": 0.389521986246109, "eval_runtime": 128.684, "eval_samples_per_second": 457.229, "eval_steps_per_second": 7.149, "step": 73550 }, { "epoch": 11.0, "grad_norm": 0.17810355126857758, "learning_rate": 7.800000000000002e-06, "loss": 0.2197, "step": 80905 }, { "epoch": 11.0, "eval_loss": 0.3886621296405792, "eval_runtime": 128.7099, "eval_samples_per_second": 457.137, "eval_steps_per_second": 7.148, "step": 80905 }, { "epoch": 12.0, "grad_norm": 0.24489013850688934, "learning_rate": 7.600000000000001e-06, "loss": 0.219, "step": 88260 }, { "epoch": 12.0, "eval_loss": 0.3879886269569397, "eval_runtime": 128.7058, "eval_samples_per_second": 457.151, "eval_steps_per_second": 7.148, "step": 88260 }, { "epoch": 13.0, "grad_norm": 0.1965673714876175, "learning_rate": 7.4e-06, "loss": 0.2184, "step": 95615 }, { "epoch": 13.0, "eval_loss": 0.38754525780677795, "eval_runtime": 128.7201, "eval_samples_per_second": 457.1, "eval_steps_per_second": 7.147, "step": 95615 }, { "epoch": 14.0, "grad_norm": 0.22494736313819885, "learning_rate": 7.2000000000000005e-06, "loss": 0.2178, "step": 102970 }, { "epoch": 14.0, "eval_loss": 0.3873791992664337, "eval_runtime": 128.7144, "eval_samples_per_second": 457.121, "eval_steps_per_second": 7.148, "step": 102970 }, { "epoch": 15.0, "grad_norm": 0.32273635268211365, "learning_rate": 7e-06, "loss": 0.2172, "step": 110325 }, { "epoch": 15.0, "eval_loss": 0.38621675968170166, "eval_runtime": 128.7027, "eval_samples_per_second": 457.162, "eval_steps_per_second": 7.148, "step": 110325 }, { "epoch": 16.0, "grad_norm": 0.17209158837795258, "learning_rate": 6.800000000000001e-06, "loss": 0.2167, "step": 117680 }, { "epoch": 16.0, "eval_loss": 0.3857288658618927, "eval_runtime": 128.6791, "eval_samples_per_second": 457.246, "eval_steps_per_second": 7.15, "step": 117680 }, { "epoch": 17.0, "grad_norm": 0.27914878726005554, "learning_rate": 6.600000000000001e-06, "loss": 0.2162, "step": 125035 }, { "epoch": 17.0, "eval_loss": 0.3846561014652252, "eval_runtime": 128.6869, "eval_samples_per_second": 457.218, "eval_steps_per_second": 7.149, "step": 125035 }, { "epoch": 18.0, "grad_norm": 0.23364859819412231, "learning_rate": 6.4000000000000006e-06, "loss": 0.2157, "step": 132390 }, { "epoch": 18.0, "eval_loss": 0.3847697675228119, "eval_runtime": 128.7148, "eval_samples_per_second": 457.119, "eval_steps_per_second": 7.148, "step": 132390 }, { "epoch": 19.0, "grad_norm": 0.172671377658844, "learning_rate": 6.200000000000001e-06, "loss": 0.2152, "step": 139745 }, { "epoch": 19.0, "eval_loss": 0.38386115431785583, "eval_runtime": 128.6991, "eval_samples_per_second": 457.175, "eval_steps_per_second": 7.148, "step": 139745 }, { "epoch": 20.0, "grad_norm": 0.19780349731445312, "learning_rate": 6e-06, "loss": 0.2148, "step": 147100 }, { "epoch": 20.0, "eval_loss": 0.3836727738380432, "eval_runtime": 128.7294, "eval_samples_per_second": 457.067, "eval_steps_per_second": 7.147, "step": 147100 }, { "epoch": 21.0, "grad_norm": 0.26560327410697937, "learning_rate": 5.8e-06, "loss": 0.2144, "step": 154455 }, { "epoch": 21.0, "eval_loss": 0.3844703435897827, "eval_runtime": 128.7099, "eval_samples_per_second": 457.137, "eval_steps_per_second": 7.148, "step": 154455 }, { "epoch": 22.0, "grad_norm": 0.22332455217838287, "learning_rate": 5.600000000000001e-06, "loss": 0.2139, "step": 161810 }, { "epoch": 22.0, "eval_loss": 0.3834006190299988, "eval_runtime": 128.7123, "eval_samples_per_second": 457.128, "eval_steps_per_second": 7.148, "step": 161810 }, { "epoch": 23.0, "grad_norm": 0.2586681842803955, "learning_rate": 5.400000000000001e-06, "loss": 0.2136, "step": 169165 }, { "epoch": 23.0, "eval_loss": 0.38348647952079773, "eval_runtime": 128.6691, "eval_samples_per_second": 457.281, "eval_steps_per_second": 7.15, "step": 169165 }, { "epoch": 24.0, "grad_norm": 0.2845219075679779, "learning_rate": 5.2e-06, "loss": 0.2132, "step": 176520 }, { "epoch": 24.0, "eval_loss": 0.3828953504562378, "eval_runtime": 128.7332, "eval_samples_per_second": 457.054, "eval_steps_per_second": 7.147, "step": 176520 }, { "epoch": 25.0, "grad_norm": 0.27165067195892334, "learning_rate": 5e-06, "loss": 0.2128, "step": 183875 }, { "epoch": 25.0, "eval_loss": 0.38219162821769714, "eval_runtime": 128.7152, "eval_samples_per_second": 457.118, "eval_steps_per_second": 7.148, "step": 183875 }, { "epoch": 26.0, "grad_norm": 0.23254956305027008, "learning_rate": 4.800000000000001e-06, "loss": 0.2125, "step": 191230 }, { "epoch": 26.0, "eval_loss": 0.38233497738838196, "eval_runtime": 128.8883, "eval_samples_per_second": 456.504, "eval_steps_per_second": 7.138, "step": 191230 }, { "epoch": 27.0, "grad_norm": 0.2750227749347687, "learning_rate": 4.600000000000001e-06, "loss": 0.2122, "step": 198585 }, { "epoch": 27.0, "eval_loss": 0.3827952444553375, "eval_runtime": 128.7247, "eval_samples_per_second": 457.084, "eval_steps_per_second": 7.147, "step": 198585 }, { "epoch": 28.0, "grad_norm": 0.3043362498283386, "learning_rate": 4.4e-06, "loss": 0.2118, "step": 205940 }, { "epoch": 28.0, "eval_loss": 0.38314878940582275, "eval_runtime": 128.7576, "eval_samples_per_second": 456.967, "eval_steps_per_second": 7.145, "step": 205940 }, { "epoch": 29.0, "grad_norm": 0.22233448922634125, "learning_rate": 4.2000000000000004e-06, "loss": 0.2115, "step": 213295 }, { "epoch": 29.0, "eval_loss": 0.3818701505661011, "eval_runtime": 128.736, "eval_samples_per_second": 457.044, "eval_steps_per_second": 7.146, "step": 213295 }, { "epoch": 30.0, "grad_norm": 0.26145127415657043, "learning_rate": 4.000000000000001e-06, "loss": 0.2112, "step": 220650 }, { "epoch": 30.0, "eval_loss": 0.38293564319610596, "eval_runtime": 128.9344, "eval_samples_per_second": 456.341, "eval_steps_per_second": 7.135, "step": 220650 }, { "epoch": 31.0, "grad_norm": 0.2705918252468109, "learning_rate": 3.8000000000000005e-06, "loss": 0.211, "step": 228005 }, { "epoch": 31.0, "eval_loss": 0.3823812007904053, "eval_runtime": 128.8218, "eval_samples_per_second": 456.739, "eval_steps_per_second": 7.142, "step": 228005 }, { "epoch": 32.0, "grad_norm": 0.2663235366344452, "learning_rate": 3.6000000000000003e-06, "loss": 0.2107, "step": 235360 }, { "epoch": 32.0, "eval_loss": 0.382473886013031, "eval_runtime": 128.7309, "eval_samples_per_second": 457.062, "eval_steps_per_second": 7.147, "step": 235360 }, { "epoch": 33.0, "grad_norm": 0.23493929207324982, "learning_rate": 3.4000000000000005e-06, "loss": 0.2104, "step": 242715 }, { "epoch": 33.0, "eval_loss": 0.3834179639816284, "eval_runtime": 128.7424, "eval_samples_per_second": 457.021, "eval_steps_per_second": 7.146, "step": 242715 }, { "epoch": 34.0, "grad_norm": 0.2235766053199768, "learning_rate": 3.2000000000000003e-06, "loss": 0.2102, "step": 250070 }, { "epoch": 34.0, "eval_loss": 0.3825724124908447, "eval_runtime": 128.7687, "eval_samples_per_second": 456.928, "eval_steps_per_second": 7.145, "step": 250070 }, { "epoch": 35.0, "grad_norm": 0.2881753742694855, "learning_rate": 3e-06, "loss": 0.2099, "step": 257425 }, { "epoch": 35.0, "eval_loss": 0.3824039697647095, "eval_runtime": 133.1919, "eval_samples_per_second": 441.754, "eval_steps_per_second": 6.907, "step": 257425 }, { "epoch": 36.0, "grad_norm": 0.35670992732048035, "learning_rate": 2.8000000000000003e-06, "loss": 0.2097, "step": 264780 }, { "epoch": 36.0, "eval_loss": 0.38277488946914673, "eval_runtime": 128.7343, "eval_samples_per_second": 457.05, "eval_steps_per_second": 7.147, "step": 264780 }, { "epoch": 37.0, "grad_norm": 0.29673638939857483, "learning_rate": 2.6e-06, "loss": 0.2095, "step": 272135 }, { "epoch": 37.0, "eval_loss": 0.38287386298179626, "eval_runtime": 129.4264, "eval_samples_per_second": 454.606, "eval_steps_per_second": 7.108, "step": 272135 }, { "epoch": 38.0, "grad_norm": 0.25621339678764343, "learning_rate": 2.4000000000000003e-06, "loss": 0.2093, "step": 279490 }, { "epoch": 38.0, "eval_loss": 0.3827780485153198, "eval_runtime": 129.087, "eval_samples_per_second": 455.801, "eval_steps_per_second": 7.127, "step": 279490 }, { "epoch": 39.0, "grad_norm": 0.31819215416908264, "learning_rate": 2.2e-06, "loss": 0.2091, "step": 286845 }, { "epoch": 39.0, "eval_loss": 0.3822120726108551, "eval_runtime": 128.8612, "eval_samples_per_second": 456.6, "eval_steps_per_second": 7.139, "step": 286845 }, { "epoch": 40.0, "grad_norm": 0.2761085033416748, "learning_rate": 2.0000000000000003e-06, "loss": 0.2089, "step": 294200 }, { "epoch": 40.0, "eval_loss": 0.3824302554130554, "eval_runtime": 129.0551, "eval_samples_per_second": 455.914, "eval_steps_per_second": 7.129, "step": 294200 }, { "epoch": 41.0, "grad_norm": 0.27816739678382874, "learning_rate": 1.8000000000000001e-06, "loss": 0.2088, "step": 301555 }, { "epoch": 41.0, "eval_loss": 0.38338372111320496, "eval_runtime": 129.0432, "eval_samples_per_second": 455.956, "eval_steps_per_second": 7.129, "step": 301555 }, { "epoch": 42.0, "grad_norm": 0.3370245695114136, "learning_rate": 1.6000000000000001e-06, "loss": 0.2086, "step": 308910 }, { "epoch": 42.0, "eval_loss": 0.3826825022697449, "eval_runtime": 129.0854, "eval_samples_per_second": 455.807, "eval_steps_per_second": 7.127, "step": 308910 }, { "epoch": 43.0, "grad_norm": 0.23392541706562042, "learning_rate": 1.4000000000000001e-06, "loss": 0.2085, "step": 316265 }, { "epoch": 43.0, "eval_loss": 0.382882684469223, "eval_runtime": 128.8341, "eval_samples_per_second": 456.696, "eval_steps_per_second": 7.141, "step": 316265 }, { "epoch": 44.0, "grad_norm": 0.2567419409751892, "learning_rate": 1.2000000000000002e-06, "loss": 0.2083, "step": 323620 }, { "epoch": 44.0, "eval_loss": 0.3828926682472229, "eval_runtime": 129.1671, "eval_samples_per_second": 455.518, "eval_steps_per_second": 7.123, "step": 323620 }, { "epoch": 45.0, "grad_norm": 0.22591634094715118, "learning_rate": 1.0000000000000002e-06, "loss": 0.2082, "step": 330975 }, { "epoch": 45.0, "eval_loss": 0.3830114006996155, "eval_runtime": 128.7432, "eval_samples_per_second": 457.018, "eval_steps_per_second": 7.146, "step": 330975 }, { "epoch": 46.0, "grad_norm": 0.310523122549057, "learning_rate": 8.000000000000001e-07, "loss": 0.2081, "step": 338330 }, { "epoch": 46.0, "eval_loss": 0.38255205750465393, "eval_runtime": 129.0345, "eval_samples_per_second": 455.987, "eval_steps_per_second": 7.13, "step": 338330 }, { "epoch": 47.0, "grad_norm": 0.278604120016098, "learning_rate": 6.000000000000001e-07, "loss": 0.208, "step": 345685 }, { "epoch": 47.0, "eval_loss": 0.3827236294746399, "eval_runtime": 129.0063, "eval_samples_per_second": 456.086, "eval_steps_per_second": 7.131, "step": 345685 }, { "epoch": 48.0, "grad_norm": 0.2605680227279663, "learning_rate": 4.0000000000000003e-07, "loss": 0.2079, "step": 353040 }, { "epoch": 48.0, "eval_loss": 0.38287004828453064, "eval_runtime": 128.8174, "eval_samples_per_second": 456.755, "eval_steps_per_second": 7.142, "step": 353040 }, { "epoch": 49.0, "grad_norm": 0.3245304822921753, "learning_rate": 2.0000000000000002e-07, "loss": 0.2078, "step": 360395 }, { "epoch": 49.0, "eval_loss": 0.38298800587654114, "eval_runtime": 128.8343, "eval_samples_per_second": 456.695, "eval_steps_per_second": 7.141, "step": 360395 }, { "epoch": 50.0, "grad_norm": 0.3787703812122345, "learning_rate": 0.0, "loss": 0.2078, "step": 367750 }, { "epoch": 50.0, "eval_loss": 0.3828030824661255, "eval_runtime": 128.8773, "eval_samples_per_second": 456.543, "eval_steps_per_second": 7.139, "step": 367750 } ], "logging_steps": 500, "max_steps": 367750, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 2.9088945658368e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }