leaBroe's picture
Upload 17 files
548c872 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 50.0,
"eval_steps": 500,
"global_step": 367750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"grad_norm": 0.11480995267629623,
"learning_rate": 9.800000000000001e-06,
"loss": 0.2501,
"step": 7355
},
{
"epoch": 1.0,
"eval_loss": 0.42542216181755066,
"eval_runtime": 128.5308,
"eval_samples_per_second": 457.774,
"eval_steps_per_second": 7.158,
"step": 7355
},
{
"epoch": 2.0,
"grad_norm": 0.158855602145195,
"learning_rate": 9.600000000000001e-06,
"loss": 0.2332,
"step": 14710
},
{
"epoch": 2.0,
"eval_loss": 0.41126397252082825,
"eval_runtime": 128.8707,
"eval_samples_per_second": 456.566,
"eval_steps_per_second": 7.139,
"step": 14710
},
{
"epoch": 3.0,
"grad_norm": 0.20708617568016052,
"learning_rate": 9.4e-06,
"loss": 0.2295,
"step": 22065
},
{
"epoch": 3.0,
"eval_loss": 0.40615084767341614,
"eval_runtime": 128.7304,
"eval_samples_per_second": 457.064,
"eval_steps_per_second": 7.147,
"step": 22065
},
{
"epoch": 4.0,
"grad_norm": 0.2054029405117035,
"learning_rate": 9.200000000000002e-06,
"loss": 0.2273,
"step": 29420
},
{
"epoch": 4.0,
"eval_loss": 0.40087950229644775,
"eval_runtime": 128.7273,
"eval_samples_per_second": 457.075,
"eval_steps_per_second": 7.147,
"step": 29420
},
{
"epoch": 5.0,
"grad_norm": 0.19840490818023682,
"learning_rate": 9e-06,
"loss": 0.2256,
"step": 36775
},
{
"epoch": 5.0,
"eval_loss": 0.3977925777435303,
"eval_runtime": 128.707,
"eval_samples_per_second": 457.147,
"eval_steps_per_second": 7.148,
"step": 36775
},
{
"epoch": 6.0,
"grad_norm": 0.25789105892181396,
"learning_rate": 8.8e-06,
"loss": 0.2243,
"step": 44130
},
{
"epoch": 6.0,
"eval_loss": 0.3958837389945984,
"eval_runtime": 128.6907,
"eval_samples_per_second": 457.205,
"eval_steps_per_second": 7.149,
"step": 44130
},
{
"epoch": 7.0,
"grad_norm": 0.21235878765583038,
"learning_rate": 8.6e-06,
"loss": 0.2231,
"step": 51485
},
{
"epoch": 7.0,
"eval_loss": 0.39352869987487793,
"eval_runtime": 128.701,
"eval_samples_per_second": 457.168,
"eval_steps_per_second": 7.148,
"step": 51485
},
{
"epoch": 8.0,
"grad_norm": 0.1889820694923401,
"learning_rate": 8.400000000000001e-06,
"loss": 0.2221,
"step": 58840
},
{
"epoch": 8.0,
"eval_loss": 0.3912597596645355,
"eval_runtime": 128.7122,
"eval_samples_per_second": 457.128,
"eval_steps_per_second": 7.148,
"step": 58840
},
{
"epoch": 9.0,
"grad_norm": 0.22390136122703552,
"learning_rate": 8.2e-06,
"loss": 0.2212,
"step": 66195
},
{
"epoch": 9.0,
"eval_loss": 0.39093491435050964,
"eval_runtime": 128.7173,
"eval_samples_per_second": 457.11,
"eval_steps_per_second": 7.147,
"step": 66195
},
{
"epoch": 10.0,
"grad_norm": 0.1813807338476181,
"learning_rate": 8.000000000000001e-06,
"loss": 0.2205,
"step": 73550
},
{
"epoch": 10.0,
"eval_loss": 0.389521986246109,
"eval_runtime": 128.684,
"eval_samples_per_second": 457.229,
"eval_steps_per_second": 7.149,
"step": 73550
},
{
"epoch": 11.0,
"grad_norm": 0.17810355126857758,
"learning_rate": 7.800000000000002e-06,
"loss": 0.2197,
"step": 80905
},
{
"epoch": 11.0,
"eval_loss": 0.3886621296405792,
"eval_runtime": 128.7099,
"eval_samples_per_second": 457.137,
"eval_steps_per_second": 7.148,
"step": 80905
},
{
"epoch": 12.0,
"grad_norm": 0.24489013850688934,
"learning_rate": 7.600000000000001e-06,
"loss": 0.219,
"step": 88260
},
{
"epoch": 12.0,
"eval_loss": 0.3879886269569397,
"eval_runtime": 128.7058,
"eval_samples_per_second": 457.151,
"eval_steps_per_second": 7.148,
"step": 88260
},
{
"epoch": 13.0,
"grad_norm": 0.1965673714876175,
"learning_rate": 7.4e-06,
"loss": 0.2184,
"step": 95615
},
{
"epoch": 13.0,
"eval_loss": 0.38754525780677795,
"eval_runtime": 128.7201,
"eval_samples_per_second": 457.1,
"eval_steps_per_second": 7.147,
"step": 95615
},
{
"epoch": 14.0,
"grad_norm": 0.22494736313819885,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.2178,
"step": 102970
},
{
"epoch": 14.0,
"eval_loss": 0.3873791992664337,
"eval_runtime": 128.7144,
"eval_samples_per_second": 457.121,
"eval_steps_per_second": 7.148,
"step": 102970
},
{
"epoch": 15.0,
"grad_norm": 0.32273635268211365,
"learning_rate": 7e-06,
"loss": 0.2172,
"step": 110325
},
{
"epoch": 15.0,
"eval_loss": 0.38621675968170166,
"eval_runtime": 128.7027,
"eval_samples_per_second": 457.162,
"eval_steps_per_second": 7.148,
"step": 110325
},
{
"epoch": 16.0,
"grad_norm": 0.17209158837795258,
"learning_rate": 6.800000000000001e-06,
"loss": 0.2167,
"step": 117680
},
{
"epoch": 16.0,
"eval_loss": 0.3857288658618927,
"eval_runtime": 128.6791,
"eval_samples_per_second": 457.246,
"eval_steps_per_second": 7.15,
"step": 117680
},
{
"epoch": 17.0,
"grad_norm": 0.27914878726005554,
"learning_rate": 6.600000000000001e-06,
"loss": 0.2162,
"step": 125035
},
{
"epoch": 17.0,
"eval_loss": 0.3846561014652252,
"eval_runtime": 128.6869,
"eval_samples_per_second": 457.218,
"eval_steps_per_second": 7.149,
"step": 125035
},
{
"epoch": 18.0,
"grad_norm": 0.23364859819412231,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.2157,
"step": 132390
},
{
"epoch": 18.0,
"eval_loss": 0.3847697675228119,
"eval_runtime": 128.7148,
"eval_samples_per_second": 457.119,
"eval_steps_per_second": 7.148,
"step": 132390
},
{
"epoch": 19.0,
"grad_norm": 0.172671377658844,
"learning_rate": 6.200000000000001e-06,
"loss": 0.2152,
"step": 139745
},
{
"epoch": 19.0,
"eval_loss": 0.38386115431785583,
"eval_runtime": 128.6991,
"eval_samples_per_second": 457.175,
"eval_steps_per_second": 7.148,
"step": 139745
},
{
"epoch": 20.0,
"grad_norm": 0.19780349731445312,
"learning_rate": 6e-06,
"loss": 0.2148,
"step": 147100
},
{
"epoch": 20.0,
"eval_loss": 0.3836727738380432,
"eval_runtime": 128.7294,
"eval_samples_per_second": 457.067,
"eval_steps_per_second": 7.147,
"step": 147100
},
{
"epoch": 21.0,
"grad_norm": 0.26560327410697937,
"learning_rate": 5.8e-06,
"loss": 0.2144,
"step": 154455
},
{
"epoch": 21.0,
"eval_loss": 0.3844703435897827,
"eval_runtime": 128.7099,
"eval_samples_per_second": 457.137,
"eval_steps_per_second": 7.148,
"step": 154455
},
{
"epoch": 22.0,
"grad_norm": 0.22332455217838287,
"learning_rate": 5.600000000000001e-06,
"loss": 0.2139,
"step": 161810
},
{
"epoch": 22.0,
"eval_loss": 0.3834006190299988,
"eval_runtime": 128.7123,
"eval_samples_per_second": 457.128,
"eval_steps_per_second": 7.148,
"step": 161810
},
{
"epoch": 23.0,
"grad_norm": 0.2586681842803955,
"learning_rate": 5.400000000000001e-06,
"loss": 0.2136,
"step": 169165
},
{
"epoch": 23.0,
"eval_loss": 0.38348647952079773,
"eval_runtime": 128.6691,
"eval_samples_per_second": 457.281,
"eval_steps_per_second": 7.15,
"step": 169165
},
{
"epoch": 24.0,
"grad_norm": 0.2845219075679779,
"learning_rate": 5.2e-06,
"loss": 0.2132,
"step": 176520
},
{
"epoch": 24.0,
"eval_loss": 0.3828953504562378,
"eval_runtime": 128.7332,
"eval_samples_per_second": 457.054,
"eval_steps_per_second": 7.147,
"step": 176520
},
{
"epoch": 25.0,
"grad_norm": 0.27165067195892334,
"learning_rate": 5e-06,
"loss": 0.2128,
"step": 183875
},
{
"epoch": 25.0,
"eval_loss": 0.38219162821769714,
"eval_runtime": 128.7152,
"eval_samples_per_second": 457.118,
"eval_steps_per_second": 7.148,
"step": 183875
},
{
"epoch": 26.0,
"grad_norm": 0.23254956305027008,
"learning_rate": 4.800000000000001e-06,
"loss": 0.2125,
"step": 191230
},
{
"epoch": 26.0,
"eval_loss": 0.38233497738838196,
"eval_runtime": 128.8883,
"eval_samples_per_second": 456.504,
"eval_steps_per_second": 7.138,
"step": 191230
},
{
"epoch": 27.0,
"grad_norm": 0.2750227749347687,
"learning_rate": 4.600000000000001e-06,
"loss": 0.2122,
"step": 198585
},
{
"epoch": 27.0,
"eval_loss": 0.3827952444553375,
"eval_runtime": 128.7247,
"eval_samples_per_second": 457.084,
"eval_steps_per_second": 7.147,
"step": 198585
},
{
"epoch": 28.0,
"grad_norm": 0.3043362498283386,
"learning_rate": 4.4e-06,
"loss": 0.2118,
"step": 205940
},
{
"epoch": 28.0,
"eval_loss": 0.38314878940582275,
"eval_runtime": 128.7576,
"eval_samples_per_second": 456.967,
"eval_steps_per_second": 7.145,
"step": 205940
},
{
"epoch": 29.0,
"grad_norm": 0.22233448922634125,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.2115,
"step": 213295
},
{
"epoch": 29.0,
"eval_loss": 0.3818701505661011,
"eval_runtime": 128.736,
"eval_samples_per_second": 457.044,
"eval_steps_per_second": 7.146,
"step": 213295
},
{
"epoch": 30.0,
"grad_norm": 0.26145127415657043,
"learning_rate": 4.000000000000001e-06,
"loss": 0.2112,
"step": 220650
},
{
"epoch": 30.0,
"eval_loss": 0.38293564319610596,
"eval_runtime": 128.9344,
"eval_samples_per_second": 456.341,
"eval_steps_per_second": 7.135,
"step": 220650
},
{
"epoch": 31.0,
"grad_norm": 0.2705918252468109,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.211,
"step": 228005
},
{
"epoch": 31.0,
"eval_loss": 0.3823812007904053,
"eval_runtime": 128.8218,
"eval_samples_per_second": 456.739,
"eval_steps_per_second": 7.142,
"step": 228005
},
{
"epoch": 32.0,
"grad_norm": 0.2663235366344452,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.2107,
"step": 235360
},
{
"epoch": 32.0,
"eval_loss": 0.382473886013031,
"eval_runtime": 128.7309,
"eval_samples_per_second": 457.062,
"eval_steps_per_second": 7.147,
"step": 235360
},
{
"epoch": 33.0,
"grad_norm": 0.23493929207324982,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.2104,
"step": 242715
},
{
"epoch": 33.0,
"eval_loss": 0.3834179639816284,
"eval_runtime": 128.7424,
"eval_samples_per_second": 457.021,
"eval_steps_per_second": 7.146,
"step": 242715
},
{
"epoch": 34.0,
"grad_norm": 0.2235766053199768,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.2102,
"step": 250070
},
{
"epoch": 34.0,
"eval_loss": 0.3825724124908447,
"eval_runtime": 128.7687,
"eval_samples_per_second": 456.928,
"eval_steps_per_second": 7.145,
"step": 250070
},
{
"epoch": 35.0,
"grad_norm": 0.2881753742694855,
"learning_rate": 3e-06,
"loss": 0.2099,
"step": 257425
},
{
"epoch": 35.0,
"eval_loss": 0.3824039697647095,
"eval_runtime": 133.1919,
"eval_samples_per_second": 441.754,
"eval_steps_per_second": 6.907,
"step": 257425
},
{
"epoch": 36.0,
"grad_norm": 0.35670992732048035,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.2097,
"step": 264780
},
{
"epoch": 36.0,
"eval_loss": 0.38277488946914673,
"eval_runtime": 128.7343,
"eval_samples_per_second": 457.05,
"eval_steps_per_second": 7.147,
"step": 264780
},
{
"epoch": 37.0,
"grad_norm": 0.29673638939857483,
"learning_rate": 2.6e-06,
"loss": 0.2095,
"step": 272135
},
{
"epoch": 37.0,
"eval_loss": 0.38287386298179626,
"eval_runtime": 129.4264,
"eval_samples_per_second": 454.606,
"eval_steps_per_second": 7.108,
"step": 272135
},
{
"epoch": 38.0,
"grad_norm": 0.25621339678764343,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.2093,
"step": 279490
},
{
"epoch": 38.0,
"eval_loss": 0.3827780485153198,
"eval_runtime": 129.087,
"eval_samples_per_second": 455.801,
"eval_steps_per_second": 7.127,
"step": 279490
},
{
"epoch": 39.0,
"grad_norm": 0.31819215416908264,
"learning_rate": 2.2e-06,
"loss": 0.2091,
"step": 286845
},
{
"epoch": 39.0,
"eval_loss": 0.3822120726108551,
"eval_runtime": 128.8612,
"eval_samples_per_second": 456.6,
"eval_steps_per_second": 7.139,
"step": 286845
},
{
"epoch": 40.0,
"grad_norm": 0.2761085033416748,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.2089,
"step": 294200
},
{
"epoch": 40.0,
"eval_loss": 0.3824302554130554,
"eval_runtime": 129.0551,
"eval_samples_per_second": 455.914,
"eval_steps_per_second": 7.129,
"step": 294200
},
{
"epoch": 41.0,
"grad_norm": 0.27816739678382874,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.2088,
"step": 301555
},
{
"epoch": 41.0,
"eval_loss": 0.38338372111320496,
"eval_runtime": 129.0432,
"eval_samples_per_second": 455.956,
"eval_steps_per_second": 7.129,
"step": 301555
},
{
"epoch": 42.0,
"grad_norm": 0.3370245695114136,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.2086,
"step": 308910
},
{
"epoch": 42.0,
"eval_loss": 0.3826825022697449,
"eval_runtime": 129.0854,
"eval_samples_per_second": 455.807,
"eval_steps_per_second": 7.127,
"step": 308910
},
{
"epoch": 43.0,
"grad_norm": 0.23392541706562042,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.2085,
"step": 316265
},
{
"epoch": 43.0,
"eval_loss": 0.382882684469223,
"eval_runtime": 128.8341,
"eval_samples_per_second": 456.696,
"eval_steps_per_second": 7.141,
"step": 316265
},
{
"epoch": 44.0,
"grad_norm": 0.2567419409751892,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.2083,
"step": 323620
},
{
"epoch": 44.0,
"eval_loss": 0.3828926682472229,
"eval_runtime": 129.1671,
"eval_samples_per_second": 455.518,
"eval_steps_per_second": 7.123,
"step": 323620
},
{
"epoch": 45.0,
"grad_norm": 0.22591634094715118,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.2082,
"step": 330975
},
{
"epoch": 45.0,
"eval_loss": 0.3830114006996155,
"eval_runtime": 128.7432,
"eval_samples_per_second": 457.018,
"eval_steps_per_second": 7.146,
"step": 330975
},
{
"epoch": 46.0,
"grad_norm": 0.310523122549057,
"learning_rate": 8.000000000000001e-07,
"loss": 0.2081,
"step": 338330
},
{
"epoch": 46.0,
"eval_loss": 0.38255205750465393,
"eval_runtime": 129.0345,
"eval_samples_per_second": 455.987,
"eval_steps_per_second": 7.13,
"step": 338330
},
{
"epoch": 47.0,
"grad_norm": 0.278604120016098,
"learning_rate": 6.000000000000001e-07,
"loss": 0.208,
"step": 345685
},
{
"epoch": 47.0,
"eval_loss": 0.3827236294746399,
"eval_runtime": 129.0063,
"eval_samples_per_second": 456.086,
"eval_steps_per_second": 7.131,
"step": 345685
},
{
"epoch": 48.0,
"grad_norm": 0.2605680227279663,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.2079,
"step": 353040
},
{
"epoch": 48.0,
"eval_loss": 0.38287004828453064,
"eval_runtime": 128.8174,
"eval_samples_per_second": 456.755,
"eval_steps_per_second": 7.142,
"step": 353040
},
{
"epoch": 49.0,
"grad_norm": 0.3245304822921753,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.2078,
"step": 360395
},
{
"epoch": 49.0,
"eval_loss": 0.38298800587654114,
"eval_runtime": 128.8343,
"eval_samples_per_second": 456.695,
"eval_steps_per_second": 7.141,
"step": 360395
},
{
"epoch": 50.0,
"grad_norm": 0.3787703812122345,
"learning_rate": 0.0,
"loss": 0.2078,
"step": 367750
},
{
"epoch": 50.0,
"eval_loss": 0.3828030824661255,
"eval_runtime": 128.8773,
"eval_samples_per_second": 456.543,
"eval_steps_per_second": 7.139,
"step": 367750
}
],
"logging_steps": 500,
"max_steps": 367750,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"total_flos": 2.9088945658368e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}