{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.067484662576687, "eval_steps": 10000000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03067484662576687, "grad_norm": 40.34086591617319, "learning_rate": 6.134969325153374e-09, "loss": 2.8872, "step": 10 }, { "epoch": 0.06134969325153374, "grad_norm": 40.21853524962094, "learning_rate": 1.2269938650306748e-08, "loss": 2.8167, "step": 20 }, { "epoch": 0.09202453987730061, "grad_norm": 42.91710233147346, "learning_rate": 1.8404907975460124e-08, "loss": 2.9017, "step": 30 }, { "epoch": 0.12269938650306748, "grad_norm": 42.19478017724499, "learning_rate": 2.4539877300613496e-08, "loss": 2.9177, "step": 40 }, { "epoch": 0.15337423312883436, "grad_norm": 39.89856563443135, "learning_rate": 3.067484662576687e-08, "loss": 2.8621, "step": 50 }, { "epoch": 0.18404907975460122, "grad_norm": 39.031000731023376, "learning_rate": 3.680981595092025e-08, "loss": 2.8787, "step": 60 }, { "epoch": 0.2147239263803681, "grad_norm": 40.17556155872599, "learning_rate": 4.294478527607362e-08, "loss": 2.8723, "step": 70 }, { "epoch": 0.24539877300613497, "grad_norm": 40.44537988014516, "learning_rate": 4.907975460122699e-08, "loss": 2.8328, "step": 80 }, { "epoch": 0.27607361963190186, "grad_norm": 36.38071257012731, "learning_rate": 5.521472392638036e-08, "loss": 2.8269, "step": 90 }, { "epoch": 0.3067484662576687, "grad_norm": 35.01954486493492, "learning_rate": 6.134969325153374e-08, "loss": 2.8021, "step": 100 }, { "epoch": 0.3374233128834356, "grad_norm": 35.816703465963236, "learning_rate": 6.748466257668711e-08, "loss": 2.7854, "step": 110 }, { "epoch": 0.36809815950920244, "grad_norm": 25.57200780260961, "learning_rate": 7.36196319018405e-08, "loss": 2.6267, "step": 120 }, { "epoch": 0.3987730061349693, "grad_norm": 22.490209736367557, "learning_rate": 7.975460122699386e-08, "loss": 2.5126, "step": 130 }, { "epoch": 0.4294478527607362, "grad_norm": 21.154756083713355, "learning_rate": 8.588957055214724e-08, "loss": 2.5148, "step": 140 }, { "epoch": 0.4601226993865031, "grad_norm": 18.721430927730754, "learning_rate": 9.202453987730061e-08, "loss": 2.4065, "step": 150 }, { "epoch": 0.49079754601226994, "grad_norm": 11.049403510574644, "learning_rate": 9.815950920245398e-08, "loss": 2.3448, "step": 160 }, { "epoch": 0.5214723926380368, "grad_norm": 8.021100065182837, "learning_rate": 1.0429447852760735e-07, "loss": 2.1846, "step": 170 }, { "epoch": 0.5521472392638037, "grad_norm": 6.759878770426535, "learning_rate": 1.1042944785276073e-07, "loss": 2.1407, "step": 180 }, { "epoch": 0.5828220858895705, "grad_norm": 5.6950515516559275, "learning_rate": 1.165644171779141e-07, "loss": 2.0731, "step": 190 }, { "epoch": 0.6134969325153374, "grad_norm": 4.589530123869859, "learning_rate": 1.2269938650306748e-07, "loss": 2.096, "step": 200 }, { "epoch": 0.6441717791411042, "grad_norm": 4.0999552773713255, "learning_rate": 1.2883435582822087e-07, "loss": 2.0651, "step": 210 }, { "epoch": 0.6748466257668712, "grad_norm": 3.8635184963656877, "learning_rate": 1.3496932515337422e-07, "loss": 2.0136, "step": 220 }, { "epoch": 0.7055214723926381, "grad_norm": 3.801649063409229, "learning_rate": 1.4110429447852758e-07, "loss": 2.0283, "step": 230 }, { "epoch": 0.7361963190184049, "grad_norm": 3.499553222570716, "learning_rate": 1.47239263803681e-07, "loss": 2.0207, "step": 240 }, { "epoch": 0.7668711656441718, "grad_norm": 3.5859940211720125, "learning_rate": 1.5337423312883435e-07, "loss": 1.9723, "step": 250 }, { "epoch": 0.7975460122699386, "grad_norm": 3.5360060488259433, "learning_rate": 1.595092024539877e-07, "loss": 1.9378, "step": 260 }, { "epoch": 0.8282208588957055, "grad_norm": 3.4861162838130446, "learning_rate": 1.656441717791411e-07, "loss": 2.0179, "step": 270 }, { "epoch": 0.8588957055214724, "grad_norm": 3.5048041671476895, "learning_rate": 1.7177914110429448e-07, "loss": 1.9889, "step": 280 }, { "epoch": 0.8895705521472392, "grad_norm": 3.5320508940958515, "learning_rate": 1.7791411042944784e-07, "loss": 1.9974, "step": 290 }, { "epoch": 0.9202453987730062, "grad_norm": 3.479602078136922, "learning_rate": 1.8404907975460122e-07, "loss": 1.94, "step": 300 }, { "epoch": 0.950920245398773, "grad_norm": 3.463981738252198, "learning_rate": 1.901840490797546e-07, "loss": 1.956, "step": 310 }, { "epoch": 0.9815950920245399, "grad_norm": 3.2538325898676637, "learning_rate": 1.9631901840490797e-07, "loss": 1.9886, "step": 320 }, { "epoch": 1.0122699386503067, "grad_norm": 3.5596683489380294, "learning_rate": 2.0245398773006135e-07, "loss": 1.9918, "step": 330 }, { "epoch": 1.0429447852760736, "grad_norm": 3.238486372143609, "learning_rate": 2.085889570552147e-07, "loss": 1.9839, "step": 340 }, { "epoch": 1.0736196319018405, "grad_norm": 3.246470029117208, "learning_rate": 2.147239263803681e-07, "loss": 1.9707, "step": 350 }, { "epoch": 1.1042944785276074, "grad_norm": 3.3548676253751895, "learning_rate": 2.2085889570552145e-07, "loss": 1.9399, "step": 360 }, { "epoch": 1.1349693251533743, "grad_norm": 2.9837136528798998, "learning_rate": 2.2699386503067484e-07, "loss": 1.9081, "step": 370 }, { "epoch": 1.165644171779141, "grad_norm": 3.123338783793841, "learning_rate": 2.331288343558282e-07, "loss": 1.9495, "step": 380 }, { "epoch": 1.196319018404908, "grad_norm": 3.12823522424731, "learning_rate": 2.392638036809816e-07, "loss": 1.9149, "step": 390 }, { "epoch": 1.2269938650306749, "grad_norm": 3.127421120344186, "learning_rate": 2.4539877300613496e-07, "loss": 1.9231, "step": 400 }, { "epoch": 1.2576687116564418, "grad_norm": 3.0563142379215784, "learning_rate": 2.5153374233128835e-07, "loss": 1.9259, "step": 410 }, { "epoch": 1.2883435582822087, "grad_norm": 3.002860823723609, "learning_rate": 2.5766871165644173e-07, "loss": 1.8779, "step": 420 }, { "epoch": 1.3190184049079754, "grad_norm": 3.040065378882382, "learning_rate": 2.6380368098159506e-07, "loss": 1.919, "step": 430 }, { "epoch": 1.3496932515337423, "grad_norm": 3.2257346917267484, "learning_rate": 2.6993865030674845e-07, "loss": 1.9377, "step": 440 }, { "epoch": 1.3803680981595092, "grad_norm": 2.9954147708634724, "learning_rate": 2.7607361963190183e-07, "loss": 1.9159, "step": 450 }, { "epoch": 1.4110429447852761, "grad_norm": 3.0206681193118583, "learning_rate": 2.8220858895705517e-07, "loss": 1.9015, "step": 460 }, { "epoch": 1.441717791411043, "grad_norm": 3.1151975930939413, "learning_rate": 2.8834355828220855e-07, "loss": 1.9162, "step": 470 }, { "epoch": 1.4723926380368098, "grad_norm": 3.110216679336694, "learning_rate": 2.94478527607362e-07, "loss": 1.92, "step": 480 }, { "epoch": 1.5030674846625767, "grad_norm": 3.046863348968689, "learning_rate": 3.006134969325153e-07, "loss": 1.9171, "step": 490 }, { "epoch": 1.5337423312883436, "grad_norm": 2.945865360342739, "learning_rate": 3.067484662576687e-07, "loss": 1.9095, "step": 500 }, { "epoch": 1.5644171779141103, "grad_norm": 3.176637928075508, "learning_rate": 3.128834355828221e-07, "loss": 1.9282, "step": 510 }, { "epoch": 1.5950920245398774, "grad_norm": 3.0369820219737056, "learning_rate": 3.190184049079754e-07, "loss": 1.936, "step": 520 }, { "epoch": 1.6257668711656441, "grad_norm": 3.004806377144078, "learning_rate": 3.251533742331288e-07, "loss": 1.9434, "step": 530 }, { "epoch": 1.656441717791411, "grad_norm": 3.2902184434846133, "learning_rate": 3.312883435582822e-07, "loss": 1.8933, "step": 540 }, { "epoch": 1.687116564417178, "grad_norm": 3.1870514820826905, "learning_rate": 3.374233128834356e-07, "loss": 1.9213, "step": 550 }, { "epoch": 1.7177914110429446, "grad_norm": 3.131942976786612, "learning_rate": 3.4355828220858896e-07, "loss": 1.9434, "step": 560 }, { "epoch": 1.7484662576687118, "grad_norm": 3.0513682450360378, "learning_rate": 3.496932515337423e-07, "loss": 1.8838, "step": 570 }, { "epoch": 1.7791411042944785, "grad_norm": 3.177020551414707, "learning_rate": 3.558282208588957e-07, "loss": 1.9052, "step": 580 }, { "epoch": 1.8098159509202454, "grad_norm": 2.9620179420902506, "learning_rate": 3.6196319018404906e-07, "loss": 1.8909, "step": 590 }, { "epoch": 1.8404907975460123, "grad_norm": 2.990366230166413, "learning_rate": 3.6809815950920245e-07, "loss": 1.8493, "step": 600 }, { "epoch": 1.871165644171779, "grad_norm": 2.975455921573483, "learning_rate": 3.7423312883435583e-07, "loss": 1.876, "step": 610 }, { "epoch": 1.9018404907975461, "grad_norm": 3.1776175076162905, "learning_rate": 3.803680981595092e-07, "loss": 1.8962, "step": 620 }, { "epoch": 1.9325153374233128, "grad_norm": 3.0920446560067725, "learning_rate": 3.8650306748466255e-07, "loss": 1.9504, "step": 630 }, { "epoch": 1.9631901840490797, "grad_norm": 3.0815885404494883, "learning_rate": 3.9263803680981593e-07, "loss": 1.912, "step": 640 }, { "epoch": 1.9938650306748467, "grad_norm": 3.111764851371804, "learning_rate": 3.9877300613496926e-07, "loss": 1.9481, "step": 650 }, { "epoch": 2.0245398773006134, "grad_norm": 2.7879149976133806, "learning_rate": 4.049079754601227e-07, "loss": 1.8721, "step": 660 }, { "epoch": 2.0552147239263805, "grad_norm": 3.001396783754136, "learning_rate": 4.110429447852761e-07, "loss": 1.8426, "step": 670 }, { "epoch": 2.085889570552147, "grad_norm": 2.818029202172762, "learning_rate": 4.171779141104294e-07, "loss": 1.8664, "step": 680 }, { "epoch": 2.116564417177914, "grad_norm": 3.0655338296444645, "learning_rate": 4.233128834355828e-07, "loss": 1.8812, "step": 690 }, { "epoch": 2.147239263803681, "grad_norm": 3.0134914811420237, "learning_rate": 4.294478527607362e-07, "loss": 1.8828, "step": 700 }, { "epoch": 2.1779141104294477, "grad_norm": 3.0293103580307252, "learning_rate": 4.355828220858895e-07, "loss": 1.8596, "step": 710 }, { "epoch": 2.208588957055215, "grad_norm": 2.7905142590645284, "learning_rate": 4.417177914110429e-07, "loss": 1.8733, "step": 720 }, { "epoch": 2.2392638036809815, "grad_norm": 2.925363620028882, "learning_rate": 4.4785276073619634e-07, "loss": 1.8983, "step": 730 }, { "epoch": 2.2699386503067487, "grad_norm": 2.8448443834523975, "learning_rate": 4.5398773006134967e-07, "loss": 1.8764, "step": 740 }, { "epoch": 2.3006134969325154, "grad_norm": 2.9628937142708875, "learning_rate": 4.6012269938650306e-07, "loss": 1.8754, "step": 750 }, { "epoch": 2.331288343558282, "grad_norm": 3.4170857688413427, "learning_rate": 4.662576687116564e-07, "loss": 1.8782, "step": 760 }, { "epoch": 2.361963190184049, "grad_norm": 3.0679580454695388, "learning_rate": 4.7239263803680977e-07, "loss": 1.904, "step": 770 }, { "epoch": 2.392638036809816, "grad_norm": 2.9137539934318784, "learning_rate": 4.785276073619632e-07, "loss": 1.8132, "step": 780 }, { "epoch": 2.4233128834355826, "grad_norm": 3.061624511427873, "learning_rate": 4.846625766871165e-07, "loss": 1.8769, "step": 790 }, { "epoch": 2.4539877300613497, "grad_norm": 3.139832542082945, "learning_rate": 4.907975460122699e-07, "loss": 1.9011, "step": 800 }, { "epoch": 2.4846625766871164, "grad_norm": 3.040517067124859, "learning_rate": 4.969325153374233e-07, "loss": 1.8553, "step": 810 }, { "epoch": 2.5153374233128836, "grad_norm": 3.2258257658971905, "learning_rate": 5.030674846625767e-07, "loss": 1.865, "step": 820 }, { "epoch": 2.5460122699386503, "grad_norm": 3.0070195680844845, "learning_rate": 5.0920245398773e-07, "loss": 1.8483, "step": 830 }, { "epoch": 2.5766871165644174, "grad_norm": 2.86401603854259, "learning_rate": 5.153374233128835e-07, "loss": 1.8825, "step": 840 }, { "epoch": 2.607361963190184, "grad_norm": 2.8239983106989333, "learning_rate": 5.214723926380368e-07, "loss": 1.8549, "step": 850 }, { "epoch": 2.638036809815951, "grad_norm": 3.0228245480925113, "learning_rate": 5.276073619631901e-07, "loss": 1.8534, "step": 860 }, { "epoch": 2.668711656441718, "grad_norm": 3.407476550864115, "learning_rate": 5.337423312883436e-07, "loss": 1.8691, "step": 870 }, { "epoch": 2.6993865030674846, "grad_norm": 3.0394566261515643, "learning_rate": 5.398773006134969e-07, "loss": 1.858, "step": 880 }, { "epoch": 2.7300613496932513, "grad_norm": 2.9005018784059606, "learning_rate": 5.460122699386502e-07, "loss": 1.8769, "step": 890 }, { "epoch": 2.7607361963190185, "grad_norm": 3.069201075368232, "learning_rate": 5.521472392638037e-07, "loss": 1.8772, "step": 900 }, { "epoch": 2.791411042944785, "grad_norm": 3.1035188318429636, "learning_rate": 5.58282208588957e-07, "loss": 1.7893, "step": 910 }, { "epoch": 2.8220858895705523, "grad_norm": 2.92374023876635, "learning_rate": 5.644171779141103e-07, "loss": 1.8506, "step": 920 }, { "epoch": 2.852760736196319, "grad_norm": 2.9181750683296763, "learning_rate": 5.705521472392638e-07, "loss": 1.8237, "step": 930 }, { "epoch": 2.883435582822086, "grad_norm": 3.0644688369202497, "learning_rate": 5.766871165644171e-07, "loss": 1.8199, "step": 940 }, { "epoch": 2.914110429447853, "grad_norm": 3.0388171866487177, "learning_rate": 5.828220858895705e-07, "loss": 1.8451, "step": 950 }, { "epoch": 2.9447852760736195, "grad_norm": 2.9394714399921673, "learning_rate": 5.88957055214724e-07, "loss": 1.8562, "step": 960 }, { "epoch": 2.9754601226993866, "grad_norm": 3.516190266528493, "learning_rate": 5.950920245398773e-07, "loss": 1.8405, "step": 970 }, { "epoch": 3.0061349693251533, "grad_norm": 2.9630766702006324, "learning_rate": 6.012269938650306e-07, "loss": 1.8408, "step": 980 }, { "epoch": 3.03680981595092, "grad_norm": 2.7943981161053917, "learning_rate": 6.073619631901841e-07, "loss": 1.7695, "step": 990 }, { "epoch": 3.067484662576687, "grad_norm": 2.870330771125139, "learning_rate": 6.134969325153374e-07, "loss": 1.8207, "step": 1000 } ], "logging_steps": 10, "max_steps": 16300, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 53163861147648.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }