{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.067484662576687,
  "eval_steps": 10000000,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03067484662576687,
      "grad_norm": 40.34086591617319,
      "learning_rate": 6.134969325153374e-09,
      "loss": 2.8872,
      "step": 10
    },
    {
      "epoch": 0.06134969325153374,
      "grad_norm": 40.21853524962094,
      "learning_rate": 1.2269938650306748e-08,
      "loss": 2.8167,
      "step": 20
    },
    {
      "epoch": 0.09202453987730061,
      "grad_norm": 42.91710233147346,
      "learning_rate": 1.8404907975460124e-08,
      "loss": 2.9017,
      "step": 30
    },
    {
      "epoch": 0.12269938650306748,
      "grad_norm": 42.19478017724499,
      "learning_rate": 2.4539877300613496e-08,
      "loss": 2.9177,
      "step": 40
    },
    {
      "epoch": 0.15337423312883436,
      "grad_norm": 39.89856563443135,
      "learning_rate": 3.067484662576687e-08,
      "loss": 2.8621,
      "step": 50
    },
    {
      "epoch": 0.18404907975460122,
      "grad_norm": 39.031000731023376,
      "learning_rate": 3.680981595092025e-08,
      "loss": 2.8787,
      "step": 60
    },
    {
      "epoch": 0.2147239263803681,
      "grad_norm": 40.17556155872599,
      "learning_rate": 4.294478527607362e-08,
      "loss": 2.8723,
      "step": 70
    },
    {
      "epoch": 0.24539877300613497,
      "grad_norm": 40.44537988014516,
      "learning_rate": 4.907975460122699e-08,
      "loss": 2.8328,
      "step": 80
    },
    {
      "epoch": 0.27607361963190186,
      "grad_norm": 36.38071257012731,
      "learning_rate": 5.521472392638036e-08,
      "loss": 2.8269,
      "step": 90
    },
    {
      "epoch": 0.3067484662576687,
      "grad_norm": 35.01954486493492,
      "learning_rate": 6.134969325153374e-08,
      "loss": 2.8021,
      "step": 100
    },
    {
      "epoch": 0.3374233128834356,
      "grad_norm": 35.816703465963236,
      "learning_rate": 6.748466257668711e-08,
      "loss": 2.7854,
      "step": 110
    },
    {
      "epoch": 0.36809815950920244,
      "grad_norm": 25.57200780260961,
      "learning_rate": 7.36196319018405e-08,
      "loss": 2.6267,
      "step": 120
    },
    {
      "epoch": 0.3987730061349693,
      "grad_norm": 22.490209736367557,
      "learning_rate": 7.975460122699386e-08,
      "loss": 2.5126,
      "step": 130
    },
    {
      "epoch": 0.4294478527607362,
      "grad_norm": 21.154756083713355,
      "learning_rate": 8.588957055214724e-08,
      "loss": 2.5148,
      "step": 140
    },
    {
      "epoch": 0.4601226993865031,
      "grad_norm": 18.721430927730754,
      "learning_rate": 9.202453987730061e-08,
      "loss": 2.4065,
      "step": 150
    },
    {
      "epoch": 0.49079754601226994,
      "grad_norm": 11.049403510574644,
      "learning_rate": 9.815950920245398e-08,
      "loss": 2.3448,
      "step": 160
    },
    {
      "epoch": 0.5214723926380368,
      "grad_norm": 8.021100065182837,
      "learning_rate": 1.0429447852760735e-07,
      "loss": 2.1846,
      "step": 170
    },
    {
      "epoch": 0.5521472392638037,
      "grad_norm": 6.759878770426535,
      "learning_rate": 1.1042944785276073e-07,
      "loss": 2.1407,
      "step": 180
    },
    {
      "epoch": 0.5828220858895705,
      "grad_norm": 5.6950515516559275,
      "learning_rate": 1.165644171779141e-07,
      "loss": 2.0731,
      "step": 190
    },
    {
      "epoch": 0.6134969325153374,
      "grad_norm": 4.589530123869859,
      "learning_rate": 1.2269938650306748e-07,
      "loss": 2.096,
      "step": 200
    },
    {
      "epoch": 0.6441717791411042,
      "grad_norm": 4.0999552773713255,
      "learning_rate": 1.2883435582822087e-07,
      "loss": 2.0651,
      "step": 210
    },
    {
      "epoch": 0.6748466257668712,
      "grad_norm": 3.8635184963656877,
      "learning_rate": 1.3496932515337422e-07,
      "loss": 2.0136,
      "step": 220
    },
    {
      "epoch": 0.7055214723926381,
      "grad_norm": 3.801649063409229,
      "learning_rate": 1.4110429447852758e-07,
      "loss": 2.0283,
      "step": 230
    },
    {
      "epoch": 0.7361963190184049,
      "grad_norm": 3.499553222570716,
      "learning_rate": 1.47239263803681e-07,
      "loss": 2.0207,
      "step": 240
    },
    {
      "epoch": 0.7668711656441718,
      "grad_norm": 3.5859940211720125,
      "learning_rate": 1.5337423312883435e-07,
      "loss": 1.9723,
      "step": 250
    },
    {
      "epoch": 0.7975460122699386,
      "grad_norm": 3.5360060488259433,
      "learning_rate": 1.595092024539877e-07,
      "loss": 1.9378,
      "step": 260
    },
    {
      "epoch": 0.8282208588957055,
      "grad_norm": 3.4861162838130446,
      "learning_rate": 1.656441717791411e-07,
      "loss": 2.0179,
      "step": 270
    },
    {
      "epoch": 0.8588957055214724,
      "grad_norm": 3.5048041671476895,
      "learning_rate": 1.7177914110429448e-07,
      "loss": 1.9889,
      "step": 280
    },
    {
      "epoch": 0.8895705521472392,
      "grad_norm": 3.5320508940958515,
      "learning_rate": 1.7791411042944784e-07,
      "loss": 1.9974,
      "step": 290
    },
    {
      "epoch": 0.9202453987730062,
      "grad_norm": 3.479602078136922,
      "learning_rate": 1.8404907975460122e-07,
      "loss": 1.94,
      "step": 300
    },
    {
      "epoch": 0.950920245398773,
      "grad_norm": 3.463981738252198,
      "learning_rate": 1.901840490797546e-07,
      "loss": 1.956,
      "step": 310
    },
    {
      "epoch": 0.9815950920245399,
      "grad_norm": 3.2538325898676637,
      "learning_rate": 1.9631901840490797e-07,
      "loss": 1.9886,
      "step": 320
    },
    {
      "epoch": 1.0122699386503067,
      "grad_norm": 3.5596683489380294,
      "learning_rate": 2.0245398773006135e-07,
      "loss": 1.9918,
      "step": 330
    },
    {
      "epoch": 1.0429447852760736,
      "grad_norm": 3.238486372143609,
      "learning_rate": 2.085889570552147e-07,
      "loss": 1.9839,
      "step": 340
    },
    {
      "epoch": 1.0736196319018405,
      "grad_norm": 3.246470029117208,
      "learning_rate": 2.147239263803681e-07,
      "loss": 1.9707,
      "step": 350
    },
    {
      "epoch": 1.1042944785276074,
      "grad_norm": 3.3548676253751895,
      "learning_rate": 2.2085889570552145e-07,
      "loss": 1.9399,
      "step": 360
    },
    {
      "epoch": 1.1349693251533743,
      "grad_norm": 2.9837136528798998,
      "learning_rate": 2.2699386503067484e-07,
      "loss": 1.9081,
      "step": 370
    },
    {
      "epoch": 1.165644171779141,
      "grad_norm": 3.123338783793841,
      "learning_rate": 2.331288343558282e-07,
      "loss": 1.9495,
      "step": 380
    },
    {
      "epoch": 1.196319018404908,
      "grad_norm": 3.12823522424731,
      "learning_rate": 2.392638036809816e-07,
      "loss": 1.9149,
      "step": 390
    },
    {
      "epoch": 1.2269938650306749,
      "grad_norm": 3.127421120344186,
      "learning_rate": 2.4539877300613496e-07,
      "loss": 1.9231,
      "step": 400
    },
    {
      "epoch": 1.2576687116564418,
      "grad_norm": 3.0563142379215784,
      "learning_rate": 2.5153374233128835e-07,
      "loss": 1.9259,
      "step": 410
    },
    {
      "epoch": 1.2883435582822087,
      "grad_norm": 3.002860823723609,
      "learning_rate": 2.5766871165644173e-07,
      "loss": 1.8779,
      "step": 420
    },
    {
      "epoch": 1.3190184049079754,
      "grad_norm": 3.040065378882382,
      "learning_rate": 2.6380368098159506e-07,
      "loss": 1.919,
      "step": 430
    },
    {
      "epoch": 1.3496932515337423,
      "grad_norm": 3.2257346917267484,
      "learning_rate": 2.6993865030674845e-07,
      "loss": 1.9377,
      "step": 440
    },
    {
      "epoch": 1.3803680981595092,
      "grad_norm": 2.9954147708634724,
      "learning_rate": 2.7607361963190183e-07,
      "loss": 1.9159,
      "step": 450
    },
    {
      "epoch": 1.4110429447852761,
      "grad_norm": 3.0206681193118583,
      "learning_rate": 2.8220858895705517e-07,
      "loss": 1.9015,
      "step": 460
    },
    {
      "epoch": 1.441717791411043,
      "grad_norm": 3.1151975930939413,
      "learning_rate": 2.8834355828220855e-07,
      "loss": 1.9162,
      "step": 470
    },
    {
      "epoch": 1.4723926380368098,
      "grad_norm": 3.110216679336694,
      "learning_rate": 2.94478527607362e-07,
      "loss": 1.92,
      "step": 480
    },
    {
      "epoch": 1.5030674846625767,
      "grad_norm": 3.046863348968689,
      "learning_rate": 3.006134969325153e-07,
      "loss": 1.9171,
      "step": 490
    },
    {
      "epoch": 1.5337423312883436,
      "grad_norm": 2.945865360342739,
      "learning_rate": 3.067484662576687e-07,
      "loss": 1.9095,
      "step": 500
    },
    {
      "epoch": 1.5644171779141103,
      "grad_norm": 3.176637928075508,
      "learning_rate": 3.128834355828221e-07,
      "loss": 1.9282,
      "step": 510
    },
    {
      "epoch": 1.5950920245398774,
      "grad_norm": 3.0369820219737056,
      "learning_rate": 3.190184049079754e-07,
      "loss": 1.936,
      "step": 520
    },
    {
      "epoch": 1.6257668711656441,
      "grad_norm": 3.004806377144078,
      "learning_rate": 3.251533742331288e-07,
      "loss": 1.9434,
      "step": 530
    },
    {
      "epoch": 1.656441717791411,
      "grad_norm": 3.2902184434846133,
      "learning_rate": 3.312883435582822e-07,
      "loss": 1.8933,
      "step": 540
    },
    {
      "epoch": 1.687116564417178,
      "grad_norm": 3.1870514820826905,
      "learning_rate": 3.374233128834356e-07,
      "loss": 1.9213,
      "step": 550
    },
    {
      "epoch": 1.7177914110429446,
      "grad_norm": 3.131942976786612,
      "learning_rate": 3.4355828220858896e-07,
      "loss": 1.9434,
      "step": 560
    },
    {
      "epoch": 1.7484662576687118,
      "grad_norm": 3.0513682450360378,
      "learning_rate": 3.496932515337423e-07,
      "loss": 1.8838,
      "step": 570
    },
    {
      "epoch": 1.7791411042944785,
      "grad_norm": 3.177020551414707,
      "learning_rate": 3.558282208588957e-07,
      "loss": 1.9052,
      "step": 580
    },
    {
      "epoch": 1.8098159509202454,
      "grad_norm": 2.9620179420902506,
      "learning_rate": 3.6196319018404906e-07,
      "loss": 1.8909,
      "step": 590
    },
    {
      "epoch": 1.8404907975460123,
      "grad_norm": 2.990366230166413,
      "learning_rate": 3.6809815950920245e-07,
      "loss": 1.8493,
      "step": 600
    },
    {
      "epoch": 1.871165644171779,
      "grad_norm": 2.975455921573483,
      "learning_rate": 3.7423312883435583e-07,
      "loss": 1.876,
      "step": 610
    },
    {
      "epoch": 1.9018404907975461,
      "grad_norm": 3.1776175076162905,
      "learning_rate": 3.803680981595092e-07,
      "loss": 1.8962,
      "step": 620
    },
    {
      "epoch": 1.9325153374233128,
      "grad_norm": 3.0920446560067725,
      "learning_rate": 3.8650306748466255e-07,
      "loss": 1.9504,
      "step": 630
    },
    {
      "epoch": 1.9631901840490797,
      "grad_norm": 3.0815885404494883,
      "learning_rate": 3.9263803680981593e-07,
      "loss": 1.912,
      "step": 640
    },
    {
      "epoch": 1.9938650306748467,
      "grad_norm": 3.111764851371804,
      "learning_rate": 3.9877300613496926e-07,
      "loss": 1.9481,
      "step": 650
    },
    {
      "epoch": 2.0245398773006134,
      "grad_norm": 2.7879149976133806,
      "learning_rate": 4.049079754601227e-07,
      "loss": 1.8721,
      "step": 660
    },
    {
      "epoch": 2.0552147239263805,
      "grad_norm": 3.001396783754136,
      "learning_rate": 4.110429447852761e-07,
      "loss": 1.8426,
      "step": 670
    },
    {
      "epoch": 2.085889570552147,
      "grad_norm": 2.818029202172762,
      "learning_rate": 4.171779141104294e-07,
      "loss": 1.8664,
      "step": 680
    },
    {
      "epoch": 2.116564417177914,
      "grad_norm": 3.0655338296444645,
      "learning_rate": 4.233128834355828e-07,
      "loss": 1.8812,
      "step": 690
    },
    {
      "epoch": 2.147239263803681,
      "grad_norm": 3.0134914811420237,
      "learning_rate": 4.294478527607362e-07,
      "loss": 1.8828,
      "step": 700
    },
    {
      "epoch": 2.1779141104294477,
      "grad_norm": 3.0293103580307252,
      "learning_rate": 4.355828220858895e-07,
      "loss": 1.8596,
      "step": 710
    },
    {
      "epoch": 2.208588957055215,
      "grad_norm": 2.7905142590645284,
      "learning_rate": 4.417177914110429e-07,
      "loss": 1.8733,
      "step": 720
    },
    {
      "epoch": 2.2392638036809815,
      "grad_norm": 2.925363620028882,
      "learning_rate": 4.4785276073619634e-07,
      "loss": 1.8983,
      "step": 730
    },
    {
      "epoch": 2.2699386503067487,
      "grad_norm": 2.8448443834523975,
      "learning_rate": 4.5398773006134967e-07,
      "loss": 1.8764,
      "step": 740
    },
    {
      "epoch": 2.3006134969325154,
      "grad_norm": 2.9628937142708875,
      "learning_rate": 4.6012269938650306e-07,
      "loss": 1.8754,
      "step": 750
    },
    {
      "epoch": 2.331288343558282,
      "grad_norm": 3.4170857688413427,
      "learning_rate": 4.662576687116564e-07,
      "loss": 1.8782,
      "step": 760
    },
    {
      "epoch": 2.361963190184049,
      "grad_norm": 3.0679580454695388,
      "learning_rate": 4.7239263803680977e-07,
      "loss": 1.904,
      "step": 770
    },
    {
      "epoch": 2.392638036809816,
      "grad_norm": 2.9137539934318784,
      "learning_rate": 4.785276073619632e-07,
      "loss": 1.8132,
      "step": 780
    },
    {
      "epoch": 2.4233128834355826,
      "grad_norm": 3.061624511427873,
      "learning_rate": 4.846625766871165e-07,
      "loss": 1.8769,
      "step": 790
    },
    {
      "epoch": 2.4539877300613497,
      "grad_norm": 3.139832542082945,
      "learning_rate": 4.907975460122699e-07,
      "loss": 1.9011,
      "step": 800
    },
    {
      "epoch": 2.4846625766871164,
      "grad_norm": 3.040517067124859,
      "learning_rate": 4.969325153374233e-07,
      "loss": 1.8553,
      "step": 810
    },
    {
      "epoch": 2.5153374233128836,
      "grad_norm": 3.2258257658971905,
      "learning_rate": 5.030674846625767e-07,
      "loss": 1.865,
      "step": 820
    },
    {
      "epoch": 2.5460122699386503,
      "grad_norm": 3.0070195680844845,
      "learning_rate": 5.0920245398773e-07,
      "loss": 1.8483,
      "step": 830
    },
    {
      "epoch": 2.5766871165644174,
      "grad_norm": 2.86401603854259,
      "learning_rate": 5.153374233128835e-07,
      "loss": 1.8825,
      "step": 840
    },
    {
      "epoch": 2.607361963190184,
      "grad_norm": 2.8239983106989333,
      "learning_rate": 5.214723926380368e-07,
      "loss": 1.8549,
      "step": 850
    },
    {
      "epoch": 2.638036809815951,
      "grad_norm": 3.0228245480925113,
      "learning_rate": 5.276073619631901e-07,
      "loss": 1.8534,
      "step": 860
    },
    {
      "epoch": 2.668711656441718,
      "grad_norm": 3.407476550864115,
      "learning_rate": 5.337423312883436e-07,
      "loss": 1.8691,
      "step": 870
    },
    {
      "epoch": 2.6993865030674846,
      "grad_norm": 3.0394566261515643,
      "learning_rate": 5.398773006134969e-07,
      "loss": 1.858,
      "step": 880
    },
    {
      "epoch": 2.7300613496932513,
      "grad_norm": 2.9005018784059606,
      "learning_rate": 5.460122699386502e-07,
      "loss": 1.8769,
      "step": 890
    },
    {
      "epoch": 2.7607361963190185,
      "grad_norm": 3.069201075368232,
      "learning_rate": 5.521472392638037e-07,
      "loss": 1.8772,
      "step": 900
    },
    {
      "epoch": 2.791411042944785,
      "grad_norm": 3.1035188318429636,
      "learning_rate": 5.58282208588957e-07,
      "loss": 1.7893,
      "step": 910
    },
    {
      "epoch": 2.8220858895705523,
      "grad_norm": 2.92374023876635,
      "learning_rate": 5.644171779141103e-07,
      "loss": 1.8506,
      "step": 920
    },
    {
      "epoch": 2.852760736196319,
      "grad_norm": 2.9181750683296763,
      "learning_rate": 5.705521472392638e-07,
      "loss": 1.8237,
      "step": 930
    },
    {
      "epoch": 2.883435582822086,
      "grad_norm": 3.0644688369202497,
      "learning_rate": 5.766871165644171e-07,
      "loss": 1.8199,
      "step": 940
    },
    {
      "epoch": 2.914110429447853,
      "grad_norm": 3.0388171866487177,
      "learning_rate": 5.828220858895705e-07,
      "loss": 1.8451,
      "step": 950
    },
    {
      "epoch": 2.9447852760736195,
      "grad_norm": 2.9394714399921673,
      "learning_rate": 5.88957055214724e-07,
      "loss": 1.8562,
      "step": 960
    },
    {
      "epoch": 2.9754601226993866,
      "grad_norm": 3.516190266528493,
      "learning_rate": 5.950920245398773e-07,
      "loss": 1.8405,
      "step": 970
    },
    {
      "epoch": 3.0061349693251533,
      "grad_norm": 2.9630766702006324,
      "learning_rate": 6.012269938650306e-07,
      "loss": 1.8408,
      "step": 980
    },
    {
      "epoch": 3.03680981595092,
      "grad_norm": 2.7943981161053917,
      "learning_rate": 6.073619631901841e-07,
      "loss": 1.7695,
      "step": 990
    },
    {
      "epoch": 3.067484662576687,
      "grad_norm": 2.870330771125139,
      "learning_rate": 6.134969325153374e-07,
      "loss": 1.8207,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 16300,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 53163861147648.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}