{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 12.269938650306749,
  "eval_steps": 10000000,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12269938650306748,
      "grad_norm": 24.408997453689647,
      "learning_rate": 1.2345679012345678e-08,
      "loss": 2.7299,
      "step": 10
    },
    {
      "epoch": 0.24539877300613497,
      "grad_norm": 24.80765178012853,
      "learning_rate": 2.4691358024691355e-08,
      "loss": 2.7355,
      "step": 20
    },
    {
      "epoch": 0.36809815950920244,
      "grad_norm": 25.187726534643133,
      "learning_rate": 3.7037037037037036e-08,
      "loss": 2.7705,
      "step": 30
    },
    {
      "epoch": 0.49079754601226994,
      "grad_norm": 25.323501806850057,
      "learning_rate": 4.938271604938271e-08,
      "loss": 2.7413,
      "step": 40
    },
    {
      "epoch": 0.6134969325153374,
      "grad_norm": 24.247097730244068,
      "learning_rate": 6.172839506172839e-08,
      "loss": 2.7371,
      "step": 50
    },
    {
      "epoch": 0.7361963190184049,
      "grad_norm": 24.31447909409825,
      "learning_rate": 7.407407407407407e-08,
      "loss": 2.7364,
      "step": 60
    },
    {
      "epoch": 0.8588957055214724,
      "grad_norm": 23.244236630258634,
      "learning_rate": 8.641975308641975e-08,
      "loss": 2.6991,
      "step": 70
    },
    {
      "epoch": 0.9815950920245399,
      "grad_norm": 23.476416023688525,
      "learning_rate": 9.876543209876542e-08,
      "loss": 2.7081,
      "step": 80
    },
    {
      "epoch": 1.1042944785276074,
      "grad_norm": 20.366176427963136,
      "learning_rate": 1.111111111111111e-07,
      "loss": 2.6605,
      "step": 90
    },
    {
      "epoch": 1.2269938650306749,
      "grad_norm": 19.455018752097676,
      "learning_rate": 1.2345679012345677e-07,
      "loss": 2.6013,
      "step": 100
    },
    {
      "epoch": 1.3496932515337423,
      "grad_norm": 19.61610178323797,
      "learning_rate": 1.3580246913580246e-07,
      "loss": 2.5733,
      "step": 110
    },
    {
      "epoch": 1.4723926380368098,
      "grad_norm": 10.050332977637998,
      "learning_rate": 1.4814814814814815e-07,
      "loss": 2.4569,
      "step": 120
    },
    {
      "epoch": 1.5950920245398774,
      "grad_norm": 8.220625543189085,
      "learning_rate": 1.604938271604938e-07,
      "loss": 2.4254,
      "step": 130
    },
    {
      "epoch": 1.7177914110429446,
      "grad_norm": 6.393668406476926,
      "learning_rate": 1.728395061728395e-07,
      "loss": 2.4087,
      "step": 140
    },
    {
      "epoch": 1.8404907975460123,
      "grad_norm": 4.395664172815528,
      "learning_rate": 1.8518518518518516e-07,
      "loss": 2.316,
      "step": 150
    },
    {
      "epoch": 1.9631901840490797,
      "grad_norm": 3.372590478708616,
      "learning_rate": 1.9753086419753084e-07,
      "loss": 2.3029,
      "step": 160
    },
    {
      "epoch": 2.085889570552147,
      "grad_norm": 3.0108879202152203,
      "learning_rate": 2.0987654320987653e-07,
      "loss": 2.2702,
      "step": 170
    },
    {
      "epoch": 2.208588957055215,
      "grad_norm": 2.7225559520418483,
      "learning_rate": 2.222222222222222e-07,
      "loss": 2.253,
      "step": 180
    },
    {
      "epoch": 2.331288343558282,
      "grad_norm": 2.4432492884647794,
      "learning_rate": 2.3456790123456788e-07,
      "loss": 2.2483,
      "step": 190
    },
    {
      "epoch": 2.4539877300613497,
      "grad_norm": 2.3400196487113116,
      "learning_rate": 2.4691358024691354e-07,
      "loss": 2.223,
      "step": 200
    },
    {
      "epoch": 2.5766871165644174,
      "grad_norm": 2.1352206898241612,
      "learning_rate": 2.5925925925925923e-07,
      "loss": 2.2025,
      "step": 210
    },
    {
      "epoch": 2.6993865030674846,
      "grad_norm": 2.023643903641845,
      "learning_rate": 2.716049382716049e-07,
      "loss": 2.1908,
      "step": 220
    },
    {
      "epoch": 2.8220858895705523,
      "grad_norm": 1.9335965899623562,
      "learning_rate": 2.839506172839506e-07,
      "loss": 2.1824,
      "step": 230
    },
    {
      "epoch": 2.9447852760736195,
      "grad_norm": 1.9077831623847687,
      "learning_rate": 2.962962962962963e-07,
      "loss": 2.1608,
      "step": 240
    },
    {
      "epoch": 3.067484662576687,
      "grad_norm": 1.8162929105709213,
      "learning_rate": 3.086419753086419e-07,
      "loss": 2.1647,
      "step": 250
    },
    {
      "epoch": 3.190184049079755,
      "grad_norm": 1.7624936746874469,
      "learning_rate": 3.209876543209876e-07,
      "loss": 2.141,
      "step": 260
    },
    {
      "epoch": 3.312883435582822,
      "grad_norm": 1.7925134660582491,
      "learning_rate": 3.333333333333333e-07,
      "loss": 2.1508,
      "step": 270
    },
    {
      "epoch": 3.4355828220858897,
      "grad_norm": 1.6707105797889426,
      "learning_rate": 3.45679012345679e-07,
      "loss": 2.1824,
      "step": 280
    },
    {
      "epoch": 3.558282208588957,
      "grad_norm": 1.8016949939412934,
      "learning_rate": 3.580246913580247e-07,
      "loss": 2.153,
      "step": 290
    },
    {
      "epoch": 3.6809815950920246,
      "grad_norm": 1.7179134496371937,
      "learning_rate": 3.703703703703703e-07,
      "loss": 2.1517,
      "step": 300
    },
    {
      "epoch": 3.8036809815950923,
      "grad_norm": 1.644436248853206,
      "learning_rate": 3.82716049382716e-07,
      "loss": 2.1282,
      "step": 310
    },
    {
      "epoch": 3.9263803680981595,
      "grad_norm": 1.6481513302515154,
      "learning_rate": 3.950617283950617e-07,
      "loss": 2.1479,
      "step": 320
    },
    {
      "epoch": 4.049079754601227,
      "grad_norm": 1.5834405350588614,
      "learning_rate": 4.0740740740740737e-07,
      "loss": 2.1205,
      "step": 330
    },
    {
      "epoch": 4.171779141104294,
      "grad_norm": 1.5774683971395849,
      "learning_rate": 4.1975308641975306e-07,
      "loss": 2.0829,
      "step": 340
    },
    {
      "epoch": 4.294478527607362,
      "grad_norm": 1.5942216093543309,
      "learning_rate": 4.320987654320987e-07,
      "loss": 2.1374,
      "step": 350
    },
    {
      "epoch": 4.41717791411043,
      "grad_norm": 1.9297965568116624,
      "learning_rate": 4.444444444444444e-07,
      "loss": 2.1496,
      "step": 360
    },
    {
      "epoch": 4.539877300613497,
      "grad_norm": 1.6010753048632336,
      "learning_rate": 4.5679012345679007e-07,
      "loss": 2.1165,
      "step": 370
    },
    {
      "epoch": 4.662576687116564,
      "grad_norm": 2.0277456221645247,
      "learning_rate": 4.6913580246913576e-07,
      "loss": 2.0951,
      "step": 380
    },
    {
      "epoch": 4.785276073619632,
      "grad_norm": 1.6100359846439383,
      "learning_rate": 4.814814814814814e-07,
      "loss": 2.1221,
      "step": 390
    },
    {
      "epoch": 4.9079754601226995,
      "grad_norm": 1.5913594422754647,
      "learning_rate": 4.938271604938271e-07,
      "loss": 2.124,
      "step": 400
    },
    {
      "epoch": 5.030674846625767,
      "grad_norm": 1.543916416223568,
      "learning_rate": 5.061728395061729e-07,
      "loss": 2.093,
      "step": 410
    },
    {
      "epoch": 5.153374233128835,
      "grad_norm": 1.6314296110841167,
      "learning_rate": 5.185185185185185e-07,
      "loss": 2.0833,
      "step": 420
    },
    {
      "epoch": 5.276073619631902,
      "grad_norm": 1.4616507419385325,
      "learning_rate": 5.308641975308642e-07,
      "loss": 2.1095,
      "step": 430
    },
    {
      "epoch": 5.398773006134969,
      "grad_norm": 1.5844834668769874,
      "learning_rate": 5.432098765432098e-07,
      "loss": 2.094,
      "step": 440
    },
    {
      "epoch": 5.521472392638037,
      "grad_norm": 1.558323789647778,
      "learning_rate": 5.555555555555555e-07,
      "loss": 2.0901,
      "step": 450
    },
    {
      "epoch": 5.644171779141105,
      "grad_norm": 1.526929915237913,
      "learning_rate": 5.679012345679012e-07,
      "loss": 2.0844,
      "step": 460
    },
    {
      "epoch": 5.766871165644172,
      "grad_norm": 1.4896565468847545,
      "learning_rate": 5.802469135802469e-07,
      "loss": 2.0774,
      "step": 470
    },
    {
      "epoch": 5.889570552147239,
      "grad_norm": 1.4983542070251128,
      "learning_rate": 5.925925925925926e-07,
      "loss": 2.0979,
      "step": 480
    },
    {
      "epoch": 6.012269938650307,
      "grad_norm": 1.474608208975854,
      "learning_rate": 6.049382716049383e-07,
      "loss": 2.087,
      "step": 490
    },
    {
      "epoch": 6.134969325153374,
      "grad_norm": 1.6014533478490705,
      "learning_rate": 6.172839506172839e-07,
      "loss": 2.0547,
      "step": 500
    },
    {
      "epoch": 6.257668711656442,
      "grad_norm": 1.454642285240305,
      "learning_rate": 6.296296296296296e-07,
      "loss": 2.0449,
      "step": 510
    },
    {
      "epoch": 6.38036809815951,
      "grad_norm": 1.6436891729462102,
      "learning_rate": 6.419753086419752e-07,
      "loss": 2.0696,
      "step": 520
    },
    {
      "epoch": 6.5030674846625764,
      "grad_norm": 1.4807234984332847,
      "learning_rate": 6.54320987654321e-07,
      "loss": 2.0742,
      "step": 530
    },
    {
      "epoch": 6.625766871165644,
      "grad_norm": 1.5356652480116542,
      "learning_rate": 6.666666666666666e-07,
      "loss": 2.0696,
      "step": 540
    },
    {
      "epoch": 6.748466257668712,
      "grad_norm": 1.6579289531016874,
      "learning_rate": 6.790123456790123e-07,
      "loss": 2.0551,
      "step": 550
    },
    {
      "epoch": 6.871165644171779,
      "grad_norm": 1.55170120881645,
      "learning_rate": 6.91358024691358e-07,
      "loss": 2.0657,
      "step": 560
    },
    {
      "epoch": 6.993865030674847,
      "grad_norm": 1.4899273802926296,
      "learning_rate": 7.037037037037037e-07,
      "loss": 2.078,
      "step": 570
    },
    {
      "epoch": 7.116564417177914,
      "grad_norm": 1.44595351107914,
      "learning_rate": 7.160493827160494e-07,
      "loss": 2.0238,
      "step": 580
    },
    {
      "epoch": 7.2392638036809815,
      "grad_norm": 1.453250943181157,
      "learning_rate": 7.28395061728395e-07,
      "loss": 2.0446,
      "step": 590
    },
    {
      "epoch": 7.361963190184049,
      "grad_norm": 1.598197750855362,
      "learning_rate": 7.407407407407406e-07,
      "loss": 2.0297,
      "step": 600
    },
    {
      "epoch": 7.484662576687117,
      "grad_norm": 1.6244958519175605,
      "learning_rate": 7.530864197530864e-07,
      "loss": 2.0399,
      "step": 610
    },
    {
      "epoch": 7.6073619631901845,
      "grad_norm": 1.690696721697701,
      "learning_rate": 7.65432098765432e-07,
      "loss": 2.0291,
      "step": 620
    },
    {
      "epoch": 7.730061349693251,
      "grad_norm": 1.540010541664445,
      "learning_rate": 7.777777777777778e-07,
      "loss": 2.047,
      "step": 630
    },
    {
      "epoch": 7.852760736196319,
      "grad_norm": 1.4784188621578072,
      "learning_rate": 7.901234567901234e-07,
      "loss": 2.0237,
      "step": 640
    },
    {
      "epoch": 7.975460122699387,
      "grad_norm": 1.5806193808808893,
      "learning_rate": 8.024691358024691e-07,
      "loss": 2.0349,
      "step": 650
    },
    {
      "epoch": 8.098159509202453,
      "grad_norm": 1.5066964330658228,
      "learning_rate": 8.148148148148147e-07,
      "loss": 2.019,
      "step": 660
    },
    {
      "epoch": 8.220858895705522,
      "grad_norm": 1.4657046782549297,
      "learning_rate": 8.271604938271604e-07,
      "loss": 2.0146,
      "step": 670
    },
    {
      "epoch": 8.343558282208589,
      "grad_norm": 1.5129566513929427,
      "learning_rate": 8.395061728395061e-07,
      "loss": 1.9937,
      "step": 680
    },
    {
      "epoch": 8.466257668711656,
      "grad_norm": 1.6073531745364882,
      "learning_rate": 8.518518518518518e-07,
      "loss": 2.0201,
      "step": 690
    },
    {
      "epoch": 8.588957055214724,
      "grad_norm": 1.465742853126048,
      "learning_rate": 8.641975308641974e-07,
      "loss": 2.0263,
      "step": 700
    },
    {
      "epoch": 8.71165644171779,
      "grad_norm": 1.4713574884471192,
      "learning_rate": 8.765432098765432e-07,
      "loss": 2.0074,
      "step": 710
    },
    {
      "epoch": 8.83435582822086,
      "grad_norm": 1.4818223515949642,
      "learning_rate": 8.888888888888888e-07,
      "loss": 1.9803,
      "step": 720
    },
    {
      "epoch": 8.957055214723926,
      "grad_norm": 1.4960768789879928,
      "learning_rate": 9.012345679012346e-07,
      "loss": 1.9988,
      "step": 730
    },
    {
      "epoch": 9.079754601226995,
      "grad_norm": 1.5034869146088043,
      "learning_rate": 9.135802469135801e-07,
      "loss": 1.9713,
      "step": 740
    },
    {
      "epoch": 9.202453987730062,
      "grad_norm": 1.743667247923265,
      "learning_rate": 9.259259259259259e-07,
      "loss": 1.9839,
      "step": 750
    },
    {
      "epoch": 9.325153374233128,
      "grad_norm": 1.4121136174696463,
      "learning_rate": 9.382716049382715e-07,
      "loss": 1.986,
      "step": 760
    },
    {
      "epoch": 9.447852760736197,
      "grad_norm": 1.5213247914456218,
      "learning_rate": 9.506172839506172e-07,
      "loss": 1.9643,
      "step": 770
    },
    {
      "epoch": 9.570552147239264,
      "grad_norm": 1.701165501505246,
      "learning_rate": 9.629629629629628e-07,
      "loss": 1.944,
      "step": 780
    },
    {
      "epoch": 9.69325153374233,
      "grad_norm": 1.7772160828783827,
      "learning_rate": 9.753086419753086e-07,
      "loss": 1.9757,
      "step": 790
    },
    {
      "epoch": 9.815950920245399,
      "grad_norm": 1.5247152271180484,
      "learning_rate": 9.876543209876542e-07,
      "loss": 1.964,
      "step": 800
    },
    {
      "epoch": 9.938650306748466,
      "grad_norm": 1.591870678195776,
      "learning_rate": 1e-06,
      "loss": 1.9758,
      "step": 810
    },
    {
      "epoch": 10.061349693251534,
      "grad_norm": 1.8151248184194306,
      "learning_rate": 9.999953571567085e-07,
      "loss": 1.9689,
      "step": 820
    },
    {
      "epoch": 10.184049079754601,
      "grad_norm": 1.5659661977407868,
      "learning_rate": 9.999814287130579e-07,
      "loss": 1.9584,
      "step": 830
    },
    {
      "epoch": 10.30674846625767,
      "grad_norm": 1.515025607589833,
      "learning_rate": 9.999582149277185e-07,
      "loss": 1.9602,
      "step": 840
    },
    {
      "epoch": 10.429447852760736,
      "grad_norm": 1.6025743323173112,
      "learning_rate": 9.999257162318025e-07,
      "loss": 1.936,
      "step": 850
    },
    {
      "epoch": 10.552147239263803,
      "grad_norm": 1.622286529441025,
      "learning_rate": 9.99883933228855e-07,
      "loss": 1.9014,
      "step": 860
    },
    {
      "epoch": 10.674846625766872,
      "grad_norm": 1.9193284429523172,
      "learning_rate": 9.998328666948437e-07,
      "loss": 1.9282,
      "step": 870
    },
    {
      "epoch": 10.797546012269938,
      "grad_norm": 1.4435929643824292,
      "learning_rate": 9.997725175781443e-07,
      "loss": 1.9131,
      "step": 880
    },
    {
      "epoch": 10.920245398773005,
      "grad_norm": 1.5657198611737813,
      "learning_rate": 9.99702886999523e-07,
      "loss": 1.9444,
      "step": 890
    },
    {
      "epoch": 11.042944785276074,
      "grad_norm": 1.5303385836526135,
      "learning_rate": 9.99623976252115e-07,
      "loss": 1.9228,
      "step": 900
    },
    {
      "epoch": 11.16564417177914,
      "grad_norm": 1.5388464555082484,
      "learning_rate": 9.995357868014013e-07,
      "loss": 1.8934,
      "step": 910
    },
    {
      "epoch": 11.28834355828221,
      "grad_norm": 1.6428643084524577,
      "learning_rate": 9.994383202851812e-07,
      "loss": 1.9055,
      "step": 920
    },
    {
      "epoch": 11.411042944785276,
      "grad_norm": 1.4935758108698727,
      "learning_rate": 9.993315785135416e-07,
      "loss": 1.9118,
      "step": 930
    },
    {
      "epoch": 11.533742331288344,
      "grad_norm": 1.4752185060017586,
      "learning_rate": 9.992155634688238e-07,
      "loss": 1.8953,
      "step": 940
    },
    {
      "epoch": 11.656441717791411,
      "grad_norm": 1.5678765737976184,
      "learning_rate": 9.990902773055865e-07,
      "loss": 1.8996,
      "step": 950
    },
    {
      "epoch": 11.779141104294478,
      "grad_norm": 1.5181431605841578,
      "learning_rate": 9.98955722350566e-07,
      "loss": 1.8932,
      "step": 960
    },
    {
      "epoch": 11.901840490797547,
      "grad_norm": 1.8570007259349715,
      "learning_rate": 9.988119011026324e-07,
      "loss": 1.8856,
      "step": 970
    },
    {
      "epoch": 12.024539877300613,
      "grad_norm": 1.5488809335108225,
      "learning_rate": 9.986588162327434e-07,
      "loss": 1.8918,
      "step": 980
    },
    {
      "epoch": 12.14723926380368,
      "grad_norm": 1.5533160872341008,
      "learning_rate": 9.98496470583896e-07,
      "loss": 1.8707,
      "step": 990
    },
    {
      "epoch": 12.269938650306749,
      "grad_norm": 1.5667441114509637,
      "learning_rate": 9.983248671710714e-07,
      "loss": 1.8489,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 8100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 67376603725824.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}