{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 1159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": null, "eval_runtime": 130.0879, "eval_samples_per_second": 43.863, "eval_steps_per_second": 5.489, "step": 0 }, { "epoch": 0.008628127696289905, "grad_norm": 208.8164520263672, "learning_rate": 2.0000000000000003e-06, "loss": 44.1582, "step": 10 }, { "epoch": 0.01725625539257981, "grad_norm": 95.85818481445312, "learning_rate": 7.000000000000001e-06, "loss": 20.8334, "step": 20 }, { "epoch": 0.025884383088869714, "grad_norm": 30.946754455566406, "learning_rate": 1.2e-05, "loss": 6.8289, "step": 30 }, { "epoch": 0.03451251078515962, "grad_norm": 15.001774787902832, "learning_rate": 1.7000000000000003e-05, "loss": 6.1975, "step": 40 }, { "epoch": 0.04314063848144953, "grad_norm": 36.383766174316406, "learning_rate": 2.2000000000000003e-05, "loss": 9.5655, "step": 50 }, { "epoch": 0.05176876617773943, "grad_norm": 21.175804138183594, "learning_rate": 2.7000000000000002e-05, "loss": 8.0912, "step": 60 }, { "epoch": 0.060396893874029335, "grad_norm": 18.93609046936035, "learning_rate": 3.2000000000000005e-05, "loss": 10.4302, "step": 70 }, { "epoch": 0.06902502157031924, "grad_norm": 15.232029914855957, "learning_rate": 3.7e-05, "loss": 9.1437, "step": 80 }, { "epoch": 0.07765314926660914, "grad_norm": 61.93816375732422, "learning_rate": 4.2e-05, "loss": 7.0303, "step": 90 }, { "epoch": 0.08628127696289906, "grad_norm": 13.675076484680176, "learning_rate": 4.7e-05, "loss": 6.6254, "step": 100 }, { "epoch": 0.09490940465918896, "grad_norm": 15.733867645263672, "learning_rate": 4.9811142587346554e-05, "loss": 11.0107, "step": 110 }, { "epoch": 0.10353753235547886, "grad_norm": 23.753643035888672, "learning_rate": 4.9338999055712935e-05, "loss": 8.7021, "step": 120 }, { "epoch": 0.11216566005176877, "grad_norm": 17.372941970825195, 
"learning_rate": 4.8866855524079323e-05, "loss": 7.971, "step": 130 }, { "epoch": 0.12079378774805867, "grad_norm": 16.303627014160156, "learning_rate": 4.8394711992445705e-05, "loss": 7.7894, "step": 140 }, { "epoch": 0.12942191544434858, "grad_norm": 22.732084274291992, "learning_rate": 4.7922568460812086e-05, "loss": 8.4125, "step": 150 }, { "epoch": 0.13805004314063848, "grad_norm": 7.587573528289795, "learning_rate": 4.7450424929178475e-05, "loss": 8.7211, "step": 160 }, { "epoch": 0.14667817083692838, "grad_norm": 11.529291152954102, "learning_rate": 4.6978281397544856e-05, "loss": 7.4511, "step": 170 }, { "epoch": 0.15530629853321828, "grad_norm": 18.30257225036621, "learning_rate": 4.650613786591124e-05, "loss": 8.3549, "step": 180 }, { "epoch": 0.16393442622950818, "grad_norm": 35.710201263427734, "learning_rate": 4.6033994334277626e-05, "loss": 7.7639, "step": 190 }, { "epoch": 0.1725625539257981, "grad_norm": 17.525699615478516, "learning_rate": 4.556185080264401e-05, "loss": 8.7268, "step": 200 }, { "epoch": 0.181190681622088, "grad_norm": 12.543689727783203, "learning_rate": 4.508970727101039e-05, "loss": 6.5113, "step": 210 }, { "epoch": 0.1898188093183779, "grad_norm": 11.80916976928711, "learning_rate": 4.461756373937677e-05, "loss": 7.9873, "step": 220 }, { "epoch": 0.1984469370146678, "grad_norm": 5.335144996643066, "learning_rate": 4.414542020774315e-05, "loss": 7.3828, "step": 230 }, { "epoch": 0.2070750647109577, "grad_norm": 14.807497024536133, "learning_rate": 4.3673276676109534e-05, "loss": 7.9503, "step": 240 }, { "epoch": 0.21570319240724764, "grad_norm": 7.473222255706787, "learning_rate": 4.320113314447592e-05, "loss": 9.4133, "step": 250 }, { "epoch": 0.22433132010353754, "grad_norm": 16.84607696533203, "learning_rate": 4.272898961284231e-05, "loss": 11.3476, "step": 260 }, { "epoch": 0.23295944779982744, "grad_norm": 14.314810752868652, "learning_rate": 4.225684608120869e-05, "loss": 8.3349, "step": 270 }, { "epoch": 
0.24158757549611734, "grad_norm": 7.726313591003418, "learning_rate": 4.178470254957507e-05, "loss": 9.0962, "step": 280 }, { "epoch": 0.25021570319240727, "grad_norm": 5.218513488769531, "learning_rate": 4.1312559017941455e-05, "loss": 9.2658, "step": 290 }, { "epoch": 0.25884383088869717, "grad_norm": 8.278141021728516, "learning_rate": 4.0840415486307836e-05, "loss": 12.7095, "step": 300 }, { "epoch": 0.26747195858498707, "grad_norm": 10.2701416015625, "learning_rate": 4.0368271954674225e-05, "loss": 10.1141, "step": 310 }, { "epoch": 0.27610008628127697, "grad_norm": 6.783595085144043, "learning_rate": 3.9896128423040606e-05, "loss": 9.1178, "step": 320 }, { "epoch": 0.28472821397756687, "grad_norm": 12.468270301818848, "learning_rate": 3.942398489140699e-05, "loss": 7.8647, "step": 330 }, { "epoch": 0.29335634167385677, "grad_norm": 8.456822395324707, "learning_rate": 3.895184135977337e-05, "loss": 8.4621, "step": 340 }, { "epoch": 0.30198446937014667, "grad_norm": 11.295036315917969, "learning_rate": 3.847969782813976e-05, "loss": 9.196, "step": 350 }, { "epoch": 0.31061259706643657, "grad_norm": 14.213314056396484, "learning_rate": 3.800755429650614e-05, "loss": 8.1778, "step": 360 }, { "epoch": 0.31924072476272647, "grad_norm": 14.220091819763184, "learning_rate": 3.753541076487253e-05, "loss": 7.8913, "step": 370 }, { "epoch": 0.32786885245901637, "grad_norm": 13.04310131072998, "learning_rate": 3.706326723323891e-05, "loss": 7.7094, "step": 380 }, { "epoch": 0.3364969801553063, "grad_norm": 10.724278450012207, "learning_rate": 3.659112370160529e-05, "loss": 9.4355, "step": 390 }, { "epoch": 0.3451251078515962, "grad_norm": 23.478923797607422, "learning_rate": 3.611898016997167e-05, "loss": 7.8075, "step": 400 }, { "epoch": 0.3537532355478861, "grad_norm": 13.352030754089355, "learning_rate": 3.564683663833805e-05, "loss": 7.3814, "step": 410 }, { "epoch": 0.362381363244176, "grad_norm": 9.937482833862305, "learning_rate": 3.5174693106704435e-05, "loss": 
8.9733, "step": 420 }, { "epoch": 0.3710094909404659, "grad_norm": 11.626459121704102, "learning_rate": 3.470254957507082e-05, "loss": 10.1639, "step": 430 }, { "epoch": 0.3796376186367558, "grad_norm": 10.614298820495605, "learning_rate": 3.4230406043437205e-05, "loss": 7.7592, "step": 440 }, { "epoch": 0.3882657463330457, "grad_norm": 4.33737850189209, "learning_rate": 3.375826251180359e-05, "loss": 9.8424, "step": 450 }, { "epoch": 0.3968938740293356, "grad_norm": 13.16214370727539, "learning_rate": 3.3286118980169974e-05, "loss": 6.1196, "step": 460 }, { "epoch": 0.4055220017256255, "grad_norm": 12.274761199951172, "learning_rate": 3.2813975448536356e-05, "loss": 7.7816, "step": 470 }, { "epoch": 0.4141501294219154, "grad_norm": 9.882450103759766, "learning_rate": 3.234183191690274e-05, "loss": 8.8377, "step": 480 }, { "epoch": 0.4227782571182053, "grad_norm": 7.4602766036987305, "learning_rate": 3.1869688385269126e-05, "loss": 7.8737, "step": 490 }, { "epoch": 0.4314063848144953, "grad_norm": 11.084466934204102, "learning_rate": 3.139754485363551e-05, "loss": 7.4251, "step": 500 }, { "epoch": 0.4400345125107852, "grad_norm": 14.730960845947266, "learning_rate": 3.092540132200189e-05, "loss": 5.4958, "step": 510 }, { "epoch": 0.4486626402070751, "grad_norm": 12.292424201965332, "learning_rate": 3.0453257790368274e-05, "loss": 7.7441, "step": 520 }, { "epoch": 0.457290767903365, "grad_norm": 14.31530475616455, "learning_rate": 2.9981114258734655e-05, "loss": 9.7954, "step": 530 }, { "epoch": 0.4659188955996549, "grad_norm": 56.466800689697266, "learning_rate": 2.9508970727101037e-05, "loss": 7.182, "step": 540 }, { "epoch": 0.4745470232959448, "grad_norm": 10.024373054504395, "learning_rate": 2.9036827195467425e-05, "loss": 9.0355, "step": 550 }, { "epoch": 0.4831751509922347, "grad_norm": 6.825662612915039, "learning_rate": 2.8564683663833807e-05, "loss": 9.4844, "step": 560 }, { "epoch": 0.4918032786885246, "grad_norm": 7.98593807220459, "learning_rate": 
2.809254013220019e-05, "loss": 7.0413, "step": 570 }, { "epoch": 0.5004314063848145, "grad_norm": 9.947932243347168, "learning_rate": 2.7620396600566573e-05, "loss": 7.2181, "step": 580 }, { "epoch": 0.5090595340811044, "grad_norm": 14.444029808044434, "learning_rate": 2.7148253068932954e-05, "loss": 8.3113, "step": 590 }, { "epoch": 0.5176876617773943, "grad_norm": 5.243804931640625, "learning_rate": 2.6676109537299343e-05, "loss": 9.8681, "step": 600 }, { "epoch": 0.5263157894736842, "grad_norm": 15.272309303283691, "learning_rate": 2.6203966005665724e-05, "loss": 6.4987, "step": 610 }, { "epoch": 0.5349439171699741, "grad_norm": 14.450007438659668, "learning_rate": 2.573182247403211e-05, "loss": 8.7722, "step": 620 }, { "epoch": 0.543572044866264, "grad_norm": 4.825082778930664, "learning_rate": 2.525967894239849e-05, "loss": 10.2199, "step": 630 }, { "epoch": 0.5522001725625539, "grad_norm": 4.846455097198486, "learning_rate": 2.4787535410764872e-05, "loss": 6.8151, "step": 640 }, { "epoch": 0.5608283002588438, "grad_norm": 18.088956832885742, "learning_rate": 2.4315391879131257e-05, "loss": 6.9832, "step": 650 }, { "epoch": 0.5694564279551337, "grad_norm": 3.5512518882751465, "learning_rate": 2.3843248347497642e-05, "loss": 9.6627, "step": 660 }, { "epoch": 0.5780845556514237, "grad_norm": 8.27271842956543, "learning_rate": 2.3371104815864024e-05, "loss": 8.0493, "step": 670 }, { "epoch": 0.5867126833477135, "grad_norm": 7.861291408538818, "learning_rate": 2.289896128423041e-05, "loss": 6.2845, "step": 680 }, { "epoch": 0.5953408110440035, "grad_norm": 11.14802074432373, "learning_rate": 2.242681775259679e-05, "loss": 8.7777, "step": 690 }, { "epoch": 0.6039689387402933, "grad_norm": 8.262513160705566, "learning_rate": 2.195467422096317e-05, "loss": 8.8687, "step": 700 }, { "epoch": 0.6125970664365833, "grad_norm": 18.18682861328125, "learning_rate": 2.1482530689329556e-05, "loss": 9.9046, "step": 710 }, { "epoch": 0.6212251941328731, "grad_norm": 
6.271354675292969, "learning_rate": 2.101038715769594e-05, "loss": 7.5048, "step": 720 }, { "epoch": 0.6298533218291631, "grad_norm": 11.321942329406738, "learning_rate": 2.0538243626062323e-05, "loss": 9.4543, "step": 730 }, { "epoch": 0.6384814495254529, "grad_norm": 6.52807092666626, "learning_rate": 2.0066100094428708e-05, "loss": 7.942, "step": 740 }, { "epoch": 0.6471095772217429, "grad_norm": 9.259538650512695, "learning_rate": 1.959395656279509e-05, "loss": 7.9326, "step": 750 }, { "epoch": 0.6557377049180327, "grad_norm": 11.931748390197754, "learning_rate": 1.9121813031161474e-05, "loss": 10.2928, "step": 760 }, { "epoch": 0.6643658326143227, "grad_norm": 26.903627395629883, "learning_rate": 1.864966949952786e-05, "loss": 7.4203, "step": 770 }, { "epoch": 0.6729939603106126, "grad_norm": 4.38643741607666, "learning_rate": 1.817752596789424e-05, "loss": 8.5358, "step": 780 }, { "epoch": 0.6816220880069025, "grad_norm": 13.639317512512207, "learning_rate": 1.7705382436260622e-05, "loss": 9.0024, "step": 790 }, { "epoch": 0.6902502157031924, "grad_norm": 7.604945659637451, "learning_rate": 1.7233238904627007e-05, "loss": 8.7945, "step": 800 }, { "epoch": 0.6988783433994823, "grad_norm": 4.23636531829834, "learning_rate": 1.6761095372993392e-05, "loss": 9.6466, "step": 810 }, { "epoch": 0.7075064710957722, "grad_norm": 9.454889297485352, "learning_rate": 1.6288951841359773e-05, "loss": 7.9837, "step": 820 }, { "epoch": 0.7161345987920621, "grad_norm": 9.216086387634277, "learning_rate": 1.5816808309726158e-05, "loss": 7.5399, "step": 830 }, { "epoch": 0.724762726488352, "grad_norm": 9.077666282653809, "learning_rate": 1.534466477809254e-05, "loss": 8.0818, "step": 840 }, { "epoch": 0.7333908541846419, "grad_norm": 11.995235443115234, "learning_rate": 1.4872521246458923e-05, "loss": 7.4127, "step": 850 }, { "epoch": 0.7420189818809318, "grad_norm": 5.774163722991943, "learning_rate": 1.4400377714825308e-05, "loss": 7.7635, "step": 860 }, { "epoch": 
0.7506471095772217, "grad_norm": 3.043311357498169, "learning_rate": 1.3928234183191691e-05, "loss": 7.4845, "step": 870 }, { "epoch": 0.7592752372735116, "grad_norm": 9.676830291748047, "learning_rate": 1.3456090651558073e-05, "loss": 8.4687, "step": 880 }, { "epoch": 0.7679033649698016, "grad_norm": 20.002187728881836, "learning_rate": 1.2983947119924458e-05, "loss": 6.5787, "step": 890 }, { "epoch": 0.7765314926660914, "grad_norm": 7.795656681060791, "learning_rate": 1.251180358829084e-05, "loss": 5.2528, "step": 900 }, { "epoch": 0.7851596203623814, "grad_norm": 10.025925636291504, "learning_rate": 1.2039660056657224e-05, "loss": 6.2703, "step": 910 }, { "epoch": 0.7937877480586712, "grad_norm": 5.656552314758301, "learning_rate": 1.1567516525023609e-05, "loss": 6.6007, "step": 920 }, { "epoch": 0.8024158757549612, "grad_norm": 11.046673774719238, "learning_rate": 1.109537299338999e-05, "loss": 6.9978, "step": 930 }, { "epoch": 0.811044003451251, "grad_norm": 20.649337768554688, "learning_rate": 1.0623229461756375e-05, "loss": 8.2104, "step": 940 }, { "epoch": 0.819672131147541, "grad_norm": 12.507070541381836, "learning_rate": 1.0151085930122758e-05, "loss": 7.3538, "step": 950 }, { "epoch": 0.8283002588438308, "grad_norm": 8.762845039367676, "learning_rate": 9.678942398489142e-06, "loss": 6.6649, "step": 960 }, { "epoch": 0.8369283865401208, "grad_norm": 4.919765472412109, "learning_rate": 9.206798866855525e-06, "loss": 6.1039, "step": 970 }, { "epoch": 0.8455565142364107, "grad_norm": 13.5018310546875, "learning_rate": 8.734655335221908e-06, "loss": 7.5592, "step": 980 }, { "epoch": 0.8541846419327006, "grad_norm": 7.929318428039551, "learning_rate": 8.262511803588291e-06, "loss": 7.3745, "step": 990 }, { "epoch": 0.8628127696289906, "grad_norm": 2.267650604248047, "learning_rate": 7.790368271954675e-06, "loss": 6.4057, "step": 1000 }, { "epoch": 0.8714408973252804, "grad_norm": 7.356956958770752, "learning_rate": 7.3182247403210586e-06, "loss": 6.1371, 
"step": 1010 }, { "epoch": 0.8800690250215704, "grad_norm": 9.035082817077637, "learning_rate": 6.846081208687441e-06, "loss": 6.4763, "step": 1020 }, { "epoch": 0.8886971527178602, "grad_norm": 4.9309844970703125, "learning_rate": 6.373937677053825e-06, "loss": 7.2184, "step": 1030 }, { "epoch": 0.8973252804141502, "grad_norm": 13.067819595336914, "learning_rate": 5.901794145420208e-06, "loss": 9.0975, "step": 1040 }, { "epoch": 0.90595340811044, "grad_norm": 1.801363468170166, "learning_rate": 5.429650613786591e-06, "loss": 8.5242, "step": 1050 }, { "epoch": 0.91458153580673, "grad_norm": 21.638532638549805, "learning_rate": 4.957507082152975e-06, "loss": 6.6817, "step": 1060 }, { "epoch": 0.9232096635030198, "grad_norm": 11.125391006469727, "learning_rate": 4.485363550519358e-06, "loss": 7.3333, "step": 1070 }, { "epoch": 0.9318377911993098, "grad_norm": 20.023378372192383, "learning_rate": 4.013220018885742e-06, "loss": 7.5053, "step": 1080 }, { "epoch": 0.9404659188955996, "grad_norm": 6.69666051864624, "learning_rate": 3.541076487252125e-06, "loss": 7.1582, "step": 1090 }, { "epoch": 0.9490940465918896, "grad_norm": 8.274604797363281, "learning_rate": 3.0689329556185083e-06, "loss": 8.7796, "step": 1100 }, { "epoch": 0.9577221742881795, "grad_norm": 5.5356221199035645, "learning_rate": 2.5967894239848915e-06, "loss": 9.3139, "step": 1110 }, { "epoch": 0.9663503019844694, "grad_norm": 5.540348052978516, "learning_rate": 2.124645892351275e-06, "loss": 8.2046, "step": 1120 }, { "epoch": 0.9749784296807593, "grad_norm": 9.052054405212402, "learning_rate": 1.6525023607176583e-06, "loss": 7.8996, "step": 1130 }, { "epoch": 0.9836065573770492, "grad_norm": 2.191704034805298, "learning_rate": 1.1803588290840418e-06, "loss": 10.2246, "step": 1140 }, { "epoch": 0.9922346850733391, "grad_norm": 1.935456395149231, "learning_rate": 7.08215297450425e-07, "loss": 8.5122, "step": 1150 } ], "logging_steps": 10, "max_steps": 1159, "num_input_tokens_seen": 0, 
"num_train_epochs": 1, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }