| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 2000, | |
| "global_step": 1159, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0, | |
| "eval_loss": null, | |
| "eval_runtime": 130.0879, | |
| "eval_samples_per_second": 43.863, | |
| "eval_steps_per_second": 5.489, | |
| "step": 0 | |
| }, | |
| { | |
| "epoch": 0.008628127696289905, | |
| "grad_norm": 208.8164520263672, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 44.1582, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01725625539257981, | |
| "grad_norm": 95.85818481445312, | |
| "learning_rate": 7.000000000000001e-06, | |
| "loss": 20.8334, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.025884383088869714, | |
| "grad_norm": 30.946754455566406, | |
| "learning_rate": 1.2e-05, | |
| "loss": 6.8289, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03451251078515962, | |
| "grad_norm": 15.001774787902832, | |
| "learning_rate": 1.7000000000000003e-05, | |
| "loss": 6.1975, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04314063848144953, | |
| "grad_norm": 36.383766174316406, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 9.5655, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05176876617773943, | |
| "grad_norm": 21.175804138183594, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 8.0912, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.060396893874029335, | |
| "grad_norm": 18.93609046936035, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 10.4302, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.06902502157031924, | |
| "grad_norm": 15.232029914855957, | |
| "learning_rate": 3.7e-05, | |
| "loss": 9.1437, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.07765314926660914, | |
| "grad_norm": 61.93816375732422, | |
| "learning_rate": 4.2e-05, | |
| "loss": 7.0303, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08628127696289906, | |
| "grad_norm": 13.675076484680176, | |
| "learning_rate": 4.7e-05, | |
| "loss": 6.6254, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09490940465918896, | |
| "grad_norm": 15.733867645263672, | |
| "learning_rate": 4.9811142587346554e-05, | |
| "loss": 11.0107, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.10353753235547886, | |
| "grad_norm": 23.753643035888672, | |
| "learning_rate": 4.9338999055712935e-05, | |
| "loss": 8.7021, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.11216566005176877, | |
| "grad_norm": 17.372941970825195, | |
| "learning_rate": 4.8866855524079323e-05, | |
| "loss": 7.971, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.12079378774805867, | |
| "grad_norm": 16.303627014160156, | |
| "learning_rate": 4.8394711992445705e-05, | |
| "loss": 7.7894, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.12942191544434858, | |
| "grad_norm": 22.732084274291992, | |
| "learning_rate": 4.7922568460812086e-05, | |
| "loss": 8.4125, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.13805004314063848, | |
| "grad_norm": 7.587573528289795, | |
| "learning_rate": 4.7450424929178475e-05, | |
| "loss": 8.7211, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.14667817083692838, | |
| "grad_norm": 11.529291152954102, | |
| "learning_rate": 4.6978281397544856e-05, | |
| "loss": 7.4511, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.15530629853321828, | |
| "grad_norm": 18.30257225036621, | |
| "learning_rate": 4.650613786591124e-05, | |
| "loss": 8.3549, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.16393442622950818, | |
| "grad_norm": 35.710201263427734, | |
| "learning_rate": 4.6033994334277626e-05, | |
| "loss": 7.7639, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.1725625539257981, | |
| "grad_norm": 17.525699615478516, | |
| "learning_rate": 4.556185080264401e-05, | |
| "loss": 8.7268, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.181190681622088, | |
| "grad_norm": 12.543689727783203, | |
| "learning_rate": 4.508970727101039e-05, | |
| "loss": 6.5113, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.1898188093183779, | |
| "grad_norm": 11.80916976928711, | |
| "learning_rate": 4.461756373937677e-05, | |
| "loss": 7.9873, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1984469370146678, | |
| "grad_norm": 5.335144996643066, | |
| "learning_rate": 4.414542020774315e-05, | |
| "loss": 7.3828, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2070750647109577, | |
| "grad_norm": 14.807497024536133, | |
| "learning_rate": 4.3673276676109534e-05, | |
| "loss": 7.9503, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.21570319240724764, | |
| "grad_norm": 7.473222255706787, | |
| "learning_rate": 4.320113314447592e-05, | |
| "loss": 9.4133, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.22433132010353754, | |
| "grad_norm": 16.84607696533203, | |
| "learning_rate": 4.272898961284231e-05, | |
| "loss": 11.3476, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.23295944779982744, | |
| "grad_norm": 14.314810752868652, | |
| "learning_rate": 4.225684608120869e-05, | |
| "loss": 8.3349, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.24158757549611734, | |
| "grad_norm": 7.726313591003418, | |
| "learning_rate": 4.178470254957507e-05, | |
| "loss": 9.0962, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.25021570319240727, | |
| "grad_norm": 5.218513488769531, | |
| "learning_rate": 4.1312559017941455e-05, | |
| "loss": 9.2658, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.25884383088869717, | |
| "grad_norm": 8.278141021728516, | |
| "learning_rate": 4.0840415486307836e-05, | |
| "loss": 12.7095, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.26747195858498707, | |
| "grad_norm": 10.2701416015625, | |
| "learning_rate": 4.0368271954674225e-05, | |
| "loss": 10.1141, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.27610008628127697, | |
| "grad_norm": 6.783595085144043, | |
| "learning_rate": 3.9896128423040606e-05, | |
| "loss": 9.1178, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.28472821397756687, | |
| "grad_norm": 12.468270301818848, | |
| "learning_rate": 3.942398489140699e-05, | |
| "loss": 7.8647, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.29335634167385677, | |
| "grad_norm": 8.456822395324707, | |
| "learning_rate": 3.895184135977337e-05, | |
| "loss": 8.4621, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.30198446937014667, | |
| "grad_norm": 11.295036315917969, | |
| "learning_rate": 3.847969782813976e-05, | |
| "loss": 9.196, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.31061259706643657, | |
| "grad_norm": 14.213314056396484, | |
| "learning_rate": 3.800755429650614e-05, | |
| "loss": 8.1778, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.31924072476272647, | |
| "grad_norm": 14.220091819763184, | |
| "learning_rate": 3.753541076487253e-05, | |
| "loss": 7.8913, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.32786885245901637, | |
| "grad_norm": 13.04310131072998, | |
| "learning_rate": 3.706326723323891e-05, | |
| "loss": 7.7094, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.3364969801553063, | |
| "grad_norm": 10.724278450012207, | |
| "learning_rate": 3.659112370160529e-05, | |
| "loss": 9.4355, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.3451251078515962, | |
| "grad_norm": 23.478923797607422, | |
| "learning_rate": 3.611898016997167e-05, | |
| "loss": 7.8075, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.3537532355478861, | |
| "grad_norm": 13.352030754089355, | |
| "learning_rate": 3.564683663833805e-05, | |
| "loss": 7.3814, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.362381363244176, | |
| "grad_norm": 9.937482833862305, | |
| "learning_rate": 3.5174693106704435e-05, | |
| "loss": 8.9733, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.3710094909404659, | |
| "grad_norm": 11.626459121704102, | |
| "learning_rate": 3.470254957507082e-05, | |
| "loss": 10.1639, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.3796376186367558, | |
| "grad_norm": 10.614298820495605, | |
| "learning_rate": 3.4230406043437205e-05, | |
| "loss": 7.7592, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.3882657463330457, | |
| "grad_norm": 4.33737850189209, | |
| "learning_rate": 3.375826251180359e-05, | |
| "loss": 9.8424, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.3968938740293356, | |
| "grad_norm": 13.16214370727539, | |
| "learning_rate": 3.3286118980169974e-05, | |
| "loss": 6.1196, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4055220017256255, | |
| "grad_norm": 12.274761199951172, | |
| "learning_rate": 3.2813975448536356e-05, | |
| "loss": 7.7816, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.4141501294219154, | |
| "grad_norm": 9.882450103759766, | |
| "learning_rate": 3.234183191690274e-05, | |
| "loss": 8.8377, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.4227782571182053, | |
| "grad_norm": 7.4602766036987305, | |
| "learning_rate": 3.1869688385269126e-05, | |
| "loss": 7.8737, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.4314063848144953, | |
| "grad_norm": 11.084466934204102, | |
| "learning_rate": 3.139754485363551e-05, | |
| "loss": 7.4251, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4400345125107852, | |
| "grad_norm": 14.730960845947266, | |
| "learning_rate": 3.092540132200189e-05, | |
| "loss": 5.4958, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.4486626402070751, | |
| "grad_norm": 12.292424201965332, | |
| "learning_rate": 3.0453257790368274e-05, | |
| "loss": 7.7441, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.457290767903365, | |
| "grad_norm": 14.31530475616455, | |
| "learning_rate": 2.9981114258734655e-05, | |
| "loss": 9.7954, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.4659188955996549, | |
| "grad_norm": 56.466800689697266, | |
| "learning_rate": 2.9508970727101037e-05, | |
| "loss": 7.182, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.4745470232959448, | |
| "grad_norm": 10.024373054504395, | |
| "learning_rate": 2.9036827195467425e-05, | |
| "loss": 9.0355, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.4831751509922347, | |
| "grad_norm": 6.825662612915039, | |
| "learning_rate": 2.8564683663833807e-05, | |
| "loss": 9.4844, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.4918032786885246, | |
| "grad_norm": 7.98593807220459, | |
| "learning_rate": 2.809254013220019e-05, | |
| "loss": 7.0413, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.5004314063848145, | |
| "grad_norm": 9.947932243347168, | |
| "learning_rate": 2.7620396600566573e-05, | |
| "loss": 7.2181, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.5090595340811044, | |
| "grad_norm": 14.444029808044434, | |
| "learning_rate": 2.7148253068932954e-05, | |
| "loss": 8.3113, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.5176876617773943, | |
| "grad_norm": 5.243804931640625, | |
| "learning_rate": 2.6676109537299343e-05, | |
| "loss": 9.8681, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 15.272309303283691, | |
| "learning_rate": 2.6203966005665724e-05, | |
| "loss": 6.4987, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.5349439171699741, | |
| "grad_norm": 14.450007438659668, | |
| "learning_rate": 2.573182247403211e-05, | |
| "loss": 8.7722, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.543572044866264, | |
| "grad_norm": 4.825082778930664, | |
| "learning_rate": 2.525967894239849e-05, | |
| "loss": 10.2199, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.5522001725625539, | |
| "grad_norm": 4.846455097198486, | |
| "learning_rate": 2.4787535410764872e-05, | |
| "loss": 6.8151, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.5608283002588438, | |
| "grad_norm": 18.088956832885742, | |
| "learning_rate": 2.4315391879131257e-05, | |
| "loss": 6.9832, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.5694564279551337, | |
| "grad_norm": 3.5512518882751465, | |
| "learning_rate": 2.3843248347497642e-05, | |
| "loss": 9.6627, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.5780845556514237, | |
| "grad_norm": 8.27271842956543, | |
| "learning_rate": 2.3371104815864024e-05, | |
| "loss": 8.0493, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.5867126833477135, | |
| "grad_norm": 7.861291408538818, | |
| "learning_rate": 2.289896128423041e-05, | |
| "loss": 6.2845, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.5953408110440035, | |
| "grad_norm": 11.14802074432373, | |
| "learning_rate": 2.242681775259679e-05, | |
| "loss": 8.7777, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.6039689387402933, | |
| "grad_norm": 8.262513160705566, | |
| "learning_rate": 2.195467422096317e-05, | |
| "loss": 8.8687, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6125970664365833, | |
| "grad_norm": 18.18682861328125, | |
| "learning_rate": 2.1482530689329556e-05, | |
| "loss": 9.9046, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.6212251941328731, | |
| "grad_norm": 6.271354675292969, | |
| "learning_rate": 2.101038715769594e-05, | |
| "loss": 7.5048, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.6298533218291631, | |
| "grad_norm": 11.321942329406738, | |
| "learning_rate": 2.0538243626062323e-05, | |
| "loss": 9.4543, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.6384814495254529, | |
| "grad_norm": 6.52807092666626, | |
| "learning_rate": 2.0066100094428708e-05, | |
| "loss": 7.942, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.6471095772217429, | |
| "grad_norm": 9.259538650512695, | |
| "learning_rate": 1.959395656279509e-05, | |
| "loss": 7.9326, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.6557377049180327, | |
| "grad_norm": 11.931748390197754, | |
| "learning_rate": 1.9121813031161474e-05, | |
| "loss": 10.2928, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.6643658326143227, | |
| "grad_norm": 26.903627395629883, | |
| "learning_rate": 1.864966949952786e-05, | |
| "loss": 7.4203, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.6729939603106126, | |
| "grad_norm": 4.38643741607666, | |
| "learning_rate": 1.817752596789424e-05, | |
| "loss": 8.5358, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.6816220880069025, | |
| "grad_norm": 13.639317512512207, | |
| "learning_rate": 1.7705382436260622e-05, | |
| "loss": 9.0024, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.6902502157031924, | |
| "grad_norm": 7.604945659637451, | |
| "learning_rate": 1.7233238904627007e-05, | |
| "loss": 8.7945, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.6988783433994823, | |
| "grad_norm": 4.23636531829834, | |
| "learning_rate": 1.6761095372993392e-05, | |
| "loss": 9.6466, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.7075064710957722, | |
| "grad_norm": 9.454889297485352, | |
| "learning_rate": 1.6288951841359773e-05, | |
| "loss": 7.9837, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.7161345987920621, | |
| "grad_norm": 9.216086387634277, | |
| "learning_rate": 1.5816808309726158e-05, | |
| "loss": 7.5399, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.724762726488352, | |
| "grad_norm": 9.077666282653809, | |
| "learning_rate": 1.534466477809254e-05, | |
| "loss": 8.0818, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.7333908541846419, | |
| "grad_norm": 11.995235443115234, | |
| "learning_rate": 1.4872521246458923e-05, | |
| "loss": 7.4127, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.7420189818809318, | |
| "grad_norm": 5.774163722991943, | |
| "learning_rate": 1.4400377714825308e-05, | |
| "loss": 7.7635, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.7506471095772217, | |
| "grad_norm": 3.043311357498169, | |
| "learning_rate": 1.3928234183191691e-05, | |
| "loss": 7.4845, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.7592752372735116, | |
| "grad_norm": 9.676830291748047, | |
| "learning_rate": 1.3456090651558073e-05, | |
| "loss": 8.4687, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.7679033649698016, | |
| "grad_norm": 20.002187728881836, | |
| "learning_rate": 1.2983947119924458e-05, | |
| "loss": 6.5787, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.7765314926660914, | |
| "grad_norm": 7.795656681060791, | |
| "learning_rate": 1.251180358829084e-05, | |
| "loss": 5.2528, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7851596203623814, | |
| "grad_norm": 10.025925636291504, | |
| "learning_rate": 1.2039660056657224e-05, | |
| "loss": 6.2703, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.7937877480586712, | |
| "grad_norm": 5.656552314758301, | |
| "learning_rate": 1.1567516525023609e-05, | |
| "loss": 6.6007, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.8024158757549612, | |
| "grad_norm": 11.046673774719238, | |
| "learning_rate": 1.109537299338999e-05, | |
| "loss": 6.9978, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.811044003451251, | |
| "grad_norm": 20.649337768554688, | |
| "learning_rate": 1.0623229461756375e-05, | |
| "loss": 8.2104, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.819672131147541, | |
| "grad_norm": 12.507070541381836, | |
| "learning_rate": 1.0151085930122758e-05, | |
| "loss": 7.3538, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.8283002588438308, | |
| "grad_norm": 8.762845039367676, | |
| "learning_rate": 9.678942398489142e-06, | |
| "loss": 6.6649, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.8369283865401208, | |
| "grad_norm": 4.919765472412109, | |
| "learning_rate": 9.206798866855525e-06, | |
| "loss": 6.1039, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.8455565142364107, | |
| "grad_norm": 13.5018310546875, | |
| "learning_rate": 8.734655335221908e-06, | |
| "loss": 7.5592, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.8541846419327006, | |
| "grad_norm": 7.929318428039551, | |
| "learning_rate": 8.262511803588291e-06, | |
| "loss": 7.3745, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.8628127696289906, | |
| "grad_norm": 2.267650604248047, | |
| "learning_rate": 7.790368271954675e-06, | |
| "loss": 6.4057, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8714408973252804, | |
| "grad_norm": 7.356956958770752, | |
| "learning_rate": 7.3182247403210586e-06, | |
| "loss": 6.1371, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.8800690250215704, | |
| "grad_norm": 9.035082817077637, | |
| "learning_rate": 6.846081208687441e-06, | |
| "loss": 6.4763, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.8886971527178602, | |
| "grad_norm": 4.9309844970703125, | |
| "learning_rate": 6.373937677053825e-06, | |
| "loss": 7.2184, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.8973252804141502, | |
| "grad_norm": 13.067819595336914, | |
| "learning_rate": 5.901794145420208e-06, | |
| "loss": 9.0975, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.90595340811044, | |
| "grad_norm": 1.801363468170166, | |
| "learning_rate": 5.429650613786591e-06, | |
| "loss": 8.5242, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.91458153580673, | |
| "grad_norm": 21.638532638549805, | |
| "learning_rate": 4.957507082152975e-06, | |
| "loss": 6.6817, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.9232096635030198, | |
| "grad_norm": 11.125391006469727, | |
| "learning_rate": 4.485363550519358e-06, | |
| "loss": 7.3333, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.9318377911993098, | |
| "grad_norm": 20.023378372192383, | |
| "learning_rate": 4.013220018885742e-06, | |
| "loss": 7.5053, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.9404659188955996, | |
| "grad_norm": 6.69666051864624, | |
| "learning_rate": 3.541076487252125e-06, | |
| "loss": 7.1582, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.9490940465918896, | |
| "grad_norm": 8.274604797363281, | |
| "learning_rate": 3.0689329556185083e-06, | |
| "loss": 8.7796, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9577221742881795, | |
| "grad_norm": 5.5356221199035645, | |
| "learning_rate": 2.5967894239848915e-06, | |
| "loss": 9.3139, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.9663503019844694, | |
| "grad_norm": 5.540348052978516, | |
| "learning_rate": 2.124645892351275e-06, | |
| "loss": 8.2046, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.9749784296807593, | |
| "grad_norm": 9.052054405212402, | |
| "learning_rate": 1.6525023607176583e-06, | |
| "loss": 7.8996, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.9836065573770492, | |
| "grad_norm": 2.191704034805298, | |
| "learning_rate": 1.1803588290840418e-06, | |
| "loss": 10.2246, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.9922346850733391, | |
| "grad_norm": 1.935456395149231, | |
| "learning_rate": 7.08215297450425e-07, | |
| "loss": 8.5122, | |
| "step": 1150 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1159, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 4000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |