{ "best_global_step": 1, "best_metric": 1.4945952892303467, "best_model_checkpoint": null, "epoch": 0.06246096189881324, "eval_steps": 50, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.246096189881324e-05, "eval_loss": 1.4945952892303467, "eval_runtime": 43.2738, "eval_samples_per_second": 19.481, "eval_steps_per_second": 19.481, "step": 1 }, { "epoch": 0.0006246096189881324, "grad_norm": 116.0, "learning_rate": 0.045000000000000005, "loss": 44.6564, "step": 10 }, { "epoch": 0.0012492192379762648, "grad_norm": 81.5, "learning_rate": 0.04998980482070473, "loss": 266.9058, "step": 20 }, { "epoch": 0.0018738288569643974, "grad_norm": 126.0, "learning_rate": 0.049954572901111285, "loss": 349.7276, "step": 30 }, { "epoch": 0.0024984384759525295, "grad_norm": 47.25, "learning_rate": 0.04989421384191499, "loss": 355.1523, "step": 40 }, { "epoch": 0.003123048094940662, "grad_norm": 21.625, "learning_rate": 0.04980878841957203, "loss": 271.2299, "step": 50 }, { "epoch": 0.003123048094940662, "eval_loss": 138.12083435058594, "eval_runtime": 43.2248, "eval_samples_per_second": 19.503, "eval_steps_per_second": 19.503, "step": 50 }, { "epoch": 0.0037476577139287947, "grad_norm": 110.5, "learning_rate": 0.049698382650241506, "loss": 119.4599, "step": 60 }, { "epoch": 0.004372267332916927, "grad_norm": 876.0, "learning_rate": 0.04956310770317444, "loss": 79.5745, "step": 70 }, { "epoch": 0.004996876951905059, "grad_norm": 12.1875, "learning_rate": 0.04940309978877575, "loss": 43.8974, "step": 80 }, { "epoch": 0.005621486570893191, "grad_norm": 144.0, "learning_rate": 0.04921852002145197, "loss": 24.9981, "step": 90 }, { "epoch": 0.006246096189881324, "grad_norm": 65.0, "learning_rate": 0.04900955425738262, "loss": 22.4848, "step": 100 }, { "epoch": 0.006246096189881324, "eval_loss": 19.088651657104492, "eval_runtime": 54.7485, "eval_samples_per_second": 15.398, "eval_steps_per_second": 15.398, "step": 100 }, { "epoch": 0.006870705808869456, "grad_norm": 99.0, "learning_rate": 0.048776412907378844, "loss": 16.2813, "step": 110 }, { "epoch": 0.007495315427857589, "grad_norm": 15.0, "learning_rate": 0.04851933072501756, "loss": 14.2515, "step": 120 }, { "epoch": 0.008119925046845722, "grad_norm": 30.625, "learning_rate": 0.048238566570264485, "loss": 11.9391, "step": 130 }, { "epoch": 0.008744534665833853, "grad_norm": 24.125, "learning_rate": 0.047934403148824085, "loss": 11.4894, "step": 140 }, { "epoch": 0.009369144284821987, "grad_norm": 20.75, "learning_rate": 0.047607146727478934, "loss": 11.6593, "step": 150 }, { "epoch": 0.009369144284821987, "eval_loss": 10.226225852966309, "eval_runtime": 55.6759, "eval_samples_per_second": 15.141, "eval_steps_per_second": 15.141, "step": 150 }, { "epoch": 0.009993753903810118, "grad_norm": 154.0, "learning_rate": 0.04725712682570498, "loss": 13.3082, "step": 160 }, { "epoch": 0.010618363522798251, "grad_norm": 1224.0, "learning_rate": 0.046884695883873395, "loss": 13.503, "step": 170 }, { "epoch": 0.011242973141786383, "grad_norm": 122.0, "learning_rate": 0.04649022890837298, "loss": 13.4923, "step": 180 }, { "epoch": 0.011867582760774516, "grad_norm": 103.0, "learning_rate": 0.046074123094010544, "loss": 10.8538, "step": 190 }, { "epoch": 0.012492192379762648, "grad_norm": 39.75, "learning_rate": 0.04563679742406935, "loss": 13.9073, "step": 200 }, { "epoch": 0.012492192379762648, "eval_loss": 12.213621139526367, "eval_runtime": 52.8292, "eval_samples_per_second": 15.957, "eval_steps_per_second": 15.957, "step": 200 }, { "epoch": 0.01311680199875078, "grad_norm": 230.0, "learning_rate": 0.045178692248428534, "loss": 11.6545, "step": 210 }, { "epoch": 0.013741411617738912, "grad_norm": 50.5, "learning_rate": 0.04470026884016805, "loss": 9.9724, "step": 220 }, { "epoch": 0.014366021236727046, "grad_norm": 89.0, "learning_rate": 0.0442020089311058, "loss": 10.161, "step": 230 }, { "epoch": 0.014990630855715179, "grad_norm": 1152.0, "learning_rate": 0.043684414226734525, "loss": 9.1416, "step": 240 }, { "epoch": 0.01561524047470331, "grad_norm": 233.0, "learning_rate": 0.04314800590104691, "loss": 8.8728, "step": 250 }, { "epoch": 0.01561524047470331, "eval_loss": 8.775226593017578, "eval_runtime": 56.6712, "eval_samples_per_second": 14.875, "eval_steps_per_second": 14.875, "step": 250 }, { "epoch": 0.016239850093691444, "grad_norm": 2976.0, "learning_rate": 0.04259332407175751, "loss": 8.7117, "step": 260 }, { "epoch": 0.016864459712679577, "grad_norm": 604.0, "learning_rate": 0.04202092725645009, "loss": 10.3623, "step": 270 }, { "epoch": 0.017489069331667707, "grad_norm": 2688.0, "learning_rate": 0.04143139181019764, "loss": 10.8841, "step": 280 }, { "epoch": 0.01811367895065584, "grad_norm": 141.0, "learning_rate": 0.040825311345221764, "loss": 9.8937, "step": 290 }, { "epoch": 0.018738288569643973, "grad_norm": 252.0, "learning_rate": 0.04020329613317545, "loss": 12.0118, "step": 300 }, { "epoch": 0.018738288569643973, "eval_loss": 9.644805908203125, "eval_runtime": 56.3116, "eval_samples_per_second": 14.97, "eval_steps_per_second": 14.97, "step": 300 }, { "epoch": 0.019362898188632106, "grad_norm": 42.75, "learning_rate": 0.03956597249065126, "loss": 9.9067, "step": 310 }, { "epoch": 0.019987507807620236, "grad_norm": 200.0, "learning_rate": 0.0389139821485336, "loss": 7.9326, "step": 320 }, { "epoch": 0.02061211742660837, "grad_norm": 458.0, "learning_rate": 0.03824798160583012, "loss": 12.0065, "step": 330 }, { "epoch": 0.021236727045596503, "grad_norm": 684.0, "learning_rate": 0.037568641468632896, "loss": 15.9906, "step": 340 }, { "epoch": 0.021861336664584636, "grad_norm": 2160.0, "learning_rate": 0.03687664577487488, "loss": 15.0601, "step": 350 }, { "epoch": 0.021861336664584636, "eval_loss": 12.592202186584473, "eval_runtime": 52.3897, "eval_samples_per_second": 16.091, "eval_steps_per_second": 16.091, "step": 350 }, { "epoch": 0.022485946283572766, "grad_norm": 8384.0, "learning_rate": 0.03617269130556171, "loss": 13.996, "step": 360 }, { "epoch": 0.0231105559025609, "grad_norm": 438.0, "learning_rate": 0.035457486883172316, "loss": 10.1482, "step": 370 }, { "epoch": 0.023735165521549032, "grad_norm": 6464.0, "learning_rate": 0.03473175265793479, "loss": 9.1758, "step": 380 }, { "epoch": 0.024359775140537165, "grad_norm": 596.0, "learning_rate": 0.033996219382696063, "loss": 8.7717, "step": 390 }, { "epoch": 0.024984384759525295, "grad_norm": 44288.0, "learning_rate": 0.033251627677115835, "loss": 10.1332, "step": 400 }, { "epoch": 0.024984384759525295, "eval_loss": 9.592761993408203, "eval_runtime": 56.6259, "eval_samples_per_second": 14.887, "eval_steps_per_second": 14.887, "step": 400 }, { "epoch": 0.02560899437851343, "grad_norm": 2224.0, "learning_rate": 0.032498727281925266, "loss": 9.5517, "step": 410 }, { "epoch": 0.02623360399750156, "grad_norm": 4192.0, "learning_rate": 0.0317382763040017, "loss": 8.8907, "step": 420 }, { "epoch": 0.026858213616489695, "grad_norm": 4640.0, "learning_rate": 0.030971040453019225, "loss": 8.6126, "step": 430 }, { "epoch": 0.027482823235477825, "grad_norm": 960.0, "learning_rate": 0.03019779227044398, "loss": 9.0244, "step": 440 }, { "epoch": 0.028107432854465958, "grad_norm": 5760.0, "learning_rate": 0.029419310351650393, "loss": 8.9802, "step": 450 }, { "epoch": 0.028107432854465958, "eval_loss": 8.143258094787598, "eval_runtime": 56.465, "eval_samples_per_second": 14.93, "eval_steps_per_second": 14.93, "step": 450 }, { "epoch": 0.02873204247345409, "grad_norm": 442368.0, "learning_rate": 0.02863637856194159, "loss": 8.4952, "step": 460 }, { "epoch": 0.029356652092442224, "grad_norm": 272.0, "learning_rate": 0.027849785247263517, "loss": 7.9457, "step": 470 }, { "epoch": 0.029981261711430358, "grad_norm": 2240.0, "learning_rate": 0.02706032244040741, "loss": 7.8189, "step": 480 }, { "epoch": 0.030605871330418487, "grad_norm": 2448.0, "learning_rate": 0.026268785063499858, "loss": 8.0747, "step": 490 }, { "epoch": 0.03123048094940662, "grad_norm": 1896.0, "learning_rate": 0.025475970127583666, "loss": 14.4091, "step": 500 }, { "epoch": 0.03123048094940662, "eval_loss": 16.79290771484375, "eval_runtime": 52.5411, "eval_samples_per_second": 16.045, "eval_steps_per_second": 16.045, "step": 500 }, { "epoch": 0.03185509056839475, "grad_norm": 8192.0, "learning_rate": 0.024682675930095266, "loss": 15.3158, "step": 510 }, { "epoch": 0.03247970018738289, "grad_norm": 112.5, "learning_rate": 0.02388970125104685, "loss": 13.7666, "step": 520 }, { "epoch": 0.03310430980637102, "grad_norm": 4992.0, "learning_rate": 0.02309784454872262, "loss": 12.1874, "step": 530 }, { "epoch": 0.033728919425359154, "grad_norm": 278.0, "learning_rate": 0.022307903155699027, "loss": 9.8201, "step": 540 }, { "epoch": 0.034353529044347283, "grad_norm": 1480.0, "learning_rate": 0.02152067247599837, "loss": 10.4309, "step": 550 }, { "epoch": 0.034353529044347283, "eval_loss": 10.767745971679688, "eval_runtime": 53.7213, "eval_samples_per_second": 15.692, "eval_steps_per_second": 15.692, "step": 550 }, { "epoch": 0.03497813866333541, "grad_norm": 1072.0, "learning_rate": 0.020736945184184407, "loss": 9.5244, "step": 560 }, { "epoch": 0.03560274828232355, "grad_norm": 2352.0, "learning_rate": 0.019957510427206296, "loss": 8.832, "step": 570 }, { "epoch": 0.03622735790131168, "grad_norm": 540.0, "learning_rate": 0.01918315302979444, "loss": 10.0747, "step": 580 }, { "epoch": 0.03685196752029981, "grad_norm": 600.0, "learning_rate": 0.018414652704208584, "loss": 8.9236, "step": 590 }, { "epoch": 0.037476577139287946, "grad_norm": 644.0, "learning_rate": 0.017652783265133608, "loss": 8.524, "step": 600 }, { "epoch": 0.037476577139287946, "eval_loss": 8.281222343444824, "eval_runtime": 52.4549, "eval_samples_per_second": 16.071, "eval_steps_per_second": 16.071, "step": 600 }, { "epoch": 0.038101186758276076, "grad_norm": 56.75, "learning_rate": 0.01689831185051374, "loss": 8.3835, "step": 610 }, { "epoch": 0.03872579637726421, "grad_norm": 864.0, "learning_rate": 0.016151998149109708, "loss": 7.9404, "step": 620 }, { "epoch": 0.03935040599625234, "grad_norm": 458.0, "learning_rate": 0.015414593635556518, "loss": 8.7404, "step": 630 }, { "epoch": 0.03997501561524047, "grad_norm": 412.0, "learning_rate": 0.014686840813692224, "loss": 7.7151, "step": 640 }, { "epoch": 0.04059962523422861, "grad_norm": 95.5, "learning_rate": 0.013969472468919462, "loss": 7.7596, "step": 650 }, { "epoch": 0.04059962523422861, "eval_loss": 7.7680439949035645, "eval_runtime": 55.2903, "eval_samples_per_second": 15.247, "eval_steps_per_second": 15.247, "step": 650 }, { "epoch": 0.04122423485321674, "grad_norm": 876.0, "learning_rate": 0.013263210930352737, "loss": 8.5446, "step": 660 }, { "epoch": 0.04184884447220487, "grad_norm": 356.0, "learning_rate": 0.01256876734349413, "loss": 9.8441, "step": 670 }, { "epoch": 0.042473454091193005, "grad_norm": 21.0, "learning_rate": 0.011886840954170141, "loss": 8.588, "step": 680 }, { "epoch": 0.043098063710181135, "grad_norm": 158.0, "learning_rate": 0.011218118404450424, "loss": 7.9069, "step": 690 }, { "epoch": 0.04372267332916927, "grad_norm": 1848.0, "learning_rate": 0.010563273041257332, "loss": 8.0914, "step": 700 }, { "epoch": 0.04372267332916927, "eval_loss": 8.19365119934082, "eval_runtime": 67.1703, "eval_samples_per_second": 12.55, "eval_steps_per_second": 12.55, "step": 700 }, { "epoch": 0.0443472829481574, "grad_norm": 1304.0, "learning_rate": 0.009922964238362761, "loss": 8.057, "step": 710 }, { "epoch": 0.04497189256714553, "grad_norm": 187.0, "learning_rate": 0.009297836732454564, "loss": 7.3939, "step": 720 }, { "epoch": 0.04559650218613367, "grad_norm": 744.0, "learning_rate": 0.0086885199739414, "loss": 7.4594, "step": 730 }, { "epoch": 0.0462211118051218, "grad_norm": 35.0, "learning_rate": 0.00809562749314952, "loss": 7.2934, "step": 740 }, { "epoch": 0.046845721424109935, "grad_norm": 65024.0, "learning_rate": 0.0075197562825497334, "loss": 7.937, "step": 750 }, { "epoch": 0.046845721424109935, "eval_loss": 8.636336326599121, "eval_runtime": 52.4038, "eval_samples_per_second": 16.087, "eval_steps_per_second": 16.087, "step": 750 }, { "epoch": 0.047470331043098064, "grad_norm": 117.5, "learning_rate": 0.006961486195636613, "loss": 8.5237, "step": 760 }, { "epoch": 0.048094940662086194, "grad_norm": 1024.0, "learning_rate": 0.006421379363065142, "loss": 8.9334, "step": 770 }, { "epoch": 0.04871955028107433, "grad_norm": 1216.0, "learning_rate": 0.005899979626632835, "loss": 7.7087, "step": 780 }, { "epoch": 0.04934415990006246, "grad_norm": 1120.0, "learning_rate": 0.005397811991677107, "loss": 8.1393, "step": 790 }, { "epoch": 0.04996876951905059, "grad_norm": 374.0, "learning_rate": 0.0049153820984394365, "loss": 9.1765, "step": 800 }, { "epoch": 0.04996876951905059, "eval_loss": 9.214335441589355, "eval_runtime": 54.987, "eval_samples_per_second": 15.331, "eval_steps_per_second": 15.331, "step": 800 }, { "epoch": 0.05059337913803873, "grad_norm": 1072.0, "learning_rate": 0.004453175712928476, "loss": 8.6992, "step": 810 }, { "epoch": 0.05121798875702686, "grad_norm": 3584.0, "learning_rate": 0.004011658237794877, "loss": 7.6904, "step": 820 }, { "epoch": 0.051842598376014994, "grad_norm": 121.5, "learning_rate": 0.003591274243710277, "loss": 8.3538, "step": 830 }, { "epoch": 0.05246720799500312, "grad_norm": 524.0, "learning_rate": 0.0031924470217222834, "loss": 7.6898, "step": 840 }, { "epoch": 0.05309181761399125, "grad_norm": 516.0, "learning_rate": 0.002815578157036303, "loss": 8.4977, "step": 850 }, { "epoch": 0.05309181761399125, "eval_loss": 8.241488456726074, "eval_runtime": 55.3183, "eval_samples_per_second": 15.239, "eval_steps_per_second": 15.239, "step": 850 }, { "epoch": 0.05371642723297939, "grad_norm": 482.0, "learning_rate": 0.002461047124653279, "loss": 8.2881, "step": 860 }, { "epoch": 0.05434103685196752, "grad_norm": 1864.0, "learning_rate": 0.0021292109072704956, "loss": 7.7393, "step": 870 }, { "epoch": 0.05496564647095565, "grad_norm": 181.0, "learning_rate": 0.0018204036358303172, "loss": 7.7977, "step": 880 }, { "epoch": 0.055590256089943786, "grad_norm": 70.0, "learning_rate": 0.001534936253078606, "loss": 7.4679, "step": 890 }, { "epoch": 0.056214865708931916, "grad_norm": 756.0, "learning_rate": 0.0012730962004717683, "loss": 6.7936, "step": 900 }, { "epoch": 0.056214865708931916, "eval_loss": 7.608245372772217, "eval_runtime": 53.9447, "eval_samples_per_second": 15.627, "eval_steps_per_second": 15.627, "step": 900 }, { "epoch": 0.05683947532792005, "grad_norm": 63.5, "learning_rate": 0.0010351471287475406, "loss": 7.814, "step": 910 }, { "epoch": 0.05746408494690818, "grad_norm": 1088.0, "learning_rate": 0.0008213286324510738, "loss": 7.1516, "step": 920 }, { "epoch": 0.05808869456589631, "grad_norm": 268.0, "learning_rate": 0.000631856008683518, "loss": 7.2965, "step": 930 }, { "epoch": 0.05871330418488445, "grad_norm": 600.0, "learning_rate": 0.00046692004031609894, "loss": 7.5007, "step": 940 }, { "epoch": 0.05933791380387258, "grad_norm": 148.0, "learning_rate": 0.0003266868038879434, "loss": 7.4526, "step": 950 }, { "epoch": 0.05933791380387258, "eval_loss": 7.6293864250183105, "eval_runtime": 52.7226, "eval_samples_per_second": 15.989, "eval_steps_per_second": 15.989, "step": 950 }, { "epoch": 0.059962523422860715, "grad_norm": 40.5, "learning_rate": 0.00021129750238107204, "loss": 6.8338, "step": 960 }, { "epoch": 0.060587133041848845, "grad_norm": 268.0, "learning_rate": 0.00012086832304096795, "loss": 7.0505, "step": 970 }, { "epoch": 0.061211742660836975, "grad_norm": 76.5, "learning_rate": 5.54903203858731e-05, "loss": 7.9223, "step": 980 }, { "epoch": 0.06183635227982511, "grad_norm": 108.0, "learning_rate": 1.5229324522605948e-05, "loss": 7.8772, "step": 990 }, { "epoch": 0.06246096189881324, "grad_norm": 516.0, "learning_rate": 1.2587486122317416e-07, "loss": 7.4005, "step": 1000 }, { "epoch": 0.06246096189881324, "eval_loss": 7.635000705718994, "eval_runtime": 52.4898, "eval_samples_per_second": 16.06, "eval_steps_per_second": 16.06, "step": 1000 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 20, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 20 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.08462163968e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }