| { | |
| "best_global_step": 1, | |
| "best_metric": 1.4945952892303467, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.06246096189881324, | |
| "eval_steps": 50, | |
| "global_step": 1000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 6.246096189881324e-05, | |
| "eval_loss": 1.4945952892303467, | |
| "eval_runtime": 43.2738, | |
| "eval_samples_per_second": 19.481, | |
| "eval_steps_per_second": 19.481, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0006246096189881324, | |
| "grad_norm": 116.0, | |
| "learning_rate": 0.045000000000000005, | |
| "loss": 44.6564, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0012492192379762648, | |
| "grad_norm": 81.5, | |
| "learning_rate": 0.04998980482070473, | |
| "loss": 266.9058, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0018738288569643974, | |
| "grad_norm": 126.0, | |
| "learning_rate": 0.049954572901111285, | |
| "loss": 349.7276, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0024984384759525295, | |
| "grad_norm": 47.25, | |
| "learning_rate": 0.04989421384191499, | |
| "loss": 355.1523, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.003123048094940662, | |
| "grad_norm": 21.625, | |
| "learning_rate": 0.04980878841957203, | |
| "loss": 271.2299, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.003123048094940662, | |
| "eval_loss": 138.12083435058594, | |
| "eval_runtime": 43.2248, | |
| "eval_samples_per_second": 19.503, | |
| "eval_steps_per_second": 19.503, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0037476577139287947, | |
| "grad_norm": 110.5, | |
| "learning_rate": 0.049698382650241506, | |
| "loss": 119.4599, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.004372267332916927, | |
| "grad_norm": 876.0, | |
| "learning_rate": 0.04956310770317444, | |
| "loss": 79.5745, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.004996876951905059, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 0.04940309978877575, | |
| "loss": 43.8974, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.005621486570893191, | |
| "grad_norm": 144.0, | |
| "learning_rate": 0.04921852002145197, | |
| "loss": 24.9981, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.006246096189881324, | |
| "grad_norm": 65.0, | |
| "learning_rate": 0.04900955425738262, | |
| "loss": 22.4848, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.006246096189881324, | |
| "eval_loss": 19.088651657104492, | |
| "eval_runtime": 54.7485, | |
| "eval_samples_per_second": 15.398, | |
| "eval_steps_per_second": 15.398, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.006870705808869456, | |
| "grad_norm": 99.0, | |
| "learning_rate": 0.048776412907378844, | |
| "loss": 16.2813, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.007495315427857589, | |
| "grad_norm": 15.0, | |
| "learning_rate": 0.04851933072501756, | |
| "loss": 14.2515, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.008119925046845722, | |
| "grad_norm": 30.625, | |
| "learning_rate": 0.048238566570264485, | |
| "loss": 11.9391, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.008744534665833853, | |
| "grad_norm": 24.125, | |
| "learning_rate": 0.047934403148824085, | |
| "loss": 11.4894, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.009369144284821987, | |
| "grad_norm": 20.75, | |
| "learning_rate": 0.047607146727478934, | |
| "loss": 11.6593, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.009369144284821987, | |
| "eval_loss": 10.226225852966309, | |
| "eval_runtime": 55.6759, | |
| "eval_samples_per_second": 15.141, | |
| "eval_steps_per_second": 15.141, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.009993753903810118, | |
| "grad_norm": 154.0, | |
| "learning_rate": 0.04725712682570498, | |
| "loss": 13.3082, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.010618363522798251, | |
| "grad_norm": 1224.0, | |
| "learning_rate": 0.046884695883873395, | |
| "loss": 13.503, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.011242973141786383, | |
| "grad_norm": 122.0, | |
| "learning_rate": 0.04649022890837298, | |
| "loss": 13.4923, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.011867582760774516, | |
| "grad_norm": 103.0, | |
| "learning_rate": 0.046074123094010544, | |
| "loss": 10.8538, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.012492192379762648, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.04563679742406935, | |
| "loss": 13.9073, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.012492192379762648, | |
| "eval_loss": 12.213621139526367, | |
| "eval_runtime": 52.8292, | |
| "eval_samples_per_second": 15.957, | |
| "eval_steps_per_second": 15.957, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.01311680199875078, | |
| "grad_norm": 230.0, | |
| "learning_rate": 0.045178692248428534, | |
| "loss": 11.6545, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.013741411617738912, | |
| "grad_norm": 50.5, | |
| "learning_rate": 0.04470026884016805, | |
| "loss": 9.9724, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.014366021236727046, | |
| "grad_norm": 89.0, | |
| "learning_rate": 0.0442020089311058, | |
| "loss": 10.161, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.014990630855715179, | |
| "grad_norm": 1152.0, | |
| "learning_rate": 0.043684414226734525, | |
| "loss": 9.1416, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.01561524047470331, | |
| "grad_norm": 233.0, | |
| "learning_rate": 0.04314800590104691, | |
| "loss": 8.8728, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.01561524047470331, | |
| "eval_loss": 8.775226593017578, | |
| "eval_runtime": 56.6712, | |
| "eval_samples_per_second": 14.875, | |
| "eval_steps_per_second": 14.875, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.016239850093691444, | |
| "grad_norm": 2976.0, | |
| "learning_rate": 0.04259332407175751, | |
| "loss": 8.7117, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.016864459712679577, | |
| "grad_norm": 604.0, | |
| "learning_rate": 0.04202092725645009, | |
| "loss": 10.3623, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.017489069331667707, | |
| "grad_norm": 2688.0, | |
| "learning_rate": 0.04143139181019764, | |
| "loss": 10.8841, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.01811367895065584, | |
| "grad_norm": 141.0, | |
| "learning_rate": 0.040825311345221764, | |
| "loss": 9.8937, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.018738288569643973, | |
| "grad_norm": 252.0, | |
| "learning_rate": 0.04020329613317545, | |
| "loss": 12.0118, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.018738288569643973, | |
| "eval_loss": 9.644805908203125, | |
| "eval_runtime": 56.3116, | |
| "eval_samples_per_second": 14.97, | |
| "eval_steps_per_second": 14.97, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.019362898188632106, | |
| "grad_norm": 42.75, | |
| "learning_rate": 0.03956597249065126, | |
| "loss": 9.9067, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.019987507807620236, | |
| "grad_norm": 200.0, | |
| "learning_rate": 0.0389139821485336, | |
| "loss": 7.9326, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.02061211742660837, | |
| "grad_norm": 458.0, | |
| "learning_rate": 0.03824798160583012, | |
| "loss": 12.0065, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.021236727045596503, | |
| "grad_norm": 684.0, | |
| "learning_rate": 0.037568641468632896, | |
| "loss": 15.9906, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.021861336664584636, | |
| "grad_norm": 2160.0, | |
| "learning_rate": 0.03687664577487488, | |
| "loss": 15.0601, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.021861336664584636, | |
| "eval_loss": 12.592202186584473, | |
| "eval_runtime": 52.3897, | |
| "eval_samples_per_second": 16.091, | |
| "eval_steps_per_second": 16.091, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.022485946283572766, | |
| "grad_norm": 8384.0, | |
| "learning_rate": 0.03617269130556171, | |
| "loss": 13.996, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.0231105559025609, | |
| "grad_norm": 438.0, | |
| "learning_rate": 0.035457486883172316, | |
| "loss": 10.1482, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.023735165521549032, | |
| "grad_norm": 6464.0, | |
| "learning_rate": 0.03473175265793479, | |
| "loss": 9.1758, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.024359775140537165, | |
| "grad_norm": 596.0, | |
| "learning_rate": 0.033996219382696063, | |
| "loss": 8.7717, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.024984384759525295, | |
| "grad_norm": 44288.0, | |
| "learning_rate": 0.033251627677115835, | |
| "loss": 10.1332, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.024984384759525295, | |
| "eval_loss": 9.592761993408203, | |
| "eval_runtime": 56.6259, | |
| "eval_samples_per_second": 14.887, | |
| "eval_steps_per_second": 14.887, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.02560899437851343, | |
| "grad_norm": 2224.0, | |
| "learning_rate": 0.032498727281925266, | |
| "loss": 9.5517, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.02623360399750156, | |
| "grad_norm": 4192.0, | |
| "learning_rate": 0.0317382763040017, | |
| "loss": 8.8907, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.026858213616489695, | |
| "grad_norm": 4640.0, | |
| "learning_rate": 0.030971040453019225, | |
| "loss": 8.6126, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.027482823235477825, | |
| "grad_norm": 960.0, | |
| "learning_rate": 0.03019779227044398, | |
| "loss": 9.0244, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.028107432854465958, | |
| "grad_norm": 5760.0, | |
| "learning_rate": 0.029419310351650393, | |
| "loss": 8.9802, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.028107432854465958, | |
| "eval_loss": 8.143258094787598, | |
| "eval_runtime": 56.465, | |
| "eval_samples_per_second": 14.93, | |
| "eval_steps_per_second": 14.93, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.02873204247345409, | |
| "grad_norm": 442368.0, | |
| "learning_rate": 0.02863637856194159, | |
| "loss": 8.4952, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.029356652092442224, | |
| "grad_norm": 272.0, | |
| "learning_rate": 0.027849785247263517, | |
| "loss": 7.9457, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.029981261711430358, | |
| "grad_norm": 2240.0, | |
| "learning_rate": 0.02706032244040741, | |
| "loss": 7.8189, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.030605871330418487, | |
| "grad_norm": 2448.0, | |
| "learning_rate": 0.026268785063499858, | |
| "loss": 8.0747, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.03123048094940662, | |
| "grad_norm": 1896.0, | |
| "learning_rate": 0.025475970127583666, | |
| "loss": 14.4091, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03123048094940662, | |
| "eval_loss": 16.79290771484375, | |
| "eval_runtime": 52.5411, | |
| "eval_samples_per_second": 16.045, | |
| "eval_steps_per_second": 16.045, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03185509056839475, | |
| "grad_norm": 8192.0, | |
| "learning_rate": 0.024682675930095266, | |
| "loss": 15.3158, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.03247970018738289, | |
| "grad_norm": 112.5, | |
| "learning_rate": 0.02388970125104685, | |
| "loss": 13.7666, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.03310430980637102, | |
| "grad_norm": 4992.0, | |
| "learning_rate": 0.02309784454872262, | |
| "loss": 12.1874, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.033728919425359154, | |
| "grad_norm": 278.0, | |
| "learning_rate": 0.022307903155699027, | |
| "loss": 9.8201, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.034353529044347283, | |
| "grad_norm": 1480.0, | |
| "learning_rate": 0.02152067247599837, | |
| "loss": 10.4309, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.034353529044347283, | |
| "eval_loss": 10.767745971679688, | |
| "eval_runtime": 53.7213, | |
| "eval_samples_per_second": 15.692, | |
| "eval_steps_per_second": 15.692, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.03497813866333541, | |
| "grad_norm": 1072.0, | |
| "learning_rate": 0.020736945184184407, | |
| "loss": 9.5244, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.03560274828232355, | |
| "grad_norm": 2352.0, | |
| "learning_rate": 0.019957510427206296, | |
| "loss": 8.832, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.03622735790131168, | |
| "grad_norm": 540.0, | |
| "learning_rate": 0.01918315302979444, | |
| "loss": 10.0747, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.03685196752029981, | |
| "grad_norm": 600.0, | |
| "learning_rate": 0.018414652704208584, | |
| "loss": 8.9236, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.037476577139287946, | |
| "grad_norm": 644.0, | |
| "learning_rate": 0.017652783265133608, | |
| "loss": 8.524, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.037476577139287946, | |
| "eval_loss": 8.281222343444824, | |
| "eval_runtime": 52.4549, | |
| "eval_samples_per_second": 16.071, | |
| "eval_steps_per_second": 16.071, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.038101186758276076, | |
| "grad_norm": 56.75, | |
| "learning_rate": 0.01689831185051374, | |
| "loss": 8.3835, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.03872579637726421, | |
| "grad_norm": 864.0, | |
| "learning_rate": 0.016151998149109708, | |
| "loss": 7.9404, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.03935040599625234, | |
| "grad_norm": 458.0, | |
| "learning_rate": 0.015414593635556518, | |
| "loss": 8.7404, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.03997501561524047, | |
| "grad_norm": 412.0, | |
| "learning_rate": 0.014686840813692224, | |
| "loss": 7.7151, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.04059962523422861, | |
| "grad_norm": 95.5, | |
| "learning_rate": 0.013969472468919462, | |
| "loss": 7.7596, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.04059962523422861, | |
| "eval_loss": 7.7680439949035645, | |
| "eval_runtime": 55.2903, | |
| "eval_samples_per_second": 15.247, | |
| "eval_steps_per_second": 15.247, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.04122423485321674, | |
| "grad_norm": 876.0, | |
| "learning_rate": 0.013263210930352737, | |
| "loss": 8.5446, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.04184884447220487, | |
| "grad_norm": 356.0, | |
| "learning_rate": 0.01256876734349413, | |
| "loss": 9.8441, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.042473454091193005, | |
| "grad_norm": 21.0, | |
| "learning_rate": 0.011886840954170141, | |
| "loss": 8.588, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.043098063710181135, | |
| "grad_norm": 158.0, | |
| "learning_rate": 0.011218118404450424, | |
| "loss": 7.9069, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.04372267332916927, | |
| "grad_norm": 1848.0, | |
| "learning_rate": 0.010563273041257332, | |
| "loss": 8.0914, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.04372267332916927, | |
| "eval_loss": 8.19365119934082, | |
| "eval_runtime": 67.1703, | |
| "eval_samples_per_second": 12.55, | |
| "eval_steps_per_second": 12.55, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.0443472829481574, | |
| "grad_norm": 1304.0, | |
| "learning_rate": 0.009922964238362761, | |
| "loss": 8.057, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.04497189256714553, | |
| "grad_norm": 187.0, | |
| "learning_rate": 0.009297836732454564, | |
| "loss": 7.3939, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.04559650218613367, | |
| "grad_norm": 744.0, | |
| "learning_rate": 0.0086885199739414, | |
| "loss": 7.4594, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.0462211118051218, | |
| "grad_norm": 35.0, | |
| "learning_rate": 0.00809562749314952, | |
| "loss": 7.2934, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.046845721424109935, | |
| "grad_norm": 65024.0, | |
| "learning_rate": 0.0075197562825497334, | |
| "loss": 7.937, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.046845721424109935, | |
| "eval_loss": 8.636336326599121, | |
| "eval_runtime": 52.4038, | |
| "eval_samples_per_second": 16.087, | |
| "eval_steps_per_second": 16.087, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.047470331043098064, | |
| "grad_norm": 117.5, | |
| "learning_rate": 0.006961486195636613, | |
| "loss": 8.5237, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.048094940662086194, | |
| "grad_norm": 1024.0, | |
| "learning_rate": 0.006421379363065142, | |
| "loss": 8.9334, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.04871955028107433, | |
| "grad_norm": 1216.0, | |
| "learning_rate": 0.005899979626632835, | |
| "loss": 7.7087, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.04934415990006246, | |
| "grad_norm": 1120.0, | |
| "learning_rate": 0.005397811991677107, | |
| "loss": 8.1393, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.04996876951905059, | |
| "grad_norm": 374.0, | |
| "learning_rate": 0.0049153820984394365, | |
| "loss": 9.1765, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.04996876951905059, | |
| "eval_loss": 9.214335441589355, | |
| "eval_runtime": 54.987, | |
| "eval_samples_per_second": 15.331, | |
| "eval_steps_per_second": 15.331, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.05059337913803873, | |
| "grad_norm": 1072.0, | |
| "learning_rate": 0.004453175712928476, | |
| "loss": 8.6992, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.05121798875702686, | |
| "grad_norm": 3584.0, | |
| "learning_rate": 0.004011658237794877, | |
| "loss": 7.6904, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.051842598376014994, | |
| "grad_norm": 121.5, | |
| "learning_rate": 0.003591274243710277, | |
| "loss": 8.3538, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.05246720799500312, | |
| "grad_norm": 524.0, | |
| "learning_rate": 0.0031924470217222834, | |
| "loss": 7.6898, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.05309181761399125, | |
| "grad_norm": 516.0, | |
| "learning_rate": 0.002815578157036303, | |
| "loss": 8.4977, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.05309181761399125, | |
| "eval_loss": 8.241488456726074, | |
| "eval_runtime": 55.3183, | |
| "eval_samples_per_second": 15.239, | |
| "eval_steps_per_second": 15.239, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.05371642723297939, | |
| "grad_norm": 482.0, | |
| "learning_rate": 0.002461047124653279, | |
| "loss": 8.2881, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.05434103685196752, | |
| "grad_norm": 1864.0, | |
| "learning_rate": 0.0021292109072704956, | |
| "loss": 7.7393, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.05496564647095565, | |
| "grad_norm": 181.0, | |
| "learning_rate": 0.0018204036358303172, | |
| "loss": 7.7977, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.055590256089943786, | |
| "grad_norm": 70.0, | |
| "learning_rate": 0.001534936253078606, | |
| "loss": 7.4679, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.056214865708931916, | |
| "grad_norm": 756.0, | |
| "learning_rate": 0.0012730962004717683, | |
| "loss": 6.7936, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.056214865708931916, | |
| "eval_loss": 7.608245372772217, | |
| "eval_runtime": 53.9447, | |
| "eval_samples_per_second": 15.627, | |
| "eval_steps_per_second": 15.627, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.05683947532792005, | |
| "grad_norm": 63.5, | |
| "learning_rate": 0.0010351471287475406, | |
| "loss": 7.814, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.05746408494690818, | |
| "grad_norm": 1088.0, | |
| "learning_rate": 0.0008213286324510738, | |
| "loss": 7.1516, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.05808869456589631, | |
| "grad_norm": 268.0, | |
| "learning_rate": 0.000631856008683518, | |
| "loss": 7.2965, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.05871330418488445, | |
| "grad_norm": 600.0, | |
| "learning_rate": 0.00046692004031609894, | |
| "loss": 7.5007, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.05933791380387258, | |
| "grad_norm": 148.0, | |
| "learning_rate": 0.0003266868038879434, | |
| "loss": 7.4526, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.05933791380387258, | |
| "eval_loss": 7.6293864250183105, | |
| "eval_runtime": 52.7226, | |
| "eval_samples_per_second": 15.989, | |
| "eval_steps_per_second": 15.989, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.059962523422860715, | |
| "grad_norm": 40.5, | |
| "learning_rate": 0.00021129750238107204, | |
| "loss": 6.8338, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.060587133041848845, | |
| "grad_norm": 268.0, | |
| "learning_rate": 0.00012086832304096795, | |
| "loss": 7.0505, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.061211742660836975, | |
| "grad_norm": 76.5, | |
| "learning_rate": 5.54903203858731e-05, | |
| "loss": 7.9223, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.06183635227982511, | |
| "grad_norm": 108.0, | |
| "learning_rate": 1.5229324522605948e-05, | |
| "loss": 7.8772, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.06246096189881324, | |
| "grad_norm": 516.0, | |
| "learning_rate": 1.2587486122317416e-07, | |
| "loss": 7.4005, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.06246096189881324, | |
| "eval_loss": 7.635000705718994, | |
| "eval_runtime": 52.4898, | |
| "eval_samples_per_second": 16.06, | |
| "eval_steps_per_second": 16.06, | |
| "step": 1000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 20, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 20 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.08462163968e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |