{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1320,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.022727272727272728,
      "grad_norm": 4.216299100450332,
      "learning_rate": 7.575757575757576e-07,
      "loss": 0.8793,
      "step": 10
    },
    {
      "epoch": 0.045454545454545456,
      "grad_norm": 2.056435316321031,
      "learning_rate": 1.5151515151515152e-06,
      "loss": 0.8057,
      "step": 20
    },
    {
      "epoch": 0.06818181818181818,
      "grad_norm": 1.9275540206080632,
      "learning_rate": 2.2727272727272728e-06,
      "loss": 0.7678,
      "step": 30
    },
    {
      "epoch": 0.09090909090909091,
      "grad_norm": 1.7253286859097539,
      "learning_rate": 3.0303030303030305e-06,
      "loss": 0.7336,
      "step": 40
    },
    {
      "epoch": 0.11363636363636363,
      "grad_norm": 1.278366987427592,
      "learning_rate": 3.7878787878787882e-06,
      "loss": 0.6895,
      "step": 50
    },
    {
      "epoch": 0.13636363636363635,
      "grad_norm": 1.1739375472499556,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.676,
      "step": 60
    },
    {
      "epoch": 0.1590909090909091,
      "grad_norm": 1.1586165476819004,
      "learning_rate": 5.303030303030303e-06,
      "loss": 0.6864,
      "step": 70
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 1.133793120521184,
      "learning_rate": 6.060606060606061e-06,
      "loss": 0.6523,
      "step": 80
    },
    {
      "epoch": 0.20454545454545456,
      "grad_norm": 1.2445713032210026,
      "learning_rate": 6.818181818181818e-06,
      "loss": 0.674,
      "step": 90
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 1.2505387601379425,
      "learning_rate": 7.5757575757575764e-06,
      "loss": 0.6568,
      "step": 100
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.1132683168525286,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.6155,
      "step": 110
    },
    {
      "epoch": 0.2727272727272727,
      "grad_norm": 1.433006253708681,
      "learning_rate": 9.090909090909091e-06,
      "loss": 0.6012,
      "step": 120
    },
    {
      "epoch": 0.29545454545454547,
      "grad_norm": 1.2478812183923813,
      "learning_rate": 9.84848484848485e-06,
      "loss": 0.5775,
      "step": 130
    },
    {
      "epoch": 0.3181818181818182,
      "grad_norm": 1.342826770916446,
      "learning_rate": 9.99888115313551e-06,
      "loss": 0.6196,
      "step": 140
    },
    {
      "epoch": 0.3409090909090909,
      "grad_norm": 1.4940098193114923,
      "learning_rate": 9.994336695915041e-06,
      "loss": 0.6026,
      "step": 150
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 1.9439510780638498,
      "learning_rate": 9.986299875742612e-06,
      "loss": 0.6041,
      "step": 160
    },
    {
      "epoch": 0.38636363636363635,
      "grad_norm": 1.3507972394977277,
      "learning_rate": 9.97477631248223e-06,
      "loss": 0.6199,
      "step": 170
    },
    {
      "epoch": 0.4090909090909091,
      "grad_norm": 1.2713852384788942,
      "learning_rate": 9.959774064153977e-06,
      "loss": 0.5927,
      "step": 180
    },
    {
      "epoch": 0.4318181818181818,
      "grad_norm": 1.4909588519341612,
      "learning_rate": 9.941303621299332e-06,
      "loss": 0.5789,
      "step": 190
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 1.3989309265695034,
      "learning_rate": 9.919377899645497e-06,
      "loss": 0.5824,
      "step": 200
    },
    {
      "epoch": 0.4772727272727273,
      "grad_norm": 1.361953413747448,
      "learning_rate": 9.894012231073895e-06,
      "loss": 0.5764,
      "step": 210
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.707171586855627,
      "learning_rate": 9.86522435289912e-06,
      "loss": 0.6241,
      "step": 220
    },
    {
      "epoch": 0.5227272727272727,
      "grad_norm": 1.558499254125664,
      "learning_rate": 9.833034395465866e-06,
      "loss": 0.5793,
      "step": 230
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 1.4598536682151586,
      "learning_rate": 9.797464868072489e-06,
      "loss": 0.593,
      "step": 240
    },
    {
      "epoch": 0.5681818181818182,
      "grad_norm": 1.2976070663198394,
      "learning_rate": 9.758540643231041e-06,
      "loss": 0.5758,
      "step": 250
    },
    {
      "epoch": 0.5909090909090909,
      "grad_norm": 1.425671844427152,
      "learning_rate": 9.716288939274818e-06,
      "loss": 0.5967,
      "step": 260
    },
    {
      "epoch": 0.6136363636363636,
      "grad_norm": 1.802751724655766,
      "learning_rate": 9.670739301325534e-06,
      "loss": 0.5837,
      "step": 270
    },
    {
      "epoch": 0.6363636363636364,
      "grad_norm": 1.4286258682851425,
      "learning_rate": 9.621923580633462e-06,
      "loss": 0.5833,
      "step": 280
    },
    {
      "epoch": 0.6590909090909091,
      "grad_norm": 1.4862503722831126,
      "learning_rate": 9.56987591230498e-06,
      "loss": 0.5917,
      "step": 290
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 1.095266143817191,
      "learning_rate": 9.514632691433108e-06,
      "loss": 0.5279,
      "step": 300
    },
    {
      "epoch": 0.7045454545454546,
      "grad_norm": 1.1224324886180095,
      "learning_rate": 9.456232547647695e-06,
      "loss": 0.6009,
      "step": 310
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 1.3051649056954844,
      "learning_rate": 9.394716318103098e-06,
      "loss": 0.5929,
      "step": 320
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.5824844748662101,
      "learning_rate": 9.330127018922195e-06,
      "loss": 0.5595,
      "step": 330
    },
    {
      "epoch": 0.7727272727272727,
      "grad_norm": 1.0544710779375677,
      "learning_rate": 9.262509815116732e-06,
      "loss": 0.5794,
      "step": 340
    },
    {
      "epoch": 0.7954545454545454,
      "grad_norm": 1.251551622142736,
      "learning_rate": 9.191911989005038e-06,
      "loss": 0.5426,
      "step": 350
    },
    {
      "epoch": 0.8181818181818182,
      "grad_norm": 1.2264688941801714,
      "learning_rate": 9.118382907149164e-06,
      "loss": 0.5594,
      "step": 360
    },
    {
      "epoch": 0.8409090909090909,
      "grad_norm": 1.0399260483816235,
      "learning_rate": 9.041973985834595e-06,
      "loss": 0.5282,
      "step": 370
    },
    {
      "epoch": 0.8636363636363636,
      "grad_norm": 1.2474857073082677,
      "learning_rate": 8.96273865511666e-06,
      "loss": 0.5537,
      "step": 380
    },
    {
      "epoch": 0.8863636363636364,
      "grad_norm": 1.191985776599411,
      "learning_rate": 8.880732321458785e-06,
      "loss": 0.6053,
      "step": 390
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 1.5427483242501547,
      "learning_rate": 8.796012328988716e-06,
      "loss": 0.571,
      "step": 400
    },
    {
      "epoch": 0.9318181818181818,
      "grad_norm": 1.260434651752352,
      "learning_rate": 8.708637919399798e-06,
      "loss": 0.5767,
      "step": 410
    },
    {
      "epoch": 0.9545454545454546,
      "grad_norm": 1.176587569159202,
      "learning_rate": 8.61867019052535e-06,
      "loss": 0.5645,
      "step": 420
    },
    {
      "epoch": 0.9772727272727273,
      "grad_norm": 1.0939445746618568,
      "learning_rate": 8.526172053615122e-06,
      "loss": 0.5667,
      "step": 430
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.0948273947100784,
      "learning_rate": 8.43120818934367e-06,
      "loss": 0.5658,
      "step": 440
    },
    {
      "epoch": 1.0227272727272727,
      "grad_norm": 1.869119754894398,
      "learning_rate": 8.33384500258146e-06,
      "loss": 0.4976,
      "step": 450
    },
    {
      "epoch": 1.0454545454545454,
      "grad_norm": 1.0225134404780003,
      "learning_rate": 8.234150575960288e-06,
      "loss": 0.4736,
      "step": 460
    },
    {
      "epoch": 1.0681818181818181,
      "grad_norm": 1.0910125912719129,
      "learning_rate": 8.132194622265508e-06,
      "loss": 0.4617,
      "step": 470
    },
    {
      "epoch": 1.0909090909090908,
      "grad_norm": 1.2388544525919998,
      "learning_rate": 8.028048435688333e-06,
      "loss": 0.4424,
      "step": 480
    },
    {
      "epoch": 1.1136363636363635,
      "grad_norm": 0.9660473941968221,
      "learning_rate": 7.921784841972355e-06,
      "loss": 0.4518,
      "step": 490
    },
    {
      "epoch": 1.1363636363636362,
      "grad_norm": 1.1220630535855642,
      "learning_rate": 7.813478147489052e-06,
      "loss": 0.4633,
      "step": 500
    },
    {
      "epoch": 1.1590909090909092,
      "grad_norm": 1.1167849673537338,
      "learning_rate": 7.703204087277989e-06,
      "loss": 0.4868,
      "step": 510
    },
    {
      "epoch": 1.1818181818181819,
      "grad_norm": 0.9638438879759306,
      "learning_rate": 7.5910397720879785e-06,
      "loss": 0.4593,
      "step": 520
    },
    {
      "epoch": 1.2045454545454546,
      "grad_norm": 1.1970441399867813,
      "learning_rate": 7.477063634456263e-06,
      "loss": 0.4242,
      "step": 530
    },
    {
      "epoch": 1.2272727272727273,
      "grad_norm": 1.1125673218050052,
      "learning_rate": 7.361355373863415e-06,
      "loss": 0.4542,
      "step": 540
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.212090924206183,
      "learning_rate": 7.243995901002312e-06,
      "loss": 0.4719,
      "step": 550
    },
    {
      "epoch": 1.2727272727272727,
      "grad_norm": 1.3189335875058832,
      "learning_rate": 7.1250672812001505e-06,
      "loss": 0.4109,
      "step": 560
    },
    {
      "epoch": 1.2954545454545454,
      "grad_norm": 1.0875934749907739,
      "learning_rate": 7.004652677033069e-06,
      "loss": 0.4574,
      "step": 570
    },
    {
      "epoch": 1.3181818181818181,
      "grad_norm": 1.0235983526316594,
      "learning_rate": 6.882836290173493e-06,
      "loss": 0.4282,
      "step": 580
    },
    {
      "epoch": 1.3409090909090908,
      "grad_norm": 0.9212342358034311,
      "learning_rate": 6.759703302510898e-06,
      "loss": 0.4632,
      "step": 590
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 1.49044649738047,
      "learning_rate": 6.635339816587109e-06,
      "loss": 0.4302,
      "step": 600
    },
    {
      "epoch": 1.3863636363636362,
      "grad_norm": 1.073316467245383,
      "learning_rate": 6.5098327953878585e-06,
      "loss": 0.4941,
      "step": 610
    },
    {
      "epoch": 1.4090909090909092,
      "grad_norm": 1.0637166263057045,
      "learning_rate": 6.383270001532636e-06,
      "loss": 0.4458,
      "step": 620
    },
    {
      "epoch": 1.4318181818181819,
      "grad_norm": 1.1700782600055681,
      "learning_rate": 6.255739935905396e-06,
      "loss": 0.4477,
      "step": 630
    },
    {
      "epoch": 1.4545454545454546,
      "grad_norm": 1.2623627275586613,
      "learning_rate": 6.127331775769023e-06,
      "loss": 0.4391,
      "step": 640
    },
    {
      "epoch": 1.4772727272727273,
      "grad_norm": 1.1157521052470225,
      "learning_rate": 5.998135312406821e-06,
      "loss": 0.4362,
      "step": 650
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.0863888181765478,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.4456,
      "step": 660
    },
    {
      "epoch": 1.5227272727272727,
      "grad_norm": 1.3928604336515944,
      "learning_rate": 5.737739334127611e-06,
      "loss": 0.4287,
      "step": 670
    },
    {
      "epoch": 1.5454545454545454,
      "grad_norm": 1.1254993222217602,
      "learning_rate": 5.60672190490541e-06,
      "loss": 0.4379,
      "step": 680
    },
    {
      "epoch": 1.5681818181818183,
      "grad_norm": 1.1485234304507437,
      "learning_rate": 5.475280216520913e-06,
      "loss": 0.4178,
      "step": 690
    },
    {
      "epoch": 1.5909090909090908,
      "grad_norm": 1.3586597877374391,
      "learning_rate": 5.343506181496405e-06,
      "loss": 0.4592,
      "step": 700
    },
    {
      "epoch": 1.6136363636363638,
      "grad_norm": 0.9803070225742867,
      "learning_rate": 5.2114919447524155e-06,
      "loss": 0.4382,
      "step": 710
    },
    {
      "epoch": 1.6363636363636362,
      "grad_norm": 1.1176208192540875,
      "learning_rate": 5.07932981917404e-06,
      "loss": 0.4695,
      "step": 720
    },
    {
      "epoch": 1.6590909090909092,
      "grad_norm": 1.0325024296236278,
      "learning_rate": 4.947112221059803e-06,
      "loss": 0.4436,
      "step": 730
    },
    {
      "epoch": 1.6818181818181817,
      "grad_norm": 1.0956622454209288,
      "learning_rate": 4.81493160549821e-06,
      "loss": 0.4278,
      "step": 740
    },
    {
      "epoch": 1.7045454545454546,
      "grad_norm": 1.1280883864313334,
      "learning_rate": 4.682880401717178e-06,
      "loss": 0.451,
      "step": 750
    },
    {
      "epoch": 1.7272727272727273,
      "grad_norm": 0.8436589146748317,
      "learning_rate": 4.551050948451542e-06,
      "loss": 0.4337,
      "step": 760
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.002644552716769,
      "learning_rate": 4.4195354293738484e-06,
      "loss": 0.4174,
      "step": 770
    },
    {
      "epoch": 1.7727272727272727,
      "grad_norm": 0.8500087740708504,
      "learning_rate": 4.2884258086335755e-06,
      "loss": 0.4572,
      "step": 780
    },
    {
      "epoch": 1.7954545454545454,
      "grad_norm": 1.165083596610894,
      "learning_rate": 4.1578137665498485e-06,
      "loss": 0.4331,
      "step": 790
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 1.0059665177740011,
      "learning_rate": 4.027790635502646e-06,
      "loss": 0.4445,
      "step": 800
    },
    {
      "epoch": 1.8409090909090908,
      "grad_norm": 2.003774894211316,
      "learning_rate": 3.898447336067297e-06,
      "loss": 0.4699,
      "step": 810
    },
    {
      "epoch": 1.8636363636363638,
      "grad_norm": 1.1797465582944633,
      "learning_rate": 3.769874313436933e-06,
      "loss": 0.5095,
      "step": 820
    },
    {
      "epoch": 1.8863636363636362,
      "grad_norm": 1.0163777930722036,
      "learning_rate": 3.6421614741773702e-06,
      "loss": 0.4285,
      "step": 830
    },
    {
      "epoch": 1.9090909090909092,
      "grad_norm": 1.0541924430540879,
      "learning_rate": 3.5153981233586277e-06,
      "loss": 0.4199,
      "step": 840
    },
    {
      "epoch": 1.9318181818181817,
      "grad_norm": 1.1028658179696385,
      "learning_rate": 3.389672902107044e-06,
      "loss": 0.4591,
      "step": 850
    },
    {
      "epoch": 1.9545454545454546,
      "grad_norm": 1.098963724725395,
      "learning_rate": 3.2650737256216885e-06,
      "loss": 0.4179,
      "step": 860
    },
    {
      "epoch": 1.9772727272727273,
      "grad_norm": 0.9061861814538137,
      "learning_rate": 3.141687721698363e-06,
      "loss": 0.4167,
      "step": 870
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.3924708477763184,
      "learning_rate": 3.019601169804216e-06,
      "loss": 0.469,
      "step": 880
    },
    {
      "epoch": 2.022727272727273,
      "grad_norm": 1.2456296595255472,
      "learning_rate": 2.898899440745569e-06,
      "loss": 0.3491,
      "step": 890
    },
    {
      "epoch": 2.0454545454545454,
      "grad_norm": 0.8978328163628839,
      "learning_rate": 2.7796669369711294e-06,
      "loss": 0.308,
      "step": 900
    },
    {
      "epoch": 2.0681818181818183,
      "grad_norm": 0.8598502090088423,
      "learning_rate": 2.6619870335523434e-06,
      "loss": 0.3655,
      "step": 910
    },
    {
      "epoch": 2.090909090909091,
      "grad_norm": 0.8171399237916982,
      "learning_rate": 2.5459420198821604e-06,
      "loss": 0.3383,
      "step": 920
    },
    {
      "epoch": 2.1136363636363638,
      "grad_norm": 0.965447288251514,
      "learning_rate": 2.4316130421329696e-06,
      "loss": 0.3162,
      "step": 930
    },
    {
      "epoch": 2.1363636363636362,
      "grad_norm": 0.8484113747287527,
      "learning_rate": 2.319080046513954e-06,
      "loss": 0.3399,
      "step": 940
    },
    {
      "epoch": 2.159090909090909,
      "grad_norm": 0.9794879904206493,
      "learning_rate": 2.2084217233675386e-06,
      "loss": 0.3488,
      "step": 950
    },
    {
      "epoch": 2.1818181818181817,
      "grad_norm": 0.8913446271065173,
      "learning_rate": 2.09971545214401e-06,
      "loss": 0.3324,
      "step": 960
    },
    {
      "epoch": 2.2045454545454546,
      "grad_norm": 1.4051618963825436,
      "learning_rate": 1.9930372472928095e-06,
      "loss": 0.3339,
      "step": 970
    },
    {
      "epoch": 2.227272727272727,
      "grad_norm": 0.9626015554026367,
      "learning_rate": 1.8884617051083183e-06,
      "loss": 0.3375,
      "step": 980
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.8172835382662239,
      "learning_rate": 1.7860619515673034e-06,
      "loss": 0.3164,
      "step": 990
    },
    {
      "epoch": 2.2727272727272725,
      "grad_norm": 1.2647707717613101,
      "learning_rate": 1.68590959119452e-06,
      "loss": 0.359,
      "step": 1000
    },
    {
      "epoch": 2.2954545454545454,
      "grad_norm": 0.9792953799185291,
      "learning_rate": 1.5880746569921867e-06,
      "loss": 0.3332,
      "step": 1010
    },
    {
      "epoch": 2.3181818181818183,
      "grad_norm": 1.158589848827058,
      "learning_rate": 1.4926255614683931e-06,
      "loss": 0.3246,
      "step": 1020
    },
    {
      "epoch": 2.340909090909091,
      "grad_norm": 0.9481933742154114,
      "learning_rate": 1.3996290487986568e-06,
      "loss": 0.3292,
      "step": 1030
    },
    {
      "epoch": 2.3636363636363638,
      "grad_norm": 0.8448865059667644,
      "learning_rate": 1.3091501481540676e-06,
      "loss": 0.3326,
      "step": 1040
    },
    {
      "epoch": 2.3863636363636362,
      "grad_norm": 1.2795734110759045,
      "learning_rate": 1.2212521282287093e-06,
      "loss": 0.3132,
      "step": 1050
    },
    {
      "epoch": 2.409090909090909,
      "grad_norm": 0.9968497266675121,
      "learning_rate": 1.135996452998085e-06,
      "loss": 0.3124,
      "step": 1060
    },
    {
      "epoch": 2.4318181818181817,
      "grad_norm": 0.9014679818925503,
      "learning_rate": 1.0534427387395391e-06,
      "loss": 0.3384,
      "step": 1070
    },
    {
      "epoch": 2.4545454545454546,
      "grad_norm": 0.9030282427838496,
      "learning_rate": 9.73648712344707e-07,
      "loss": 0.3311,
      "step": 1080
    },
    {
      "epoch": 2.4772727272727275,
      "grad_norm": 0.8744414413885918,
      "learning_rate": 8.966701709531344e-07,
      "loss": 0.2984,
      "step": 1090
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.8547240058983322,
      "learning_rate": 8.225609429353187e-07,
      "loss": 0.3592,
      "step": 1100
    },
    {
      "epoch": 2.5227272727272725,
      "grad_norm": 0.7559827852933007,
      "learning_rate": 7.513728502524286e-07,
      "loss": 0.3377,
      "step": 1110
    },
    {
      "epoch": 2.5454545454545454,
      "grad_norm": 1.0314710766418276,
      "learning_rate": 6.831556722190453e-07,
      "loss": 0.3308,
      "step": 1120
    },
    {
      "epoch": 2.5681818181818183,
      "grad_norm": 0.9841664198438486,
      "learning_rate": 6.179571106942466e-07,
      "loss": 0.3229,
      "step": 1130
    },
    {
      "epoch": 2.590909090909091,
      "grad_norm": 0.8922396963283047,
      "learning_rate": 5.558227567253832e-07,
      "loss": 0.3279,
      "step": 1140
    },
    {
      "epoch": 2.6136363636363638,
      "grad_norm": 1.029465590048154,
      "learning_rate": 4.967960586678722e-07,
      "loss": 0.3216,
      "step": 1150
    },
    {
      "epoch": 2.6363636363636362,
      "grad_norm": 0.9773512266201999,
      "learning_rate": 4.4091829180330503e-07,
      "loss": 0.3238,
      "step": 1160
    },
    {
      "epoch": 2.659090909090909,
      "grad_norm": 0.8896849987933176,
      "learning_rate": 3.882285294770938e-07,
      "loss": 0.3304,
      "step": 1170
    },
    {
      "epoch": 2.6818181818181817,
      "grad_norm": 0.8131842758725988,
      "learning_rate": 3.3876361577587115e-07,
      "loss": 0.3367,
      "step": 1180
    },
    {
      "epoch": 2.7045454545454546,
      "grad_norm": 0.9315090022423707,
      "learning_rate": 2.9255813976372227e-07,
      "loss": 0.3455,
      "step": 1190
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.9350701684403658,
      "learning_rate": 2.4964441129527337e-07,
      "loss": 0.3498,
      "step": 1200
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.9241666028305437,
      "learning_rate": 2.1005243842255552e-07,
      "loss": 0.3228,
      "step": 1210
    },
    {
      "epoch": 2.7727272727272725,
      "grad_norm": 0.9973114953100479,
      "learning_rate": 1.738099064114368e-07,
      "loss": 0.3243,
      "step": 1220
    },
    {
      "epoch": 2.7954545454545454,
      "grad_norm": 1.1498216741764613,
      "learning_rate": 1.4094215838229176e-07,
      "loss": 0.3546,
      "step": 1230
    },
    {
      "epoch": 2.8181818181818183,
      "grad_norm": 0.8459294019911658,
      "learning_rate": 1.1147217758845752e-07,
      "loss": 0.3449,
      "step": 1240
    },
    {
      "epoch": 2.840909090909091,
      "grad_norm": 0.9010685354140354,
      "learning_rate": 8.542057134485638e-08,
      "loss": 0.3619,
      "step": 1250
    },
    {
      "epoch": 2.8636363636363638,
      "grad_norm": 0.9441507651135089,
      "learning_rate": 6.280555661802857e-08,
      "loss": 0.292,
      "step": 1260
    },
    {
      "epoch": 2.8863636363636362,
      "grad_norm": 0.8552184854323925,
      "learning_rate": 4.3642947287654284e-08,
      "loss": 0.3406,
      "step": 1270
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 1.0207055525449416,
      "learning_rate": 2.7946143088466437e-08,
      "loss": 0.3347,
      "step": 1280
    },
    {
      "epoch": 2.9318181818181817,
      "grad_norm": 1.264848487223289,
      "learning_rate": 1.5726120240288632e-08,
      "loss": 0.3179,
      "step": 1290
    },
    {
      "epoch": 2.9545454545454546,
      "grad_norm": 0.9986328803754655,
      "learning_rate": 6.991423772753636e-09,
      "loss": 0.3346,
      "step": 1300
    },
    {
      "epoch": 2.9772727272727275,
      "grad_norm": 0.9347293247423821,
      "learning_rate": 1.7481615500691829e-09,
      "loss": 0.3288,
      "step": 1310
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.762892298177065,
      "learning_rate": 0.0,
      "loss": 0.3526,
      "step": 1320
    },
    {
      "epoch": 3.0,
      "step": 1320,
      "total_flos": 178716929818624.0,
      "train_loss": 0.46517832586259555,
      "train_runtime": 7796.3988,
      "train_samples_per_second": 1.352,
      "train_steps_per_second": 0.169
    }
  ],
  "logging_steps": 10,
  "max_steps": 1320,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 178716929818624.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}