| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 1722, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.017421602787456445, |
| "grad_norm": 4.23232260712533, |
| "learning_rate": 5e-06, |
| "loss": 0.8052, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03484320557491289, |
| "grad_norm": 1.090448885849464, |
| "learning_rate": 5e-06, |
| "loss": 0.7339, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.05226480836236934, |
| "grad_norm": 0.5594978328465695, |
| "learning_rate": 5e-06, |
| "loss": 0.7095, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06968641114982578, |
| "grad_norm": 1.1923898055066504, |
| "learning_rate": 5e-06, |
| "loss": 0.6814, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08710801393728224, |
| "grad_norm": 0.5747583450737418, |
| "learning_rate": 5e-06, |
| "loss": 0.6829, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.10452961672473868, |
| "grad_norm": 0.785119232212872, |
| "learning_rate": 5e-06, |
| "loss": 0.6835, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.12195121951219512, |
| "grad_norm": 1.5728993899325203, |
| "learning_rate": 5e-06, |
| "loss": 0.665, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.13937282229965156, |
| "grad_norm": 0.7455142878474823, |
| "learning_rate": 5e-06, |
| "loss": 0.6713, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.156794425087108, |
| "grad_norm": 0.6988533368272343, |
| "learning_rate": 5e-06, |
| "loss": 0.6674, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.17421602787456447, |
| "grad_norm": 0.577576926357653, |
| "learning_rate": 5e-06, |
| "loss": 0.6559, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1916376306620209, |
| "grad_norm": 0.43710001001993254, |
| "learning_rate": 5e-06, |
| "loss": 0.658, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.20905923344947736, |
| "grad_norm": 0.4588280367846969, |
| "learning_rate": 5e-06, |
| "loss": 0.6614, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2264808362369338, |
| "grad_norm": 1.0622546678030131, |
| "learning_rate": 5e-06, |
| "loss": 0.6418, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.24390243902439024, |
| "grad_norm": 0.4876367656167722, |
| "learning_rate": 5e-06, |
| "loss": 0.6506, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2613240418118467, |
| "grad_norm": 0.7010307485292822, |
| "learning_rate": 5e-06, |
| "loss": 0.6557, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2787456445993031, |
| "grad_norm": 0.5661043333608667, |
| "learning_rate": 5e-06, |
| "loss": 0.6606, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2961672473867596, |
| "grad_norm": 0.6447522539807854, |
| "learning_rate": 5e-06, |
| "loss": 0.648, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.313588850174216, |
| "grad_norm": 0.5557040024270404, |
| "learning_rate": 5e-06, |
| "loss": 0.6394, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3310104529616725, |
| "grad_norm": 0.5892120822328696, |
| "learning_rate": 5e-06, |
| "loss": 0.6324, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.34843205574912894, |
| "grad_norm": 1.2270732972578622, |
| "learning_rate": 5e-06, |
| "loss": 0.6391, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.36585365853658536, |
| "grad_norm": 0.7936010458943015, |
| "learning_rate": 5e-06, |
| "loss": 0.6513, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3832752613240418, |
| "grad_norm": 0.5219069470174827, |
| "learning_rate": 5e-06, |
| "loss": 0.6383, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.40069686411149824, |
| "grad_norm": 0.5167778651845648, |
| "learning_rate": 5e-06, |
| "loss": 0.642, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.4181184668989547, |
| "grad_norm": 0.6042604006779657, |
| "learning_rate": 5e-06, |
| "loss": 0.6355, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.4355400696864111, |
| "grad_norm": 0.5626765857595839, |
| "learning_rate": 5e-06, |
| "loss": 0.6349, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.4529616724738676, |
| "grad_norm": 0.6050093926243707, |
| "learning_rate": 5e-06, |
| "loss": 0.6375, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.47038327526132406, |
| "grad_norm": 1.1722586831285615, |
| "learning_rate": 5e-06, |
| "loss": 0.6368, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.4878048780487805, |
| "grad_norm": 0.6394366335041135, |
| "learning_rate": 5e-06, |
| "loss": 0.6349, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5052264808362369, |
| "grad_norm": 0.5632707832249847, |
| "learning_rate": 5e-06, |
| "loss": 0.6457, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5226480836236934, |
| "grad_norm": 0.4974262767629502, |
| "learning_rate": 5e-06, |
| "loss": 0.6415, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5400696864111498, |
| "grad_norm": 0.44045344640465933, |
| "learning_rate": 5e-06, |
| "loss": 0.6283, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5574912891986062, |
| "grad_norm": 0.6133307797581175, |
| "learning_rate": 5e-06, |
| "loss": 0.6305, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.5749128919860628, |
| "grad_norm": 0.6241919834071155, |
| "learning_rate": 5e-06, |
| "loss": 0.6325, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5923344947735192, |
| "grad_norm": 0.9604073604115976, |
| "learning_rate": 5e-06, |
| "loss": 0.6236, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6097560975609756, |
| "grad_norm": 0.4973268061310772, |
| "learning_rate": 5e-06, |
| "loss": 0.6289, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.627177700348432, |
| "grad_norm": 0.5223995080337381, |
| "learning_rate": 5e-06, |
| "loss": 0.6199, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6445993031358885, |
| "grad_norm": 1.5828632527530289, |
| "learning_rate": 5e-06, |
| "loss": 0.62, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.662020905923345, |
| "grad_norm": 0.8033684194335495, |
| "learning_rate": 5e-06, |
| "loss": 0.6229, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.6794425087108014, |
| "grad_norm": 0.4489264166825215, |
| "learning_rate": 5e-06, |
| "loss": 0.6176, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6968641114982579, |
| "grad_norm": 0.45304647738253323, |
| "learning_rate": 5e-06, |
| "loss": 0.6502, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 0.4583093519604427, |
| "learning_rate": 5e-06, |
| "loss": 0.6251, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.7317073170731707, |
| "grad_norm": 0.5180914331546351, |
| "learning_rate": 5e-06, |
| "loss": 0.6183, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.7491289198606271, |
| "grad_norm": 0.6239170894152923, |
| "learning_rate": 5e-06, |
| "loss": 0.6237, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.7665505226480837, |
| "grad_norm": 0.4926313951403478, |
| "learning_rate": 5e-06, |
| "loss": 0.6284, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.7839721254355401, |
| "grad_norm": 0.4995773601460432, |
| "learning_rate": 5e-06, |
| "loss": 0.6318, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.8013937282229965, |
| "grad_norm": 0.4485468257574125, |
| "learning_rate": 5e-06, |
| "loss": 0.6252, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.818815331010453, |
| "grad_norm": 0.5360793232656679, |
| "learning_rate": 5e-06, |
| "loss": 0.6212, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.8362369337979094, |
| "grad_norm": 0.45265558710756704, |
| "learning_rate": 5e-06, |
| "loss": 0.6156, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.8536585365853658, |
| "grad_norm": 0.4979934540526454, |
| "learning_rate": 5e-06, |
| "loss": 0.6235, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.8710801393728222, |
| "grad_norm": 0.46394699009987744, |
| "learning_rate": 5e-06, |
| "loss": 0.6206, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8885017421602788, |
| "grad_norm": 0.4482059620483837, |
| "learning_rate": 5e-06, |
| "loss": 0.6233, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.9059233449477352, |
| "grad_norm": 0.8667921510432307, |
| "learning_rate": 5e-06, |
| "loss": 0.6362, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.9233449477351916, |
| "grad_norm": 0.4876834431481369, |
| "learning_rate": 5e-06, |
| "loss": 0.6074, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.9407665505226481, |
| "grad_norm": 0.6031385118008376, |
| "learning_rate": 5e-06, |
| "loss": 0.6231, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.9581881533101045, |
| "grad_norm": 0.48495984846873175, |
| "learning_rate": 5e-06, |
| "loss": 0.6158, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.975609756097561, |
| "grad_norm": 0.5157653944459283, |
| "learning_rate": 5e-06, |
| "loss": 0.61, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.9930313588850174, |
| "grad_norm": 0.521220284257238, |
| "learning_rate": 5e-06, |
| "loss": 0.6317, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.6184476613998413, |
| "eval_runtime": 102.546, |
| "eval_samples_per_second": 150.762, |
| "eval_steps_per_second": 0.595, |
| "step": 574 |
| }, |
| { |
| "epoch": 1.0104529616724738, |
| "grad_norm": 0.6327651273127783, |
| "learning_rate": 5e-06, |
| "loss": 0.5832, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.0278745644599303, |
| "grad_norm": 0.5288866778412307, |
| "learning_rate": 5e-06, |
| "loss": 0.5889, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.0452961672473868, |
| "grad_norm": 0.5377085180191767, |
| "learning_rate": 5e-06, |
| "loss": 0.5807, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0627177700348431, |
| "grad_norm": 0.4465127927001278, |
| "learning_rate": 5e-06, |
| "loss": 0.5708, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.0801393728222997, |
| "grad_norm": 0.6776944622630791, |
| "learning_rate": 5e-06, |
| "loss": 0.5754, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.0975609756097562, |
| "grad_norm": 0.44453843430321455, |
| "learning_rate": 5e-06, |
| "loss": 0.5635, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.1149825783972125, |
| "grad_norm": 0.48396420132061224, |
| "learning_rate": 5e-06, |
| "loss": 0.5801, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.132404181184669, |
| "grad_norm": 0.5270761457237293, |
| "learning_rate": 5e-06, |
| "loss": 0.5686, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.1498257839721253, |
| "grad_norm": 0.45599815140341615, |
| "learning_rate": 5e-06, |
| "loss": 0.5642, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.1672473867595818, |
| "grad_norm": 0.5093721582712323, |
| "learning_rate": 5e-06, |
| "loss": 0.5831, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.1846689895470384, |
| "grad_norm": 0.4497684602732363, |
| "learning_rate": 5e-06, |
| "loss": 0.5608, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.202090592334495, |
| "grad_norm": 0.5143375374386742, |
| "learning_rate": 5e-06, |
| "loss": 0.5662, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.2195121951219512, |
| "grad_norm": 0.4451444116519644, |
| "learning_rate": 5e-06, |
| "loss": 0.5711, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2369337979094077, |
| "grad_norm": 0.46679960693678263, |
| "learning_rate": 5e-06, |
| "loss": 0.5657, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.254355400696864, |
| "grad_norm": 0.5006491283617938, |
| "learning_rate": 5e-06, |
| "loss": 0.5792, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.2717770034843205, |
| "grad_norm": 0.4495537005309244, |
| "learning_rate": 5e-06, |
| "loss": 0.5805, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.289198606271777, |
| "grad_norm": 0.4390068367423214, |
| "learning_rate": 5e-06, |
| "loss": 0.5775, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.3066202090592334, |
| "grad_norm": 0.4389959338405849, |
| "learning_rate": 5e-06, |
| "loss": 0.5654, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.32404181184669, |
| "grad_norm": 0.4420845718118641, |
| "learning_rate": 5e-06, |
| "loss": 0.5636, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.3414634146341464, |
| "grad_norm": 0.4351549602400041, |
| "learning_rate": 5e-06, |
| "loss": 0.5875, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.3588850174216027, |
| "grad_norm": 0.4222569130248892, |
| "learning_rate": 5e-06, |
| "loss": 0.5698, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.3763066202090593, |
| "grad_norm": 0.4393466391765974, |
| "learning_rate": 5e-06, |
| "loss": 0.5632, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.3937282229965158, |
| "grad_norm": 0.42922861834632947, |
| "learning_rate": 5e-06, |
| "loss": 0.568, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.411149825783972, |
| "grad_norm": 0.5197493756645316, |
| "learning_rate": 5e-06, |
| "loss": 0.571, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 0.4702610130943239, |
| "learning_rate": 5e-06, |
| "loss": 0.5795, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.445993031358885, |
| "grad_norm": 0.5713605410071362, |
| "learning_rate": 5e-06, |
| "loss": 0.581, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.4634146341463414, |
| "grad_norm": 0.4327164094855577, |
| "learning_rate": 5e-06, |
| "loss": 0.5672, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.480836236933798, |
| "grad_norm": 0.46324992297841866, |
| "learning_rate": 5e-06, |
| "loss": 0.5836, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.4982578397212545, |
| "grad_norm": 0.4056298274470244, |
| "learning_rate": 5e-06, |
| "loss": 0.5689, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.5156794425087108, |
| "grad_norm": 0.4106073081439743, |
| "learning_rate": 5e-06, |
| "loss": 0.5769, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.533101045296167, |
| "grad_norm": 0.5231152847115961, |
| "learning_rate": 5e-06, |
| "loss": 0.572, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.5505226480836236, |
| "grad_norm": 0.5162703511329246, |
| "learning_rate": 5e-06, |
| "loss": 0.5834, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.5679442508710801, |
| "grad_norm": 0.5257615502145718, |
| "learning_rate": 5e-06, |
| "loss": 0.5829, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.5853658536585367, |
| "grad_norm": 0.44342708002425985, |
| "learning_rate": 5e-06, |
| "loss": 0.5854, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.6027874564459932, |
| "grad_norm": 0.431545157360716, |
| "learning_rate": 5e-06, |
| "loss": 0.5589, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.6202090592334495, |
| "grad_norm": 0.4087850777420917, |
| "learning_rate": 5e-06, |
| "loss": 0.5705, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.6376306620209058, |
| "grad_norm": 0.4329398638465802, |
| "learning_rate": 5e-06, |
| "loss": 0.5722, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.6550522648083623, |
| "grad_norm": 0.4405392415792593, |
| "learning_rate": 5e-06, |
| "loss": 0.5652, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.6724738675958188, |
| "grad_norm": 0.5482919030958512, |
| "learning_rate": 5e-06, |
| "loss": 0.572, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.6898954703832754, |
| "grad_norm": 0.44133187245294253, |
| "learning_rate": 5e-06, |
| "loss": 0.5898, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.7073170731707317, |
| "grad_norm": 0.41643492065728077, |
| "learning_rate": 5e-06, |
| "loss": 0.5726, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.7247386759581882, |
| "grad_norm": 0.4628775707744871, |
| "learning_rate": 5e-06, |
| "loss": 0.5829, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.7421602787456445, |
| "grad_norm": 0.4508863482959889, |
| "learning_rate": 5e-06, |
| "loss": 0.5761, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.759581881533101, |
| "grad_norm": 0.4373059788947402, |
| "learning_rate": 5e-06, |
| "loss": 0.5651, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.7770034843205575, |
| "grad_norm": 0.5464086008522352, |
| "learning_rate": 5e-06, |
| "loss": 0.5883, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.794425087108014, |
| "grad_norm": 0.41246773347477955, |
| "learning_rate": 5e-06, |
| "loss": 0.5771, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.8118466898954704, |
| "grad_norm": 0.42532816074157515, |
| "learning_rate": 5e-06, |
| "loss": 0.5687, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.8292682926829267, |
| "grad_norm": 0.4345927236320441, |
| "learning_rate": 5e-06, |
| "loss": 0.5816, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.8466898954703832, |
| "grad_norm": 0.4120436308316067, |
| "learning_rate": 5e-06, |
| "loss": 0.5707, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.8641114982578397, |
| "grad_norm": 0.4618291184346104, |
| "learning_rate": 5e-06, |
| "loss": 0.5821, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.8815331010452963, |
| "grad_norm": 0.45243018945582253, |
| "learning_rate": 5e-06, |
| "loss": 0.5728, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.8989547038327528, |
| "grad_norm": 0.4269465047250204, |
| "learning_rate": 5e-06, |
| "loss": 0.5705, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.916376306620209, |
| "grad_norm": 0.43233458025624255, |
| "learning_rate": 5e-06, |
| "loss": 0.5855, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9337979094076654, |
| "grad_norm": 0.44005309425506883, |
| "learning_rate": 5e-06, |
| "loss": 0.5748, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.951219512195122, |
| "grad_norm": 0.40705597974047253, |
| "learning_rate": 5e-06, |
| "loss": 0.5728, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.9686411149825784, |
| "grad_norm": 0.4780753898940448, |
| "learning_rate": 5e-06, |
| "loss": 0.5733, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.986062717770035, |
| "grad_norm": 0.4830503319420354, |
| "learning_rate": 5e-06, |
| "loss": 0.5746, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.6078594326972961, |
| "eval_runtime": 102.4542, |
| "eval_samples_per_second": 150.897, |
| "eval_steps_per_second": 0.595, |
| "step": 1148 |
| }, |
| { |
| "epoch": 2.0034843205574915, |
| "grad_norm": 0.7175711328124225, |
| "learning_rate": 5e-06, |
| "loss": 0.5665, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.0209059233449476, |
| "grad_norm": 0.679439539516226, |
| "learning_rate": 5e-06, |
| "loss": 0.5218, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.038327526132404, |
| "grad_norm": 0.46244973212886337, |
| "learning_rate": 5e-06, |
| "loss": 0.5257, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.0557491289198606, |
| "grad_norm": 0.4714660556562506, |
| "learning_rate": 5e-06, |
| "loss": 0.5252, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.073170731707317, |
| "grad_norm": 0.4509460084000914, |
| "learning_rate": 5e-06, |
| "loss": 0.5262, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.0905923344947737, |
| "grad_norm": 0.5350365352949017, |
| "learning_rate": 5e-06, |
| "loss": 0.5284, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.10801393728223, |
| "grad_norm": 0.49646641648094414, |
| "learning_rate": 5e-06, |
| "loss": 0.5195, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.1254355400696863, |
| "grad_norm": 0.4528849214196566, |
| "learning_rate": 5e-06, |
| "loss": 0.5223, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 0.4479458966165427, |
| "learning_rate": 5e-06, |
| "loss": 0.5198, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.1602787456445993, |
| "grad_norm": 0.43520596575481335, |
| "learning_rate": 5e-06, |
| "loss": 0.535, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.177700348432056, |
| "grad_norm": 0.47995514229561104, |
| "learning_rate": 5e-06, |
| "loss": 0.5161, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.1951219512195124, |
| "grad_norm": 0.43290006578859574, |
| "learning_rate": 5e-06, |
| "loss": 0.5275, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.2125435540069684, |
| "grad_norm": 0.4379349897015291, |
| "learning_rate": 5e-06, |
| "loss": 0.5224, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.229965156794425, |
| "grad_norm": 0.5361414369409284, |
| "learning_rate": 5e-06, |
| "loss": 0.5307, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.2473867595818815, |
| "grad_norm": 0.45014532910893845, |
| "learning_rate": 5e-06, |
| "loss": 0.531, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.264808362369338, |
| "grad_norm": 0.46972835545493746, |
| "learning_rate": 5e-06, |
| "loss": 0.5253, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.2822299651567945, |
| "grad_norm": 0.5266781012889059, |
| "learning_rate": 5e-06, |
| "loss": 0.5374, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.2996515679442506, |
| "grad_norm": 0.49405584000739605, |
| "learning_rate": 5e-06, |
| "loss": 0.5363, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.317073170731707, |
| "grad_norm": 0.5358650684856928, |
| "learning_rate": 5e-06, |
| "loss": 0.5258, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.3344947735191637, |
| "grad_norm": 0.4930438995104039, |
| "learning_rate": 5e-06, |
| "loss": 0.5221, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.35191637630662, |
| "grad_norm": 0.48325846355244906, |
| "learning_rate": 5e-06, |
| "loss": 0.5247, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.3693379790940767, |
| "grad_norm": 0.4505148203988916, |
| "learning_rate": 5e-06, |
| "loss": 0.5391, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.3867595818815333, |
| "grad_norm": 0.5629302906655761, |
| "learning_rate": 5e-06, |
| "loss": 0.5241, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.40418118466899, |
| "grad_norm": 0.4980091419887635, |
| "learning_rate": 5e-06, |
| "loss": 0.5327, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.421602787456446, |
| "grad_norm": 0.46042746236588616, |
| "learning_rate": 5e-06, |
| "loss": 0.527, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.4390243902439024, |
| "grad_norm": 0.4667347118399181, |
| "learning_rate": 5e-06, |
| "loss": 0.5307, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.456445993031359, |
| "grad_norm": 0.48551794317447705, |
| "learning_rate": 5e-06, |
| "loss": 0.5393, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.4738675958188154, |
| "grad_norm": 0.5047118295651266, |
| "learning_rate": 5e-06, |
| "loss": 0.5337, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.491289198606272, |
| "grad_norm": 0.4947597029259197, |
| "learning_rate": 5e-06, |
| "loss": 0.5195, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.508710801393728, |
| "grad_norm": 0.44353375409132884, |
| "learning_rate": 5e-06, |
| "loss": 0.5167, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.5261324041811846, |
| "grad_norm": 0.47068505360221635, |
| "learning_rate": 5e-06, |
| "loss": 0.5207, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.543554006968641, |
| "grad_norm": 0.5219085859698269, |
| "learning_rate": 5e-06, |
| "loss": 0.5295, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.5609756097560976, |
| "grad_norm": 0.46936542375240276, |
| "learning_rate": 5e-06, |
| "loss": 0.5267, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.578397212543554, |
| "grad_norm": 0.47305680006884215, |
| "learning_rate": 5e-06, |
| "loss": 0.5374, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.59581881533101, |
| "grad_norm": 0.4974195900032511, |
| "learning_rate": 5e-06, |
| "loss": 0.53, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.6132404181184667, |
| "grad_norm": 0.49549272278932555, |
| "learning_rate": 5e-06, |
| "loss": 0.5271, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6306620209059233, |
| "grad_norm": 0.45848051857701816, |
| "learning_rate": 5e-06, |
| "loss": 0.5337, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.64808362369338, |
| "grad_norm": 0.46187090856288615, |
| "learning_rate": 5e-06, |
| "loss": 0.5327, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.6655052264808363, |
| "grad_norm": 0.46616993884920344, |
| "learning_rate": 5e-06, |
| "loss": 0.523, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.682926829268293, |
| "grad_norm": 0.5000992320205292, |
| "learning_rate": 5e-06, |
| "loss": 0.5253, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.7003484320557494, |
| "grad_norm": 0.4930605706870518, |
| "learning_rate": 5e-06, |
| "loss": 0.5348, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.7177700348432055, |
| "grad_norm": 0.4450804918183241, |
| "learning_rate": 5e-06, |
| "loss": 0.5245, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.735191637630662, |
| "grad_norm": 0.45277058932860653, |
| "learning_rate": 5e-06, |
| "loss": 0.5283, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.7526132404181185, |
| "grad_norm": 0.4707554219475042, |
| "learning_rate": 5e-06, |
| "loss": 0.5314, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.770034843205575, |
| "grad_norm": 0.4904087908466357, |
| "learning_rate": 5e-06, |
| "loss": 0.5245, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.7874564459930316, |
| "grad_norm": 0.45719978404615347, |
| "learning_rate": 5e-06, |
| "loss": 0.5301, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8048780487804876, |
| "grad_norm": 0.4793529886533003, |
| "learning_rate": 5e-06, |
| "loss": 0.5291, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.822299651567944, |
| "grad_norm": 0.526158778246725, |
| "learning_rate": 5e-06, |
| "loss": 0.5221, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.8397212543554007, |
| "grad_norm": 0.44007289003001554, |
| "learning_rate": 5e-06, |
| "loss": 0.5329, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.49203142798367927, |
| "learning_rate": 5e-06, |
| "loss": 0.5303, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.8745644599303137, |
| "grad_norm": 0.4352645604564786, |
| "learning_rate": 5e-06, |
| "loss": 0.5188, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.89198606271777, |
| "grad_norm": 0.5003548253398568, |
| "learning_rate": 5e-06, |
| "loss": 0.5392, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.9094076655052263, |
| "grad_norm": 0.43054886136800163, |
| "learning_rate": 5e-06, |
| "loss": 0.5348, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.926829268292683, |
| "grad_norm": 0.433053259111073, |
| "learning_rate": 5e-06, |
| "loss": 0.5314, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.9442508710801394, |
| "grad_norm": 0.47725032683099916, |
| "learning_rate": 5e-06, |
| "loss": 0.53, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.961672473867596, |
| "grad_norm": 0.4287797652157984, |
| "learning_rate": 5e-06, |
| "loss": 0.5177, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.979094076655052, |
| "grad_norm": 0.4999809691385612, |
| "learning_rate": 5e-06, |
| "loss": 0.5275, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.996515679442509, |
| "grad_norm": 0.48674503368325667, |
| "learning_rate": 5e-06, |
| "loss": 0.54, |
| "step": 1720 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.6087974905967712, |
| "eval_runtime": 100.2232, |
| "eval_samples_per_second": 154.256, |
| "eval_steps_per_second": 0.609, |
| "step": 1722 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 1722, |
| "total_flos": 2883576618024960.0, |
| "train_loss": 0.5818286676232408, |
| "train_runtime": 14902.5092, |
| "train_samples_per_second": 59.131, |
| "train_steps_per_second": 0.116 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1722, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2883576618024960.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|