{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1293,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02320185614849188,
      "grad_norm": 12.88911742024848,
      "learning_rate": 5e-06,
      "loss": 1.0604,
      "step": 10
    },
    {
      "epoch": 0.04640371229698376,
      "grad_norm": 1.2750125629212934,
      "learning_rate": 5e-06,
      "loss": 0.9197,
      "step": 20
    },
    {
      "epoch": 0.06960556844547564,
      "grad_norm": 1.3405185935171573,
      "learning_rate": 5e-06,
      "loss": 0.8724,
      "step": 30
    },
    {
      "epoch": 0.09280742459396751,
      "grad_norm": 1.5959997640606394,
      "learning_rate": 5e-06,
      "loss": 0.848,
      "step": 40
    },
    {
      "epoch": 0.11600928074245939,
      "grad_norm": 1.1755361886660722,
      "learning_rate": 5e-06,
      "loss": 0.8298,
      "step": 50
    },
    {
      "epoch": 0.13921113689095127,
      "grad_norm": 1.058737549762726,
      "learning_rate": 5e-06,
      "loss": 0.8127,
      "step": 60
    },
    {
      "epoch": 0.16241299303944315,
      "grad_norm": 1.3975177955215072,
      "learning_rate": 5e-06,
      "loss": 0.8069,
      "step": 70
    },
    {
      "epoch": 0.18561484918793503,
      "grad_norm": 1.000120781135683,
      "learning_rate": 5e-06,
      "loss": 0.7916,
      "step": 80
    },
    {
      "epoch": 0.2088167053364269,
      "grad_norm": 1.1104739022155425,
      "learning_rate": 5e-06,
      "loss": 0.786,
      "step": 90
    },
    {
      "epoch": 0.23201856148491878,
      "grad_norm": 1.0148782387264679,
      "learning_rate": 5e-06,
      "loss": 0.7879,
      "step": 100
    },
    {
      "epoch": 0.2552204176334107,
      "grad_norm": 0.6172239836313566,
      "learning_rate": 5e-06,
      "loss": 0.7749,
      "step": 110
    },
    {
      "epoch": 0.27842227378190254,
      "grad_norm": 0.6603746714841605,
      "learning_rate": 5e-06,
      "loss": 0.774,
      "step": 120
    },
    {
      "epoch": 0.30162412993039445,
      "grad_norm": 0.9671070353476017,
      "learning_rate": 5e-06,
      "loss": 0.7724,
      "step": 130
    },
    {
      "epoch": 0.3248259860788863,
      "grad_norm": 0.8187942051333351,
      "learning_rate": 5e-06,
      "loss": 0.7708,
      "step": 140
    },
    {
      "epoch": 0.3480278422273782,
      "grad_norm": 1.3610378926806153,
      "learning_rate": 5e-06,
      "loss": 0.7694,
      "step": 150
    },
    {
      "epoch": 0.37122969837587005,
      "grad_norm": 1.553376896488753,
      "learning_rate": 5e-06,
      "loss": 0.7629,
      "step": 160
    },
    {
      "epoch": 0.39443155452436196,
      "grad_norm": 0.7372808387717105,
      "learning_rate": 5e-06,
      "loss": 0.7592,
      "step": 170
    },
    {
      "epoch": 0.4176334106728538,
      "grad_norm": 0.7999800308605823,
      "learning_rate": 5e-06,
      "loss": 0.761,
      "step": 180
    },
    {
      "epoch": 0.4408352668213457,
      "grad_norm": 0.6456605845949898,
      "learning_rate": 5e-06,
      "loss": 0.7529,
      "step": 190
    },
    {
      "epoch": 0.46403712296983757,
      "grad_norm": 0.6857944616690483,
      "learning_rate": 5e-06,
      "loss": 0.7542,
      "step": 200
    },
    {
      "epoch": 0.4872389791183295,
      "grad_norm": 0.7399534311272358,
      "learning_rate": 5e-06,
      "loss": 0.7567,
      "step": 210
    },
    {
      "epoch": 0.5104408352668214,
      "grad_norm": 0.8146601801797138,
      "learning_rate": 5e-06,
      "loss": 0.7569,
      "step": 220
    },
    {
      "epoch": 0.5336426914153132,
      "grad_norm": 0.7170940501236818,
      "learning_rate": 5e-06,
      "loss": 0.7567,
      "step": 230
    },
    {
      "epoch": 0.5568445475638051,
      "grad_norm": 0.7145338686681713,
      "learning_rate": 5e-06,
      "loss": 0.7537,
      "step": 240
    },
    {
      "epoch": 0.580046403712297,
      "grad_norm": 0.6297383464994889,
      "learning_rate": 5e-06,
      "loss": 0.7505,
      "step": 250
    },
    {
      "epoch": 0.6032482598607889,
      "grad_norm": 0.8359250648408794,
      "learning_rate": 5e-06,
      "loss": 0.7483,
      "step": 260
    },
    {
      "epoch": 0.6264501160092807,
      "grad_norm": 0.8559118062066237,
      "learning_rate": 5e-06,
      "loss": 0.7471,
      "step": 270
    },
    {
      "epoch": 0.6496519721577726,
      "grad_norm": 0.8004192504745514,
      "learning_rate": 5e-06,
      "loss": 0.7452,
      "step": 280
    },
    {
      "epoch": 0.6728538283062645,
      "grad_norm": 0.8636411680464042,
      "learning_rate": 5e-06,
      "loss": 0.748,
      "step": 290
    },
    {
      "epoch": 0.6960556844547564,
      "grad_norm": 0.6021818739523073,
      "learning_rate": 5e-06,
      "loss": 0.7447,
      "step": 300
    },
    {
      "epoch": 0.7192575406032483,
      "grad_norm": 0.8518355169658604,
      "learning_rate": 5e-06,
      "loss": 0.7447,
      "step": 310
    },
    {
      "epoch": 0.7424593967517401,
      "grad_norm": 0.5675681353459878,
      "learning_rate": 5e-06,
      "loss": 0.7425,
      "step": 320
    },
    {
      "epoch": 0.765661252900232,
      "grad_norm": 0.5949964123605757,
      "learning_rate": 5e-06,
      "loss": 0.7365,
      "step": 330
    },
    {
      "epoch": 0.7888631090487239,
      "grad_norm": 0.7773659953313287,
      "learning_rate": 5e-06,
      "loss": 0.7431,
      "step": 340
    },
    {
      "epoch": 0.8120649651972158,
      "grad_norm": 0.6466434386338613,
      "learning_rate": 5e-06,
      "loss": 0.7409,
      "step": 350
    },
    {
      "epoch": 0.8352668213457076,
      "grad_norm": 0.603002279117198,
      "learning_rate": 5e-06,
      "loss": 0.741,
      "step": 360
    },
    {
      "epoch": 0.8584686774941995,
      "grad_norm": 0.6532864546899737,
      "learning_rate": 5e-06,
      "loss": 0.7387,
      "step": 370
    },
    {
      "epoch": 0.8816705336426914,
      "grad_norm": 0.6733105605197987,
      "learning_rate": 5e-06,
      "loss": 0.74,
      "step": 380
    },
    {
      "epoch": 0.9048723897911833,
      "grad_norm": 0.633076130519661,
      "learning_rate": 5e-06,
      "loss": 0.7363,
      "step": 390
    },
    {
      "epoch": 0.9280742459396751,
      "grad_norm": 0.7237029479086221,
      "learning_rate": 5e-06,
      "loss": 0.7375,
      "step": 400
    },
    {
      "epoch": 0.951276102088167,
      "grad_norm": 0.6675220876656223,
      "learning_rate": 5e-06,
      "loss": 0.7352,
      "step": 410
    },
    {
      "epoch": 0.974477958236659,
      "grad_norm": 0.6256898511266926,
      "learning_rate": 5e-06,
      "loss": 0.7377,
      "step": 420
    },
    {
      "epoch": 0.9976798143851509,
      "grad_norm": 0.8498153430162059,
      "learning_rate": 5e-06,
      "loss": 0.735,
      "step": 430
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7354981899261475,
      "eval_runtime": 43.1952,
      "eval_samples_per_second": 268.34,
      "eval_steps_per_second": 1.065,
      "step": 431
    },
    {
      "epoch": 1.0208816705336428,
      "grad_norm": 0.8111056050061451,
      "learning_rate": 5e-06,
      "loss": 0.6896,
      "step": 440
    },
    {
      "epoch": 1.0440835266821347,
      "grad_norm": 0.7762771093311698,
      "learning_rate": 5e-06,
      "loss": 0.6869,
      "step": 450
    },
    {
      "epoch": 1.0672853828306264,
      "grad_norm": 0.6742008198606638,
      "learning_rate": 5e-06,
      "loss": 0.6832,
      "step": 460
    },
    {
      "epoch": 1.0904872389791183,
      "grad_norm": 0.5903757898819316,
      "learning_rate": 5e-06,
      "loss": 0.686,
      "step": 470
    },
    {
      "epoch": 1.1136890951276102,
      "grad_norm": 0.8524218992437677,
      "learning_rate": 5e-06,
      "loss": 0.6897,
      "step": 480
    },
    {
      "epoch": 1.136890951276102,
      "grad_norm": 0.5987795742946272,
      "learning_rate": 5e-06,
      "loss": 0.6855,
      "step": 490
    },
    {
      "epoch": 1.160092807424594,
      "grad_norm": 0.6220145776226531,
      "learning_rate": 5e-06,
      "loss": 0.6854,
      "step": 500
    },
    {
      "epoch": 1.1832946635730859,
      "grad_norm": 0.6598280088778764,
      "learning_rate": 5e-06,
      "loss": 0.6809,
      "step": 510
    },
    {
      "epoch": 1.2064965197215778,
      "grad_norm": 0.6184109214867479,
      "learning_rate": 5e-06,
      "loss": 0.6842,
      "step": 520
    },
    {
      "epoch": 1.2296983758700697,
      "grad_norm": 0.6848903033397736,
      "learning_rate": 5e-06,
      "loss": 0.6841,
      "step": 530
    },
    {
      "epoch": 1.2529002320185616,
      "grad_norm": 0.8158383274767669,
      "learning_rate": 5e-06,
      "loss": 0.689,
      "step": 540
    },
    {
      "epoch": 1.2761020881670533,
      "grad_norm": 0.8760027682177647,
      "learning_rate": 5e-06,
      "loss": 0.6827,
      "step": 550
    },
    {
      "epoch": 1.2993039443155452,
      "grad_norm": 0.6235083449781156,
      "learning_rate": 5e-06,
      "loss": 0.6813,
      "step": 560
    },
    {
      "epoch": 1.322505800464037,
      "grad_norm": 0.6241280762142453,
      "learning_rate": 5e-06,
      "loss": 0.6845,
      "step": 570
    },
    {
      "epoch": 1.345707656612529,
      "grad_norm": 0.6154058211971913,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 580
    },
    {
      "epoch": 1.368909512761021,
      "grad_norm": 0.7296694454747678,
      "learning_rate": 5e-06,
      "loss": 0.685,
      "step": 590
    },
    {
      "epoch": 1.3921113689095128,
      "grad_norm": 0.5944003900425753,
      "learning_rate": 5e-06,
      "loss": 0.6864,
      "step": 600
    },
    {
      "epoch": 1.4153132250580047,
      "grad_norm": 0.5695025126515694,
      "learning_rate": 5e-06,
      "loss": 0.685,
      "step": 610
    },
    {
      "epoch": 1.4385150812064964,
      "grad_norm": 0.6421383889585768,
      "learning_rate": 5e-06,
      "loss": 0.6872,
      "step": 620
    },
    {
      "epoch": 1.4617169373549883,
      "grad_norm": 0.6858640435731316,
      "learning_rate": 5e-06,
      "loss": 0.6864,
      "step": 630
    },
    {
      "epoch": 1.4849187935034802,
      "grad_norm": 0.7119910556237473,
      "learning_rate": 5e-06,
      "loss": 0.6863,
      "step": 640
    },
    {
      "epoch": 1.5081206496519721,
      "grad_norm": 0.8350390704982291,
      "learning_rate": 5e-06,
      "loss": 0.6855,
      "step": 650
    },
    {
      "epoch": 1.531322505800464,
      "grad_norm": 0.7123547269087491,
      "learning_rate": 5e-06,
      "loss": 0.6915,
      "step": 660
    },
    {
      "epoch": 1.554524361948956,
      "grad_norm": 0.8015995455407697,
      "learning_rate": 5e-06,
      "loss": 0.6886,
      "step": 670
    },
    {
      "epoch": 1.5777262180974478,
      "grad_norm": 0.6050445463416455,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 680
    },
    {
      "epoch": 1.6009280742459397,
      "grad_norm": 0.7998010508831949,
      "learning_rate": 5e-06,
      "loss": 0.6846,
      "step": 690
    },
    {
      "epoch": 1.6241299303944317,
      "grad_norm": 0.8702101163375846,
      "learning_rate": 5e-06,
      "loss": 0.6828,
      "step": 700
    },
    {
      "epoch": 1.6473317865429236,
      "grad_norm": 0.797904640236941,
      "learning_rate": 5e-06,
      "loss": 0.6873,
      "step": 710
    },
    {
      "epoch": 1.6705336426914155,
      "grad_norm": 0.631342458953997,
      "learning_rate": 5e-06,
      "loss": 0.6841,
      "step": 720
    },
    {
      "epoch": 1.6937354988399071,
      "grad_norm": 0.5919876881033387,
      "learning_rate": 5e-06,
      "loss": 0.684,
      "step": 730
    },
    {
      "epoch": 1.716937354988399,
      "grad_norm": 0.6446049779707106,
      "learning_rate": 5e-06,
      "loss": 0.6817,
      "step": 740
    },
    {
      "epoch": 1.740139211136891,
      "grad_norm": 0.6291781282636879,
      "learning_rate": 5e-06,
      "loss": 0.6829,
      "step": 750
    },
    {
      "epoch": 1.7633410672853829,
      "grad_norm": 0.6031629391085,
      "learning_rate": 5e-06,
      "loss": 0.6865,
      "step": 760
    },
    {
      "epoch": 1.7865429234338746,
      "grad_norm": 0.6967045010922567,
      "learning_rate": 5e-06,
      "loss": 0.6851,
      "step": 770
    },
    {
      "epoch": 1.8097447795823665,
      "grad_norm": 0.6809250999293309,
      "learning_rate": 5e-06,
      "loss": 0.6793,
      "step": 780
    },
    {
      "epoch": 1.8329466357308584,
      "grad_norm": 0.6189111190147917,
      "learning_rate": 5e-06,
      "loss": 0.684,
      "step": 790
    },
    {
      "epoch": 1.8561484918793503,
      "grad_norm": 0.7950062264818519,
      "learning_rate": 5e-06,
      "loss": 0.6859,
      "step": 800
    },
    {
      "epoch": 1.8793503480278422,
      "grad_norm": 0.552766584160325,
      "learning_rate": 5e-06,
      "loss": 0.6771,
      "step": 810
    },
    {
      "epoch": 1.902552204176334,
      "grad_norm": 0.595582468722862,
      "learning_rate": 5e-06,
      "loss": 0.6792,
      "step": 820
    },
    {
      "epoch": 1.925754060324826,
      "grad_norm": 0.6553730568575968,
      "learning_rate": 5e-06,
      "loss": 0.6801,
      "step": 830
    },
    {
      "epoch": 1.948955916473318,
      "grad_norm": 0.6422758262902983,
      "learning_rate": 5e-06,
      "loss": 0.6818,
      "step": 840
    },
    {
      "epoch": 1.9721577726218098,
      "grad_norm": 0.6469531756231099,
      "learning_rate": 5e-06,
      "loss": 0.6791,
      "step": 850
    },
    {
      "epoch": 1.9953596287703017,
      "grad_norm": 0.8601617789977748,
      "learning_rate": 5e-06,
      "loss": 0.6847,
      "step": 860
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.7253859043121338,
      "eval_runtime": 44.8365,
      "eval_samples_per_second": 258.517,
      "eval_steps_per_second": 1.026,
      "step": 862
    },
    {
      "epoch": 2.0185614849187936,
      "grad_norm": 0.8332415773499794,
      "learning_rate": 5e-06,
      "loss": 0.6349,
      "step": 870
    },
    {
      "epoch": 2.0417633410672855,
      "grad_norm": 0.7926156467000511,
      "learning_rate": 5e-06,
      "loss": 0.6281,
      "step": 880
    },
    {
      "epoch": 2.0649651972157774,
      "grad_norm": 0.8228551865240357,
      "learning_rate": 5e-06,
      "loss": 0.6278,
      "step": 890
    },
    {
      "epoch": 2.0881670533642693,
      "grad_norm": 0.7064748007097121,
      "learning_rate": 5e-06,
      "loss": 0.6246,
      "step": 900
    },
    {
      "epoch": 2.111368909512761,
      "grad_norm": 0.7925930702513274,
      "learning_rate": 5e-06,
      "loss": 0.6319,
      "step": 910
    },
    {
      "epoch": 2.1345707656612527,
      "grad_norm": 0.7580312575113166,
      "learning_rate": 5e-06,
      "loss": 0.6345,
      "step": 920
    },
    {
      "epoch": 2.1577726218097446,
      "grad_norm": 0.8223784752811615,
      "learning_rate": 5e-06,
      "loss": 0.6287,
      "step": 930
    },
    {
      "epoch": 2.1809744779582365,
      "grad_norm": 0.6801723430553506,
      "learning_rate": 5e-06,
      "loss": 0.6345,
      "step": 940
    },
    {
      "epoch": 2.2041763341067284,
      "grad_norm": 0.6104725779187484,
      "learning_rate": 5e-06,
      "loss": 0.6332,
      "step": 950
    },
    {
      "epoch": 2.2273781902552203,
      "grad_norm": 1.005789926503314,
      "learning_rate": 5e-06,
      "loss": 0.6316,
      "step": 960
    },
    {
      "epoch": 2.2505800464037122,
      "grad_norm": 0.7442488963065897,
      "learning_rate": 5e-06,
      "loss": 0.6359,
      "step": 970
    },
    {
      "epoch": 2.273781902552204,
      "grad_norm": 0.6635661944450735,
      "learning_rate": 5e-06,
      "loss": 0.6318,
      "step": 980
    },
    {
      "epoch": 2.296983758700696,
      "grad_norm": 0.6303564812431357,
      "learning_rate": 5e-06,
      "loss": 0.6337,
      "step": 990
    },
    {
      "epoch": 2.320185614849188,
      "grad_norm": 0.6850690465216134,
      "learning_rate": 5e-06,
      "loss": 0.6331,
      "step": 1000
    },
    {
      "epoch": 2.34338747099768,
      "grad_norm": 0.6704436455876167,
      "learning_rate": 5e-06,
      "loss": 0.6332,
      "step": 1010
    },
    {
      "epoch": 2.3665893271461718,
      "grad_norm": 0.6442885483242252,
      "learning_rate": 5e-06,
      "loss": 0.6352,
      "step": 1020
    },
    {
      "epoch": 2.3897911832946637,
      "grad_norm": 0.6247270790520427,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 1030
    },
    {
      "epoch": 2.4129930394431556,
      "grad_norm": 0.6488764369214974,
      "learning_rate": 5e-06,
      "loss": 0.6333,
      "step": 1040
    },
    {
      "epoch": 2.4361948955916475,
      "grad_norm": 0.7457367188839334,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 1050
    },
    {
      "epoch": 2.4593967517401394,
      "grad_norm": 0.6794824677100129,
      "learning_rate": 5e-06,
      "loss": 0.6376,
      "step": 1060
    },
    {
      "epoch": 2.4825986078886313,
      "grad_norm": 0.7964782872185094,
      "learning_rate": 5e-06,
      "loss": 0.6387,
      "step": 1070
    },
    {
      "epoch": 2.505800464037123,
      "grad_norm": 0.7107734542729047,
      "learning_rate": 5e-06,
      "loss": 0.636,
      "step": 1080
    },
    {
      "epoch": 2.529002320185615,
      "grad_norm": 0.6118818323350269,
      "learning_rate": 5e-06,
      "loss": 0.6378,
      "step": 1090
    },
    {
      "epoch": 2.5522041763341066,
      "grad_norm": 0.6792915882704795,
      "learning_rate": 5e-06,
      "loss": 0.6401,
      "step": 1100
    },
    {
      "epoch": 2.5754060324825985,
      "grad_norm": 0.5791884637384154,
      "learning_rate": 5e-06,
      "loss": 0.6348,
      "step": 1110
    },
    {
      "epoch": 2.5986078886310904,
      "grad_norm": 0.65354154094048,
      "learning_rate": 5e-06,
      "loss": 0.6356,
      "step": 1120
    },
    {
      "epoch": 2.6218097447795823,
      "grad_norm": 0.7784156660236978,
      "learning_rate": 5e-06,
      "loss": 0.6385,
      "step": 1130
    },
    {
      "epoch": 2.645011600928074,
      "grad_norm": 0.6148881616275584,
      "learning_rate": 5e-06,
      "loss": 0.6368,
      "step": 1140
    },
    {
      "epoch": 2.668213457076566,
      "grad_norm": 0.9309313128658645,
      "learning_rate": 5e-06,
      "loss": 0.6341,
      "step": 1150
    },
    {
      "epoch": 2.691415313225058,
      "grad_norm": 0.6998580652650385,
      "learning_rate": 5e-06,
      "loss": 0.6383,
      "step": 1160
    },
    {
      "epoch": 2.71461716937355,
      "grad_norm": 0.6470112662348189,
      "learning_rate": 5e-06,
      "loss": 0.6345,
      "step": 1170
    },
    {
      "epoch": 2.737819025522042,
      "grad_norm": 0.693281848491071,
      "learning_rate": 5e-06,
      "loss": 0.6373,
      "step": 1180
    },
    {
      "epoch": 2.7610208816705337,
      "grad_norm": 0.718979849412995,
      "learning_rate": 5e-06,
      "loss": 0.6389,
      "step": 1190
    },
    {
      "epoch": 2.7842227378190256,
      "grad_norm": 0.6226393842218627,
      "learning_rate": 5e-06,
      "loss": 0.6323,
      "step": 1200
    },
    {
      "epoch": 2.8074245939675175,
      "grad_norm": 0.6041906661727968,
      "learning_rate": 5e-06,
      "loss": 0.6411,
      "step": 1210
    },
    {
      "epoch": 2.8306264501160094,
      "grad_norm": 0.6212934149482867,
      "learning_rate": 5e-06,
      "loss": 0.6376,
      "step": 1220
    },
    {
      "epoch": 2.853828306264501,
      "grad_norm": 0.6648887269433067,
      "learning_rate": 5e-06,
      "loss": 0.6391,
      "step": 1230
    },
    {
      "epoch": 2.877030162412993,
      "grad_norm": 0.5811847662970155,
      "learning_rate": 5e-06,
      "loss": 0.6379,
      "step": 1240
    },
    {
      "epoch": 2.9002320185614847,
      "grad_norm": 0.8904746056337586,
      "learning_rate": 5e-06,
      "loss": 0.6397,
      "step": 1250
    },
    {
      "epoch": 2.9234338747099766,
      "grad_norm": 0.6627485975413968,
      "learning_rate": 5e-06,
      "loss": 0.6343,
      "step": 1260
    },
    {
      "epoch": 2.9466357308584685,
      "grad_norm": 0.5926997597872009,
      "learning_rate": 5e-06,
      "loss": 0.6358,
      "step": 1270
    },
    {
      "epoch": 2.9698375870069604,
      "grad_norm": 0.5956109668755322,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 1280
    },
    {
      "epoch": 2.9930394431554523,
      "grad_norm": 0.6391850750364162,
      "learning_rate": 5e-06,
      "loss": 0.6347,
      "step": 1290
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.7314478158950806,
      "eval_runtime": 42.305,
      "eval_samples_per_second": 273.986,
      "eval_steps_per_second": 1.087,
      "step": 1293
    },
    {
      "epoch": 3.0,
      "step": 1293,
      "total_flos": 2165823158353920.0,
      "train_loss": 0.6976413785812012,
      "train_runtime": 8439.309,
      "train_samples_per_second": 78.281,
      "train_steps_per_second": 0.153
    }
  ],
  "logging_steps": 10,
  "max_steps": 1293,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2165823158353920.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}