{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1293, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02320185614849188, "grad_norm": 12.88911742024848, "learning_rate": 5e-06, "loss": 1.0604, "step": 10 }, { "epoch": 0.04640371229698376, "grad_norm": 1.2750125629212934, "learning_rate": 5e-06, "loss": 0.9197, "step": 20 }, { "epoch": 0.06960556844547564, "grad_norm": 1.3405185935171573, "learning_rate": 5e-06, "loss": 0.8724, "step": 30 }, { "epoch": 0.09280742459396751, "grad_norm": 1.5959997640606394, "learning_rate": 5e-06, "loss": 0.848, "step": 40 }, { "epoch": 0.11600928074245939, "grad_norm": 1.1755361886660722, "learning_rate": 5e-06, "loss": 0.8298, "step": 50 }, { "epoch": 0.13921113689095127, "grad_norm": 1.058737549762726, "learning_rate": 5e-06, "loss": 0.8127, "step": 60 }, { "epoch": 0.16241299303944315, "grad_norm": 1.3975177955215072, "learning_rate": 5e-06, "loss": 0.8069, "step": 70 }, { "epoch": 0.18561484918793503, "grad_norm": 1.000120781135683, "learning_rate": 5e-06, "loss": 0.7916, "step": 80 }, { "epoch": 0.2088167053364269, "grad_norm": 1.1104739022155425, "learning_rate": 5e-06, "loss": 0.786, "step": 90 }, { "epoch": 0.23201856148491878, "grad_norm": 1.0148782387264679, "learning_rate": 5e-06, "loss": 0.7879, "step": 100 }, { "epoch": 0.2552204176334107, "grad_norm": 0.6172239836313566, "learning_rate": 5e-06, "loss": 0.7749, "step": 110 }, { "epoch": 0.27842227378190254, "grad_norm": 0.6603746714841605, "learning_rate": 5e-06, "loss": 0.774, "step": 120 }, { "epoch": 0.30162412993039445, "grad_norm": 0.9671070353476017, "learning_rate": 5e-06, "loss": 0.7724, "step": 130 }, { "epoch": 0.3248259860788863, "grad_norm": 0.8187942051333351, "learning_rate": 5e-06, "loss": 0.7708, "step": 140 }, { "epoch": 0.3480278422273782, "grad_norm": 1.3610378926806153, "learning_rate": 5e-06, "loss": 0.7694, "step": 150 }, { "epoch": 0.37122969837587005, "grad_norm": 1.553376896488753, "learning_rate": 5e-06, "loss": 0.7629, "step": 160 }, { "epoch": 0.39443155452436196, "grad_norm": 0.7372808387717105, "learning_rate": 5e-06, "loss": 0.7592, "step": 170 }, { "epoch": 0.4176334106728538, "grad_norm": 0.7999800308605823, "learning_rate": 5e-06, "loss": 0.761, "step": 180 }, { "epoch": 0.4408352668213457, "grad_norm": 0.6456605845949898, "learning_rate": 5e-06, "loss": 0.7529, "step": 190 }, { "epoch": 0.46403712296983757, "grad_norm": 0.6857944616690483, "learning_rate": 5e-06, "loss": 0.7542, "step": 200 }, { "epoch": 0.4872389791183295, "grad_norm": 0.7399534311272358, "learning_rate": 5e-06, "loss": 0.7567, "step": 210 }, { "epoch": 0.5104408352668214, "grad_norm": 0.8146601801797138, "learning_rate": 5e-06, "loss": 0.7569, "step": 220 }, { "epoch": 0.5336426914153132, "grad_norm": 0.7170940501236818, "learning_rate": 5e-06, "loss": 0.7567, "step": 230 }, { "epoch": 0.5568445475638051, "grad_norm": 0.7145338686681713, "learning_rate": 5e-06, "loss": 0.7537, "step": 240 }, { "epoch": 0.580046403712297, "grad_norm": 0.6297383464994889, "learning_rate": 5e-06, "loss": 0.7505, "step": 250 }, { "epoch": 0.6032482598607889, "grad_norm": 0.8359250648408794, "learning_rate": 5e-06, "loss": 0.7483, "step": 260 }, { "epoch": 0.6264501160092807, "grad_norm": 0.8559118062066237, "learning_rate": 5e-06, "loss": 0.7471, "step": 270 }, { "epoch": 0.6496519721577726, "grad_norm": 0.8004192504745514, "learning_rate": 5e-06, "loss": 0.7452, "step": 280 }, { "epoch": 0.6728538283062645, "grad_norm": 0.8636411680464042, "learning_rate": 5e-06, "loss": 0.748, "step": 290 }, { "epoch": 0.6960556844547564, "grad_norm": 0.6021818739523073, "learning_rate": 5e-06, "loss": 0.7447, "step": 300 }, { "epoch": 0.7192575406032483, "grad_norm": 0.8518355169658604, "learning_rate": 5e-06, "loss": 0.7447, "step": 310 }, { "epoch": 0.7424593967517401, "grad_norm": 0.5675681353459878, "learning_rate": 5e-06, "loss": 0.7425, "step": 320 }, { "epoch": 0.765661252900232, "grad_norm": 0.5949964123605757, "learning_rate": 5e-06, "loss": 0.7365, "step": 330 }, { "epoch": 0.7888631090487239, "grad_norm": 0.7773659953313287, "learning_rate": 5e-06, "loss": 0.7431, "step": 340 }, { "epoch": 0.8120649651972158, "grad_norm": 0.6466434386338613, "learning_rate": 5e-06, "loss": 0.7409, "step": 350 }, { "epoch": 0.8352668213457076, "grad_norm": 0.603002279117198, "learning_rate": 5e-06, "loss": 0.741, "step": 360 }, { "epoch": 0.8584686774941995, "grad_norm": 0.6532864546899737, "learning_rate": 5e-06, "loss": 0.7387, "step": 370 }, { "epoch": 0.8816705336426914, "grad_norm": 0.6733105605197987, "learning_rate": 5e-06, "loss": 0.74, "step": 380 }, { "epoch": 0.9048723897911833, "grad_norm": 0.633076130519661, "learning_rate": 5e-06, "loss": 0.7363, "step": 390 }, { "epoch": 0.9280742459396751, "grad_norm": 0.7237029479086221, "learning_rate": 5e-06, "loss": 0.7375, "step": 400 }, { "epoch": 0.951276102088167, "grad_norm": 0.6675220876656223, "learning_rate": 5e-06, "loss": 0.7352, "step": 410 }, { "epoch": 0.974477958236659, "grad_norm": 0.6256898511266926, "learning_rate": 5e-06, "loss": 0.7377, "step": 420 }, { "epoch": 0.9976798143851509, "grad_norm": 0.8498153430162059, "learning_rate": 5e-06, "loss": 0.735, "step": 430 }, { "epoch": 1.0, "eval_loss": 0.7354981899261475, "eval_runtime": 43.1952, "eval_samples_per_second": 268.34, "eval_steps_per_second": 1.065, "step": 431 }, { "epoch": 1.0208816705336428, "grad_norm": 0.8111056050061451, "learning_rate": 5e-06, "loss": 0.6896, "step": 440 }, { "epoch": 1.0440835266821347, "grad_norm": 0.7762771093311698, "learning_rate": 5e-06, "loss": 0.6869, "step": 450 }, { "epoch": 1.0672853828306264, "grad_norm": 0.6742008198606638, "learning_rate": 5e-06, "loss": 0.6832, "step": 460 }, { "epoch": 1.0904872389791183, "grad_norm": 0.5903757898819316, "learning_rate": 5e-06, "loss": 0.686, "step": 470 }, { "epoch": 1.1136890951276102, "grad_norm": 0.8524218992437677, "learning_rate": 5e-06, "loss": 0.6897, "step": 480 }, { "epoch": 1.136890951276102, "grad_norm": 0.5987795742946272, "learning_rate": 5e-06, "loss": 0.6855, "step": 490 }, { "epoch": 1.160092807424594, "grad_norm": 0.6220145776226531, "learning_rate": 5e-06, "loss": 0.6854, "step": 500 }, { "epoch": 1.1832946635730859, "grad_norm": 0.6598280088778764, "learning_rate": 5e-06, "loss": 0.6809, "step": 510 }, { "epoch": 1.2064965197215778, "grad_norm": 0.6184109214867479, "learning_rate": 5e-06, "loss": 0.6842, "step": 520 }, { "epoch": 1.2296983758700697, "grad_norm": 0.6848903033397736, "learning_rate": 5e-06, "loss": 0.6841, "step": 530 }, { "epoch": 1.2529002320185616, "grad_norm": 0.8158383274767669, "learning_rate": 5e-06, "loss": 0.689, "step": 540 }, { "epoch": 1.2761020881670533, "grad_norm": 0.8760027682177647, "learning_rate": 5e-06, "loss": 0.6827, "step": 550 }, { "epoch": 1.2993039443155452, "grad_norm": 0.6235083449781156, "learning_rate": 5e-06, "loss": 0.6813, "step": 560 }, { "epoch": 1.322505800464037, "grad_norm": 0.6241280762142453, "learning_rate": 5e-06, "loss": 0.6845, "step": 570 }, { "epoch": 1.345707656612529, "grad_norm": 0.6154058211971913, "learning_rate": 5e-06, "loss": 0.6871, "step": 580 }, { "epoch": 1.368909512761021, "grad_norm": 0.7296694454747678, "learning_rate": 5e-06, "loss": 0.685, "step": 590 }, { "epoch": 1.3921113689095128, "grad_norm": 0.5944003900425753, "learning_rate": 5e-06, "loss": 0.6864, "step": 600 }, { "epoch": 1.4153132250580047, "grad_norm": 0.5695025126515694, "learning_rate": 5e-06, "loss": 0.685, "step": 610 }, { "epoch": 1.4385150812064964, "grad_norm": 0.6421383889585768, "learning_rate": 5e-06, "loss": 0.6872, "step": 620 }, { "epoch": 1.4617169373549883, "grad_norm": 0.6858640435731316, "learning_rate": 5e-06, "loss": 0.6864, "step": 630 }, { "epoch": 1.4849187935034802, "grad_norm": 0.7119910556237473, "learning_rate": 5e-06, "loss": 0.6863, "step": 640 }, { "epoch": 1.5081206496519721, "grad_norm": 0.8350390704982291, "learning_rate": 5e-06, "loss": 0.6855, "step": 650 }, { "epoch": 1.531322505800464, "grad_norm": 0.7123547269087491, "learning_rate": 5e-06, "loss": 0.6915, "step": 660 }, { "epoch": 1.554524361948956, "grad_norm": 0.8015995455407697, "learning_rate": 5e-06, "loss": 0.6886, "step": 670 }, { "epoch": 1.5777262180974478, "grad_norm": 0.6050445463416455, "learning_rate": 5e-06, "loss": 0.6856, "step": 680 }, { "epoch": 1.6009280742459397, "grad_norm": 0.7998010508831949, "learning_rate": 5e-06, "loss": 0.6846, "step": 690 }, { "epoch": 1.6241299303944317, "grad_norm": 0.8702101163375846, "learning_rate": 5e-06, "loss": 0.6828, "step": 700 }, { "epoch": 1.6473317865429236, "grad_norm": 0.797904640236941, "learning_rate": 5e-06, "loss": 0.6873, "step": 710 }, { "epoch": 1.6705336426914155, "grad_norm": 0.631342458953997, "learning_rate": 5e-06, "loss": 0.6841, "step": 720 }, { "epoch": 1.6937354988399071, "grad_norm": 0.5919876881033387, "learning_rate": 5e-06, "loss": 0.684, "step": 730 }, { "epoch": 1.716937354988399, "grad_norm": 0.6446049779707106, "learning_rate": 5e-06, "loss": 0.6817, "step": 740 }, { "epoch": 1.740139211136891, "grad_norm": 0.6291781282636879, "learning_rate": 5e-06, "loss": 0.6829, "step": 750 }, { "epoch": 1.7633410672853829, "grad_norm": 0.6031629391085, "learning_rate": 5e-06, "loss": 0.6865, "step": 760 }, { "epoch": 1.7865429234338746, "grad_norm": 0.6967045010922567, "learning_rate": 5e-06, "loss": 0.6851, "step": 770 }, { "epoch": 1.8097447795823665, "grad_norm": 0.6809250999293309, "learning_rate": 5e-06, "loss": 0.6793, "step": 780 }, { "epoch": 1.8329466357308584, "grad_norm": 0.6189111190147917, "learning_rate": 5e-06, "loss": 0.684, "step": 790 }, { "epoch": 1.8561484918793503, "grad_norm": 0.7950062264818519, "learning_rate": 5e-06, "loss": 0.6859, "step": 800 }, { "epoch": 1.8793503480278422, "grad_norm": 0.552766584160325, "learning_rate": 5e-06, "loss": 0.6771, "step": 810 }, { "epoch": 1.902552204176334, "grad_norm": 0.595582468722862, "learning_rate": 5e-06, "loss": 0.6792, "step": 820 }, { "epoch": 1.925754060324826, "grad_norm": 0.6553730568575968, "learning_rate": 5e-06, "loss": 0.6801, "step": 830 }, { "epoch": 1.948955916473318, "grad_norm": 0.6422758262902983, "learning_rate": 5e-06, "loss": 0.6818, "step": 840 }, { "epoch": 1.9721577726218098, "grad_norm": 0.6469531756231099, "learning_rate": 5e-06, "loss": 0.6791, "step": 850 }, { "epoch": 1.9953596287703017, "grad_norm": 0.8601617789977748, "learning_rate": 5e-06, "loss": 0.6847, "step": 860 }, { "epoch": 2.0, "eval_loss": 0.7253859043121338, "eval_runtime": 44.8365, "eval_samples_per_second": 258.517, "eval_steps_per_second": 1.026, "step": 862 }, { "epoch": 2.0185614849187936, "grad_norm": 0.8332415773499794, "learning_rate": 5e-06, "loss": 0.6349, "step": 870 }, { "epoch": 2.0417633410672855, "grad_norm": 0.7926156467000511, "learning_rate": 5e-06, "loss": 0.6281, "step": 880 }, { "epoch": 2.0649651972157774, "grad_norm": 0.8228551865240357, "learning_rate": 5e-06, "loss": 0.6278, "step": 890 }, { "epoch": 2.0881670533642693, "grad_norm": 0.7064748007097121, "learning_rate": 5e-06, "loss": 0.6246, "step": 900 }, { "epoch": 2.111368909512761, "grad_norm": 0.7925930702513274, "learning_rate": 5e-06, "loss": 0.6319, "step": 910 }, { "epoch": 2.1345707656612527, "grad_norm": 0.7580312575113166, "learning_rate": 5e-06, "loss": 0.6345, "step": 920 }, { "epoch": 2.1577726218097446, "grad_norm": 0.8223784752811615, "learning_rate": 5e-06, "loss": 0.6287, "step": 930 }, { "epoch": 2.1809744779582365, "grad_norm": 0.6801723430553506, "learning_rate": 5e-06, "loss": 0.6345, "step": 940 }, { "epoch": 2.2041763341067284, "grad_norm": 0.6104725779187484, "learning_rate": 5e-06, "loss": 0.6332, "step": 950 }, { "epoch": 2.2273781902552203, "grad_norm": 1.005789926503314, "learning_rate": 5e-06, "loss": 0.6316, "step": 960 }, { "epoch": 2.2505800464037122, "grad_norm": 0.7442488963065897, "learning_rate": 5e-06, "loss": 0.6359, "step": 970 }, { "epoch": 2.273781902552204, "grad_norm": 0.6635661944450735, "learning_rate": 5e-06, "loss": 0.6318, "step": 980 }, { "epoch": 2.296983758700696, "grad_norm": 0.6303564812431357, "learning_rate": 5e-06, "loss": 0.6337, "step": 990 }, { "epoch": 2.320185614849188, "grad_norm": 0.6850690465216134, "learning_rate": 5e-06, "loss": 0.6331, "step": 1000 }, { "epoch": 2.34338747099768, "grad_norm": 0.6704436455876167, "learning_rate": 5e-06, "loss": 0.6332, "step": 1010 }, { "epoch": 2.3665893271461718, "grad_norm": 0.6442885483242252, "learning_rate": 5e-06, "loss": 0.6352, "step": 1020 }, { "epoch": 2.3897911832946637, "grad_norm": 0.6247270790520427, "learning_rate": 5e-06, "loss": 0.6351, "step": 1030 }, { "epoch": 2.4129930394431556, "grad_norm": 0.6488764369214974, "learning_rate": 5e-06, "loss": 0.6333, "step": 1040 }, { "epoch": 2.4361948955916475, "grad_norm": 0.7457367188839334, "learning_rate": 5e-06, "loss": 0.6351, "step": 1050 }, { "epoch": 2.4593967517401394, "grad_norm": 0.6794824677100129, "learning_rate": 5e-06, "loss": 0.6376, "step": 1060 }, { "epoch": 2.4825986078886313, "grad_norm": 0.7964782872185094, "learning_rate": 5e-06, "loss": 0.6387, "step": 1070 }, { "epoch": 2.505800464037123, "grad_norm": 0.7107734542729047, "learning_rate": 5e-06, "loss": 0.636, "step": 1080 }, { "epoch": 2.529002320185615, "grad_norm": 0.6118818323350269, "learning_rate": 5e-06, "loss": 0.6378, "step": 1090 }, { "epoch": 2.5522041763341066, "grad_norm": 0.6792915882704795, "learning_rate": 5e-06, "loss": 0.6401, "step": 1100 }, { "epoch": 2.5754060324825985, "grad_norm": 0.5791884637384154, "learning_rate": 5e-06, "loss": 0.6348, "step": 1110 }, { "epoch": 2.5986078886310904, "grad_norm": 0.65354154094048, "learning_rate": 5e-06, "loss": 0.6356, "step": 1120 }, { "epoch": 2.6218097447795823, "grad_norm": 0.7784156660236978, "learning_rate": 5e-06, "loss": 0.6385, "step": 1130 }, { "epoch": 2.645011600928074, "grad_norm": 0.6148881616275584, "learning_rate": 5e-06, "loss": 0.6368, "step": 1140 }, { "epoch": 2.668213457076566, "grad_norm": 0.9309313128658645, "learning_rate": 5e-06, "loss": 0.6341, "step": 1150 }, { "epoch": 2.691415313225058, "grad_norm": 0.6998580652650385, "learning_rate": 5e-06, "loss": 0.6383, "step": 1160 }, { "epoch": 2.71461716937355, "grad_norm": 0.6470112662348189, "learning_rate": 5e-06, "loss": 0.6345, "step": 1170 }, { "epoch": 2.737819025522042, "grad_norm": 0.693281848491071, "learning_rate": 5e-06, "loss": 0.6373, "step": 1180 }, { "epoch": 2.7610208816705337, "grad_norm": 0.718979849412995, "learning_rate": 5e-06, "loss": 0.6389, "step": 1190 }, { "epoch": 2.7842227378190256, "grad_norm": 0.6226393842218627, "learning_rate": 5e-06, "loss": 0.6323, "step": 1200 }, { "epoch": 2.8074245939675175, "grad_norm": 0.6041906661727968, "learning_rate": 5e-06, "loss": 0.6411, "step": 1210 }, { "epoch": 2.8306264501160094, "grad_norm": 0.6212934149482867, "learning_rate": 5e-06, "loss": 0.6376, "step": 1220 }, { "epoch": 2.853828306264501, "grad_norm": 0.6648887269433067, "learning_rate": 5e-06, "loss": 0.6391, "step": 1230 }, { "epoch": 2.877030162412993, "grad_norm": 0.5811847662970155, "learning_rate": 5e-06, "loss": 0.6379, "step": 1240 }, { "epoch": 2.9002320185614847, "grad_norm": 0.8904746056337586, "learning_rate": 5e-06, "loss": 0.6397, "step": 1250 }, { "epoch": 2.9234338747099766, "grad_norm": 0.6627485975413968, "learning_rate": 5e-06, "loss": 0.6343, "step": 1260 }, { "epoch": 2.9466357308584685, "grad_norm": 0.5926997597872009, "learning_rate": 5e-06, "loss": 0.6358, "step": 1270 }, { "epoch": 2.9698375870069604, "grad_norm": 0.5956109668755322, "learning_rate": 5e-06, "loss": 0.6351, "step": 1280 }, { "epoch": 2.9930394431554523, "grad_norm": 0.6391850750364162, "learning_rate": 5e-06, "loss": 0.6347, "step": 1290 }, { "epoch": 3.0, "eval_loss": 0.7314478158950806, "eval_runtime": 42.305, "eval_samples_per_second": 273.986, "eval_steps_per_second": 1.087, "step": 1293 }, { "epoch": 3.0, "step": 1293, "total_flos": 2165823158353920.0, "train_loss": 0.6976413785812012, "train_runtime": 8439.309, "train_samples_per_second": 78.281, "train_steps_per_second": 0.153 } ], "logging_steps": 10, "max_steps": 1293, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2165823158353920.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }