{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9928514694201747, "eval_steps": 500, "global_step": 942, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03177124702144559, "grad_norm": 3.4397125264946693, "learning_rate": 5e-06, "loss": 0.8269, "step": 10 }, { "epoch": 0.06354249404289118, "grad_norm": 1.3162043926897933, "learning_rate": 5e-06, "loss": 0.7492, "step": 20 }, { "epoch": 0.09531374106433678, "grad_norm": 1.740768639790779, "learning_rate": 5e-06, "loss": 0.7286, "step": 30 }, { "epoch": 0.12708498808578236, "grad_norm": 1.047097510755796, "learning_rate": 5e-06, "loss": 0.7128, "step": 40 }, { "epoch": 0.15885623510722796, "grad_norm": 2.1299057435530897, "learning_rate": 5e-06, "loss": 0.696, "step": 50 }, { "epoch": 0.19062748212867356, "grad_norm": 1.334088036223787, "learning_rate": 5e-06, "loss": 0.6873, "step": 60 }, { "epoch": 0.22239872915011913, "grad_norm": 1.2813478557952303, "learning_rate": 5e-06, "loss": 0.6917, "step": 70 }, { "epoch": 0.2541699761715647, "grad_norm": 1.0941086787458663, "learning_rate": 5e-06, "loss": 0.6812, "step": 80 }, { "epoch": 0.28594122319301035, "grad_norm": 0.7478585687808075, "learning_rate": 5e-06, "loss": 0.6755, "step": 90 }, { "epoch": 0.3177124702144559, "grad_norm": 0.6575538230318294, "learning_rate": 5e-06, "loss": 0.6712, "step": 100 }, { "epoch": 0.3494837172359015, "grad_norm": 0.9175291572423222, "learning_rate": 5e-06, "loss": 0.6712, "step": 110 }, { "epoch": 0.3812549642573471, "grad_norm": 0.9905458946212224, "learning_rate": 5e-06, "loss": 0.6702, "step": 120 }, { "epoch": 0.4130262112787927, "grad_norm": 0.5564766884877899, "learning_rate": 5e-06, "loss": 0.6639, "step": 130 }, { "epoch": 0.44479745830023826, "grad_norm": 0.6577684294501048, "learning_rate": 5e-06, "loss": 0.6664, "step": 140 }, { "epoch": 0.4765687053216839, "grad_norm": 0.6804018194983035, "learning_rate": 5e-06, "loss": 0.6601, "step": 150 }, { "epoch": 0.5083399523431295, "grad_norm": 1.179572621196443, "learning_rate": 5e-06, "loss": 0.6579, "step": 160 }, { "epoch": 0.5401111993645751, "grad_norm": 0.6797391956593329, "learning_rate": 5e-06, "loss": 0.6574, "step": 170 }, { "epoch": 0.5718824463860207, "grad_norm": 0.4921523794931075, "learning_rate": 5e-06, "loss": 0.6611, "step": 180 }, { "epoch": 0.6036536934074662, "grad_norm": 0.43865289260205137, "learning_rate": 5e-06, "loss": 0.6593, "step": 190 }, { "epoch": 0.6354249404289118, "grad_norm": 0.6676069056649501, "learning_rate": 5e-06, "loss": 0.653, "step": 200 }, { "epoch": 0.6671961874503575, "grad_norm": 0.6607839687607865, "learning_rate": 5e-06, "loss": 0.6465, "step": 210 }, { "epoch": 0.698967434471803, "grad_norm": 0.5527119566717958, "learning_rate": 5e-06, "loss": 0.6513, "step": 220 }, { "epoch": 0.7307386814932486, "grad_norm": 1.1754706747514654, "learning_rate": 5e-06, "loss": 0.6552, "step": 230 }, { "epoch": 0.7625099285146942, "grad_norm": 0.5274214651086202, "learning_rate": 5e-06, "loss": 0.6476, "step": 240 }, { "epoch": 0.7942811755361397, "grad_norm": 0.5191905788117661, "learning_rate": 5e-06, "loss": 0.6515, "step": 250 }, { "epoch": 0.8260524225575854, "grad_norm": 0.5021156228769978, "learning_rate": 5e-06, "loss": 0.6391, "step": 260 }, { "epoch": 0.857823669579031, "grad_norm": 0.4860867602338113, "learning_rate": 5e-06, "loss": 0.6488, "step": 270 }, { "epoch": 0.8895949166004765, "grad_norm": 0.5477781086085485, "learning_rate": 5e-06, "loss": 0.644, "step": 280 }, { "epoch": 0.9213661636219221, "grad_norm": 0.4541673708322245, "learning_rate": 5e-06, "loss": 0.6533, "step": 290 }, { "epoch": 0.9531374106433678, "grad_norm": 0.6665636520469818, "learning_rate": 5e-06, "loss": 0.6458, "step": 300 }, { "epoch": 0.9849086576648134, "grad_norm": 1.2466688450308792, "learning_rate": 5e-06, "loss": 0.6416, "step": 310 }, { "epoch": 0.9976171564733916, "eval_loss": 0.6390731930732727, "eval_runtime": 169.5155, "eval_samples_per_second": 50.025, "eval_steps_per_second": 0.395, "step": 314 }, { "epoch": 1.016679904686259, "grad_norm": 0.8923786845867862, "learning_rate": 5e-06, "loss": 0.6169, "step": 320 }, { "epoch": 1.0484511517077044, "grad_norm": 0.8258460752195317, "learning_rate": 5e-06, "loss": 0.5989, "step": 330 }, { "epoch": 1.0802223987291502, "grad_norm": 0.8974017625848637, "learning_rate": 5e-06, "loss": 0.6002, "step": 340 }, { "epoch": 1.1119936457505957, "grad_norm": 0.6051249372281612, "learning_rate": 5e-06, "loss": 0.5969, "step": 350 }, { "epoch": 1.1437648927720412, "grad_norm": 0.4964829017815635, "learning_rate": 5e-06, "loss": 0.5932, "step": 360 }, { "epoch": 1.175536139793487, "grad_norm": 0.7022791565648377, "learning_rate": 5e-06, "loss": 0.5891, "step": 370 }, { "epoch": 1.2073073868149324, "grad_norm": 0.48687833937339325, "learning_rate": 5e-06, "loss": 0.5877, "step": 380 }, { "epoch": 1.2390786338363782, "grad_norm": 0.8131676427879705, "learning_rate": 5e-06, "loss": 0.5934, "step": 390 }, { "epoch": 1.2708498808578237, "grad_norm": 0.5514260097560263, "learning_rate": 5e-06, "loss": 0.5951, "step": 400 }, { "epoch": 1.3026211278792692, "grad_norm": 0.5691182386121336, "learning_rate": 5e-06, "loss": 0.5924, "step": 410 }, { "epoch": 1.3343923749007147, "grad_norm": 0.5887848699748005, "learning_rate": 5e-06, "loss": 0.59, "step": 420 }, { "epoch": 1.3661636219221605, "grad_norm": 0.6171594708352537, "learning_rate": 5e-06, "loss": 0.5903, "step": 430 }, { "epoch": 1.397934868943606, "grad_norm": 0.5006287746249459, "learning_rate": 5e-06, "loss": 0.5995, "step": 440 }, { "epoch": 1.4297061159650517, "grad_norm": 0.5783363834709443, "learning_rate": 5e-06, "loss": 0.5987, "step": 450 }, { "epoch": 1.4614773629864972, "grad_norm": 0.5607638180460264, "learning_rate": 5e-06, "loss": 0.5883, "step": 460 }, { "epoch": 1.4932486100079427, "grad_norm": 0.5510599347940959, "learning_rate": 5e-06, "loss": 0.5931, "step": 470 }, { "epoch": 1.5250198570293882, "grad_norm": 0.6341820224047038, "learning_rate": 5e-06, "loss": 0.5871, "step": 480 }, { "epoch": 1.556791104050834, "grad_norm": 0.5477960140838268, "learning_rate": 5e-06, "loss": 0.6005, "step": 490 }, { "epoch": 1.5885623510722797, "grad_norm": 0.5564566962985059, "learning_rate": 5e-06, "loss": 0.585, "step": 500 }, { "epoch": 1.6203335980937252, "grad_norm": 0.6479737616056668, "learning_rate": 5e-06, "loss": 0.5934, "step": 510 }, { "epoch": 1.6521048451151708, "grad_norm": 0.5569401541147417, "learning_rate": 5e-06, "loss": 0.5931, "step": 520 }, { "epoch": 1.6838760921366163, "grad_norm": 0.4956422971136313, "learning_rate": 5e-06, "loss": 0.5847, "step": 530 }, { "epoch": 1.715647339158062, "grad_norm": 0.5764728462806425, "learning_rate": 5e-06, "loss": 0.5918, "step": 540 }, { "epoch": 1.7474185861795075, "grad_norm": 0.5683144564118795, "learning_rate": 5e-06, "loss": 0.5893, "step": 550 }, { "epoch": 1.7791898332009533, "grad_norm": 0.45523545149858596, "learning_rate": 5e-06, "loss": 0.5961, "step": 560 }, { "epoch": 1.8109610802223988, "grad_norm": 0.5067511090235118, "learning_rate": 5e-06, "loss": 0.5933, "step": 570 }, { "epoch": 1.8427323272438443, "grad_norm": 0.5003192884659386, "learning_rate": 5e-06, "loss": 0.5915, "step": 580 }, { "epoch": 1.8745035742652898, "grad_norm": 0.496854313730653, "learning_rate": 5e-06, "loss": 0.5871, "step": 590 }, { "epoch": 1.9062748212867355, "grad_norm": 0.5480065320983066, "learning_rate": 5e-06, "loss": 0.5971, "step": 600 }, { "epoch": 1.938046068308181, "grad_norm": 0.4892198010452981, "learning_rate": 5e-06, "loss": 0.5925, "step": 610 }, { "epoch": 1.9698173153296268, "grad_norm": 0.5258022377268232, "learning_rate": 5e-06, "loss": 0.5971, "step": 620 }, { "epoch": 1.9984114376489277, "eval_loss": 0.6303147673606873, "eval_runtime": 169.9094, "eval_samples_per_second": 49.909, "eval_steps_per_second": 0.394, "step": 629 }, { "epoch": 2.0015885623510723, "grad_norm": 0.6865592089506408, "learning_rate": 5e-06, "loss": 0.5833, "step": 630 }, { "epoch": 2.033359809372518, "grad_norm": 0.734114065829172, "learning_rate": 5e-06, "loss": 0.5361, "step": 640 }, { "epoch": 2.0651310563939633, "grad_norm": 0.6113057952860269, "learning_rate": 5e-06, "loss": 0.538, "step": 650 }, { "epoch": 2.096902303415409, "grad_norm": 0.7419614574245821, "learning_rate": 5e-06, "loss": 0.5393, "step": 660 }, { "epoch": 2.128673550436855, "grad_norm": 0.5125951202265081, "learning_rate": 5e-06, "loss": 0.5338, "step": 670 }, { "epoch": 2.1604447974583003, "grad_norm": 0.5037660268311202, "learning_rate": 5e-06, "loss": 0.5405, "step": 680 }, { "epoch": 2.192216044479746, "grad_norm": 0.5393677046760976, "learning_rate": 5e-06, "loss": 0.5321, "step": 690 }, { "epoch": 2.2239872915011913, "grad_norm": 0.5385301073245446, "learning_rate": 5e-06, "loss": 0.5403, "step": 700 }, { "epoch": 2.255758538522637, "grad_norm": 0.7724939064931073, "learning_rate": 5e-06, "loss": 0.531, "step": 710 }, { "epoch": 2.2875297855440824, "grad_norm": 0.607781764707526, "learning_rate": 5e-06, "loss": 0.5444, "step": 720 }, { "epoch": 2.3193010325655283, "grad_norm": 0.6227051590924966, "learning_rate": 5e-06, "loss": 0.5363, "step": 730 }, { "epoch": 2.351072279586974, "grad_norm": 0.5541522360131526, "learning_rate": 5e-06, "loss": 0.5377, "step": 740 }, { "epoch": 2.3828435266084194, "grad_norm": 0.6694258055269825, "learning_rate": 5e-06, "loss": 0.55, "step": 750 }, { "epoch": 2.414614773629865, "grad_norm": 0.6024726800298386, "learning_rate": 5e-06, "loss": 0.542, "step": 760 }, { "epoch": 2.4463860206513104, "grad_norm": 0.49906984309919067, "learning_rate": 5e-06, "loss": 0.5415, "step": 770 }, { "epoch": 2.4781572676727563, "grad_norm": 0.5863593266075681, "learning_rate": 5e-06, "loss": 0.5451, "step": 780 }, { "epoch": 2.509928514694202, "grad_norm": 0.5802769154479103, "learning_rate": 5e-06, "loss": 0.5396, "step": 790 }, { "epoch": 2.5416997617156474, "grad_norm": 0.5886600271892876, "learning_rate": 5e-06, "loss": 0.5424, "step": 800 }, { "epoch": 2.573471008737093, "grad_norm": 0.6006325307836762, "learning_rate": 5e-06, "loss": 0.5425, "step": 810 }, { "epoch": 2.6052422557585384, "grad_norm": 0.6296696985193921, "learning_rate": 5e-06, "loss": 0.5406, "step": 820 }, { "epoch": 2.6370135027799844, "grad_norm": 0.6082995671271276, "learning_rate": 5e-06, "loss": 0.5371, "step": 830 }, { "epoch": 2.6687847498014294, "grad_norm": 0.6361605601027807, "learning_rate": 5e-06, "loss": 0.5422, "step": 840 }, { "epoch": 2.7005559968228754, "grad_norm": 0.5254863126848727, "learning_rate": 5e-06, "loss": 0.546, "step": 850 }, { "epoch": 2.732327243844321, "grad_norm": 0.7335792362538626, "learning_rate": 5e-06, "loss": 0.5423, "step": 860 }, { "epoch": 2.7640984908657664, "grad_norm": 0.6113916939877616, "learning_rate": 5e-06, "loss": 0.5409, "step": 870 }, { "epoch": 2.795869737887212, "grad_norm": 0.49594942341373327, "learning_rate": 5e-06, "loss": 0.542, "step": 880 }, { "epoch": 2.8276409849086575, "grad_norm": 0.6503566754620093, "learning_rate": 5e-06, "loss": 0.5408, "step": 890 }, { "epoch": 2.8594122319301034, "grad_norm": 0.6016129927241319, "learning_rate": 5e-06, "loss": 0.546, "step": 900 }, { "epoch": 2.891183478951549, "grad_norm": 0.5016315173194889, "learning_rate": 5e-06, "loss": 0.5451, "step": 910 }, { "epoch": 2.9229547259729944, "grad_norm": 0.585287305851242, "learning_rate": 5e-06, "loss": 0.5433, "step": 920 }, { "epoch": 2.95472597299444, "grad_norm": 0.5879035774581888, "learning_rate": 5e-06, "loss": 0.5467, "step": 930 }, { "epoch": 2.9864972200158855, "grad_norm": 0.4888796678433693, "learning_rate": 5e-06, "loss": 0.5442, "step": 940 }, { "epoch": 2.9928514694201747, "eval_loss": 0.6355611085891724, "eval_runtime": 170.8785, "eval_samples_per_second": 49.626, "eval_steps_per_second": 0.392, "step": 942 }, { "epoch": 2.9928514694201747, "step": 942, "total_flos": 1577466325893120.0, "train_loss": 0.6022182443592452, "train_runtime": 28264.0109, "train_samples_per_second": 17.101, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 942, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1577466325893120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }