| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9928514694201747, | |
| "eval_steps": 500, | |
| "global_step": 942, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.03177124702144559, | |
| "grad_norm": 3.4397125264946693, | |
| "learning_rate": 5e-06, | |
| "loss": 0.8269, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06354249404289118, | |
| "grad_norm": 1.3162043926897933, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7492, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.09531374106433678, | |
| "grad_norm": 1.740768639790779, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7286, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.12708498808578236, | |
| "grad_norm": 1.047097510755796, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7128, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.15885623510722796, | |
| "grad_norm": 2.1299057435530897, | |
| "learning_rate": 5e-06, | |
| "loss": 0.696, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.19062748212867356, | |
| "grad_norm": 1.334088036223787, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6873, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.22239872915011913, | |
| "grad_norm": 1.2813478557952303, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6917, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2541699761715647, | |
| "grad_norm": 1.0941086787458663, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6812, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.28594122319301035, | |
| "grad_norm": 0.7478585687808075, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6755, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.3177124702144559, | |
| "grad_norm": 0.6575538230318294, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6712, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3494837172359015, | |
| "grad_norm": 0.9175291572423222, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6712, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3812549642573471, | |
| "grad_norm": 0.9905458946212224, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6702, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.4130262112787927, | |
| "grad_norm": 0.5564766884877899, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6639, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.44479745830023826, | |
| "grad_norm": 0.6577684294501048, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6664, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4765687053216839, | |
| "grad_norm": 0.6804018194983035, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6601, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.5083399523431295, | |
| "grad_norm": 1.179572621196443, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6579, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5401111993645751, | |
| "grad_norm": 0.6797391956593329, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6574, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5718824463860207, | |
| "grad_norm": 0.4921523794931075, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6611, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.6036536934074662, | |
| "grad_norm": 0.43865289260205137, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6593, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6354249404289118, | |
| "grad_norm": 0.6676069056649501, | |
| "learning_rate": 5e-06, | |
| "loss": 0.653, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6671961874503575, | |
| "grad_norm": 0.6607839687607865, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6465, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.698967434471803, | |
| "grad_norm": 0.5527119566717958, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6513, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7307386814932486, | |
| "grad_norm": 1.1754706747514654, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6552, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7625099285146942, | |
| "grad_norm": 0.5274214651086202, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6476, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7942811755361397, | |
| "grad_norm": 0.5191905788117661, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6515, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.8260524225575854, | |
| "grad_norm": 0.5021156228769978, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6391, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.857823669579031, | |
| "grad_norm": 0.4860867602338113, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6488, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8895949166004765, | |
| "grad_norm": 0.5477781086085485, | |
| "learning_rate": 5e-06, | |
| "loss": 0.644, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.9213661636219221, | |
| "grad_norm": 0.4541673708322245, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6533, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.9531374106433678, | |
| "grad_norm": 0.6665636520469818, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6458, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9849086576648134, | |
| "grad_norm": 1.2466688450308792, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6416, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9976171564733916, | |
| "eval_loss": 0.6390731930732727, | |
| "eval_runtime": 169.5155, | |
| "eval_samples_per_second": 50.025, | |
| "eval_steps_per_second": 0.395, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.016679904686259, | |
| "grad_norm": 0.8923786845867862, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6169, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.0484511517077044, | |
| "grad_norm": 0.8258460752195317, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5989, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.0802223987291502, | |
| "grad_norm": 0.8974017625848637, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6002, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.1119936457505957, | |
| "grad_norm": 0.6051249372281612, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5969, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.1437648927720412, | |
| "grad_norm": 0.4964829017815635, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5932, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.175536139793487, | |
| "grad_norm": 0.7022791565648377, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5891, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.2073073868149324, | |
| "grad_norm": 0.48687833937339325, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5877, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.2390786338363782, | |
| "grad_norm": 0.8131676427879705, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5934, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.2708498808578237, | |
| "grad_norm": 0.5514260097560263, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5951, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.3026211278792692, | |
| "grad_norm": 0.5691182386121336, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5924, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.3343923749007147, | |
| "grad_norm": 0.5887848699748005, | |
| "learning_rate": 5e-06, | |
| "loss": 0.59, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.3661636219221605, | |
| "grad_norm": 0.6171594708352537, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5903, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.397934868943606, | |
| "grad_norm": 0.5006287746249459, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5995, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.4297061159650517, | |
| "grad_norm": 0.5783363834709443, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5987, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.4614773629864972, | |
| "grad_norm": 0.5607638180460264, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5883, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.4932486100079427, | |
| "grad_norm": 0.5510599347940959, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5931, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.5250198570293882, | |
| "grad_norm": 0.6341820224047038, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5871, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.556791104050834, | |
| "grad_norm": 0.5477960140838268, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6005, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.5885623510722797, | |
| "grad_norm": 0.5564566962985059, | |
| "learning_rate": 5e-06, | |
| "loss": 0.585, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.6203335980937252, | |
| "grad_norm": 0.6479737616056668, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5934, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.6521048451151708, | |
| "grad_norm": 0.5569401541147417, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5931, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.6838760921366163, | |
| "grad_norm": 0.4956422971136313, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5847, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.715647339158062, | |
| "grad_norm": 0.5764728462806425, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5918, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.7474185861795075, | |
| "grad_norm": 0.5683144564118795, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5893, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.7791898332009533, | |
| "grad_norm": 0.45523545149858596, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5961, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.8109610802223988, | |
| "grad_norm": 0.5067511090235118, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5933, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.8427323272438443, | |
| "grad_norm": 0.5003192884659386, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5915, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.8745035742652898, | |
| "grad_norm": 0.496854313730653, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5871, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.9062748212867355, | |
| "grad_norm": 0.5480065320983066, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5971, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.938046068308181, | |
| "grad_norm": 0.4892198010452981, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5925, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.9698173153296268, | |
| "grad_norm": 0.5258022377268232, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5971, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.9984114376489277, | |
| "eval_loss": 0.6303147673606873, | |
| "eval_runtime": 169.9094, | |
| "eval_samples_per_second": 49.909, | |
| "eval_steps_per_second": 0.394, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 2.0015885623510723, | |
| "grad_norm": 0.6865592089506408, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5833, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.033359809372518, | |
| "grad_norm": 0.734114065829172, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5361, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.0651310563939633, | |
| "grad_norm": 0.6113057952860269, | |
| "learning_rate": 5e-06, | |
| "loss": 0.538, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.096902303415409, | |
| "grad_norm": 0.7419614574245821, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5393, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.128673550436855, | |
| "grad_norm": 0.5125951202265081, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5338, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.1604447974583003, | |
| "grad_norm": 0.5037660268311202, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5405, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.192216044479746, | |
| "grad_norm": 0.5393677046760976, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5321, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.2239872915011913, | |
| "grad_norm": 0.5385301073245446, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5403, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.255758538522637, | |
| "grad_norm": 0.7724939064931073, | |
| "learning_rate": 5e-06, | |
| "loss": 0.531, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.2875297855440824, | |
| "grad_norm": 0.607781764707526, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5444, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.3193010325655283, | |
| "grad_norm": 0.6227051590924966, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5363, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.351072279586974, | |
| "grad_norm": 0.5541522360131526, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5377, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.3828435266084194, | |
| "grad_norm": 0.6694258055269825, | |
| "learning_rate": 5e-06, | |
| "loss": 0.55, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.414614773629865, | |
| "grad_norm": 0.6024726800298386, | |
| "learning_rate": 5e-06, | |
| "loss": 0.542, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.4463860206513104, | |
| "grad_norm": 0.49906984309919067, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5415, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.4781572676727563, | |
| "grad_norm": 0.5863593266075681, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5451, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.509928514694202, | |
| "grad_norm": 0.5802769154479103, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5396, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.5416997617156474, | |
| "grad_norm": 0.5886600271892876, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5424, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.573471008737093, | |
| "grad_norm": 0.6006325307836762, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5425, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.6052422557585384, | |
| "grad_norm": 0.6296696985193921, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5406, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.6370135027799844, | |
| "grad_norm": 0.6082995671271276, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5371, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.6687847498014294, | |
| "grad_norm": 0.6361605601027807, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5422, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.7005559968228754, | |
| "grad_norm": 0.5254863126848727, | |
| "learning_rate": 5e-06, | |
| "loss": 0.546, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.732327243844321, | |
| "grad_norm": 0.7335792362538626, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5423, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.7640984908657664, | |
| "grad_norm": 0.6113916939877616, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5409, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.795869737887212, | |
| "grad_norm": 0.49594942341373327, | |
| "learning_rate": 5e-06, | |
| "loss": 0.542, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.8276409849086575, | |
| "grad_norm": 0.6503566754620093, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5408, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.8594122319301034, | |
| "grad_norm": 0.6016129927241319, | |
| "learning_rate": 5e-06, | |
| "loss": 0.546, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.891183478951549, | |
| "grad_norm": 0.5016315173194889, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5451, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.9229547259729944, | |
| "grad_norm": 0.585287305851242, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5433, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.95472597299444, | |
| "grad_norm": 0.5879035774581888, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5467, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.9864972200158855, | |
| "grad_norm": 0.4888796678433693, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5442, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.9928514694201747, | |
| "eval_loss": 0.6355611085891724, | |
| "eval_runtime": 170.8785, | |
| "eval_samples_per_second": 49.626, | |
| "eval_steps_per_second": 0.392, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 2.9928514694201747, | |
| "step": 942, | |
| "total_flos": 1577466325893120.0, | |
| "train_loss": 0.6022182443592452, | |
| "train_runtime": 28264.0109, | |
| "train_samples_per_second": 17.101, | |
| "train_steps_per_second": 0.033 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 942, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1577466325893120.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |