| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.99626400996264, |
| "eval_steps": 500, |
| "global_step": 1203, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.024906600249066, |
| "grad_norm": 44.361316887243134, |
| "learning_rate": 5e-06, |
| "loss": 1.0469, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.049813200498132, |
| "grad_norm": 1.9340361918852786, |
| "learning_rate": 5e-06, |
| "loss": 0.9485, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.074719800747198, |
| "grad_norm": 1.5810624474405115, |
| "learning_rate": 5e-06, |
| "loss": 0.9118, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.099626400996264, |
| "grad_norm": 1.0226804734735437, |
| "learning_rate": 5e-06, |
| "loss": 0.896, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.12453300124533001, |
| "grad_norm": 0.8059494431972822, |
| "learning_rate": 5e-06, |
| "loss": 0.885, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.149439601494396, |
| "grad_norm": 1.0759666179613023, |
| "learning_rate": 5e-06, |
| "loss": 0.8743, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.17434620174346202, |
| "grad_norm": 0.6070805032397537, |
| "learning_rate": 5e-06, |
| "loss": 0.865, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.199252801992528, |
| "grad_norm": 0.8206282215157027, |
| "learning_rate": 5e-06, |
| "loss": 0.8637, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.22415940224159403, |
| "grad_norm": 0.8375312112666434, |
| "learning_rate": 5e-06, |
| "loss": 0.8538, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.24906600249066002, |
| "grad_norm": 0.8745465775067757, |
| "learning_rate": 5e-06, |
| "loss": 0.8462, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.273972602739726, |
| "grad_norm": 0.7671374593440573, |
| "learning_rate": 5e-06, |
| "loss": 0.8516, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.298879202988792, |
| "grad_norm": 0.6357492120016651, |
| "learning_rate": 5e-06, |
| "loss": 0.8424, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.32378580323785805, |
| "grad_norm": 0.7985944551248969, |
| "learning_rate": 5e-06, |
| "loss": 0.838, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.34869240348692404, |
| "grad_norm": 0.6906428387839867, |
| "learning_rate": 5e-06, |
| "loss": 0.8425, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.37359900373599003, |
| "grad_norm": 0.63382119448828, |
| "learning_rate": 5e-06, |
| "loss": 0.8394, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.398505603985056, |
| "grad_norm": 0.7372274527250459, |
| "learning_rate": 5e-06, |
| "loss": 0.8331, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.42341220423412207, |
| "grad_norm": 0.6083911703654408, |
| "learning_rate": 5e-06, |
| "loss": 0.8326, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.44831880448318806, |
| "grad_norm": 1.1261618495477859, |
| "learning_rate": 5e-06, |
| "loss": 0.8279, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.47322540473225405, |
| "grad_norm": 0.7015056078596724, |
| "learning_rate": 5e-06, |
| "loss": 0.8289, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.49813200498132004, |
| "grad_norm": 0.7547586360167027, |
| "learning_rate": 5e-06, |
| "loss": 0.8269, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.523038605230386, |
| "grad_norm": 0.729606315703109, |
| "learning_rate": 5e-06, |
| "loss": 0.827, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.547945205479452, |
| "grad_norm": 0.576263809426071, |
| "learning_rate": 5e-06, |
| "loss": 0.8278, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.572851805728518, |
| "grad_norm": 0.608656960705352, |
| "learning_rate": 5e-06, |
| "loss": 0.8255, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.597758405977584, |
| "grad_norm": 0.6942678291488105, |
| "learning_rate": 5e-06, |
| "loss": 0.8247, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.6226650062266501, |
| "grad_norm": 0.8474112920696388, |
| "learning_rate": 5e-06, |
| "loss": 0.8211, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.6475716064757161, |
| "grad_norm": 0.7436541561484579, |
| "learning_rate": 5e-06, |
| "loss": 0.8205, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.6724782067247821, |
| "grad_norm": 0.5723501381180358, |
| "learning_rate": 5e-06, |
| "loss": 0.8175, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6973848069738481, |
| "grad_norm": 0.7393336592407068, |
| "learning_rate": 5e-06, |
| "loss": 0.8266, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.7222914072229141, |
| "grad_norm": 0.8642437433242355, |
| "learning_rate": 5e-06, |
| "loss": 0.8213, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.7471980074719801, |
| "grad_norm": 0.6599736959065436, |
| "learning_rate": 5e-06, |
| "loss": 0.8197, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.772104607721046, |
| "grad_norm": 0.589894020890247, |
| "learning_rate": 5e-06, |
| "loss": 0.8171, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.797011207970112, |
| "grad_norm": 0.6770015328448542, |
| "learning_rate": 5e-06, |
| "loss": 0.8159, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.821917808219178, |
| "grad_norm": 0.6953955951335576, |
| "learning_rate": 5e-06, |
| "loss": 0.8156, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.8468244084682441, |
| "grad_norm": 0.8401761226093455, |
| "learning_rate": 5e-06, |
| "loss": 0.8136, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.8717310087173101, |
| "grad_norm": 0.6479655559695816, |
| "learning_rate": 5e-06, |
| "loss": 0.8091, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.8966376089663761, |
| "grad_norm": 0.7636033751591921, |
| "learning_rate": 5e-06, |
| "loss": 0.8127, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.9215442092154421, |
| "grad_norm": 0.5680882933927079, |
| "learning_rate": 5e-06, |
| "loss": 0.8139, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.9464508094645081, |
| "grad_norm": 0.5317095758960971, |
| "learning_rate": 5e-06, |
| "loss": 0.8148, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.9713574097135741, |
| "grad_norm": 0.5355215121901621, |
| "learning_rate": 5e-06, |
| "loss": 0.8133, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.9962640099626401, |
| "grad_norm": 0.5034767977871308, |
| "learning_rate": 5e-06, |
| "loss": 0.8102, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.9987546699875467, |
| "eval_loss": 0.8109647631645203, |
| "eval_runtime": 429.8587, |
| "eval_samples_per_second": 25.166, |
| "eval_steps_per_second": 0.395, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.0211706102117062, |
| "grad_norm": 0.7772306063471781, |
| "learning_rate": 5e-06, |
| "loss": 0.8079, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.046077210460772, |
| "grad_norm": 0.547172758232467, |
| "learning_rate": 5e-06, |
| "loss": 0.7693, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.0709838107098382, |
| "grad_norm": 0.6602663338851202, |
| "learning_rate": 5e-06, |
| "loss": 0.7659, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.095890410958904, |
| "grad_norm": 0.5563021526867751, |
| "learning_rate": 5e-06, |
| "loss": 0.7669, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.1207970112079702, |
| "grad_norm": 0.5764004393923637, |
| "learning_rate": 5e-06, |
| "loss": 0.7676, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.145703611457036, |
| "grad_norm": 0.6104368430877777, |
| "learning_rate": 5e-06, |
| "loss": 0.7668, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.1706102117061021, |
| "grad_norm": 0.5856299384522291, |
| "learning_rate": 5e-06, |
| "loss": 0.7653, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.195516811955168, |
| "grad_norm": 0.5968500894238352, |
| "learning_rate": 5e-06, |
| "loss": 0.7691, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.2204234122042341, |
| "grad_norm": 0.6425311166512483, |
| "learning_rate": 5e-06, |
| "loss": 0.7692, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.2453300124533002, |
| "grad_norm": 0.5800761501783642, |
| "learning_rate": 5e-06, |
| "loss": 0.77, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.270236612702366, |
| "grad_norm": 0.5217881601799819, |
| "learning_rate": 5e-06, |
| "loss": 0.7649, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.2951432129514322, |
| "grad_norm": 0.7577819320627684, |
| "learning_rate": 5e-06, |
| "loss": 0.7682, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.320049813200498, |
| "grad_norm": 0.7467713696988785, |
| "learning_rate": 5e-06, |
| "loss": 0.7712, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.3449564134495642, |
| "grad_norm": 0.6010822997576867, |
| "learning_rate": 5e-06, |
| "loss": 0.7664, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.36986301369863, |
| "grad_norm": 0.689181454056687, |
| "learning_rate": 5e-06, |
| "loss": 0.7679, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.3947696139476962, |
| "grad_norm": 0.4747660184884877, |
| "learning_rate": 5e-06, |
| "loss": 0.7639, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.419676214196762, |
| "grad_norm": 0.5116161932838977, |
| "learning_rate": 5e-06, |
| "loss": 0.7677, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.4445828144458281, |
| "grad_norm": 0.6433790988167347, |
| "learning_rate": 5e-06, |
| "loss": 0.7626, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.4694894146948942, |
| "grad_norm": 0.6145972003931011, |
| "learning_rate": 5e-06, |
| "loss": 0.7639, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.4943960149439601, |
| "grad_norm": 0.5887457741602182, |
| "learning_rate": 5e-06, |
| "loss": 0.7612, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.519302615193026, |
| "grad_norm": 0.5628593594779383, |
| "learning_rate": 5e-06, |
| "loss": 0.7685, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.544209215442092, |
| "grad_norm": 0.49978624448408865, |
| "learning_rate": 5e-06, |
| "loss": 0.7655, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.5691158156911582, |
| "grad_norm": 0.5121970961880906, |
| "learning_rate": 5e-06, |
| "loss": 0.7646, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.5940224159402243, |
| "grad_norm": 0.5120901081120943, |
| "learning_rate": 5e-06, |
| "loss": 0.7633, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.6189290161892902, |
| "grad_norm": 0.5708046084852306, |
| "learning_rate": 5e-06, |
| "loss": 0.7701, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.643835616438356, |
| "grad_norm": 0.559772892922969, |
| "learning_rate": 5e-06, |
| "loss": 0.7671, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.6687422166874222, |
| "grad_norm": 0.508876685275154, |
| "learning_rate": 5e-06, |
| "loss": 0.7627, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.6936488169364883, |
| "grad_norm": 0.5547904679119214, |
| "learning_rate": 5e-06, |
| "loss": 0.7665, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.7185554171855542, |
| "grad_norm": 0.5327048566040764, |
| "learning_rate": 5e-06, |
| "loss": 0.7612, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.74346201743462, |
| "grad_norm": 0.5681641331800833, |
| "learning_rate": 5e-06, |
| "loss": 0.7625, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.7683686176836861, |
| "grad_norm": 0.5583754277477581, |
| "learning_rate": 5e-06, |
| "loss": 0.7678, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.7932752179327522, |
| "grad_norm": 0.5821109954208563, |
| "learning_rate": 5e-06, |
| "loss": 0.7641, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.8181818181818183, |
| "grad_norm": 0.6033561880814401, |
| "learning_rate": 5e-06, |
| "loss": 0.7671, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.8430884184308842, |
| "grad_norm": 0.6575859282775093, |
| "learning_rate": 5e-06, |
| "loss": 0.7569, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.86799501867995, |
| "grad_norm": 0.5332781614516378, |
| "learning_rate": 5e-06, |
| "loss": 0.7617, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.8929016189290162, |
| "grad_norm": 0.6171829234250781, |
| "learning_rate": 5e-06, |
| "loss": 0.7628, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.9178082191780823, |
| "grad_norm": 0.553381597192015, |
| "learning_rate": 5e-06, |
| "loss": 0.7623, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.9427148194271482, |
| "grad_norm": 0.5971496735780886, |
| "learning_rate": 5e-06, |
| "loss": 0.7595, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.967621419676214, |
| "grad_norm": 0.566450928468519, |
| "learning_rate": 5e-06, |
| "loss": 0.7613, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.9925280199252802, |
| "grad_norm": 0.6533740130175245, |
| "learning_rate": 5e-06, |
| "loss": 0.7613, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.798328697681427, |
| "eval_runtime": 427.8716, |
| "eval_samples_per_second": 25.283, |
| "eval_steps_per_second": 0.397, |
| "step": 803 |
| }, |
| { |
| "epoch": 2.0174346201743463, |
| "grad_norm": 0.9428578537809662, |
| "learning_rate": 5e-06, |
| "loss": 0.764, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.0423412204234124, |
| "grad_norm": 0.682790769401012, |
| "learning_rate": 5e-06, |
| "loss": 0.7174, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.067247820672478, |
| "grad_norm": 0.732474650025201, |
| "learning_rate": 5e-06, |
| "loss": 0.7136, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.092154420921544, |
| "grad_norm": 0.5517924405803882, |
| "learning_rate": 5e-06, |
| "loss": 0.7141, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.1170610211706102, |
| "grad_norm": 0.527980258175362, |
| "learning_rate": 5e-06, |
| "loss": 0.7205, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.1419676214196763, |
| "grad_norm": 0.5578355324627287, |
| "learning_rate": 5e-06, |
| "loss": 0.7182, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.166874221668742, |
| "grad_norm": 0.5384061514408854, |
| "learning_rate": 5e-06, |
| "loss": 0.7174, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.191780821917808, |
| "grad_norm": 0.5433323621552549, |
| "learning_rate": 5e-06, |
| "loss": 0.7227, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.216687422166874, |
| "grad_norm": 0.5934434020270568, |
| "learning_rate": 5e-06, |
| "loss": 0.7154, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.2415940224159403, |
| "grad_norm": 0.5610116690136854, |
| "learning_rate": 5e-06, |
| "loss": 0.7181, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.2665006226650064, |
| "grad_norm": 0.5956518936383002, |
| "learning_rate": 5e-06, |
| "loss": 0.7188, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.291407222914072, |
| "grad_norm": 0.5700434018521554, |
| "learning_rate": 5e-06, |
| "loss": 0.7189, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.316313823163138, |
| "grad_norm": 0.6159365804430498, |
| "learning_rate": 5e-06, |
| "loss": 0.7208, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.3412204234122043, |
| "grad_norm": 0.5739840813262334, |
| "learning_rate": 5e-06, |
| "loss": 0.7191, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.3661270236612704, |
| "grad_norm": 0.517298472276118, |
| "learning_rate": 5e-06, |
| "loss": 0.7254, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.391033623910336, |
| "grad_norm": 0.5309037963536546, |
| "learning_rate": 5e-06, |
| "loss": 0.7177, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.415940224159402, |
| "grad_norm": 0.5464343303315381, |
| "learning_rate": 5e-06, |
| "loss": 0.7162, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.4408468244084682, |
| "grad_norm": 0.5884939657248605, |
| "learning_rate": 5e-06, |
| "loss": 0.7238, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.4657534246575343, |
| "grad_norm": 0.6058278477423068, |
| "learning_rate": 5e-06, |
| "loss": 0.7217, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.4906600249066004, |
| "grad_norm": 0.575706246130651, |
| "learning_rate": 5e-06, |
| "loss": 0.7211, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.515566625155666, |
| "grad_norm": 0.6210182727077225, |
| "learning_rate": 5e-06, |
| "loss": 0.722, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.540473225404732, |
| "grad_norm": 0.6248334338554098, |
| "learning_rate": 5e-06, |
| "loss": 0.7226, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.5653798256537983, |
| "grad_norm": 0.6075603863013977, |
| "learning_rate": 5e-06, |
| "loss": 0.7201, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.5902864259028644, |
| "grad_norm": 0.6125989005343908, |
| "learning_rate": 5e-06, |
| "loss": 0.7225, |
| "step": 1040 |
| }, |
| { |
| "epoch": 2.61519302615193, |
| "grad_norm": 0.5723698102141317, |
| "learning_rate": 5e-06, |
| "loss": 0.7184, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.640099626400996, |
| "grad_norm": 0.5988876404053375, |
| "learning_rate": 5e-06, |
| "loss": 0.7228, |
| "step": 1060 |
| }, |
| { |
| "epoch": 2.6650062266500623, |
| "grad_norm": 0.5535541669685047, |
| "learning_rate": 5e-06, |
| "loss": 0.7219, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.6899128268991284, |
| "grad_norm": 0.6560134586878092, |
| "learning_rate": 5e-06, |
| "loss": 0.7231, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.7148194271481945, |
| "grad_norm": 0.570154529031656, |
| "learning_rate": 5e-06, |
| "loss": 0.7207, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.73972602739726, |
| "grad_norm": 0.6903793080558596, |
| "learning_rate": 5e-06, |
| "loss": 0.7225, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.7646326276463262, |
| "grad_norm": 0.609309475396782, |
| "learning_rate": 5e-06, |
| "loss": 0.7199, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.7895392278953923, |
| "grad_norm": 0.4982077265492007, |
| "learning_rate": 5e-06, |
| "loss": 0.723, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.8144458281444584, |
| "grad_norm": 0.5520401600798728, |
| "learning_rate": 5e-06, |
| "loss": 0.7195, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.839352428393524, |
| "grad_norm": 0.5678772706098874, |
| "learning_rate": 5e-06, |
| "loss": 0.7241, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.86425902864259, |
| "grad_norm": 0.6919987752510048, |
| "learning_rate": 5e-06, |
| "loss": 0.7218, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.8891656288916563, |
| "grad_norm": 0.5523800519721218, |
| "learning_rate": 5e-06, |
| "loss": 0.7223, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.9140722291407224, |
| "grad_norm": 0.5786175424826561, |
| "learning_rate": 5e-06, |
| "loss": 0.7248, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.9389788293897885, |
| "grad_norm": 0.5805260846296417, |
| "learning_rate": 5e-06, |
| "loss": 0.7186, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.963885429638854, |
| "grad_norm": 0.6087027130014465, |
| "learning_rate": 5e-06, |
| "loss": 0.7225, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.9887920298879203, |
| "grad_norm": 0.6138969910749299, |
| "learning_rate": 5e-06, |
| "loss": 0.7228, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.99626400996264, |
| "eval_loss": 0.7962795495986938, |
| "eval_runtime": 431.1284, |
| "eval_samples_per_second": 25.092, |
| "eval_steps_per_second": 0.394, |
| "step": 1203 |
| }, |
| { |
| "epoch": 2.99626400996264, |
| "step": 1203, |
| "total_flos": 2014860426608640.0, |
| "train_loss": 0.7768312251676843, |
| "train_runtime": 70934.0832, |
| "train_samples_per_second": 8.693, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1203, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2014860426608640.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|