| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.999438727782975, | |
| "eval_steps": 500, | |
| "global_step": 1002, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.029934518241347054, | |
| "grad_norm": 1.4403637427480462, | |
| "learning_rate": 2e-06, | |
| "loss": 0.786, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05986903648269411, | |
| "grad_norm": 0.7585646246007425, | |
| "learning_rate": 2e-06, | |
| "loss": 0.7103, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08980355472404115, | |
| "grad_norm": 0.7877019895147521, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6845, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.11973807296538821, | |
| "grad_norm": 0.7526284526378991, | |
| "learning_rate": 2e-06, | |
| "loss": 0.679, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.14967259120673526, | |
| "grad_norm": 0.6462593569528299, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6695, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1796071094480823, | |
| "grad_norm": 0.7389173448475503, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6608, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.20954162768942938, | |
| "grad_norm": 0.6921347775545471, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6569, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.23947614593077643, | |
| "grad_norm": 0.6664400127292619, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6581, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2694106641721235, | |
| "grad_norm": 0.6931891241440936, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6491, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2993451824134705, | |
| "grad_norm": 0.7790977620270173, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6497, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3292797006548176, | |
| "grad_norm": 0.6948688988620945, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6451, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3592142188961646, | |
| "grad_norm": 0.723431615671195, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6439, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3891487371375117, | |
| "grad_norm": 0.6738418075637761, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6436, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.41908325537885877, | |
| "grad_norm": 0.6653975851597016, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6372, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4490177736202058, | |
| "grad_norm": 0.6601441553159767, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6387, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.47895229186155286, | |
| "grad_norm": 0.7054970289254373, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6375, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5088868101028999, | |
| "grad_norm": 0.7199826049159775, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6317, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.538821328344247, | |
| "grad_norm": 0.7113756352568565, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6332, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.568755846585594, | |
| "grad_norm": 0.7044231675783028, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6308, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.598690364826941, | |
| "grad_norm": 0.7309238834285012, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6387, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6286248830682881, | |
| "grad_norm": 0.7095992221953259, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6324, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6585594013096352, | |
| "grad_norm": 0.641738749873779, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6304, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6884939195509823, | |
| "grad_norm": 0.7754539446943373, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6329, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7184284377923292, | |
| "grad_norm": 0.6924947810387134, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6317, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7483629560336763, | |
| "grad_norm": 0.832705575892092, | |
| "learning_rate": 2e-06, | |
| "loss": 0.628, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7782974742750234, | |
| "grad_norm": 0.7232676849968064, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6267, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8082319925163705, | |
| "grad_norm": 0.7419413773726808, | |
| "learning_rate": 2e-06, | |
| "loss": 0.634, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8381665107577175, | |
| "grad_norm": 0.7199244688464713, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6244, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8681010289990645, | |
| "grad_norm": 0.6947896790690724, | |
| "learning_rate": 2e-06, | |
| "loss": 0.621, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.8980355472404116, | |
| "grad_norm": 0.7851259698790731, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6222, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9279700654817586, | |
| "grad_norm": 0.7028052394028984, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6207, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9579045837231057, | |
| "grad_norm": 0.8063430875369427, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6227, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9878391019644528, | |
| "grad_norm": 0.8830953748187379, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6157, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.9998129092609915, | |
| "eval_loss": 0.6270928382873535, | |
| "eval_runtime": 518.3743, | |
| "eval_samples_per_second": 17.364, | |
| "eval_steps_per_second": 0.544, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.0177736202057999, | |
| "grad_norm": 0.7847590939808942, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6651, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.047708138447147, | |
| "grad_norm": 0.7143067638653315, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5899, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.077642656688494, | |
| "grad_norm": 0.6616124745271351, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5856, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.1075771749298409, | |
| "grad_norm": 0.6761369846237072, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5848, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.137511693171188, | |
| "grad_norm": 0.7226956240435726, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5847, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.167446211412535, | |
| "grad_norm": 0.6492986790737924, | |
| "learning_rate": 2e-06, | |
| "loss": 0.586, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.197380729653882, | |
| "grad_norm": 0.9238463038340056, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5862, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.2273152478952292, | |
| "grad_norm": 0.7384372135206262, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5875, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.2572497661365762, | |
| "grad_norm": 0.7798681474247887, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5922, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.2871842843779233, | |
| "grad_norm": 0.6769000626695846, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5856, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.3171188026192704, | |
| "grad_norm": 0.7081906344223899, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5873, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.3470533208606175, | |
| "grad_norm": 0.6669059541200351, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5807, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.3769878391019645, | |
| "grad_norm": 0.7803535308998839, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5913, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.4069223573433116, | |
| "grad_norm": 0.7695489200166007, | |
| "learning_rate": 2e-06, | |
| "loss": 0.59, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.4368568755846587, | |
| "grad_norm": 0.7503675606232418, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5917, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.4667913938260055, | |
| "grad_norm": 0.7163429476040114, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5904, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.4967259120673526, | |
| "grad_norm": 0.7665811370018359, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5866, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.5266604303086997, | |
| "grad_norm": 0.8742449064402781, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5892, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.5565949485500468, | |
| "grad_norm": 0.6515777532874945, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5867, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.5865294667913938, | |
| "grad_norm": 0.7364233764205356, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5871, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.616463985032741, | |
| "grad_norm": 0.6869489380724798, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5907, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.646398503274088, | |
| "grad_norm": 0.6905962679241299, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5896, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.6763330215154348, | |
| "grad_norm": 0.722590813324787, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5795, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.706267539756782, | |
| "grad_norm": 0.7155444269064662, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5812, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.736202057998129, | |
| "grad_norm": 0.6934837112832971, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5821, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.766136576239476, | |
| "grad_norm": 0.6890374087051357, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5851, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.7960710944808231, | |
| "grad_norm": 0.6987803144035127, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5857, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.8260056127221702, | |
| "grad_norm": 0.8108212865561982, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5829, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.8559401309635173, | |
| "grad_norm": 0.6612306296879438, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5807, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.8858746492048644, | |
| "grad_norm": 0.7409886326024834, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5829, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.9158091674462114, | |
| "grad_norm": 0.9188602740383207, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5842, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.9457436856875585, | |
| "grad_norm": 0.730641006811515, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5823, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.9756782039289056, | |
| "grad_norm": 0.6773026445013379, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5868, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.999625818521983, | |
| "eval_loss": 0.6162874102592468, | |
| "eval_runtime": 518.1783, | |
| "eval_samples_per_second": 17.37, | |
| "eval_steps_per_second": 0.544, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 2.0056127221702527, | |
| "grad_norm": 0.9331327758390977, | |
| "learning_rate": 2e-06, | |
| "loss": 0.6381, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.0355472404115997, | |
| "grad_norm": 0.7895988754181943, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5476, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.065481758652947, | |
| "grad_norm": 0.8235146344949044, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5451, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.095416276894294, | |
| "grad_norm": 0.7267162587943428, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5498, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.125350795135641, | |
| "grad_norm": 0.7345843211419183, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5495, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.155285313376988, | |
| "grad_norm": 0.6908318018947618, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5524, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.185219831618335, | |
| "grad_norm": 0.6981065999228409, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5516, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.2151543498596817, | |
| "grad_norm": 0.7445453069101049, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5517, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.245088868101029, | |
| "grad_norm": 0.7278674464439252, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5538, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.275023386342376, | |
| "grad_norm": 0.6879321927261636, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5478, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.304957904583723, | |
| "grad_norm": 0.7343459951201352, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5538, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.33489242282507, | |
| "grad_norm": 0.7607710355221491, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5481, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.364826941066417, | |
| "grad_norm": 0.7417516698375253, | |
| "learning_rate": 2e-06, | |
| "loss": 0.549, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.394761459307764, | |
| "grad_norm": 0.6975464703626868, | |
| "learning_rate": 2e-06, | |
| "loss": 0.552, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.4246959775491113, | |
| "grad_norm": 0.7361057536866448, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5593, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.4546304957904583, | |
| "grad_norm": 0.7452745025276496, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5552, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.4845650140318054, | |
| "grad_norm": 0.7760152871621997, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5542, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.5144995322731525, | |
| "grad_norm": 0.7081762034349137, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5512, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.5444340505144996, | |
| "grad_norm": 0.7832350493954435, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5545, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.5743685687558466, | |
| "grad_norm": 0.767598031204084, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5527, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.6043030869971937, | |
| "grad_norm": 0.6862395207363299, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5487, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.634237605238541, | |
| "grad_norm": 0.6888763524013458, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5478, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.664172123479888, | |
| "grad_norm": 0.8203400036669106, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5534, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.694106641721235, | |
| "grad_norm": 0.8260483366154581, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5561, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.724041159962582, | |
| "grad_norm": 0.7295963125166559, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5488, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.753975678203929, | |
| "grad_norm": 0.6857495144156721, | |
| "learning_rate": 2e-06, | |
| "loss": 0.55, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.7839101964452757, | |
| "grad_norm": 0.7241224249227611, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5554, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.8138447146866232, | |
| "grad_norm": 0.6688797316878076, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5544, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.84377923292797, | |
| "grad_norm": 0.7283714304791777, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5526, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.8737137511693174, | |
| "grad_norm": 0.7216563707506914, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5548, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.903648269410664, | |
| "grad_norm": 0.7405760586708118, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5491, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.933582787652011, | |
| "grad_norm": 0.7101741115648686, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5552, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.963517305893358, | |
| "grad_norm": 0.687213640945178, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5568, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.9934518241347052, | |
| "grad_norm": 0.7118013404841623, | |
| "learning_rate": 2e-06, | |
| "loss": 0.5572, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.999438727782975, | |
| "eval_loss": 0.6159842014312744, | |
| "eval_runtime": 517.5485, | |
| "eval_samples_per_second": 17.392, | |
| "eval_steps_per_second": 0.545, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 2.999438727782975, | |
| "step": 1002, | |
| "total_flos": 3818092983484416.0, | |
| "train_loss": 0.5959667034015922, | |
| "train_runtime": 91515.2633, | |
| "train_samples_per_second": 5.606, | |
| "train_steps_per_second": 0.011 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1002, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3818092983484416.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |