{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 30, "global_step": 948, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021102611448166712, "grad_norm": 5.49099588394165, "learning_rate": 1.8947368421052634e-05, "loss": 2.4799, "step": 10 }, { "epoch": 0.042205222896333425, "grad_norm": 0.7057839035987854, "learning_rate": 4e-05, "loss": 0.9017, "step": 20 }, { "epoch": 0.06330783434450013, "grad_norm": 0.7793235778808594, "learning_rate": 6.105263157894737e-05, "loss": 0.5561, "step": 30 }, { "epoch": 0.06330783434450013, "eval_loss": 0.49612972140312195, "eval_runtime": 44.7367, "eval_samples_per_second": 4.471, "eval_steps_per_second": 4.471, "step": 30 }, { "epoch": 0.08441044579266685, "grad_norm": 0.6479541063308716, "learning_rate": 8.210526315789474e-05, "loss": 0.4088, "step": 40 }, { "epoch": 0.10551305724083355, "grad_norm": 0.6190423965454102, "learning_rate": 0.00010315789473684211, "loss": 0.3332, "step": 50 }, { "epoch": 0.12661566868900026, "grad_norm": 0.48822861909866333, "learning_rate": 0.00012421052631578949, "loss": 0.2752, "step": 60 }, { "epoch": 0.12661566868900026, "eval_loss": 0.25475138425827026, "eval_runtime": 44.1798, "eval_samples_per_second": 4.527, "eval_steps_per_second": 4.527, "step": 60 }, { "epoch": 0.14771828013716698, "grad_norm": 0.3956296443939209, "learning_rate": 0.00014526315789473686, "loss": 0.2283, "step": 70 }, { "epoch": 0.1688208915853337, "grad_norm": 0.6851626038551331, "learning_rate": 0.00016631578947368423, "loss": 0.2017, "step": 80 }, { "epoch": 0.1899235030335004, "grad_norm": 2.741124153137207, "learning_rate": 0.0001873684210526316, "loss": 0.1877, "step": 90 }, { "epoch": 0.1899235030335004, "eval_loss": 0.19585472345352173, "eval_runtime": 44.05, "eval_samples_per_second": 4.54, "eval_steps_per_second": 4.54, "step": 90 }, { "epoch": 0.2110261144816671, "grad_norm": 0.4686296880245209, "learning_rate": 0.00019998914864890175, "loss": 0.1862, "step": 100 }, { "epoch": 0.23212872592983383, "grad_norm": 0.5210604071617126, "learning_rate": 0.0001998670979935533, "loss": 0.1754, "step": 110 }, { "epoch": 0.2532313373780005, "grad_norm": 0.2621477246284485, "learning_rate": 0.00019960959858204754, "loss": 0.1767, "step": 120 }, { "epoch": 0.2532313373780005, "eval_loss": 0.16388529539108276, "eval_runtime": 44.3724, "eval_samples_per_second": 4.507, "eval_steps_per_second": 4.507, "step": 120 }, { "epoch": 0.27433394882616724, "grad_norm": 0.3740817904472351, "learning_rate": 0.00019921699965828662, "loss": 0.1666, "step": 130 }, { "epoch": 0.29543656027433396, "grad_norm": 0.5232918858528137, "learning_rate": 0.00019868983370030348, "loss": 0.1624, "step": 140 }, { "epoch": 0.3165391717225007, "grad_norm": 0.2019444853067398, "learning_rate": 0.00019802881569806706, "loss": 0.1647, "step": 150 }, { "epoch": 0.3165391717225007, "eval_loss": 0.15114803612232208, "eval_runtime": 44.2313, "eval_samples_per_second": 4.522, "eval_steps_per_second": 4.522, "step": 150 }, { "epoch": 0.3376417831706674, "grad_norm": 0.23078428208827972, "learning_rate": 0.00019723484218374865, "loss": 0.142, "step": 160 }, { "epoch": 0.35874439461883406, "grad_norm": 0.15399664640426636, "learning_rate": 0.00019630899001576405, "loss": 0.1472, "step": 170 }, { "epoch": 0.3798470060670008, "grad_norm": 0.21795395016670227, "learning_rate": 0.0001952525149182412, "loss": 0.1511, "step": 180 }, { "epoch": 0.3798470060670008, "eval_loss": 0.14088743925094604, "eval_runtime": 44.23, "eval_samples_per_second": 4.522, "eval_steps_per_second": 4.522, "step": 180 }, { "epoch": 0.4009496175151675, "grad_norm": 0.2159528136253357, "learning_rate": 0.00019406684977789395, "loss": 0.1426, "step": 190 }, { "epoch": 0.4220522289633342, "grad_norm": 0.154087632894516, "learning_rate": 0.00019275360270061217, "loss": 0.1469, "step": 200 }, { "epoch": 0.44315484041150094, "grad_norm": 0.1741837114095688, "learning_rate": 0.0001913145548304034, "loss": 0.139, "step": 210 }, { "epoch": 0.44315484041150094, "eval_loss": 0.13762199878692627, "eval_runtime": 44.3745, "eval_samples_per_second": 4.507, "eval_steps_per_second": 4.507, "step": 210 }, { "epoch": 0.46425745185966766, "grad_norm": 0.14827653765678406, "learning_rate": 0.00018975165793364503, "loss": 0.1391, "step": 220 }, { "epoch": 0.4853600633078343, "grad_norm": 0.152383491396904, "learning_rate": 0.00018806703175192283, "loss": 0.1418, "step": 230 }, { "epoch": 0.506462674756001, "grad_norm": 0.2013552486896515, "learning_rate": 0.0001862629611270464, "loss": 0.1442, "step": 240 }, { "epoch": 0.506462674756001, "eval_loss": 0.136888787150383, "eval_runtime": 44.4633, "eval_samples_per_second": 4.498, "eval_steps_per_second": 4.498, "step": 240 }, { "epoch": 0.5275652862041678, "grad_norm": 0.16066157817840576, "learning_rate": 0.00018434189290214106, "loss": 0.1424, "step": 250 }, { "epoch": 0.5486678976523345, "grad_norm": 0.1520007848739624, "learning_rate": 0.00018230643260301838, "loss": 0.1608, "step": 260 }, { "epoch": 0.5697705091005012, "grad_norm": 0.1666969507932663, "learning_rate": 0.00018015934090432757, "loss": 0.1342, "step": 270 }, { "epoch": 0.5697705091005012, "eval_loss": 0.13377884030342102, "eval_runtime": 44.2414, "eval_samples_per_second": 4.521, "eval_steps_per_second": 4.521, "step": 270 }, { "epoch": 0.5908731205486679, "grad_norm": 0.1368287354707718, "learning_rate": 0.00017790352988527984, "loss": 0.1367, "step": 280 }, { "epoch": 0.6119757319968346, "grad_norm": 0.13659009337425232, "learning_rate": 0.000175542059080024, "loss": 0.14, "step": 290 }, { "epoch": 0.6330783434450014, "grad_norm": 0.19124054908752441, "learning_rate": 0.00017307813132803066, "loss": 0.1403, "step": 300 }, { "epoch": 0.6330783434450014, "eval_loss": 0.13211439549922943, "eval_runtime": 44.7668, "eval_samples_per_second": 4.468, "eval_steps_per_second": 4.468, "step": 300 }, { "epoch": 0.6541809548931681, "grad_norm": 0.14924179017543793, "learning_rate": 0.0001705150884301129, "loss": 0.1317, "step": 310 }, { "epoch": 0.6752835663413348, "grad_norm": 0.1274843066930771, "learning_rate": 0.00016785640661597467, "loss": 0.1477, "step": 320 }, { "epoch": 0.6963861777895014, "grad_norm": 0.15062034130096436, "learning_rate": 0.00016510569182943524, "loss": 0.1367, "step": 330 }, { "epoch": 0.6963861777895014, "eval_loss": 0.13036254048347473, "eval_runtime": 44.6273, "eval_samples_per_second": 4.482, "eval_steps_per_second": 4.482, "step": 330 }, { "epoch": 0.7174887892376681, "grad_norm": 0.13609440624713898, "learning_rate": 0.00016226667483772275, "loss": 0.1294, "step": 340 }, { "epoch": 0.7385914006858348, "grad_norm": 0.12146595865488052, "learning_rate": 0.00015934320617147214, "loss": 0.1356, "step": 350 }, { "epoch": 0.7596940121340016, "grad_norm": 0.33670490980148315, "learning_rate": 0.0001563392509022882, "loss": 0.1348, "step": 360 }, { "epoch": 0.7596940121340016, "eval_loss": 0.1280602663755417, "eval_runtime": 44.5897, "eval_samples_per_second": 4.485, "eval_steps_per_second": 4.485, "step": 360 }, { "epoch": 0.7807966235821683, "grad_norm": 0.11047045141458511, "learning_rate": 0.00015325888326495833, "loss": 0.1306, "step": 370 }, { "epoch": 0.801899235030335, "grad_norm": 0.24782629311084747, "learning_rate": 0.0001501062811316082, "loss": 0.1421, "step": 380 }, { "epoch": 0.8230018464785017, "grad_norm": 0.15822850167751312, "learning_rate": 0.0001468857203452953, "loss": 0.1345, "step": 390 }, { "epoch": 0.8230018464785017, "eval_loss": 0.12854796648025513, "eval_runtime": 44.5908, "eval_samples_per_second": 4.485, "eval_steps_per_second": 4.485, "step": 390 }, { "epoch": 0.8441044579266684, "grad_norm": 0.17262572050094604, "learning_rate": 0.00014360156892072518, "loss": 0.138, "step": 400 }, { "epoch": 0.8652070693748352, "grad_norm": 0.16533038020133972, "learning_rate": 0.00014025828111995635, "loss": 0.13, "step": 410 }, { "epoch": 0.8863096808230019, "grad_norm": 0.10585814714431763, "learning_rate": 0.00013686039141112886, "loss": 0.1267, "step": 420 }, { "epoch": 0.8863096808230019, "eval_loss": 0.12870755791664124, "eval_runtime": 44.5358, "eval_samples_per_second": 4.491, "eval_steps_per_second": 4.491, "step": 420 }, { "epoch": 0.9074122922711686, "grad_norm": 0.12024762481451035, "learning_rate": 0.00013341250831840998, "loss": 0.1394, "step": 430 }, { "epoch": 0.9285149037193353, "grad_norm": 0.11544947326183319, "learning_rate": 0.0001299193081714986, "loss": 0.1409, "step": 440 }, { "epoch": 0.9496175151675019, "grad_norm": 0.2589576840400696, "learning_rate": 0.0001263855287631654, "loss": 0.1352, "step": 450 }, { "epoch": 0.9496175151675019, "eval_loss": 0.12487487494945526, "eval_runtime": 44.5323, "eval_samples_per_second": 4.491, "eval_steps_per_second": 4.491, "step": 450 }, { "epoch": 0.9707201266156686, "grad_norm": 0.13384610414505005, "learning_rate": 0.00012281596292343163, "loss": 0.1231, "step": 460 }, { "epoch": 0.9918227380638354, "grad_norm": 0.10393290221691132, "learning_rate": 0.00011921545201910099, "loss": 0.1347, "step": 470 }, { "epoch": 1.0126615668689, "grad_norm": 0.11627262830734253, "learning_rate": 0.00011558887938746194, "loss": 0.1305, "step": 480 }, { "epoch": 1.0126615668689, "eval_loss": 0.12428628653287888, "eval_runtime": 44.5885, "eval_samples_per_second": 4.485, "eval_steps_per_second": 4.485, "step": 480 }, { "epoch": 1.0337641783170668, "grad_norm": 0.13240262866020203, "learning_rate": 0.00011194116371306573, "loss": 0.1308, "step": 490 }, { "epoch": 1.0548667897652335, "grad_norm": 0.15070736408233643, "learning_rate": 0.00010827725235656294, "loss": 0.13, "step": 500 }, { "epoch": 1.0759694012134002, "grad_norm": 0.1023663654923439, "learning_rate": 0.00010460211464464757, "loss": 0.1291, "step": 510 }, { "epoch": 1.0759694012134002, "eval_loss": 0.12311653047800064, "eval_runtime": 44.4742, "eval_samples_per_second": 4.497, "eval_steps_per_second": 4.497, "step": 510 }, { "epoch": 1.097072012661567, "grad_norm": 0.11538528650999069, "learning_rate": 0.00010092073513020834, "loss": 0.1314, "step": 520 }, { "epoch": 1.1181746241097337, "grad_norm": 0.09773898124694824, "learning_rate": 9.723810683182883e-05, "loss": 0.1238, "step": 530 }, { "epoch": 1.1392772355579002, "grad_norm": 0.1469469964504242, "learning_rate": 9.355922446180593e-05, "loss": 0.1302, "step": 540 }, { "epoch": 1.1392772355579002, "eval_loss": 0.12340555340051651, "eval_runtime": 44.7791, "eval_samples_per_second": 4.466, "eval_steps_per_second": 4.466, "step": 540 }, { "epoch": 1.160379847006067, "grad_norm": 0.08568503707647324, "learning_rate": 8.988907765187104e-05, "loss": 0.1291, "step": 550 }, { "epoch": 1.1814824584542336, "grad_norm": 0.11329666525125504, "learning_rate": 8.623264418580185e-05, "loss": 0.1243, "step": 560 }, { "epoch": 1.2025850699024003, "grad_norm": 0.20136046409606934, "learning_rate": 8.259488324810359e-05, "loss": 0.1236, "step": 570 }, { "epoch": 1.2025850699024003, "eval_loss": 0.12261851131916046, "eval_runtime": 44.5714, "eval_samples_per_second": 4.487, "eval_steps_per_second": 4.487, "step": 570 }, { "epoch": 1.223687681350567, "grad_norm": 0.09609243273735046, "learning_rate": 7.89807286979162e-05, "loss": 0.125, "step": 580 }, { "epoch": 1.2447902927987338, "grad_norm": 0.15356111526489258, "learning_rate": 7.539508237726986e-05, "loss": 0.1268, "step": 590 }, { "epoch": 1.2658929042469005, "grad_norm": 0.09328490495681763, "learning_rate": 7.184280746276537e-05, "loss": 0.1239, "step": 600 }, { "epoch": 1.2658929042469005, "eval_loss": 0.1222267746925354, "eval_runtime": 44.5091, "eval_samples_per_second": 4.493, "eval_steps_per_second": 4.493, "step": 600 }, { "epoch": 1.2869955156950672, "grad_norm": 0.10134255886077881, "learning_rate": 6.832872186969583e-05, "loss": 0.122, "step": 610 }, { "epoch": 1.308098127143234, "grad_norm": 0.08854757249355316, "learning_rate": 6.485759171755574e-05, "loss": 0.1271, "step": 620 }, { "epoch": 1.3292007385914006, "grad_norm": 0.12832631170749664, "learning_rate": 6.143412486580051e-05, "loss": 0.1243, "step": 630 }, { "epoch": 1.3292007385914006, "eval_loss": 0.1208844780921936, "eval_runtime": 44.7617, "eval_samples_per_second": 4.468, "eval_steps_per_second": 4.468, "step": 630 }, { "epoch": 1.3503033500395674, "grad_norm": 0.10490237921476364, "learning_rate": 5.8062964528623096e-05, "loss": 0.1233, "step": 640 }, { "epoch": 1.371405961487734, "grad_norm": 0.09632379561662674, "learning_rate": 5.474868297740874e-05, "loss": 0.1246, "step": 650 }, { "epoch": 1.3925085729359008, "grad_norm": 0.09513814002275467, "learning_rate": 5.149577533940836e-05, "loss": 0.125, "step": 660 }, { "epoch": 1.3925085729359008, "eval_loss": 0.12093473225831985, "eval_runtime": 44.6914, "eval_samples_per_second": 4.475, "eval_steps_per_second": 4.475, "step": 660 }, { "epoch": 1.4136111843840675, "grad_norm": 0.08354990929365158, "learning_rate": 4.8308653501042166e-05, "loss": 0.1247, "step": 670 }, { "epoch": 1.4347137958322342, "grad_norm": 0.07765179127454758, "learning_rate": 4.519164012410171e-05, "loss": 0.1225, "step": 680 }, { "epoch": 1.455816407280401, "grad_norm": 0.09267658740282059, "learning_rate": 4.214896278296646e-05, "loss": 0.1269, "step": 690 }, { "epoch": 1.455816407280401, "eval_loss": 0.12080849707126617, "eval_runtime": 44.6277, "eval_samples_per_second": 4.482, "eval_steps_per_second": 4.482, "step": 690 }, { "epoch": 1.4769190187285677, "grad_norm": 0.10476211458444595, "learning_rate": 3.9184748230786584e-05, "loss": 0.1207, "step": 700 }, { "epoch": 1.4980216301767344, "grad_norm": 0.09438898414373398, "learning_rate": 3.6303016802408594e-05, "loss": 0.1243, "step": 710 }, { "epoch": 1.5191242416249011, "grad_norm": 0.09003426134586334, "learning_rate": 3.3507676961634796e-05, "loss": 0.1225, "step": 720 }, { "epoch": 1.5191242416249011, "eval_loss": 0.11986906081438065, "eval_runtime": 44.9813, "eval_samples_per_second": 4.446, "eval_steps_per_second": 4.446, "step": 720 }, { "epoch": 1.5402268530730678, "grad_norm": 0.09385745972394943, "learning_rate": 3.080252000021264e-05, "loss": 0.1262, "step": 730 }, { "epoch": 1.5613294645212346, "grad_norm": 0.09874516725540161, "learning_rate": 2.8191214895743424e-05, "loss": 0.1195, "step": 740 }, { "epoch": 1.5824320759694013, "grad_norm": 0.08888901770114899, "learning_rate": 2.5677303335484025e-05, "loss": 0.1176, "step": 750 }, { "epoch": 1.5824320759694013, "eval_loss": 0.11973254382610321, "eval_runtime": 44.7792, "eval_samples_per_second": 4.466, "eval_steps_per_second": 4.466, "step": 750 }, { "epoch": 1.603534687417568, "grad_norm": 0.08025766164064407, "learning_rate": 2.3264194912791605e-05, "loss": 0.1294, "step": 760 }, { "epoch": 1.6246372988657347, "grad_norm": 0.1188935935497284, "learning_rate": 2.0955162502726135e-05, "loss": 0.1186, "step": 770 }, { "epoch": 1.6457399103139014, "grad_norm": 0.0846625491976738, "learning_rate": 1.8753337823082084e-05, "loss": 0.1227, "step": 780 }, { "epoch": 1.6457399103139014, "eval_loss": 0.11954256147146225, "eval_runtime": 44.8428, "eval_samples_per_second": 4.46, "eval_steps_per_second": 4.46, "step": 780 }, { "epoch": 1.6668425217620682, "grad_norm": 0.09665997326374054, "learning_rate": 1.666170718687069e-05, "loss": 0.121, "step": 790 }, { "epoch": 1.6879451332102349, "grad_norm": 0.10237396508455276, "learning_rate": 1.4683107452013223e-05, "loss": 0.122, "step": 800 }, { "epoch": 1.7090477446584016, "grad_norm": 0.09061301499605179, "learning_rate": 1.2820222173738628e-05, "loss": 0.1133, "step": 810 }, { "epoch": 1.7090477446584016, "eval_loss": 0.11932696402072906, "eval_runtime": 44.6177, "eval_samples_per_second": 4.483, "eval_steps_per_second": 4.483, "step": 810 }, { "epoch": 1.7301503561065683, "grad_norm": 0.09903734177350998, "learning_rate": 1.1075577964904104e-05, "loss": 0.1259, "step": 820 }, { "epoch": 1.751252967554735, "grad_norm": 0.0887165367603302, "learning_rate": 9.451541069175273e-06, "loss": 0.116, "step": 830 }, { "epoch": 1.7723555790029017, "grad_norm": 0.09350364655256271, "learning_rate": 7.950314151713056e-06, "loss": 0.1241, "step": 840 }, { "epoch": 1.7723555790029017, "eval_loss": 0.11908172816038132, "eval_runtime": 44.7048, "eval_samples_per_second": 4.474, "eval_steps_per_second": 4.474, "step": 840 }, { "epoch": 1.7934581904510685, "grad_norm": 0.10343047231435776, "learning_rate": 6.57393331172097e-06, "loss": 0.1183, "step": 850 }, { "epoch": 1.8145608018992352, "grad_norm": 0.08541911840438843, "learning_rate": 5.324265320903843e-06, "loss": 0.1258, "step": 860 }, { "epoch": 1.835663413347402, "grad_norm": 0.10567805916070938, "learning_rate": 4.203005091583801e-06, "loss": 0.1204, "step": 870 }, { "epoch": 1.835663413347402, "eval_loss": 0.11883310228586197, "eval_runtime": 44.6887, "eval_samples_per_second": 4.475, "eval_steps_per_second": 4.475, "step": 870 }, { "epoch": 1.8567660247955684, "grad_norm": 0.10206745564937592, "learning_rate": 3.2116733779075094e-06, "loss": 0.1273, "step": 880 }, { "epoch": 1.8778686362437351, "grad_norm": 0.09496993571519852, "learning_rate": 2.351614713262418e-06, "loss": 0.1239, "step": 890 }, { "epoch": 1.8989712476919018, "grad_norm": 0.13183720409870148, "learning_rate": 1.623995586699334e-06, "loss": 0.122, "step": 900 }, { "epoch": 1.8989712476919018, "eval_loss": 0.11881602555513382, "eval_runtime": 44.6717, "eval_samples_per_second": 4.477, "eval_steps_per_second": 4.477, "step": 900 }, { "epoch": 1.9200738591400686, "grad_norm": 0.09778941422700882, "learning_rate": 1.029802860834983e-06, "loss": 0.1264, "step": 910 }, { "epoch": 1.9411764705882353, "grad_norm": 0.10276590287685394, "learning_rate": 5.698424333799413e-07, "loss": 0.1205, "step": 920 }, { "epoch": 1.962279082036402, "grad_norm": 0.08584799617528915, "learning_rate": 2.4473814410759245e-07, "loss": 0.1253, "step": 930 }, { "epoch": 1.962279082036402, "eval_loss": 0.11866004765033722, "eval_runtime": 44.6565, "eval_samples_per_second": 4.479, "eval_steps_per_second": 4.479, "step": 930 }, { "epoch": 1.9833816934845687, "grad_norm": 0.08022117614746094, "learning_rate": 5.4930928746410596e-08, "loss": 0.1215, "step": 940 } ], "logging_steps": 10, "max_steps": 948, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.42087045072171e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }