| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.8989712476919018, | |
| "eval_steps": 30, | |
| "global_step": 900, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.021102611448166712, | |
| "grad_norm": 5.49099588394165, | |
| "learning_rate": 1.8947368421052634e-05, | |
| "loss": 2.4799, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.042205222896333425, | |
| "grad_norm": 0.7057839035987854, | |
| "learning_rate": 4e-05, | |
| "loss": 0.9017, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06330783434450013, | |
| "grad_norm": 0.7793235778808594, | |
| "learning_rate": 6.105263157894737e-05, | |
| "loss": 0.5561, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06330783434450013, | |
| "eval_loss": 0.49612972140312195, | |
| "eval_runtime": 44.7367, | |
| "eval_samples_per_second": 4.471, | |
| "eval_steps_per_second": 4.471, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.08441044579266685, | |
| "grad_norm": 0.6479541063308716, | |
| "learning_rate": 8.210526315789474e-05, | |
| "loss": 0.4088, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.10551305724083355, | |
| "grad_norm": 0.6190423965454102, | |
| "learning_rate": 0.00010315789473684211, | |
| "loss": 0.3332, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.12661566868900026, | |
| "grad_norm": 0.48822861909866333, | |
| "learning_rate": 0.00012421052631578949, | |
| "loss": 0.2752, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.12661566868900026, | |
| "eval_loss": 0.25475138425827026, | |
| "eval_runtime": 44.1798, | |
| "eval_samples_per_second": 4.527, | |
| "eval_steps_per_second": 4.527, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.14771828013716698, | |
| "grad_norm": 0.3956296443939209, | |
| "learning_rate": 0.00014526315789473686, | |
| "loss": 0.2283, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1688208915853337, | |
| "grad_norm": 0.6851626038551331, | |
| "learning_rate": 0.00016631578947368423, | |
| "loss": 0.2017, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.1899235030335004, | |
| "grad_norm": 2.741124153137207, | |
| "learning_rate": 0.0001873684210526316, | |
| "loss": 0.1877, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1899235030335004, | |
| "eval_loss": 0.19585472345352173, | |
| "eval_runtime": 44.05, | |
| "eval_samples_per_second": 4.54, | |
| "eval_steps_per_second": 4.54, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2110261144816671, | |
| "grad_norm": 0.4686296880245209, | |
| "learning_rate": 0.00019998914864890175, | |
| "loss": 0.1862, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.23212872592983383, | |
| "grad_norm": 0.5210604071617126, | |
| "learning_rate": 0.0001998670979935533, | |
| "loss": 0.1754, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.2532313373780005, | |
| "grad_norm": 0.2621477246284485, | |
| "learning_rate": 0.00019960959858204754, | |
| "loss": 0.1767, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2532313373780005, | |
| "eval_loss": 0.16388529539108276, | |
| "eval_runtime": 44.3724, | |
| "eval_samples_per_second": 4.507, | |
| "eval_steps_per_second": 4.507, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.27433394882616724, | |
| "grad_norm": 0.3740817904472351, | |
| "learning_rate": 0.00019921699965828662, | |
| "loss": 0.1666, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.29543656027433396, | |
| "grad_norm": 0.5232918858528137, | |
| "learning_rate": 0.00019868983370030348, | |
| "loss": 0.1624, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3165391717225007, | |
| "grad_norm": 0.2019444853067398, | |
| "learning_rate": 0.00019802881569806706, | |
| "loss": 0.1647, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3165391717225007, | |
| "eval_loss": 0.15114803612232208, | |
| "eval_runtime": 44.2313, | |
| "eval_samples_per_second": 4.522, | |
| "eval_steps_per_second": 4.522, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3376417831706674, | |
| "grad_norm": 0.23078428208827972, | |
| "learning_rate": 0.00019723484218374865, | |
| "loss": 0.142, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.35874439461883406, | |
| "grad_norm": 0.15399664640426636, | |
| "learning_rate": 0.00019630899001576405, | |
| "loss": 0.1472, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3798470060670008, | |
| "grad_norm": 0.21795395016670227, | |
| "learning_rate": 0.0001952525149182412, | |
| "loss": 0.1511, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3798470060670008, | |
| "eval_loss": 0.14088743925094604, | |
| "eval_runtime": 44.23, | |
| "eval_samples_per_second": 4.522, | |
| "eval_steps_per_second": 4.522, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4009496175151675, | |
| "grad_norm": 0.2159528136253357, | |
| "learning_rate": 0.00019406684977789395, | |
| "loss": 0.1426, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4220522289633342, | |
| "grad_norm": 0.154087632894516, | |
| "learning_rate": 0.00019275360270061217, | |
| "loss": 0.1469, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.44315484041150094, | |
| "grad_norm": 0.1741837114095688, | |
| "learning_rate": 0.0001913145548304034, | |
| "loss": 0.139, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.44315484041150094, | |
| "eval_loss": 0.13762199878692627, | |
| "eval_runtime": 44.3745, | |
| "eval_samples_per_second": 4.507, | |
| "eval_steps_per_second": 4.507, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.46425745185966766, | |
| "grad_norm": 0.14827653765678406, | |
| "learning_rate": 0.00018975165793364503, | |
| "loss": 0.1391, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.4853600633078343, | |
| "grad_norm": 0.152383491396904, | |
| "learning_rate": 0.00018806703175192283, | |
| "loss": 0.1418, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.506462674756001, | |
| "grad_norm": 0.2013552486896515, | |
| "learning_rate": 0.0001862629611270464, | |
| "loss": 0.1442, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.506462674756001, | |
| "eval_loss": 0.136888787150383, | |
| "eval_runtime": 44.4633, | |
| "eval_samples_per_second": 4.498, | |
| "eval_steps_per_second": 4.498, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5275652862041678, | |
| "grad_norm": 0.16066157817840576, | |
| "learning_rate": 0.00018434189290214106, | |
| "loss": 0.1424, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5486678976523345, | |
| "grad_norm": 0.1520007848739624, | |
| "learning_rate": 0.00018230643260301838, | |
| "loss": 0.1608, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5697705091005012, | |
| "grad_norm": 0.1666969507932663, | |
| "learning_rate": 0.00018015934090432757, | |
| "loss": 0.1342, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5697705091005012, | |
| "eval_loss": 0.13377884030342102, | |
| "eval_runtime": 44.2414, | |
| "eval_samples_per_second": 4.521, | |
| "eval_steps_per_second": 4.521, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5908731205486679, | |
| "grad_norm": 0.1368287354707718, | |
| "learning_rate": 0.00017790352988527984, | |
| "loss": 0.1367, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6119757319968346, | |
| "grad_norm": 0.13659009337425232, | |
| "learning_rate": 0.000175542059080024, | |
| "loss": 0.14, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6330783434450014, | |
| "grad_norm": 0.19124054908752441, | |
| "learning_rate": 0.00017307813132803066, | |
| "loss": 0.1403, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6330783434450014, | |
| "eval_loss": 0.13211439549922943, | |
| "eval_runtime": 44.7668, | |
| "eval_samples_per_second": 4.468, | |
| "eval_steps_per_second": 4.468, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6541809548931681, | |
| "grad_norm": 0.14924179017543793, | |
| "learning_rate": 0.0001705150884301129, | |
| "loss": 0.1317, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6752835663413348, | |
| "grad_norm": 0.1274843066930771, | |
| "learning_rate": 0.00016785640661597467, | |
| "loss": 0.1477, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6963861777895014, | |
| "grad_norm": 0.15062034130096436, | |
| "learning_rate": 0.00016510569182943524, | |
| "loss": 0.1367, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6963861777895014, | |
| "eval_loss": 0.13036254048347473, | |
| "eval_runtime": 44.6273, | |
| "eval_samples_per_second": 4.482, | |
| "eval_steps_per_second": 4.482, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7174887892376681, | |
| "grad_norm": 0.13609440624713898, | |
| "learning_rate": 0.00016226667483772275, | |
| "loss": 0.1294, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7385914006858348, | |
| "grad_norm": 0.12146595865488052, | |
| "learning_rate": 0.00015934320617147214, | |
| "loss": 0.1356, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7596940121340016, | |
| "grad_norm": 0.33670490980148315, | |
| "learning_rate": 0.0001563392509022882, | |
| "loss": 0.1348, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7596940121340016, | |
| "eval_loss": 0.1280602663755417, | |
| "eval_runtime": 44.5897, | |
| "eval_samples_per_second": 4.485, | |
| "eval_steps_per_second": 4.485, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7807966235821683, | |
| "grad_norm": 0.11047045141458511, | |
| "learning_rate": 0.00015325888326495833, | |
| "loss": 0.1306, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.801899235030335, | |
| "grad_norm": 0.24782629311084747, | |
| "learning_rate": 0.0001501062811316082, | |
| "loss": 0.1421, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8230018464785017, | |
| "grad_norm": 0.15822850167751312, | |
| "learning_rate": 0.0001468857203452953, | |
| "loss": 0.1345, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.8230018464785017, | |
| "eval_loss": 0.12854796648025513, | |
| "eval_runtime": 44.5908, | |
| "eval_samples_per_second": 4.485, | |
| "eval_steps_per_second": 4.485, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.8441044579266684, | |
| "grad_norm": 0.17262572050094604, | |
| "learning_rate": 0.00014360156892072518, | |
| "loss": 0.138, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8652070693748352, | |
| "grad_norm": 0.16533038020133972, | |
| "learning_rate": 0.00014025828111995635, | |
| "loss": 0.13, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8863096808230019, | |
| "grad_norm": 0.10585814714431763, | |
| "learning_rate": 0.00013686039141112886, | |
| "loss": 0.1267, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.8863096808230019, | |
| "eval_loss": 0.12870755791664124, | |
| "eval_runtime": 44.5358, | |
| "eval_samples_per_second": 4.491, | |
| "eval_steps_per_second": 4.491, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.9074122922711686, | |
| "grad_norm": 0.12024762481451035, | |
| "learning_rate": 0.00013341250831840998, | |
| "loss": 0.1394, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9285149037193353, | |
| "grad_norm": 0.11544947326183319, | |
| "learning_rate": 0.0001299193081714986, | |
| "loss": 0.1409, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9496175151675019, | |
| "grad_norm": 0.2589576840400696, | |
| "learning_rate": 0.0001263855287631654, | |
| "loss": 0.1352, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9496175151675019, | |
| "eval_loss": 0.12487487494945526, | |
| "eval_runtime": 44.5323, | |
| "eval_samples_per_second": 4.491, | |
| "eval_steps_per_second": 4.491, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9707201266156686, | |
| "grad_norm": 0.13384610414505005, | |
| "learning_rate": 0.00012281596292343163, | |
| "loss": 0.1231, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9918227380638354, | |
| "grad_norm": 0.10393290221691132, | |
| "learning_rate": 0.00011921545201910099, | |
| "loss": 0.1347, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.0126615668689, | |
| "grad_norm": 0.11627262830734253, | |
| "learning_rate": 0.00011558887938746194, | |
| "loss": 0.1305, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.0126615668689, | |
| "eval_loss": 0.12428628653287888, | |
| "eval_runtime": 44.5885, | |
| "eval_samples_per_second": 4.485, | |
| "eval_steps_per_second": 4.485, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.0337641783170668, | |
| "grad_norm": 0.13240262866020203, | |
| "learning_rate": 0.00011194116371306573, | |
| "loss": 0.1308, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.0548667897652335, | |
| "grad_norm": 0.15070736408233643, | |
| "learning_rate": 0.00010827725235656294, | |
| "loss": 0.13, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.0759694012134002, | |
| "grad_norm": 0.1023663654923439, | |
| "learning_rate": 0.00010460211464464757, | |
| "loss": 0.1291, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.0759694012134002, | |
| "eval_loss": 0.12311653047800064, | |
| "eval_runtime": 44.4742, | |
| "eval_samples_per_second": 4.497, | |
| "eval_steps_per_second": 4.497, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.097072012661567, | |
| "grad_norm": 0.11538528650999069, | |
| "learning_rate": 0.00010092073513020834, | |
| "loss": 0.1314, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.1181746241097337, | |
| "grad_norm": 0.09773898124694824, | |
| "learning_rate": 9.723810683182883e-05, | |
| "loss": 0.1238, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.1392772355579002, | |
| "grad_norm": 0.1469469964504242, | |
| "learning_rate": 9.355922446180593e-05, | |
| "loss": 0.1302, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.1392772355579002, | |
| "eval_loss": 0.12340555340051651, | |
| "eval_runtime": 44.7791, | |
| "eval_samples_per_second": 4.466, | |
| "eval_steps_per_second": 4.466, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.160379847006067, | |
| "grad_norm": 0.08568503707647324, | |
| "learning_rate": 8.988907765187104e-05, | |
| "loss": 0.1291, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.1814824584542336, | |
| "grad_norm": 0.11329666525125504, | |
| "learning_rate": 8.623264418580185e-05, | |
| "loss": 0.1243, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.2025850699024003, | |
| "grad_norm": 0.20136046409606934, | |
| "learning_rate": 8.259488324810359e-05, | |
| "loss": 0.1236, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.2025850699024003, | |
| "eval_loss": 0.12261851131916046, | |
| "eval_runtime": 44.5714, | |
| "eval_samples_per_second": 4.487, | |
| "eval_steps_per_second": 4.487, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.223687681350567, | |
| "grad_norm": 0.09609243273735046, | |
| "learning_rate": 7.89807286979162e-05, | |
| "loss": 0.125, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.2447902927987338, | |
| "grad_norm": 0.15356111526489258, | |
| "learning_rate": 7.539508237726986e-05, | |
| "loss": 0.1268, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.2658929042469005, | |
| "grad_norm": 0.09328490495681763, | |
| "learning_rate": 7.184280746276537e-05, | |
| "loss": 0.1239, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.2658929042469005, | |
| "eval_loss": 0.1222267746925354, | |
| "eval_runtime": 44.5091, | |
| "eval_samples_per_second": 4.493, | |
| "eval_steps_per_second": 4.493, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.2869955156950672, | |
| "grad_norm": 0.10134255886077881, | |
| "learning_rate": 6.832872186969583e-05, | |
| "loss": 0.122, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.308098127143234, | |
| "grad_norm": 0.08854757249355316, | |
| "learning_rate": 6.485759171755574e-05, | |
| "loss": 0.1271, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.3292007385914006, | |
| "grad_norm": 0.12832631170749664, | |
| "learning_rate": 6.143412486580051e-05, | |
| "loss": 0.1243, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.3292007385914006, | |
| "eval_loss": 0.1208844780921936, | |
| "eval_runtime": 44.7617, | |
| "eval_samples_per_second": 4.468, | |
| "eval_steps_per_second": 4.468, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.3503033500395674, | |
| "grad_norm": 0.10490237921476364, | |
| "learning_rate": 5.8062964528623096e-05, | |
| "loss": 0.1233, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.371405961487734, | |
| "grad_norm": 0.09632379561662674, | |
| "learning_rate": 5.474868297740874e-05, | |
| "loss": 0.1246, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.3925085729359008, | |
| "grad_norm": 0.09513814002275467, | |
| "learning_rate": 5.149577533940836e-05, | |
| "loss": 0.125, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.3925085729359008, | |
| "eval_loss": 0.12093473225831985, | |
| "eval_runtime": 44.6914, | |
| "eval_samples_per_second": 4.475, | |
| "eval_steps_per_second": 4.475, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.4136111843840675, | |
| "grad_norm": 0.08354990929365158, | |
| "learning_rate": 4.8308653501042166e-05, | |
| "loss": 0.1247, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.4347137958322342, | |
| "grad_norm": 0.07765179127454758, | |
| "learning_rate": 4.519164012410171e-05, | |
| "loss": 0.1225, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.455816407280401, | |
| "grad_norm": 0.09267658740282059, | |
| "learning_rate": 4.214896278296646e-05, | |
| "loss": 0.1269, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.455816407280401, | |
| "eval_loss": 0.12080849707126617, | |
| "eval_runtime": 44.6277, | |
| "eval_samples_per_second": 4.482, | |
| "eval_steps_per_second": 4.482, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.4769190187285677, | |
| "grad_norm": 0.10476211458444595, | |
| "learning_rate": 3.9184748230786584e-05, | |
| "loss": 0.1207, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.4980216301767344, | |
| "grad_norm": 0.09438898414373398, | |
| "learning_rate": 3.6303016802408594e-05, | |
| "loss": 0.1243, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.5191242416249011, | |
| "grad_norm": 0.09003426134586334, | |
| "learning_rate": 3.3507676961634796e-05, | |
| "loss": 0.1225, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.5191242416249011, | |
| "eval_loss": 0.11986906081438065, | |
| "eval_runtime": 44.9813, | |
| "eval_samples_per_second": 4.446, | |
| "eval_steps_per_second": 4.446, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.5402268530730678, | |
| "grad_norm": 0.09385745972394943, | |
| "learning_rate": 3.080252000021264e-05, | |
| "loss": 0.1262, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.5613294645212346, | |
| "grad_norm": 0.09874516725540161, | |
| "learning_rate": 2.8191214895743424e-05, | |
| "loss": 0.1195, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.5824320759694013, | |
| "grad_norm": 0.08888901770114899, | |
| "learning_rate": 2.5677303335484025e-05, | |
| "loss": 0.1176, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.5824320759694013, | |
| "eval_loss": 0.11973254382610321, | |
| "eval_runtime": 44.7792, | |
| "eval_samples_per_second": 4.466, | |
| "eval_steps_per_second": 4.466, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.603534687417568, | |
| "grad_norm": 0.08025766164064407, | |
| "learning_rate": 2.3264194912791605e-05, | |
| "loss": 0.1294, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.6246372988657347, | |
| "grad_norm": 0.1188935935497284, | |
| "learning_rate": 2.0955162502726135e-05, | |
| "loss": 0.1186, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.6457399103139014, | |
| "grad_norm": 0.0846625491976738, | |
| "learning_rate": 1.8753337823082084e-05, | |
| "loss": 0.1227, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.6457399103139014, | |
| "eval_loss": 0.11954256147146225, | |
| "eval_runtime": 44.8428, | |
| "eval_samples_per_second": 4.46, | |
| "eval_steps_per_second": 4.46, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.6668425217620682, | |
| "grad_norm": 0.09665997326374054, | |
| "learning_rate": 1.666170718687069e-05, | |
| "loss": 0.121, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.6879451332102349, | |
| "grad_norm": 0.10237396508455276, | |
| "learning_rate": 1.4683107452013223e-05, | |
| "loss": 0.122, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.7090477446584016, | |
| "grad_norm": 0.09061301499605179, | |
| "learning_rate": 1.2820222173738628e-05, | |
| "loss": 0.1133, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.7090477446584016, | |
| "eval_loss": 0.11932696402072906, | |
| "eval_runtime": 44.6177, | |
| "eval_samples_per_second": 4.483, | |
| "eval_steps_per_second": 4.483, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.7301503561065683, | |
| "grad_norm": 0.09903734177350998, | |
| "learning_rate": 1.1075577964904104e-05, | |
| "loss": 0.1259, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.751252967554735, | |
| "grad_norm": 0.0887165367603302, | |
| "learning_rate": 9.451541069175273e-06, | |
| "loss": 0.116, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.7723555790029017, | |
| "grad_norm": 0.09350364655256271, | |
| "learning_rate": 7.950314151713056e-06, | |
| "loss": 0.1241, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.7723555790029017, | |
| "eval_loss": 0.11908172816038132, | |
| "eval_runtime": 44.7048, | |
| "eval_samples_per_second": 4.474, | |
| "eval_steps_per_second": 4.474, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.7934581904510685, | |
| "grad_norm": 0.10343047231435776, | |
| "learning_rate": 6.57393331172097e-06, | |
| "loss": 0.1183, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.8145608018992352, | |
| "grad_norm": 0.08541911840438843, | |
| "learning_rate": 5.324265320903843e-06, | |
| "loss": 0.1258, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.835663413347402, | |
| "grad_norm": 0.10567805916070938, | |
| "learning_rate": 4.203005091583801e-06, | |
| "loss": 0.1204, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.835663413347402, | |
| "eval_loss": 0.11883310228586197, | |
| "eval_runtime": 44.6887, | |
| "eval_samples_per_second": 4.475, | |
| "eval_steps_per_second": 4.475, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.8567660247955684, | |
| "grad_norm": 0.10206745564937592, | |
| "learning_rate": 3.2116733779075094e-06, | |
| "loss": 0.1273, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.8778686362437351, | |
| "grad_norm": 0.09496993571519852, | |
| "learning_rate": 2.351614713262418e-06, | |
| "loss": 0.1239, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.8989712476919018, | |
| "grad_norm": 0.13183720409870148, | |
| "learning_rate": 1.623995586699334e-06, | |
| "loss": 0.122, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.8989712476919018, | |
| "eval_loss": 0.11881602555513382, | |
| "eval_runtime": 44.6717, | |
| "eval_samples_per_second": 4.477, | |
| "eval_steps_per_second": 4.477, | |
| "step": 900 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 948, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.348133794370519e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |