{
  "best_global_step": 26000,
  "best_metric": 1.9807677268981934,
  "best_model_checkpoint": "./medical_qwen_finetuned_improved/checkpoint-26000",
  "epoch": 7.9997372273734,
  "eval_steps": 100,
  "global_step": 26632,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00750778933143136,
      "grad_norm": 5.094513416290283,
      "learning_rate": 1.6893477240732053e-07,
      "loss": 3.2412,
      "step": 25
    },
    {
      "epoch": 0.01501557866286272,
      "grad_norm": 3.9722039699554443,
      "learning_rate": 3.4490849366494603e-07,
      "loss": 3.2095,
      "step": 50
    },
    {
      "epoch": 0.02252336799429408,
      "grad_norm": 3.527215003967285,
      "learning_rate": 5.208822149225716e-07,
      "loss": 3.1277,
      "step": 75
    },
    {
      "epoch": 0.03003115732572544,
      "grad_norm": 2.680919647216797,
      "learning_rate": 6.968559361801971e-07,
      "loss": 3.0231,
      "step": 100
    },
    {
      "epoch": 0.03003115732572544,
      "eval_loss": 2.9903318881988525,
      "eval_runtime": 319.3002,
      "eval_samples_per_second": 17.567,
      "eval_steps_per_second": 4.394,
      "step": 100
    },
    {
      "epoch": 0.0375389466571568,
      "grad_norm": 2.085681438446045,
      "learning_rate": 8.728296574378227e-07,
      "loss": 2.9442,
      "step": 125
    },
    {
      "epoch": 0.04504673598858816,
      "grad_norm": 1.8346056938171387,
      "learning_rate": 1.0488033786954481e-06,
      "loss": 2.8133,
      "step": 150
    },
    {
      "epoch": 0.05255452532001952,
      "grad_norm": 1.7170641422271729,
      "learning_rate": 1.2247770999530738e-06,
      "loss": 2.741,
      "step": 175
    },
    {
      "epoch": 0.06006231465145088,
      "grad_norm": 1.7041053771972656,
      "learning_rate": 1.4007508212106992e-06,
      "loss": 2.6097,
      "step": 200
    },
    {
      "epoch": 0.06006231465145088,
      "eval_loss": 2.5728588104248047,
      "eval_runtime": 244.8867,
      "eval_samples_per_second": 22.904,
      "eval_steps_per_second": 5.729,
      "step": 200
    },
    {
      "epoch": 0.06757010398288224,
      "grad_norm": 1.1266319751739502,
      "learning_rate": 1.5767245424683247e-06,
      "loss": 2.5265,
      "step": 225
    },
    {
      "epoch": 0.0750778933143136,
      "grad_norm": 0.9362080097198486,
      "learning_rate": 1.7526982637259503e-06,
      "loss": 2.4491,
      "step": 250
    },
    {
      "epoch": 0.08258568264574496,
      "grad_norm": 0.5819249153137207,
      "learning_rate": 1.928671984983576e-06,
      "loss": 2.3677,
      "step": 275
    },
    {
      "epoch": 0.09009347197717632,
      "grad_norm": 0.5669568777084351,
      "learning_rate": 2.1046457062412012e-06,
      "loss": 2.3315,
      "step": 300
    },
    {
      "epoch": 0.09009347197717632,
      "eval_loss": 2.288224458694458,
      "eval_runtime": 244.739,
      "eval_samples_per_second": 22.918,
      "eval_steps_per_second": 5.733,
      "step": 300
    },
    {
      "epoch": 0.09760126130860768,
      "grad_norm": 0.43035316467285156,
      "learning_rate": 2.280619427498827e-06,
      "loss": 2.2718,
      "step": 325
    },
    {
      "epoch": 0.10510905064003905,
      "grad_norm": 0.41122695803642273,
      "learning_rate": 2.4565931487564526e-06,
      "loss": 2.2705,
      "step": 350
    },
    {
      "epoch": 0.1126168399714704,
      "grad_norm": 0.38599300384521484,
      "learning_rate": 2.632566870014078e-06,
      "loss": 2.2149,
      "step": 375
    },
    {
      "epoch": 0.12012462930290176,
      "grad_norm": 0.36087512969970703,
      "learning_rate": 2.8085405912717034e-06,
      "loss": 2.242,
      "step": 400
    },
    {
      "epoch": 0.12012462930290176,
      "eval_loss": 2.2342677116394043,
      "eval_runtime": 244.7563,
      "eval_samples_per_second": 22.917,
      "eval_steps_per_second": 5.732,
      "step": 400
    },
    {
      "epoch": 0.1276324186343331,
      "grad_norm": 0.39531558752059937,
      "learning_rate": 2.984514312529329e-06,
      "loss": 2.2117,
      "step": 425
    },
    {
      "epoch": 0.13514020796576448,
      "grad_norm": 0.4547671675682068,
      "learning_rate": 3.1604880337869548e-06,
      "loss": 2.2321,
      "step": 450
    },
    {
      "epoch": 0.14264799729719585,
      "grad_norm": 0.37058719992637634,
      "learning_rate": 3.33646175504458e-06,
      "loss": 2.2581,
      "step": 475
    },
    {
      "epoch": 0.1501557866286272,
      "grad_norm": 0.3959207534790039,
      "learning_rate": 3.5124354763022057e-06,
      "loss": 2.2329,
      "step": 500
    },
    {
      "epoch": 0.1501557866286272,
      "eval_loss": 2.2151083946228027,
      "eval_runtime": 244.7784,
      "eval_samples_per_second": 22.915,
      "eval_steps_per_second": 5.732,
      "step": 500
    },
    {
      "epoch": 0.15766357596005856,
      "grad_norm": 0.4138086438179016,
      "learning_rate": 3.688409197559831e-06,
      "loss": 2.235,
      "step": 525
    },
    {
      "epoch": 0.16517136529148993,
      "grad_norm": 0.4153759777545929,
      "learning_rate": 3.864382918817457e-06,
      "loss": 2.2237,
      "step": 550
    },
    {
      "epoch": 0.17267915462292127,
      "grad_norm": 0.4081685245037079,
      "learning_rate": 4.0403566400750816e-06,
      "loss": 2.2002,
      "step": 575
    },
    {
      "epoch": 0.18018694395435264,
      "grad_norm": 0.38760289549827576,
      "learning_rate": 4.216330361332708e-06,
      "loss": 2.2159,
      "step": 600
    },
    {
      "epoch": 0.18018694395435264,
      "eval_loss": 2.204134464263916,
      "eval_runtime": 244.6583,
      "eval_samples_per_second": 22.926,
      "eval_steps_per_second": 5.735,
      "step": 600
    },
    {
      "epoch": 0.187694733285784,
      "grad_norm": 0.38073575496673584,
      "learning_rate": 4.392304082590333e-06,
      "loss": 2.1967,
      "step": 625
    },
    {
      "epoch": 0.19520252261721535,
      "grad_norm": 0.4018952250480652,
      "learning_rate": 4.568277803847959e-06,
      "loss": 2.1968,
      "step": 650
    },
    {
      "epoch": 0.20271031194864672,
      "grad_norm": 0.4137013256549835,
      "learning_rate": 4.744251525105584e-06,
      "loss": 2.1997,
      "step": 675
    },
    {
      "epoch": 0.2102181012800781,
      "grad_norm": 0.411466509103775,
      "learning_rate": 4.92022524636321e-06,
      "loss": 2.2099,
      "step": 700
    },
    {
      "epoch": 0.2102181012800781,
      "eval_loss": 2.195915699005127,
      "eval_runtime": 244.7304,
      "eval_samples_per_second": 22.919,
      "eval_steps_per_second": 5.733,
      "step": 700
    },
    {
      "epoch": 0.21772589061150943,
      "grad_norm": 0.41950109601020813,
      "learning_rate": 5.096198967620835e-06,
      "loss": 2.1777,
      "step": 725
    },
    {
      "epoch": 0.2252336799429408,
      "grad_norm": 0.41122791171073914,
      "learning_rate": 5.272172688878461e-06,
      "loss": 2.2063,
      "step": 750
    },
    {
      "epoch": 0.23274146927437217,
      "grad_norm": 0.44570910930633545,
      "learning_rate": 5.448146410136086e-06,
      "loss": 2.1962,
      "step": 775
    },
    {
      "epoch": 0.2402492586058035,
      "grad_norm": 0.40760159492492676,
      "learning_rate": 5.624120131393712e-06,
      "loss": 2.2007,
      "step": 800
    },
    {
      "epoch": 0.2402492586058035,
      "eval_loss": 2.1890077590942383,
      "eval_runtime": 244.6573,
      "eval_samples_per_second": 22.926,
      "eval_steps_per_second": 5.735,
      "step": 800
    },
    {
      "epoch": 0.24775704793723488,
      "grad_norm": 0.4488222897052765,
      "learning_rate": 5.800093852651337e-06,
      "loss": 2.2008,
      "step": 825
    },
    {
      "epoch": 0.2552648372686662,
      "grad_norm": 0.4745488166809082,
      "learning_rate": 5.976067573908963e-06,
      "loss": 2.2013,
      "step": 850
    },
    {
      "epoch": 0.2627726266000976,
      "grad_norm": 0.45855531096458435,
      "learning_rate": 6.152041295166589e-06,
      "loss": 2.1824,
      "step": 875
    },
    {
      "epoch": 0.27028041593152896,
      "grad_norm": 0.4843423366546631,
      "learning_rate": 6.328015016424214e-06,
      "loss": 2.1872,
      "step": 900
    },
    {
      "epoch": 0.27028041593152896,
      "eval_loss": 2.182678699493408,
      "eval_runtime": 244.7929,
      "eval_samples_per_second": 22.913,
      "eval_steps_per_second": 5.731,
      "step": 900
    },
    {
      "epoch": 0.27778820526296033,
      "grad_norm": 0.4567316770553589,
      "learning_rate": 6.5039887376818395e-06,
      "loss": 2.184,
      "step": 925
    },
    {
      "epoch": 0.2852959945943917,
      "grad_norm": 0.46967923641204834,
      "learning_rate": 6.679962458939465e-06,
      "loss": 2.1739,
      "step": 950
    },
    {
      "epoch": 0.29280378392582307,
      "grad_norm": 0.4461369216442108,
      "learning_rate": 6.85593618019709e-06,
      "loss": 2.1818,
      "step": 975
    },
    {
      "epoch": 0.3003115732572544,
      "grad_norm": 0.4638686776161194,
      "learning_rate": 7.031909901454717e-06,
      "loss": 2.194,
      "step": 1000
    },
    {
      "epoch": 0.3003115732572544,
      "eval_loss": 2.1770713329315186,
      "eval_runtime": 244.571,
      "eval_samples_per_second": 22.934,
      "eval_steps_per_second": 5.737,
      "step": 1000
    },
    {
      "epoch": 0.30781936258868575,
      "grad_norm": 0.4287603199481964,
      "learning_rate": 7.207883622712341e-06,
      "loss": 2.1563,
      "step": 1025
    },
    {
      "epoch": 0.3153271519201171,
      "grad_norm": 0.4473567605018616,
      "learning_rate": 7.383857343969968e-06,
      "loss": 2.1661,
      "step": 1050
    },
    {
      "epoch": 0.3228349412515485,
      "grad_norm": 0.5221546292304993,
      "learning_rate": 7.559831065227592e-06,
      "loss": 2.1744,
      "step": 1075
    },
    {
      "epoch": 0.33034273058297986,
      "grad_norm": 0.4909228980541229,
      "learning_rate": 7.735804786485218e-06,
      "loss": 2.1729,
      "step": 1100
    },
    {
      "epoch": 0.33034273058297986,
      "eval_loss": 2.1715242862701416,
      "eval_runtime": 245.0613,
      "eval_samples_per_second": 22.888,
      "eval_steps_per_second": 5.725,
      "step": 1100
    },
    {
      "epoch": 0.33785051991441123,
      "grad_norm": 0.5596965551376343,
      "learning_rate": 7.911778507742844e-06,
      "loss": 2.1615,
      "step": 1125
    },
    {
      "epoch": 0.34535830924584254,
      "grad_norm": 0.4983489215373993,
      "learning_rate": 8.08775222900047e-06,
      "loss": 2.1717,
      "step": 1150
    },
    {
      "epoch": 0.3528660985772739,
      "grad_norm": 0.485856831073761,
      "learning_rate": 8.263725950258095e-06,
      "loss": 2.1507,
      "step": 1175
    },
    {
      "epoch": 0.3603738879087053,
      "grad_norm": 0.5247727632522583,
      "learning_rate": 8.43969967151572e-06,
      "loss": 2.1939,
      "step": 1200
    },
    {
      "epoch": 0.3603738879087053,
      "eval_loss": 2.1655497550964355,
      "eval_runtime": 244.6519,
      "eval_samples_per_second": 22.926,
      "eval_steps_per_second": 5.735,
      "step": 1200
    },
    {
      "epoch": 0.36788167724013665,
      "grad_norm": 0.5695153474807739,
      "learning_rate": 8.615673392773347e-06,
      "loss": 2.1827,
      "step": 1225
    },
    {
      "epoch": 0.375389466571568,
      "grad_norm": 0.5112013816833496,
      "learning_rate": 8.791647114030971e-06,
      "loss": 2.1748,
      "step": 1250
    },
    {
      "epoch": 0.3828972559029994,
      "grad_norm": 0.46719494462013245,
      "learning_rate": 8.967620835288597e-06,
      "loss": 2.1856,
      "step": 1275
    },
    {
      "epoch": 0.3904050452344307,
      "grad_norm": 0.48362448811531067,
      "learning_rate": 9.143594556546222e-06,
      "loss": 2.1766,
      "step": 1300
    },
    {
      "epoch": 0.3904050452344307,
      "eval_loss": 2.160381317138672,
      "eval_runtime": 245.4635,
      "eval_samples_per_second": 22.851,
      "eval_steps_per_second": 5.716,
      "step": 1300
    },
    {
      "epoch": 0.3979128345658621,
      "grad_norm": 0.5096102356910706,
      "learning_rate": 9.31956827780385e-06,
      "loss": 2.1664,
      "step": 1325
    },
    {
      "epoch": 0.40542062389729344,
      "grad_norm": 0.6038557887077332,
      "learning_rate": 9.495541999061475e-06,
      "loss": 2.1617,
      "step": 1350
    },
    {
      "epoch": 0.4129284132287248,
      "grad_norm": 0.5893401503562927,
      "learning_rate": 9.671515720319098e-06,
      "loss": 2.1473,
      "step": 1375
    },
    {
      "epoch": 0.4204362025601562,
      "grad_norm": 0.5666929483413696,
      "learning_rate": 9.847489441576724e-06,
      "loss": 2.195,
      "step": 1400
    },
    {
      "epoch": 0.4204362025601562,
      "eval_loss": 2.154971122741699,
      "eval_runtime": 244.6458,
      "eval_samples_per_second": 22.927,
      "eval_steps_per_second": 5.735,
      "step": 1400
    },
    {
      "epoch": 0.42794399189158755,
      "grad_norm": 0.6115418672561646,
      "learning_rate": 1.0023463162834351e-05,
      "loss": 2.1356,
      "step": 1425
    },
    {
      "epoch": 0.43545178122301886,
      "grad_norm": 0.6469879150390625,
      "learning_rate": 1.0199436884091976e-05,
      "loss": 2.1755,
      "step": 1450
    },
    {
      "epoch": 0.44295957055445023,
      "grad_norm": 0.5257688760757446,
      "learning_rate": 1.0375410605349602e-05,
      "loss": 2.1731,
      "step": 1475
    },
    {
      "epoch": 0.4504673598858816,
      "grad_norm": 0.5619986653327942,
      "learning_rate": 1.0551384326607226e-05,
      "loss": 2.172,
      "step": 1500
    },
    {
      "epoch": 0.4504673598858816,
      "eval_loss": 2.15023136138916,
      "eval_runtime": 244.8519,
      "eval_samples_per_second": 22.908,
      "eval_steps_per_second": 5.73,
      "step": 1500
    },
    {
      "epoch": 0.45797514921731297,
      "grad_norm": 0.5681572556495667,
      "learning_rate": 1.0727358047864853e-05,
      "loss": 2.1556,
      "step": 1525
    },
    {
      "epoch": 0.46548293854874434,
      "grad_norm": 0.6319741010665894,
      "learning_rate": 1.0903331769122478e-05,
      "loss": 2.1369,
      "step": 1550
    },
    {
      "epoch": 0.4729907278801757,
      "grad_norm": 0.5815430283546448,
      "learning_rate": 1.1079305490380104e-05,
      "loss": 2.1589,
      "step": 1575
    },
    {
      "epoch": 0.480498517211607,
      "grad_norm": 0.5797183513641357,
      "learning_rate": 1.1255279211637729e-05,
      "loss": 2.1676,
      "step": 1600
    },
    {
      "epoch": 0.480498517211607,
      "eval_loss": 2.1454966068267822,
      "eval_runtime": 244.6098,
      "eval_samples_per_second": 22.93,
      "eval_steps_per_second": 5.736,
      "step": 1600
    },
    {
      "epoch": 0.4880063065430384,
      "grad_norm": 0.6238908171653748,
      "learning_rate": 1.1431252932895355e-05,
      "loss": 2.1451,
      "step": 1625
    },
    {
      "epoch": 0.49551409587446976,
      "grad_norm": 0.6378119587898254,
      "learning_rate": 1.160722665415298e-05,
      "loss": 2.1393,
      "step": 1650
    },
    {
      "epoch": 0.5030218852059011,
      "grad_norm": 0.5630180239677429,
      "learning_rate": 1.1783200375410605e-05,
      "loss": 2.1532,
      "step": 1675
    },
    {
      "epoch": 0.5105296745373324,
      "grad_norm": 0.5868392586708069,
      "learning_rate": 1.1959174096668231e-05,
      "loss": 2.1473,
      "step": 1700
    },
    {
      "epoch": 0.5105296745373324,
      "eval_loss": 2.141220808029175,
      "eval_runtime": 244.9172,
      "eval_samples_per_second": 22.902,
      "eval_steps_per_second": 5.728,
      "step": 1700
    },
    {
      "epoch": 0.5180374638687638,
      "grad_norm": 0.6577850580215454,
      "learning_rate": 1.2135147817925858e-05,
      "loss": 2.1379,
      "step": 1725
    },
    {
      "epoch": 0.5255452532001952,
      "grad_norm": 0.6026327013969421,
      "learning_rate": 1.2311121539183482e-05,
      "loss": 2.1464,
      "step": 1750
    },
    {
      "epoch": 0.5330530425316266,
      "grad_norm": 0.60736483335495,
      "learning_rate": 1.2487095260441107e-05,
      "loss": 2.1588,
      "step": 1775
    },
    {
      "epoch": 0.5405608318630579,
      "grad_norm": 0.6438941359519958,
      "learning_rate": 1.2663068981698733e-05,
      "loss": 2.1421,
      "step": 1800
    },
    {
      "epoch": 0.5405608318630579,
      "eval_loss": 2.1365151405334473,
      "eval_runtime": 244.577,
      "eval_samples_per_second": 22.933,
      "eval_steps_per_second": 5.736,
      "step": 1800
    },
    {
      "epoch": 0.5480686211944893,
      "grad_norm": 0.6403496861457825,
      "learning_rate": 1.283904270295636e-05,
      "loss": 2.1428,
      "step": 1825
    },
    {
      "epoch": 0.5555764105259207,
      "grad_norm": 0.645140528678894,
      "learning_rate": 1.3015016424213985e-05,
      "loss": 2.1603,
      "step": 1850
    },
    {
      "epoch": 0.563084199857352,
      "grad_norm": 0.6453937292098999,
      "learning_rate": 1.3190990145471609e-05,
      "loss": 2.156,
      "step": 1875
    },
    {
      "epoch": 0.5705919891887834,
      "grad_norm": 0.7146685123443604,
      "learning_rate": 1.3366963866729234e-05,
      "loss": 2.1016,
      "step": 1900
    },
    {
      "epoch": 0.5705919891887834,
      "eval_loss": 2.1333518028259277,
      "eval_runtime": 245.1598,
      "eval_samples_per_second": 22.879,
      "eval_steps_per_second": 5.723,
      "step": 1900
    },
    {
      "epoch": 0.5780997785202148,
      "grad_norm": 0.6153611540794373,
      "learning_rate": 1.3542937587986862e-05,
      "loss": 2.1577,
      "step": 1925
    },
    {
      "epoch": 0.5856075678516461,
      "grad_norm": 0.7233150601387024,
      "learning_rate": 1.3718911309244487e-05,
      "loss": 2.1348,
      "step": 1950
    },
    {
      "epoch": 0.5931153571830774,
      "grad_norm": 0.7316763401031494,
      "learning_rate": 1.3894885030502113e-05,
      "loss": 2.1316,
      "step": 1975
    },
    {
      "epoch": 0.6006231465145088,
      "grad_norm": 0.6433097124099731,
      "learning_rate": 1.4070858751759736e-05,
      "loss": 2.1445,
      "step": 2000
    },
    {
      "epoch": 0.6006231465145088,
      "eval_loss": 2.129106044769287,
      "eval_runtime": 244.5211,
      "eval_samples_per_second": 22.939,
      "eval_steps_per_second": 5.738,
      "step": 2000
    },
    {
      "epoch": 0.6081309358459401,
      "grad_norm": 0.6830511689186096,
      "learning_rate": 1.4246832473017363e-05,
      "loss": 2.1139,
      "step": 2025
    },
    {
      "epoch": 0.6156387251773715,
      "grad_norm": 0.6850073337554932,
      "learning_rate": 1.4422806194274989e-05,
      "loss": 2.1218,
      "step": 2050
    },
    {
      "epoch": 0.6231465145088029,
      "grad_norm": 0.6426066160202026,
      "learning_rate": 1.4598779915532614e-05,
      "loss": 2.1275,
      "step": 2075
    },
    {
      "epoch": 0.6306543038402342,
      "grad_norm": 0.6646946668624878,
      "learning_rate": 1.477475363679024e-05,
      "loss": 2.126,
      "step": 2100
    },
    {
      "epoch": 0.6306543038402342,
      "eval_loss": 2.1254663467407227,
      "eval_runtime": 244.8863,
      "eval_samples_per_second": 22.905,
      "eval_steps_per_second": 5.729,
      "step": 2100
    },
    {
      "epoch": 0.6381620931716656,
      "grad_norm": 0.7284884452819824,
      "learning_rate": 1.4950727358047865e-05,
      "loss": 2.116,
      "step": 2125
    },
    {
      "epoch": 0.645669882503097,
      "grad_norm": 0.8441785573959351,
      "learning_rate": 1.4999980024014693e-05,
      "loss": 2.1195,
      "step": 2150
    },
    {
      "epoch": 0.6531776718345284,
      "grad_norm": 0.7109578847885132,
      "learning_rate": 1.4999886001482528e-05,
      "loss": 2.122,
      "step": 2175
    },
    {
      "epoch": 0.6606854611659597,
      "grad_norm": 0.7228453755378723,
      "learning_rate": 1.4999714912309012e-05,
      "loss": 2.1058,
      "step": 2200
    },
    {
      "epoch": 0.6606854611659597,
      "eval_loss": 2.1222198009490967,
      "eval_runtime": 245.2072,
      "eval_samples_per_second": 22.875,
      "eval_steps_per_second": 5.722,
      "step": 2200
    },
    {
      "epoch": 0.6681932504973911,
      "grad_norm": 0.7744355201721191,
      "learning_rate": 1.4999466758252207e-05,
      "loss": 2.1252,
      "step": 2225
    },
    {
      "epoch": 0.6757010398288225,
      "grad_norm": 0.7705317735671997,
      "learning_rate": 1.4999141541862068e-05,
      "loss": 2.0941,
      "step": 2250
    },
    {
      "epoch": 0.6832088291602537,
      "grad_norm": 0.7709174156188965,
      "learning_rate": 1.4998739266480427e-05,
      "loss": 2.1044,
      "step": 2275
    },
    {
      "epoch": 0.6907166184916851,
      "grad_norm": 0.6840139627456665,
      "learning_rate": 1.4998259936240949e-05,
      "loss": 2.1146,
      "step": 2300
    },
    {
      "epoch": 0.6907166184916851,
      "eval_loss": 2.1187844276428223,
      "eval_runtime": 244.5599,
      "eval_samples_per_second": 22.935,
      "eval_steps_per_second": 5.737,
      "step": 2300
    },
    {
      "epoch": 0.6982244078231165,
      "grad_norm": 0.8008989095687866,
      "learning_rate": 1.4997703556069088e-05,
      "loss": 2.1483,
      "step": 2325
    },
    {
      "epoch": 0.7057321971545478,
      "grad_norm": 0.7936817407608032,
      "learning_rate": 1.499707013168205e-05,
      "loss": 2.1354,
      "step": 2350
    },
    {
      "epoch": 0.7132399864859792,
      "grad_norm": 0.7062814831733704,
      "learning_rate": 1.4996359669588714e-05,
      "loss": 2.1378,
      "step": 2375
    },
    {
      "epoch": 0.7207477758174106,
      "grad_norm": 0.8156118392944336,
      "learning_rate": 1.4995572177089582e-05,
      "loss": 2.0949,
      "step": 2400
    },
    {
      "epoch": 0.7207477758174106,
      "eval_loss": 2.1153197288513184,
      "eval_runtime": 244.9003,
      "eval_samples_per_second": 22.903,
      "eval_steps_per_second": 5.729,
      "step": 2400
    },
    {
      "epoch": 0.7282555651488419,
      "grad_norm": 0.7018394470214844,
      "learning_rate": 1.4994707662276703e-05,
      "loss": 2.1084,
      "step": 2425
    },
    {
      "epoch": 0.7357633544802733,
      "grad_norm": 0.7865644097328186,
      "learning_rate": 1.4993766134033573e-05,
      "loss": 2.1087,
      "step": 2450
    },
    {
      "epoch": 0.7432711438117047,
      "grad_norm": 0.7718919515609741,
      "learning_rate": 1.4992747602035062e-05,
      "loss": 2.1248,
      "step": 2475
    },
    {
      "epoch": 0.750778933143136,
      "grad_norm": 0.8038984537124634,
      "learning_rate": 1.499165207674731e-05,
      "loss": 2.124,
      "step": 2500
    },
    {
      "epoch": 0.750778933143136,
      "eval_loss": 2.112464666366577,
      "eval_runtime": 244.648,
      "eval_samples_per_second": 22.927,
      "eval_steps_per_second": 5.735,
      "step": 2500
    },
    {
      "epoch": 0.7582867224745674,
      "grad_norm": 0.8126859664916992,
      "learning_rate": 1.4990479569427615e-05,
      "loss": 2.0879,
      "step": 2525
    },
    {
      "epoch": 0.7657945118059988,
      "grad_norm": 0.7394261360168457,
      "learning_rate": 1.4989230092124322e-05,
      "loss": 2.1167,
      "step": 2550
    },
    {
      "epoch": 0.77330230113743,
      "grad_norm": 0.8700124621391296,
      "learning_rate": 1.498790365767669e-05,
      "loss": 2.0892,
      "step": 2575
    },
    {
      "epoch": 0.7808100904688614,
      "grad_norm": 0.7596783638000488,
      "learning_rate": 1.4986500279714777e-05,
      "loss": 2.112,
      "step": 2600
    },
    {
      "epoch": 0.7808100904688614,
      "eval_loss": 2.1093900203704834,
      "eval_runtime": 244.8809,
      "eval_samples_per_second": 22.905,
      "eval_steps_per_second": 5.729,
      "step": 2600
    },
    {
      "epoch": 0.7883178798002928,
      "grad_norm": 0.7278156876564026,
      "learning_rate": 1.4985019972659285e-05,
      "loss": 2.1186,
      "step": 2625
    },
    {
      "epoch": 0.7958256691317241,
      "grad_norm": 0.8945568203926086,
      "learning_rate": 1.4983462751721418e-05,
      "loss": 2.0986,
      "step": 2650
    },
    {
      "epoch": 0.8033334584631555,
      "grad_norm": 0.8277415037155151,
      "learning_rate": 1.498182863290272e-05,
      "loss": 2.1247,
      "step": 2675
    },
    {
      "epoch": 0.8108412477945869,
      "grad_norm": 0.7230107188224792,
      "learning_rate": 1.4980117632994925e-05,
      "loss": 2.1107,
      "step": 2700
    },
    {
      "epoch": 0.8108412477945869,
      "eval_loss": 2.106996774673462,
      "eval_runtime": 244.8592,
      "eval_samples_per_second": 22.907,
      "eval_steps_per_second": 5.73,
      "step": 2700
    },
    {
      "epoch": 0.8183490371260183,
      "grad_norm": 0.8236918449401855,
      "learning_rate": 1.4978329769579768e-05,
      "loss": 2.1138,
      "step": 2725
    },
    {
      "epoch": 0.8258568264574496,
      "grad_norm": 0.7915171384811401,
      "learning_rate": 1.4976465061028811e-05,
      "loss": 2.1113,
      "step": 2750
    },
    {
      "epoch": 0.833364615788881,
      "grad_norm": 0.8001993894577026,
      "learning_rate": 1.4974523526503252e-05,
      "loss": 2.122,
      "step": 2775
    },
    {
      "epoch": 0.8408724051203124,
      "grad_norm": 0.915046751499176,
      "learning_rate": 1.4972505185953739e-05,
      "loss": 2.1145,
      "step": 2800
    },
    {
      "epoch": 0.8408724051203124,
      "eval_loss": 2.1040894985198975,
      "eval_runtime": 244.924,
      "eval_samples_per_second": 22.901,
      "eval_steps_per_second": 5.728,
      "step": 2800
    },
    {
      "epoch": 0.8483801944517437,
      "grad_norm": 0.7762336134910583,
      "learning_rate": 1.4970410060120146e-05,
      "loss": 2.0905,
      "step": 2825
    },
    {
      "epoch": 0.8558879837831751,
      "grad_norm": 0.8220327496528625,
      "learning_rate": 1.496823817053138e-05,
      "loss": 2.1149,
      "step": 2850
    },
    {
      "epoch": 0.8633957731146064,
      "grad_norm": 0.8111168146133423,
      "learning_rate": 1.4965989539505144e-05,
      "loss": 2.1035,
      "step": 2875
    },
    {
      "epoch": 0.8709035624460377,
      "grad_norm": 0.7875452637672424,
      "learning_rate": 1.4963664190147713e-05,
      "loss": 2.1091,
      "step": 2900
    },
    {
      "epoch": 0.8709035624460377,
      "eval_loss": 2.101161241531372,
      "eval_runtime": 247.6368,
      "eval_samples_per_second": 22.65,
      "eval_steps_per_second": 5.666,
      "step": 2900
    },
    {
      "epoch": 0.8784113517774691,
      "grad_norm": 0.8538459539413452,
      "learning_rate": 1.4961262146353696e-05,
      "loss": 2.0994,
      "step": 2925
    },
    {
      "epoch": 0.8859191411089005,
      "grad_norm": 0.7686406373977661,
      "learning_rate": 1.4958783432805801e-05,
      "loss": 2.0858,
      "step": 2950
    },
    {
      "epoch": 0.8934269304403318,
      "grad_norm": 0.792827844619751,
      "learning_rate": 1.4956228074974561e-05,
      "loss": 2.1001,
      "step": 2975
    },
    {
      "epoch": 0.9009347197717632,
      "grad_norm": 0.9214953780174255,
      "learning_rate": 1.4953596099118089e-05,
      "loss": 2.0844,
      "step": 3000
    },
    {
      "epoch": 0.9009347197717632,
      "eval_loss": 2.100034713745117,
      "eval_runtime": 247.8843,
      "eval_samples_per_second": 22.627,
      "eval_steps_per_second": 5.66,
      "step": 3000
    },
    {
      "epoch": 0.9084425091031946,
      "grad_norm": 0.8309657573699951,
      "learning_rate": 1.49508875322818e-05,
      "loss": 2.0882,
      "step": 3025
    },
    {
      "epoch": 0.9159502984346259,
      "grad_norm": 0.8833063244819641,
      "learning_rate": 1.4948102402298141e-05,
      "loss": 2.1063,
      "step": 3050
    },
    {
      "epoch": 0.9234580877660573,
      "grad_norm": 0.7956681847572327,
      "learning_rate": 1.4945240737786292e-05,
      "loss": 2.0885,
      "step": 3075
    },
    {
      "epoch": 0.9309658770974887,
      "grad_norm": 0.8342053890228271,
      "learning_rate": 1.4942302568151882e-05,
      "loss": 2.1001,
      "step": 3100
    },
    {
      "epoch": 0.9309658770974887,
      "eval_loss": 2.0970711708068848,
      "eval_runtime": 245.0795,
      "eval_samples_per_second": 22.886,
      "eval_steps_per_second": 5.725,
      "step": 3100
    },
    {
      "epoch": 0.93847366642892,
      "grad_norm": 0.9061738848686218,
      "learning_rate": 1.493928792358669e-05,
      "loss": 2.1135,
      "step": 3125
    },
    {
      "epoch": 0.9459814557603514,
      "grad_norm": 0.9443092346191406,
      "learning_rate": 1.4936196835068322e-05,
      "loss": 2.0909,
      "step": 3150
    },
    {
      "epoch": 0.9534892450917827,
      "grad_norm": 0.7598241567611694,
      "learning_rate": 1.4933029334359898e-05,
      "loss": 2.1215,
      "step": 3175
    },
    {
      "epoch": 0.960997034423214,
      "grad_norm": 1.001592993736267,
      "learning_rate": 1.4929785454009737e-05,
      "loss": 2.0884,
      "step": 3200
    },
    {
      "epoch": 0.960997034423214,
      "eval_loss": 2.09686541557312,
      "eval_runtime": 244.53,
      "eval_samples_per_second": 22.938,
      "eval_steps_per_second": 5.738,
      "step": 3200
    },
    {
      "epoch": 0.9685048237546454,
      "grad_norm": 0.9168058633804321,
      "learning_rate": 1.4926465227351008e-05,
      "loss": 2.0785,
      "step": 3225
    },
    {
      "epoch": 0.9760126130860768,
      "grad_norm": 0.8249208331108093,
      "learning_rate": 1.4923068688501385e-05,
      "loss": 2.0841,
      "step": 3250
    },
    {
      "epoch": 0.9835204024175082,
      "grad_norm": 0.8430188298225403,
      "learning_rate": 1.4919595872362719e-05,
      "loss": 2.0969,
      "step": 3275
    },
    {
      "epoch": 0.9910281917489395,
      "grad_norm": 0.9370065927505493,
      "learning_rate": 1.491604681462065e-05,
      "loss": 2.1052,
      "step": 3300
    },
    {
      "epoch": 0.9910281917489395,
      "eval_loss": 2.0929176807403564,
      "eval_runtime": 244.8371,
      "eval_samples_per_second": 22.909,
      "eval_steps_per_second": 5.73,
      "step": 3300
    },
    {
      "epoch": 0.9985359810803709,
      "grad_norm": 0.7515010237693787,
      "learning_rate": 1.4912421551744264e-05,
      "loss": 2.0882,
      "step": 3325
    },
    {
      "epoch": 1.0063065430384024,
      "grad_norm": 0.8594741821289062,
      "learning_rate": 1.4908720120985703e-05,
      "loss": 2.2045,
      "step": 3350
    },
    {
      "epoch": 1.0138143323698336,
      "grad_norm": 0.852730929851532,
      "learning_rate": 1.4904942560379791e-05,
      "loss": 2.0833,
      "step": 3375
    },
    {
      "epoch": 1.0213221217012651,
      "grad_norm": 0.8965045809745789,
      "learning_rate": 1.4901088908743635e-05,
      "loss": 2.1122,
      "step": 3400
    },
    {
      "epoch": 1.0213221217012651,
      "eval_loss": 2.0909690856933594,
      "eval_runtime": 245.1692,
      "eval_samples_per_second": 22.878,
      "eval_steps_per_second": 5.723,
      "step": 3400
    },
    {
      "epoch": 1.0288299110326964,
      "grad_norm": 0.8129332065582275,
      "learning_rate": 1.4897159205676244e-05,
      "loss": 2.062,
      "step": 3425
    },
    {
      "epoch": 1.0363377003641279,
      "grad_norm": 0.7968320846557617,
      "learning_rate": 1.4893153491558093e-05,
      "loss": 2.1195,
      "step": 3450
    },
    {
      "epoch": 1.0438454896955591,
      "grad_norm": 0.8569227457046509,
      "learning_rate": 1.4889071807550734e-05,
      "loss": 2.0819,
      "step": 3475
    },
    {
      "epoch": 1.0513532790269906,
      "grad_norm": 0.790208101272583,
      "learning_rate": 1.4884914195596364e-05,
      "loss": 2.0831,
      "step": 3500
    },
    {
      "epoch": 1.0513532790269906,
      "eval_loss": 2.0892488956451416,
      "eval_runtime": 244.1949,
      "eval_samples_per_second": 22.969,
      "eval_steps_per_second": 5.745,
      "step": 3500
    },
    {
      "epoch": 1.0588610683584219,
      "grad_norm": 0.7736139893531799,
      "learning_rate": 1.488068069841739e-05,
      "loss": 2.0969,
      "step": 3525
    },
    {
      "epoch": 1.0663688576898531,
      "grad_norm": 0.9392566084861755,
      "learning_rate": 1.4876371359515992e-05,
      "loss": 2.0835,
      "step": 3550
    },
    {
      "epoch": 1.0738766470212846,
      "grad_norm": 0.9095376133918762,
      "learning_rate": 1.4871986223173682e-05,
      "loss": 2.0882,
      "step": 3575
    },
    {
      "epoch": 1.0813844363527159,
      "grad_norm": 0.999569833278656,
      "learning_rate": 1.4867525334450842e-05,
      "loss": 2.0789,
      "step": 3600
    },
    {
      "epoch": 1.0813844363527159,
      "eval_loss": 2.0872867107391357,
      "eval_runtime": 245.2287,
      "eval_samples_per_second": 22.873,
      "eval_steps_per_second": 5.721,
      "step": 3600
    },
    {
      "epoch": 1.0888922256841473,
      "grad_norm": 0.8475573658943176,
      "learning_rate": 1.4862988739186265e-05,
      "loss": 2.0472,
      "step": 3625
    },
    {
      "epoch": 1.0964000150155786,
      "grad_norm": 0.8783066868782043,
      "learning_rate": 1.4858376483996675e-05,
      "loss": 2.1,
      "step": 3650
    },
    {
      "epoch": 1.10390780434701,
      "grad_norm": 0.8863905072212219,
      "learning_rate": 1.4853688616276268e-05,
      "loss": 2.112,
      "step": 3675
    },
    {
      "epoch": 1.1114155936784413,
      "grad_norm": 1.0993289947509766,
      "learning_rate": 1.4848925184196203e-05,
      "loss": 2.0788,
      "step": 3700
    },
    {
      "epoch": 1.1114155936784413,
      "eval_loss": 2.0860979557037354,
      "eval_runtime": 245.145,
      "eval_samples_per_second": 22.88,
      "eval_steps_per_second": 5.723,
      "step": 3700
    },
    {
      "epoch": 1.1189233830098728,
      "grad_norm": 0.7591436505317688,
      "learning_rate": 1.4844086236704119e-05,
      "loss": 2.0705,
      "step": 3725
    },
    {
      "epoch": 1.126431172341304,
      "grad_norm": 0.9064419269561768,
      "learning_rate": 1.4839171823523628e-05,
      "loss": 2.0421,
      "step": 3750
    },
    {
      "epoch": 1.1339389616727356,
      "grad_norm": 0.8282918930053711,
      "learning_rate": 1.483418199515381e-05,
      "loss": 2.0621,
      "step": 3775
    },
    {
      "epoch": 1.1414467510041668,
      "grad_norm": 0.9208828806877136,
      "learning_rate": 1.4829116802868684e-05,
      "loss": 2.08,
      "step": 3800
    },
    {
      "epoch": 1.1414467510041668,
      "eval_loss": 2.0833563804626465,
      "eval_runtime": 245.2937,
      "eval_samples_per_second": 22.866,
      "eval_steps_per_second": 5.72,
      "step": 3800
    },
    {
      "epoch": 1.1489545403355983,
      "grad_norm": 0.8673622608184814,
      "learning_rate": 1.4823976298716686e-05,
      "loss": 2.0879,
      "step": 3825
    },
    {
      "epoch": 1.1564623296670296,
      "grad_norm": 0.9238690137863159,
      "learning_rate": 1.4818760535520142e-05,
      "loss": 2.083,
      "step": 3850
    },
    {
      "epoch": 1.1639701189984608,
      "grad_norm": 0.855536937713623,
      "learning_rate": 1.4813469566874711e-05,
      "loss": 2.0705,
      "step": 3875
    },
    {
      "epoch": 1.1714779083298923,
      "grad_norm": 0.8495576977729797,
      "learning_rate": 1.4808103447148845e-05,
      "loss": 2.092,
      "step": 3900
    },
    {
      "epoch": 1.1714779083298923,
      "eval_loss": 2.081465721130371,
      "eval_runtime": 244.6166,
      "eval_samples_per_second": 22.93,
      "eval_steps_per_second": 5.736,
      "step": 3900
    },
    {
      "epoch": 1.1789856976613236,
      "grad_norm": 0.9213201403617859,
      "learning_rate": 1.4802662231483224e-05,
      "loss": 2.0695,
      "step": 3925
    },
    {
      "epoch": 1.186493486992755,
      "grad_norm": 0.9453656673431396,
      "learning_rate": 1.4797145975790194e-05,
      "loss": 2.0856,
      "step": 3950
    },
    {
      "epoch": 1.1940012763241863,
      "grad_norm": 0.894378662109375,
      "learning_rate": 1.4791554736753193e-05,
      "loss": 2.0705,
      "step": 3975
    },
    {
      "epoch": 1.2015090656556178,
      "grad_norm": 0.9393320083618164,
      "learning_rate": 1.4785888571826158e-05,
      "loss": 2.0693,
      "step": 4000
    },
    {
      "epoch": 1.2015090656556178,
      "eval_loss": 2.079852819442749,
      "eval_runtime": 245.0032,
      "eval_samples_per_second": 22.894,
      "eval_steps_per_second": 5.726,
      "step": 4000
    },
    {
      "epoch": 1.209016854987049,
      "grad_norm": 0.8150069117546082,
      "learning_rate": 1.478014753923295e-05,
      "loss": 2.0721,
      "step": 4025
    },
    {
      "epoch": 1.2165246443184805,
      "grad_norm": 0.867784321308136,
      "learning_rate": 1.4774331697966743e-05,
      "loss": 2.1046,
      "step": 4050
    },
    {
      "epoch": 1.2240324336499118,
      "grad_norm": 0.8931713700294495,
      "learning_rate": 1.476844110778943e-05,
      "loss": 2.0718,
      "step": 4075
    },
    {
      "epoch": 1.231540222981343,
      "grad_norm": 0.9451190829277039,
      "learning_rate": 1.4762475829230994e-05,
      "loss": 2.0826,
      "step": 4100
    },
    {
      "epoch": 1.231540222981343,
      "eval_loss": 2.078012466430664,
      "eval_runtime": 244.9722,
      "eval_samples_per_second": 22.896,
      "eval_steps_per_second": 5.727,
      "step": 4100
    },
    {
      "epoch": 1.2390480123127745,
      "grad_norm": 0.9044253826141357,
      "learning_rate": 1.4756435923588899e-05,
      "loss": 2.0853,
      "step": 4125
    },
    {
      "epoch": 1.246555801644206,
      "grad_norm": 0.9442611336708069,
      "learning_rate": 1.4750321452927454e-05,
      "loss": 2.039,
      "step": 4150
    },
    {
      "epoch": 1.2540635909756372,
      "grad_norm": 0.8297872543334961,
      "learning_rate": 1.4744132480077177e-05,
      "loss": 2.0371,
      "step": 4175
    },
    {
      "epoch": 1.2615713803070685,
      "grad_norm": 0.783397912979126,
      "learning_rate": 1.4737869068634148e-05,
      "loss": 2.0508,
      "step": 4200
    },
    {
      "epoch": 1.2615713803070685,
      "eval_loss": 2.076925754547119,
      "eval_runtime": 244.7969,
      "eval_samples_per_second": 22.913,
      "eval_steps_per_second": 5.731,
      "step": 4200
    },
    {
      "epoch": 1.2690791696385,
      "grad_norm": 0.9161412119865417,
      "learning_rate": 1.4731531282959356e-05,
      "loss": 2.0785,
      "step": 4225
    },
    {
      "epoch": 1.2765869589699312,
      "grad_norm": 0.8472649455070496,
      "learning_rate": 1.4725119188178038e-05,
      "loss": 2.057,
      "step": 4250
    },
    {
      "epoch": 1.2840947483013627,
      "grad_norm": 0.777370035648346,
      "learning_rate": 1.4718632850179013e-05,
      "loss": 2.0842,
      "step": 4275
    },
    {
      "epoch": 1.291602537632794,
      "grad_norm": 0.9465096592903137,
      "learning_rate": 1.471207233561399e-05,
      "loss": 2.0788,
      "step": 4300
    },
    {
      "epoch": 1.291602537632794,
      "eval_loss": 2.0751006603240967,
      "eval_runtime": 244.7621,
      "eval_samples_per_second": 22.916,
      "eval_steps_per_second": 5.732,
      "step": 4300
    },
    {
      "epoch": 1.2991103269642255,
      "grad_norm": 0.9006996750831604,
      "learning_rate": 1.4705437711896914e-05,
      "loss": 2.0689,
      "step": 4325
    },
    {
      "epoch": 1.3066181162956567,
      "grad_norm": 0.8863036632537842,
      "learning_rate": 1.469872904720324e-05,
      "loss": 2.0536,
      "step": 4350
    },
    {
      "epoch": 1.3141259056270882,
      "grad_norm": 0.8076067566871643,
      "learning_rate": 1.4691946410469244e-05,
      "loss": 2.0704,
      "step": 4375
    },
    {
      "epoch": 1.3216336949585195,
      "grad_norm": 0.8585737943649292,
      "learning_rate": 1.4685089871391332e-05,
      "loss": 2.0566,
      "step": 4400
    },
    {
      "epoch": 1.3216336949585195,
      "eval_loss": 2.0732879638671875,
      "eval_runtime": 245.4201,
      "eval_samples_per_second": 22.855,
      "eval_steps_per_second": 5.717,
      "step": 4400
    },
    {
      "epoch": 1.3291414842899507,
      "grad_norm": 0.8773880004882812,
      "learning_rate": 1.4678159500425296e-05,
      "loss": 2.0661,
      "step": 4425
    },
    {
      "epoch": 1.3366492736213822,
      "grad_norm": 0.9763519763946533,
      "learning_rate": 1.4671155368785604e-05,
      "loss": 2.0684,
      "step": 4450
    },
    {
      "epoch": 1.3441570629528137,
      "grad_norm": 0.8556541204452515,
      "learning_rate": 1.4664077548444675e-05,
      "loss": 2.0788,
      "step": 4475
    },
    {
      "epoch": 1.351664852284245,
      "grad_norm": 0.8426047563552856,
      "learning_rate": 1.4656926112132124e-05,
      "loss": 2.0645,
      "step": 4500
    },
    {
      "epoch": 1.351664852284245,
      "eval_loss": 2.0714945793151855,
      "eval_runtime": 271.6463,
      "eval_samples_per_second": 20.648,
      "eval_steps_per_second": 5.165,
      "step": 4500
    },
    {
      "epoch": 1.3591726416156762,
      "grad_norm": 0.8249872326850891,
      "learning_rate": 1.4649701133334025e-05,
      "loss": 2.0679,
      "step": 4525
    },
    {
      "epoch": 1.3666804309471077,
      "grad_norm": 0.8870148658752441,
      "learning_rate": 1.4642402686292155e-05,
      "loss": 2.0873,
      "step": 4550
    },
    {
      "epoch": 1.374188220278539,
      "grad_norm": 0.8625667095184326,
      "learning_rate": 1.4635030846003225e-05,
      "loss": 2.0655,
      "step": 4575
    },
    {
      "epoch": 1.3816960096099704,
      "grad_norm": 1.0245722532272339,
      "learning_rate": 1.4627585688218116e-05,
      "loss": 2.0939,
      "step": 4600
    },
    {
      "epoch": 1.3816960096099704,
      "eval_loss": 2.0702602863311768,
      "eval_runtime": 244.5585,
      "eval_samples_per_second": 22.935,
      "eval_steps_per_second": 5.737,
      "step": 4600
    },
    {
      "epoch": 1.3892037989414017,
      "grad_norm": 0.9307467937469482,
      "learning_rate": 1.4620067289441101e-05,
      "loss": 2.0582,
      "step": 4625
    },
    {
      "epoch": 1.396711588272833,
      "grad_norm": 0.8650360703468323,
      "learning_rate": 1.461247572692905e-05,
      "loss": 2.0486,
      "step": 4650
    },
    {
      "epoch": 1.4042193776042644,
      "grad_norm": 0.8464282155036926,
      "learning_rate": 1.4604811078690648e-05,
      "loss": 2.0513,
      "step": 4675
    },
    {
      "epoch": 1.4117271669356959,
      "grad_norm": 0.9079179167747498,
      "learning_rate": 1.4597073423485583e-05,
      "loss": 2.0642,
      "step": 4700
    },
    {
      "epoch": 1.4117271669356959,
      "eval_loss": 2.068575143814087,
      "eval_runtime": 244.9525,
      "eval_samples_per_second": 22.898,
      "eval_steps_per_second": 5.728,
      "step": 4700
    },
    {
      "epoch": 1.4192349562671271,
      "grad_norm": 0.8237431049346924,
      "learning_rate": 1.4589262840823746e-05,
      "loss": 2.0619,
      "step": 4725
    },
    {
      "epoch": 1.4267427455985584,
      "grad_norm": 0.8957166075706482,
      "learning_rate": 1.4581379410964402e-05,
      "loss": 2.0896,
      "step": 4750
    },
    {
      "epoch": 1.4342505349299899,
      "grad_norm": 0.7650532722473145,
      "learning_rate": 1.4573423214915382e-05,
      "loss": 2.0554,
      "step": 4775
    },
    {
      "epoch": 1.4417583242614211,
      "grad_norm": 0.9083628058433533,
      "learning_rate": 1.4565394334432233e-05,
      "loss": 2.0811,
      "step": 4800
    },
    {
      "epoch": 1.4417583242614211,
      "eval_loss": 2.066969394683838,
      "eval_runtime": 244.7686,
      "eval_samples_per_second": 22.916,
      "eval_steps_per_second": 5.732,
      "step": 4800
    },
    {
      "epoch": 1.4492661135928526,
      "grad_norm": 0.963108479976654,
      "learning_rate": 1.4557292852017392e-05,
      "loss": 2.0727,
      "step": 4825
    },
    {
      "epoch": 1.4567739029242839,
      "grad_norm": 0.8735617399215698,
      "learning_rate": 1.454911885091933e-05,
      "loss": 2.0681,
      "step": 4850
    },
    {
      "epoch": 1.4642816922557154,
      "grad_norm": 1.0220097303390503,
      "learning_rate": 1.4540872415131695e-05,
      "loss": 2.0602,
      "step": 4875
    },
    {
      "epoch": 1.4717894815871466,
      "grad_norm": 0.9304827451705933,
      "learning_rate": 1.4532553629392455e-05,
      "loss": 2.0539,
      "step": 4900
    },
    {
      "epoch": 1.4717894815871466,
      "eval_loss": 2.0658257007598877,
      "eval_runtime": 244.4897,
      "eval_samples_per_second": 22.942,
      "eval_steps_per_second": 5.738,
      "step": 4900
    },
    {
      "epoch": 1.479297270918578,
      "grad_norm": 0.9377899765968323,
      "learning_rate": 1.4524162579183032e-05,
      "loss": 2.0552,
      "step": 4925
    },
    {
      "epoch": 1.4868050602500094,
      "grad_norm": 0.9211867451667786,
      "learning_rate": 1.451569935072741e-05,
      "loss": 2.0622,
      "step": 4950
    },
    {
      "epoch": 1.4943128495814406,
      "grad_norm": 1.0366291999816895,
      "learning_rate": 1.4507164030991254e-05,
      "loss": 2.0673,
      "step": 4975
    },
    {
      "epoch": 1.501820638912872,
      "grad_norm": 0.9624854326248169,
      "learning_rate": 1.449855670768102e-05,
      "loss": 2.0748,
      "step": 5000
    },
    {
      "epoch": 1.501820638912872,
      "eval_loss": 2.0644030570983887,
      "eval_runtime": 245.047,
      "eval_samples_per_second": 22.889,
      "eval_steps_per_second": 5.725,
      "step": 5000
    },
    {
      "epoch": 1.5093284282443036,
      "grad_norm": 0.8962668180465698,
      "learning_rate": 1.4489877469243053e-05,
      "loss": 2.0701,
      "step": 5025
    },
    {
      "epoch": 1.5168362175757348,
      "grad_norm": 0.8921008110046387,
      "learning_rate": 1.4481126404862677e-05,
      "loss": 2.0669,
      "step": 5050
    },
    {
      "epoch": 1.524344006907166,
      "grad_norm": 0.9402926564216614,
      "learning_rate": 1.4472303604463279e-05,
      "loss": 2.0576,
      "step": 5075
    },
    {
      "epoch": 1.5318517962385976,
      "grad_norm": 0.8990075588226318,
      "learning_rate": 1.4463409158705376e-05,
      "loss": 2.0517,
      "step": 5100
    },
    {
      "epoch": 1.5318517962385976,
      "eval_loss": 2.0629703998565674,
      "eval_runtime": 244.3655,
      "eval_samples_per_second": 22.953,
      "eval_steps_per_second": 5.741,
      "step": 5100
    },
    {
      "epoch": 1.539359585570029,
      "grad_norm": 1.0020679235458374,
      "learning_rate": 1.4454443158985708e-05,
      "loss": 2.0582,
      "step": 5125
    },
    {
      "epoch": 1.5468673749014603,
      "grad_norm": 0.9144858121871948,
      "learning_rate": 1.4445405697436267e-05,
      "loss": 2.0518,
      "step": 5150
    },
    {
      "epoch": 1.5543751642328916,
      "grad_norm": 0.9205281138420105,
      "learning_rate": 1.4436296866923373e-05,
      "loss": 2.0553,
      "step": 5175
    },
    {
      "epoch": 1.5618829535643228,
      "grad_norm": 1.0122096538543701,
      "learning_rate": 1.4427116761046714e-05,
      "loss": 2.0333,
      "step": 5200
    },
    {
      "epoch": 1.5618829535643228,
      "eval_loss": 2.061532735824585,
      "eval_runtime": 244.549,
      "eval_samples_per_second": 22.936,
      "eval_steps_per_second": 5.737,
      "step": 5200
    },
    {
      "epoch": 1.5693907428957543,
      "grad_norm": 0.9542369842529297,
      "learning_rate": 1.441786547413838e-05,
      "loss": 2.0722,
      "step": 5225
    },
    {
      "epoch": 1.5768985322271858,
      "grad_norm": 0.9306456446647644,
      "learning_rate": 1.4408543101261898e-05,
      "loss": 2.0731,
      "step": 5250
    },
    {
      "epoch": 1.584406321558617,
      "grad_norm": 0.8262733221054077,
      "learning_rate": 1.4399149738211251e-05,
      "loss": 2.0629,
      "step": 5275
    },
    {
      "epoch": 1.5919141108900483,
      "grad_norm": 0.9227537512779236,
      "learning_rate": 1.43896854815099e-05,
      "loss": 2.0832,
      "step": 5300
    },
    {
      "epoch": 1.5919141108900483,
      "eval_loss": 2.0603787899017334,
      "eval_runtime": 244.6958,
      "eval_samples_per_second": 22.922,
      "eval_steps_per_second": 5.734,
      "step": 5300
    },
    {
      "epoch": 1.5994219002214798,
      "grad_norm": 0.9182181358337402,
      "learning_rate": 1.4380150428409788e-05,
      "loss": 2.0516,
      "step": 5325
    },
    {
      "epoch": 1.6069296895529113,
      "grad_norm": 0.8036996126174927,
      "learning_rate": 1.4370544676890333e-05,
      "loss": 2.0531,
      "step": 5350
    },
    {
      "epoch": 1.6144374788843425,
      "grad_norm": 0.9126760363578796,
      "learning_rate": 1.4360868325657447e-05,
      "loss": 2.0665,
      "step": 5375
    },
    {
      "epoch": 1.6219452682157738,
      "grad_norm": 1.0143436193466187,
      "learning_rate": 1.4351121474142484e-05,
      "loss": 2.029,
      "step": 5400
    },
    {
      "epoch": 1.6219452682157738,
      "eval_loss": 2.0587964057922363,
      "eval_runtime": 244.7582,
      "eval_samples_per_second": 22.916,
      "eval_steps_per_second": 5.732,
      "step": 5400
    },
    {
      "epoch": 1.6294530575472053,
      "grad_norm": 0.9128186702728271,
      "learning_rate": 1.4341304222501254e-05,
      "loss": 2.0253,
      "step": 5425
    },
    {
      "epoch": 1.6369608468786367,
      "grad_norm": 0.915397584438324,
      "learning_rate": 1.4331416671612966e-05,
      "loss": 2.0771,
      "step": 5450
    },
    {
      "epoch": 1.644468636210068,
      "grad_norm": 0.8913278579711914,
      "learning_rate": 1.4321458923079216e-05,
      "loss": 2.0781,
      "step": 5475
    },
    {
      "epoch": 1.6519764255414993,
      "grad_norm": 1.062047004699707,
      "learning_rate": 1.431143107922292e-05,
      "loss": 2.0567,
      "step": 5500
    },
    {
      "epoch": 1.6519764255414993,
      "eval_loss": 2.057093858718872,
      "eval_runtime": 245.0447,
      "eval_samples_per_second": 22.89,
      "eval_steps_per_second": 5.725,
      "step": 5500
    },
    {
      "epoch": 1.6594842148729305,
      "grad_norm": 0.8677504658699036,
      "learning_rate": 1.4301333243087277e-05,
      "loss": 2.0696,
      "step": 5525
    },
    {
      "epoch": 1.666992004204362,
      "grad_norm": 0.9853184223175049,
      "learning_rate": 1.4291165518434707e-05,
      "loss": 2.0113,
      "step": 5550
    },
    {
      "epoch": 1.6744997935357935,
      "grad_norm": 0.8988690972328186,
      "learning_rate": 1.4280928009745786e-05,
      "loss": 2.0278,
      "step": 5575
    },
    {
      "epoch": 1.6820075828672247,
      "grad_norm": 0.877238929271698,
      "learning_rate": 1.4270620822218162e-05,
      "loss": 2.0231,
      "step": 5600
    },
    {
      "epoch": 1.6820075828672247,
      "eval_loss": 2.0566163063049316,
      "eval_runtime": 244.8536,
      "eval_samples_per_second": 22.908,
      "eval_steps_per_second": 5.73,
      "step": 5600
    },
    {
      "epoch": 1.689515372198656,
      "grad_norm": 0.8475340008735657,
      "learning_rate": 1.4260244061765492e-05,
      "loss": 2.0667,
      "step": 5625
    },
    {
      "epoch": 1.6970231615300875,
      "grad_norm": 1.0350947380065918,
      "learning_rate": 1.4249797835016339e-05,
      "loss": 2.0482,
      "step": 5650
    },
    {
      "epoch": 1.704530950861519,
      "grad_norm": 0.9984613656997681,
      "learning_rate": 1.4239282249313083e-05,
      "loss": 2.0553,
      "step": 5675
    },
    {
      "epoch": 1.7120387401929502,
      "grad_norm": 0.8884134888648987,
      "learning_rate": 1.4228697412710817e-05,
      "loss": 2.063,
      "step": 5700
    },
    {
      "epoch": 1.7120387401929502,
      "eval_loss": 2.0545597076416016,
      "eval_runtime": 244.9412,
      "eval_samples_per_second": 22.899,
      "eval_steps_per_second": 5.728,
      "step": 5700
    },
    {
      "epoch": 1.7195465295243815,
      "grad_norm": 0.8889881372451782,
      "learning_rate": 1.4218043433976232e-05,
      "loss": 2.0594,
      "step": 5725
    },
    {
      "epoch": 1.727054318855813,
      "grad_norm": 0.9351671934127808,
      "learning_rate": 1.4207320422586511e-05,
      "loss": 2.0317,
      "step": 5750
    },
    {
      "epoch": 1.7345621081872442,
      "grad_norm": 0.9845299124717712,
      "learning_rate": 1.4196528488728189e-05,
      "loss": 2.0613,
      "step": 5775
    },
    {
      "epoch": 1.7420698975186757,
      "grad_norm": 1.0036661624908447,
      "learning_rate": 1.418566774329603e-05,
      "loss": 2.0203,
      "step": 5800
    },
    {
      "epoch": 1.7420698975186757,
      "eval_loss": 2.052852153778076,
      "eval_runtime": 244.7583,
      "eval_samples_per_second": 22.916,
      "eval_steps_per_second": 5.732,
      "step": 5800
    },
    {
      "epoch": 1.749577686850107,
      "grad_norm": 1.1337708234786987,
      "learning_rate": 1.4174738297891891e-05,
      "loss": 2.035,
      "step": 5825
    },
    {
      "epoch": 1.7570854761815382,
      "grad_norm": 0.9224268198013306,
      "learning_rate": 1.416374026482356e-05,
      "loss": 2.068,
      "step": 5850
    },
    {
      "epoch": 1.7645932655129697,
      "grad_norm": 0.8932907581329346,
      "learning_rate": 1.4152673757103622e-05,
      "loss": 2.0668,
      "step": 5875
    },
    {
      "epoch": 1.7721010548444012,
      "grad_norm": 0.9014378786087036,
      "learning_rate": 1.414153888844828e-05,
      "loss": 2.0585,
      "step": 5900
    },
    {
      "epoch": 1.7721010548444012,
      "eval_loss": 2.0522830486297607,
      "eval_runtime": 244.4651,
      "eval_samples_per_second": 22.944,
      "eval_steps_per_second": 5.739,
      "step": 5900
    },
    {
      "epoch": 1.7796088441758324,
      "grad_norm": 0.9573795795440674,
      "learning_rate": 1.41303357732762e-05,
      "loss": 2.0726,
      "step": 5925
    },
    {
      "epoch": 1.7871166335072637,
      "grad_norm": 1.0068199634552002,
      "learning_rate": 1.4119064526707325e-05,
      "loss": 2.0117,
      "step": 5950
    },
    {
      "epoch": 1.7946244228386952,
      "grad_norm": 0.8137004971504211,
      "learning_rate": 1.4107725264561694e-05,
      "loss": 2.0531,
      "step": 5975
    },
    {
      "epoch": 1.8021322121701266,
      "grad_norm": 0.9432706832885742,
      "learning_rate": 1.4096318103358264e-05,
      "loss": 2.0528,
      "step": 6000
    },
    {
      "epoch": 1.8021322121701266,
      "eval_loss": 2.0512585639953613,
      "eval_runtime": 244.6438,
      "eval_samples_per_second": 22.927,
      "eval_steps_per_second": 5.735,
      "step": 6000
    },
    {
      "epoch": 1.809640001501558,
      "grad_norm": 0.8738940954208374,
      "learning_rate": 1.4084843160313693e-05,
      "loss": 2.0486,
      "step": 6025
    },
    {
      "epoch": 1.8171477908329892,
      "grad_norm": 0.9203903079032898,
      "learning_rate": 1.407330055334115e-05,
      "loss": 2.0431,
      "step": 6050
    },
    {
      "epoch": 1.8246555801644204,
      "grad_norm": 0.8773927688598633,
      "learning_rate": 1.4061690401049101e-05,
      "loss": 2.0336,
      "step": 6075
    },
    {
      "epoch": 1.832163369495852,
      "grad_norm": 1.0781759023666382,
      "learning_rate": 1.4050012822740082e-05,
      "loss": 2.0839,
      "step": 6100
    },
    {
      "epoch": 1.832163369495852,
      "eval_loss": 2.0504093170166016,
      "eval_runtime": 244.864,
      "eval_samples_per_second": 22.907,
      "eval_steps_per_second": 5.73,
      "step": 6100
    },
    {
      "epoch": 1.8396711588272834,
      "grad_norm": 0.8537021279335022,
      "learning_rate": 1.4038267938409481e-05,
      "loss": 2.0394,
      "step": 6125
    },
    {
      "epoch": 1.8471789481587146,
      "grad_norm": 0.9055094122886658,
      "learning_rate": 1.4026455868744306e-05,
      "loss": 2.0267,
      "step": 6150
    },
    {
      "epoch": 1.854686737490146,
      "grad_norm": 0.8958349227905273,
      "learning_rate": 1.401457673512194e-05,
      "loss": 2.0427,
      "step": 6175
    },
    {
      "epoch": 1.8621945268215774,
      "grad_norm": 0.8849508166313171,
      "learning_rate": 1.4002630659608895e-05,
      "loss": 2.0492,
      "step": 6200
    },
    {
      "epoch": 1.8621945268215774,
      "eval_loss": 2.0487124919891357,
      "eval_runtime": 244.4909,
      "eval_samples_per_second": 22.942,
      "eval_steps_per_second": 5.738,
      "step": 6200
    },
    {
      "epoch": 1.8697023161530089,
      "grad_norm": 0.9771384000778198,
      "learning_rate": 1.3990617764959564e-05,
      "loss": 2.0473,
      "step": 6225
    },
    {
      "epoch": 1.8772101054844401,
      "grad_norm": 0.9234246611595154,
      "learning_rate": 1.3978538174614942e-05,
      "loss": 2.0408,
      "step": 6250
    },
    {
      "epoch": 1.8847178948158714,
      "grad_norm": 1.0580551624298096,
      "learning_rate": 1.3966392012701381e-05,
      "loss": 2.0299,
      "step": 6275
    },
    {
      "epoch": 1.8922256841473029,
      "grad_norm": 0.8676178455352783,
      "learning_rate": 1.3954179404029295e-05,
      "loss": 2.0513,
      "step": 6300
    },
    {
      "epoch": 1.8922256841473029,
      "eval_loss": 2.0470457077026367,
      "eval_runtime": 244.6825,
      "eval_samples_per_second": 22.924,
      "eval_steps_per_second": 5.734,
      "step": 6300
    },
    {
      "epoch": 1.8997334734787343,
      "grad_norm": 1.0486456155776978,
      "learning_rate": 1.3941900474091892e-05,
      "loss": 2.0646,
      "step": 6325
    },
    {
      "epoch": 1.9072412628101656,
      "grad_norm": 0.963049054145813,
      "learning_rate": 1.3929555349063875e-05,
      "loss": 2.0421,
      "step": 6350
    },
    {
      "epoch": 1.9147490521415969,
      "grad_norm": 0.9626838564872742,
      "learning_rate": 1.391714415580015e-05,
      "loss": 2.0369,
      "step": 6375
    },
    {
      "epoch": 1.922256841473028,
      "grad_norm": 0.9801763296127319,
      "learning_rate": 1.3904667021834514e-05,
      "loss": 2.0114,
      "step": 6400
    },
    {
      "epoch": 1.922256841473028,
      "eval_loss": 2.046201467514038,
      "eval_runtime": 244.6721,
      "eval_samples_per_second": 22.925,
      "eval_steps_per_second": 5.734,
      "step": 6400
    },
    {
      "epoch": 1.9297646308044596,
      "grad_norm": 1.0865575075149536,
      "learning_rate": 1.3892124075378364e-05,
      "loss": 2.0132,
      "step": 6425
    },
    {
      "epoch": 1.937272420135891,
      "grad_norm": 0.899895429611206,
      "learning_rate": 1.3879515445319353e-05,
      "loss": 2.0412,
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.9447802094673223, | |
| "grad_norm": 0.9657663106918335, | |
| "learning_rate": 1.3866841261220093e-05, | |
| "loss": 2.0367, | |
| "step": 6475 | |
| }, | |
| { | |
| "epoch": 1.9522879987987536, | |
| "grad_norm": 0.8613144159317017, | |
| "learning_rate": 1.3854101653316798e-05, | |
| "loss": 2.0456, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.9522879987987536, | |
| "eval_loss": 2.0444774627685547, | |
| "eval_runtime": 244.5805, | |
| "eval_samples_per_second": 22.933, | |
| "eval_steps_per_second": 5.736, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.959795788130185, | |
| "grad_norm": 0.8493949174880981, | |
| "learning_rate": 1.3841296752517967e-05, | |
| "loss": 2.0617, | |
| "step": 6525 | |
| }, | |
| { | |
| "epoch": 1.9673035774616165, | |
| "grad_norm": 0.9268197417259216, | |
| "learning_rate": 1.3828426690403026e-05, | |
| "loss": 2.0502, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.9748113667930478, | |
| "grad_norm": 0.9686461091041565, | |
| "learning_rate": 1.3815491599220977e-05, | |
| "loss": 2.057, | |
| "step": 6575 | |
| }, | |
| { | |
| "epoch": 1.982319156124479, | |
| "grad_norm": 0.9616640210151672, | |
| "learning_rate": 1.3802491611889048e-05, | |
| "loss": 2.0442, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.982319156124479, | |
| "eval_loss": 2.043835401535034, | |
| "eval_runtime": 244.9743, | |
| "eval_samples_per_second": 22.896, | |
| "eval_steps_per_second": 5.727, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.9898269454559105, | |
| "grad_norm": 0.8984593152999878, | |
| "learning_rate": 1.3789426861991317e-05, | |
| "loss": 2.0366, | |
| "step": 6625 | |
| }, | |
| { | |
| "epoch": 1.997334734787342, | |
| "grad_norm": 0.8971940875053406, | |
| "learning_rate": 1.3776297483777344e-05, | |
| "loss": 2.0255, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.0051052967453735, | |
| "grad_norm": 0.9031795859336853, | |
| "learning_rate": 1.3763103612160788e-05, | |
| "loss": 2.0926, | |
| "step": 6675 | |
| }, | |
| { | |
| "epoch": 2.012613086076805, | |
| "grad_norm": 0.8842533230781555, | |
| "learning_rate": 1.374984538271803e-05, | |
| "loss": 2.0172, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.012613086076805, | |
| "eval_loss": 2.0426952838897705, | |
| "eval_runtime": 244.4788, | |
| "eval_samples_per_second": 22.943, | |
| "eval_steps_per_second": 5.739, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.020120875408236, | |
| "grad_norm": 1.008647084236145, | |
| "learning_rate": 1.3736522931686765e-05, | |
| "loss": 2.0135, | |
| "step": 6725 | |
| }, | |
| { | |
| "epoch": 2.0276286647396673, | |
| "grad_norm": 1.0014972686767578, | |
| "learning_rate": 1.372313639596462e-05, | |
| "loss": 2.0175, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.0351364540710986, | |
| "grad_norm": 0.9429395198822021, | |
| "learning_rate": 1.3709685913107728e-05, | |
| "loss": 2.0228, | |
| "step": 6775 | |
| }, | |
| { | |
| "epoch": 2.0426442434025303, | |
| "grad_norm": 1.057131052017212, | |
| "learning_rate": 1.369617162132933e-05, | |
| "loss": 2.0281, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.0426442434025303, | |
| "eval_loss": 2.0424487590789795, | |
| "eval_runtime": 244.6503, | |
| "eval_samples_per_second": 22.927, | |
| "eval_steps_per_second": 5.735, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.0501520327339615, | |
| "grad_norm": 0.990040123462677, | |
| "learning_rate": 1.3682593659498343e-05, | |
| "loss": 2.0111, | |
| "step": 6825 | |
| }, | |
| { | |
| "epoch": 2.0576598220653928, | |
| "grad_norm": 0.9503148794174194, | |
| "learning_rate": 1.3668952167137948e-05, | |
| "loss": 2.0273, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.065167611396824, | |
| "grad_norm": 0.9117149710655212, | |
| "learning_rate": 1.3655247284424141e-05, | |
| "loss": 2.0239, | |
| "step": 6875 | |
| }, | |
| { | |
| "epoch": 2.0726754007282557, | |
| "grad_norm": 1.0101039409637451, | |
| "learning_rate": 1.36414791521843e-05, | |
| "loss": 2.0336, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.0726754007282557, | |
| "eval_loss": 2.0416696071624756, | |
| "eval_runtime": 245.4111, | |
| "eval_samples_per_second": 22.856, | |
| "eval_steps_per_second": 5.717, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.080183190059687, | |
| "grad_norm": 0.8587022423744202, | |
| "learning_rate": 1.3627647911895737e-05, | |
| "loss": 2.0239, | |
| "step": 6925 | |
| }, | |
| { | |
| "epoch": 2.0876909793911183, | |
| "grad_norm": 0.8640381693840027, | |
| "learning_rate": 1.3613753705684241e-05, | |
| "loss": 2.0079, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.0951987687225495, | |
| "grad_norm": 0.8698000907897949, | |
| "learning_rate": 1.3599796676322627e-05, | |
| "loss": 2.0181, | |
| "step": 6975 | |
| }, | |
| { | |
| "epoch": 2.102706558053981, | |
| "grad_norm": 0.9826030731201172, | |
| "learning_rate": 1.3585776967229254e-05, | |
| "loss": 2.0165, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.102706558053981, | |
| "eval_loss": 2.0403730869293213, | |
| "eval_runtime": 244.4187, | |
| "eval_samples_per_second": 22.948, | |
| "eval_steps_per_second": 5.74, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.1102143473854125, | |
| "grad_norm": 0.9374090433120728, | |
| "learning_rate": 1.3571694722466567e-05, | |
| "loss": 2.0125, | |
| "step": 7025 | |
| }, | |
| { | |
| "epoch": 2.1177221367168437, | |
| "grad_norm": 0.9569231271743774, | |
| "learning_rate": 1.3557550086739605e-05, | |
| "loss": 2.0426, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.125229926048275, | |
| "grad_norm": 1.0747652053833008, | |
| "learning_rate": 1.3543343205394521e-05, | |
| "loss": 2.0391, | |
| "step": 7075 | |
| }, | |
| { | |
| "epoch": 2.1327377153797062, | |
| "grad_norm": 0.9164227247238159, | |
| "learning_rate": 1.3529074224417086e-05, | |
| "loss": 2.0171, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.1327377153797062, | |
| "eval_loss": 2.0392725467681885, | |
| "eval_runtime": 244.3097, | |
| "eval_samples_per_second": 22.959, | |
| "eval_steps_per_second": 5.743, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.140245504711138, | |
| "grad_norm": 1.2145719528198242, | |
| "learning_rate": 1.3514743290431186e-05, | |
| "loss": 1.9985, | |
| "step": 7125 | |
| }, | |
| { | |
| "epoch": 2.147753294042569, | |
| "grad_norm": 1.0173206329345703, | |
| "learning_rate": 1.3500350550697316e-05, | |
| "loss": 2.0221, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.1552610833740005, | |
| "grad_norm": 1.0180777311325073, | |
| "learning_rate": 1.3485896153111076e-05, | |
| "loss": 2.0316, | |
| "step": 7175 | |
| }, | |
| { | |
| "epoch": 2.1627688727054317, | |
| "grad_norm": 0.9768148064613342, | |
| "learning_rate": 1.3471380246201637e-05, | |
| "loss": 2.0115, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.1627688727054317, | |
| "eval_loss": 2.038167953491211, | |
| "eval_runtime": 244.3446, | |
| "eval_samples_per_second": 22.955, | |
| "eval_steps_per_second": 5.742, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.1702766620368634, | |
| "grad_norm": 1.1061457395553589, | |
| "learning_rate": 1.3456802979130227e-05, | |
| "loss": 2.0091, | |
| "step": 7225 | |
| }, | |
| { | |
| "epoch": 2.1777844513682947, | |
| "grad_norm": 1.1214226484298706, | |
| "learning_rate": 1.3442164501688593e-05, | |
| "loss": 2.0287, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.185292240699726, | |
| "grad_norm": 0.9686478972434998, | |
| "learning_rate": 1.342746496429746e-05, | |
| "loss": 2.0485, | |
| "step": 7275 | |
| }, | |
| { | |
| "epoch": 2.192800030031157, | |
| "grad_norm": 0.971811056137085, | |
| "learning_rate": 1.3412704518004983e-05, | |
| "loss": 2.0011, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.192800030031157, | |
| "eval_loss": 2.0375237464904785, | |
| "eval_runtime": 244.4348, | |
| "eval_samples_per_second": 22.947, | |
| "eval_steps_per_second": 5.74, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.200307819362589, | |
| "grad_norm": 0.9958051443099976, | |
| "learning_rate": 1.3397883314485206e-05, | |
| "loss": 2.0151, | |
| "step": 7325 | |
| }, | |
| { | |
| "epoch": 2.20781560869402, | |
| "grad_norm": 0.9805117249488831, | |
| "learning_rate": 1.3383001506036497e-05, | |
| "loss": 2.012, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.2153233980254514, | |
| "grad_norm": 0.9299209117889404, | |
| "learning_rate": 1.3368059245579976e-05, | |
| "loss": 2.0226, | |
| "step": 7375 | |
| }, | |
| { | |
| "epoch": 2.2228311873568827, | |
| "grad_norm": 0.9592748880386353, | |
| "learning_rate": 1.3353056686657956e-05, | |
| "loss": 2.0256, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.2228311873568827, | |
| "eval_loss": 2.0365006923675537, | |
| "eval_runtime": 243.9271, | |
| "eval_samples_per_second": 22.995, | |
| "eval_steps_per_second": 5.752, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.230338976688314, | |
| "grad_norm": 0.9213986396789551, | |
| "learning_rate": 1.3337993983432353e-05, | |
| "loss": 2.0179, | |
| "step": 7425 | |
| }, | |
| { | |
| "epoch": 2.2378467660197456, | |
| "grad_norm": 0.9306337237358093, | |
| "learning_rate": 1.3322871290683117e-05, | |
| "loss": 2.0189, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.245354555351177, | |
| "grad_norm": 0.9785804152488708, | |
| "learning_rate": 1.3307688763806629e-05, | |
| "loss": 2.0228, | |
| "step": 7475 | |
| }, | |
| { | |
| "epoch": 2.252862344682608, | |
| "grad_norm": 0.9108986258506775, | |
| "learning_rate": 1.3292446558814106e-05, | |
| "loss": 2.0357, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.252862344682608, | |
| "eval_loss": 2.035933494567871, | |
| "eval_runtime": 244.2267, | |
| "eval_samples_per_second": 22.966, | |
| "eval_steps_per_second": 5.745, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.2603701340140394, | |
| "grad_norm": 0.9188127517700195, | |
| "learning_rate": 1.3277144832329998e-05, | |
| "loss": 2.0241, | |
| "step": 7525 | |
| }, | |
| { | |
| "epoch": 2.267877923345471, | |
| "grad_norm": 0.9804355502128601, | |
| "learning_rate": 1.3261783741590389e-05, | |
| "loss": 2.0234, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.2753857126769024, | |
| "grad_norm": 0.9870203137397766, | |
| "learning_rate": 1.3246363444441365e-05, | |
| "loss": 2.0078, | |
| "step": 7575 | |
| }, | |
| { | |
| "epoch": 2.2828935020083336, | |
| "grad_norm": 1.1177314519882202, | |
| "learning_rate": 1.3230884099337404e-05, | |
| "loss": 2.0186, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.2828935020083336, | |
| "eval_loss": 2.035186290740967, | |
| "eval_runtime": 244.2073, | |
| "eval_samples_per_second": 22.968, | |
| "eval_steps_per_second": 5.745, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.290401291339765, | |
| "grad_norm": 0.9781551957130432, | |
| "learning_rate": 1.3215345865339738e-05, | |
| "loss": 1.9881, | |
| "step": 7625 | |
| }, | |
| { | |
| "epoch": 2.2979090806711966, | |
| "grad_norm": 1.1340678930282593, | |
| "learning_rate": 1.3199748902114734e-05, | |
| "loss": 2.0113, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.305416870002628, | |
| "grad_norm": 0.8932919502258301, | |
| "learning_rate": 1.3184093369932237e-05, | |
| "loss": 2.0349, | |
| "step": 7675 | |
| }, | |
| { | |
| "epoch": 2.312924659334059, | |
| "grad_norm": 0.9024244546890259, | |
| "learning_rate": 1.3168379429663924e-05, | |
| "loss": 2.0241, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.312924659334059, | |
| "eval_loss": 2.0337536334991455, | |
| "eval_runtime": 243.8773, | |
| "eval_samples_per_second": 22.999, | |
| "eval_steps_per_second": 5.753, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.3204324486654904, | |
| "grad_norm": 0.9510346055030823, | |
| "learning_rate": 1.3152607242781668e-05, | |
| "loss": 2.0297, | |
| "step": 7725 | |
| }, | |
| { | |
| "epoch": 2.3279402379969216, | |
| "grad_norm": 1.004501461982727, | |
| "learning_rate": 1.313677697135586e-05, | |
| "loss": 2.0276, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.3354480273283533, | |
| "grad_norm": 1.0247652530670166, | |
| "learning_rate": 1.312088877805375e-05, | |
| "loss": 2.0152, | |
| "step": 7775 | |
| }, | |
| { | |
| "epoch": 2.3429558166597846, | |
| "grad_norm": 0.9948970675468445, | |
| "learning_rate": 1.3104942826137785e-05, | |
| "loss": 2.0104, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.3429558166597846, | |
| "eval_loss": 2.032724618911743, | |
| "eval_runtime": 244.6368, | |
| "eval_samples_per_second": 22.928, | |
| "eval_steps_per_second": 5.735, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.350463605991216, | |
| "grad_norm": 1.062002182006836, | |
| "learning_rate": 1.3088939279463914e-05, | |
| "loss": 2.0329, | |
| "step": 7825 | |
| }, | |
| { | |
| "epoch": 2.357971395322647, | |
| "grad_norm": 0.9641005396842957, | |
| "learning_rate": 1.3072878302479912e-05, | |
| "loss": 2.0121, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.3654791846540784, | |
| "grad_norm": 0.9504510164260864, | |
| "learning_rate": 1.30567600602237e-05, | |
| "loss": 2.0203, | |
| "step": 7875 | |
| }, | |
| { | |
| "epoch": 2.37298697398551, | |
| "grad_norm": 0.970635712146759, | |
| "learning_rate": 1.3040584718321629e-05, | |
| "loss": 2.0101, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.37298697398551, | |
| "eval_loss": 2.032496452331543, | |
| "eval_runtime": 243.9409, | |
| "eval_samples_per_second": 22.993, | |
| "eval_steps_per_second": 5.751, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.3804947633169413, | |
| "grad_norm": 0.9251878261566162, | |
| "learning_rate": 1.30243524429868e-05, | |
| "loss": 2.0166, | |
| "step": 7925 | |
| }, | |
| { | |
| "epoch": 2.3880025526483726, | |
| "grad_norm": 0.8651822805404663, | |
| "learning_rate": 1.300806340101734e-05, | |
| "loss": 2.0213, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.3955103419798043, | |
| "grad_norm": 1.0655325651168823, | |
| "learning_rate": 1.2991717759794689e-05, | |
| "loss": 1.9892, | |
| "step": 7975 | |
| }, | |
| { | |
| "epoch": 2.4030181313112355, | |
| "grad_norm": 0.8861711621284485, | |
| "learning_rate": 1.2975315687281895e-05, | |
| "loss": 2.0632, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.4030181313112355, | |
| "eval_loss": 2.031506299972534, | |
| "eval_runtime": 244.4184, | |
| "eval_samples_per_second": 22.948, | |
| "eval_steps_per_second": 5.74, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.410525920642667, | |
| "grad_norm": 1.0595537424087524, | |
| "learning_rate": 1.2958857352021873e-05, | |
| "loss": 2.0257, | |
| "step": 8025 | |
| }, | |
| { | |
| "epoch": 2.418033709974098, | |
| "grad_norm": 1.1569972038269043, | |
| "learning_rate": 1.2942342923135669e-05, | |
| "loss": 2.0165, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.4255414993055293, | |
| "grad_norm": 0.9342359900474548, | |
| "learning_rate": 1.2925772570320744e-05, | |
| "loss": 2.0085, | |
| "step": 8075 | |
| }, | |
| { | |
| "epoch": 2.433049288636961, | |
| "grad_norm": 0.9486634731292725, | |
| "learning_rate": 1.2909146463849207e-05, | |
| "loss": 1.9926, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.433049288636961, | |
| "eval_loss": 2.0305228233337402, | |
| "eval_runtime": 244.4927, | |
| "eval_samples_per_second": 22.941, | |
| "eval_steps_per_second": 5.738, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.4405570779683923, | |
| "grad_norm": 1.04513418674469, | |
| "learning_rate": 1.2892464774566082e-05, | |
| "loss": 2.0207, | |
| "step": 8125 | |
| }, | |
| { | |
| "epoch": 2.4480648672998235, | |
| "grad_norm": 1.0375896692276, | |
| "learning_rate": 1.2875727673887548e-05, | |
| "loss": 2.0299, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.455572656631255, | |
| "grad_norm": 0.8860157132148743, | |
| "learning_rate": 1.2858935333799161e-05, | |
| "loss": 2.0164, | |
| "step": 8175 | |
| }, | |
| { | |
| "epoch": 2.463080445962686, | |
| "grad_norm": 0.9642972350120544, | |
| "learning_rate": 1.2842087926854117e-05, | |
| "loss": 1.9905, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.463080445962686, | |
| "eval_loss": 2.029367208480835, | |
| "eval_runtime": 244.4104, | |
| "eval_samples_per_second": 22.949, | |
| "eval_steps_per_second": 5.74, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.4705882352941178, | |
| "grad_norm": 0.9699326753616333, | |
| "learning_rate": 1.282518562617145e-05, | |
| "loss": 2.05, | |
| "step": 8225 | |
| }, | |
| { | |
| "epoch": 2.478096024625549, | |
| "grad_norm": 1.12892746925354, | |
| "learning_rate": 1.2808228605434282e-05, | |
| "loss": 1.984, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.4856038139569803, | |
| "grad_norm": 0.9147679209709167, | |
| "learning_rate": 1.2791217038888008e-05, | |
| "loss": 2.0349, | |
| "step": 8275 | |
| }, | |
| { | |
| "epoch": 2.493111603288412, | |
| "grad_norm": 0.9576278328895569, | |
| "learning_rate": 1.2774151101338523e-05, | |
| "loss": 2.0547, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.493111603288412, | |
| "eval_loss": 2.0288000106811523, | |
| "eval_runtime": 244.145, | |
| "eval_samples_per_second": 22.974, | |
| "eval_steps_per_second": 5.747, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.5006193926198432, | |
| "grad_norm": 1.0111256837844849, | |
| "learning_rate": 1.2757030968150426e-05, | |
| "loss": 2.0108, | |
| "step": 8325 | |
| }, | |
| { | |
| "epoch": 2.5081271819512745, | |
| "grad_norm": 0.8969287276268005, | |
| "learning_rate": 1.2739856815245213e-05, | |
| "loss": 1.9897, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.5156349712827057, | |
| "grad_norm": 1.02077054977417, | |
| "learning_rate": 1.2722628819099472e-05, | |
| "loss": 2.0071, | |
| "step": 8375 | |
| }, | |
| { | |
| "epoch": 2.523142760614137, | |
| "grad_norm": 0.9784366488456726, | |
| "learning_rate": 1.2705347156743066e-05, | |
| "loss": 2.0018, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.523142760614137, | |
| "eval_loss": 2.027707099914551, | |
| "eval_runtime": 244.2262, | |
| "eval_samples_per_second": 22.966, | |
| "eval_steps_per_second": 5.745, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.5306505499455687, | |
| "grad_norm": 0.9159882664680481, | |
| "learning_rate": 1.2688012005757317e-05, | |
| "loss": 2.0298, | |
| "step": 8425 | |
| }, | |
| { | |
| "epoch": 2.538158339277, | |
| "grad_norm": 1.080963373184204, | |
| "learning_rate": 1.2670623544273182e-05, | |
| "loss": 2.015, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.5456661286084312, | |
| "grad_norm": 0.9042007923126221, | |
| "learning_rate": 1.2653181950969418e-05, | |
| "loss": 1.9907, | |
| "step": 8475 | |
| }, | |
| { | |
| "epoch": 2.5531739179398625, | |
| "grad_norm": 0.9830322861671448, | |
| "learning_rate": 1.2635687405070755e-05, | |
| "loss": 2.015, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.5531739179398625, | |
| "eval_loss": 2.0268571376800537, | |
| "eval_runtime": 244.5259, | |
| "eval_samples_per_second": 22.938, | |
| "eval_steps_per_second": 5.738, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.5606817072712937, | |
| "grad_norm": 0.8969373106956482, | |
| "learning_rate": 1.2618842990073232e-05, | |
| "loss": 1.985, | |
| "step": 8525 | |
| }, | |
| { | |
| "epoch": 2.5681894966027254, | |
| "grad_norm": 1.0655286312103271, | |
| "learning_rate": 1.2601245179065439e-05, | |
| "loss": 2.0409, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.5756972859341567, | |
| "grad_norm": 1.0102958679199219, | |
| "learning_rate": 1.2583594949149863e-05, | |
| "loss": 2.0358, | |
| "step": 8575 | |
| }, | |
| { | |
| "epoch": 2.583205075265588, | |
| "grad_norm": 0.9221513271331787, | |
| "learning_rate": 1.2565892481695126e-05, | |
| "loss": 2.0241, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.583205075265588, | |
| "eval_loss": 2.025696039199829, | |
| "eval_runtime": 244.8481, | |
| "eval_samples_per_second": 22.908, | |
| "eval_steps_per_second": 5.73, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.5907128645970197, | |
| "grad_norm": 1.0198999643325806, | |
| "learning_rate": 1.2548137958606616e-05, | |
| "loss": 2.0061, | |
| "step": 8625 | |
| }, | |
| { | |
| "epoch": 2.598220653928451, | |
| "grad_norm": 1.0228906869888306, | |
| "learning_rate": 1.2530331562324637e-05, | |
| "loss": 2.0183, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.605728443259882, | |
| "grad_norm": 0.9328727126121521, | |
| "learning_rate": 1.2512473475822524e-05, | |
| "loss": 2.0111, | |
| "step": 8675 | |
| }, | |
| { | |
| "epoch": 2.6132362325913134, | |
| "grad_norm": 1.0237301588058472, | |
| "learning_rate": 1.2494563882604764e-05, | |
| "loss": 2.0461, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.6132362325913134, | |
| "eval_loss": 2.025115489959717, | |
| "eval_runtime": 244.776, | |
| "eval_samples_per_second": 22.915, | |
| "eval_steps_per_second": 5.732, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.6207440219227447, | |
| "grad_norm": 1.0419483184814453, | |
| "learning_rate": 1.2476602966705117e-05, | |
| "loss": 2.0226, | |
| "step": 8725 | |
| }, | |
| { | |
| "epoch": 2.6282518112541764, | |
| "grad_norm": 1.0212359428405762, | |
| "learning_rate": 1.2458590912684718e-05, | |
| "loss": 2.0294, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.6357596005856077, | |
| "grad_norm": 0.9352961778640747, | |
| "learning_rate": 1.2440527905630174e-05, | |
| "loss": 2.0287, | |
| "step": 8775 | |
| }, | |
| { | |
| "epoch": 2.643267389917039, | |
| "grad_norm": 0.9289619326591492, | |
| "learning_rate": 1.2422414131151686e-05, | |
| "loss": 1.9629, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.643267389917039, | |
| "eval_loss": 2.023833751678467, | |
| "eval_runtime": 244.5795, | |
| "eval_samples_per_second": 22.933, | |
| "eval_steps_per_second": 5.736, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.65077517924847, | |
| "grad_norm": 1.081150770187378, | |
| "learning_rate": 1.2404249775381112e-05, | |
| "loss": 2.0166, | |
| "step": 8825 | |
| }, | |
| { | |
| "epoch": 2.6582829685799014, | |
| "grad_norm": 0.9818612933158875, | |
| "learning_rate": 1.2386035024970076e-05, | |
| "loss": 2.0314, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.665790757911333, | |
| "grad_norm": 0.9447384476661682, | |
| "learning_rate": 1.2367770067088045e-05, | |
| "loss": 2.0172, | |
| "step": 8875 | |
| }, | |
| { | |
| "epoch": 2.6732985472427644, | |
| "grad_norm": 0.9655535817146301, | |
| "learning_rate": 1.2349455089420397e-05, | |
| "loss": 2.0163, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.6732985472427644, | |
| "eval_loss": 2.0230913162231445, | |
| "eval_runtime": 244.504, | |
| "eval_samples_per_second": 22.94, | |
| "eval_steps_per_second": 5.738, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.6808063365741956, | |
| "grad_norm": 1.010567307472229, | |
| "learning_rate": 1.2331090280166499e-05, | |
| "loss": 2.0132, | |
| "step": 8925 | |
| }, | |
| { | |
| "epoch": 2.6883141259056273, | |
| "grad_norm": 1.014929175376892, | |
| "learning_rate": 1.2312675828037778e-05, | |
| "loss": 2.0155, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.6958219152370586, | |
| "grad_norm": 0.9091641902923584, | |
| "learning_rate": 1.2294211922255775e-05, | |
| "loss": 2.0069, | |
| "step": 8975 | |
| }, | |
| { | |
| "epoch": 2.70332970456849, | |
| "grad_norm": 1.0267935991287231, | |
| "learning_rate": 1.2275698752550196e-05, | |
| "loss": 2.0101, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.70332970456849, | |
| "eval_loss": 2.0226101875305176, | |
| "eval_runtime": 244.615, | |
| "eval_samples_per_second": 22.93, | |
| "eval_steps_per_second": 5.736, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.710837493899921, | |
| "grad_norm": 1.147930383682251, | |
| "learning_rate": 1.2257136509156978e-05, | |
| "loss": 1.9859, | |
| "step": 9025 | |
| }, | |
| { | |
| "epoch": 2.7183452832313524, | |
| "grad_norm": 1.0729800462722778, | |
| "learning_rate": 1.2238525382816322e-05, | |
| "loss": 2.0083, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.725853072562784, | |
| "grad_norm": 1.0532081127166748, | |
| "learning_rate": 1.2219865564770731e-05, | |
| "loss": 2.0317, | |
| "step": 9075 | |
| }, | |
| { | |
| "epoch": 2.7333608618942153, | |
| "grad_norm": 1.0475471019744873, | |
| "learning_rate": 1.2201157246763056e-05, | |
| "loss": 2.0117, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.7333608618942153, | |
| "eval_loss": 2.0220327377319336, | |
| "eval_runtime": 244.6775, | |
| "eval_samples_per_second": 22.924, | |
| "eval_steps_per_second": 5.734, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.7408686512256466, | |
| "grad_norm": 0.9435563683509827, | |
| "learning_rate": 1.2182400621034513e-05, | |
| "loss": 2.0271, | |
| "step": 9125 | |
| }, | |
| { | |
| "epoch": 2.748376440557078, | |
| "grad_norm": 0.9693319201469421, | |
| "learning_rate": 1.2163595880322726e-05, | |
| "loss": 2.0162, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.755884229888509, | |
| "grad_norm": 1.0163437128067017, | |
| "learning_rate": 1.2144743217859717e-05, | |
| "loss": 2.0039, | |
| "step": 9175 | |
| }, | |
| { | |
| "epoch": 2.763392019219941, | |
| "grad_norm": 0.8770220279693604, | |
| "learning_rate": 1.2125842827369955e-05, | |
| "loss": 2.0098, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.763392019219941, | |
| "eval_loss": 2.021249771118164, | |
| "eval_runtime": 244.5171, | |
| "eval_samples_per_second": 22.939, | |
| "eval_steps_per_second": 5.738, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.770899808551372, | |
| "grad_norm": 0.9660369753837585, | |
| "learning_rate": 1.2106894903068337e-05, | |
| "loss": 2.0, | |
| "step": 9225 | |
| }, | |
| { | |
| "epoch": 2.7784075978828033, | |
| "grad_norm": 1.1277518272399902, | |
| "learning_rate": 1.2087899639658208e-05, | |
| "loss": 2.0048, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.785915387214235, | |
| "grad_norm": 0.9551436305046082, | |
| "learning_rate": 1.2068857232329355e-05, | |
| "loss": 1.9856, | |
| "step": 9275 | |
| }, | |
| { | |
| "epoch": 2.793423176545666, | |
| "grad_norm": 0.9860432744026184, | |
| "learning_rate": 1.2049767876756002e-05, | |
| "loss": 2.0292, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.793423176545666, | |
| "eval_loss": 2.0205230712890625, | |
| "eval_runtime": 244.4184, | |
| "eval_samples_per_second": 22.948, | |
| "eval_steps_per_second": 5.74, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.8009309658770976, | |
| "grad_norm": 1.023398756980896, | |
| "learning_rate": 1.2030631769094799e-05, | |
| "loss": 2.0173, | |
| "step": 9325 | |
| }, | |
| { | |
| "epoch": 2.808438755208529, | |
| "grad_norm": 0.9791613817214966, | |
| "learning_rate": 1.2011449105982813e-05, | |
| "loss": 2.0237, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.81594654453996, | |
| "grad_norm": 0.9436085224151611, | |
| "learning_rate": 1.1992220084535487e-05, | |
| "loss": 1.99, | |
| "step": 9375 | |
| }, | |
| { | |
| "epoch": 2.8234543338713918, | |
| "grad_norm": 0.9325253367424011, | |
| "learning_rate": 1.1972944902344646e-05, | |
| "loss": 2.0368, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.8234543338713918, | |
| "eval_loss": 2.019615650177002, | |
| "eval_runtime": 244.3993, | |
| "eval_samples_per_second": 22.95, | |
| "eval_steps_per_second": 5.741, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.830962123202823, | |
| "grad_norm": 0.9791749119758606, | |
| "learning_rate": 1.1953623757476436e-05, | |
| "loss": 2.0055, | |
| "step": 9425 | |
| }, | |
| { | |
| "epoch": 2.8384699125342543, | |
| "grad_norm": 0.9658190608024597, | |
| "learning_rate": 1.1934256848469312e-05, | |
| "loss": 2.0166, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.8459777018656855, | |
| "grad_norm": 1.026522159576416, | |
| "learning_rate": 1.1914844374331974e-05, | |
| "loss": 1.9916, | |
| "step": 9475 | |
| }, | |
| { | |
| "epoch": 2.853485491197117, | |
| "grad_norm": 1.1535567045211792, | |
| "learning_rate": 1.1895386534541354e-05, | |
| "loss": 1.9948, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.853485491197117, | |
| "eval_loss": 2.0190258026123047, | |
| "eval_runtime": 244.5245, | |
| "eval_samples_per_second": 22.938, | |
| "eval_steps_per_second": 5.738, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.8609932805285485, | |
| "grad_norm": 0.8700292110443115, | |
| "learning_rate": 1.1875883529040534e-05, | |
| "loss": 1.9998, | |
| "step": 9525 | |
| }, | |
| { | |
| "epoch": 2.8685010698599798, | |
| "grad_norm": 1.00760018825531, | |
| "learning_rate": 1.1856335558236714e-05, | |
| "loss": 2.0286, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.876008859191411, | |
| "grad_norm": 1.0481544733047485, | |
| "learning_rate": 1.1836742822999139e-05, | |
| "loss": 2.0145, | |
| "step": 9575 | |
| }, | |
| { | |
| "epoch": 2.8835166485228423, | |
| "grad_norm": 0.9422263503074646, | |
| "learning_rate": 1.1817105524657043e-05, | |
| "loss": 2.0123, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.8835166485228423, | |
| "eval_loss": 2.018214702606201, | |
| "eval_runtime": 244.6614, | |
| "eval_samples_per_second": 22.926, | |
| "eval_steps_per_second": 5.734, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.8910244378542735, | |
| "grad_norm": 1.012352466583252, | |
| "learning_rate": 1.1797423864997577e-05, | |
| "loss": 2.0425, | |
| "step": 9625 | |
| }, | |
| { | |
| "epoch": 2.8985322271857052, | |
| "grad_norm": 1.0469133853912354, | |
| "learning_rate": 1.1777698046263735e-05, | |
| "loss": 2.0266, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.9060400165171365, | |
| "grad_norm": 1.0227727890014648, | |
| "learning_rate": 1.175792827115228e-05, | |
| "loss": 2.0272, | |
| "step": 9675 | |
| }, | |
| { | |
| "epoch": 2.9135478058485678, | |
| "grad_norm": 1.1656129360198975, | |
| "learning_rate": 1.1738114742811654e-05, | |
| "loss": 1.9813, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.9135478058485678, | |
| "eval_loss": 2.017220973968506, | |
| "eval_runtime": 244.7357, | |
| "eval_samples_per_second": 22.919, | |
| "eval_steps_per_second": 5.733, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.9210555951799995, | |
| "grad_norm": 0.9345014095306396, | |
| "learning_rate": 1.1718257664839896e-05, | |
| "loss": 1.9932, | |
| "step": 9725 | |
| }, | |
| { | |
| "epoch": 2.9285633845114307, | |
| "grad_norm": 1.0153813362121582, | |
| "learning_rate": 1.1698357241282546e-05, | |
| "loss": 2.0216, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.936071173842862, | |
| "grad_norm": 1.0141171216964722, | |
| "learning_rate": 1.167841367663056e-05, | |
| "loss": 2.0118, | |
| "step": 9775 | |
| }, | |
| { | |
| "epoch": 2.9435789631742932, | |
| "grad_norm": 1.0706440210342407, | |
| "learning_rate": 1.1658427175818184e-05, | |
| "loss": 1.9952, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.9435789631742932, | |
| "eval_loss": 2.016911029815674, | |
| "eval_runtime": 244.4656, | |
| "eval_samples_per_second": 22.944, | |
| "eval_steps_per_second": 5.739, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.9510867525057245, | |
| "grad_norm": 0.9770407676696777, | |
| "learning_rate": 1.1638397944220876e-05, | |
| "loss": 2.0154, | |
| "step": 9825 | |
| }, | |
| { | |
| "epoch": 2.958594541837156, | |
| "grad_norm": 0.9835750460624695, | |
| "learning_rate": 1.1618326187653178e-05, | |
| "loss": 2.0186, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.9661023311685875, | |
| "grad_norm": 1.0434762239456177, | |
| "learning_rate": 1.1598212112366606e-05, | |
| "loss": 1.9859, | |
| "step": 9875 | |
| }, | |
| { | |
| "epoch": 2.9736101205000187, | |
| "grad_norm": 1.0988759994506836, | |
| "learning_rate": 1.1578055925047533e-05, | |
| "loss": 2.0024, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.9736101205000187, | |
| "eval_loss": 2.0162084102630615, | |
| "eval_runtime": 244.4388, | |
| "eval_samples_per_second": 22.946, | |
| "eval_steps_per_second": 5.74, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.98111790983145, | |
| "grad_norm": 0.9690369367599487, | |
| "learning_rate": 1.1557857832815063e-05, | |
| "loss": 2.0261, | |
| "step": 9925 | |
| }, | |
| { | |
| "epoch": 2.9886256991628812, | |
| "grad_norm": 0.932151198387146, | |
| "learning_rate": 1.1537618043218898e-05, | |
| "loss": 2.0233, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.996133488494313, | |
| "grad_norm": 1.0118919610977173, | |
| "learning_rate": 1.1517336764237217e-05, | |
| "loss": 1.981, | |
| "step": 9975 | |
| }, | |
| { | |
| "epoch": 3.0039040504523444, | |
| "grad_norm": 1.0406084060668945, | |
| "learning_rate": 1.1497014204274526e-05, | |
| "loss": 2.0523, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.0039040504523444, | |
| "eval_loss": 2.0155766010284424, | |
| "eval_runtime": 243.5325, | |
| "eval_samples_per_second": 23.032, | |
| "eval_steps_per_second": 5.761, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.0114118397837757, | |
| "grad_norm": 1.0300322771072388, | |
| "learning_rate": 1.1476650572159522e-05, | |
| "loss": 1.9657, | |
| "step": 10025 | |
| }, | |
| { | |
| "epoch": 3.018919629115207, | |
| "grad_norm": 1.0281704664230347, | |
| "learning_rate": 1.1456246077142954e-05, | |
| "loss": 1.9883, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 3.026427418446638, | |
| "grad_norm": 1.0092098712921143, | |
| "learning_rate": 1.1435800928895464e-05, | |
| "loss": 2.003, | |
| "step": 10075 | |
| }, | |
| { | |
| "epoch": 3.03393520777807, | |
| "grad_norm": 1.0722483396530151, | |
| "learning_rate": 1.1415315337505426e-05, | |
| "loss": 1.9913, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 3.03393520777807, | |
| "eval_loss": 2.0157699584960938, | |
| "eval_runtime": 244.4253, | |
| "eval_samples_per_second": 22.948, | |
| "eval_steps_per_second": 5.74, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 3.041442997109501, | |
| "grad_norm": 0.9789544939994812, | |
| "learning_rate": 1.1394789513476809e-05, | |
| "loss": 1.9866, | |
| "step": 10125 | |
| }, | |
| { | |
| "epoch": 3.0489507864409324, | |
| "grad_norm": 1.0212770700454712, | |
| "learning_rate": 1.137422366772699e-05, | |
| "loss": 1.976, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 3.0564585757723637, | |
| "grad_norm": 1.1227072477340698, | |
| "learning_rate": 1.1353618011584607e-05, | |
| "loss": 1.9816, | |
| "step": 10175 | |
| }, | |
| { | |
| "epoch": 3.0639663651037954, | |
| "grad_norm": 1.0329065322875977, | |
| "learning_rate": 1.1332972756787368e-05, | |
| "loss": 1.9773, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 3.0639663651037954, | |
| "eval_loss": 2.01505708694458, | |
| "eval_runtime": 244.0878, | |
| "eval_samples_per_second": 22.979, | |
| "eval_steps_per_second": 5.748, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 3.0714741544352266, | |
| "grad_norm": 1.0419589281082153, | |
| "learning_rate": 1.1312288115479897e-05, | |
| "loss": 1.9966, | |
| "step": 10225 | |
| }, | |
| { | |
| "epoch": 3.078981943766658, | |
| "grad_norm": 1.0318610668182373, | |
| "learning_rate": 1.1291564300211533e-05, | |
| "loss": 1.9615, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 3.086489733098089, | |
| "grad_norm": 1.0802398920059204, | |
| "learning_rate": 1.1270801523934156e-05, | |
| "loss": 1.9815, | |
| "step": 10275 | |
| }, | |
| { | |
| "epoch": 3.0939975224295204, | |
| "grad_norm": 1.0594321489334106, | |
| "learning_rate": 1.125e-05, | |
| "loss": 2.0002, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.0939975224295204, | |
| "eval_loss": 2.0144717693328857, | |
| "eval_runtime": 244.0019, | |
| "eval_samples_per_second": 22.988, | |
| "eval_steps_per_second": 5.75, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.101505311760952, | |
| "grad_norm": 0.8644378781318665, | |
| "learning_rate": 1.122915994215946e-05, | |
| "loss": 1.9563, | |
| "step": 10325 | |
| }, | |
| { | |
| "epoch": 3.1090131010923834, | |
| "grad_norm": 1.0262008905410767, | |
| "learning_rate": 1.1208281564558895e-05, | |
| "loss": 1.9977, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.1165208904238146, | |
| "grad_norm": 1.1098688840866089, | |
| "learning_rate": 1.1187365081738422e-05, | |
| "loss": 1.9673, | |
| "step": 10375 | |
| }, | |
| { | |
| "epoch": 3.124028679755246, | |
| "grad_norm": 1.0585020780563354, | |
| "learning_rate": 1.1166410708629716e-05, | |
| "loss": 1.9967, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.124028679755246, | |
| "eval_loss": 2.014115571975708, | |
| "eval_runtime": 244.2712, | |
| "eval_samples_per_second": 22.962, | |
| "eval_steps_per_second": 5.744, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.1315364690866776, | |
| "grad_norm": 0.9442121386528015, | |
| "learning_rate": 1.1145418660553808e-05, | |
| "loss": 2.0003, | |
| "step": 10425 | |
| }, | |
| { | |
| "epoch": 3.139044258418109, | |
| "grad_norm": 1.0891814231872559, | |
| "learning_rate": 1.1124389153218861e-05, | |
| "loss": 2.0022, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.14655204774954, | |
| "grad_norm": 1.0310977697372437, | |
| "learning_rate": 1.1103322402717958e-05, | |
| "loss": 1.9881, | |
| "step": 10475 | |
| }, | |
| { | |
| "epoch": 3.1540598370809714, | |
| "grad_norm": 1.2457115650177002, | |
| "learning_rate": 1.1082218625526887e-05, | |
| "loss": 1.9545, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.1540598370809714, | |
| "eval_loss": 2.0137479305267334, | |
| "eval_runtime": 244.4917, | |
| "eval_samples_per_second": 22.941, | |
| "eval_steps_per_second": 5.738, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.161567626412403, | |
| "grad_norm": 1.0390257835388184, | |
| "learning_rate": 1.1061078038501906e-05, | |
| "loss": 1.9965, | |
| "step": 10525 | |
| }, | |
| { | |
| "epoch": 3.1690754157438343, | |
| "grad_norm": 0.9900075793266296, | |
| "learning_rate": 1.1039900858877521e-05, | |
| "loss": 2.0066, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.1765832050752656, | |
| "grad_norm": 1.074483871459961, | |
| "learning_rate": 1.1018687304264256e-05, | |
| "loss": 1.9794, | |
| "step": 10575 | |
| }, | |
| { | |
| "epoch": 3.184090994406697, | |
| "grad_norm": 0.9264243245124817, | |
| "learning_rate": 1.099743759264641e-05, | |
| "loss": 1.9793, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.184090994406697, | |
| "eval_loss": 2.013479709625244, | |
| "eval_runtime": 244.7217, | |
| "eval_samples_per_second": 22.92, | |
| "eval_steps_per_second": 5.733, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.191598783738128, | |
| "grad_norm": 1.0158064365386963, | |
| "learning_rate": 1.097615194237982e-05, | |
| "loss": 1.992, | |
| "step": 10625 | |
| }, | |
| { | |
| "epoch": 3.19910657306956, | |
| "grad_norm": 1.084500789642334, | |
| "learning_rate": 1.0954830572189625e-05, | |
| "loss": 1.981, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.206614362400991, | |
| "grad_norm": 1.1871960163116455, | |
| "learning_rate": 1.0933473701168006e-05, | |
| "loss": 2.0098, | |
| "step": 10675 | |
| }, | |
| { | |
| "epoch": 3.2141221517324223, | |
| "grad_norm": 1.0174176692962646, | |
| "learning_rate": 1.0912081548771941e-05, | |
| "loss": 1.9898, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.2141221517324223, | |
| "eval_loss": 2.012505054473877, | |
| "eval_runtime": 244.4334, | |
| "eval_samples_per_second": 22.947, | |
| "eval_steps_per_second": 5.74, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.2216299410638536, | |
| "grad_norm": 1.1954680681228638, | |
| "learning_rate": 1.089065433482095e-05, | |
| "loss": 1.9965, | |
| "step": 10725 | |
| }, | |
| { | |
| "epoch": 3.2291377303952853, | |
| "grad_norm": 1.0380609035491943, | |
| "learning_rate": 1.0869192279494832e-05, | |
| "loss": 2.0142, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.2366455197267165, | |
| "grad_norm": 1.1713154315948486, | |
| "learning_rate": 1.0847695603331412e-05, | |
| "loss": 2.0032, | |
| "step": 10775 | |
| }, | |
| { | |
| "epoch": 3.244153309058148, | |
| "grad_norm": 0.9350267648696899, | |
| "learning_rate": 1.0826164527224262e-05, | |
| "loss": 1.9926, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.244153309058148, | |
| "eval_loss": 2.0120630264282227, | |
| "eval_runtime": 244.3746, | |
| "eval_samples_per_second": 22.952, | |
| "eval_steps_per_second": 5.741, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.251661098389579, | |
| "grad_norm": 1.1291122436523438, | |
| "learning_rate": 1.0804599272420443e-05, | |
| "loss": 1.9854, | |
| "step": 10825 | |
| }, | |
| { | |
| "epoch": 3.2591688877210103, | |
| "grad_norm": 0.9929710030555725, | |
| "learning_rate": 1.0783000060518225e-05, | |
| "loss": 1.9712, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.266676677052442, | |
| "grad_norm": 0.9652737379074097, | |
| "learning_rate": 1.076136711346481e-05, | |
| "loss": 1.9767, | |
| "step": 10875 | |
| }, | |
| { | |
| "epoch": 3.2741844663838733, | |
| "grad_norm": 0.9600501656532288, | |
| "learning_rate": 1.0739700653554052e-05, | |
| "loss": 1.9792, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.2741844663838733, | |
| "eval_loss": 2.0115151405334473, | |
| "eval_runtime": 244.8887, | |
| "eval_samples_per_second": 22.904, | |
| "eval_steps_per_second": 5.729, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.2816922557153045, | |
| "grad_norm": 1.0329478979110718, | |
| "learning_rate": 1.0718000903424174e-05, | |
| "loss": 1.9961, | |
| "step": 10925 | |
| }, | |
| { | |
| "epoch": 3.289200045046736, | |
| "grad_norm": 1.1442408561706543, | |
| "learning_rate": 1.0696268086055482e-05, | |
| "loss": 1.9898, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.2967078343781675, | |
| "grad_norm": 1.0361113548278809, | |
| "learning_rate": 1.0674502424768066e-05, | |
| "loss": 1.9861, | |
| "step": 10975 | |
| }, | |
| { | |
| "epoch": 3.3042156237095988, | |
| "grad_norm": 0.997988760471344, | |
| "learning_rate": 1.0652704143219519e-05, | |
| "loss": 1.99, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.3042156237095988, | |
| "eval_loss": 2.0116584300994873, | |
| "eval_runtime": 243.9919, | |
| "eval_samples_per_second": 22.988, | |
| "eval_steps_per_second": 5.75, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.31172341304103, | |
| "grad_norm": 0.9052268266677856, | |
| "learning_rate": 1.0630873465402622e-05, | |
| "loss": 1.9942, | |
| "step": 11025 | |
| }, | |
| { | |
| "epoch": 3.3192312023724613, | |
| "grad_norm": 0.9491928815841675, | |
| "learning_rate": 1.0609010615643052e-05, | |
| "loss": 2.0145, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.326738991703893, | |
| "grad_norm": 1.0330880880355835, | |
| "learning_rate": 1.058711581859708e-05, | |
| "loss": 1.992, | |
| "step": 11075 | |
| }, | |
| { | |
| "epoch": 3.3342467810353242, | |
| "grad_norm": 1.0044811964035034, | |
| "learning_rate": 1.0565189299249254e-05, | |
| "loss": 2.0099, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.3342467810353242, | |
| "eval_loss": 2.0105700492858887, | |
| "eval_runtime": 244.4106, | |
| "eval_samples_per_second": 22.949, | |
| "eval_steps_per_second": 5.74, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.3417545703667555, | |
| "grad_norm": 1.0180730819702148, | |
| "learning_rate": 1.0543231282910093e-05, | |
| "loss": 1.9847, | |
| "step": 11125 | |
| }, | |
| { | |
| "epoch": 3.3492623596981868, | |
| "grad_norm": 1.0637898445129395, | |
| "learning_rate": 1.0521241995213771e-05, | |
| "loss": 1.9725, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.356770149029618, | |
| "grad_norm": 1.1966840028762817, | |
| "learning_rate": 1.049922166211579e-05, | |
| "loss": 1.9909, | |
| "step": 11175 | |
| }, | |
| { | |
| "epoch": 3.3642779383610497, | |
| "grad_norm": 1.0537995100021362, | |
| "learning_rate": 1.0477170509890681e-05, | |
| "loss": 2.0051, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.3642779383610497, | |
| "eval_loss": 2.0095300674438477, | |
| "eval_runtime": 244.5586, | |
| "eval_samples_per_second": 22.935, | |
| "eval_steps_per_second": 5.737, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.371785727692481, | |
| "grad_norm": 0.9709149599075317, | |
| "learning_rate": 1.0455088765129643e-05, | |
| "loss": 1.9907, | |
| "step": 11225 | |
| }, | |
| { | |
| "epoch": 3.3792935170239122, | |
| "grad_norm": 1.1112037897109985, | |
| "learning_rate": 1.043297665473825e-05, | |
| "loss": 1.9855, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.3868013063553435, | |
| "grad_norm": 0.9346416592597961, | |
| "learning_rate": 1.0410834405934099e-05, | |
| "loss": 2.0005, | |
| "step": 11275 | |
| }, | |
| { | |
| "epoch": 3.394309095686775, | |
| "grad_norm": 1.053544044494629, | |
| "learning_rate": 1.0388662246244482e-05, | |
| "loss": 1.9858, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.394309095686775, | |
| "eval_loss": 2.0087532997131348, | |
| "eval_runtime": 244.6298, | |
| "eval_samples_per_second": 22.929, | |
| "eval_steps_per_second": 5.735, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.4018168850182064, | |
| "grad_norm": 1.0392097234725952, | |
| "learning_rate": 1.0366460403504045e-05, | |
| "loss": 1.9907, | |
| "step": 11325 | |
| }, | |
| { | |
| "epoch": 3.4093246743496377, | |
| "grad_norm": 0.9744161367416382, | |
| "learning_rate": 1.0344229105852453e-05, | |
| "loss": 1.9888, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.416832463681069, | |
| "grad_norm": 1.0045557022094727, | |
| "learning_rate": 1.0321968581732035e-05, | |
| "loss": 2.0007, | |
| "step": 11375 | |
| }, | |
| { | |
| "epoch": 3.4243402530125007, | |
| "grad_norm": 1.0795562267303467, | |
| "learning_rate": 1.0299679059885441e-05, | |
| "loss": 1.9836, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.4243402530125007, | |
| "eval_loss": 2.008427381515503, | |
| "eval_runtime": 243.7629, | |
| "eval_samples_per_second": 23.01, | |
| "eval_steps_per_second": 5.756, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.431848042343932, | |
| "grad_norm": 1.0574262142181396, | |
| "learning_rate": 1.0277360769353302e-05, | |
| "loss": 1.9968, | |
| "step": 11425 | |
| }, | |
| { | |
| "epoch": 3.439355831675363, | |
| "grad_norm": 1.0723813772201538, | |
| "learning_rate": 1.0255013939471862e-05, | |
| "loss": 1.9778, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.4468636210067944, | |
| "grad_norm": 1.0221625566482544, | |
| "learning_rate": 1.0232638799870627e-05, | |
| "loss": 1.9795, | |
| "step": 11475 | |
| }, | |
| { | |
| "epoch": 3.4543714103382257, | |
| "grad_norm": 1.0293052196502686, | |
| "learning_rate": 1.0210235580470003e-05, | |
| "loss": 2.0101, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.4543714103382257, | |
| "eval_loss": 2.008002996444702, | |
| "eval_runtime": 244.5192, | |
| "eval_samples_per_second": 22.939, | |
| "eval_steps_per_second": 5.738, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.4618791996696574, | |
| "grad_norm": 0.9779027700424194, | |
| "learning_rate": 1.0187804511478948e-05, | |
| "loss": 2.0353, | |
| "step": 11525 | |
| }, | |
| { | |
| "epoch": 3.4693869890010887, | |
| "grad_norm": 1.3106768131256104, | |
| "learning_rate": 1.0165345823392577e-05, | |
| "loss": 1.9887, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.47689477833252, | |
| "grad_norm": 1.0175050497055054, | |
| "learning_rate": 1.0142859746989822e-05, | |
| "loss": 1.9838, | |
| "step": 11575 | |
| }, | |
| { | |
| "epoch": 3.484402567663951, | |
| "grad_norm": 1.142027735710144, | |
| "learning_rate": 1.0120346513331048e-05, | |
| "loss": 1.9585, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.484402567663951, | |
| "eval_loss": 2.0071005821228027, | |
| "eval_runtime": 244.0492, | |
| "eval_samples_per_second": 22.983, | |
| "eval_steps_per_second": 5.749, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.491910356995383, | |
| "grad_norm": 1.0209110975265503, | |
| "learning_rate": 1.0097806353755675e-05, | |
| "loss": 1.9731, | |
| "step": 11625 | |
| }, | |
| { | |
| "epoch": 3.499418146326814, | |
| "grad_norm": 1.046372413635254, | |
| "learning_rate": 1.0075239499879812e-05, | |
| "loss": 1.9688, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.5069259356582454, | |
| "grad_norm": 1.227776050567627, | |
| "learning_rate": 1.0052646183593868e-05, | |
| "loss": 1.9843, | |
| "step": 11675 | |
| }, | |
| { | |
| "epoch": 3.5144337249896767, | |
| "grad_norm": 1.0463147163391113, | |
| "learning_rate": 1.0030026637060175e-05, | |
| "loss": 2.0024, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.5144337249896767, | |
| "eval_loss": 2.0066797733306885, | |
| "eval_runtime": 243.8922, | |
| "eval_samples_per_second": 22.998, | |
| "eval_steps_per_second": 5.753, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.5219415143211084, | |
| "grad_norm": 1.0555408000946045, | |
| "learning_rate": 1.0007381092710587e-05, | |
| "loss": 1.9974, | |
| "step": 11725 | |
| }, | |
| { | |
| "epoch": 3.5294493036525396, | |
| "grad_norm": 1.007045865058899, | |
| "learning_rate": 9.984709783244125e-06, | |
| "loss": 2.004, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.536957092983971, | |
| "grad_norm": 1.170345425605774, | |
| "learning_rate": 9.962012941624547e-06, | |
| "loss": 1.9492, | |
| "step": 11775 | |
| }, | |
| { | |
| "epoch": 3.544464882315402, | |
| "grad_norm": 1.1506013870239258, | |
| "learning_rate": 9.939290801077979e-06, | |
| "loss": 1.9908, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.544464882315402, | |
| "eval_loss": 2.0061874389648438, | |
| "eval_runtime": 244.205, | |
| "eval_samples_per_second": 22.968, | |
| "eval_steps_per_second": 5.745, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.5519726716468334, | |
| "grad_norm": 0.9976746439933777, | |
| "learning_rate": 9.916543595090514e-06, | |
| "loss": 1.995, | |
| "step": 11825 | |
| }, | |
| { | |
| "epoch": 3.559480460978265, | |
| "grad_norm": 1.0817415714263916, | |
| "learning_rate": 9.893771557405803e-06, | |
| "loss": 1.9989, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.5669882503096964, | |
| "grad_norm": 0.9880387187004089, | |
| "learning_rate": 9.870974922022668e-06, | |
| "loss": 1.9706, | |
| "step": 11875 | |
| }, | |
| { | |
| "epoch": 3.5744960396411276, | |
| "grad_norm": 1.629197120666504, | |
| "learning_rate": 9.848153923192681e-06, | |
| "loss": 1.9957, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.5744960396411276, | |
| "eval_loss": 2.0057406425476074, | |
| "eval_runtime": 244.5085, | |
| "eval_samples_per_second": 22.94, | |
| "eval_steps_per_second": 5.738, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.582003828972559, | |
| "grad_norm": 1.1123307943344116, | |
| "learning_rate": 9.825308795417776e-06, | |
| "loss": 1.9746, | |
| "step": 11925 | |
| }, | |
| { | |
| "epoch": 3.58951161830399, | |
| "grad_norm": 1.107917308807373, | |
| "learning_rate": 9.802439773447818e-06, | |
| "loss": 1.983, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.597019407635422, | |
| "grad_norm": 1.0012487173080444, | |
| "learning_rate": 9.779547092278212e-06, | |
| "loss": 1.9592, | |
| "step": 11975 | |
| }, | |
| { | |
| "epoch": 3.604527196966853, | |
| "grad_norm": 0.9805944561958313, | |
| "learning_rate": 9.756630987147473e-06, | |
| "loss": 1.974, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.604527196966853, | |
| "eval_loss": 2.0051681995391846, | |
| "eval_runtime": 244.093, | |
| "eval_samples_per_second": 22.979, | |
| "eval_steps_per_second": 5.748, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.6120349862982843, | |
| "grad_norm": 0.9973050355911255, | |
| "learning_rate": 9.733691693534814e-06, | |
| "loss": 2.018, | |
| "step": 12025 | |
| }, | |
| { | |
| "epoch": 3.619542775629716, | |
| "grad_norm": 1.0701146125793457, | |
| "learning_rate": 9.710729447157725e-06, | |
| "loss": 1.9395, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 3.6270505649611473, | |
| "grad_norm": 0.9309558868408203, | |
| "learning_rate": 9.687744483969555e-06, | |
| "loss": 1.9866, | |
| "step": 12075 | |
| }, | |
| { | |
| "epoch": 3.6345583542925786, | |
| "grad_norm": 1.1145427227020264, | |
| "learning_rate": 9.66473704015708e-06, | |
| "loss": 1.9669, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.6345583542925786, | |
| "eval_loss": 2.004288911819458, | |
| "eval_runtime": 244.31, | |
| "eval_samples_per_second": 22.959, | |
| "eval_steps_per_second": 5.743, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.64206614362401, | |
| "grad_norm": 1.0386533737182617, | |
| "learning_rate": 9.641707352138083e-06, | |
| "loss": 1.9833, | |
| "step": 12125 | |
| }, | |
| { | |
| "epoch": 3.649573932955441, | |
| "grad_norm": 1.0102437734603882, | |
| "learning_rate": 9.618655656558927e-06, | |
| "loss": 2.0004, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 3.657081722286873, | |
| "grad_norm": 1.063219666481018, | |
| "learning_rate": 9.595582190292109e-06, | |
| "loss": 1.9995, | |
| "step": 12175 | |
| }, | |
| { | |
| "epoch": 3.664589511618304, | |
| "grad_norm": 1.0717073678970337, | |
| "learning_rate": 9.57248719043384e-06, | |
| "loss": 1.9995, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.664589511618304, | |
| "eval_loss": 2.0040318965911865, | |
| "eval_runtime": 244.4579, | |
| "eval_samples_per_second": 22.945, | |
| "eval_steps_per_second": 5.739, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.6720973009497353, | |
| "grad_norm": 1.0240517854690552, | |
| "learning_rate": 9.549370894301602e-06, | |
| "loss": 2.0077, | |
| "step": 12225 | |
| }, | |
| { | |
| "epoch": 3.6796050902811666, | |
| "grad_norm": 1.0465691089630127, | |
| "learning_rate": 9.526233539431713e-06, | |
| "loss": 2.0077, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 3.687112879612598, | |
| "grad_norm": 1.101195216178894, | |
| "learning_rate": 9.503075363576889e-06, | |
| "loss": 1.99, | |
| "step": 12275 | |
| }, | |
| { | |
| "epoch": 3.6946206689440295, | |
| "grad_norm": 1.0206913948059082, | |
| "learning_rate": 9.479896604703785e-06, | |
| "loss": 1.9897, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.6946206689440295, | |
| "eval_loss": 2.003530740737915, | |
| "eval_runtime": 244.8327, | |
| "eval_samples_per_second": 22.91, | |
| "eval_steps_per_second": 5.73, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.7021284582754608, | |
| "grad_norm": 0.9398745894432068, | |
| "learning_rate": 9.456697500990571e-06, | |
| "loss": 1.9811, | |
| "step": 12325 | |
| }, | |
| { | |
| "epoch": 3.709636247606892, | |
| "grad_norm": 1.0570793151855469, | |
| "learning_rate": 9.433478290824472e-06, | |
| "loss": 1.9719, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 3.7171440369383237, | |
| "grad_norm": 1.0618635416030884, | |
| "learning_rate": 9.410239212799315e-06, | |
| "loss": 1.9744, | |
| "step": 12375 | |
| }, | |
| { | |
| "epoch": 3.724651826269755, | |
| "grad_norm": 1.0616377592086792, | |
| "learning_rate": 9.387911227877156e-06, | |
| "loss": 1.9889, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.724651826269755, | |
| "eval_loss": 2.003262996673584, | |
| "eval_runtime": 244.6377, | |
| "eval_samples_per_second": 22.928, | |
| "eval_steps_per_second": 5.735, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.7321596156011863, | |
| "grad_norm": 1.0657788515090942, | |
| "learning_rate": 9.364633901740714e-06, | |
| "loss": 1.9712, | |
| "step": 12425 | |
| }, | |
| { | |
| "epoch": 3.7396674049326175, | |
| "grad_norm": 1.0607733726501465, | |
| "learning_rate": 9.341337415170081e-06, | |
| "loss": 1.9622, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 3.7471751942640488, | |
| "grad_norm": 1.1743979454040527, | |
| "learning_rate": 9.318022007553162e-06, | |
| "loss": 1.9693, | |
| "step": 12475 | |
| }, | |
| { | |
| "epoch": 3.7546829835954805, | |
| "grad_norm": 1.0691910982131958, | |
| "learning_rate": 9.294687918472286e-06, | |
| "loss": 1.9865, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.7546829835954805, | |
| "eval_loss": 2.0024280548095703, | |
| "eval_runtime": 244.387, | |
| "eval_samples_per_second": 22.951, | |
| "eval_steps_per_second": 5.741, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.7621907729269117, | |
| "grad_norm": 1.0780701637268066, | |
| "learning_rate": 9.271335387701745e-06, | |
| "loss": 1.9788, | |
| "step": 12525 | |
| }, | |
| { | |
| "epoch": 3.769698562258343, | |
| "grad_norm": 1.0889036655426025, | |
| "learning_rate": 9.247964655205333e-06, | |
| "loss": 2.0001, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 3.7772063515897742, | |
| "grad_norm": 1.0859447717666626, | |
| "learning_rate": 9.224575961133889e-06, | |
| "loss": 1.9875, | |
| "step": 12575 | |
| }, | |
| { | |
| "epoch": 3.7847141409212055, | |
| "grad_norm": 1.1142594814300537, | |
| "learning_rate": 9.201169545822806e-06, | |
| "loss": 1.9703, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.7847141409212055, | |
| "eval_loss": 2.0022220611572266, | |
| "eval_runtime": 244.6481, | |
| "eval_samples_per_second": 22.927, | |
| "eval_steps_per_second": 5.735, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.792221930252637, | |
| "grad_norm": 0.9859952926635742, | |
| "learning_rate": 9.177745649789582e-06, | |
| "loss": 1.9795, | |
| "step": 12625 | |
| }, | |
| { | |
| "epoch": 3.7997297195840685, | |
| "grad_norm": 1.0307040214538574, | |
| "learning_rate": 9.154304513731345e-06, | |
| "loss": 1.9635, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 3.8072375089154997, | |
| "grad_norm": 1.1140483617782593, | |
| "learning_rate": 9.130846378522373e-06, | |
| "loss": 1.9709, | |
| "step": 12675 | |
| }, | |
| { | |
| "epoch": 3.8147452982469314, | |
| "grad_norm": 1.2594614028930664, | |
| "learning_rate": 9.107371485211619e-06, | |
| "loss": 1.998, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.8147452982469314, | |
| "eval_loss": 2.0013692378997803, | |
| "eval_runtime": 244.2752, | |
| "eval_samples_per_second": 22.962, | |
| "eval_steps_per_second": 5.744, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.8222530875783627, | |
| "grad_norm": 1.0169751644134521, | |
| "learning_rate": 9.083880075020243e-06, | |
| "loss": 1.9712, | |
| "step": 12725 | |
| }, | |
| { | |
| "epoch": 3.829760876909794, | |
| "grad_norm": 0.9640651345252991, | |
| "learning_rate": 9.060372389339123e-06, | |
| "loss": 1.9748, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 3.837268666241225, | |
| "grad_norm": 1.0947884321212769, | |
| "learning_rate": 9.036848669726382e-06, | |
| "loss": 1.9854, | |
| "step": 12775 | |
| }, | |
| { | |
| "epoch": 3.8447764555726565, | |
| "grad_norm": 1.1233420372009277, | |
| "learning_rate": 9.013309157904907e-06, | |
| "loss": 1.9968, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.8447764555726565, | |
| "eval_loss": 2.001154661178589, | |
| "eval_runtime": 244.9198, | |
| "eval_samples_per_second": 22.901, | |
| "eval_steps_per_second": 5.728, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.852284244904088, | |
| "grad_norm": 0.9935488700866699, | |
| "learning_rate": 8.98975409575985e-06, | |
| "loss": 1.9756, | |
| "step": 12825 | |
| }, | |
| { | |
| "epoch": 3.8597920342355194, | |
| "grad_norm": 0.9727908372879028, | |
| "learning_rate": 8.966183725336167e-06, | |
| "loss": 1.9942, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 3.8672998235669507, | |
| "grad_norm": 1.1200799942016602, | |
| "learning_rate": 8.942598288836103e-06, | |
| "loss": 1.9982, | |
| "step": 12875 | |
| }, | |
| { | |
| "epoch": 3.874807612898382, | |
| "grad_norm": 1.172968864440918, | |
| "learning_rate": 8.91899802861673e-06, | |
| "loss": 1.9842, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.874807612898382, | |
| "eval_loss": 2.000430107116699, | |
| "eval_runtime": 244.7767, | |
| "eval_samples_per_second": 22.915, | |
| "eval_steps_per_second": 5.732, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.882315402229813, | |
| "grad_norm": 1.1125150918960571, | |
| "learning_rate": 8.89538318718744e-06, | |
| "loss": 1.9832, | |
| "step": 12925 | |
| }, | |
| { | |
| "epoch": 3.889823191561245, | |
| "grad_norm": 1.1382113695144653, | |
| "learning_rate": 8.871754007207454e-06, | |
| "loss": 1.9774, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 3.897330980892676, | |
| "grad_norm": 1.090171217918396, | |
| "learning_rate": 8.848110731483337e-06, | |
| "loss": 1.9914, | |
| "step": 12975 | |
| }, | |
| { | |
| "epoch": 3.9048387702241074, | |
| "grad_norm": 0.9999351501464844, | |
| "learning_rate": 8.824453602966493e-06, | |
| "loss": 1.9787, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.9048387702241074, | |
| "eval_loss": 2.0002853870391846, | |
| "eval_runtime": 244.3984, | |
| "eval_samples_per_second": 22.95, | |
| "eval_steps_per_second": 5.741, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.912346559555539, | |
| "grad_norm": 1.0934284925460815, | |
| "learning_rate": 8.800782864750677e-06, | |
| "loss": 1.9817, | |
| "step": 13025 | |
| }, | |
| { | |
| "epoch": 3.9198543488869704, | |
| "grad_norm": 1.0394964218139648, | |
| "learning_rate": 8.777098760069491e-06, | |
| "loss": 1.968, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 3.9273621382184016, | |
| "grad_norm": 1.1079460382461548, | |
| "learning_rate": 8.753401532293889e-06, | |
| "loss": 1.9757, | |
| "step": 13075 | |
| }, | |
| { | |
| "epoch": 3.934869927549833, | |
| "grad_norm": 0.9885277152061462, | |
| "learning_rate": 8.729691424929671e-06, | |
| "loss": 1.9789, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.934869927549833, | |
| "eval_loss": 1.9996843338012695, | |
| "eval_runtime": 245.1096, | |
| "eval_samples_per_second": 22.884, | |
| "eval_steps_per_second": 5.724, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.942377716881264, | |
| "grad_norm": 1.005743145942688, | |
| "learning_rate": 8.705968681614985e-06, | |
| "loss": 1.9701, | |
| "step": 13125 | |
| }, | |
| { | |
| "epoch": 3.949885506212696, | |
| "grad_norm": 1.0854625701904297, | |
| "learning_rate": 8.682233546117827e-06, | |
| "loss": 2.0009, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 3.957393295544127, | |
| "grad_norm": 0.9378837943077087, | |
| "learning_rate": 8.658486262333524e-06, | |
| "loss": 1.9618, | |
| "step": 13175 | |
| }, | |
| { | |
| "epoch": 3.9649010848755584, | |
| "grad_norm": 1.0081528425216675, | |
| "learning_rate": 8.63472707428224e-06, | |
| "loss": 1.9598, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.9649010848755584, | |
| "eval_loss": 1.9990559816360474, | |
| "eval_runtime": 244.3863, | |
| "eval_samples_per_second": 22.951, | |
| "eval_steps_per_second": 5.741, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.9724088742069896, | |
| "grad_norm": 1.0947321653366089, | |
| "learning_rate": 8.61095622610646e-06, | |
| "loss": 1.9754, | |
| "step": 13225 | |
| }, | |
| { | |
| "epoch": 3.979916663538421, | |
| "grad_norm": 1.01126229763031, | |
| "learning_rate": 8.587173962068493e-06, | |
| "loss": 2.0003, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 3.9874244528698526, | |
| "grad_norm": 1.0570297241210938, | |
| "learning_rate": 8.563380526547944e-06, | |
| "loss": 1.9662, | |
| "step": 13275 | |
| }, | |
| { | |
| "epoch": 3.994932242201284, | |
| "grad_norm": 1.103887677192688, | |
| "learning_rate": 8.539576164039218e-06, | |
| "loss": 1.9603, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.994932242201284, | |
| "eval_loss": 1.9989780187606812, | |
| "eval_runtime": 244.1926, | |
| "eval_samples_per_second": 22.97, | |
| "eval_steps_per_second": 5.745, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 4.002702804159315, | |
| "grad_norm": 0.9994622468948364, | |
| "learning_rate": 8.515761119149003e-06, | |
| "loss": 2.0651, | |
| "step": 13325 | |
| }, | |
| { | |
| "epoch": 4.010210593490747, | |
| "grad_norm": 1.1002482175827026, | |
| "learning_rate": 8.491935636593756e-06, | |
| "loss": 1.9639, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 4.017718382822178, | |
| "grad_norm": 1.1589230298995972, | |
| "learning_rate": 8.468099961197186e-06, | |
| "loss": 1.9654, | |
| "step": 13375 | |
| }, | |
| { | |
| "epoch": 4.02522617215361, | |
| "grad_norm": 1.0557494163513184, | |
| "learning_rate": 8.444254337887742e-06, | |
| "loss": 1.9567, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 4.02522617215361, | |
| "eval_loss": 1.9992824792861938, | |
| "eval_runtime": 244.4365, | |
| "eval_samples_per_second": 22.947, | |
| "eval_steps_per_second": 5.74, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 4.03273396148504, | |
| "grad_norm": 1.0956406593322754, | |
| "learning_rate": 8.420399011696096e-06, | |
| "loss": 1.9574, | |
| "step": 13425 | |
| }, | |
| { | |
| "epoch": 4.040241750816472, | |
| "grad_norm": 1.314028024673462, | |
| "learning_rate": 8.396534227752622e-06, | |
| "loss": 1.9599, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 4.047749540147904, | |
| "grad_norm": 1.048609972000122, | |
| "learning_rate": 8.372660231284883e-06, | |
| "loss": 1.9483, | |
| "step": 13475 | |
| }, | |
| { | |
| "epoch": 4.055257329479335, | |
| "grad_norm": 1.119491696357727, | |
| "learning_rate": 8.348777267615099e-06, | |
| "loss": 1.9838, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 4.055257329479335, | |
| "eval_loss": 1.998762607574463, | |
| "eval_runtime": 244.4149, | |
| "eval_samples_per_second": 22.949, | |
| "eval_steps_per_second": 5.74, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 4.062765118810766, | |
| "grad_norm": 1.0003256797790527, | |
| "learning_rate": 8.324885582157645e-06, | |
| "loss": 1.9629, | |
| "step": 13525 | |
| }, | |
| { | |
| "epoch": 4.070272908142197, | |
| "grad_norm": 1.059667706489563, | |
| "learning_rate": 8.300985420416509e-06, | |
| "loss": 1.9866, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 4.077780697473629, | |
| "grad_norm": 1.1236132383346558, | |
| "learning_rate": 8.277077027982787e-06, | |
| "loss": 1.9787, | |
| "step": 13575 | |
| }, | |
| { | |
| "epoch": 4.0852884868050605, | |
| "grad_norm": 1.0514492988586426, | |
| "learning_rate": 8.253160650532144e-06, | |
| "loss": 1.9829, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 4.0852884868050605, | |
| "eval_loss": 1.9986952543258667, | |
| "eval_runtime": 245.1032, | |
| "eval_samples_per_second": 22.884, | |
| "eval_steps_per_second": 5.724, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 4.092796276136491, | |
| "grad_norm": 1.0734481811523438, | |
| "learning_rate": 8.2292365338223e-06, | |
| "loss": 1.9832, | |
| "step": 13625 | |
| }, | |
| { | |
| "epoch": 4.100304065467923, | |
| "grad_norm": 1.0448415279388428, | |
| "learning_rate": 8.205304923690505e-06, | |
| "loss": 1.9827, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 4.107811854799355, | |
| "grad_norm": 1.1534922122955322, | |
| "learning_rate": 8.181366066051e-06, | |
| "loss": 1.9398, | |
| "step": 13675 | |
| }, | |
| { | |
| "epoch": 4.1153196441307855, | |
| "grad_norm": 1.0893254280090332, | |
| "learning_rate": 8.157420206892509e-06, | |
| "loss": 1.9696, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.1153196441307855, | |
| "eval_loss": 1.9981467723846436, | |
| "eval_runtime": 244.0215, | |
| "eval_samples_per_second": 22.986, | |
| "eval_steps_per_second": 5.749, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.122827433462217, | |
| "grad_norm": 1.1225614547729492, | |
| "learning_rate": 8.133467592275697e-06, | |
| "loss": 1.9785, | |
| "step": 13725 | |
| }, | |
| { | |
| "epoch": 4.130335222793648, | |
| "grad_norm": 1.1276017427444458, | |
| "learning_rate": 8.109508468330643e-06, | |
| "loss": 1.9679, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 4.13784301212508, | |
| "grad_norm": 1.0437787771224976, | |
| "learning_rate": 8.08554308125432e-06, | |
| "loss": 1.9794, | |
| "step": 13775 | |
| }, | |
| { | |
| "epoch": 4.1453508014565115, | |
| "grad_norm": 1.1491374969482422, | |
| "learning_rate": 8.061571677308061e-06, | |
| "loss": 1.9575, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.1453508014565115, | |
| "eval_loss": 1.9976245164871216, | |
| "eval_runtime": 244.1266, | |
| "eval_samples_per_second": 22.976, | |
| "eval_steps_per_second": 5.747, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.152858590787942, | |
| "grad_norm": 1.140905499458313, | |
| "learning_rate": 8.037594502815015e-06, | |
| "loss": 1.9591, | |
| "step": 13825 | |
| }, | |
| { | |
| "epoch": 4.160366380119374, | |
| "grad_norm": 0.9632274508476257, | |
| "learning_rate": 8.013611804157636e-06, | |
| "loss": 1.9593, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 4.167874169450805, | |
| "grad_norm": 1.1178561449050903, | |
| "learning_rate": 7.989623827775142e-06, | |
| "loss": 1.9729, | |
| "step": 13875 | |
| }, | |
| { | |
| "epoch": 4.1753819587822365, | |
| "grad_norm": 1.068928837776184, | |
| "learning_rate": 7.965630820160984e-06, | |
| "loss": 1.9359, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.1753819587822365, | |
| "eval_loss": 1.9976770877838135, | |
| "eval_runtime": 244.3884, | |
| "eval_samples_per_second": 22.951, | |
| "eval_steps_per_second": 5.741, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.182889748113668, | |
| "grad_norm": 1.0295666456222534, | |
| "learning_rate": 7.941633027860312e-06, | |
| "loss": 1.9739, | |
| "step": 13925 | |
| }, | |
| { | |
| "epoch": 4.190397537445099, | |
| "grad_norm": 1.0357112884521484, | |
| "learning_rate": 7.917630697467438e-06, | |
| "loss": 1.9554, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 4.197905326776531, | |
| "grad_norm": 1.0465984344482422, | |
| "learning_rate": 7.893624075623312e-06, | |
| "loss": 1.9688, | |
| "step": 13975 | |
| }, | |
| { | |
| "epoch": 4.205413116107962, | |
| "grad_norm": 1.0274240970611572, | |
| "learning_rate": 7.869613409012976e-06, | |
| "loss": 1.9705, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.205413116107962, | |
| "eval_loss": 1.9968942403793335, | |
| "eval_runtime": 244.9157, | |
| "eval_samples_per_second": 22.902, | |
| "eval_steps_per_second": 5.729, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.212920905439393, | |
| "grad_norm": 0.9973297119140625, | |
| "learning_rate": 7.845598944363041e-06, | |
| "loss": 1.9775, | |
| "step": 14025 | |
| }, | |
| { | |
| "epoch": 4.220428694770825, | |
| "grad_norm": 1.0587254762649536, | |
| "learning_rate": 7.821580928439141e-06, | |
| "loss": 1.9808, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 4.227936484102256, | |
| "grad_norm": 1.1307932138442993, | |
| "learning_rate": 7.797559608043403e-06, | |
| "loss": 1.9646, | |
| "step": 14075 | |
| }, | |
| { | |
| "epoch": 4.2354442734336875, | |
| "grad_norm": 1.0376613140106201, | |
| "learning_rate": 7.773535230011909e-06, | |
| "loss": 1.961, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.2354442734336875, | |
| "eval_loss": 1.9972692728042603, | |
| "eval_runtime": 244.4264, | |
| "eval_samples_per_second": 22.948, | |
| "eval_steps_per_second": 5.74, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.242952062765119, | |
| "grad_norm": 1.0353500843048096, | |
| "learning_rate": 7.749508041212167e-06, | |
| "loss": 1.9881, | |
| "step": 14125 | |
| }, | |
| { | |
| "epoch": 4.25045985209655, | |
| "grad_norm": 1.191989541053772, | |
| "learning_rate": 7.725478288540554e-06, | |
| "loss": 1.9307, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 4.257967641427982, | |
| "grad_norm": 1.1267927885055542, | |
| "learning_rate": 7.701446218919805e-06, | |
| "loss": 1.9837, | |
| "step": 14175 | |
| }, | |
| { | |
| "epoch": 4.2654754307594125, | |
| "grad_norm": 1.103934407234192, | |
| "learning_rate": 7.677412079296458e-06, | |
| "loss": 1.9557, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.2654754307594125, | |
| "eval_loss": 1.9968904256820679, | |
| "eval_runtime": 244.788, | |
| "eval_samples_per_second": 22.914, | |
| "eval_steps_per_second": 5.731, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.272983220090844, | |
| "grad_norm": 1.1149851083755493, | |
| "learning_rate": 7.653376116638324e-06, | |
| "loss": 1.9573, | |
| "step": 14225 | |
| }, | |
| { | |
| "epoch": 4.280491009422276, | |
| "grad_norm": 1.2663904428482056, | |
| "learning_rate": 7.629338577931943e-06, | |
| "loss": 1.9652, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 4.287998798753707, | |
| "grad_norm": 1.1402429342269897, | |
| "learning_rate": 7.605299710180056e-06, | |
| "loss": 1.9834, | |
| "step": 14275 | |
| }, | |
| { | |
| "epoch": 4.295506588085138, | |
| "grad_norm": 1.1735416650772095, | |
| "learning_rate": 7.581259760399059e-06, | |
| "loss": 1.9743, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.295506588085138, | |
| "eval_loss": 1.9964790344238281, | |
| "eval_runtime": 244.5619, | |
| "eval_samples_per_second": 22.935, | |
| "eval_steps_per_second": 5.737, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.30301437741657, | |
| "grad_norm": 1.0733146667480469, | |
| "learning_rate": 7.557218975616456e-06, | |
| "loss": 1.9297, | |
| "step": 14325 | |
| }, | |
| { | |
| "epoch": 4.310522166748001, | |
| "grad_norm": 1.0636229515075684, | |
| "learning_rate": 7.5331776028683485e-06, | |
| "loss": 2.0013, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 4.318029956079433, | |
| "grad_norm": 1.0287854671478271, | |
| "learning_rate": 7.509135889196871e-06, | |
| "loss": 1.9394, | |
| "step": 14375 | |
| }, | |
| { | |
| "epoch": 4.325537745410863, | |
| "grad_norm": 1.2089693546295166, | |
| "learning_rate": 7.485094081647659e-06, | |
| "loss": 1.9651, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.325537745410863, | |
| "eval_loss": 1.9961069822311401, | |
| "eval_runtime": 244.3299, | |
| "eval_samples_per_second": 22.957, | |
| "eval_steps_per_second": 5.742, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.333045534742295, | |
| "grad_norm": 1.0768557786941528, | |
| "learning_rate": 7.461052427267318e-06, | |
| "loss": 1.9671, | |
| "step": 14425 | |
| }, | |
| { | |
| "epoch": 4.340553324073727, | |
| "grad_norm": 1.1563024520874023, | |
| "learning_rate": 7.437011173100874e-06, | |
| "loss": 1.9492, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 4.348061113405158, | |
| "grad_norm": 1.1290167570114136, | |
| "learning_rate": 7.412970566189248e-06, | |
| "loss": 1.9858, | |
| "step": 14475 | |
| }, | |
| { | |
| "epoch": 4.355568902736589, | |
| "grad_norm": 1.0945930480957031, | |
| "learning_rate": 7.388930853566703e-06, | |
| "loss": 1.9662, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.355568902736589, | |
| "eval_loss": 1.9953595399856567, | |
| "eval_runtime": 244.4122, | |
| "eval_samples_per_second": 22.949, | |
| "eval_steps_per_second": 5.74, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.36307669206802, | |
| "grad_norm": 1.0695611238479614, | |
| "learning_rate": 7.364892282258315e-06, | |
| "loss": 1.947, | |
| "step": 14525 | |
| }, | |
| { | |
| "epoch": 4.370584481399452, | |
| "grad_norm": 1.0597783327102661, | |
| "learning_rate": 7.340855099277433e-06, | |
| "loss": 1.9644, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 4.378092270730884, | |
| "grad_norm": 1.0378893613815308, | |
| "learning_rate": 7.3168195516231395e-06, | |
| "loss": 1.9737, | |
| "step": 14575 | |
| }, | |
| { | |
| "epoch": 4.385600060062314, | |
| "grad_norm": 1.2585569620132446, | |
| "learning_rate": 7.2937471936532264e-06, | |
| "loss": 1.9779, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.385600060062314, | |
| "eval_loss": 1.9953750371932983, | |
| "eval_runtime": 244.2809, | |
| "eval_samples_per_second": 22.961, | |
| "eval_steps_per_second": 5.743, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.393107849393746, | |
| "grad_norm": 1.064031958580017, | |
| "learning_rate": 7.269715567667308e-06, | |
| "loss": 1.9663, | |
| "step": 14625 | |
| }, | |
| { | |
| "epoch": 4.400615638725178, | |
| "grad_norm": 1.1410213708877563, | |
| "learning_rate": 7.245686308017058e-06, | |
| "loss": 1.9573, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 4.408123428056609, | |
| "grad_norm": 1.088382601737976, | |
| "learning_rate": 7.221659661620141e-06, | |
| "loss": 1.9772, | |
| "step": 14675 | |
| }, | |
| { | |
| "epoch": 4.41563121738804, | |
| "grad_norm": 0.994836151599884, | |
| "learning_rate": 7.197635875367368e-06, | |
| "loss": 1.9703, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.41563121738804, | |
| "eval_loss": 1.9953012466430664, | |
| "eval_runtime": 244.5139, | |
| "eval_samples_per_second": 22.939, | |
| "eval_steps_per_second": 5.738, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.423139006719471, | |
| "grad_norm": 1.0412800312042236, | |
| "learning_rate": 7.173615196120162e-06, | |
| "loss": 1.9413, | |
| "step": 14725 | |
| }, | |
| { | |
| "epoch": 4.430646796050903, | |
| "grad_norm": 1.159559726715088, | |
| "learning_rate": 7.149597870708011e-06, | |
| "loss": 2.0046, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 4.4381545853823345, | |
| "grad_norm": 1.045264482498169, | |
| "learning_rate": 7.12558414592596e-06, | |
| "loss": 1.9684, | |
| "step": 14775 | |
| }, | |
| { | |
| "epoch": 4.445662374713765, | |
| "grad_norm": 1.1119288206100464, | |
| "learning_rate": 7.1015742685320326e-06, | |
| "loss": 1.9649, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.445662374713765, | |
| "eval_loss": 1.9939073324203491, | |
| "eval_runtime": 244.3543, | |
| "eval_samples_per_second": 22.954, | |
| "eval_steps_per_second": 5.742, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.453170164045197, | |
| "grad_norm": 1.2049273252487183, | |
| "learning_rate": 7.077568485244728e-06, | |
| "loss": 1.9586, | |
| "step": 14825 | |
| }, | |
| { | |
| "epoch": 4.460677953376628, | |
| "grad_norm": 1.0386916399002075, | |
| "learning_rate": 7.053567042740475e-06, | |
| "loss": 1.9811, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 4.46818574270806, | |
| "grad_norm": 1.0895438194274902, | |
| "learning_rate": 7.029570187651096e-06, | |
| "loss": 1.9829, | |
| "step": 14875 | |
| }, | |
| { | |
| "epoch": 4.475693532039491, | |
| "grad_norm": 1.1542959213256836, | |
| "learning_rate": 7.005578166561275e-06, | |
| "loss": 1.9678, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.475693532039491, | |
| "eval_loss": 1.9941613674163818, | |
| "eval_runtime": 244.7416, | |
| "eval_samples_per_second": 22.918, | |
| "eval_steps_per_second": 5.733, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.483201321370922, | |
| "grad_norm": 1.0670243501663208, | |
| "learning_rate": 6.9815912260060295e-06, | |
| "loss": 1.9542, | |
| "step": 14925 | |
| }, | |
| { | |
| "epoch": 4.490709110702354, | |
| "grad_norm": 1.1406601667404175, | |
| "learning_rate": 6.95760961246816e-06, | |
| "loss": 1.9947, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 4.4982169000337855, | |
| "grad_norm": 1.1366952657699585, | |
| "learning_rate": 6.933633572375736e-06, | |
| "loss": 1.9659, | |
| "step": 14975 | |
| }, | |
| { | |
| "epoch": 4.505724689365216, | |
| "grad_norm": 1.0811400413513184, | |
| "learning_rate": 6.909663352099552e-06, | |
| "loss": 1.9442, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.505724689365216, | |
| "eval_loss": 1.993889331817627, | |
| "eval_runtime": 244.6905, | |
| "eval_samples_per_second": 22.923, | |
| "eval_steps_per_second": 5.734, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.513232478696648, | |
| "grad_norm": 1.013418436050415, | |
| "learning_rate": 6.885699197950602e-06, | |
| "loss": 1.9702, | |
| "step": 15025 | |
| }, | |
| { | |
| "epoch": 4.520740268028079, | |
| "grad_norm": 1.097463846206665, | |
| "learning_rate": 6.86174135617754e-06, | |
| "loss": 1.9547, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 4.5282480573595105, | |
| "grad_norm": 1.1067347526550293, | |
| "learning_rate": 6.83779007296417e-06, | |
| "loss": 1.9772, | |
| "step": 15075 | |
| }, | |
| { | |
| "epoch": 4.535755846690942, | |
| "grad_norm": 1.0753045082092285, | |
| "learning_rate": 6.813845594426891e-06, | |
| "loss": 1.9522, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.535755846690942, | |
| "eval_loss": 1.9931405782699585, | |
| "eval_runtime": 244.8392, | |
| "eval_samples_per_second": 22.909, | |
| "eval_steps_per_second": 5.73, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.543263636022373, | |
| "grad_norm": 1.0194171667099, | |
| "learning_rate": 6.789908166612178e-06, | |
| "loss": 1.9643, | |
| "step": 15125 | |
| }, | |
| { | |
| "epoch": 4.550771425353805, | |
| "grad_norm": 1.123500108718872, | |
| "learning_rate": 6.76597803549406e-06, | |
| "loss": 1.954, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 4.558279214685236, | |
| "grad_norm": 1.1514633893966675, | |
| "learning_rate": 6.742055446971586e-06, | |
| "loss": 1.954, | |
| "step": 15175 | |
| }, | |
| { | |
| "epoch": 4.565787004016667, | |
| "grad_norm": 1.1776665449142456, | |
| "learning_rate": 6.718140646866296e-06, | |
| "loss": 1.9539, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.565787004016667, | |
| "eval_loss": 1.9931755065917969, | |
| "eval_runtime": 243.9594, | |
| "eval_samples_per_second": 22.992, | |
| "eval_steps_per_second": 5.751, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.573294793348099, | |
| "grad_norm": 1.1815805435180664, | |
| "learning_rate": 6.694233880919708e-06, | |
| "loss": 1.9478, | |
| "step": 15225 | |
| }, | |
| { | |
| "epoch": 4.58080258267953, | |
| "grad_norm": 1.0977429151535034, | |
| "learning_rate": 6.670335394790772e-06, | |
| "loss": 1.947, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 4.5883103720109615, | |
| "grad_norm": 1.1538454294204712, | |
| "learning_rate": 6.6464454340533655e-06, | |
| "loss": 1.9462, | |
| "step": 15275 | |
| }, | |
| { | |
| "epoch": 4.595818161342393, | |
| "grad_norm": 1.1371299028396606, | |
| "learning_rate": 6.622564244193754e-06, | |
| "loss": 1.9586, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.595818161342393, | |
| "eval_loss": 1.9928078651428223, | |
| "eval_runtime": 244.396, | |
| "eval_samples_per_second": 22.95, | |
| "eval_steps_per_second": 5.741, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.603325950673824, | |
| "grad_norm": 1.1348552703857422, | |
| "learning_rate": 6.598692070608083e-06, | |
| "loss": 1.9509, | |
| "step": 15325 | |
| }, | |
| { | |
| "epoch": 4.610833740005256, | |
| "grad_norm": 0.9622187614440918, | |
| "learning_rate": 6.5748291585998436e-06, | |
| "loss": 1.9359, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 4.6183415293366865, | |
| "grad_norm": 0.9866182208061218, | |
| "learning_rate": 6.55097575337736e-06, | |
| "loss": 1.9664, | |
| "step": 15375 | |
| }, | |
| { | |
| "epoch": 4.625849318668118, | |
| "grad_norm": 1.1888655424118042, | |
| "learning_rate": 6.5271321000512715e-06, | |
| "loss": 1.9483, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.625849318668118, | |
| "eval_loss": 1.9925552606582642, | |
| "eval_runtime": 244.5029, | |
| "eval_samples_per_second": 22.94, | |
| "eval_steps_per_second": 5.738, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.63335710799955, | |
| "grad_norm": 1.1576796770095825, | |
| "learning_rate": 6.503298443632006e-06, | |
| "loss": 1.9494, | |
| "step": 15425 | |
| }, | |
| { | |
| "epoch": 4.640864897330981, | |
| "grad_norm": 1.1134017705917358, | |
| "learning_rate": 6.479475029027266e-06, | |
| "loss": 1.9282, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 4.648372686662412, | |
| "grad_norm": 1.2257720232009888, | |
| "learning_rate": 6.45566210103951e-06, | |
| "loss": 1.9648, | |
| "step": 15475 | |
| }, | |
| { | |
| "epoch": 4.655880475993843, | |
| "grad_norm": 1.1228723526000977, | |
| "learning_rate": 6.431859904363441e-06, | |
| "loss": 1.9436, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.655880475993843, | |
| "eval_loss": 1.9922431707382202, | |
| "eval_runtime": 244.1596, | |
| "eval_samples_per_second": 22.973, | |
| "eval_steps_per_second": 5.746, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.663388265325275, | |
| "grad_norm": 1.1629343032836914, | |
| "learning_rate": 6.40806868358349e-06, | |
| "loss": 1.9939, | |
| "step": 15525 | |
| }, | |
| { | |
| "epoch": 4.670896054656707, | |
| "grad_norm": 1.229765772819519, | |
| "learning_rate": 6.38428868317131e-06, | |
| "loss": 1.9469, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 4.6784038439881375, | |
| "grad_norm": 1.10246741771698, | |
| "learning_rate": 6.360520147483243e-06, | |
| "loss": 1.97, | |
| "step": 15575 | |
| }, | |
| { | |
| "epoch": 4.685911633319569, | |
| "grad_norm": 1.0727444887161255, | |
| "learning_rate": 6.336763320757837e-06, | |
| "loss": 1.9598, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.685911633319569, | |
| "eval_loss": 1.9915326833724976, | |
| "eval_runtime": 244.8129, | |
| "eval_samples_per_second": 22.911, | |
| "eval_steps_per_second": 5.731, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.693419422651001, | |
| "grad_norm": 1.1504484415054321, | |
| "learning_rate": 6.313018447113308e-06, | |
| "loss": 2.0044, | |
| "step": 15625 | |
| }, | |
| { | |
| "epoch": 4.700927211982432, | |
| "grad_norm": 1.2329007387161255, | |
| "learning_rate": 6.289285770545056e-06, | |
| "loss": 1.9718, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 4.708435001313863, | |
| "grad_norm": 1.12412691116333, | |
| "learning_rate": 6.265565534923142e-06, | |
| "loss": 1.9716, | |
| "step": 15675 | |
| }, | |
| { | |
| "epoch": 4.715942790645294, | |
| "grad_norm": 1.1599684953689575, | |
| "learning_rate": 6.241857983989794e-06, | |
| "loss": 1.9562, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.715942790645294, | |
| "eval_loss": 1.9914188385009766, | |
| "eval_runtime": 244.1223, | |
| "eval_samples_per_second": 22.976, | |
| "eval_steps_per_second": 5.747, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.723450579976726, | |
| "grad_norm": 1.0815508365631104, | |
| "learning_rate": 6.21816336135689e-06, | |
| "loss": 1.9537, | |
| "step": 15725 | |
| }, | |
| { | |
| "epoch": 4.730958369308157, | |
| "grad_norm": 1.1671316623687744, | |
| "learning_rate": 6.1944819105034615e-06, | |
| "loss": 1.94, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 4.738466158639588, | |
| "grad_norm": 1.0050816535949707, | |
| "learning_rate": 6.170813874773193e-06, | |
| "loss": 1.9701, | |
| "step": 15775 | |
| }, | |
| { | |
| "epoch": 4.74597394797102, | |
| "grad_norm": 1.134464979171753, | |
| "learning_rate": 6.1471594973719145e-06, | |
| "loss": 1.9671, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.74597394797102, | |
| "eval_loss": 1.9911348819732666, | |
| "eval_runtime": 244.1165, | |
| "eval_samples_per_second": 22.977, | |
| "eval_steps_per_second": 5.747, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.753481737302451, | |
| "grad_norm": 1.177161455154419, | |
| "learning_rate": 6.123519021365107e-06, | |
| "loss": 1.9476, | |
| "step": 15825 | |
| }, | |
| { | |
| "epoch": 4.760989526633883, | |
| "grad_norm": 0.998101532459259, | |
| "learning_rate": 6.099892689675414e-06, | |
| "loss": 1.9599, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 4.768497315965314, | |
| "grad_norm": 1.0538263320922852, | |
| "learning_rate": 6.076280745080128e-06, | |
| "loss": 2.0034, | |
| "step": 15875 | |
| }, | |
| { | |
| "epoch": 4.776005105296745, | |
| "grad_norm": 1.2193068265914917, | |
| "learning_rate": 6.0526834302087054e-06, | |
| "loss": 1.9526, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.776005105296745, | |
| "eval_loss": 1.9908709526062012, | |
| "eval_runtime": 244.0685, | |
| "eval_samples_per_second": 22.981, | |
| "eval_steps_per_second": 5.748, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.783512894628177, | |
| "grad_norm": 1.0334097146987915, | |
| "learning_rate": 6.0291009875402705e-06, | |
| "loss": 1.9999, | |
| "step": 15925 | |
| }, | |
| { | |
| "epoch": 4.791020683959609, | |
| "grad_norm": 1.173577904701233, | |
| "learning_rate": 6.005533659401131e-06, | |
| "loss": 1.9886, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 4.798528473291039, | |
| "grad_norm": 1.270085334777832, | |
| "learning_rate": 5.98198168796227e-06, | |
| "loss": 1.9726, | |
| "step": 15975 | |
| }, | |
| { | |
| "epoch": 4.806036262622471, | |
| "grad_norm": 1.2580983638763428, | |
| "learning_rate": 5.958445315236885e-06, | |
| "loss": 1.9382, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.806036262622471, | |
| "eval_loss": 1.9913830757141113, | |
| "eval_runtime": 244.081, | |
| "eval_samples_per_second": 22.98, | |
| "eval_steps_per_second": 5.748, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.813544051953902, | |
| "grad_norm": 1.080772042274475, | |
| "learning_rate": 5.934924783077876e-06, | |
| "loss": 1.9402, | |
| "step": 16025 | |
| }, | |
| { | |
| "epoch": 4.821051841285334, | |
| "grad_norm": 1.1412278413772583, | |
| "learning_rate": 5.911420333175371e-06, | |
| "loss": 1.9609, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 4.828559630616764, | |
| "grad_norm": 1.0110164880752563, | |
| "learning_rate": 5.887932207054245e-06, | |
| "loss": 1.9922, | |
| "step": 16075 | |
| }, | |
| { | |
| "epoch": 4.836067419948196, | |
| "grad_norm": 1.1360834836959839, | |
| "learning_rate": 5.864460646071631e-06, | |
| "loss": 2.0002, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.836067419948196, | |
| "eval_loss": 1.9903969764709473, | |
| "eval_runtime": 244.6106, | |
| "eval_samples_per_second": 22.93, | |
| "eval_steps_per_second": 5.736, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.843575209279628, | |
| "grad_norm": 1.163109302520752, | |
| "learning_rate": 5.841005891414443e-06, | |
| "loss": 1.9692, | |
| "step": 16125 | |
| }, | |
| { | |
| "epoch": 4.851082998611059, | |
| "grad_norm": 1.1313296556472778, | |
| "learning_rate": 5.817568184096897e-06, | |
| "loss": 1.9648, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 4.85859078794249, | |
| "grad_norm": 1.1544945240020752, | |
| "learning_rate": 5.794147764958046e-06, | |
| "loss": 1.9696, | |
| "step": 16175 | |
| }, | |
| { | |
| "epoch": 4.866098577273922, | |
| "grad_norm": 1.0399378538131714, | |
| "learning_rate": 5.770744874659283e-06, | |
| "loss": 1.9396, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.866098577273922, | |
| "eval_loss": 1.9903674125671387, | |
| "eval_runtime": 244.9102, | |
| "eval_samples_per_second": 22.902, | |
| "eval_steps_per_second": 5.729, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.873606366605353, | |
| "grad_norm": 1.189995288848877, | |
| "learning_rate": 5.747359753681883e-06, | |
| "loss": 1.9542, | |
| "step": 16225 | |
| }, | |
| { | |
| "epoch": 4.8811141559367845, | |
| "grad_norm": 1.0425307750701904, | |
| "learning_rate": 5.7239926423245305e-06, | |
| "loss": 1.9764, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 4.888621945268216, | |
| "grad_norm": 1.1978663206100464, | |
| "learning_rate": 5.700643780700849e-06, | |
| "loss": 1.9624, | |
| "step": 16275 | |
| }, | |
| { | |
| "epoch": 4.896129734599647, | |
| "grad_norm": 1.045249104499817, | |
| "learning_rate": 5.677313408736924e-06, | |
| "loss": 1.9709, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.896129734599647, | |
| "eval_loss": 1.9895341396331787, | |
| "eval_runtime": 244.445, | |
| "eval_samples_per_second": 22.946, | |
| "eval_steps_per_second": 5.74, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.903637523931079, | |
| "grad_norm": 1.119350790977478, | |
| "learning_rate": 5.654001766168861e-06, | |
| "loss": 1.9712, | |
| "step": 16325 | |
| }, | |
| { | |
| "epoch": 4.91114531326251, | |
| "grad_norm": 1.090303897857666, | |
| "learning_rate": 5.630709092540301e-06, | |
| "loss": 1.9269, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 4.918653102593941, | |
| "grad_norm": 1.2654612064361572, | |
| "learning_rate": 5.607435627199961e-06, | |
| "loss": 1.9468, | |
| "step": 16375 | |
| }, | |
| { | |
| "epoch": 4.926160891925372, | |
| "grad_norm": 1.0900917053222656, | |
| "learning_rate": 5.584181609299187e-06, | |
| "loss": 1.9574, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.926160891925372, | |
| "eval_loss": 1.989732265472412, | |
| "eval_runtime": 244.1216, | |
| "eval_samples_per_second": 22.976, | |
| "eval_steps_per_second": 5.747, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.933668681256804, | |
| "grad_norm": 1.192901372909546, | |
| "learning_rate": 5.560947277789483e-06, | |
| "loss": 1.928, | |
| "step": 16425 | |
| }, | |
| { | |
| "epoch": 4.9411764705882355, | |
| "grad_norm": 1.0490167140960693, | |
| "learning_rate": 5.537732871420064e-06, | |
| "loss": 1.9452, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 4.948684259919666, | |
| "grad_norm": 1.0243791341781616, | |
| "learning_rate": 5.514538628735402e-06, | |
| "loss": 1.9646, | |
| "step": 16475 | |
| }, | |
| { | |
| "epoch": 4.956192049251098, | |
| "grad_norm": 1.091910481452942, | |
| "learning_rate": 5.491364788072769e-06, | |
| "loss": 1.982, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.956192049251098, | |
| "eval_loss": 1.9894477128982544, | |
| "eval_runtime": 244.6192, | |
| "eval_samples_per_second": 22.93, | |
| "eval_steps_per_second": 5.735, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.96369983858253, | |
| "grad_norm": 1.1949421167373657, | |
| "learning_rate": 5.468211587559794e-06, | |
| "loss": 1.9528, | |
| "step": 16525 | |
| }, | |
| { | |
| "epoch": 4.9712076279139605, | |
| "grad_norm": 1.0508161783218384, | |
| "learning_rate": 5.445079265112013e-06, | |
| "loss": 1.9485, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 4.978715417245392, | |
| "grad_norm": 1.2194594144821167, | |
| "learning_rate": 5.421968058430424e-06, | |
| "loss": 1.9324, | |
| "step": 16575 | |
| }, | |
| { | |
| "epoch": 4.986223206576824, | |
| "grad_norm": 1.1732969284057617, | |
| "learning_rate": 5.398878204999047e-06, | |
| "loss": 1.9588, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.986223206576824, | |
| "eval_loss": 1.9885412454605103, | |
| "eval_runtime": 244.0337, | |
| "eval_samples_per_second": 22.985, | |
| "eval_steps_per_second": 5.749, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.993730995908255, | |
| "grad_norm": 1.142801284790039, | |
| "learning_rate": 5.375809942082486e-06, | |
| "loss": 1.969, | |
| "step": 16625 | |
| }, | |
| { | |
| "epoch": 5.001501557866287, | |
| "grad_norm": 1.1804615259170532, | |
| "learning_rate": 5.35276350672348e-06, | |
| "loss": 2.0292, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 5.0090093471977175, | |
| "grad_norm": 1.184505581855774, | |
| "learning_rate": 5.329739135740479e-06, | |
| "loss": 1.9356, | |
| "step": 16675 | |
| }, | |
| { | |
| "epoch": 5.016517136529149, | |
| "grad_norm": 1.23818039894104, | |
| "learning_rate": 5.306737065725203e-06, | |
| "loss": 1.9537, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 5.016517136529149, | |
| "eval_loss": 1.9894059896469116, | |
| "eval_runtime": 244.4081, | |
| "eval_samples_per_second": 22.949, | |
| "eval_steps_per_second": 5.74, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 5.02402492586058, | |
| "grad_norm": 1.1360877752304077, | |
| "learning_rate": 5.283757533040218e-06, | |
| "loss": 1.9584, | |
| "step": 16725 | |
| }, | |
| { | |
| "epoch": 5.031532715192012, | |
| "grad_norm": 0.9773384928703308, | |
| "learning_rate": 5.260800773816495e-06, | |
| "loss": 1.9773, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 5.039040504523443, | |
| "grad_norm": 1.052291750907898, | |
| "learning_rate": 5.237867023951004e-06, | |
| "loss": 1.9516, | |
| "step": 16775 | |
| }, | |
| { | |
| "epoch": 5.046548293854874, | |
| "grad_norm": 1.086792230606079, | |
| "learning_rate": 5.214956519104266e-06, | |
| "loss": 1.9529, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 5.046548293854874, | |
| "eval_loss": 1.9890544414520264, | |
| "eval_runtime": 244.3105, | |
| "eval_samples_per_second": 22.958, | |
| "eval_steps_per_second": 5.743, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 5.054056083186306, | |
| "grad_norm": 1.0600550174713135, | |
| "learning_rate": 5.192069494697948e-06, | |
| "loss": 1.9553, | |
| "step": 16825 | |
| }, | |
| { | |
| "epoch": 5.061563872517737, | |
| "grad_norm": 1.2051581144332886, | |
| "learning_rate": 5.169206185912439e-06, | |
| "loss": 1.9469, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 5.0690716618491685, | |
| "grad_norm": 1.0981636047363281, | |
| "learning_rate": 5.146366827684433e-06, | |
| "loss": 1.9817, | |
| "step": 16875 | |
| }, | |
| { | |
| "epoch": 5.0765794511806, | |
| "grad_norm": 1.2117871046066284, | |
| "learning_rate": 5.123551654704513e-06, | |
| "loss": 1.9476, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 5.0765794511806, | |
| "eval_loss": 1.9889459609985352, | |
| "eval_runtime": 244.8126, | |
| "eval_samples_per_second": 22.911, | |
| "eval_steps_per_second": 5.731, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 5.084087240512031, | |
| "grad_norm": 1.1481753587722778, | |
| "learning_rate": 5.101672059749764e-06, | |
| "loss": 1.9257, | |
| "step": 16925 | |
| }, | |
| { | |
| "epoch": 5.091595029843463, | |
| "grad_norm": 1.0862995386123657, | |
| "learning_rate": 5.0789049696927284e-06, | |
| "loss": 1.9393, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 5.099102819174894, | |
| "grad_norm": 1.1692668199539185, | |
| "learning_rate": 5.056162758102157e-06, | |
| "loss": 1.9525, | |
| "step": 16975 | |
| }, | |
| { | |
| "epoch": 5.106610608506325, | |
| "grad_norm": 1.2036652565002441, | |
| "learning_rate": 5.033445658670386e-06, | |
| "loss": 1.9622, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 5.106610608506325, | |
| "eval_loss": 1.988864541053772, | |
| "eval_runtime": 244.6644, | |
| "eval_samples_per_second": 22.925, | |
| "eval_steps_per_second": 5.734, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 5.114118397837757, | |
| "grad_norm": 1.2949445247650146, | |
| "learning_rate": 5.0107539048317025e-06, | |
| "loss": 1.9454, | |
| "step": 17025 | |
| }, | |
| { | |
| "epoch": 5.121626187169188, | |
| "grad_norm": 1.3341749906539917, | |
| "learning_rate": 4.98808772975995e-06, | |
| "loss": 1.9501, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 5.129133976500619, | |
| "grad_norm": 1.147869348526001, | |
| "learning_rate": 4.965447366366137e-06, | |
| "loss": 1.9392, | |
| "step": 17075 | |
| }, | |
| { | |
| "epoch": 5.136641765832051, | |
| "grad_norm": 1.2364166975021362, | |
| "learning_rate": 4.9428330472960326e-06, | |
| "loss": 1.957, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 5.136641765832051, | |
| "eval_loss": 1.9885538816452026, | |
| "eval_runtime": 244.7413, | |
| "eval_samples_per_second": 22.918, | |
| "eval_steps_per_second": 5.733, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 5.144149555163482, | |
| "grad_norm": 1.0443626642227173, | |
| "learning_rate": 4.920245004927787e-06, | |
| "loss": 1.9461, | |
| "step": 17125 | |
| }, | |
| { | |
| "epoch": 5.151657344494914, | |
| "grad_norm": 1.1504952907562256, | |
| "learning_rate": 4.897683471369532e-06, | |
| "loss": 1.9492, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 5.1591651338263445, | |
| "grad_norm": 1.1174638271331787, | |
| "learning_rate": 4.875148678457012e-06, | |
| "loss": 1.9496, | |
| "step": 17175 | |
| }, | |
| { | |
| "epoch": 5.166672923157776, | |
| "grad_norm": 1.2215421199798584, | |
| "learning_rate": 4.852640857751181e-06, | |
| "loss": 1.9272, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.166672923157776, | |
| "eval_loss": 1.9891639947891235, | |
| "eval_runtime": 244.7795, | |
| "eval_samples_per_second": 22.915, | |
| "eval_steps_per_second": 5.732, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.174180712489208, | |
| "grad_norm": 1.179458498954773, | |
| "learning_rate": 4.830160240535846e-06, | |
| "loss": 1.965, | |
| "step": 17225 | |
| }, | |
| { | |
| "epoch": 5.181688501820639, | |
| "grad_norm": 1.1385215520858765, | |
| "learning_rate": 4.807707057815272e-06, | |
| "loss": 1.9466, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 5.18919629115207, | |
| "grad_norm": 1.4543343782424927, | |
| "learning_rate": 4.785281540311815e-06, | |
| "loss": 1.9864, | |
| "step": 17275 | |
| }, | |
| { | |
| "epoch": 5.196704080483501, | |
| "grad_norm": 1.1437246799468994, | |
| "learning_rate": 4.762883918463555e-06, | |
| "loss": 1.9545, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.196704080483501, | |
| "eval_loss": 1.988171935081482, | |
| "eval_runtime": 244.7849, | |
| "eval_samples_per_second": 22.914, | |
| "eval_steps_per_second": 5.732, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.204211869814933, | |
| "grad_norm": 1.154579520225525, | |
| "learning_rate": 4.740514422421921e-06, | |
| "loss": 1.9295, | |
| "step": 17325 | |
| }, | |
| { | |
| "epoch": 5.211719659146365, | |
| "grad_norm": 1.21454656124115, | |
| "learning_rate": 4.71817328204933e-06, | |
| "loss": 1.9554, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 5.219227448477795, | |
| "grad_norm": 1.1201882362365723, | |
| "learning_rate": 4.695860726916826e-06, | |
| "loss": 1.9313, | |
| "step": 17375 | |
| }, | |
| { | |
| "epoch": 5.226735237809227, | |
| "grad_norm": 1.1768020391464233, | |
| "learning_rate": 4.673576986301719e-06, | |
| "loss": 1.9316, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.226735237809227, | |
| "eval_loss": 1.9883191585540771, | |
| "eval_runtime": 245.1496, | |
| "eval_samples_per_second": 22.88, | |
| "eval_steps_per_second": 5.723, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.234243027140659, | |
| "grad_norm": 1.14753258228302, | |
| "learning_rate": 4.651322289185229e-06, | |
| "loss": 1.9224, | |
| "step": 17425 | |
| }, | |
| { | |
| "epoch": 5.24175081647209, | |
| "grad_norm": 1.2445884943008423, | |
| "learning_rate": 4.629096864250132e-06, | |
| "loss": 1.9336, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 5.249258605803521, | |
| "grad_norm": 1.0989011526107788, | |
| "learning_rate": 4.606900939878415e-06, | |
| "loss": 1.9434, | |
| "step": 17475 | |
| }, | |
| { | |
| "epoch": 5.256766395134952, | |
| "grad_norm": 1.2167655229568481, | |
| "learning_rate": 4.584734744148922e-06, | |
| "loss": 1.9219, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.256766395134952, | |
| "eval_loss": 1.9880566596984863, | |
| "eval_runtime": 245.0782, | |
| "eval_samples_per_second": 22.887, | |
| "eval_steps_per_second": 5.725, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.264274184466384, | |
| "grad_norm": 1.070075511932373, | |
| "learning_rate": 4.562598504835015e-06, | |
| "loss": 1.9723, | |
| "step": 17525 | |
| }, | |
| { | |
| "epoch": 5.2717819737978155, | |
| "grad_norm": 1.1149256229400635, | |
| "learning_rate": 4.540492449402237e-06, | |
| "loss": 1.9661, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 5.279289763129246, | |
| "grad_norm": 1.1833670139312744, | |
| "learning_rate": 4.5184168050059645e-06, | |
| "loss": 1.9208, | |
| "step": 17575 | |
| }, | |
| { | |
| "epoch": 5.286797552460678, | |
| "grad_norm": 1.1111880540847778, | |
| "learning_rate": 4.496371798489084e-06, | |
| "loss": 1.9621, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.286797552460678, | |
| "eval_loss": 1.9878884553909302, | |
| "eval_runtime": 245.0298, | |
| "eval_samples_per_second": 22.891, | |
| "eval_steps_per_second": 5.726, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.294305341792109, | |
| "grad_norm": 1.1070398092269897, | |
| "learning_rate": 4.47435765637965e-06, | |
| "loss": 1.9578, | |
| "step": 17625 | |
| }, | |
| { | |
| "epoch": 5.301813131123541, | |
| "grad_norm": 1.127094030380249, | |
| "learning_rate": 4.452374604888568e-06, | |
| "loss": 1.9291, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 5.309320920454972, | |
| "grad_norm": 1.1716080904006958, | |
| "learning_rate": 4.430422869907261e-06, | |
| "loss": 1.9694, | |
| "step": 17675 | |
| }, | |
| { | |
| "epoch": 5.316828709786403, | |
| "grad_norm": 1.0636411905288696, | |
| "learning_rate": 4.408502677005365e-06, | |
| "loss": 1.9692, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.316828709786403, | |
| "eval_loss": 1.9873278141021729, | |
| "eval_runtime": 245.0048, | |
| "eval_samples_per_second": 22.893, | |
| "eval_steps_per_second": 5.726, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.324336499117835, | |
| "grad_norm": 1.0959444046020508, | |
| "learning_rate": 4.386614251428382e-06, | |
| "loss": 1.9467, | |
| "step": 17725 | |
| }, | |
| { | |
| "epoch": 5.3318442884492665, | |
| "grad_norm": 1.3291634321212769, | |
| "learning_rate": 4.3647578180953905e-06, | |
| "loss": 1.9335, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 5.339352077780697, | |
| "grad_norm": 1.1393731832504272, | |
| "learning_rate": 4.342933601596728e-06, | |
| "loss": 1.9253, | |
| "step": 17775 | |
| }, | |
| { | |
| "epoch": 5.346859867112129, | |
| "grad_norm": 1.0294339656829834, | |
| "learning_rate": 4.321141826191677e-06, | |
| "loss": 1.9358, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.346859867112129, | |
| "eval_loss": 1.9870134592056274, | |
| "eval_runtime": 244.8052, | |
| "eval_samples_per_second": 22.912, | |
| "eval_steps_per_second": 5.731, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.35436765644356, | |
| "grad_norm": 1.1546337604522705, | |
| "learning_rate": 4.299382715806166e-06, | |
| "loss": 1.9828, | |
| "step": 17825 | |
| }, | |
| { | |
| "epoch": 5.3618754457749915, | |
| "grad_norm": 1.1130156517028809, | |
| "learning_rate": 4.27765649403047e-06, | |
| "loss": 1.9328, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 5.369383235106423, | |
| "grad_norm": 1.1644601821899414, | |
| "learning_rate": 4.2559633841169055e-06, | |
| "loss": 1.9425, | |
| "step": 17875 | |
| }, | |
| { | |
| "epoch": 5.376891024437854, | |
| "grad_norm": 1.1819103956222534, | |
| "learning_rate": 4.2343036089775444e-06, | |
| "loss": 1.9346, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.376891024437854, | |
| "eval_loss": 1.9867066144943237, | |
| "eval_runtime": 244.6669, | |
| "eval_samples_per_second": 22.925, | |
| "eval_steps_per_second": 5.734, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.384398813769286, | |
| "grad_norm": 1.126689076423645, | |
| "learning_rate": 4.212677391181919e-06, | |
| "loss": 1.9554, | |
| "step": 17925 | |
| }, | |
| { | |
| "epoch": 5.391906603100717, | |
| "grad_norm": 1.2009223699569702, | |
| "learning_rate": 4.191084952954739e-06, | |
| "loss": 1.9597, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 5.399414392432148, | |
| "grad_norm": 1.1235226392745972, | |
| "learning_rate": 4.169526516173596e-06, | |
| "loss": 1.9362, | |
| "step": 17975 | |
| }, | |
| { | |
| "epoch": 5.40692218176358, | |
| "grad_norm": 1.2246404886245728, | |
| "learning_rate": 4.148002302366707e-06, | |
| "loss": 1.9621, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.40692218176358, | |
| "eval_loss": 1.9868113994598389, | |
| "eval_runtime": 244.649, | |
| "eval_samples_per_second": 22.927, | |
| "eval_steps_per_second": 5.735, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.414429971095011, | |
| "grad_norm": 1.140601634979248, | |
| "learning_rate": 4.126512532710613e-06, | |
| "loss": 1.9313, | |
| "step": 18025 | |
| }, | |
| { | |
| "epoch": 5.4219377604264425, | |
| "grad_norm": 1.1360208988189697, | |
| "learning_rate": 4.105057428027919e-06, | |
| "loss": 1.9462, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 5.429445549757874, | |
| "grad_norm": 1.172402024269104, | |
| "learning_rate": 4.0836372087850255e-06, | |
| "loss": 1.9577, | |
| "step": 18075 | |
| }, | |
| { | |
| "epoch": 5.436953339089305, | |
| "grad_norm": 1.1650437116622925, | |
| "learning_rate": 4.062252095089857e-06, | |
| "loss": 1.9299, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.436953339089305, | |
| "eval_loss": 1.9864240884780884, | |
| "eval_runtime": 244.7091, | |
| "eval_samples_per_second": 22.921, | |
| "eval_steps_per_second": 5.733, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.444461128420737, | |
| "grad_norm": 1.1697797775268555, | |
| "learning_rate": 4.040902306689605e-06, | |
| "loss": 1.9483, | |
| "step": 18125 | |
| }, | |
| { | |
| "epoch": 5.4519689177521675, | |
| "grad_norm": 1.194689393043518, | |
| "learning_rate": 4.019588062968471e-06, | |
| "loss": 1.9468, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 5.459476707083599, | |
| "grad_norm": 1.0986504554748535, | |
| "learning_rate": 3.998309582945405e-06, | |
| "loss": 1.9472, | |
| "step": 18175 | |
| }, | |
| { | |
| "epoch": 5.466984496415031, | |
| "grad_norm": 1.2854065895080566, | |
| "learning_rate": 3.977067085271864e-06, | |
| "loss": 1.9455, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.466984496415031, | |
| "eval_loss": 1.9863779544830322, | |
| "eval_runtime": 244.6161, | |
| "eval_samples_per_second": 22.93, | |
| "eval_steps_per_second": 5.736, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.474492285746462, | |
| "grad_norm": 1.167863368988037, | |
| "learning_rate": 3.95586078822956e-06, | |
| "loss": 1.9287, | |
| "step": 18225 | |
| }, | |
| { | |
| "epoch": 5.482000075077893, | |
| "grad_norm": 1.190122365951538, | |
| "learning_rate": 3.934690909728214e-06, | |
| "loss": 1.9581, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 5.489507864409324, | |
| "grad_norm": 1.0951225757598877, | |
| "learning_rate": 3.913557667303326e-06, | |
| "loss": 1.93, | |
| "step": 18275 | |
| }, | |
| { | |
| "epoch": 5.497015653740756, | |
| "grad_norm": 1.052368402481079, | |
| "learning_rate": 3.8924612781139276e-06, | |
| "loss": 1.9753, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.497015653740756, | |
| "eval_loss": 1.9860328435897827, | |
| "eval_runtime": 245.3363, | |
| "eval_samples_per_second": 22.862, | |
| "eval_steps_per_second": 5.719, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.504523443072188, | |
| "grad_norm": 1.1306352615356445, | |
| "learning_rate": 3.87140195894037e-06, | |
| "loss": 1.9711, | |
| "step": 18325 | |
| }, | |
| { | |
| "epoch": 5.5120312324036185, | |
| "grad_norm": 1.174249291419983, | |
| "learning_rate": 3.850379926182069e-06, | |
| "loss": 1.9391, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 5.51953902173505, | |
| "grad_norm": 1.0850168466567993, | |
| "learning_rate": 3.8293953958553055e-06, | |
| "loss": 1.9709, | |
| "step": 18375 | |
| }, | |
| { | |
| "epoch": 5.527046811066482, | |
| "grad_norm": 1.1175942420959473, | |
| "learning_rate": 3.8084485835909922e-06, | |
| "loss": 1.9369, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.527046811066482, | |
| "eval_loss": 1.9858981370925903, | |
| "eval_runtime": 244.8528, | |
| "eval_samples_per_second": 22.908, | |
| "eval_steps_per_second": 5.73, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.534554600397913, | |
| "grad_norm": 1.1870449781417847, | |
| "learning_rate": 3.7875397046324636e-06, | |
| "loss": 1.9603, | |
| "step": 18425 | |
| }, | |
| { | |
| "epoch": 5.542062389729344, | |
| "grad_norm": 1.1581183671951294, | |
| "learning_rate": 3.766668973833262e-06, | |
| "loss": 1.9415, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 5.549570179060775, | |
| "grad_norm": 1.0704885721206665, | |
| "learning_rate": 3.7458366056549304e-06, | |
| "loss": 1.945, | |
| "step": 18475 | |
| }, | |
| { | |
| "epoch": 5.557077968392207, | |
| "grad_norm": 1.209778904914856, | |
| "learning_rate": 3.7250428141648097e-06, | |
| "loss": 1.9571, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.557077968392207, | |
| "eval_loss": 1.9858996868133545, | |
| "eval_runtime": 244.4136, | |
| "eval_samples_per_second": 22.949, | |
| "eval_steps_per_second": 5.74, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.564585757723639, | |
| "grad_norm": 1.1481252908706665, | |
| "learning_rate": 3.704287813033836e-06, | |
| "loss": 1.9445, | |
| "step": 18525 | |
| }, | |
| { | |
| "epoch": 5.572093547055069, | |
| "grad_norm": 1.1967343091964722, | |
| "learning_rate": 3.6835718155343483e-06, | |
| "loss": 1.9457, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 5.579601336386501, | |
| "grad_norm": 1.246741771697998, | |
| "learning_rate": 3.6628950345378965e-06, | |
| "loss": 1.951, | |
| "step": 18575 | |
| }, | |
| { | |
| "epoch": 5.587109125717932, | |
| "grad_norm": 1.1486597061157227, | |
| "learning_rate": 3.6422576825130477e-06, | |
| "loss": 1.9534, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.587109125717932, | |
| "eval_loss": 1.9850828647613525, | |
| "eval_runtime": 244.5498, | |
| "eval_samples_per_second": 22.936, | |
| "eval_steps_per_second": 5.737, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.594616915049364, | |
| "grad_norm": 1.0895804166793823, | |
| "learning_rate": 3.62165997152322e-06, | |
| "loss": 1.9507, | |
| "step": 18625 | |
| }, | |
| { | |
| "epoch": 5.602124704380795, | |
| "grad_norm": 1.150476098060608, | |
| "learning_rate": 3.6011021132244807e-06, | |
| "loss": 1.9709, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 5.609632493712226, | |
| "grad_norm": 1.2993876934051514, | |
| "learning_rate": 3.5805843188633868e-06, | |
| "loss": 1.9095, | |
| "step": 18675 | |
| }, | |
| { | |
| "epoch": 5.617140283043658, | |
| "grad_norm": 1.1421048641204834, | |
| "learning_rate": 3.56010679927481e-06, | |
| "loss": 1.9381, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.617140283043658, | |
| "eval_loss": 1.9856911897659302, | |
| "eval_runtime": 244.4243, | |
| "eval_samples_per_second": 22.948, | |
| "eval_steps_per_second": 5.74, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.62464807237509, | |
| "grad_norm": 1.2726351022720337, | |
| "learning_rate": 3.539669764879769e-06, | |
| "loss": 1.9533, | |
| "step": 18725 | |
| }, | |
| { | |
| "epoch": 5.63215586170652, | |
| "grad_norm": 1.3039084672927856, | |
| "learning_rate": 3.519273425683269e-06, | |
| "loss": 1.9381, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 5.639663651037952, | |
| "grad_norm": 1.2816251516342163, | |
| "learning_rate": 3.4989179912721443e-06, | |
| "loss": 1.9566, | |
| "step": 18775 | |
| }, | |
| { | |
| "epoch": 5.647171440369383, | |
| "grad_norm": 1.1940944194793701, | |
| "learning_rate": 3.4786036708129018e-06, | |
| "loss": 1.9684, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.647171440369383, | |
| "eval_loss": 1.985024094581604, | |
| "eval_runtime": 245.079, | |
| "eval_samples_per_second": 22.887, | |
| "eval_steps_per_second": 5.725, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.654679229700815, | |
| "grad_norm": 1.1532857418060303, | |
| "learning_rate": 3.4583306730495745e-06, | |
| "loss": 1.9131, | |
| "step": 18825 | |
| }, | |
| { | |
| "epoch": 5.662187019032246, | |
| "grad_norm": 1.1996498107910156, | |
| "learning_rate": 3.4380992063015747e-06, | |
| "loss": 1.9262, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 5.669694808363677, | |
| "grad_norm": 1.1328129768371582, | |
| "learning_rate": 3.4179094784615565e-06, | |
| "loss": 1.9509, | |
| "step": 18875 | |
| }, | |
| { | |
| "epoch": 5.677202597695109, | |
| "grad_norm": 1.124004602432251, | |
| "learning_rate": 3.3977616969932705e-06, | |
| "loss": 1.9334, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.677202597695109, | |
| "eval_loss": 1.9849857091903687, | |
| "eval_runtime": 244.5129, | |
| "eval_samples_per_second": 22.939, | |
| "eval_steps_per_second": 5.738, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.68471038702654, | |
| "grad_norm": 1.187667727470398, | |
| "learning_rate": 3.3776560689294486e-06, | |
| "loss": 1.9702, | |
| "step": 18925 | |
| }, | |
| { | |
| "epoch": 5.692218176357971, | |
| "grad_norm": 1.1103003025054932, | |
| "learning_rate": 3.3575928008696606e-06, | |
| "loss": 1.9825, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 5.699725965689403, | |
| "grad_norm": 1.1390193700790405, | |
| "learning_rate": 3.3375720989781967e-06, | |
| "loss": 1.9481, | |
| "step": 18975 | |
| }, | |
| { | |
| "epoch": 5.707233755020834, | |
| "grad_norm": 1.0689352750778198, | |
| "learning_rate": 3.3175941689819507e-06, | |
| "loss": 1.9633, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.707233755020834, | |
| "eval_loss": 1.9846566915512085, | |
| "eval_runtime": 244.7924, | |
| "eval_samples_per_second": 22.913, | |
| "eval_steps_per_second": 5.731, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.714741544352266, | |
| "grad_norm": 1.3017778396606445, | |
| "learning_rate": 3.297659216168305e-06, | |
| "loss": 1.9521, | |
| "step": 19025 | |
| }, | |
| { | |
| "epoch": 5.722249333683697, | |
| "grad_norm": 1.0697276592254639, | |
| "learning_rate": 3.277767445383023e-06, | |
| "loss": 1.926, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 5.729757123015128, | |
| "grad_norm": 1.2455774545669556, | |
| "learning_rate": 3.2579190610281378e-06, | |
| "loss": 1.9708, | |
| "step": 19075 | |
| }, | |
| { | |
| "epoch": 5.73726491234656, | |
| "grad_norm": 1.2440054416656494, | |
| "learning_rate": 3.238114267059859e-06, | |
| "loss": 1.9728, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.73726491234656, | |
| "eval_loss": 1.9845046997070312, | |
| "eval_runtime": 245.1879, | |
| "eval_samples_per_second": 22.876, | |
| "eval_steps_per_second": 5.722, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.744772701677991, | |
| "grad_norm": 1.1576147079467773, | |
| "learning_rate": 3.218353266986476e-06, | |
| "loss": 1.9956, | |
| "step": 19125 | |
| }, | |
| { | |
| "epoch": 5.752280491009422, | |
| "grad_norm": 1.4614973068237305, | |
| "learning_rate": 3.198636263866259e-06, | |
| "loss": 1.9471, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 5.759788280340854, | |
| "grad_norm": 1.2813773155212402, | |
| "learning_rate": 3.1789634603053846e-06, | |
| "loss": 1.9516, | |
| "step": 19175 | |
| }, | |
| { | |
| "epoch": 5.767296069672285, | |
| "grad_norm": 1.212929368019104, | |
| "learning_rate": 3.1593350584558446e-06, | |
| "loss": 1.9446, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.767296069672285, | |
| "eval_loss": 1.9842097759246826, | |
| "eval_runtime": 244.6725, | |
| "eval_samples_per_second": 22.925, | |
| "eval_steps_per_second": 5.734, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.7748038590037165, | |
| "grad_norm": 1.0693471431732178, | |
| "learning_rate": 3.1397512600133694e-06, | |
| "loss": 1.9767, | |
| "step": 19225 | |
| }, | |
| { | |
| "epoch": 5.782311648335147, | |
| "grad_norm": 1.2919217348098755, | |
| "learning_rate": 3.120212266215365e-06, | |
| "loss": 1.9476, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 5.789819437666579, | |
| "grad_norm": 1.1402935981750488, | |
| "learning_rate": 3.1007182778388315e-06, | |
| "loss": 1.9495, | |
| "step": 19275 | |
| }, | |
| { | |
| "epoch": 5.797327226998011, | |
| "grad_norm": 1.2192392349243164, | |
| "learning_rate": 3.0812694951983087e-06, | |
| "loss": 1.9633, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.797327226998011, | |
| "eval_loss": 1.9841300249099731, | |
| "eval_runtime": 245.1028, | |
| "eval_samples_per_second": 22.884, | |
| "eval_steps_per_second": 5.724, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.8048350163294415, | |
| "grad_norm": 1.232828140258789, | |
| "learning_rate": 3.0618661181438147e-06, | |
| "loss": 1.9147, | |
| "step": 19325 | |
| }, | |
| { | |
| "epoch": 5.812342805660873, | |
| "grad_norm": 1.1075702905654907, | |
| "learning_rate": 3.042508346058794e-06, | |
| "loss": 1.9493, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 5.819850594992305, | |
| "grad_norm": 1.3566619157791138, | |
| "learning_rate": 3.0231963778580643e-06, | |
| "loss": 1.9314, | |
| "step": 19375 | |
| }, | |
| { | |
| "epoch": 5.827358384323736, | |
| "grad_norm": 0.9761985540390015, | |
| "learning_rate": 3.0039304119857863e-06, | |
| "loss": 1.9674, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.827358384323736, | |
| "eval_loss": 1.9838725328445435, | |
| "eval_runtime": 244.9514, | |
| "eval_samples_per_second": 22.898, | |
| "eval_steps_per_second": 5.728, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.8348661736551675, | |
| "grad_norm": 1.236413836479187, | |
| "learning_rate": 2.984710646413399e-06, | |
| "loss": 1.9401, | |
| "step": 19425 | |
| }, | |
| { | |
| "epoch": 5.842373962986598, | |
| "grad_norm": 1.1869447231292725, | |
| "learning_rate": 2.965537278637612e-06, | |
| "loss": 1.9927, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 5.84988175231803, | |
| "grad_norm": 1.1049928665161133, | |
| "learning_rate": 2.946410505678359e-06, | |
| "loss": 1.9789, | |
| "step": 19475 | |
| }, | |
| { | |
| "epoch": 5.857389541649462, | |
| "grad_norm": 1.0848510265350342, | |
| "learning_rate": 2.927330524076784e-06, | |
| "loss": 1.9329, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.857389541649462, | |
| "eval_loss": 1.9838305711746216, | |
| "eval_runtime": 244.8394, | |
| "eval_samples_per_second": 22.909, | |
| "eval_steps_per_second": 5.73, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.8648973309808925, | |
| "grad_norm": 1.257003664970398, | |
| "learning_rate": 2.9082975298932073e-06, | |
| "loss": 1.9271, | |
| "step": 19525 | |
| }, | |
| { | |
| "epoch": 5.872405120312324, | |
| "grad_norm": 1.1217468976974487, | |
| "learning_rate": 2.889311718705135e-06, | |
| "loss": 1.9593, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 5.879912909643755, | |
| "grad_norm": 1.0415600538253784, | |
| "learning_rate": 2.8703732856052216e-06, | |
| "loss": 1.9436, | |
| "step": 19575 | |
| }, | |
| { | |
| "epoch": 5.887420698975187, | |
| "grad_norm": 1.1773449182510376, | |
| "learning_rate": 2.8514824251992834e-06, | |
| "loss": 1.9604, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.887420698975187, | |
| "eval_loss": 1.983793020248413, | |
| "eval_runtime": 244.4635, | |
| "eval_samples_per_second": 22.944, | |
| "eval_steps_per_second": 5.739, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.894928488306618, | |
| "grad_norm": 1.1694916486740112, | |
| "learning_rate": 2.832639331604292e-06, | |
| "loss": 1.9281, | |
| "step": 19625 | |
| }, | |
| { | |
| "epoch": 5.902436277638049, | |
| "grad_norm": 1.1439831256866455, | |
| "learning_rate": 2.813844198446383e-06, | |
| "loss": 1.9469, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 5.909944066969481, | |
| "grad_norm": 1.2244899272918701, | |
| "learning_rate": 2.7950972188588596e-06, | |
| "loss": 1.9203, | |
| "step": 19675 | |
| }, | |
| { | |
| "epoch": 5.917451856300913, | |
| "grad_norm": 1.0796282291412354, | |
| "learning_rate": 2.776398585480223e-06, | |
| "loss": 1.9569, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.917451856300913, | |
| "eval_loss": 1.983589768409729, | |
| "eval_runtime": 244.6683, | |
| "eval_samples_per_second": 22.925, | |
| "eval_steps_per_second": 5.734, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.9249596456323435, | |
| "grad_norm": 1.1817554235458374, | |
| "learning_rate": 2.757748490452177e-06, | |
| "loss": 1.967, | |
| "step": 19725 | |
| }, | |
| { | |
| "epoch": 5.932467434963775, | |
| "grad_norm": 1.1933224201202393, | |
| "learning_rate": 2.739147125417653e-06, | |
| "loss": 1.9553, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 5.939975224295206, | |
| "grad_norm": 1.0195425748825073, | |
| "learning_rate": 2.7205946815188563e-06, | |
| "loss": 1.9477, | |
| "step": 19775 | |
| }, | |
| { | |
| "epoch": 5.947483013626638, | |
| "grad_norm": 1.1039797067642212, | |
| "learning_rate": 2.7020913493952893e-06, | |
| "loss": 1.9508, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.947483013626638, | |
| "eval_loss": 1.9835751056671143, | |
| "eval_runtime": 244.4808, | |
| "eval_samples_per_second": 22.942, | |
| "eval_steps_per_second": 5.739, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.954990802958069, | |
| "grad_norm": 1.1363548040390015, | |
| "learning_rate": 2.6836373191817982e-06, | |
| "loss": 1.9466, | |
| "step": 19825 | |
| }, | |
| { | |
| "epoch": 5.9624985922895, | |
| "grad_norm": 1.182576298713684, | |
| "learning_rate": 2.6652327805066128e-06, | |
| "loss": 1.9549, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 5.970006381620932, | |
| "grad_norm": 1.083834171295166, | |
| "learning_rate": 2.6468779224894086e-06, | |
| "loss": 1.9421, | |
| "step": 19875 | |
| }, | |
| { | |
| "epoch": 5.977514170952363, | |
| "grad_norm": 1.1508004665374756, | |
| "learning_rate": 2.628572933739354e-06, | |
| "loss": 1.9237, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.977514170952363, | |
| "eval_loss": 1.9832342863082886, | |
| "eval_runtime": 276.0383, | |
| "eval_samples_per_second": 20.32, | |
| "eval_steps_per_second": 5.083, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.985021960283794, | |
| "grad_norm": 1.134469985961914, | |
| "learning_rate": 2.6103180023531726e-06, | |
| "loss": 1.9175, | |
| "step": 19925 | |
| }, | |
| { | |
| "epoch": 5.992529749615226, | |
| "grad_norm": 1.1148380041122437, | |
| "learning_rate": 2.592113315913217e-06, | |
| "loss": 1.96, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 6.000300311573257, | |
| "grad_norm": 2.1076784133911133, | |
| "learning_rate": 2.5739590614855353e-06, | |
| "loss": 2.0546, | |
| "step": 19975 | |
| }, | |
| { | |
| "epoch": 6.007808100904689, | |
| "grad_norm": 1.1790810823440552, | |
| "learning_rate": 2.5558554256179507e-06, | |
| "loss": 1.9568, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 6.007808100904689, | |
| "eval_loss": 1.983675241470337, | |
| "eval_runtime": 277.6925, | |
| "eval_samples_per_second": 20.199, | |
| "eval_steps_per_second": 5.052, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 6.01531589023612, | |
| "grad_norm": 1.3004977703094482, | |
| "learning_rate": 2.5378025943381482e-06, | |
| "loss": 1.9195, | |
| "step": 20025 | |
| }, | |
| { | |
| "epoch": 6.022823679567551, | |
| "grad_norm": 1.2546344995498657, | |
| "learning_rate": 2.519800753151757e-06, | |
| "loss": 1.9527, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 6.030331468898983, | |
| "grad_norm": 1.0866812467575073, | |
| "learning_rate": 2.501850087040448e-06, | |
| "loss": 1.937, | |
| "step": 20075 | |
| }, | |
| { | |
| "epoch": 6.037839258230414, | |
| "grad_norm": 1.1754050254821777, | |
| "learning_rate": 2.4839507804600274e-06, | |
| "loss": 1.8801, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 6.037839258230414, | |
| "eval_loss": 1.983474850654602, | |
| "eval_runtime": 244.4023, | |
| "eval_samples_per_second": 22.95, | |
| "eval_steps_per_second": 5.741, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 6.045347047561846, | |
| "grad_norm": 1.3076649904251099, | |
| "learning_rate": 2.466103017338552e-06, | |
| "loss": 1.9264, | |
| "step": 20125 | |
| }, | |
| { | |
| "epoch": 6.052854836893276, | |
| "grad_norm": 1.3242402076721191, | |
| "learning_rate": 2.448306981074428e-06, | |
| "loss": 1.9262, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 6.060362626224708, | |
| "grad_norm": 1.0890467166900635, | |
| "learning_rate": 2.4305628545345394e-06, | |
| "loss": 1.9743, | |
| "step": 20175 | |
| }, | |
| { | |
| "epoch": 6.06787041555614, | |
| "grad_norm": 1.1457139253616333, | |
| "learning_rate": 2.412870820052353e-06, | |
| "loss": 1.9558, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 6.06787041555614, | |
| "eval_loss": 1.983147144317627, | |
| "eval_runtime": 244.7147, | |
| "eval_samples_per_second": 22.921, | |
| "eval_steps_per_second": 5.733, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 6.075378204887571, | |
| "grad_norm": 1.1762498617172241, | |
| "learning_rate": 2.395231059426055e-06, | |
| "loss": 1.9198, | |
| "step": 20225 | |
| }, | |
| { | |
| "epoch": 6.082885994219002, | |
| "grad_norm": 1.1638132333755493, | |
| "learning_rate": 2.3776437539166825e-06, | |
| "loss": 1.9397, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 6.090393783550433, | |
| "grad_norm": 1.2441715002059937, | |
| "learning_rate": 2.3601090842462575e-06, | |
| "loss": 1.9676, | |
| "step": 20275 | |
| }, | |
| { | |
| "epoch": 6.097901572881865, | |
| "grad_norm": 1.1457374095916748, | |
| "learning_rate": 2.342627230595929e-06, | |
| "loss": 1.9574, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 6.097901572881865, | |
| "eval_loss": 1.9833580255508423, | |
| "eval_runtime": 322.7704, | |
| "eval_samples_per_second": 17.378, | |
| "eval_steps_per_second": 4.347, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 6.1054093622132966, | |
| "grad_norm": 1.2807676792144775, | |
| "learning_rate": 2.325198372604132e-06, | |
| "loss": 1.91, | |
| "step": 20325 | |
| }, | |
| { | |
| "epoch": 6.112917151544727, | |
| "grad_norm": 1.1415411233901978, | |
| "learning_rate": 2.3078226893647254e-06, | |
| "loss": 1.9255, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 6.120424940876159, | |
| "grad_norm": 1.1930123567581177, | |
| "learning_rate": 2.290500359425165e-06, | |
| "loss": 1.898, | |
| "step": 20375 | |
| }, | |
| { | |
| "epoch": 6.127932730207591, | |
| "grad_norm": 1.1319756507873535, | |
| "learning_rate": 2.2732315607846606e-06, | |
| "loss": 1.9043, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 6.127932730207591, | |
| "eval_loss": 1.9833526611328125, | |
| "eval_runtime": 244.7015, | |
| "eval_samples_per_second": 22.922, | |
| "eval_steps_per_second": 5.734, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 6.135440519539022, | |
| "grad_norm": 1.197733759880066, | |
| "learning_rate": 2.25601647089235e-06, | |
| "loss": 1.9325, | |
| "step": 20425 | |
| }, | |
| { | |
| "epoch": 6.142948308870453, | |
| "grad_norm": 1.1803226470947266, | |
| "learning_rate": 2.238855266645473e-06, | |
| "loss": 1.9357, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 6.150456098201884, | |
| "grad_norm": 1.2374463081359863, | |
| "learning_rate": 2.2217481243875666e-06, | |
| "loss": 1.9071, | |
| "step": 20475 | |
| }, | |
| { | |
| "epoch": 6.157963887533316, | |
| "grad_norm": 1.178080439567566, | |
| "learning_rate": 2.2046952199066323e-06, | |
| "loss": 1.936, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 6.157963887533316, | |
| "eval_loss": 1.9832499027252197, | |
| "eval_runtime": 244.9473, | |
| "eval_samples_per_second": 22.899, | |
| "eval_steps_per_second": 5.728, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 6.1654716768647475, | |
| "grad_norm": 1.1624387502670288, | |
| "learning_rate": 2.1876967284333436e-06, | |
| "loss": 1.9722, | |
| "step": 20525 | |
| }, | |
| { | |
| "epoch": 6.172979466196178, | |
| "grad_norm": 1.2391788959503174, | |
| "learning_rate": 2.170752824639242e-06, | |
| "loss": 1.971, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 6.18048725552761, | |
| "grad_norm": 1.183759331703186, | |
| "learning_rate": 2.153863682634941e-06, | |
| "loss": 1.9717, | |
| "step": 20575 | |
| }, | |
| { | |
| "epoch": 6.187995044859041, | |
| "grad_norm": 1.164625644683838, | |
| "learning_rate": 2.137029475968338e-06, | |
| "loss": 1.9668, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 6.187995044859041, | |
| "eval_loss": 1.982852578163147, | |
| "eval_runtime": 244.5369, | |
| "eval_samples_per_second": 22.937, | |
| "eval_steps_per_second": 5.737, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 6.1955028341904725, | |
| "grad_norm": 1.2210159301757812, | |
| "learning_rate": 2.1209204813122366e-06, | |
| "loss": 1.9451, | |
| "step": 20625 | |
| }, | |
| { | |
| "epoch": 6.203010623521904, | |
| "grad_norm": 1.2201151847839355, | |
| "learning_rate": 2.104194449172132e-06, | |
| "loss": 1.926, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 6.210518412853335, | |
| "grad_norm": 1.2492685317993164, | |
| "learning_rate": 2.0875238627562834e-06, | |
| "loss": 1.928, | |
| "step": 20675 | |
| }, | |
| { | |
| "epoch": 6.218026202184767, | |
| "grad_norm": 0.9965053796768188, | |
| "learning_rate": 2.0709088933667766e-06, | |
| "loss": 1.9374, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 6.218026202184767, | |
| "eval_loss": 1.9826812744140625, | |
| "eval_runtime": 244.5669, | |
| "eval_samples_per_second": 22.934, | |
| "eval_steps_per_second": 5.737, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 6.2255339915161985, | |
| "grad_norm": 1.0911799669265747, | |
| "learning_rate": 2.0543497117341904e-06, | |
| "loss": 1.9361, | |
| "step": 20725 | |
| }, | |
| { | |
| "epoch": 6.233041780847629, | |
| "grad_norm": 1.277250051498413, | |
| "learning_rate": 2.0378464880158453e-06, | |
| "loss": 1.9285, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 6.240549570179061, | |
| "grad_norm": 1.1859968900680542, | |
| "learning_rate": 2.0213993917940577e-06, | |
| "loss": 1.9531, | |
| "step": 20775 | |
| }, | |
| { | |
| "epoch": 6.248057359510492, | |
| "grad_norm": 1.2009955644607544, | |
| "learning_rate": 2.0050085920743904e-06, | |
| "loss": 1.9415, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 6.248057359510492, | |
| "eval_loss": 1.9828299283981323, | |
| "eval_runtime": 244.6515, | |
| "eval_samples_per_second": 22.926, | |
| "eval_steps_per_second": 5.735, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 6.2555651488419235, | |
| "grad_norm": 1.23262357711792, | |
| "learning_rate": 1.9886742572839227e-06, | |
| "loss": 1.9466, | |
| "step": 20825 | |
| }, | |
| { | |
| "epoch": 6.263072938173355, | |
| "grad_norm": 1.1354538202285767, | |
| "learning_rate": 1.9723965552695134e-06, | |
| "loss": 1.9538, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 6.270580727504786, | |
| "grad_norm": 1.2842826843261719, | |
| "learning_rate": 1.956175653296082e-06, | |
| "loss": 1.9547, | |
| "step": 20875 | |
| }, | |
| { | |
| "epoch": 6.278088516836218, | |
| "grad_norm": 1.2268083095550537, | |
| "learning_rate": 1.9400117180448872e-06, | |
| "loss": 1.9535, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 6.278088516836218, | |
| "eval_loss": 1.9827969074249268, | |
| "eval_runtime": 244.6424, | |
| "eval_samples_per_second": 22.927, | |
| "eval_steps_per_second": 5.735, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 6.2855963061676485, | |
| "grad_norm": 1.1584311723709106, | |
| "learning_rate": 1.923904915611814e-06, | |
| "loss": 1.9903, | |
| "step": 20925 | |
| }, | |
| { | |
| "epoch": 6.29310409549908, | |
| "grad_norm": 1.1765952110290527, | |
| "learning_rate": 1.9078554115056657e-06, | |
| "loss": 1.9313, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 6.300611884830512, | |
| "grad_norm": 1.0743718147277832, | |
| "learning_rate": 1.8918633706464663e-06, | |
| "loss": 1.937, | |
| "step": 20975 | |
| }, | |
| { | |
| "epoch": 6.308119674161943, | |
| "grad_norm": 1.1020997762680054, | |
| "learning_rate": 1.8759289573637645e-06, | |
| "loss": 1.9505, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.308119674161943, | |
| "eval_loss": 1.9827996492385864, | |
| "eval_runtime": 244.7905, | |
| "eval_samples_per_second": 22.913, | |
| "eval_steps_per_second": 5.731, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.3156274634933744, | |
| "grad_norm": 1.2609679698944092, | |
| "learning_rate": 1.8600523353949437e-06, | |
| "loss": 1.9424, | |
| "step": 21025 | |
| }, | |
| { | |
| "epoch": 6.323135252824806, | |
| "grad_norm": 1.2550972700119019, | |
| "learning_rate": 1.8442336678835417e-06, | |
| "loss": 1.9284, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 6.330643042156237, | |
| "grad_norm": 1.21172297000885, | |
| "learning_rate": 1.8284731173775695e-06, | |
| "loss": 1.9422, | |
| "step": 21075 | |
| }, | |
| { | |
| "epoch": 6.338150831487669, | |
| "grad_norm": 1.2744083404541016, | |
| "learning_rate": 1.8127708458278532e-06, | |
| "loss": 1.9512, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 6.338150831487669, | |
| "eval_loss": 1.9828649759292603, | |
| "eval_runtime": 244.7295, | |
| "eval_samples_per_second": 22.919, | |
| "eval_steps_per_second": 5.733, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 6.3456586208190995, | |
| "grad_norm": 1.006986141204834, | |
| "learning_rate": 1.7971270145863531e-06, | |
| "loss": 1.9737, | |
| "step": 21125 | |
| }, | |
| { | |
| "epoch": 6.353166410150531, | |
| "grad_norm": 1.1543078422546387, | |
| "learning_rate": 1.7815417844045175e-06, | |
| "loss": 1.9688, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 6.360674199481963, | |
| "grad_norm": 1.2171674966812134, | |
| "learning_rate": 1.7660153154316258e-06, | |
| "loss": 1.9549, | |
| "step": 21175 | |
| }, | |
| { | |
| "epoch": 6.368181988813394, | |
| "grad_norm": 1.1868822574615479, | |
| "learning_rate": 1.7505477672131454e-06, | |
| "loss": 1.9467, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 6.368181988813394, | |
| "eval_loss": 1.9822328090667725, | |
| "eval_runtime": 244.9166, | |
| "eval_samples_per_second": 22.902, | |
| "eval_steps_per_second": 5.728, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 6.375689778144825, | |
| "grad_norm": 1.2831307649612427, | |
| "learning_rate": 1.7351392986890915e-06, | |
| "loss": 1.9572, | |
| "step": 21225 | |
| }, | |
| { | |
| "epoch": 6.383197567476256, | |
| "grad_norm": 1.2353671789169312, | |
| "learning_rate": 1.7197900681923927e-06, | |
| "loss": 1.9286, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 6.390705356807688, | |
| "grad_norm": 1.2204623222351074, | |
| "learning_rate": 1.7045002334472654e-06, | |
| "loss": 1.959, | |
| "step": 21275 | |
| }, | |
| { | |
| "epoch": 6.39821314613912, | |
| "grad_norm": 1.3212610483169556, | |
| "learning_rate": 1.689269951567592e-06, | |
| "loss": 1.9591, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 6.39821314613912, | |
| "eval_loss": 1.9822728633880615, | |
| "eval_runtime": 244.7907, | |
| "eval_samples_per_second": 22.913, | |
| "eval_steps_per_second": 5.731, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 6.40572093547055, | |
| "grad_norm": 1.1899783611297607, | |
| "learning_rate": 1.674099379055308e-06, | |
| "loss": 1.9496, | |
| "step": 21325 | |
| }, | |
| { | |
| "epoch": 6.413228724801982, | |
| "grad_norm": 1.1367979049682617, | |
| "learning_rate": 1.6589886717987917e-06, | |
| "loss": 1.9283, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 6.420736514133413, | |
| "grad_norm": 1.343583106994629, | |
| "learning_rate": 1.6439379850712633e-06, | |
| "loss": 1.9282, | |
| "step": 21375 | |
| }, | |
| { | |
| "epoch": 6.428244303464845, | |
| "grad_norm": 1.0660362243652344, | |
| "learning_rate": 1.6289474735291935e-06, | |
| "loss": 1.9577, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 6.428244303464845, | |
| "eval_loss": 1.9821466207504272, | |
| "eval_runtime": 245.3705, | |
| "eval_samples_per_second": 22.859, | |
| "eval_steps_per_second": 5.718, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 6.435752092796276, | |
| "grad_norm": 1.1740394830703735, | |
| "learning_rate": 1.6140172912107054e-06, | |
| "loss": 1.9397, | |
| "step": 21425 | |
| }, | |
| { | |
| "epoch": 6.443259882127707, | |
| "grad_norm": 1.2688024044036865, | |
| "learning_rate": 1.5991475915339973e-06, | |
| "loss": 1.9066, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 6.450767671459139, | |
| "grad_norm": 1.2636834383010864, | |
| "learning_rate": 1.5843385272957686e-06, | |
| "loss": 1.9337, | |
| "step": 21475 | |
| }, | |
| { | |
| "epoch": 6.458275460790571, | |
| "grad_norm": 1.207352638244629, | |
| "learning_rate": 1.5695902506696439e-06, | |
| "loss": 1.9523, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.458275460790571, | |
| "eval_loss": 1.982376217842102, | |
| "eval_runtime": 245.0395, | |
| "eval_samples_per_second": 22.89, | |
| "eval_steps_per_second": 5.726, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.465783250122001, | |
| "grad_norm": 1.1533434391021729, | |
| "learning_rate": 1.5549029132046123e-06, | |
| "loss": 1.9335, | |
| "step": 21525 | |
| }, | |
| { | |
| "epoch": 6.473291039453433, | |
| "grad_norm": 1.138137936592102, | |
| "learning_rate": 1.5402766658234704e-06, | |
| "loss": 1.9457, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 6.480798828784864, | |
| "grad_norm": 1.1742804050445557, | |
| "learning_rate": 1.5257116588212709e-06, | |
| "loss": 1.9303, | |
| "step": 21575 | |
| }, | |
| { | |
| "epoch": 6.488306618116296, | |
| "grad_norm": 1.1066967248916626, | |
| "learning_rate": 1.511208041863778e-06, | |
| "loss": 1.9251, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.488306618116296, | |
| "eval_loss": 1.9820733070373535, | |
| "eval_runtime": 244.8749, | |
| "eval_samples_per_second": 22.906, | |
| "eval_steps_per_second": 5.729, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.495814407447727, | |
| "grad_norm": 1.1347085237503052, | |
| "learning_rate": 1.4967659639859308e-06, | |
| "loss": 1.9311, | |
| "step": 21625 | |
| }, | |
| { | |
| "epoch": 6.503322196779158, | |
| "grad_norm": 1.2577033042907715, | |
| "learning_rate": 1.4823855735903083e-06, | |
| "loss": 1.9354, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 6.51082998611059, | |
| "grad_norm": 1.1945990324020386, | |
| "learning_rate": 1.468067018445608e-06, | |
| "loss": 1.9046, | |
| "step": 21675 | |
| }, | |
| { | |
| "epoch": 6.518337775442021, | |
| "grad_norm": 1.195004940032959, | |
| "learning_rate": 1.4538104456851294e-06, | |
| "loss": 1.9374, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.518337775442021, | |
| "eval_loss": 1.9817756414413452, | |
| "eval_runtime": 244.4709, | |
| "eval_samples_per_second": 22.943, | |
| "eval_steps_per_second": 5.739, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.525845564773452, | |
| "grad_norm": 1.289342999458313, | |
| "learning_rate": 1.4396160018052555e-06, | |
| "loss": 1.9201, | |
| "step": 21725 | |
| }, | |
| { | |
| "epoch": 6.533353354104884, | |
| "grad_norm": 1.231141209602356, | |
| "learning_rate": 1.4254838326639514e-06, | |
| "loss": 1.9527, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 6.540861143436315, | |
| "grad_norm": 1.3896465301513672, | |
| "learning_rate": 1.4114140834792666e-06, | |
| "loss": 1.9347, | |
| "step": 21775 | |
| }, | |
| { | |
| "epoch": 6.548368932767747, | |
| "grad_norm": 1.2049739360809326, | |
| "learning_rate": 1.3974068988278402e-06, | |
| "loss": 1.969, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.548368932767747, | |
| "eval_loss": 1.9819016456604004, | |
| "eval_runtime": 244.8747, | |
| "eval_samples_per_second": 22.906, | |
| "eval_steps_per_second": 5.729, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.555876722099178, | |
| "grad_norm": 1.2919905185699463, | |
| "learning_rate": 1.3834624226434162e-06, | |
| "loss": 1.9555, | |
| "step": 21825 | |
| }, | |
| { | |
| "epoch": 6.563384511430609, | |
| "grad_norm": 1.2100296020507812, | |
| "learning_rate": 1.3695807982153666e-06, | |
| "loss": 1.9239, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 6.570892300762041, | |
| "grad_norm": 1.1903220415115356, | |
| "learning_rate": 1.3557621681872142e-06, | |
| "loss": 1.9201, | |
| "step": 21875 | |
| }, | |
| { | |
| "epoch": 6.578400090093472, | |
| "grad_norm": 1.1797667741775513, | |
| "learning_rate": 1.3420066745551715e-06, | |
| "loss": 1.9418, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.578400090093472, | |
| "eval_loss": 1.9816147089004517, | |
| "eval_runtime": 244.5673, | |
| "eval_samples_per_second": 22.934, | |
| "eval_steps_per_second": 5.737, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.585907879424903, | |
| "grad_norm": 1.155019760131836, | |
| "learning_rate": 1.3283144586666803e-06, | |
| "loss": 1.9466, | |
| "step": 21925 | |
| }, | |
| { | |
| "epoch": 6.593415668756335, | |
| "grad_norm": 1.2090644836425781, | |
| "learning_rate": 1.314685661218958e-06, | |
| "loss": 1.9444, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 6.600923458087766, | |
| "grad_norm": 1.1589152812957764, | |
| "learning_rate": 1.3011204222575515e-06, | |
| "loss": 1.9282, | |
| "step": 21975 | |
| }, | |
| { | |
| "epoch": 6.6084312474191975, | |
| "grad_norm": 1.3078739643096924, | |
| "learning_rate": 1.287618881174899e-06, | |
| "loss": 1.9273, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.6084312474191975, | |
| "eval_loss": 1.9818423986434937, | |
| "eval_runtime": 244.6939, | |
| "eval_samples_per_second": 22.923, | |
| "eval_steps_per_second": 5.734, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.615939036750628, | |
| "grad_norm": 1.070833683013916, | |
| "learning_rate": 1.2741811767089034e-06, | |
| "loss": 1.9397, | |
| "step": 22025 | |
| }, | |
| { | |
| "epoch": 6.62344682608206, | |
| "grad_norm": 1.1993708610534668, | |
| "learning_rate": 1.2608074469414949e-06, | |
| "loss": 1.959, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 6.630954615413492, | |
| "grad_norm": 1.1407665014266968, | |
| "learning_rate": 1.2474978292972209e-06, | |
| "loss": 1.9474, | |
| "step": 22075 | |
| }, | |
| { | |
| "epoch": 6.6384624047449226, | |
| "grad_norm": 1.2709163427352905, | |
| "learning_rate": 1.2342524605418293e-06, | |
| "loss": 1.9464, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.6384624047449226, | |
| "eval_loss": 1.9815821647644043, | |
| "eval_runtime": 244.5724, | |
| "eval_samples_per_second": 22.934, | |
| "eval_steps_per_second": 5.737, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.645970194076354, | |
| "grad_norm": 1.1849255561828613, | |
| "learning_rate": 1.221071476780867e-06, | |
| "loss": 1.9201, | |
| "step": 22125 | |
| }, | |
| { | |
| "epoch": 6.653477983407786, | |
| "grad_norm": 1.2153717279434204, | |
| "learning_rate": 1.207955013458281e-06, | |
| "loss": 1.9624, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 6.660985772739217, | |
| "grad_norm": 1.1668004989624023, | |
| "learning_rate": 1.1949032053550208e-06, | |
| "loss": 1.9304, | |
| "step": 22175 | |
| }, | |
| { | |
| "epoch": 6.6684935620706485, | |
| "grad_norm": 1.1738938093185425, | |
| "learning_rate": 1.1819161865876618e-06, | |
| "loss": 1.9117, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.6684935620706485, | |
| "eval_loss": 1.981676697731018, | |
| "eval_runtime": 244.4832, | |
| "eval_samples_per_second": 22.942, | |
| "eval_steps_per_second": 5.739, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.676001351402079, | |
| "grad_norm": 1.191311001777649, | |
| "learning_rate": 1.1689940906070203e-06, | |
| "loss": 1.9211, | |
| "step": 22225 | |
| }, | |
| { | |
| "epoch": 6.683509140733511, | |
| "grad_norm": 1.1772605180740356, | |
| "learning_rate": 1.1561370501967871e-06, | |
| "loss": 1.933, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 6.691016930064943, | |
| "grad_norm": 1.3537640571594238, | |
| "learning_rate": 1.1433451974721602e-06, | |
| "loss": 1.9239, | |
| "step": 22275 | |
| }, | |
| { | |
| "epoch": 6.6985247193963735, | |
| "grad_norm": 1.1915578842163086, | |
| "learning_rate": 1.1306186638784846e-06, | |
| "loss": 1.9429, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.6985247193963735, | |
| "eval_loss": 1.981661081314087, | |
| "eval_runtime": 244.4697, | |
| "eval_samples_per_second": 22.944, | |
| "eval_steps_per_second": 5.739, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.706032508727805, | |
| "grad_norm": 1.0470706224441528, | |
| "learning_rate": 1.1179575801899122e-06, | |
| "loss": 1.9428, | |
| "step": 22325 | |
| }, | |
| { | |
| "epoch": 6.713540298059236, | |
| "grad_norm": 1.2210986614227295, | |
| "learning_rate": 1.1053620765080458e-06, | |
| "loss": 1.9551, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 6.721048087390668, | |
| "grad_norm": 1.2881091833114624, | |
| "learning_rate": 1.0928322822606064e-06, | |
| "loss": 1.9365, | |
| "step": 22375 | |
| }, | |
| { | |
| "epoch": 6.728555876722099, | |
| "grad_norm": 1.2427425384521484, | |
| "learning_rate": 1.0803683262001066e-06, | |
| "loss": 1.9491, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.728555876722099, | |
| "eval_loss": 1.9814238548278809, | |
| "eval_runtime": 244.3102, | |
| "eval_samples_per_second": 22.959, | |
| "eval_steps_per_second": 5.743, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.73606366605353, | |
| "grad_norm": 1.2582703828811646, | |
| "learning_rate": 1.067970336402524e-06, | |
| "loss": 1.9398, | |
| "step": 22425 | |
| }, | |
| { | |
| "epoch": 6.743571455384962, | |
| "grad_norm": 1.2919580936431885, | |
| "learning_rate": 1.055638440265983e-06, | |
| "loss": 1.9626, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 6.751079244716394, | |
| "grad_norm": 1.2642123699188232, | |
| "learning_rate": 1.0433727645094574e-06, | |
| "loss": 1.9278, | |
| "step": 22475 | |
| }, | |
| { | |
| "epoch": 6.7585870340478245, | |
| "grad_norm": 1.1762003898620605, | |
| "learning_rate": 1.0311734351714533e-06, | |
| "loss": 1.9289, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.7585870340478245, | |
| "eval_loss": 1.981581687927246, | |
| "eval_runtime": 245.2807, | |
| "eval_samples_per_second": 22.868, | |
| "eval_steps_per_second": 5.72, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.766094823379256, | |
| "grad_norm": 1.1311445236206055, | |
| "learning_rate": 1.0190405776087183e-06, | |
| "loss": 1.9347, | |
| "step": 22525 | |
| }, | |
| { | |
| "epoch": 6.773602612710687, | |
| "grad_norm": 1.2351762056350708, | |
| "learning_rate": 1.0069743164949595e-06, | |
| "loss": 1.9398, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 6.781110402042119, | |
| "grad_norm": 1.097221851348877, | |
| "learning_rate": 9.949747758195568e-07, | |
| "loss": 1.9527, | |
| "step": 22575 | |
| }, | |
| { | |
| "epoch": 6.78861819137355, | |
| "grad_norm": 1.2283989191055298, | |
| "learning_rate": 9.830420788862903e-07, | |
| "loss": 1.9374, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.78861819137355, | |
| "eval_loss": 1.9813563823699951, | |
| "eval_runtime": 244.58, | |
| "eval_samples_per_second": 22.933, | |
| "eval_steps_per_second": 5.736, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.796125980704981, | |
| "grad_norm": 1.28317391872406, | |
| "learning_rate": 9.71176348312076e-07, | |
| "loss": 1.9048, | |
| "step": 22625 | |
| }, | |
| { | |
| "epoch": 6.803633770036413, | |
| "grad_norm": 1.3111134767532349, | |
| "learning_rate": 9.593777060257004e-07, | |
| "loss": 1.9211, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 6.811141559367844, | |
| "grad_norm": 1.1793931722640991, | |
| "learning_rate": 9.476462732665697e-07, | |
| "loss": 1.928, | |
| "step": 22675 | |
| }, | |
| { | |
| "epoch": 6.818649348699275, | |
| "grad_norm": 1.111651062965393, | |
| "learning_rate": 9.359821705834662e-07, | |
| "loss": 1.9336, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.818649348699275, | |
| "eval_loss": 1.9812109470367432, | |
| "eval_runtime": 244.4408, | |
| "eval_samples_per_second": 22.946, | |
| "eval_steps_per_second": 5.74, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.826157138030707, | |
| "grad_norm": 1.0364270210266113, | |
| "learning_rate": 9.243855178333066e-07, | |
| "loss": 1.9512, | |
| "step": 22725 | |
| }, | |
| { | |
| "epoch": 6.833664927362138, | |
| "grad_norm": 1.0628246068954468, | |
| "learning_rate": 9.128564341799139e-07, | |
| "loss": 1.9368, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 6.84117271669357, | |
| "grad_norm": 1.250428557395935, | |
| "learning_rate": 9.013950380927874e-07, | |
| "loss": 1.9603, | |
| "step": 22775 | |
| }, | |
| { | |
| "epoch": 6.848680506025001, | |
| "grad_norm": 1.2401816844940186, | |
| "learning_rate": 8.900014473458943e-07, | |
| "loss": 1.9414, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.848680506025001, | |
| "eval_loss": 1.981279969215393, | |
| "eval_runtime": 244.3713, | |
| "eval_samples_per_second": 22.953, | |
| "eval_steps_per_second": 5.741, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.856188295356432, | |
| "grad_norm": 1.2459220886230469, | |
| "learning_rate": 8.78675779016449e-07, | |
| "loss": 1.9228, | |
| "step": 22825 | |
| }, | |
| { | |
| "epoch": 6.863696084687864, | |
| "grad_norm": 1.0594732761383057, | |
| "learning_rate": 8.674181494837147e-07, | |
| "loss": 1.9627, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 6.871203874019295, | |
| "grad_norm": 1.1160441637039185, | |
| "learning_rate": 8.5622867442781e-07, | |
| "loss": 1.9599, | |
| "step": 22875 | |
| }, | |
| { | |
| "epoch": 6.878711663350726, | |
| "grad_norm": 1.4957025051116943, | |
| "learning_rate": 8.451074688285182e-07, | |
| "loss": 1.9485, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.878711663350726, | |
| "eval_loss": 1.9810361862182617, | |
| "eval_runtime": 244.8768, | |
| "eval_samples_per_second": 22.905, | |
| "eval_steps_per_second": 5.729, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.886219452682158, | |
| "grad_norm": 1.3058786392211914, | |
| "learning_rate": 8.340546469641027e-07, | |
| "loss": 1.9092, | |
| "step": 22925 | |
| }, | |
| { | |
| "epoch": 6.893727242013589, | |
| "grad_norm": 1.2844972610473633, | |
| "learning_rate": 8.23070322410141e-07, | |
| "loss": 1.9442, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 6.901235031345021, | |
| "grad_norm": 1.291462779045105, | |
| "learning_rate": 8.121546080383474e-07, | |
| "loss": 1.9241, | |
| "step": 22975 | |
| }, | |
| { | |
| "epoch": 6.908742820676451, | |
| "grad_norm": 1.2580238580703735, | |
| "learning_rate": 8.013076160154187e-07, | |
| "loss": 1.9412, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.908742820676451, | |
| "eval_loss": 1.9810972213745117, | |
| "eval_runtime": 244.8376, | |
| "eval_samples_per_second": 22.909, | |
| "eval_steps_per_second": 5.73, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.916250610007883, | |
| "grad_norm": 1.2016263008117676, | |
| "learning_rate": 7.905294578018824e-07, | |
| "loss": 1.932, | |
| "step": 23025 | |
| }, | |
| { | |
| "epoch": 6.923758399339315, | |
| "grad_norm": 1.2784329652786255, | |
| "learning_rate": 7.798202441509484e-07, | |
| "loss": 1.9505, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 6.931266188670746, | |
| "grad_norm": 1.3308900594711304, | |
| "learning_rate": 7.691800851073724e-07, | |
| "loss": 1.9416, | |
| "step": 23075 | |
| }, | |
| { | |
| "epoch": 6.938773978002177, | |
| "grad_norm": 1.1549803018569946, | |
| "learning_rate": 7.58609090006328e-07, | |
| "loss": 1.9469, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.938773978002177, | |
| "eval_loss": 1.9810361862182617, | |
| "eval_runtime": 244.9865, | |
| "eval_samples_per_second": 22.895, | |
| "eval_steps_per_second": 5.727, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.946281767333609, | |
| "grad_norm": 1.0882563591003418, | |
| "learning_rate": 7.481073674722763e-07, | |
| "loss": 1.9424, | |
| "step": 23125 | |
| }, | |
| { | |
| "epoch": 6.95378955666504, | |
| "grad_norm": 1.1521430015563965, | |
| "learning_rate": 7.37675025417856e-07, | |
| "loss": 1.9525, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 6.9612973459964715, | |
| "grad_norm": 1.1167752742767334, | |
| "learning_rate": 7.273121710427738e-07, | |
| "loss": 1.9644, | |
| "step": 23175 | |
| }, | |
| { | |
| "epoch": 6.968805135327902, | |
| "grad_norm": 1.0596712827682495, | |
| "learning_rate": 7.170189108326941e-07, | |
| "loss": 1.921, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.968805135327902, | |
| "eval_loss": 1.9809205532073975, | |
| "eval_runtime": 244.6054, | |
| "eval_samples_per_second": 22.931, | |
| "eval_steps_per_second": 5.736, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.976312924659334, | |
| "grad_norm": 1.2589685916900635, | |
| "learning_rate": 7.067953505581593e-07, | |
| "loss": 1.948, | |
| "step": 23225 | |
| }, | |
| { | |
| "epoch": 6.983820713990766, | |
| "grad_norm": 1.0607045888900757, | |
| "learning_rate": 6.966415952734953e-07, | |
| "loss": 1.9632, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 6.991328503322197, | |
| "grad_norm": 1.2630726099014282, | |
| "learning_rate": 6.86557749315728e-07, | |
| "loss": 1.9264, | |
| "step": 23275 | |
| }, | |
| { | |
| "epoch": 6.998836292653628, | |
| "grad_norm": 1.1883573532104492, | |
| "learning_rate": 6.765439163035183e-07, | |
| "loss": 1.9428, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 6.998836292653628, | |
| "eval_loss": 1.9808813333511353, | |
| "eval_runtime": 244.9512, | |
| "eval_samples_per_second": 22.898, | |
| "eval_steps_per_second": 5.728, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 7.006606854611659, | |
| "grad_norm": 1.3225353956222534, | |
| "learning_rate": 6.666001991360948e-07, | |
| "loss": 2.0098, | |
| "step": 23325 | |
| }, | |
| { | |
| "epoch": 7.014114643943091, | |
| "grad_norm": 1.1181532144546509, | |
| "learning_rate": 6.567266999921936e-07, | |
| "loss": 1.9435, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 7.021622433274523, | |
| "grad_norm": 1.167235016822815, | |
| "learning_rate": 6.469235203290125e-07, | |
| "loss": 1.9534, | |
| "step": 23375 | |
| }, | |
| { | |
| "epoch": 7.0291302226059535, | |
| "grad_norm": 1.1674944162368774, | |
| "learning_rate": 6.371907608811686e-07, | |
| "loss": 1.9374, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 7.0291302226059535, | |
| "eval_loss": 1.9810993671417236, | |
| "eval_runtime": 244.971, | |
| "eval_samples_per_second": 22.897, | |
| "eval_steps_per_second": 5.727, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 7.036638011937385, | |
| "grad_norm": 1.2668019533157349, | |
| "learning_rate": 6.275285216596583e-07, | |
| "loss": 1.9401, | |
| "step": 23425 | |
| }, | |
| { | |
| "epoch": 7.044145801268816, | |
| "grad_norm": 1.2170027494430542, | |
| "learning_rate": 6.179369019508346e-07, | |
| "loss": 1.9334, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 7.051653590600248, | |
| "grad_norm": 1.1965893507003784, | |
| "learning_rate": 6.084160003153849e-07, | |
| "loss": 1.9103, | |
| "step": 23475 | |
| }, | |
| { | |
| "epoch": 7.0591613799316795, | |
| "grad_norm": 1.1913440227508545, | |
| "learning_rate": 5.989659145873175e-07, | |
| "loss": 1.9268, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 7.0591613799316795, | |
| "eval_loss": 1.9811537265777588, | |
| "eval_runtime": 244.4854, | |
| "eval_samples_per_second": 22.942, | |
| "eval_steps_per_second": 5.739, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 7.06666916926311, | |
| "grad_norm": 1.3365952968597412, | |
| "learning_rate": 5.895867418729561e-07, | |
| "loss": 1.9736, | |
| "step": 23525 | |
| }, | |
| { | |
| "epoch": 7.074176958594542, | |
| "grad_norm": 1.1637241840362549, | |
| "learning_rate": 5.802785785499434e-07, | |
| "loss": 1.9338, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 7.081684747925973, | |
| "grad_norm": 1.279487133026123, | |
| "learning_rate": 5.710415202662539e-07, | |
| "loss": 1.9281, | |
| "step": 23575 | |
| }, | |
| { | |
| "epoch": 7.0891925372574045, | |
| "grad_norm": 1.1029129028320312, | |
| "learning_rate": 5.618756619392048e-07, | |
| "loss": 1.9513, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 7.0891925372574045, | |
| "eval_loss": 1.9810665845870972, | |
| "eval_runtime": 244.7676, | |
| "eval_samples_per_second": 22.916, | |
| "eval_steps_per_second": 5.732, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 7.096700326588836, | |
| "grad_norm": 1.3634998798370361, | |
| "learning_rate": 5.527810977544814e-07, | |
| "loss": 1.972, | |
| "step": 23625 | |
| }, | |
| { | |
| "epoch": 7.104208115920267, | |
| "grad_norm": 1.1559998989105225, | |
| "learning_rate": 5.437579211651739e-07, | |
| "loss": 1.9436, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 7.111715905251699, | |
| "grad_norm": 1.1781272888183594, | |
| "learning_rate": 5.348062248908126e-07, | |
| "loss": 1.9489, | |
| "step": 23675 | |
| }, | |
| { | |
| "epoch": 7.11922369458313, | |
| "grad_norm": 1.2078481912612915, | |
| "learning_rate": 5.259261009164179e-07, | |
| "loss": 1.973, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 7.11922369458313, | |
| "eval_loss": 1.9810374975204468, | |
| "eval_runtime": 244.6697, | |
| "eval_samples_per_second": 22.925, | |
| "eval_steps_per_second": 5.734, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 7.126731483914561, | |
| "grad_norm": 1.2811923027038574, | |
| "learning_rate": 5.171176404915562e-07, | |
| "loss": 1.9334, | |
| "step": 23725 | |
| }, | |
| { | |
| "epoch": 7.134239273245993, | |
| "grad_norm": 1.2578486204147339, | |
| "learning_rate": 5.08380934129396e-07, | |
| "loss": 1.9083, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 7.141747062577424, | |
| "grad_norm": 1.2639051675796509, | |
| "learning_rate": 4.99716071605785e-07, | |
| "loss": 1.9363, | |
| "step": 23775 | |
| }, | |
| { | |
| "epoch": 7.1492548519088555, | |
| "grad_norm": 1.4398607015609741, | |
| "learning_rate": 4.911231419583228e-07, | |
| "loss": 1.9547, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 7.1492548519088555, | |
| "eval_loss": 1.9809601306915283, | |
| "eval_runtime": 244.5082, | |
| "eval_samples_per_second": 22.94, | |
| "eval_steps_per_second": 5.738, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 7.156762641240287, | |
| "grad_norm": 1.1956835985183716, | |
| "learning_rate": 4.826022334854482e-07, | |
| "loss": 1.9501, | |
| "step": 23825 | |
| }, | |
| { | |
| "epoch": 7.164270430571718, | |
| "grad_norm": 1.192142367362976, | |
| "learning_rate": 4.741534337455333e-07, | |
| "loss": 1.9336, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 7.17177821990315, | |
| "grad_norm": 1.204443335533142, | |
| "learning_rate": 4.6577682955597804e-07, | |
| "loss": 1.9482, | |
| "step": 23875 | |
| }, | |
| { | |
| "epoch": 7.1792860092345805, | |
| "grad_norm": 1.205980896949768, | |
| "learning_rate": 4.5747250699232664e-07, | |
| "loss": 1.9229, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 7.1792860092345805, | |
| "eval_loss": 1.9809165000915527, | |
| "eval_runtime": 244.9164, | |
| "eval_samples_per_second": 22.902, | |
| "eval_steps_per_second": 5.728, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 7.186793798566012, | |
| "grad_norm": 1.1392741203308105, | |
| "learning_rate": 4.492405513873732e-07, | |
| "loss": 1.9091, | |
| "step": 23925 | |
| }, | |
| { | |
| "epoch": 7.194301587897444, | |
| "grad_norm": 1.1868606805801392, | |
| "learning_rate": 4.4108104733029506e-07, | |
| "loss": 1.9538, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 7.201809377228875, | |
| "grad_norm": 1.2095065116882324, | |
| "learning_rate": 4.32994078665776e-07, | |
| "loss": 1.9259, | |
| "step": 23975 | |
| }, | |
| { | |
| "epoch": 7.209317166560306, | |
| "grad_norm": 1.2137978076934814, | |
| "learning_rate": 4.2497972849314587e-07, | |
| "loss": 1.9086, | |
| "step": 24000 | |
| }, | |
    {
      "epoch": 7.209317166560306,
      "eval_loss": 1.9811201095581055,
      "eval_runtime": 244.5079,
      "eval_samples_per_second": 22.94,
      "eval_steps_per_second": 5.738,
      "step": 24000
    },
    {
      "epoch": 7.216824955891738,
      "grad_norm": 1.3852958679199219,
      "learning_rate": 4.170380791655323e-07,
      "loss": 1.9304,
      "step": 24025
    },
    {
      "epoch": 7.224332745223169,
      "grad_norm": 1.1705379486083984,
      "learning_rate": 4.0916921228900643e-07,
      "loss": 1.9256,
      "step": 24050
    },
    {
      "epoch": 7.231840534554601,
      "grad_norm": 1.2448861598968506,
      "learning_rate": 4.013732087217492e-07,
      "loss": 1.9281,
      "step": 24075
    },
    {
      "epoch": 7.2393483238860314,
      "grad_norm": 1.1513874530792236,
      "learning_rate": 3.9365014857322133e-07,
      "loss": 1.9121,
      "step": 24100
    },
    {
      "epoch": 7.2393483238860314,
      "eval_loss": 1.9809224605560303,
      "eval_runtime": 244.5719,
      "eval_samples_per_second": 22.934,
      "eval_steps_per_second": 5.737,
      "step": 24100
    },
    {
      "epoch": 7.246856113217463,
      "grad_norm": 1.2126648426055908,
      "learning_rate": 3.8600011120333483e-07,
      "loss": 1.9301,
      "step": 24125
    },
    {
      "epoch": 7.254363902548895,
      "grad_norm": 1.213840126991272,
      "learning_rate": 3.7842317522164274e-07,
      "loss": 1.9395,
      "step": 24150
    },
    {
      "epoch": 7.261871691880326,
      "grad_norm": 1.1836591958999634,
      "learning_rate": 3.709194184865314e-07,
      "loss": 1.9326,
      "step": 24175
    },
    {
      "epoch": 7.269379481211757,
      "grad_norm": 1.1955537796020508,
      "learning_rate": 3.6348891810441457e-07,
      "loss": 1.9385,
      "step": 24200
    },
    {
      "epoch": 7.269379481211757,
      "eval_loss": 1.9811633825302124,
      "eval_runtime": 244.8759,
      "eval_samples_per_second": 22.905,
      "eval_steps_per_second": 5.729,
      "step": 24200
    },
    {
      "epoch": 7.276887270543188,
      "grad_norm": 1.0566610097885132,
      "learning_rate": 3.5613175042894823e-07,
      "loss": 1.9263,
      "step": 24225
    },
    {
      "epoch": 7.28439505987462,
      "grad_norm": 1.196273922920227,
      "learning_rate": 3.4884799106024185e-07,
      "loss": 1.905,
      "step": 24250
    },
    {
      "epoch": 7.291902849206052,
      "grad_norm": 1.1962950229644775,
      "learning_rate": 3.4163771484408247e-07,
      "loss": 1.9178,
      "step": 24275
    },
    {
      "epoch": 7.299410638537482,
      "grad_norm": 1.1637682914733887,
      "learning_rate": 3.3450099587116533e-07,
      "loss": 1.9427,
      "step": 24300
    },
    {
      "epoch": 7.299410638537482,
      "eval_loss": 1.9810516834259033,
      "eval_runtime": 244.4733,
      "eval_samples_per_second": 22.943,
      "eval_steps_per_second": 5.739,
      "step": 24300
    },
    {
      "epoch": 7.306918427868914,
      "grad_norm": 1.2521005868911743,
      "learning_rate": 3.2743790747633285e-07,
      "loss": 1.9469,
      "step": 24325
    },
    {
      "epoch": 7.314426217200346,
      "grad_norm": 1.241564154624939,
      "learning_rate": 3.2044852223782337e-07,
      "loss": 1.9265,
      "step": 24350
    },
    {
      "epoch": 7.321934006531777,
      "grad_norm": 1.2285447120666504,
      "learning_rate": 3.135329119765204e-07,
      "loss": 1.9296,
      "step": 24375
    },
    {
      "epoch": 7.329441795863208,
      "grad_norm": 1.1928914785385132,
      "learning_rate": 3.0669114775521784e-07,
      "loss": 1.9409,
      "step": 24400
    },
    {
      "epoch": 7.329441795863208,
      "eval_loss": 1.9809722900390625,
      "eval_runtime": 244.5739,
      "eval_samples_per_second": 22.934,
      "eval_steps_per_second": 5.737,
      "step": 24400
    },
    {
      "epoch": 7.336949585194639,
      "grad_norm": 1.1230217218399048,
      "learning_rate": 2.9992329987789004e-07,
      "loss": 1.9087,
      "step": 24425
    },
    {
      "epoch": 7.344457374526071,
      "grad_norm": 1.2940632104873657,
      "learning_rate": 2.932294378889672e-07,
      "loss": 1.9574,
      "step": 24450
    },
    {
      "epoch": 7.3519651638575025,
      "grad_norm": 1.3414610624313354,
      "learning_rate": 2.8660963057262427e-07,
      "loss": 1.945,
      "step": 24475
    },
    {
      "epoch": 7.359472953188933,
      "grad_norm": 1.2023084163665771,
      "learning_rate": 2.800639459520693e-07,
      "loss": 1.9368,
      "step": 24500
    },
    {
      "epoch": 7.359472953188933,
      "eval_loss": 1.9809428453445435,
      "eval_runtime": 244.6252,
      "eval_samples_per_second": 22.929,
      "eval_steps_per_second": 5.735,
      "step": 24500
    },
    {
      "epoch": 7.366980742520365,
      "grad_norm": 1.3090476989746094,
      "learning_rate": 2.7359245128884935e-07,
      "loss": 1.9401,
      "step": 24525
    },
    {
      "epoch": 7.374488531851796,
      "grad_norm": 1.1625995635986328,
      "learning_rate": 2.6719521308215644e-07,
      "loss": 1.9421,
      "step": 24550
    },
    {
      "epoch": 7.381996321183228,
      "grad_norm": 1.1445448398590088,
      "learning_rate": 2.608722970681446e-07,
      "loss": 1.9201,
      "step": 24575
    },
    {
      "epoch": 7.389504110514659,
      "grad_norm": 1.2008908987045288,
      "learning_rate": 2.5462376821925453e-07,
      "loss": 1.9368,
      "step": 24600
    },
    {
      "epoch": 7.389504110514659,
      "eval_loss": 1.9809165000915527,
      "eval_runtime": 245.2653,
      "eval_samples_per_second": 22.869,
      "eval_steps_per_second": 5.72,
      "step": 24600
    },
    {
      "epoch": 7.39701189984609,
      "grad_norm": 1.1442121267318726,
      "learning_rate": 2.484496907435452e-07,
      "loss": 1.9356,
      "step": 24625
    },
    {
      "epoch": 7.404519689177522,
      "grad_norm": 1.194258689880371,
      "learning_rate": 2.42350128084039e-07,
      "loss": 1.957,
      "step": 24650
    },
    {
      "epoch": 7.4120274785089535,
      "grad_norm": 1.2677561044692993,
      "learning_rate": 2.3632514291806185e-07,
      "loss": 1.9405,
      "step": 24675
    },
    {
      "epoch": 7.419535267840384,
      "grad_norm": 1.1491544246673584,
      "learning_rate": 2.3037479715660337e-07,
      "loss": 1.921,
      "step": 24700
    },
    {
      "epoch": 7.419535267840384,
      "eval_loss": 1.9808403253555298,
      "eval_runtime": 245.1407,
      "eval_samples_per_second": 22.881,
      "eval_steps_per_second": 5.723,
      "step": 24700
    },
    {
      "epoch": 7.427043057171816,
      "grad_norm": 1.2121251821517944,
      "learning_rate": 2.2449915194368258e-07,
      "loss": 1.9255,
      "step": 24725
    },
    {
      "epoch": 7.434550846503247,
      "grad_norm": 1.2214640378952026,
      "learning_rate": 2.1869826765571505e-07,
      "loss": 1.9289,
      "step": 24750
    },
    {
      "epoch": 7.4420586358346785,
      "grad_norm": 1.3879413604736328,
      "learning_rate": 2.1297220390089662e-07,
      "loss": 1.9396,
      "step": 24775
    },
    {
      "epoch": 7.44956642516611,
      "grad_norm": 1.1950923204421997,
      "learning_rate": 2.0732101951858816e-07,
      "loss": 1.926,
      "step": 24800
    },
    {
      "epoch": 7.44956642516611,
      "eval_loss": 1.9808101654052734,
      "eval_runtime": 244.5077,
      "eval_samples_per_second": 22.94,
      "eval_steps_per_second": 5.738,
      "step": 24800
    },
    {
      "epoch": 7.457074214497541,
      "grad_norm": 1.1813709735870361,
      "learning_rate": 2.0174477257871277e-07,
      "loss": 1.9482,
      "step": 24825
    },
    {
      "epoch": 7.464582003828973,
      "grad_norm": 1.2802395820617676,
      "learning_rate": 1.9624352038115773e-07,
      "loss": 1.9456,
      "step": 24850
    },
    {
      "epoch": 7.472089793160404,
      "grad_norm": 1.1578987836837769,
      "learning_rate": 1.9103292580586406e-07,
      "loss": 1.9607,
      "step": 24875
    },
    {
      "epoch": 7.479597582491835,
      "grad_norm": 1.2001193761825562,
      "learning_rate": 1.856788265674736e-07,
      "loss": 1.8946,
      "step": 24900
    },
    {
      "epoch": 7.479597582491835,
      "eval_loss": 1.9809141159057617,
      "eval_runtime": 245.3503,
      "eval_samples_per_second": 22.861,
      "eval_steps_per_second": 5.718,
      "step": 24900
    },
    {
      "epoch": 7.487105371823267,
      "grad_norm": 1.0772101879119873,
      "learning_rate": 1.8039988716037763e-07,
      "loss": 1.9356,
      "step": 24925
    },
    {
      "epoch": 7.494613161154698,
      "grad_norm": 1.2263306379318237,
      "learning_rate": 1.7519616182942067e-07,
      "loss": 1.9384,
      "step": 24950
    },
    {
      "epoch": 7.5021209504861295,
      "grad_norm": 1.1009598970413208,
      "learning_rate": 1.7006770404656534e-07,
      "loss": 1.9524,
      "step": 24975
    },
    {
      "epoch": 7.509628739817561,
      "grad_norm": 1.192656397819519,
      "learning_rate": 1.6501456651034808e-07,
      "loss": 1.9367,
      "step": 25000
    },
    {
      "epoch": 7.509628739817561,
      "eval_loss": 1.9808765649795532,
      "eval_runtime": 244.5611,
      "eval_samples_per_second": 22.935,
      "eval_steps_per_second": 5.737,
      "step": 25000
    },
    {
      "epoch": 7.517136529148992,
      "grad_norm": 1.0909212827682495,
      "learning_rate": 1.6003680114533763e-07,
      "loss": 1.906,
      "step": 25025
    },
    {
      "epoch": 7.524644318480424,
      "grad_norm": 1.217670202255249,
      "learning_rate": 1.5513445910159823e-07,
      "loss": 1.9117,
      "step": 25050
    },
    {
      "epoch": 7.5321521078118545,
      "grad_norm": 1.1752556562423706,
      "learning_rate": 1.503075907541665e-07,
      "loss": 1.9262,
      "step": 25075
    },
    {
      "epoch": 7.539659897143286,
      "grad_norm": 1.1952420473098755,
      "learning_rate": 1.455562457025353e-07,
      "loss": 1.9484,
      "step": 25100
    },
    {
      "epoch": 7.539659897143286,
      "eval_loss": 1.9809269905090332,
      "eval_runtime": 244.6411,
      "eval_samples_per_second": 22.927,
      "eval_steps_per_second": 5.735,
      "step": 25100
    },
    {
      "epoch": 7.547167686474717,
      "grad_norm": 1.1993337869644165,
      "learning_rate": 1.4088047277013987e-07,
      "loss": 1.9473,
      "step": 25125
    },
    {
      "epoch": 7.554675475806149,
      "grad_norm": 1.2464344501495361,
      "learning_rate": 1.3628032000386008e-07,
      "loss": 1.9417,
      "step": 25150
    },
    {
      "epoch": 7.56218326513758,
      "grad_norm": 1.3131685256958008,
      "learning_rate": 1.3175583467352316e-07,
      "loss": 1.9431,
      "step": 25175
    },
    {
      "epoch": 7.569691054469011,
      "grad_norm": 1.255356788635254,
      "learning_rate": 1.2730706327142155e-07,
      "loss": 1.9323,
      "step": 25200
    },
    {
      "epoch": 7.569691054469011,
      "eval_loss": 1.9808063507080078,
      "eval_runtime": 244.7486,
      "eval_samples_per_second": 22.917,
      "eval_steps_per_second": 5.732,
      "step": 25200
    },
    {
      "epoch": 7.577198843800443,
      "grad_norm": 1.2939985990524292,
      "learning_rate": 1.2293405151183184e-07,
      "loss": 1.9484,
      "step": 25225
    },
    {
      "epoch": 7.584706633131875,
      "grad_norm": 1.2347224950790405,
      "learning_rate": 1.1863684433054994e-07,
      "loss": 1.9408,
      "step": 25250
    },
    {
      "epoch": 7.5922144224633055,
      "grad_norm": 1.142849087715149,
      "learning_rate": 1.1441548588442152e-07,
      "loss": 1.9449,
      "step": 25275
    },
    {
      "epoch": 7.599722211794737,
      "grad_norm": 1.2810004949569702,
      "learning_rate": 1.1027001955089572e-07,
      "loss": 1.9499,
      "step": 25300
    },
    {
      "epoch": 7.599722211794737,
      "eval_loss": 1.9808244705200195,
      "eval_runtime": 244.5894,
      "eval_samples_per_second": 22.932,
      "eval_steps_per_second": 5.736,
      "step": 25300
    },
    {
      "epoch": 7.607230001126169,
      "grad_norm": 1.2134476900100708,
      "learning_rate": 1.0620048792757464e-07,
      "loss": 1.9384,
      "step": 25325
    },
    {
      "epoch": 7.6147377904576,
      "grad_norm": 1.1853543519973755,
      "learning_rate": 1.0220693283177957e-07,
      "loss": 1.945,
      "step": 25350
    },
    {
      "epoch": 7.622245579789031,
      "grad_norm": 1.2892330884933472,
      "learning_rate": 9.82893953001171e-08,
      "loss": 1.9541,
      "step": 25375
    },
    {
      "epoch": 7.629753369120462,
      "grad_norm": 1.139492392539978,
      "learning_rate": 9.444791558806121e-08,
      "loss": 1.9462,
      "step": 25400
    },
    {
      "epoch": 7.629753369120462,
      "eval_loss": 1.9807994365692139,
      "eval_runtime": 244.5984,
      "eval_samples_per_second": 22.931,
      "eval_steps_per_second": 5.736,
      "step": 25400
    },
    {
      "epoch": 7.637261158451894,
      "grad_norm": 1.1218518018722534,
      "learning_rate": 9.068253316953684e-08,
      "loss": 1.926,
      "step": 25425
    },
    {
      "epoch": 7.644768947783325,
      "grad_norm": 1.1967252492904663,
      "learning_rate": 8.699328673651613e-08,
      "loss": 1.921,
      "step": 25450
    },
    {
      "epoch": 7.652276737114756,
      "grad_norm": 1.1184207201004028,
      "learning_rate": 8.338021419861868e-08,
      "loss": 1.9127,
      "step": 25475
    },
    {
      "epoch": 7.659784526446188,
      "grad_norm": 1.2853116989135742,
      "learning_rate": 7.984335268272441e-08,
      "loss": 1.9373,
      "step": 25500
    },
    {
      "epoch": 7.659784526446188,
      "eval_loss": 1.9808040857315063,
      "eval_runtime": 244.5622,
      "eval_samples_per_second": 22.935,
      "eval_steps_per_second": 5.737,
      "step": 25500
    },
    {
      "epoch": 7.667292315777619,
      "grad_norm": 1.1911348104476929,
      "learning_rate": 7.638273853259131e-08,
      "loss": 1.9449,
      "step": 25525
    },
    {
      "epoch": 7.674800105109051,
      "grad_norm": 1.2387864589691162,
      "learning_rate": 7.299840730847995e-08,
      "loss": 1.9314,
      "step": 25550
    },
    {
      "epoch": 7.682307894440482,
      "grad_norm": 1.2212029695510864,
      "learning_rate": 6.969039378679292e-08,
      "loss": 1.9205,
      "step": 25575
    },
    {
      "epoch": 7.689815683771913,
      "grad_norm": 1.2776437997817993,
      "learning_rate": 6.645873195971098e-08,
      "loss": 1.984,
      "step": 25600
    },
    {
      "epoch": 7.689815683771913,
      "eval_loss": 1.9808552265167236,
      "eval_runtime": 244.7299,
      "eval_samples_per_second": 22.919,
      "eval_steps_per_second": 5.733,
      "step": 25600
    },
    {
      "epoch": 7.697323473103345,
      "grad_norm": 1.2346168756484985,
      "learning_rate": 6.330345503484908e-08,
      "loss": 1.9367,
      "step": 25625
    },
    {
      "epoch": 7.704831262434777,
      "grad_norm": 1.2425425052642822,
      "learning_rate": 6.02245954349126e-08,
      "loss": 1.9449,
      "step": 25650
    },
    {
      "epoch": 7.712339051766207,
      "grad_norm": 1.1887537240982056,
      "learning_rate": 5.722218479736502e-08,
      "loss": 1.9207,
      "step": 25675
    },
    {
      "epoch": 7.719846841097639,
      "grad_norm": 1.1335868835449219,
      "learning_rate": 5.429625397410237e-08,
      "loss": 1.9374,
      "step": 25700
    },
    {
      "epoch": 7.719846841097639,
      "eval_loss": 1.9808275699615479,
      "eval_runtime": 244.9051,
      "eval_samples_per_second": 22.903,
      "eval_steps_per_second": 5.729,
      "step": 25700
    },
    {
      "epoch": 7.72735463042907,
      "grad_norm": 1.2654999494552612,
      "learning_rate": 5.144683303113684e-08,
      "loss": 1.9645,
      "step": 25725
    },
    {
      "epoch": 7.734862419760502,
      "grad_norm": 1.3081024885177612,
      "learning_rate": 4.8673951248286166e-08,
      "loss": 1.9205,
      "step": 25750
    },
    {
      "epoch": 7.742370209091932,
      "grad_norm": 1.2318024635314941,
      "learning_rate": 4.597763711887637e-08,
      "loss": 1.9425,
      "step": 25775
    },
    {
      "epoch": 7.749877998423364,
      "grad_norm": 1.2977827787399292,
      "learning_rate": 4.335791834944369e-08,
      "loss": 1.9496,
      "step": 25800
    },
    {
      "epoch": 7.749877998423364,
      "eval_loss": 1.9807723760604858,
      "eval_runtime": 244.5126,
      "eval_samples_per_second": 22.94,
      "eval_steps_per_second": 5.738,
      "step": 25800
    },
    {
      "epoch": 7.757385787754796,
      "grad_norm": 1.0694193840026855,
      "learning_rate": 4.081482185945479e-08,
      "loss": 1.9416,
      "step": 25825
    },
    {
      "epoch": 7.764893577086227,
      "grad_norm": 1.2503465414047241,
      "learning_rate": 3.8348373781026955e-08,
      "loss": 1.9512,
      "step": 25850
    },
    {
      "epoch": 7.772401366417658,
      "grad_norm": 1.2763334512710571,
      "learning_rate": 3.5958599458662537e-08,
      "loss": 1.9338,
      "step": 25875
    },
    {
      "epoch": 7.77990915574909,
      "grad_norm": 1.1357148885726929,
      "learning_rate": 3.3645523448984914e-08,
      "loss": 1.9579,
      "step": 25900
    },
    {
      "epoch": 7.77990915574909,
      "eval_loss": 1.9807769060134888,
      "eval_runtime": 244.8991,
      "eval_samples_per_second": 22.903,
      "eval_steps_per_second": 5.729,
      "step": 25900
    },
    {
      "epoch": 7.787416945080521,
      "grad_norm": 1.1911311149597168,
      "learning_rate": 3.149715032283562e-08,
      "loss": 1.9241,
      "step": 25925
    },
    {
      "epoch": 7.7949247344119525,
      "grad_norm": 1.2044905424118042,
      "learning_rate": 2.933447122186239e-08,
      "loss": 1.9325,
      "step": 25950
    },
    {
      "epoch": 7.802432523743384,
      "grad_norm": 1.355704426765442,
      "learning_rate": 2.724855850118585e-08,
      "loss": 1.9369,
      "step": 25975
    },
    {
      "epoch": 7.809940313074815,
      "grad_norm": 1.1674330234527588,
      "learning_rate": 2.5239433595037053e-08,
      "loss": 1.9114,
      "step": 26000
    },
    {
      "epoch": 7.809940313074815,
      "eval_loss": 1.9807677268981934,
      "eval_runtime": 244.6737,
      "eval_samples_per_second": 22.924,
      "eval_steps_per_second": 5.734,
      "step": 26000
    },
    {
      "epoch": 7.817448102406247,
      "grad_norm": 1.242080807685852,
      "learning_rate": 2.33071171485974e-08,
      "loss": 1.9247,
      "step": 26025
    },
    {
      "epoch": 7.824955891737678,
      "grad_norm": 1.2494958639144897,
      "learning_rate": 2.1451629017787133e-08,
      "loss": 1.9284,
      "step": 26050
    },
    {
      "epoch": 7.832463681069109,
      "grad_norm": 1.2800395488739014,
      "learning_rate": 1.9672988269061332e-08,
      "loss": 1.9365,
      "step": 26075
    },
    {
      "epoch": 7.83997147040054,
      "grad_norm": 1.2159162759780884,
      "learning_rate": 1.797121317921341e-08,
      "loss": 1.9213,
      "step": 26100
    },
    {
      "epoch": 7.83997147040054,
      "eval_loss": 1.9807840585708618,
      "eval_runtime": 245.0653,
      "eval_samples_per_second": 22.888,
      "eval_steps_per_second": 5.725,
      "step": 26100
    },
    {
      "epoch": 7.847479259731972,
      "grad_norm": 1.2391120195388794,
      "learning_rate": 1.6346321235187756e-08,
      "loss": 1.9321,
      "step": 26125
    },
    {
      "epoch": 7.8549870490634035,
      "grad_norm": 1.1283122301101685,
      "learning_rate": 1.4798329133900724e-08,
      "loss": 1.9741,
      "step": 26150
    },
    {
      "epoch": 7.862494838394834,
      "grad_norm": 1.1563724279403687,
      "learning_rate": 1.3327252782067423e-08,
      "loss": 1.9312,
      "step": 26175
    },
    {
      "epoch": 7.870002627726266,
      "grad_norm": 1.2058521509170532,
      "learning_rate": 1.1933107296039358e-08,
      "loss": 1.9255,
      "step": 26200
    },
    {
      "epoch": 7.870002627726266,
      "eval_loss": 1.980788230895996,
      "eval_runtime": 244.7317,
      "eval_samples_per_second": 22.919,
      "eval_steps_per_second": 5.733,
      "step": 26200
    },
    {
      "epoch": 7.877510417057698,
      "grad_norm": 1.1759904623031616,
      "learning_rate": 1.0615907001648717e-08,
      "loss": 1.9553,
      "step": 26225
    },
    {
      "epoch": 7.8850182063891285,
      "grad_norm": 1.2614035606384277,
      "learning_rate": 9.37566543406182e-09,
      "loss": 1.9662,
      "step": 26250
    },
    {
      "epoch": 7.89252599572056,
      "grad_norm": 1.3371256589889526,
      "learning_rate": 8.212395337640066e-09,
      "loss": 1.9287,
      "step": 26275
    },
    {
      "epoch": 7.900033785051991,
      "grad_norm": 1.3627214431762695,
      "learning_rate": 7.126108665805875e-09,
      "loss": 1.9213,
      "step": 26300
    },
    {
      "epoch": 7.900033785051991,
      "eval_loss": 1.980796217918396,
      "eval_runtime": 244.3911,
      "eval_samples_per_second": 22.951,
      "eval_steps_per_second": 5.741,
      "step": 26300
    },
    {
      "epoch": 7.907541574383423,
      "grad_norm": 1.236215353012085,
      "learning_rate": 6.11681658092611e-09,
      "loss": 1.9431,
      "step": 26325
    },
    {
      "epoch": 7.9150493637148545,
      "grad_norm": 1.1915570497512817,
      "learning_rate": 5.184529454191344e-09,
      "loss": 1.9467,
      "step": 26350
    },
    {
      "epoch": 7.922557153046285,
      "grad_norm": 1.1169012784957886,
      "learning_rate": 4.329256865511777e-09,
      "loss": 1.9403,
      "step": 26375
    },
    {
      "epoch": 7.930064942377717,
      "grad_norm": 1.4462562799453735,
      "learning_rate": 3.5510076034198093e-09,
      "loss": 1.9356,
      "step": 26400
    },
    {
      "epoch": 7.930064942377717,
      "eval_loss": 1.980790138244629,
      "eval_runtime": 244.7112,
      "eval_samples_per_second": 22.921,
      "eval_steps_per_second": 5.733,
      "step": 26400
    },
    {
      "epoch": 7.937572731709148,
      "grad_norm": 1.2058677673339844,
      "learning_rate": 2.8497896649767872e-09,
      "loss": 1.9233,
      "step": 26425
    },
    {
      "epoch": 7.9450805210405795,
      "grad_norm": 1.246536135673523,
      "learning_rate": 2.225610255694732e-09,
      "loss": 1.9439,
      "step": 26450
    },
    {
      "epoch": 7.952588310372011,
      "grad_norm": 1.4007676839828491,
      "learning_rate": 1.6784757894588998e-09,
      "loss": 1.947,
      "step": 26475
    },
    {
      "epoch": 7.960096099703442,
      "grad_norm": 1.2076547145843506,
      "learning_rate": 1.2083918884636668e-09,
      "loss": 1.9181,
      "step": 26500
    },
    {
      "epoch": 7.960096099703442,
      "eval_loss": 1.9807934761047363,
      "eval_runtime": 244.5038,
      "eval_samples_per_second": 22.94,
      "eval_steps_per_second": 5.738,
      "step": 26500
    },
    {
      "epoch": 7.967603889034874,
      "grad_norm": 1.1972334384918213,
      "learning_rate": 8.15363383154244e-10,
      "loss": 1.928,
      "step": 26525
    },
    {
      "epoch": 7.975111678366305,
      "grad_norm": 1.246110439300537,
      "learning_rate": 4.993943121767153e-10,
      "loss": 1.9361,
      "step": 26550
    },
    {
      "epoch": 7.982619467697736,
      "grad_norm": 1.2921074628829956,
      "learning_rate": 2.604879223364054e-10,
      "loss": 1.9193,
      "step": 26575
    },
    {
      "epoch": 7.990127257029168,
      "grad_norm": 1.2516605854034424,
      "learning_rate": 9.864666856707061e-11,
      "loss": 1.9276,
      "step": 26600
    },
    {
      "epoch": 7.990127257029168,
      "eval_loss": 1.9807960987091064,
      "eval_runtime": 244.5065,
      "eval_samples_per_second": 22.94,
      "eval_steps_per_second": 5.738,
      "step": 26600
    },
    {
      "epoch": 7.997635046360599,
      "grad_norm": 1.2723952531814575,
      "learning_rate": 1.3872213900922859e-11,
      "loss": 1.9274,
      "step": 26625
    }
  ],
  "logging_steps": 25,
  "max_steps": 26632,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.055320138484023e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}