| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 9.0, |
| "eval_steps": 500, |
| "global_step": 2205, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.04094165813715456, |
| "grad_norm": 12.65462875366211, |
| "learning_rate": 9.000000000000001e-07, |
| "loss": 0.84, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.08188331627430911, |
| "grad_norm": 15.826253890991211, |
| "learning_rate": 1.9000000000000002e-06, |
| "loss": 0.9075, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.12282497441146366, |
| "grad_norm": 12.505751609802246, |
| "learning_rate": 2.9e-06, |
| "loss": 0.9015, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.16376663254861823, |
| "grad_norm": 15.053168296813965, |
| "learning_rate": 3.900000000000001e-06, |
| "loss": 0.8386, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.2047082906857728, |
| "grad_norm": 15.304803848266602, |
| "learning_rate": 4.9000000000000005e-06, |
| "loss": 0.8111, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.24564994882292732, |
| "grad_norm": 11.536665916442871, |
| "learning_rate": 5.9e-06, |
| "loss": 1.0011, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2865916069600819, |
| "grad_norm": 15.474630355834961, |
| "learning_rate": 6.9e-06, |
| "loss": 0.9621, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.32753326509723646, |
| "grad_norm": 16.180992126464844, |
| "learning_rate": 7.9e-06, |
| "loss": 0.9414, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.368474923234391, |
| "grad_norm": 19.92721939086914, |
| "learning_rate": 8.900000000000001e-06, |
| "loss": 0.8467, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.4094165813715456, |
| "grad_norm": 13.27595043182373, |
| "learning_rate": 9.9e-06, |
| "loss": 0.8855, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.4503582395087001, |
| "grad_norm": 17.830875396728516, |
| "learning_rate": 9.961702127659575e-06, |
| "loss": 0.8637, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.49129989764585463, |
| "grad_norm": 13.432883262634277, |
| "learning_rate": 9.919148936170213e-06, |
| "loss": 0.8807, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5322415557830092, |
| "grad_norm": 19.632150650024414, |
| "learning_rate": 9.876595744680851e-06, |
| "loss": 0.7876, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5731832139201638, |
| "grad_norm": 8.750787734985352, |
| "learning_rate": 9.834042553191491e-06, |
| "loss": 0.8812, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6141248720573184, |
| "grad_norm": 11.828136444091797, |
| "learning_rate": 9.79148936170213e-06, |
| "loss": 0.7476, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6550665301944729, |
| "grad_norm": 8.025004386901855, |
| "learning_rate": 9.748936170212768e-06, |
| "loss": 0.7658, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6960081883316275, |
| "grad_norm": 17.197458267211914, |
| "learning_rate": 9.706382978723406e-06, |
| "loss": 0.8227, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.736949846468782, |
| "grad_norm": 18.768630981445312, |
| "learning_rate": 9.663829787234044e-06, |
| "loss": 0.8761, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.7778915046059366, |
| "grad_norm": 11.378251075744629, |
| "learning_rate": 9.621276595744682e-06, |
| "loss": 0.7956, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.8188331627430911, |
| "grad_norm": 12.888134956359863, |
| "learning_rate": 9.57872340425532e-06, |
| "loss": 0.7741, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.8597748208802457, |
| "grad_norm": 9.256698608398438, |
| "learning_rate": 9.536170212765959e-06, |
| "loss": 0.8193, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.9007164790174002, |
| "grad_norm": 9.790871620178223, |
| "learning_rate": 9.493617021276597e-06, |
| "loss": 0.8279, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.9416581371545547, |
| "grad_norm": 17.922643661499023, |
| "learning_rate": 9.451063829787235e-06, |
| "loss": 0.8167, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.9825997952917093, |
| "grad_norm": 15.21545696258545, |
| "learning_rate": 9.408510638297873e-06, |
| "loss": 0.678, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_general_loss": 0.7405520677566528, |
| "eval_general_runtime": 257.7451, |
| "eval_general_samples_per_second": 3.55, |
| "eval_general_steps_per_second": 0.888, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_code_loss": 0.8061306476593018, |
| "eval_code_runtime": 300.2792, |
| "eval_code_samples_per_second": 3.057, |
| "eval_code_steps_per_second": 0.766, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_stem_loss": 0.7527948021888733, |
| "eval_stem_runtime": 253.8295, |
| "eval_stem_samples_per_second": 3.601, |
| "eval_stem_steps_per_second": 0.902, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.0204708290685773, |
| "grad_norm": 9.183212280273438, |
| "learning_rate": 9.365957446808511e-06, |
| "loss": 0.6701, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.0614124872057318, |
| "grad_norm": 11.502631187438965, |
| "learning_rate": 9.32340425531915e-06, |
| "loss": 0.6915, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.1023541453428864, |
| "grad_norm": 14.637332916259766, |
| "learning_rate": 9.280851063829788e-06, |
| "loss": 0.7034, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.143295803480041, |
| "grad_norm": 22.042236328125, |
| "learning_rate": 9.238297872340426e-06, |
| "loss": 0.7613, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.1842374616171956, |
| "grad_norm": 10.717690467834473, |
| "learning_rate": 9.195744680851064e-06, |
| "loss": 0.7115, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.22517911975435, |
| "grad_norm": 12.595451354980469, |
| "learning_rate": 9.153191489361702e-06, |
| "loss": 0.6753, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.2661207778915047, |
| "grad_norm": 15.185698509216309, |
| "learning_rate": 9.11063829787234e-06, |
| "loss": 0.6543, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.3070624360286591, |
| "grad_norm": 8.96109676361084, |
| "learning_rate": 9.06808510638298e-06, |
| "loss": 0.7118, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.3480040941658138, |
| "grad_norm": 11.143041610717773, |
| "learning_rate": 9.025531914893619e-06, |
| "loss": 0.7155, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.3889457523029682, |
| "grad_norm": 13.331513404846191, |
| "learning_rate": 8.982978723404257e-06, |
| "loss": 0.5873, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.429887410440123, |
| "grad_norm": 7.453923225402832, |
| "learning_rate": 8.940425531914895e-06, |
| "loss": 0.6085, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.4708290685772774, |
| "grad_norm": 10.874267578125, |
| "learning_rate": 8.897872340425533e-06, |
| "loss": 0.7046, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.511770726714432, |
| "grad_norm": 18.965225219726562, |
| "learning_rate": 8.855319148936171e-06, |
| "loss": 0.7275, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.5527123848515865, |
| "grad_norm": 11.133731842041016, |
| "learning_rate": 8.81276595744681e-06, |
| "loss": 0.7185, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.593654042988741, |
| "grad_norm": 9.591411590576172, |
| "learning_rate": 8.770212765957448e-06, |
| "loss": 0.6325, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.6345957011258956, |
| "grad_norm": 9.676285743713379, |
| "learning_rate": 8.727659574468086e-06, |
| "loss": 0.5229, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.6755373592630503, |
| "grad_norm": 17.216745376586914, |
| "learning_rate": 8.685106382978724e-06, |
| "loss": 0.5627, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.7164790174002047, |
| "grad_norm": 12.413490295410156, |
| "learning_rate": 8.642553191489362e-06, |
| "loss": 0.6627, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.7574206755373591, |
| "grad_norm": 18.200937271118164, |
| "learning_rate": 8.6e-06, |
| "loss": 0.821, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.7983623336745138, |
| "grad_norm": 7.938803195953369, |
| "learning_rate": 8.557446808510639e-06, |
| "loss": 0.5181, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.8393039918116685, |
| "grad_norm": 16.005313873291016, |
| "learning_rate": 8.514893617021277e-06, |
| "loss": 0.5963, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.880245649948823, |
| "grad_norm": 7.592184066772461, |
| "learning_rate": 8.472340425531915e-06, |
| "loss": 0.5118, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.9211873080859774, |
| "grad_norm": 7.5901384353637695, |
| "learning_rate": 8.429787234042553e-06, |
| "loss": 0.6951, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.962128966223132, |
| "grad_norm": 15.962983131408691, |
| "learning_rate": 8.387234042553192e-06, |
| "loss": 0.5939, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 21.49174690246582, |
| "learning_rate": 8.34468085106383e-06, |
| "loss": 0.5632, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_general_loss": 0.6747614741325378, |
| "eval_general_runtime": 258.8315, |
| "eval_general_samples_per_second": 3.535, |
| "eval_general_steps_per_second": 0.885, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_code_loss": 0.7175714373588562, |
| "eval_code_runtime": 301.597, |
| "eval_code_samples_per_second": 3.044, |
| "eval_code_steps_per_second": 0.763, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_stem_loss": 0.6549679040908813, |
| "eval_stem_runtime": 254.6647, |
| "eval_stem_samples_per_second": 3.589, |
| "eval_stem_steps_per_second": 0.899, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.0409416581371547, |
| "grad_norm": 15.108848571777344, |
| "learning_rate": 8.30212765957447e-06, |
| "loss": 0.5936, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.0818833162743093, |
| "grad_norm": 5.768886566162109, |
| "learning_rate": 8.259574468085108e-06, |
| "loss": 0.567, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.1228249744114636, |
| "grad_norm": 7.63210916519165, |
| "learning_rate": 8.217021276595746e-06, |
| "loss": 0.4894, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.1637666325486182, |
| "grad_norm": 19.224069595336914, |
| "learning_rate": 8.174468085106384e-06, |
| "loss": 0.5285, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.204708290685773, |
| "grad_norm": 9.942646026611328, |
| "learning_rate": 8.131914893617023e-06, |
| "loss": 0.4813, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.245649948822927, |
| "grad_norm": 9.9766206741333, |
| "learning_rate": 8.08936170212766e-06, |
| "loss": 0.5319, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.286591606960082, |
| "grad_norm": 10.004892349243164, |
| "learning_rate": 8.046808510638299e-06, |
| "loss": 0.4969, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.3275332650972365, |
| "grad_norm": 10.710165977478027, |
| "learning_rate": 8.004255319148937e-06, |
| "loss": 0.5878, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.368474923234391, |
| "grad_norm": 12.612700462341309, |
| "learning_rate": 7.961702127659575e-06, |
| "loss": 0.479, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.409416581371546, |
| "grad_norm": 9.927840232849121, |
| "learning_rate": 7.919148936170214e-06, |
| "loss": 0.4759, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.4503582395087, |
| "grad_norm": 20.39271354675293, |
| "learning_rate": 7.876595744680852e-06, |
| "loss": 0.5486, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.4912998976458547, |
| "grad_norm": 10.480104446411133, |
| "learning_rate": 7.83404255319149e-06, |
| "loss": 0.4948, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.5322415557830094, |
| "grad_norm": 7.528670310974121, |
| "learning_rate": 7.791489361702128e-06, |
| "loss": 0.4786, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.5731832139201636, |
| "grad_norm": 13.00081729888916, |
| "learning_rate": 7.748936170212766e-06, |
| "loss": 0.6303, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.6141248720573182, |
| "grad_norm": 15.72313404083252, |
| "learning_rate": 7.706382978723405e-06, |
| "loss": 0.5574, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.655066530194473, |
| "grad_norm": 7.435535907745361, |
| "learning_rate": 7.663829787234043e-06, |
| "loss": 0.6302, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.6960081883316276, |
| "grad_norm": 12.462494850158691, |
| "learning_rate": 7.621276595744681e-06, |
| "loss": 0.5318, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.7369498464687823, |
| "grad_norm": 13.541357040405273, |
| "learning_rate": 7.578723404255319e-06, |
| "loss": 0.5983, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.7778915046059365, |
| "grad_norm": 10.254051208496094, |
| "learning_rate": 7.536170212765958e-06, |
| "loss": 0.5733, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.818833162743091, |
| "grad_norm": 9.48480224609375, |
| "learning_rate": 7.4936170212765964e-06, |
| "loss": 0.4989, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.859774820880246, |
| "grad_norm": 15.369186401367188, |
| "learning_rate": 7.4510638297872355e-06, |
| "loss": 0.5001, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.9007164790174, |
| "grad_norm": 19.331863403320312, |
| "learning_rate": 7.408510638297874e-06, |
| "loss": 0.5833, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.9416581371545547, |
| "grad_norm": 8.6927490234375, |
| "learning_rate": 7.365957446808512e-06, |
| "loss": 0.5061, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.9825997952917094, |
| "grad_norm": 10.688565254211426, |
| "learning_rate": 7.32340425531915e-06, |
| "loss": 0.5222, |
| "step": 730 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_general_loss": 0.672736406326294, |
| "eval_general_runtime": 258.2016, |
| "eval_general_samples_per_second": 3.544, |
| "eval_general_steps_per_second": 0.887, |
| "step": 735 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_code_loss": 0.6921422481536865, |
| "eval_code_runtime": 300.8317, |
| "eval_code_samples_per_second": 3.052, |
| "eval_code_steps_per_second": 0.765, |
| "step": 735 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_stem_loss": 0.6068028211593628, |
| "eval_stem_runtime": 254.127, |
| "eval_stem_samples_per_second": 3.597, |
| "eval_stem_steps_per_second": 0.901, |
| "step": 735 |
| }, |
| { |
| "epoch": 3.0204708290685773, |
| "grad_norm": 18.046438217163086, |
| "learning_rate": 7.280851063829788e-06, |
| "loss": 0.6208, |
| "step": 740 |
| }, |
| { |
| "epoch": 3.061412487205732, |
| "grad_norm": 9.861907005310059, |
| "learning_rate": 7.2382978723404265e-06, |
| "loss": 0.4168, |
| "step": 750 |
| }, |
| { |
| "epoch": 3.1023541453428862, |
| "grad_norm": 12.162917137145996, |
| "learning_rate": 7.195744680851065e-06, |
| "loss": 0.4801, |
| "step": 760 |
| }, |
| { |
| "epoch": 3.143295803480041, |
| "grad_norm": 8.550954818725586, |
| "learning_rate": 7.153191489361703e-06, |
| "loss": 0.4002, |
| "step": 770 |
| }, |
| { |
| "epoch": 3.1842374616171956, |
| "grad_norm": 13.538202285766602, |
| "learning_rate": 7.110638297872341e-06, |
| "loss": 0.4252, |
| "step": 780 |
| }, |
| { |
| "epoch": 3.2251791197543502, |
| "grad_norm": 16.993202209472656, |
| "learning_rate": 7.068085106382979e-06, |
| "loss": 0.4298, |
| "step": 790 |
| }, |
| { |
| "epoch": 3.2661207778915045, |
| "grad_norm": 8.579163551330566, |
| "learning_rate": 7.0255319148936175e-06, |
| "loss": 0.4718, |
| "step": 800 |
| }, |
| { |
| "epoch": 3.307062436028659, |
| "grad_norm": 11.938567161560059, |
| "learning_rate": 6.982978723404256e-06, |
| "loss": 0.4885, |
| "step": 810 |
| }, |
| { |
| "epoch": 3.348004094165814, |
| "grad_norm": 10.950126647949219, |
| "learning_rate": 6.940425531914894e-06, |
| "loss": 0.525, |
| "step": 820 |
| }, |
| { |
| "epoch": 3.3889457523029685, |
| "grad_norm": 11.802833557128906, |
| "learning_rate": 6.897872340425532e-06, |
| "loss": 0.4151, |
| "step": 830 |
| }, |
| { |
| "epoch": 3.4298874104401227, |
| "grad_norm": 11.907119750976562, |
| "learning_rate": 6.85531914893617e-06, |
| "loss": 0.4043, |
| "step": 840 |
| }, |
| { |
| "epoch": 3.4708290685772774, |
| "grad_norm": 16.365446090698242, |
| "learning_rate": 6.8127659574468085e-06, |
| "loss": 0.4343, |
| "step": 850 |
| }, |
| { |
| "epoch": 3.511770726714432, |
| "grad_norm": 15.271421432495117, |
| "learning_rate": 6.770212765957447e-06, |
| "loss": 0.3902, |
| "step": 860 |
| }, |
| { |
| "epoch": 3.5527123848515867, |
| "grad_norm": 13.381245613098145, |
| "learning_rate": 6.727659574468086e-06, |
| "loss": 0.3752, |
| "step": 870 |
| }, |
| { |
| "epoch": 3.593654042988741, |
| "grad_norm": 10.570069313049316, |
| "learning_rate": 6.685106382978725e-06, |
| "loss": 0.4237, |
| "step": 880 |
| }, |
| { |
| "epoch": 3.6345957011258956, |
| "grad_norm": 11.140630722045898, |
| "learning_rate": 6.642553191489363e-06, |
| "loss": 0.4037, |
| "step": 890 |
| }, |
| { |
| "epoch": 3.6755373592630503, |
| "grad_norm": 10.076521873474121, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 0.3291, |
| "step": 900 |
| }, |
| { |
| "epoch": 3.7164790174002045, |
| "grad_norm": 21.271434783935547, |
| "learning_rate": 6.557446808510639e-06, |
| "loss": 0.513, |
| "step": 910 |
| }, |
| { |
| "epoch": 3.757420675537359, |
| "grad_norm": 16.138254165649414, |
| "learning_rate": 6.514893617021278e-06, |
| "loss": 0.438, |
| "step": 920 |
| }, |
| { |
| "epoch": 3.798362333674514, |
| "grad_norm": 12.374245643615723, |
| "learning_rate": 6.472340425531916e-06, |
| "loss": 0.4408, |
| "step": 930 |
| }, |
| { |
| "epoch": 3.8393039918116685, |
| "grad_norm": 10.709432601928711, |
| "learning_rate": 6.429787234042554e-06, |
| "loss": 0.4978, |
| "step": 940 |
| }, |
| { |
| "epoch": 3.880245649948823, |
| "grad_norm": 19.277435302734375, |
| "learning_rate": 6.387234042553192e-06, |
| "loss": 0.4056, |
| "step": 950 |
| }, |
| { |
| "epoch": 3.9211873080859774, |
| "grad_norm": 15.328817367553711, |
| "learning_rate": 6.34468085106383e-06, |
| "loss": 0.3557, |
| "step": 960 |
| }, |
| { |
| "epoch": 3.962128966223132, |
| "grad_norm": 10.352958679199219, |
| "learning_rate": 6.302127659574469e-06, |
| "loss": 0.3695, |
| "step": 970 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 12.802416801452637, |
| "learning_rate": 6.259574468085107e-06, |
| "loss": 0.3172, |
| "step": 980 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_general_loss": 0.7220921516418457, |
| "eval_general_runtime": 258.5979, |
| "eval_general_samples_per_second": 3.538, |
| "eval_general_steps_per_second": 0.886, |
| "step": 980 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_code_loss": 0.7049041390419006, |
| "eval_code_runtime": 301.544, |
| "eval_code_samples_per_second": 3.044, |
| "eval_code_steps_per_second": 0.763, |
| "step": 980 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_stem_loss": 0.5855021476745605, |
| "eval_stem_runtime": 254.8479, |
| "eval_stem_samples_per_second": 3.586, |
| "eval_stem_steps_per_second": 0.899, |
| "step": 980 |
| }, |
| { |
| "epoch": 4.040941658137155, |
| "grad_norm": 17.460784912109375, |
| "learning_rate": 6.217021276595745e-06, |
| "loss": 0.3011, |
| "step": 990 |
| }, |
| { |
| "epoch": 4.081883316274309, |
| "grad_norm": 13.8486909866333, |
| "learning_rate": 6.174468085106383e-06, |
| "loss": 0.418, |
| "step": 1000 |
| }, |
| { |
| "epoch": 4.122824974411464, |
| "grad_norm": 10.446855545043945, |
| "learning_rate": 6.131914893617021e-06, |
| "loss": 0.3746, |
| "step": 1010 |
| }, |
| { |
| "epoch": 4.163766632548619, |
| "grad_norm": 9.332606315612793, |
| "learning_rate": 6.08936170212766e-06, |
| "loss": 0.3303, |
| "step": 1020 |
| }, |
| { |
| "epoch": 4.2047082906857725, |
| "grad_norm": 12.63461685180664, |
| "learning_rate": 6.046808510638298e-06, |
| "loss": 0.4094, |
| "step": 1030 |
| }, |
| { |
| "epoch": 4.245649948822927, |
| "grad_norm": 17.926715850830078, |
| "learning_rate": 6.004255319148936e-06, |
| "loss": 0.3677, |
| "step": 1040 |
| }, |
| { |
| "epoch": 4.286591606960082, |
| "grad_norm": 18.4414005279541, |
| "learning_rate": 5.961702127659575e-06, |
| "loss": 0.339, |
| "step": 1050 |
| }, |
| { |
| "epoch": 4.3275332650972365, |
| "grad_norm": 12.794388771057129, |
| "learning_rate": 5.919148936170214e-06, |
| "loss": 0.3158, |
| "step": 1060 |
| }, |
| { |
| "epoch": 4.368474923234391, |
| "grad_norm": 24.449329376220703, |
| "learning_rate": 5.876595744680852e-06, |
| "loss": 0.3316, |
| "step": 1070 |
| }, |
| { |
| "epoch": 4.409416581371546, |
| "grad_norm": 16.00278091430664, |
| "learning_rate": 5.8340425531914905e-06, |
| "loss": 0.3224, |
| "step": 1080 |
| }, |
| { |
| "epoch": 4.4503582395087005, |
| "grad_norm": 13.257128715515137, |
| "learning_rate": 5.791489361702129e-06, |
| "loss": 0.2868, |
| "step": 1090 |
| }, |
| { |
| "epoch": 4.491299897645854, |
| "grad_norm": 7.790375232696533, |
| "learning_rate": 5.748936170212767e-06, |
| "loss": 0.3271, |
| "step": 1100 |
| }, |
| { |
| "epoch": 4.532241555783009, |
| "grad_norm": 22.618724822998047, |
| "learning_rate": 5.706382978723405e-06, |
| "loss": 0.3771, |
| "step": 1110 |
| }, |
| { |
| "epoch": 4.573183213920164, |
| "grad_norm": 12.749550819396973, |
| "learning_rate": 5.663829787234043e-06, |
| "loss": 0.2641, |
| "step": 1120 |
| }, |
| { |
| "epoch": 4.614124872057318, |
| "grad_norm": 20.562509536743164, |
| "learning_rate": 5.6212765957446815e-06, |
| "loss": 0.3129, |
| "step": 1130 |
| }, |
| { |
| "epoch": 4.655066530194473, |
| "grad_norm": 17.5664119720459, |
| "learning_rate": 5.57872340425532e-06, |
| "loss": 0.3977, |
| "step": 1140 |
| }, |
| { |
| "epoch": 4.696008188331628, |
| "grad_norm": 20.270048141479492, |
| "learning_rate": 5.536170212765958e-06, |
| "loss": 0.275, |
| "step": 1150 |
| }, |
| { |
| "epoch": 4.736949846468782, |
| "grad_norm": 22.615859985351562, |
| "learning_rate": 5.493617021276596e-06, |
| "loss": 0.3605, |
| "step": 1160 |
| }, |
| { |
| "epoch": 4.777891504605937, |
| "grad_norm": 17.331926345825195, |
| "learning_rate": 5.451063829787234e-06, |
| "loss": 0.3862, |
| "step": 1170 |
| }, |
| { |
| "epoch": 4.818833162743092, |
| "grad_norm": 10.380560874938965, |
| "learning_rate": 5.4085106382978725e-06, |
| "loss": 0.3313, |
| "step": 1180 |
| }, |
| { |
| "epoch": 4.859774820880245, |
| "grad_norm": 18.984834671020508, |
| "learning_rate": 5.365957446808511e-06, |
| "loss": 0.3625, |
| "step": 1190 |
| }, |
| { |
| "epoch": 4.9007164790174, |
| "grad_norm": 12.365246772766113, |
| "learning_rate": 5.323404255319149e-06, |
| "loss": 0.3083, |
| "step": 1200 |
| }, |
| { |
| "epoch": 4.941658137154555, |
| "grad_norm": 16.08763313293457, |
| "learning_rate": 5.280851063829787e-06, |
| "loss": 0.2523, |
| "step": 1210 |
| }, |
| { |
| "epoch": 4.982599795291709, |
| "grad_norm": 14.736597061157227, |
| "learning_rate": 5.238297872340425e-06, |
| "loss": 0.3311, |
| "step": 1220 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_general_loss": 0.779835045337677, |
| "eval_general_runtime": 258.2842, |
| "eval_general_samples_per_second": 3.543, |
| "eval_general_steps_per_second": 0.887, |
| "step": 1225 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_code_loss": 0.7314270734786987, |
| "eval_code_runtime": 301.4852, |
| "eval_code_samples_per_second": 3.045, |
| "eval_code_steps_per_second": 0.763, |
| "step": 1225 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_stem_loss": 0.556253969669342, |
| "eval_stem_runtime": 254.3639, |
| "eval_stem_samples_per_second": 3.593, |
| "eval_stem_steps_per_second": 0.9, |
| "step": 1225 |
| }, |
| { |
| "epoch": 5.020470829068577, |
| "grad_norm": 13.4270658493042, |
| "learning_rate": 5.195744680851064e-06, |
| "loss": 0.387, |
| "step": 1230 |
| }, |
| { |
| "epoch": 5.061412487205732, |
| "grad_norm": 12.8772611618042, |
| "learning_rate": 5.153191489361703e-06, |
| "loss": 0.2378, |
| "step": 1240 |
| }, |
| { |
| "epoch": 5.102354145342886, |
| "grad_norm": 10.802362442016602, |
| "learning_rate": 5.110638297872342e-06, |
| "loss": 0.2646, |
| "step": 1250 |
| }, |
| { |
| "epoch": 5.143295803480041, |
| "grad_norm": 12.26051139831543, |
| "learning_rate": 5.06808510638298e-06, |
| "loss": 0.211, |
| "step": 1260 |
| }, |
| { |
| "epoch": 5.184237461617196, |
| "grad_norm": 31.475887298583984, |
| "learning_rate": 5.025531914893618e-06, |
| "loss": 0.2837, |
| "step": 1270 |
| }, |
| { |
| "epoch": 5.22517911975435, |
| "grad_norm": 23.216684341430664, |
| "learning_rate": 4.982978723404256e-06, |
| "loss": 0.2867, |
| "step": 1280 |
| }, |
| { |
| "epoch": 5.266120777891505, |
| "grad_norm": 21.613176345825195, |
| "learning_rate": 4.940425531914894e-06, |
| "loss": 0.2246, |
| "step": 1290 |
| }, |
| { |
| "epoch": 5.30706243602866, |
| "grad_norm": 11.157214164733887, |
| "learning_rate": 4.897872340425533e-06, |
| "loss": 0.2666, |
| "step": 1300 |
| }, |
| { |
| "epoch": 5.348004094165813, |
| "grad_norm": 15.762431144714355, |
| "learning_rate": 4.855319148936171e-06, |
| "loss": 0.2029, |
| "step": 1310 |
| }, |
| { |
| "epoch": 5.388945752302968, |
| "grad_norm": 10.91984748840332, |
| "learning_rate": 4.812765957446809e-06, |
| "loss": 0.3634, |
| "step": 1320 |
| }, |
| { |
| "epoch": 5.429887410440123, |
| "grad_norm": 8.84945297241211, |
| "learning_rate": 4.770212765957447e-06, |
| "loss": 0.2654, |
| "step": 1330 |
| }, |
| { |
| "epoch": 5.470829068577277, |
| "grad_norm": 14.776636123657227, |
| "learning_rate": 4.727659574468085e-06, |
| "loss": 0.3422, |
| "step": 1340 |
| }, |
| { |
| "epoch": 5.511770726714432, |
| "grad_norm": 8.540399551391602, |
| "learning_rate": 4.685106382978724e-06, |
| "loss": 0.2828, |
| "step": 1350 |
| }, |
| { |
| "epoch": 5.552712384851587, |
| "grad_norm": 8.571439743041992, |
| "learning_rate": 4.642553191489363e-06, |
| "loss": 0.2662, |
| "step": 1360 |
| }, |
| { |
| "epoch": 5.593654042988741, |
| "grad_norm": 11.396147727966309, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 0.1846, |
| "step": 1370 |
| }, |
| { |
| "epoch": 5.634595701125896, |
| "grad_norm": 24.86265754699707, |
| "learning_rate": 4.557446808510639e-06, |
| "loss": 0.2243, |
| "step": 1380 |
| }, |
| { |
| "epoch": 5.67553735926305, |
| "grad_norm": 6.110672473907471, |
| "learning_rate": 4.514893617021277e-06, |
| "loss": 0.2774, |
| "step": 1390 |
| }, |
| { |
| "epoch": 5.7164790174002045, |
| "grad_norm": 11.962754249572754, |
| "learning_rate": 4.4723404255319155e-06, |
| "loss": 0.2846, |
| "step": 1400 |
| }, |
| { |
| "epoch": 5.757420675537359, |
| "grad_norm": 15.486808776855469, |
| "learning_rate": 4.429787234042554e-06, |
| "loss": 0.2143, |
| "step": 1410 |
| }, |
| { |
| "epoch": 5.798362333674514, |
| "grad_norm": 25.425642013549805, |
| "learning_rate": 4.387234042553192e-06, |
| "loss": 0.2319, |
| "step": 1420 |
| }, |
| { |
| "epoch": 5.8393039918116685, |
| "grad_norm": 23.38964080810547, |
| "learning_rate": 4.34468085106383e-06, |
| "loss": 0.2954, |
| "step": 1430 |
| }, |
| { |
| "epoch": 5.880245649948823, |
| "grad_norm": 22.06728744506836, |
| "learning_rate": 4.302127659574468e-06, |
| "loss": 0.2101, |
| "step": 1440 |
| }, |
| { |
| "epoch": 5.921187308085978, |
| "grad_norm": 13.701712608337402, |
| "learning_rate": 4.259574468085107e-06, |
| "loss": 0.1925, |
| "step": 1450 |
| }, |
| { |
| "epoch": 5.962128966223132, |
| "grad_norm": 11.441407203674316, |
| "learning_rate": 4.2170212765957455e-06, |
| "loss": 0.2212, |
| "step": 1460 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 72.40815734863281, |
| "learning_rate": 4.174468085106384e-06, |
| "loss": 0.3422, |
| "step": 1470 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_general_loss": 0.8248616456985474, |
| "eval_general_runtime": 258.5737, |
| "eval_general_samples_per_second": 3.539, |
| "eval_general_steps_per_second": 0.886, |
| "step": 1470 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_code_loss": 0.756708025932312, |
| "eval_code_runtime": 301.597, |
| "eval_code_samples_per_second": 3.044, |
| "eval_code_steps_per_second": 0.763, |
| "step": 1470 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_stem_loss": 0.5508426427841187, |
| "eval_stem_runtime": 254.7551, |
| "eval_stem_samples_per_second": 3.588, |
| "eval_stem_steps_per_second": 0.899, |
| "step": 1470 |
| }, |
| { |
| "epoch": 6.040941658137155, |
| "grad_norm": 7.497542858123779, |
| "learning_rate": 4.131914893617022e-06, |
| "loss": 0.1778, |
| "step": 1480 |
| }, |
| { |
| "epoch": 6.081883316274309, |
| "grad_norm": 23.541690826416016, |
| "learning_rate": 4.08936170212766e-06, |
| "loss": 0.1269, |
| "step": 1490 |
| }, |
| { |
| "epoch": 6.122824974411464, |
| "grad_norm": 22.970813751220703, |
| "learning_rate": 4.046808510638298e-06, |
| "loss": 0.2644, |
| "step": 1500 |
| }, |
| { |
| "epoch": 6.163766632548619, |
| "grad_norm": 22.942781448364258, |
| "learning_rate": 4.0042553191489365e-06, |
| "loss": 0.1903, |
| "step": 1510 |
| }, |
| { |
| "epoch": 6.2047082906857725, |
| "grad_norm": 26.27296257019043, |
| "learning_rate": 3.961702127659575e-06, |
| "loss": 0.1649, |
| "step": 1520 |
| }, |
| { |
| "epoch": 6.245649948822927, |
| "grad_norm": 26.045181274414062, |
| "learning_rate": 3.919148936170213e-06, |
| "loss": 0.2009, |
| "step": 1530 |
| }, |
| { |
| "epoch": 6.286591606960082, |
| "grad_norm": 10.866025924682617, |
| "learning_rate": 3.876595744680852e-06, |
| "loss": 0.142, |
| "step": 1540 |
| }, |
| { |
| "epoch": 6.3275332650972365, |
| "grad_norm": 10.95513916015625, |
| "learning_rate": 3.83404255319149e-06, |
| "loss": 0.1989, |
| "step": 1550 |
| }, |
| { |
| "epoch": 6.368474923234391, |
| "grad_norm": 15.439634323120117, |
| "learning_rate": 3.791489361702128e-06, |
| "loss": 0.2761, |
| "step": 1560 |
| }, |
| { |
| "epoch": 6.409416581371546, |
| "grad_norm": 18.19639778137207, |
| "learning_rate": 3.748936170212766e-06, |
| "loss": 0.2585, |
| "step": 1570 |
| }, |
| { |
| "epoch": 6.4503582395087005, |
| "grad_norm": 4.902035713195801, |
| "learning_rate": 3.7063829787234048e-06, |
| "loss": 0.1663, |
| "step": 1580 |
| }, |
| { |
| "epoch": 6.491299897645854, |
| "grad_norm": 14.698819160461426, |
| "learning_rate": 3.663829787234043e-06, |
| "loss": 0.1555, |
| "step": 1590 |
| }, |
| { |
| "epoch": 6.532241555783009, |
| "grad_norm": 9.941628456115723, |
| "learning_rate": 3.621276595744681e-06, |
| "loss": 0.1812, |
| "step": 1600 |
| }, |
| { |
| "epoch": 6.573183213920164, |
| "grad_norm": 27.069652557373047, |
| "learning_rate": 3.5787234042553194e-06, |
| "loss": 0.2373, |
| "step": 1610 |
| }, |
| { |
| "epoch": 6.614124872057318, |
| "grad_norm": 10.796792984008789, |
| "learning_rate": 3.5361702127659576e-06, |
| "loss": 0.1699, |
| "step": 1620 |
| }, |
| { |
| "epoch": 6.655066530194473, |
| "grad_norm": 11.419804573059082, |
| "learning_rate": 3.493617021276596e-06, |
| "loss": 0.2121, |
| "step": 1630 |
| }, |
| { |
| "epoch": 6.696008188331628, |
| "grad_norm": 6.083690166473389, |
| "learning_rate": 3.4510638297872344e-06, |
| "loss": 0.1873, |
| "step": 1640 |
| }, |
| { |
| "epoch": 6.736949846468782, |
| "grad_norm": 17.91828155517578, |
| "learning_rate": 3.4085106382978726e-06, |
| "loss": 0.1834, |
| "step": 1650 |
| }, |
| { |
| "epoch": 6.777891504605937, |
| "grad_norm": 27.208126068115234, |
| "learning_rate": 3.3659574468085108e-06, |
| "loss": 0.2051, |
| "step": 1660 |
| }, |
| { |
| "epoch": 6.818833162743092, |
| "grad_norm": 11.397320747375488, |
| "learning_rate": 3.3234042553191494e-06, |
| "loss": 0.2659, |
| "step": 1670 |
| }, |
| { |
| "epoch": 6.859774820880245, |
| "grad_norm": 14.502364158630371, |
| "learning_rate": 3.2808510638297876e-06, |
| "loss": 0.1841, |
| "step": 1680 |
| }, |
| { |
| "epoch": 6.9007164790174, |
| "grad_norm": 12.21772289276123, |
| "learning_rate": 3.238297872340426e-06, |
| "loss": 0.2016, |
| "step": 1690 |
| }, |
| { |
| "epoch": 6.941658137154555, |
| "grad_norm": 15.077445983886719, |
| "learning_rate": 3.195744680851064e-06, |
| "loss": 0.1812, |
| "step": 1700 |
| }, |
| { |
| "epoch": 6.982599795291709, |
| "grad_norm": 13.093221664428711, |
| "learning_rate": 3.1531914893617022e-06, |
| "loss": 0.2056, |
| "step": 1710 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_general_loss": 0.9050961136817932, |
| "eval_general_runtime": 259.0231, |
| "eval_general_samples_per_second": 3.533, |
| "eval_general_steps_per_second": 0.884, |
| "step": 1715 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_code_loss": 0.7956153154373169, |
| "eval_code_runtime": 301.4062, |
| "eval_code_samples_per_second": 3.046, |
| "eval_code_steps_per_second": 0.763, |
| "step": 1715 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_stem_loss": 0.5608986020088196, |
| "eval_stem_runtime": 254.5486, |
| "eval_stem_samples_per_second": 3.591, |
| "eval_stem_steps_per_second": 0.9, |
| "step": 1715 |
| }, |
| { |
| "epoch": 7.020470829068577, |
| "grad_norm": 14.868309020996094, |
| "learning_rate": 3.110638297872341e-06, |
| "loss": 0.1469, |
| "step": 1720 |
| }, |
| { |
| "epoch": 7.061412487205732, |
| "grad_norm": 6.83207893371582, |
| "learning_rate": 3.068085106382979e-06, |
| "loss": 0.12, |
| "step": 1730 |
| }, |
| { |
| "epoch": 7.102354145342886, |
| "grad_norm": 24.612173080444336, |
| "learning_rate": 3.0255319148936172e-06, |
| "loss": 0.1619, |
| "step": 1740 |
| }, |
| { |
| "epoch": 7.143295803480041, |
| "grad_norm": 23.589954376220703, |
| "learning_rate": 2.9829787234042554e-06, |
| "loss": 0.164, |
| "step": 1750 |
| }, |
| { |
| "epoch": 7.184237461617196, |
| "grad_norm": 15.2968111038208, |
| "learning_rate": 2.940425531914894e-06, |
| "loss": 0.148, |
| "step": 1760 |
| }, |
| { |
| "epoch": 7.22517911975435, |
| "grad_norm": 7.013993263244629, |
| "learning_rate": 2.8978723404255323e-06, |
| "loss": 0.1285, |
| "step": 1770 |
| }, |
| { |
| "epoch": 7.266120777891505, |
| "grad_norm": 9.457442283630371, |
| "learning_rate": 2.8553191489361705e-06, |
| "loss": 0.145, |
| "step": 1780 |
| }, |
| { |
| "epoch": 7.30706243602866, |
| "grad_norm": 2.331630229949951, |
| "learning_rate": 2.8127659574468087e-06, |
| "loss": 0.1446, |
| "step": 1790 |
| }, |
| { |
| "epoch": 7.348004094165813, |
| "grad_norm": 11.692824363708496, |
| "learning_rate": 2.770212765957447e-06, |
| "loss": 0.149, |
| "step": 1800 |
| }, |
| { |
| "epoch": 7.388945752302968, |
| "grad_norm": 19.317241668701172, |
| "learning_rate": 2.7276595744680855e-06, |
| "loss": 0.2257, |
| "step": 1810 |
| }, |
| { |
| "epoch": 7.429887410440123, |
| "grad_norm": 10.800030708312988, |
| "learning_rate": 2.6851063829787237e-06, |
| "loss": 0.1752, |
| "step": 1820 |
| }, |
| { |
| "epoch": 7.470829068577277, |
| "grad_norm": 14.40637493133545, |
| "learning_rate": 2.642553191489362e-06, |
| "loss": 0.1514, |
| "step": 1830 |
| }, |
| { |
| "epoch": 7.511770726714432, |
| "grad_norm": 5.871068477630615, |
| "learning_rate": 2.6e-06, |
| "loss": 0.1303, |
| "step": 1840 |
| }, |
| { |
| "epoch": 7.552712384851587, |
| "grad_norm": 11.30500602722168, |
| "learning_rate": 2.5574468085106387e-06, |
| "loss": 0.1182, |
| "step": 1850 |
| }, |
| { |
| "epoch": 7.593654042988741, |
| "grad_norm": 26.69110679626465, |
| "learning_rate": 2.514893617021277e-06, |
| "loss": 0.1964, |
| "step": 1860 |
| }, |
| { |
| "epoch": 7.634595701125896, |
| "grad_norm": 18.002952575683594, |
| "learning_rate": 2.472340425531915e-06, |
| "loss": 0.1584, |
| "step": 1870 |
| }, |
| { |
| "epoch": 7.67553735926305, |
| "grad_norm": 26.592117309570312, |
| "learning_rate": 2.4297872340425533e-06, |
| "loss": 0.1992, |
| "step": 1880 |
| }, |
| { |
| "epoch": 7.7164790174002045, |
| "grad_norm": 19.570964813232422, |
| "learning_rate": 2.387234042553192e-06, |
| "loss": 0.0965, |
| "step": 1890 |
| }, |
| { |
| "epoch": 7.757420675537359, |
| "grad_norm": 16.35107421875, |
| "learning_rate": 2.34468085106383e-06, |
| "loss": 0.1296, |
| "step": 1900 |
| }, |
| { |
| "epoch": 7.798362333674514, |
| "grad_norm": 15.40775203704834, |
| "learning_rate": 2.3021276595744683e-06, |
| "loss": 0.1328, |
| "step": 1910 |
| }, |
| { |
| "epoch": 7.8393039918116685, |
| "grad_norm": 11.510584831237793, |
| "learning_rate": 2.2595744680851065e-06, |
| "loss": 0.1945, |
| "step": 1920 |
| }, |
| { |
| "epoch": 7.880245649948823, |
| "grad_norm": 8.702047348022461, |
| "learning_rate": 2.2170212765957447e-06, |
| "loss": 0.1984, |
| "step": 1930 |
| }, |
| { |
| "epoch": 7.921187308085978, |
| "grad_norm": 5.505640029907227, |
| "learning_rate": 2.1744680851063834e-06, |
| "loss": 0.1607, |
| "step": 1940 |
| }, |
| { |
| "epoch": 7.962128966223132, |
| "grad_norm": 3.8454320430755615, |
| "learning_rate": 2.1319148936170216e-06, |
| "loss": 0.1541, |
| "step": 1950 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 7.759490489959717, |
| "learning_rate": 2.0893617021276598e-06, |
| "loss": 0.1646, |
| "step": 1960 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_general_loss": 0.9728627800941467, |
| "eval_general_runtime": 258.8007, |
| "eval_general_samples_per_second": 3.536, |
| "eval_general_steps_per_second": 0.885, |
| "step": 1960 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_code_loss": 0.8330367803573608, |
| "eval_code_runtime": 301.3569, |
| "eval_code_samples_per_second": 3.046, |
| "eval_code_steps_per_second": 0.763, |
| "step": 1960 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_stem_loss": 0.5864213109016418, |
| "eval_stem_runtime": 254.5316, |
| "eval_stem_samples_per_second": 3.591, |
| "eval_stem_steps_per_second": 0.9, |
| "step": 1960 |
| }, |
| { |
| "epoch": 8.040941658137154, |
| "grad_norm": 26.345972061157227, |
| "learning_rate": 2.046808510638298e-06, |
| "loss": 0.1327, |
| "step": 1970 |
| }, |
| { |
| "epoch": 8.08188331627431, |
| "grad_norm": 9.560205459594727, |
| "learning_rate": 2.0042553191489366e-06, |
| "loss": 0.1402, |
| "step": 1980 |
| }, |
| { |
| "epoch": 8.122824974411463, |
| "grad_norm": 7.314760208129883, |
| "learning_rate": 1.961702127659575e-06, |
| "loss": 0.1227, |
| "step": 1990 |
| }, |
| { |
| "epoch": 8.163766632548619, |
| "grad_norm": 5.977358818054199, |
| "learning_rate": 1.919148936170213e-06, |
| "loss": 0.1612, |
| "step": 2000 |
| }, |
| { |
| "epoch": 8.204708290685772, |
| "grad_norm": 26.247756958007812, |
| "learning_rate": 1.8765957446808512e-06, |
| "loss": 0.1227, |
| "step": 2010 |
| }, |
| { |
| "epoch": 8.245649948822928, |
| "grad_norm": 9.510503768920898, |
| "learning_rate": 1.8340425531914896e-06, |
| "loss": 0.1109, |
| "step": 2020 |
| }, |
| { |
| "epoch": 8.286591606960082, |
| "grad_norm": 6.378188133239746, |
| "learning_rate": 1.7914893617021278e-06, |
| "loss": 0.1069, |
| "step": 2030 |
| }, |
| { |
| "epoch": 8.327533265097237, |
| "grad_norm": 14.533955574035645, |
| "learning_rate": 1.748936170212766e-06, |
| "loss": 0.1088, |
| "step": 2040 |
| }, |
| { |
| "epoch": 8.368474923234391, |
| "grad_norm": 10.068609237670898, |
| "learning_rate": 1.7063829787234042e-06, |
| "loss": 0.1631, |
| "step": 2050 |
| }, |
| { |
| "epoch": 8.409416581371545, |
| "grad_norm": 17.62321662902832, |
| "learning_rate": 1.6638297872340428e-06, |
| "loss": 0.1302, |
| "step": 2060 |
| }, |
| { |
| "epoch": 8.4503582395087, |
| "grad_norm": 6.006522178649902, |
| "learning_rate": 1.621276595744681e-06, |
| "loss": 0.1186, |
| "step": 2070 |
| }, |
| { |
| "epoch": 8.491299897645854, |
| "grad_norm": 13.717192649841309, |
| "learning_rate": 1.5787234042553192e-06, |
| "loss": 0.1684, |
| "step": 2080 |
| }, |
| { |
| "epoch": 8.53224155578301, |
| "grad_norm": 16.023700714111328, |
| "learning_rate": 1.5361702127659574e-06, |
| "loss": 0.12, |
| "step": 2090 |
| }, |
| { |
| "epoch": 8.573183213920164, |
| "grad_norm": 5.09406852722168, |
| "learning_rate": 1.4936170212765956e-06, |
| "loss": 0.1282, |
| "step": 2100 |
| }, |
| { |
| "epoch": 8.61412487205732, |
| "grad_norm": 10.95537281036377, |
| "learning_rate": 1.4510638297872343e-06, |
| "loss": 0.0831, |
| "step": 2110 |
| }, |
| { |
| "epoch": 8.655066530194473, |
| "grad_norm": 28.07141876220703, |
| "learning_rate": 1.4085106382978725e-06, |
| "loss": 0.1177, |
| "step": 2120 |
| }, |
| { |
| "epoch": 8.696008188331627, |
| "grad_norm": 8.062235832214355, |
| "learning_rate": 1.3659574468085107e-06, |
| "loss": 0.125, |
| "step": 2130 |
| }, |
| { |
| "epoch": 8.736949846468782, |
| "grad_norm": 5.076770305633545, |
| "learning_rate": 1.3234042553191489e-06, |
| "loss": 0.1576, |
| "step": 2140 |
| }, |
| { |
| "epoch": 8.777891504605936, |
| "grad_norm": 17.725698471069336, |
| "learning_rate": 1.2808510638297875e-06, |
| "loss": 0.1147, |
| "step": 2150 |
| }, |
| { |
| "epoch": 8.818833162743092, |
| "grad_norm": 3.1201858520507812, |
| "learning_rate": 1.2382978723404257e-06, |
| "loss": 0.1262, |
| "step": 2160 |
| }, |
| { |
| "epoch": 8.859774820880245, |
| "grad_norm": 9.379624366760254, |
| "learning_rate": 1.1957446808510639e-06, |
| "loss": 0.1748, |
| "step": 2170 |
| }, |
| { |
| "epoch": 8.900716479017401, |
| "grad_norm": 41.980812072753906, |
| "learning_rate": 1.153191489361702e-06, |
| "loss": 0.1284, |
| "step": 2180 |
| }, |
| { |
| "epoch": 8.941658137154555, |
| "grad_norm": 11.32117748260498, |
| "learning_rate": 1.1106382978723405e-06, |
| "loss": 0.1302, |
| "step": 2190 |
| }, |
| { |
| "epoch": 8.982599795291708, |
| "grad_norm": 20.424264907836914, |
| "learning_rate": 1.0680851063829787e-06, |
| "loss": 0.1124, |
| "step": 2200 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_general_loss": 1.0329692363739014, |
| "eval_general_runtime": 258.6599, |
| "eval_general_samples_per_second": 3.537, |
| "eval_general_steps_per_second": 0.885, |
| "step": 2205 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_code_loss": 0.8624024391174316, |
| "eval_code_runtime": 301.3223, |
| "eval_code_samples_per_second": 3.047, |
| "eval_code_steps_per_second": 0.763, |
| "step": 2205 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_stem_loss": 0.6094364523887634, |
| "eval_stem_runtime": 254.8446, |
| "eval_stem_samples_per_second": 3.586, |
| "eval_stem_steps_per_second": 0.899, |
| "step": 2205 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2450, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.805375367939883e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|