| { | |
| "best_global_step": 1750, | |
| "best_metric": 0.21951219512195122, | |
| "best_model_checkpoint": "out_qwen_4b_sft_augmented/checkpoint-1750", | |
| "epoch": 2.9185243637798384, | |
| "eval_steps": 50, | |
| "global_step": 1850, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.015782205563227462, | |
| "grad_norm": 324.9078480271927, | |
| "learning_rate": 2.3560209424083772e-07, | |
| "loss": 36.8443, | |
| "step": 10, | |
| "true_loss": 4.5233 | |
| }, | |
| { | |
| "epoch": 0.031564411126454923, | |
| "grad_norm": 168.6194251999255, | |
| "learning_rate": 4.973821989528796e-07, | |
| "loss": 36.0645, | |
| "step": 20, | |
| "true_loss": 4.3899 | |
| }, | |
| { | |
| "epoch": 0.04734661668968238, | |
| "grad_norm": 192.6442003874084, | |
| "learning_rate": 7.591623036649215e-07, | |
| "loss": 37.1343, | |
| "step": 30, | |
| "true_loss": 4.7917 | |
| }, | |
| { | |
| "epoch": 0.06312882225290985, | |
| "grad_norm": 169.69405108448686, | |
| "learning_rate": 1.0209424083769635e-06, | |
| "loss": 35.8286, | |
| "step": 40, | |
| "true_loss": 4.5888 | |
| }, | |
| { | |
| "epoch": 0.0789110278161373, | |
| "grad_norm": 177.6752080525516, | |
| "learning_rate": 1.2827225130890052e-06, | |
| "loss": 35.1516, | |
| "step": 50, | |
| "true_loss": 4.1833 | |
| }, | |
| { | |
| "epoch": 0.0789110278161373, | |
| "eval_accuracy": 0.02328159645232816, | |
| "eval_loss": 4.40457820892334, | |
| "eval_runtime": 23.7263, | |
| "eval_samples_per_second": 38.017, | |
| "eval_steps_per_second": 4.763, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.09469323337936476, | |
| "grad_norm": 175.29116271759068, | |
| "learning_rate": 1.5445026178010472e-06, | |
| "loss": 34.7314, | |
| "step": 60, | |
| "true_loss": 4.224 | |
| }, | |
| { | |
| "epoch": 0.11047543894259222, | |
| "grad_norm": 753.0264561952334, | |
| "learning_rate": 1.8062827225130891e-06, | |
| "loss": 33.9206, | |
| "step": 70, | |
| "true_loss": 4.0565 | |
| }, | |
| { | |
| "epoch": 0.1262576445058197, | |
| "grad_norm": 200.47588314223879, | |
| "learning_rate": 2.068062827225131e-06, | |
| "loss": 34.0037, | |
| "step": 80, | |
| "true_loss": 4.3621 | |
| }, | |
| { | |
| "epoch": 0.14203985006904715, | |
| "grad_norm": 164.18077066091436, | |
| "learning_rate": 2.329842931937173e-06, | |
| "loss": 33.3705, | |
| "step": 90, | |
| "true_loss": 4.1432 | |
| }, | |
| { | |
| "epoch": 0.1578220556322746, | |
| "grad_norm": 144.60587833889255, | |
| "learning_rate": 2.591623036649215e-06, | |
| "loss": 33.3276, | |
| "step": 100, | |
| "true_loss": 4.1424 | |
| }, | |
| { | |
| "epoch": 0.1578220556322746, | |
| "eval_accuracy": 0.043237250554323724, | |
| "eval_loss": 4.096348285675049, | |
| "eval_runtime": 23.3422, | |
| "eval_samples_per_second": 38.643, | |
| "eval_steps_per_second": 4.841, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.17360426119550207, | |
| "grad_norm": 270.635578631285, | |
| "learning_rate": 2.853403141361257e-06, | |
| "loss": 32.6581, | |
| "step": 110, | |
| "true_loss": 4.1497 | |
| }, | |
| { | |
| "epoch": 0.18938646675872953, | |
| "grad_norm": 100.33088317059614, | |
| "learning_rate": 3.115183246073299e-06, | |
| "loss": 32.8629, | |
| "step": 120, | |
| "true_loss": 4.1273 | |
| }, | |
| { | |
| "epoch": 0.20516867232195699, | |
| "grad_norm": 108.96506382691295, | |
| "learning_rate": 3.3769633507853404e-06, | |
| "loss": 33.0279, | |
| "step": 130, | |
| "true_loss": 4.1729 | |
| }, | |
| { | |
| "epoch": 0.22095087788518444, | |
| "grad_norm": 101.57985399798042, | |
| "learning_rate": 3.6387434554973826e-06, | |
| "loss": 33.3713, | |
| "step": 140, | |
| "true_loss": 4.2379 | |
| }, | |
| { | |
| "epoch": 0.2367330834484119, | |
| "grad_norm": 109.81214136700734, | |
| "learning_rate": 3.900523560209425e-06, | |
| "loss": 33.2539, | |
| "step": 150, | |
| "true_loss": 4.388 | |
| }, | |
| { | |
| "epoch": 0.2367330834484119, | |
| "eval_accuracy": 0.04434589800443459, | |
| "eval_loss": 4.049750328063965, | |
| "eval_runtime": 23.5037, | |
| "eval_samples_per_second": 38.377, | |
| "eval_steps_per_second": 4.808, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2525152890116394, | |
| "grad_norm": 131.31761293379333, | |
| "learning_rate": 4.1623036649214665e-06, | |
| "loss": 32.8656, | |
| "step": 160, | |
| "true_loss": 4.1088 | |
| }, | |
| { | |
| "epoch": 0.26829749457486685, | |
| "grad_norm": 93.21133961816744, | |
| "learning_rate": 4.424083769633508e-06, | |
| "loss": 32.7579, | |
| "step": 170, | |
| "true_loss": 4.092 | |
| }, | |
| { | |
| "epoch": 0.2840797001380943, | |
| "grad_norm": 99.10049214777489, | |
| "learning_rate": 4.68586387434555e-06, | |
| "loss": 32.777, | |
| "step": 180, | |
| "true_loss": 4.1288 | |
| }, | |
| { | |
| "epoch": 0.29986190570132176, | |
| "grad_norm": 103.48370344024212, | |
| "learning_rate": 4.947643979057592e-06, | |
| "loss": 32.983, | |
| "step": 190, | |
| "true_loss": 3.9901 | |
| }, | |
| { | |
| "epoch": 0.3156441112645492, | |
| "grad_norm": 101.42410788589872, | |
| "learning_rate": 4.976621858562245e-06, | |
| "loss": 32.8281, | |
| "step": 200, | |
| "true_loss": 4.1047 | |
| }, | |
| { | |
| "epoch": 0.3156441112645492, | |
| "eval_accuracy": 0.06873614190687362, | |
| "eval_loss": 4.009181022644043, | |
| "eval_runtime": 23.3653, | |
| "eval_samples_per_second": 38.604, | |
| "eval_steps_per_second": 4.836, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3314263168277767, | |
| "grad_norm": 83.2901468921201, | |
| "learning_rate": 4.94739918176505e-06, | |
| "loss": 33.1028, | |
| "step": 210, | |
| "true_loss": 4.0052 | |
| }, | |
| { | |
| "epoch": 0.34720852239100414, | |
| "grad_norm": 170.23336077552057, | |
| "learning_rate": 4.9181765049678555e-06, | |
| "loss": 32.644, | |
| "step": 220, | |
| "true_loss": 4.1984 | |
| }, | |
| { | |
| "epoch": 0.3629907279542316, | |
| "grad_norm": 73.10967930914595, | |
| "learning_rate": 4.888953828170661e-06, | |
| "loss": 32.9998, | |
| "step": 230, | |
| "true_loss": 4.1035 | |
| }, | |
| { | |
| "epoch": 0.37877293351745905, | |
| "grad_norm": 75.66187004852954, | |
| "learning_rate": 4.859731151373466e-06, | |
| "loss": 32.6017, | |
| "step": 240, | |
| "true_loss": 3.9774 | |
| }, | |
| { | |
| "epoch": 0.3945551390806865, | |
| "grad_norm": 101.5303291876841, | |
| "learning_rate": 4.830508474576272e-06, | |
| "loss": 32.5215, | |
| "step": 250, | |
| "true_loss": 4.107 | |
| }, | |
| { | |
| "epoch": 0.3945551390806865, | |
| "eval_accuracy": 0.07095343680709534, | |
| "eval_loss": 3.933732271194458, | |
| "eval_runtime": 23.3822, | |
| "eval_samples_per_second": 38.576, | |
| "eval_steps_per_second": 4.833, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.41033734464391397, | |
| "grad_norm": 87.68390524050062, | |
| "learning_rate": 4.801285797779077e-06, | |
| "loss": 32.8537, | |
| "step": 260, | |
| "true_loss": 4.0852 | |
| }, | |
| { | |
| "epoch": 0.42611955020714143, | |
| "grad_norm": 83.56798040873913, | |
| "learning_rate": 4.772063120981883e-06, | |
| "loss": 32.452, | |
| "step": 270, | |
| "true_loss": 4.1138 | |
| }, | |
| { | |
| "epoch": 0.4419017557703689, | |
| "grad_norm": 76.69106361052454, | |
| "learning_rate": 4.742840444184687e-06, | |
| "loss": 31.8737, | |
| "step": 280, | |
| "true_loss": 3.8671 | |
| }, | |
| { | |
| "epoch": 0.45768396133359635, | |
| "grad_norm": 79.70560950668154, | |
| "learning_rate": 4.713617767387494e-06, | |
| "loss": 32.1678, | |
| "step": 290, | |
| "true_loss": 3.9949 | |
| }, | |
| { | |
| "epoch": 0.4734661668968238, | |
| "grad_norm": 77.14726547694464, | |
| "learning_rate": 4.684395090590298e-06, | |
| "loss": 32.2324, | |
| "step": 300, | |
| "true_loss": 4.1541 | |
| }, | |
| { | |
| "epoch": 0.4734661668968238, | |
| "eval_accuracy": 0.07206208425720621, | |
| "eval_loss": 3.944307804107666, | |
| "eval_runtime": 24.0666, | |
| "eval_samples_per_second": 37.479, | |
| "eval_steps_per_second": 4.695, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4892483724600513, | |
| "grad_norm": 74.39357075334843, | |
| "learning_rate": 4.655172413793104e-06, | |
| "loss": 32.7538, | |
| "step": 310, | |
| "true_loss": 3.9691 | |
| }, | |
| { | |
| "epoch": 0.5050305780232788, | |
| "grad_norm": 69.2140729496397, | |
| "learning_rate": 4.625949736995909e-06, | |
| "loss": 32.241, | |
| "step": 320, | |
| "true_loss": 4.1026 | |
| }, | |
| { | |
| "epoch": 0.5208127835865062, | |
| "grad_norm": 76.34039893901215, | |
| "learning_rate": 4.596727060198715e-06, | |
| "loss": 32.7047, | |
| "step": 330, | |
| "true_loss": 4.2664 | |
| }, | |
| { | |
| "epoch": 0.5365949891497337, | |
| "grad_norm": 76.48584051124504, | |
| "learning_rate": 4.56750438340152e-06, | |
| "loss": 32.2899, | |
| "step": 340, | |
| "true_loss": 4.1581 | |
| }, | |
| { | |
| "epoch": 0.5523771947129611, | |
| "grad_norm": 75.38836139042635, | |
| "learning_rate": 4.5382817066043256e-06, | |
| "loss": 32.8474, | |
| "step": 350, | |
| "true_loss": 4.0243 | |
| }, | |
| { | |
| "epoch": 0.5523771947129611, | |
| "eval_accuracy": 0.07317073170731707, | |
| "eval_loss": 3.8937127590179443, | |
| "eval_runtime": 23.6141, | |
| "eval_samples_per_second": 38.198, | |
| "eval_steps_per_second": 4.785, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.5681594002761886, | |
| "grad_norm": 85.00115390444991, | |
| "learning_rate": 4.509059029807131e-06, | |
| "loss": 31.7532, | |
| "step": 360, | |
| "true_loss": 3.988 | |
| }, | |
| { | |
| "epoch": 0.583941605839416, | |
| "grad_norm": 76.08205167618928, | |
| "learning_rate": 4.479836353009936e-06, | |
| "loss": 31.7753, | |
| "step": 370, | |
| "true_loss": 3.9777 | |
| }, | |
| { | |
| "epoch": 0.5997238114026435, | |
| "grad_norm": 68.74246027512453, | |
| "learning_rate": 4.450613676212742e-06, | |
| "loss": 31.4578, | |
| "step": 380, | |
| "true_loss": 3.9916 | |
| }, | |
| { | |
| "epoch": 0.6155060169658709, | |
| "grad_norm": 71.81332443942438, | |
| "learning_rate": 4.4213909994155465e-06, | |
| "loss": 32.0656, | |
| "step": 390, | |
| "true_loss": 4.1254 | |
| }, | |
| { | |
| "epoch": 0.6312882225290984, | |
| "grad_norm": 71.77472037885094, | |
| "learning_rate": 4.392168322618352e-06, | |
| "loss": 32.2929, | |
| "step": 400, | |
| "true_loss": 3.8869 | |
| }, | |
| { | |
| "epoch": 0.6312882225290984, | |
| "eval_accuracy": 0.1130820399113082, | |
| "eval_loss": 3.7887723445892334, | |
| "eval_runtime": 23.904, | |
| "eval_samples_per_second": 37.734, | |
| "eval_steps_per_second": 4.727, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.647070428092326, | |
| "grad_norm": 71.4147738866297, | |
| "learning_rate": 4.3629456458211574e-06, | |
| "loss": 31.9759, | |
| "step": 410, | |
| "true_loss": 3.9893 | |
| }, | |
| { | |
| "epoch": 0.6628526336555534, | |
| "grad_norm": 73.99693807240743, | |
| "learning_rate": 4.333722969023963e-06, | |
| "loss": 31.8449, | |
| "step": 420, | |
| "true_loss": 3.9648 | |
| }, | |
| { | |
| "epoch": 0.6786348392187809, | |
| "grad_norm": 74.124412349921, | |
| "learning_rate": 4.304500292226768e-06, | |
| "loss": 31.9984, | |
| "step": 430, | |
| "true_loss": 3.8608 | |
| }, | |
| { | |
| "epoch": 0.6944170447820083, | |
| "grad_norm": 80.24673254296576, | |
| "learning_rate": 4.275277615429574e-06, | |
| "loss": 31.8185, | |
| "step": 440, | |
| "true_loss": 4.033 | |
| }, | |
| { | |
| "epoch": 0.7101992503452358, | |
| "grad_norm": 83.65131551190922, | |
| "learning_rate": 4.246054938632379e-06, | |
| "loss": 31.7604, | |
| "step": 450, | |
| "true_loss": 3.9851 | |
| }, | |
| { | |
| "epoch": 0.7101992503452358, | |
| "eval_accuracy": 0.11862527716186252, | |
| "eval_loss": 3.731326103210449, | |
| "eval_runtime": 23.8861, | |
| "eval_samples_per_second": 37.762, | |
| "eval_steps_per_second": 4.731, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.7259814559084632, | |
| "grad_norm": 82.91785549569512, | |
| "learning_rate": 4.216832261835184e-06, | |
| "loss": 31.9212, | |
| "step": 460, | |
| "true_loss": 4.0262 | |
| }, | |
| { | |
| "epoch": 0.7417636614716907, | |
| "grad_norm": 77.3700837190592, | |
| "learning_rate": 4.18760958503799e-06, | |
| "loss": 31.558, | |
| "step": 470, | |
| "true_loss": 3.9037 | |
| }, | |
| { | |
| "epoch": 0.7575458670349181, | |
| "grad_norm": 79.2567711758356, | |
| "learning_rate": 4.158386908240795e-06, | |
| "loss": 31.9821, | |
| "step": 480, | |
| "true_loss": 3.9674 | |
| }, | |
| { | |
| "epoch": 0.7733280725981456, | |
| "grad_norm": 79.15209026205977, | |
| "learning_rate": 4.1291642314436e-06, | |
| "loss": 31.6287, | |
| "step": 490, | |
| "true_loss": 4.0236 | |
| }, | |
| { | |
| "epoch": 0.789110278161373, | |
| "grad_norm": 82.91621808879624, | |
| "learning_rate": 4.0999415546464065e-06, | |
| "loss": 31.1936, | |
| "step": 500, | |
| "true_loss": 3.8798 | |
| }, | |
| { | |
| "epoch": 0.789110278161373, | |
| "eval_accuracy": 0.1419068736141907, | |
| "eval_loss": 3.6688694953918457, | |
| "eval_runtime": 23.5535, | |
| "eval_samples_per_second": 38.296, | |
| "eval_steps_per_second": 4.798, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8048924837246005, | |
| "grad_norm": 81.33416413304444, | |
| "learning_rate": 4.070718877849211e-06, | |
| "loss": 31.9352, | |
| "step": 510, | |
| "true_loss": 3.9601 | |
| }, | |
| { | |
| "epoch": 0.8206746892878279, | |
| "grad_norm": 84.38809367146038, | |
| "learning_rate": 4.0414962010520166e-06, | |
| "loss": 31.4177, | |
| "step": 520, | |
| "true_loss": 4.0845 | |
| }, | |
| { | |
| "epoch": 0.8364568948510555, | |
| "grad_norm": 71.30053276240011, | |
| "learning_rate": 4.012273524254822e-06, | |
| "loss": 31.7603, | |
| "step": 530, | |
| "true_loss": 3.997 | |
| }, | |
| { | |
| "epoch": 0.8522391004142829, | |
| "grad_norm": 80.60566610130194, | |
| "learning_rate": 3.9830508474576275e-06, | |
| "loss": 31.2764, | |
| "step": 540, | |
| "true_loss": 3.8168 | |
| }, | |
| { | |
| "epoch": 0.8680213059775104, | |
| "grad_norm": 91.53898913592603, | |
| "learning_rate": 3.953828170660433e-06, | |
| "loss": 31.4915, | |
| "step": 550, | |
| "true_loss": 3.9316 | |
| }, | |
| { | |
| "epoch": 0.8680213059775104, | |
| "eval_accuracy": 0.1419068736141907, | |
| "eval_loss": 3.6105031967163086, | |
| "eval_runtime": 23.4993, | |
| "eval_samples_per_second": 38.384, | |
| "eval_steps_per_second": 4.809, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.8838035115407378, | |
| "grad_norm": 80.87596019476308, | |
| "learning_rate": 3.924605493863238e-06, | |
| "loss": 31.1347, | |
| "step": 560, | |
| "true_loss": 3.7968 | |
| }, | |
| { | |
| "epoch": 0.8995857171039653, | |
| "grad_norm": 80.68070986181071, | |
| "learning_rate": 3.895382817066044e-06, | |
| "loss": 31.7808, | |
| "step": 570, | |
| "true_loss": 3.9588 | |
| }, | |
| { | |
| "epoch": 0.9153679226671927, | |
| "grad_norm": 82.14608732630818, | |
| "learning_rate": 3.8661601402688484e-06, | |
| "loss": 31.0608, | |
| "step": 580, | |
| "true_loss": 3.9779 | |
| }, | |
| { | |
| "epoch": 0.9311501282304202, | |
| "grad_norm": 79.11284293573352, | |
| "learning_rate": 3.836937463471655e-06, | |
| "loss": 30.9633, | |
| "step": 590, | |
| "true_loss": 4.0027 | |
| }, | |
| { | |
| "epoch": 0.9469323337936476, | |
| "grad_norm": 74.83038134855214, | |
| "learning_rate": 3.8077147866744598e-06, | |
| "loss": 31.2218, | |
| "step": 600, | |
| "true_loss": 3.8863 | |
| }, | |
| { | |
| "epoch": 0.9469323337936476, | |
| "eval_accuracy": 0.1419068736141907, | |
| "eval_loss": 3.6132075786590576, | |
| "eval_runtime": 24.0285, | |
| "eval_samples_per_second": 37.539, | |
| "eval_steps_per_second": 4.703, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.9627145393568751, | |
| "grad_norm": 77.6288128447819, | |
| "learning_rate": 3.7784921098772652e-06, | |
| "loss": 31.104, | |
| "step": 610, | |
| "true_loss": 3.8553 | |
| }, | |
| { | |
| "epoch": 0.9784967449201026, | |
| "grad_norm": 78.73674435660126, | |
| "learning_rate": 3.7492694330800707e-06, | |
| "loss": 31.4592, | |
| "step": 620, | |
| "true_loss": 3.8146 | |
| }, | |
| { | |
| "epoch": 0.99427895048333, | |
| "grad_norm": 84.19142784542193, | |
| "learning_rate": 3.7200467562828757e-06, | |
| "loss": 31.4694, | |
| "step": 630, | |
| "true_loss": 3.8633 | |
| }, | |
| { | |
| "epoch": 1.0094693233379364, | |
| "grad_norm": 83.97120538549059, | |
| "learning_rate": 3.6908240794856816e-06, | |
| "loss": 29.9569, | |
| "step": 640, | |
| "true_loss": 3.9506 | |
| }, | |
| { | |
| "epoch": 1.0252515289011639, | |
| "grad_norm": 78.03329090311782, | |
| "learning_rate": 3.6616014026884866e-06, | |
| "loss": 30.6145, | |
| "step": 650, | |
| "true_loss": 3.8972 | |
| }, | |
| { | |
| "epoch": 1.0252515289011639, | |
| "eval_accuracy": 0.13303769401330376, | |
| "eval_loss": 3.5806682109832764, | |
| "eval_runtime": 23.4297, | |
| "eval_samples_per_second": 38.498, | |
| "eval_steps_per_second": 4.823, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.0410337344643914, | |
| "grad_norm": 90.48371602841107, | |
| "learning_rate": 3.6323787258912916e-06, | |
| "loss": 30.3281, | |
| "step": 660, | |
| "true_loss": 4.0951 | |
| }, | |
| { | |
| "epoch": 1.056815940027619, | |
| "grad_norm": 80.37129983569977, | |
| "learning_rate": 3.6031560490940975e-06, | |
| "loss": 30.4516, | |
| "step": 670, | |
| "true_loss": 4.0298 | |
| }, | |
| { | |
| "epoch": 1.0725981455908462, | |
| "grad_norm": 89.0421974632656, | |
| "learning_rate": 3.5739333722969025e-06, | |
| "loss": 30.5053, | |
| "step": 680, | |
| "true_loss": 3.838 | |
| }, | |
| { | |
| "epoch": 1.0883803511540737, | |
| "grad_norm": 87.6238382263665, | |
| "learning_rate": 3.544710695499708e-06, | |
| "loss": 30.1458, | |
| "step": 690, | |
| "true_loss": 3.4672 | |
| }, | |
| { | |
| "epoch": 1.1041625567173012, | |
| "grad_norm": 93.33199336725635, | |
| "learning_rate": 3.5154880187025135e-06, | |
| "loss": 29.9857, | |
| "step": 700, | |
| "true_loss": 3.8479 | |
| }, | |
| { | |
| "epoch": 1.1041625567173012, | |
| "eval_accuracy": 0.1574279379157428, | |
| "eval_loss": 3.5641186237335205, | |
| "eval_runtime": 23.6052, | |
| "eval_samples_per_second": 38.212, | |
| "eval_steps_per_second": 4.787, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.1199447622805287, | |
| "grad_norm": 92.22724838108498, | |
| "learning_rate": 3.486265341905319e-06, | |
| "loss": 29.8985, | |
| "step": 710, | |
| "true_loss": 3.6761 | |
| }, | |
| { | |
| "epoch": 1.1357269678437563, | |
| "grad_norm": 84.13971221580583, | |
| "learning_rate": 3.457042665108124e-06, | |
| "loss": 30.7157, | |
| "step": 720, | |
| "true_loss": 3.6518 | |
| }, | |
| { | |
| "epoch": 1.1515091734069836, | |
| "grad_norm": 85.8960190061776, | |
| "learning_rate": 3.42781998831093e-06, | |
| "loss": 30.2104, | |
| "step": 730, | |
| "true_loss": 3.7526 | |
| }, | |
| { | |
| "epoch": 1.167291378970211, | |
| "grad_norm": 86.69215126012148, | |
| "learning_rate": 3.398597311513735e-06, | |
| "loss": 29.405, | |
| "step": 740, | |
| "true_loss": 4.0226 | |
| }, | |
| { | |
| "epoch": 1.1830735845334386, | |
| "grad_norm": 87.86033437742292, | |
| "learning_rate": 3.3693746347165403e-06, | |
| "loss": 30.511, | |
| "step": 750, | |
| "true_loss": 3.7897 | |
| }, | |
| { | |
| "epoch": 1.1830735845334386, | |
| "eval_accuracy": 0.14523281596452328, | |
| "eval_loss": 3.493298292160034, | |
| "eval_runtime": 23.8274, | |
| "eval_samples_per_second": 37.856, | |
| "eval_steps_per_second": 4.742, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.198855790096666, | |
| "grad_norm": 111.93866332819323, | |
| "learning_rate": 3.3401519579193458e-06, | |
| "loss": 29.8573, | |
| "step": 760, | |
| "true_loss": 3.7307 | |
| }, | |
| { | |
| "epoch": 1.2146379956598934, | |
| "grad_norm": 78.19039410059621, | |
| "learning_rate": 3.310929281122151e-06, | |
| "loss": 30.2595, | |
| "step": 770, | |
| "true_loss": 3.7585 | |
| }, | |
| { | |
| "epoch": 1.230420201223121, | |
| "grad_norm": 136.24794475282891, | |
| "learning_rate": 3.2817066043249562e-06, | |
| "loss": 30.1392, | |
| "step": 780, | |
| "true_loss": 3.6031 | |
| }, | |
| { | |
| "epoch": 1.2462024067863484, | |
| "grad_norm": 83.26617285476617, | |
| "learning_rate": 3.252483927527762e-06, | |
| "loss": 29.7774, | |
| "step": 790, | |
| "true_loss": 3.8831 | |
| }, | |
| { | |
| "epoch": 1.261984612349576, | |
| "grad_norm": 82.12832696286748, | |
| "learning_rate": 3.223261250730567e-06, | |
| "loss": 30.2729, | |
| "step": 800, | |
| "true_loss": 3.7239 | |
| }, | |
| { | |
| "epoch": 1.261984612349576, | |
| "eval_accuracy": 0.15964523281596452, | |
| "eval_loss": 3.5300114154815674, | |
| "eval_runtime": 23.4485, | |
| "eval_samples_per_second": 38.467, | |
| "eval_steps_per_second": 4.819, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.2777668179128034, | |
| "grad_norm": 90.3512257461839, | |
| "learning_rate": 3.194038573933372e-06, | |
| "loss": 30.1961, | |
| "step": 810, | |
| "true_loss": 3.784 | |
| }, | |
| { | |
| "epoch": 1.2935490234760307, | |
| "grad_norm": 84.55475078707286, | |
| "learning_rate": 3.164815897136178e-06, | |
| "loss": 29.3181, | |
| "step": 820, | |
| "true_loss": 3.5114 | |
| }, | |
| { | |
| "epoch": 1.3093312290392582, | |
| "grad_norm": 89.4992221052368, | |
| "learning_rate": 3.135593220338983e-06, | |
| "loss": 30.2988, | |
| "step": 830, | |
| "true_loss": 3.7831 | |
| }, | |
| { | |
| "epoch": 1.3251134346024858, | |
| "grad_norm": 92.43123125189919, | |
| "learning_rate": 3.1063705435417885e-06, | |
| "loss": 30.0873, | |
| "step": 840, | |
| "true_loss": 3.8634 | |
| }, | |
| { | |
| "epoch": 1.340895640165713, | |
| "grad_norm": 89.04832868320074, | |
| "learning_rate": 3.0771478667445944e-06, | |
| "loss": 30.2665, | |
| "step": 850, | |
| "true_loss": 3.8983 | |
| }, | |
| { | |
| "epoch": 1.340895640165713, | |
| "eval_accuracy": 0.16851441241685144, | |
| "eval_loss": 3.462191343307495, | |
| "eval_runtime": 23.8924, | |
| "eval_samples_per_second": 37.753, | |
| "eval_steps_per_second": 4.73, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.3566778457289406, | |
| "grad_norm": 92.16494941680725, | |
| "learning_rate": 3.0479251899473994e-06, | |
| "loss": 29.7738, | |
| "step": 860, | |
| "true_loss": 4.0234 | |
| }, | |
| { | |
| "epoch": 1.372460051292168, | |
| "grad_norm": 85.57074344764075, | |
| "learning_rate": 3.0187025131502045e-06, | |
| "loss": 30.0651, | |
| "step": 870, | |
| "true_loss": 3.7162 | |
| }, | |
| { | |
| "epoch": 1.3882422568553956, | |
| "grad_norm": 97.12213474754635, | |
| "learning_rate": 2.9894798363530103e-06, | |
| "loss": 30.0534, | |
| "step": 880, | |
| "true_loss": 3.969 | |
| }, | |
| { | |
| "epoch": 1.404024462418623, | |
| "grad_norm": 85.14532114581857, | |
| "learning_rate": 2.9602571595558154e-06, | |
| "loss": 30.1717, | |
| "step": 890, | |
| "true_loss": 3.8654 | |
| }, | |
| { | |
| "epoch": 1.4198066679818504, | |
| "grad_norm": 89.01307343902022, | |
| "learning_rate": 2.931034482758621e-06, | |
| "loss": 30.2877, | |
| "step": 900, | |
| "true_loss": 3.6224 | |
| }, | |
| { | |
| "epoch": 1.4198066679818504, | |
| "eval_accuracy": 0.16186252771618626, | |
| "eval_loss": 3.4574382305145264, | |
| "eval_runtime": 23.3454, | |
| "eval_samples_per_second": 38.637, | |
| "eval_steps_per_second": 4.84, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.435588873545078, | |
| "grad_norm": 87.24388793474637, | |
| "learning_rate": 2.9018118059614263e-06, | |
| "loss": 30.8903, | |
| "step": 910, | |
| "true_loss": 3.8262 | |
| }, | |
| { | |
| "epoch": 1.4513710791083054, | |
| "grad_norm": 84.04918766102597, | |
| "learning_rate": 2.8725891291642317e-06, | |
| "loss": 29.8196, | |
| "step": 920, | |
| "true_loss": 3.918 | |
| }, | |
| { | |
| "epoch": 1.4671532846715327, | |
| "grad_norm": 83.16951966482195, | |
| "learning_rate": 2.8433664523670368e-06, | |
| "loss": 29.6097, | |
| "step": 930, | |
| "true_loss": 3.4664 | |
| }, | |
| { | |
| "epoch": 1.4829354902347602, | |
| "grad_norm": 96.29394268139959, | |
| "learning_rate": 2.8141437755698426e-06, | |
| "loss": 29.7147, | |
| "step": 940, | |
| "true_loss": 3.6577 | |
| }, | |
| { | |
| "epoch": 1.4987176957979877, | |
| "grad_norm": 101.35324066409235, | |
| "learning_rate": 2.7849210987726477e-06, | |
| "loss": 29.4593, | |
| "step": 950, | |
| "true_loss": 3.6391 | |
| }, | |
| { | |
| "epoch": 1.4987176957979877, | |
| "eval_accuracy": 0.15188470066518847, | |
| "eval_loss": 3.448209762573242, | |
| "eval_runtime": 23.74, | |
| "eval_samples_per_second": 37.995, | |
| "eval_steps_per_second": 4.76, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.5144999013612153, | |
| "grad_norm": 91.01958083657513, | |
| "learning_rate": 2.7556984219754535e-06, | |
| "loss": 29.4563, | |
| "step": 960, | |
| "true_loss": 3.5778 | |
| }, | |
| { | |
| "epoch": 1.5302821069244428, | |
| "grad_norm": 94.87158519297142, | |
| "learning_rate": 2.7264757451782586e-06, | |
| "loss": 29.8641, | |
| "step": 970, | |
| "true_loss": 3.9124 | |
| }, | |
| { | |
| "epoch": 1.5460643124876703, | |
| "grad_norm": 84.01959219280997, | |
| "learning_rate": 2.697253068381064e-06, | |
| "loss": 29.8265, | |
| "step": 980, | |
| "true_loss": 3.5047 | |
| }, | |
| { | |
| "epoch": 1.5618465180508976, | |
| "grad_norm": 91.64799353240991, | |
| "learning_rate": 2.6680303915838695e-06, | |
| "loss": 29.858, | |
| "step": 990, | |
| "true_loss": 3.701 | |
| }, | |
| { | |
| "epoch": 1.577628723614125, | |
| "grad_norm": 88.05009850532113, | |
| "learning_rate": 2.638807714786675e-06, | |
| "loss": 29.9069, | |
| "step": 1000, | |
| "true_loss": 3.6317 | |
| }, | |
| { | |
| "epoch": 1.577628723614125, | |
| "eval_accuracy": 0.14412416851441243, | |
| "eval_loss": 3.4386119842529297, | |
| "eval_runtime": 24.0921, | |
| "eval_samples_per_second": 37.44, | |
| "eval_steps_per_second": 4.69, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.5934109291773524, | |
| "grad_norm": 88.61468815912133, | |
| "learning_rate": 2.60958503798948e-06, | |
| "loss": 29.7709, | |
| "step": 1010, | |
| "true_loss": 3.6406 | |
| }, | |
| { | |
| "epoch": 1.60919313474058, | |
| "grad_norm": 85.25549864190619, | |
| "learning_rate": 2.580362361192286e-06, | |
| "loss": 29.8656, | |
| "step": 1020, | |
| "true_loss": 4.0205 | |
| }, | |
| { | |
| "epoch": 1.6249753403038074, | |
| "grad_norm": 95.17123739206981, | |
| "learning_rate": 2.551139684395091e-06, | |
| "loss": 29.3146, | |
| "step": 1030, | |
| "true_loss": 3.7517 | |
| }, | |
| { | |
| "epoch": 1.640757545867035, | |
| "grad_norm": 94.05266971651754, | |
| "learning_rate": 2.521917007597896e-06, | |
| "loss": 29.5985, | |
| "step": 1040, | |
| "true_loss": 3.8767 | |
| }, | |
| { | |
| "epoch": 1.6565397514302624, | |
| "grad_norm": 84.23330948930467, | |
| "learning_rate": 2.4926943308007014e-06, | |
| "loss": 29.6116, | |
| "step": 1050, | |
| "true_loss": 3.6806 | |
| }, | |
| { | |
| "epoch": 1.6565397514302624, | |
| "eval_accuracy": 0.1574279379157428, | |
| "eval_loss": 3.4331889152526855, | |
| "eval_runtime": 24.0418, | |
| "eval_samples_per_second": 37.518, | |
| "eval_steps_per_second": 4.7, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.67232195699349, | |
| "grad_norm": 95.80967931580453, | |
| "learning_rate": 2.463471654003507e-06, | |
| "loss": 29.4546, | |
| "step": 1060, | |
| "true_loss": 3.5296 | |
| }, | |
| { | |
| "epoch": 1.6881041625567172, | |
| "grad_norm": 89.21975858244956, | |
| "learning_rate": 2.4342489772063123e-06, | |
| "loss": 29.4728, | |
| "step": 1070, | |
| "true_loss": 3.8411 | |
| }, | |
| { | |
| "epoch": 1.7038863681199448, | |
| "grad_norm": 93.36233191510571, | |
| "learning_rate": 2.4050263004091177e-06, | |
| "loss": 29.5254, | |
| "step": 1080, | |
| "true_loss": 3.7742 | |
| }, | |
| { | |
| "epoch": 1.7196685736831723, | |
| "grad_norm": 107.3636400374849, | |
| "learning_rate": 2.375803623611923e-06, | |
| "loss": 29.5684, | |
| "step": 1090, | |
| "true_loss": 3.6389 | |
| }, | |
| { | |
| "epoch": 1.7354507792463996, | |
| "grad_norm": 1304.6259656582167, | |
| "learning_rate": 2.3465809468147286e-06, | |
| "loss": 29.638, | |
| "step": 1100, | |
| "true_loss": 3.7105 | |
| }, | |
| { | |
| "epoch": 1.7354507792463996, | |
| "eval_accuracy": 0.16518847006651885, | |
| "eval_loss": 3.4311046600341797, | |
| "eval_runtime": 23.6704, | |
| "eval_samples_per_second": 38.107, | |
| "eval_steps_per_second": 4.774, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.751232984809627, | |
| "grad_norm": 96.2130469861494, | |
| "learning_rate": 2.3173582700175337e-06, | |
| "loss": 29.3766, | |
| "step": 1110, | |
| "true_loss": 3.67 | |
| }, | |
| { | |
| "epoch": 1.7670151903728546, | |
| "grad_norm": 95.92673596052384, | |
| "learning_rate": 2.288135593220339e-06, | |
| "loss": 29.3919, | |
| "step": 1120, | |
| "true_loss": 3.8554 | |
| }, | |
| { | |
| "epoch": 1.782797395936082, | |
| "grad_norm": 88.05948564467495, | |
| "learning_rate": 2.2589129164231446e-06, | |
| "loss": 29.2821, | |
| "step": 1130, | |
| "true_loss": 3.7715 | |
| }, | |
| { | |
| "epoch": 1.7985796014993096, | |
| "grad_norm": 90.55869365935298, | |
| "learning_rate": 2.22969023962595e-06, | |
| "loss": 29.7986, | |
| "step": 1140, | |
| "true_loss": 3.6651 | |
| }, | |
| { | |
| "epoch": 1.8143618070625371, | |
| "grad_norm": 94.20405819159555, | |
| "learning_rate": 2.2004675628287555e-06, | |
| "loss": 29.8496, | |
| "step": 1150, | |
| "true_loss": 3.7782 | |
| }, | |
| { | |
| "epoch": 1.8143618070625371, | |
| "eval_accuracy": 0.16186252771618626, | |
| "eval_loss": 3.4140689373016357, | |
| "eval_runtime": 24.0501, | |
| "eval_samples_per_second": 37.505, | |
| "eval_steps_per_second": 4.699, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.8301440126257644, | |
| "grad_norm": 87.96745013401917, | |
| "learning_rate": 2.171244886031561e-06, | |
| "loss": 29.5695, | |
| "step": 1160, | |
| "true_loss": 3.8398 | |
| }, | |
| { | |
| "epoch": 1.845926218188992, | |
| "grad_norm": 92.10246815773178, | |
| "learning_rate": 2.142022209234366e-06, | |
| "loss": 29.8099, | |
| "step": 1170, | |
| "true_loss": 3.9006 | |
| }, | |
| { | |
| "epoch": 1.8617084237522192, | |
| "grad_norm": 83.12415816117908, | |
| "learning_rate": 2.1127995324371714e-06, | |
| "loss": 30.2325, | |
| "step": 1180, | |
| "true_loss": 3.623 | |
| }, | |
| { | |
| "epoch": 1.8774906293154467, | |
| "grad_norm": 107.24249122552868, | |
| "learning_rate": 2.083576855639977e-06, | |
| "loss": 29.8222, | |
| "step": 1190, | |
| "true_loss": 3.5215 | |
| }, | |
| { | |
| "epoch": 1.8932728348786743, | |
| "grad_norm": 91.4088303372265, | |
| "learning_rate": 2.054354178842782e-06, | |
| "loss": 29.498, | |
| "step": 1200, | |
| "true_loss": 3.653 | |
| }, | |
| { | |
| "epoch": 1.8932728348786743, | |
| "eval_accuracy": 0.16851441241685144, | |
| "eval_loss": 3.4037177562713623, | |
| "eval_runtime": 23.8616, | |
| "eval_samples_per_second": 37.801, | |
| "eval_steps_per_second": 4.736, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.9090550404419018, | |
| "grad_norm": 100.69059595068175, | |
| "learning_rate": 2.0251315020455873e-06, | |
| "loss": 29.1097, | |
| "step": 1210, | |
| "true_loss": 3.5445 | |
| }, | |
| { | |
| "epoch": 1.9248372460051293, | |
| "grad_norm": 96.05062058766768, | |
| "learning_rate": 1.995908825248393e-06, | |
| "loss": 29.5919, | |
| "step": 1220, | |
| "true_loss": 3.6302 | |
| }, | |
| { | |
| "epoch": 1.9406194515683568, | |
| "grad_norm": 89.1551204065743, | |
| "learning_rate": 1.9666861484511982e-06, | |
| "loss": 29.0917, | |
| "step": 1230, | |
| "true_loss": 3.5472 | |
| }, | |
| { | |
| "epoch": 1.956401657131584, | |
| "grad_norm": 101.09204426369772, | |
| "learning_rate": 1.9374634716540037e-06, | |
| "loss": 28.8413, | |
| "step": 1240, | |
| "true_loss": 3.8083 | |
| }, | |
| { | |
| "epoch": 1.9721838626948116, | |
| "grad_norm": 105.57133293543227, | |
| "learning_rate": 1.908240794856809e-06, | |
| "loss": 29.533, | |
| "step": 1250, | |
| "true_loss": 3.7014 | |
| }, | |
| { | |
| "epoch": 1.9721838626948116, | |
| "eval_accuracy": 0.17516629711751663, | |
| "eval_loss": 3.385300397872925, | |
| "eval_runtime": 23.5478, | |
| "eval_samples_per_second": 38.305, | |
| "eval_steps_per_second": 4.799, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.987966068258039, | |
| "grad_norm": 96.01274952042326, | |
| "learning_rate": 1.8790181180596146e-06, | |
| "loss": 29.2844, | |
| "step": 1260, | |
| "true_loss": 3.8112 | |
| }, | |
| { | |
| "epoch": 2.0031564411126457, | |
| "grad_norm": 94.04158733000145, | |
| "learning_rate": 1.8497954412624196e-06, | |
| "loss": 27.161, | |
| "step": 1270, | |
| "true_loss": 3.399 | |
| }, | |
| { | |
| "epoch": 2.0189386466758728, | |
| "grad_norm": 113.00306213639786, | |
| "learning_rate": 1.820572764465225e-06, | |
| "loss": 27.5121, | |
| "step": 1280, | |
| "true_loss": 3.5377 | |
| }, | |
| { | |
| "epoch": 2.0347208522391003, | |
| "grad_norm": 128.85739550143202, | |
| "learning_rate": 1.7913500876680305e-06, | |
| "loss": 27.3831, | |
| "step": 1290, | |
| "true_loss": 3.3378 | |
| }, | |
| { | |
| "epoch": 2.0505030578023278, | |
| "grad_norm": 135.357215369742, | |
| "learning_rate": 1.7621274108708358e-06, | |
| "loss": 28.0219, | |
| "step": 1300, | |
| "true_loss": 3.6161 | |
| }, | |
| { | |
| "epoch": 2.0505030578023278, | |
| "eval_accuracy": 0.17849223946784923, | |
| "eval_loss": 3.400832414627075, | |
| "eval_runtime": 24.2954, | |
| "eval_samples_per_second": 37.126, | |
| "eval_steps_per_second": 4.651, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.0662852633655553, | |
| "grad_norm": 133.18820676570508, | |
| "learning_rate": 1.7329047340736412e-06, | |
| "loss": 27.7807, | |
| "step": 1310, | |
| "true_loss": 3.5333 | |
| }, | |
| { | |
| "epoch": 2.082067468928783, | |
| "grad_norm": 132.12454176042834, | |
| "learning_rate": 1.7036820572764467e-06, | |
| "loss": 27.6285, | |
| "step": 1320, | |
| "true_loss": 3.122 | |
| }, | |
| { | |
| "epoch": 2.0978496744920103, | |
| "grad_norm": 121.63901914437486, | |
| "learning_rate": 1.674459380479252e-06, | |
| "loss": 27.2663, | |
| "step": 1330, | |
| "true_loss": 3.2256 | |
| }, | |
| { | |
| "epoch": 2.113631880055238, | |
| "grad_norm": 137.06803144579217, | |
| "learning_rate": 1.6452367036820574e-06, | |
| "loss": 27.6847, | |
| "step": 1340, | |
| "true_loss": 3.3579 | |
| }, | |
| { | |
| "epoch": 2.1294140856184653, | |
| "grad_norm": 127.41062741726843, | |
| "learning_rate": 1.6160140268848628e-06, | |
| "loss": 27.7085, | |
| "step": 1350, | |
| "true_loss": 3.45 | |
| }, | |
| { | |
| "epoch": 2.1294140856184653, | |
| "eval_accuracy": 0.18070953436807094, | |
| "eval_loss": 3.376422643661499, | |
| "eval_runtime": 23.8791, | |
| "eval_samples_per_second": 37.774, | |
| "eval_steps_per_second": 4.732, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.1451962911816924, | |
| "grad_norm": 121.2757103113971, | |
| "learning_rate": 1.586791350087668e-06, | |
| "loss": 27.6917, | |
| "step": 1360, | |
| "true_loss": 3.3579 | |
| }, | |
| { | |
| "epoch": 2.16097849674492, | |
| "grad_norm": 128.57906764071424, | |
| "learning_rate": 1.5575686732904735e-06, | |
| "loss": 26.6196, | |
| "step": 1370, | |
| "true_loss": 3.2661 | |
| }, | |
| { | |
| "epoch": 2.1767607023081474, | |
| "grad_norm": 144.7381021633525, | |
| "learning_rate": 1.528345996493279e-06, | |
| "loss": 28.1214, | |
| "step": 1380, | |
| "true_loss": 3.5539 | |
| }, | |
| { | |
| "epoch": 2.192542907871375, | |
| "grad_norm": 145.5114518133331, | |
| "learning_rate": 1.4991233196960842e-06, | |
| "loss": 27.5329, | |
| "step": 1390, | |
| "true_loss": 3.5611 | |
| }, | |
| { | |
| "epoch": 2.2083251134346025, | |
| "grad_norm": 137.75864220522647, | |
| "learning_rate": 1.4699006428988897e-06, | |
| "loss": 27.6466, | |
| "step": 1400, | |
| "true_loss": 3.4073 | |
| }, | |
| { | |
| "epoch": 2.2083251134346025, | |
| "eval_accuracy": 0.18514412416851442, | |
| "eval_loss": 3.3832011222839355, | |
| "eval_runtime": 23.5091, | |
| "eval_samples_per_second": 38.368, | |
| "eval_steps_per_second": 4.807, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.22410731899783, | |
| "grad_norm": 132.61505430150723, | |
| "learning_rate": 1.4406779661016951e-06, | |
| "loss": 27.1518, | |
| "step": 1410, | |
| "true_loss": 3.4681 | |
| }, | |
| { | |
| "epoch": 2.2398895245610575, | |
| "grad_norm": 140.85574741341603, | |
| "learning_rate": 1.4114552893045006e-06, | |
| "loss": 27.7041, | |
| "step": 1420, | |
| "true_loss": 3.6406 | |
| }, | |
| { | |
| "epoch": 2.255671730124285, | |
| "grad_norm": 245.85007563586552, | |
| "learning_rate": 1.3822326125073058e-06, | |
| "loss": 27.6303, | |
| "step": 1430, | |
| "true_loss": 3.5352 | |
| }, | |
| { | |
| "epoch": 2.2714539356875125, | |
| "grad_norm": 577.1875021093457, | |
| "learning_rate": 1.3530099357101113e-06, | |
| "loss": 28.509, | |
| "step": 1440, | |
| "true_loss": 3.5396 | |
| }, | |
| { | |
| "epoch": 2.28723614125074, | |
| "grad_norm": 147.03022521178113, | |
| "learning_rate": 1.3237872589129167e-06, | |
| "loss": 27.6055, | |
| "step": 1450, | |
| "true_loss": 3.4989 | |
| }, | |
| { | |
| "epoch": 2.28723614125074, | |
| "eval_accuracy": 0.18514412416851442, | |
| "eval_loss": 3.4081857204437256, | |
| "eval_runtime": 24.0487, | |
| "eval_samples_per_second": 37.507, | |
| "eval_steps_per_second": 4.699, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.303018346813967, | |
| "grad_norm": 158.25616293531198, | |
| "learning_rate": 1.2945645821157218e-06, | |
| "loss": 27.6877, | |
| "step": 1460, | |
| "true_loss": 3.6034 | |
| }, | |
| { | |
| "epoch": 2.3188005523771946, | |
| "grad_norm": 155.95509000199831, | |
| "learning_rate": 1.2653419053185272e-06, | |
| "loss": 27.8664, | |
| "step": 1470, | |
| "true_loss": 3.5089 | |
| }, | |
| { | |
| "epoch": 2.334582757940422, | |
| "grad_norm": 146.83540699519045, | |
| "learning_rate": 1.2361192285213327e-06, | |
| "loss": 27.413, | |
| "step": 1480, | |
| "true_loss": 3.5116 | |
| }, | |
| { | |
| "epoch": 2.3503649635036497, | |
| "grad_norm": 141.1565994249431, | |
| "learning_rate": 1.2068965517241381e-06, | |
| "loss": 27.1606, | |
| "step": 1490, | |
| "true_loss": 3.426 | |
| }, | |
| { | |
| "epoch": 2.366147169066877, | |
| "grad_norm": 159.8393327569506, | |
| "learning_rate": 1.1776738749269434e-06, | |
| "loss": 27.7131, | |
| "step": 1500, | |
| "true_loss": 3.3148 | |
| }, | |
| { | |
| "epoch": 2.366147169066877, | |
| "eval_accuracy": 0.19623059866962306, | |
| "eval_loss": 3.372471332550049, | |
| "eval_runtime": 23.4809, | |
| "eval_samples_per_second": 38.414, | |
| "eval_steps_per_second": 4.812, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.3819293746301047, | |
| "grad_norm": 168.70353622158657, | |
| "learning_rate": 1.1484511981297488e-06, | |
| "loss": 26.9731, | |
| "step": 1510, | |
| "true_loss": 3.3077 | |
| }, | |
| { | |
| "epoch": 2.397711580193332, | |
| "grad_norm": 163.63685602929394, | |
| "learning_rate": 1.1192285213325543e-06, | |
| "loss": 27.206, | |
| "step": 1520, | |
| "true_loss": 3.2603 | |
| }, | |
| { | |
| "epoch": 2.4134937857565593, | |
| "grad_norm": 171.43412737818744, | |
| "learning_rate": 1.0900058445353595e-06, | |
| "loss": 26.3347, | |
| "step": 1530, | |
| "true_loss": 3.2123 | |
| }, | |
| { | |
| "epoch": 2.4292759913197868, | |
| "grad_norm": 167.19508362270676, | |
| "learning_rate": 1.0607831677381648e-06, | |
| "loss": 27.482, | |
| "step": 1540, | |
| "true_loss": 3.4389 | |
| }, | |
| { | |
| "epoch": 2.4450581968830143, | |
| "grad_norm": 165.6072657994525, | |
| "learning_rate": 1.0315604909409702e-06, | |
| "loss": 26.6002, | |
| "step": 1550, | |
| "true_loss": 3.4554 | |
| }, | |
| { | |
| "epoch": 2.4450581968830143, | |
| "eval_accuracy": 0.1984478935698448, | |
| "eval_loss": 3.3719024658203125, | |
| "eval_runtime": 23.4592, | |
| "eval_samples_per_second": 38.45, | |
| "eval_steps_per_second": 4.817, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.460840402446242, | |
| "grad_norm": 173.48714728794272, | |
| "learning_rate": 1.0023378141437757e-06, | |
| "loss": 27.0777, | |
| "step": 1560, | |
| "true_loss": 3.4451 | |
| }, | |
| { | |
| "epoch": 2.4766226080094693, | |
| "grad_norm": 169.41551174748028, | |
| "learning_rate": 9.731151373465811e-07, | |
| "loss": 27.1345, | |
| "step": 1570, | |
| "true_loss": 3.3625 | |
| }, | |
| { | |
| "epoch": 2.492404813572697, | |
| "grad_norm": 179.8847072240023, | |
| "learning_rate": 9.438924605493864e-07, | |
| "loss": 27.0216, | |
| "step": 1580, | |
| "true_loss": 3.2843 | |
| }, | |
| { | |
| "epoch": 2.5081870191359243, | |
| "grad_norm": 166.02108936347767, | |
| "learning_rate": 9.146697837521917e-07, | |
| "loss": 26.7401, | |
| "step": 1590, | |
| "true_loss": 3.2657 | |
| }, | |
| { | |
| "epoch": 2.523969224699152, | |
| "grad_norm": 454.3584387218693, | |
| "learning_rate": 8.854471069549972e-07, | |
| "loss": 26.1234, | |
| "step": 1600, | |
| "true_loss": 3.3808 | |
| }, | |
| { | |
| "epoch": 2.523969224699152, | |
| "eval_accuracy": 0.20288248337028825, | |
| "eval_loss": 3.380685567855835, | |
| "eval_runtime": 24.1535, | |
| "eval_samples_per_second": 37.344, | |
| "eval_steps_per_second": 4.678, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.5397514302623794, | |
| "grad_norm": 164.3488500545829, | |
| "learning_rate": 8.562244301578025e-07, | |
| "loss": 26.6329, | |
| "step": 1610, | |
| "true_loss": 3.4502 | |
| }, | |
| { | |
| "epoch": 2.555533635825607, | |
| "grad_norm": 224.71536684248224, | |
| "learning_rate": 8.270017533606079e-07, | |
| "loss": 27.4489, | |
| "step": 1620, | |
| "true_loss": 3.4381 | |
| }, | |
| { | |
| "epoch": 2.571315841388834, | |
| "grad_norm": 189.92117872676408, | |
| "learning_rate": 7.977790765634133e-07, | |
| "loss": 27.2729, | |
| "step": 1630, | |
| "true_loss": 3.3647 | |
| }, | |
| { | |
| "epoch": 2.5870980469520615, | |
| "grad_norm": 172.51933992524627, | |
| "learning_rate": 7.685563997662187e-07, | |
| "loss": 27.1534, | |
| "step": 1640, | |
| "true_loss": 3.5251 | |
| }, | |
| { | |
| "epoch": 2.602880252515289, | |
| "grad_norm": 166.6926362336406, | |
| "learning_rate": 7.393337229690241e-07, | |
| "loss": 26.9745, | |
| "step": 1650, | |
| "true_loss": 3.3921 | |
| }, | |
| { | |
| "epoch": 2.602880252515289, | |
| "eval_accuracy": 0.2006651884700665, | |
| "eval_loss": 3.3468966484069824, | |
| "eval_runtime": 23.7179, | |
| "eval_samples_per_second": 38.03, | |
| "eval_steps_per_second": 4.764, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.6186624580785165, | |
| "grad_norm": 160.91931691807488, | |
| "learning_rate": 7.101110461718295e-07, | |
| "loss": 26.7737, | |
| "step": 1660, | |
| "true_loss": 3.4577 | |
| }, | |
| { | |
| "epoch": 2.634444663641744, | |
| "grad_norm": 181.67048483012417, | |
| "learning_rate": 6.808883693746347e-07, | |
| "loss": 26.7581, | |
| "step": 1670, | |
| "true_loss": 3.2471 | |
| }, | |
| { | |
| "epoch": 2.6502268692049715, | |
| "grad_norm": 217.44579386829224, | |
| "learning_rate": 6.516656925774401e-07, | |
| "loss": 26.4481, | |
| "step": 1680, | |
| "true_loss": 3.0002 | |
| }, | |
| { | |
| "epoch": 2.6660090747681986, | |
| "grad_norm": 195.64586609651414, | |
| "learning_rate": 6.224430157802455e-07, | |
| "loss": 26.7862, | |
| "step": 1690, | |
| "true_loss": 3.1025 | |
| }, | |
| { | |
| "epoch": 2.681791280331426, | |
| "grad_norm": 179.8032160582305, | |
| "learning_rate": 5.93220338983051e-07, | |
| "loss": 27.108, | |
| "step": 1700, | |
| "true_loss": 3.1252 | |
| }, | |
| { | |
| "epoch": 2.681791280331426, | |
| "eval_accuracy": 0.2017738359201774, | |
| "eval_loss": 3.3589749336242676, | |
| "eval_runtime": 23.6074, | |
| "eval_samples_per_second": 38.208, | |
| "eval_steps_per_second": 4.787, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.6975734858946536, | |
| "grad_norm": 189.69952810255015, | |
| "learning_rate": 5.639976621858563e-07, | |
| "loss": 26.6985, | |
| "step": 1710, | |
| "true_loss": 3.3933 | |
| }, | |
| { | |
| "epoch": 2.713355691457881, | |
| "grad_norm": 242.2858845486621, | |
| "learning_rate": 5.347749853886616e-07, | |
| "loss": 26.7437, | |
| "step": 1720, | |
| "true_loss": 3.1209 | |
| }, | |
| { | |
| "epoch": 2.7291378970211086, | |
| "grad_norm": 185.37642956971033, | |
| "learning_rate": 5.05552308591467e-07, | |
| "loss": 26.7456, | |
| "step": 1730, | |
| "true_loss": 3.1121 | |
| }, | |
| { | |
| "epoch": 2.744920102584336, | |
| "grad_norm": 201.70311652816994, | |
| "learning_rate": 4.763296317942724e-07, | |
| "loss": 26.8516, | |
| "step": 1740, | |
| "true_loss": 3.397 | |
| }, | |
| { | |
| "epoch": 2.7607023081475637, | |
| "grad_norm": 181.35331376149924, | |
| "learning_rate": 4.4710695499707774e-07, | |
| "loss": 27.6208, | |
| "step": 1750, | |
| "true_loss": 3.6422 | |
| }, | |
| { | |
| "epoch": 2.7607023081475637, | |
| "eval_accuracy": 0.21951219512195122, | |
| "eval_loss": 3.3623762130737305, | |
| "eval_runtime": 23.6173, | |
| "eval_samples_per_second": 38.192, | |
| "eval_steps_per_second": 4.785, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.776484513710791, | |
| "grad_norm": 207.9779142213798, | |
| "learning_rate": 4.1788427819988314e-07, | |
| "loss": 26.674, | |
| "step": 1760, | |
| "true_loss": 3.221 | |
| }, | |
| { | |
| "epoch": 2.7922667192740187, | |
| "grad_norm": 203.7549680190773, | |
| "learning_rate": 3.8866160140268854e-07, | |
| "loss": 26.5723, | |
| "step": 1770, | |
| "true_loss": 3.2956 | |
| }, | |
| { | |
| "epoch": 2.808048924837246, | |
| "grad_norm": 196.13189611302036, | |
| "learning_rate": 3.594389246054939e-07, | |
| "loss": 26.4406, | |
| "step": 1780, | |
| "true_loss": 3.4648 | |
| }, | |
| { | |
| "epoch": 2.8238311304004737, | |
| "grad_norm": 198.30227589105047, | |
| "learning_rate": 3.3021624780829924e-07, | |
| "loss": 26.1588, | |
| "step": 1790, | |
| "true_loss": 3.2933 | |
| }, | |
| { | |
| "epoch": 2.839613335963701, | |
| "grad_norm": 199.85226446602726, | |
| "learning_rate": 3.0099357101110464e-07, | |
| "loss": 26.2669, | |
| "step": 1800, | |
| "true_loss": 3.2296 | |
| }, | |
| { | |
| "epoch": 2.839613335963701, | |
| "eval_accuracy": 0.2073170731707317, | |
| "eval_loss": 3.3512346744537354, | |
| "eval_runtime": 23.6919, | |
| "eval_samples_per_second": 38.072, | |
| "eval_steps_per_second": 4.77, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.8553955415269283, | |
| "grad_norm": 217.9595424207247, | |
| "learning_rate": 2.7177089421391004e-07, | |
| "loss": 26.0995, | |
| "step": 1810, | |
| "true_loss": 3.0267 | |
| }, | |
| { | |
| "epoch": 2.871177747090156, | |
| "grad_norm": 222.64507007990645, | |
| "learning_rate": 2.425482174167154e-07, | |
| "loss": 25.9481, | |
| "step": 1820, | |
| "true_loss": 3.4288 | |
| }, | |
| { | |
| "epoch": 2.8869599526533833, | |
| "grad_norm": 201.21510324225665, | |
| "learning_rate": 2.1332554061952078e-07, | |
| "loss": 25.9752, | |
| "step": 1830, | |
| "true_loss": 3.1737 | |
| }, | |
| { | |
| "epoch": 2.902742158216611, | |
| "grad_norm": 192.09070144987018, | |
| "learning_rate": 1.8410286382232613e-07, | |
| "loss": 26.059, | |
| "step": 1840, | |
| "true_loss": 3.024 | |
| }, | |
| { | |
| "epoch": 2.9185243637798384, | |
| "grad_norm": 940.254161469039, | |
| "learning_rate": 1.5488018702513153e-07, | |
| "loss": 25.8304, | |
| "step": 1850, | |
| "true_loss": 2.8998 | |
| }, | |
| { | |
| "epoch": 2.9185243637798384, | |
| "eval_accuracy": 0.21840354767184036, | |
| "eval_loss": 3.3478667736053467, | |
| "eval_runtime": 23.6868, | |
| "eval_samples_per_second": 38.08, | |
| "eval_steps_per_second": 4.771, | |
| "step": 1850 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1902, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |