{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9898386132695756, "eval_steps": 500, "global_step": 278, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007172743574417215, "grad_norm": 1.7706772687409793, "learning_rate": 3.5714285714285716e-07, "loss": 1.8258, "step": 1 }, { "epoch": 0.03586371787208607, "grad_norm": 1.3979460761770042, "learning_rate": 1.7857142857142859e-06, "loss": 1.7889, "step": 5 }, { "epoch": 0.07172743574417215, "grad_norm": 0.8391387150686271, "learning_rate": 3.5714285714285718e-06, "loss": 1.4752, "step": 10 }, { "epoch": 0.10759115361625822, "grad_norm": 0.29622114410594524, "learning_rate": 5.357142857142857e-06, "loss": 0.8542, "step": 15 }, { "epoch": 0.1434548714883443, "grad_norm": 0.1867024790344403, "learning_rate": 7.1428571428571436e-06, "loss": 0.5935, "step": 20 }, { "epoch": 0.17931858936043035, "grad_norm": 0.1397115741987044, "learning_rate": 8.92857142857143e-06, "loss": 0.4806, "step": 25 }, { "epoch": 0.21518230723251644, "grad_norm": 0.11207900324892817, "learning_rate": 9.9984209464165e-06, "loss": 0.3543, "step": 30 }, { "epoch": 0.2510460251046025, "grad_norm": 0.12471379118682165, "learning_rate": 9.980668045715864e-06, "loss": 0.3099, "step": 35 }, { "epoch": 0.2869097429766886, "grad_norm": 0.13942124773513623, "learning_rate": 9.94325872368957e-06, "loss": 0.2671, "step": 40 }, { "epoch": 0.3227734608487747, "grad_norm": 0.08633657901146753, "learning_rate": 9.886340617840968e-06, "loss": 0.2409, "step": 45 }, { "epoch": 0.3586371787208607, "grad_norm": 0.09525370649740562, "learning_rate": 9.81013835793043e-06, "loss": 0.2193, "step": 50 }, { "epoch": 0.3945008965929468, "grad_norm": 0.08224573044279379, "learning_rate": 9.714952679464324e-06, "loss": 0.1873, "step": 55 }, { "epoch": 0.4303646144650329, "grad_norm": 0.0960148670109882, "learning_rate": 9.601159236829353e-06, "loss": 0.1986, "step": 60 }, { "epoch": 0.46622833233711897, "grad_norm": 0.08468885213082336, "learning_rate": 9.46920712075632e-06, "loss": 0.181, "step": 65 }, { "epoch": 0.502092050209205, "grad_norm": 0.06651705476346571, "learning_rate": 9.319617085964177e-06, "loss": 0.1776, "step": 70 }, { "epoch": 0.5379557680812911, "grad_norm": 0.07914440346820356, "learning_rate": 9.152979495979064e-06, "loss": 0.1729, "step": 75 }, { "epoch": 0.5738194859533772, "grad_norm": 0.06318757579211376, "learning_rate": 8.969951993239177e-06, "loss": 0.1544, "step": 80 }, { "epoch": 0.6096832038254633, "grad_norm": 0.06806462149173399, "learning_rate": 8.77125690368052e-06, "loss": 0.1452, "step": 85 }, { "epoch": 0.6455469216975493, "grad_norm": 0.09204580355773379, "learning_rate": 8.557678386046429e-06, "loss": 0.148, "step": 90 }, { "epoch": 0.6814106395696354, "grad_norm": 0.07404375144145035, "learning_rate": 8.33005933717126e-06, "loss": 0.1537, "step": 95 }, { "epoch": 0.7172743574417214, "grad_norm": 0.06673338321385565, "learning_rate": 8.089298065451673e-06, "loss": 0.154, "step": 100 }, { "epoch": 0.7531380753138075, "grad_norm": 0.07248494023312937, "learning_rate": 7.836344745633785e-06, "loss": 0.1415, "step": 105 }, { "epoch": 0.7890017931858936, "grad_norm": 0.06250391175809607, "learning_rate": 7.572197668907533e-06, "loss": 0.132, "step": 110 }, { "epoch": 0.8248655110579797, "grad_norm": 0.056578659740463, "learning_rate": 7.297899303107441e-06, "loss": 0.112, "step": 115 }, { "epoch": 0.8607292289300658, "grad_norm": 0.06486223998598174, "learning_rate": 7.014532178568314e-06, "loss": 0.121, "step": 120 }, { "epoch": 0.8965929468021518, "grad_norm": 0.056771834577397436, "learning_rate": 6.723214615872585e-06, "loss": 0.1134, "step": 125 }, { "epoch": 0.9324566646742379, "grad_norm": 0.06319732340937169, "learning_rate": 6.425096312349881e-06, "loss": 0.1166, "step": 130 }, { "epoch": 0.968320382546324, "grad_norm": 0.0649079486366395, "learning_rate": 6.121353804746907e-06, "loss": 0.1122, "step": 135 }, { "epoch": 1.0, "grad_norm": 0.07566991126398058, "learning_rate": 5.813185825974419e-06, "loss": 0.1192, "step": 140 }, { "epoch": 1.0, "eval_loss": 0.1111445426940918, "eval_runtime": 3.972, "eval_samples_per_second": 17.12, "eval_steps_per_second": 4.28, "step": 140 }, { "epoch": 1.035863717872086, "grad_norm": 0.06962658936027795, "learning_rate": 5.5018085742560745e-06, "loss": 0.0911, "step": 145 }, { "epoch": 1.0717274357441722, "grad_norm": 0.05916257421087811, "learning_rate": 5.188450913349674e-06, "loss": 0.0905, "step": 150 }, { "epoch": 1.1075911536162582, "grad_norm": 0.06400686740251811, "learning_rate": 4.874349522783313e-06, "loss": 0.0977, "step": 155 }, { "epoch": 1.1434548714883443, "grad_norm": 0.05927545440254994, "learning_rate": 4.560744017246284e-06, "loss": 0.0834, "step": 160 }, { "epoch": 1.1793185893604303, "grad_norm": 0.06649126661802829, "learning_rate": 4.248872054396215e-06, "loss": 0.0953, "step": 165 }, { "epoch": 1.2151823072325165, "grad_norm": 0.06680856093283863, "learning_rate": 3.939964450389728e-06, "loss": 0.096, "step": 170 }, { "epoch": 1.2510460251046025, "grad_norm": 0.07655116396488153, "learning_rate": 3.635240322413375e-06, "loss": 0.0843, "step": 175 }, { "epoch": 1.2869097429766887, "grad_norm": 0.0679118797019963, "learning_rate": 3.3359022773850673e-06, "loss": 0.0933, "step": 180 }, { "epoch": 1.3227734608487747, "grad_norm": 0.06352669282993445, "learning_rate": 3.043131665813988e-06, "loss": 0.0869, "step": 185 }, { "epoch": 1.3586371787208606, "grad_norm": 0.05614250533642248, "learning_rate": 2.7580839195498397e-06, "loss": 0.0784, "step": 190 }, { "epoch": 1.3945008965929468, "grad_norm": 0.06268476802798255, "learning_rate": 2.4818839918211963e-06, "loss": 0.0966, "step": 195 }, { "epoch": 1.4303646144650328, "grad_norm": 0.09016849271972376, "learning_rate": 2.2156219175590623e-06, "loss": 0.0861, "step": 200 }, { "epoch": 1.466228332337119, "grad_norm": 0.05616043304073107, "learning_rate": 1.9603485115269743e-06, "loss": 0.0821, "step": 205 }, { "epoch": 1.502092050209205, "grad_norm": 0.056894314885253386, "learning_rate": 1.7170712212352187e-06, "loss": 0.0759, "step": 210 }, { "epoch": 1.5379557680812912, "grad_norm": 0.05749554956523113, "learning_rate": 1.4867501510057548e-06, "loss": 0.0779, "step": 215 }, { "epoch": 1.5738194859533772, "grad_norm": 0.06101506302740175, "learning_rate": 1.2702942728790897e-06, "loss": 0.0812, "step": 220 }, { "epoch": 1.6096832038254631, "grad_norm": 0.052956740115013924, "learning_rate": 1.0685578393169054e-06, "loss": 0.0878, "step": 225 }, { "epoch": 1.6455469216975493, "grad_norm": 0.05444893484606994, "learning_rate": 8.823370118578628e-07, "loss": 0.0808, "step": 230 }, { "epoch": 1.6814106395696355, "grad_norm": 0.05577402210270524, "learning_rate": 7.123667190317396e-07, "loss": 0.0835, "step": 235 }, { "epoch": 1.7172743574417213, "grad_norm": 0.060795662497347525, "learning_rate": 5.593177559322776e-07, "loss": 0.0755, "step": 240 }, { "epoch": 1.7531380753138075, "grad_norm": 0.05146247489312187, "learning_rate": 4.237941368954124e-07, "loss": 0.0808, "step": 245 }, { "epoch": 1.7890017931858937, "grad_norm": 0.05742062783254797, "learning_rate": 3.0633071173062966e-07, "loss": 0.0772, "step": 250 }, { "epoch": 1.8248655110579797, "grad_norm": 0.06159979933911591, "learning_rate": 2.0739105491312028e-07, "loss": 0.0835, "step": 255 }, { "epoch": 1.8607292289300656, "grad_norm": 0.05865983987282493, "learning_rate": 1.2736563606711384e-07, "loss": 0.0752, "step": 260 }, { "epoch": 1.8965929468021518, "grad_norm": 0.06382021063164005, "learning_rate": 6.657027896065982e-08, "loss": 0.077, "step": 265 }, { "epoch": 1.932456664674238, "grad_norm": 0.04725385570056433, "learning_rate": 2.5244915093499134e-08, "loss": 0.0731, "step": 270 }, { "epoch": 1.968320382546324, "grad_norm": 0.053647864100340815, "learning_rate": 3.5526367970539765e-09, "loss": 0.0751, "step": 275 }, { "epoch": 1.9898386132695756, "eval_loss": 0.08560756593942642, "eval_runtime": 3.7696, "eval_samples_per_second": 18.039, "eval_steps_per_second": 4.51, "step": 278 }, { "epoch": 1.9898386132695756, "step": 278, "total_flos": 8.430395390385193e+17, "train_loss": 0.20757551041009614, "train_runtime": 3036.6171, "train_samples_per_second": 4.407, "train_steps_per_second": 0.092 } ], "logging_steps": 5, "max_steps": 278, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.430395390385193e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }