| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9898386132695756, | |
| "eval_steps": 500, | |
| "global_step": 278, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.007172743574417215, | |
| "grad_norm": 1.7706772687409793, | |
| "learning_rate": 3.5714285714285716e-07, | |
| "loss": 1.8258, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.03586371787208607, | |
| "grad_norm": 1.3979460761770042, | |
| "learning_rate": 1.7857142857142859e-06, | |
| "loss": 1.7889, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.07172743574417215, | |
| "grad_norm": 0.8391387150686271, | |
| "learning_rate": 3.5714285714285718e-06, | |
| "loss": 1.4752, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.10759115361625822, | |
| "grad_norm": 0.29622114410594524, | |
| "learning_rate": 5.357142857142857e-06, | |
| "loss": 0.8542, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.1434548714883443, | |
| "grad_norm": 0.1867024790344403, | |
| "learning_rate": 7.1428571428571436e-06, | |
| "loss": 0.5935, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.17931858936043035, | |
| "grad_norm": 0.1397115741987044, | |
| "learning_rate": 8.92857142857143e-06, | |
| "loss": 0.4806, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.21518230723251644, | |
| "grad_norm": 0.11207900324892817, | |
| "learning_rate": 9.9984209464165e-06, | |
| "loss": 0.3543, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.2510460251046025, | |
| "grad_norm": 0.12471379118682165, | |
| "learning_rate": 9.980668045715864e-06, | |
| "loss": 0.3099, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.2869097429766886, | |
| "grad_norm": 0.13942124773513623, | |
| "learning_rate": 9.94325872368957e-06, | |
| "loss": 0.2671, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.3227734608487747, | |
| "grad_norm": 0.08633657901146753, | |
| "learning_rate": 9.886340617840968e-06, | |
| "loss": 0.2409, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.3586371787208607, | |
| "grad_norm": 0.09525370649740562, | |
| "learning_rate": 9.81013835793043e-06, | |
| "loss": 0.2193, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3945008965929468, | |
| "grad_norm": 0.08224573044279379, | |
| "learning_rate": 9.714952679464324e-06, | |
| "loss": 0.1873, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.4303646144650329, | |
| "grad_norm": 0.0960148670109882, | |
| "learning_rate": 9.601159236829353e-06, | |
| "loss": 0.1986, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.46622833233711897, | |
| "grad_norm": 0.08468885213082336, | |
| "learning_rate": 9.46920712075632e-06, | |
| "loss": 0.181, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.502092050209205, | |
| "grad_norm": 0.06651705476346571, | |
| "learning_rate": 9.319617085964177e-06, | |
| "loss": 0.1776, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.5379557680812911, | |
| "grad_norm": 0.07914440346820356, | |
| "learning_rate": 9.152979495979064e-06, | |
| "loss": 0.1729, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.5738194859533772, | |
| "grad_norm": 0.06318757579211376, | |
| "learning_rate": 8.969951993239177e-06, | |
| "loss": 0.1544, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.6096832038254633, | |
| "grad_norm": 0.06806462149173399, | |
| "learning_rate": 8.77125690368052e-06, | |
| "loss": 0.1452, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.6455469216975493, | |
| "grad_norm": 0.09204580355773379, | |
| "learning_rate": 8.557678386046429e-06, | |
| "loss": 0.148, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.6814106395696354, | |
| "grad_norm": 0.07404375144145035, | |
| "learning_rate": 8.33005933717126e-06, | |
| "loss": 0.1537, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.7172743574417214, | |
| "grad_norm": 0.06673338321385565, | |
| "learning_rate": 8.089298065451673e-06, | |
| "loss": 0.154, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.7531380753138075, | |
| "grad_norm": 0.07248494023312937, | |
| "learning_rate": 7.836344745633785e-06, | |
| "loss": 0.1415, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.7890017931858936, | |
| "grad_norm": 0.06250391175809607, | |
| "learning_rate": 7.572197668907533e-06, | |
| "loss": 0.132, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.8248655110579797, | |
| "grad_norm": 0.056578659740463, | |
| "learning_rate": 7.297899303107441e-06, | |
| "loss": 0.112, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.8607292289300658, | |
| "grad_norm": 0.06486223998598174, | |
| "learning_rate": 7.014532178568314e-06, | |
| "loss": 0.121, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.8965929468021518, | |
| "grad_norm": 0.056771834577397436, | |
| "learning_rate": 6.723214615872585e-06, | |
| "loss": 0.1134, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.9324566646742379, | |
| "grad_norm": 0.06319732340937169, | |
| "learning_rate": 6.425096312349881e-06, | |
| "loss": 0.1166, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.968320382546324, | |
| "grad_norm": 0.0649079486366395, | |
| "learning_rate": 6.121353804746907e-06, | |
| "loss": 0.1122, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.07566991126398058, | |
| "learning_rate": 5.813185825974419e-06, | |
| "loss": 0.1192, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.1111445426940918, | |
| "eval_runtime": 3.972, | |
| "eval_samples_per_second": 17.12, | |
| "eval_steps_per_second": 4.28, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.035863717872086, | |
| "grad_norm": 0.06962658936027795, | |
| "learning_rate": 5.5018085742560745e-06, | |
| "loss": 0.0911, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.0717274357441722, | |
| "grad_norm": 0.05916257421087811, | |
| "learning_rate": 5.188450913349674e-06, | |
| "loss": 0.0905, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.1075911536162582, | |
| "grad_norm": 0.06400686740251811, | |
| "learning_rate": 4.874349522783313e-06, | |
| "loss": 0.0977, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.1434548714883443, | |
| "grad_norm": 0.05927545440254994, | |
| "learning_rate": 4.560744017246284e-06, | |
| "loss": 0.0834, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.1793185893604303, | |
| "grad_norm": 0.06649126661802829, | |
| "learning_rate": 4.248872054396215e-06, | |
| "loss": 0.0953, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.2151823072325165, | |
| "grad_norm": 0.06680856093283863, | |
| "learning_rate": 3.939964450389728e-06, | |
| "loss": 0.096, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.2510460251046025, | |
| "grad_norm": 0.07655116396488153, | |
| "learning_rate": 3.635240322413375e-06, | |
| "loss": 0.0843, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.2869097429766887, | |
| "grad_norm": 0.0679118797019963, | |
| "learning_rate": 3.3359022773850673e-06, | |
| "loss": 0.0933, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.3227734608487747, | |
| "grad_norm": 0.06352669282993445, | |
| "learning_rate": 3.043131665813988e-06, | |
| "loss": 0.0869, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.3586371787208606, | |
| "grad_norm": 0.05614250533642248, | |
| "learning_rate": 2.7580839195498397e-06, | |
| "loss": 0.0784, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.3945008965929468, | |
| "grad_norm": 0.06268476802798255, | |
| "learning_rate": 2.4818839918211963e-06, | |
| "loss": 0.0966, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.4303646144650328, | |
| "grad_norm": 0.09016849271972376, | |
| "learning_rate": 2.2156219175590623e-06, | |
| "loss": 0.0861, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.466228332337119, | |
| "grad_norm": 0.05616043304073107, | |
| "learning_rate": 1.9603485115269743e-06, | |
| "loss": 0.0821, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.502092050209205, | |
| "grad_norm": 0.056894314885253386, | |
| "learning_rate": 1.7170712212352187e-06, | |
| "loss": 0.0759, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.5379557680812912, | |
| "grad_norm": 0.05749554956523113, | |
| "learning_rate": 1.4867501510057548e-06, | |
| "loss": 0.0779, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.5738194859533772, | |
| "grad_norm": 0.06101506302740175, | |
| "learning_rate": 1.2702942728790897e-06, | |
| "loss": 0.0812, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.6096832038254631, | |
| "grad_norm": 0.052956740115013924, | |
| "learning_rate": 1.0685578393169054e-06, | |
| "loss": 0.0878, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.6455469216975493, | |
| "grad_norm": 0.05444893484606994, | |
| "learning_rate": 8.823370118578628e-07, | |
| "loss": 0.0808, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.6814106395696355, | |
| "grad_norm": 0.05577402210270524, | |
| "learning_rate": 7.123667190317396e-07, | |
| "loss": 0.0835, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.7172743574417213, | |
| "grad_norm": 0.060795662497347525, | |
| "learning_rate": 5.593177559322776e-07, | |
| "loss": 0.0755, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.7531380753138075, | |
| "grad_norm": 0.05146247489312187, | |
| "learning_rate": 4.237941368954124e-07, | |
| "loss": 0.0808, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.7890017931858937, | |
| "grad_norm": 0.05742062783254797, | |
| "learning_rate": 3.0633071173062966e-07, | |
| "loss": 0.0772, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.8248655110579797, | |
| "grad_norm": 0.06159979933911591, | |
| "learning_rate": 2.0739105491312028e-07, | |
| "loss": 0.0835, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.8607292289300656, | |
| "grad_norm": 0.05865983987282493, | |
| "learning_rate": 1.2736563606711384e-07, | |
| "loss": 0.0752, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.8965929468021518, | |
| "grad_norm": 0.06382021063164005, | |
| "learning_rate": 6.657027896065982e-08, | |
| "loss": 0.077, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.932456664674238, | |
| "grad_norm": 0.04725385570056433, | |
| "learning_rate": 2.5244915093499134e-08, | |
| "loss": 0.0731, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.968320382546324, | |
| "grad_norm": 0.053647864100340815, | |
| "learning_rate": 3.5526367970539765e-09, | |
| "loss": 0.0751, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.9898386132695756, | |
| "eval_loss": 0.08560756593942642, | |
| "eval_runtime": 3.7696, | |
| "eval_samples_per_second": 18.039, | |
| "eval_steps_per_second": 4.51, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.9898386132695756, | |
| "step": 278, | |
| "total_flos": 8.430395390385193e+17, | |
| "train_loss": 0.20757551041009614, | |
| "train_runtime": 3036.6171, | |
| "train_samples_per_second": 4.407, | |
| "train_steps_per_second": 0.092 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 278, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.430395390385193e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |