{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 8310,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012033694344163659,
      "grad_norm": 0.10395803302526474,
      "learning_rate": 6.666666666666667e-06,
      "loss": 2.5454,
      "step": 100
    },
    {
      "epoch": 0.024067388688327317,
      "grad_norm": 0.1665215641260147,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 2.5267,
      "step": 200
    },
    {
      "epoch": 0.036101083032490974,
      "grad_norm": 0.3333851397037506,
      "learning_rate": 2e-05,
      "loss": 2.4777,
      "step": 300
    },
    {
      "epoch": 0.048134777376654635,
      "grad_norm": 0.4827630817890167,
      "learning_rate": 1.9992309597956244e-05,
      "loss": 2.4484,
      "step": 400
    },
    {
      "epoch": 0.06016847172081829,
      "grad_norm": 0.4034655690193176,
      "learning_rate": 1.9969250220281687e-05,
      "loss": 2.374,
      "step": 500
    },
    {
      "epoch": 0.07220216606498195,
      "grad_norm": 0.4711494743824005,
      "learning_rate": 1.9930857334153374e-05,
      "loss": 2.3987,
      "step": 600
    },
    {
      "epoch": 0.0842358604091456,
      "grad_norm": 0.5272079110145569,
      "learning_rate": 1.987718999091729e-05,
      "loss": 2.3692,
      "step": 700
    },
    {
      "epoch": 0.09626955475330927,
      "grad_norm": 0.48402172327041626,
      "learning_rate": 1.9808330735262657e-05,
      "loss": 2.3804,
      "step": 800
    },
    {
      "epoch": 0.10830324909747292,
      "grad_norm": 0.5433884859085083,
      "learning_rate": 1.972438547826156e-05,
      "loss": 2.3637,
      "step": 900
    },
    {
      "epoch": 0.12033694344163658,
      "grad_norm": 0.4431970417499542,
      "learning_rate": 1.9625483334469198e-05,
      "loss": 2.3256,
      "step": 1000
    },
    {
      "epoch": 0.13237063778580024,
      "grad_norm": 0.659107506275177,
      "learning_rate": 1.9511776423335327e-05,
      "loss": 2.3063,
      "step": 1100
    },
    {
      "epoch": 0.1444043321299639,
      "grad_norm": 0.5573317408561707,
      "learning_rate": 1.9383439635232296e-05,
      "loss": 2.3277,
      "step": 1200
    },
    {
      "epoch": 0.15643802647412755,
      "grad_norm": 0.6801158785820007,
      "learning_rate": 1.924067036245961e-05,
      "loss": 2.2647,
      "step": 1300
    },
    {
      "epoch": 0.1684717208182912,
      "grad_norm": 0.6493859887123108,
      "learning_rate": 1.9083688195638694e-05,
      "loss": 2.3073,
      "step": 1400
    },
    {
      "epoch": 0.18050541516245489,
      "grad_norm": 0.5970639586448669,
      "learning_rate": 1.891273458596486e-05,
      "loss": 2.3006,
      "step": 1500
    },
    {
      "epoch": 0.19253910950661854,
      "grad_norm": 0.5887291431427002,
      "learning_rate": 1.8728072473835944e-05,
      "loss": 2.3492,
      "step": 1600
    },
    {
      "epoch": 0.2045728038507822,
      "grad_norm": 0.7891527414321899,
      "learning_rate": 1.8529985884428855e-05,
      "loss": 2.3155,
      "step": 1700
    },
    {
      "epoch": 0.21660649819494585,
      "grad_norm": 0.641476035118103,
      "learning_rate": 1.8318779490846005e-05,
      "loss": 2.2822,
      "step": 1800
    },
    {
      "epoch": 0.2286401925391095,
      "grad_norm": 0.6066290736198425,
      "learning_rate": 1.8094778145503555e-05,
      "loss": 2.2936,
      "step": 1900
    },
    {
      "epoch": 0.24067388688327315,
      "grad_norm": 0.620587944984436,
      "learning_rate": 1.7858326380482313e-05,
      "loss": 2.2576,
      "step": 2000
    },
    {
      "epoch": 0.2527075812274368,
      "grad_norm": 0.7826667428016663,
      "learning_rate": 1.7609787877609678e-05,
      "loss": 2.246,
      "step": 2100
    },
    {
      "epoch": 0.2647412755716005,
      "grad_norm": 0.9285653829574585,
      "learning_rate": 1.7349544909087737e-05,
      "loss": 2.2675,
      "step": 2200
    },
    {
      "epoch": 0.2767749699157641,
      "grad_norm": 0.8119534254074097,
      "learning_rate": 1.7077997749527884e-05,
      "loss": 2.2335,
      "step": 2300
    },
    {
      "epoch": 0.2888086642599278,
      "grad_norm": 0.7591002583503723,
      "learning_rate": 1.6795564060296295e-05,
      "loss": 2.2353,
      "step": 2400
    },
    {
      "epoch": 0.3008423586040915,
      "grad_norm": 0.8516184687614441,
      "learning_rate": 1.6502678247117146e-05,
      "loss": 2.266,
      "step": 2500
    },
    {
      "epoch": 0.3128760529482551,
      "grad_norm": 0.7212440967559814,
      "learning_rate": 1.6199790791921693e-05,
      "loss": 2.253,
      "step": 2600
    },
    {
      "epoch": 0.3249097472924188,
      "grad_norm": 0.6549546122550964,
      "learning_rate": 1.5887367559970825e-05,
      "loss": 2.2147,
      "step": 2700
    },
    {
      "epoch": 0.3369434416365824,
      "grad_norm": 0.7691039443016052,
      "learning_rate": 1.5565889083316847e-05,
      "loss": 2.2499,
      "step": 2800
    },
    {
      "epoch": 0.3489771359807461,
      "grad_norm": 0.7296893000602722,
      "learning_rate": 1.5235849821706531e-05,
      "loss": 2.2458,
      "step": 2900
    },
    {
      "epoch": 0.36101083032490977,
      "grad_norm": 0.6736641526222229,
      "learning_rate": 1.4897757402062285e-05,
      "loss": 2.2561,
      "step": 3000
    },
    {
      "epoch": 0.3730445246690734,
      "grad_norm": 0.711357593536377,
      "learning_rate": 1.4552131837711108e-05,
      "loss": 2.2529,
      "step": 3100
    },
    {
      "epoch": 0.3850782190132371,
      "grad_norm": 0.7297707796096802,
      "learning_rate": 1.4199504728562294e-05,
      "loss": 2.2368,
      "step": 3200
    },
    {
      "epoch": 0.3971119133574007,
      "grad_norm": 0.7659047842025757,
      "learning_rate": 1.3840418443464015e-05,
      "loss": 2.247,
      "step": 3300
    },
    {
      "epoch": 0.4091456077015644,
      "grad_norm": 1.0112534761428833,
      "learning_rate": 1.3475425285996438e-05,
      "loss": 2.2343,
      "step": 3400
    },
    {
      "epoch": 0.42117930204572807,
      "grad_norm": 0.8520541787147522,
      "learning_rate": 1.310508664498439e-05,
      "loss": 2.2708,
      "step": 3500
    },
    {
      "epoch": 0.4332129963898917,
      "grad_norm": 0.7974518537521362,
      "learning_rate": 1.2729972131036212e-05,
      "loss": 2.2687,
      "step": 3600
    },
    {
      "epoch": 0.4452466907340554,
      "grad_norm": 0.6873015761375427,
      "learning_rate": 1.2350658700436852e-05,
      "loss": 2.2502,
      "step": 3700
    },
    {
      "epoch": 0.457280385078219,
      "grad_norm": 0.9805155396461487,
      "learning_rate": 1.1967729767742688e-05,
      "loss": 2.2026,
      "step": 3800
    },
    {
      "epoch": 0.4693140794223827,
      "grad_norm": 1.0702319145202637,
      "learning_rate": 1.1581774308443042e-05,
      "loss": 2.2294,
      "step": 3900
    },
    {
      "epoch": 0.4813477737665463,
      "grad_norm": 0.818926990032196,
      "learning_rate": 1.1193385953068512e-05,
      "loss": 2.2566,
      "step": 4000
    },
    {
      "epoch": 0.49338146811071,
      "grad_norm": 0.7473201155662537,
      "learning_rate": 1.0803162074139489e-05,
      "loss": 2.2348,
      "step": 4100
    },
    {
      "epoch": 0.5054151624548736,
      "grad_norm": 0.9612985849380493,
      "learning_rate": 1.041170286735918e-05,
      "loss": 2.2055,
      "step": 4200
    },
    {
      "epoch": 0.5174488567990373,
      "grad_norm": 0.6838109493255615,
      "learning_rate": 1.0019610428464354e-05,
      "loss": 2.2991,
      "step": 4300
    },
    {
      "epoch": 0.529482551143201,
      "grad_norm": 0.7192391753196716,
      "learning_rate": 9.627487827153704e-06,
      "loss": 2.2438,
      "step": 4400
    },
    {
      "epoch": 0.5415162454873647,
      "grad_norm": 0.8669145703315735,
      "learning_rate": 9.235938179518131e-06,
      "loss": 2.2375,
      "step": 4500
    },
    {
      "epoch": 0.5535499398315282,
      "grad_norm": 0.8066568970680237,
      "learning_rate": 8.845563720399715e-06,
      "loss": 2.2517,
      "step": 4600
    },
    {
      "epoch": 0.5655836341756919,
      "grad_norm": 0.8181942701339722,
      "learning_rate": 8.456964877106104e-06,
      "loss": 2.2433,
      "step": 4700
    },
    {
      "epoch": 0.5776173285198556,
      "grad_norm": 0.7640101313591003,
      "learning_rate": 8.070739345905032e-06,
      "loss": 2.2323,
      "step": 4800
    },
    {
      "epoch": 0.5896510228640193,
      "grad_norm": 0.8630154728889465,
      "learning_rate": 7.687481172719402e-06,
      "loss": 2.2258,
      "step": 4900
    },
    {
      "epoch": 0.601684717208183,
      "grad_norm": 0.8093004822731018,
      "learning_rate": 7.307779839436878e-06,
      "loss": 2.1821,
      "step": 5000
    },
    {
      "epoch": 0.6137184115523465,
      "grad_norm": 0.831082820892334,
      "learning_rate": 6.932219357239362e-06,
      "loss": 2.208,
      "step": 5100
    },
    {
      "epoch": 0.6257521058965102,
      "grad_norm": 1.0010204315185547,
      "learning_rate": 6.561377368346824e-06,
      "loss": 2.2223,
      "step": 5200
    },
    {
      "epoch": 0.6377858002406739,
      "grad_norm": 0.7786392569541931,
      "learning_rate": 6.195824257557126e-06,
      "loss": 2.2135,
      "step": 5300
    },
    {
      "epoch": 0.6498194945848376,
      "grad_norm": 1.119176983833313,
      "learning_rate": 5.8361222749483246e-06,
      "loss": 2.2343,
      "step": 5400
    },
    {
      "epoch": 0.6618531889290012,
      "grad_norm": 1.0055018663406372,
      "learning_rate": 5.482824671092862e-06,
      "loss": 2.2361,
      "step": 5500
    },
    {
      "epoch": 0.6738868832731648,
      "grad_norm": 1.0241854190826416,
      "learning_rate": 5.136474846113688e-06,
      "loss": 2.2512,
      "step": 5600
    },
    {
      "epoch": 0.6859205776173285,
      "grad_norm": 0.792901337146759,
      "learning_rate": 4.797605513891179e-06,
      "loss": 2.2203,
      "step": 5700
    },
    {
      "epoch": 0.6979542719614922,
      "grad_norm": 1.4910557270050049,
      "learning_rate": 4.46673788270635e-06,
      "loss": 2.2261,
      "step": 5800
    },
    {
      "epoch": 0.7099879663056559,
      "grad_norm": 0.8699010610580444,
      "learning_rate": 4.1443808535806195e-06,
      "loss": 2.2282,
      "step": 5900
    },
    {
      "epoch": 0.7220216606498195,
      "grad_norm": 0.7998189330101013,
      "learning_rate": 3.83103023754511e-06,
      "loss": 2.2035,
      "step": 6000
    },
    {
      "epoch": 0.7340553549939831,
      "grad_norm": 0.8159136772155762,
      "learning_rate": 3.527167993043411e-06,
      "loss": 2.2378,
      "step": 6100
    },
    {
      "epoch": 0.7460890493381468,
      "grad_norm": 0.7632446885108948,
      "learning_rate": 3.233261484640753e-06,
      "loss": 2.2091,
      "step": 6200
    },
    {
      "epoch": 0.7581227436823105,
      "grad_norm": 0.9270642399787903,
      "learning_rate": 2.949762764179711e-06,
      "loss": 2.2278,
      "step": 6300
    },
    {
      "epoch": 0.7701564380264742,
      "grad_norm": 0.8804704546928406,
      "learning_rate": 2.6771078754881417e-06,
      "loss": 2.2409,
      "step": 6400
    },
    {
      "epoch": 0.7821901323706378,
      "grad_norm": 1.0230224132537842,
      "learning_rate": 2.415716183708684e-06,
      "loss": 2.2658,
      "step": 6500
    },
    {
      "epoch": 0.7942238267148014,
      "grad_norm": 0.7484972476959229,
      "learning_rate": 2.165989730281475e-06,
      "loss": 2.2396,
      "step": 6600
    },
    {
      "epoch": 0.8062575210589651,
      "grad_norm": 0.7870795130729675,
      "learning_rate": 1.928312614572083e-06,
      "loss": 2.206,
      "step": 6700
    },
    {
      "epoch": 0.8182912154031288,
      "grad_norm": 0.8243082761764526,
      "learning_rate": 1.703050403095783e-06,
      "loss": 2.2403,
      "step": 6800
    },
    {
      "epoch": 0.8303249097472925,
      "grad_norm": 0.88877934217453,
      "learning_rate": 1.4905495672468784e-06,
      "loss": 2.2549,
      "step": 6900
    },
    {
      "epoch": 0.8423586040914561,
      "grad_norm": 0.7939454317092896,
      "learning_rate": 1.2911369503978389e-06,
      "loss": 2.253,
      "step": 7000
    },
    {
      "epoch": 0.8543922984356197,
      "grad_norm": 0.845038652420044,
      "learning_rate": 1.1051192651878938e-06,
      "loss": 2.2124,
      "step": 7100
    },
    {
      "epoch": 0.8664259927797834,
      "grad_norm": 1.047326683998108,
      "learning_rate": 9.327826217743452e-07,
      "loss": 2.2202,
      "step": 7200
    },
    {
      "epoch": 0.8784596871239471,
      "grad_norm": 1.1349254846572876,
      "learning_rate": 7.743920877721378e-07,
      "loss": 2.2059,
      "step": 7300
    },
    {
      "epoch": 0.8904933814681107,
      "grad_norm": 0.9006612300872803,
      "learning_rate": 6.30191280558553e-07,
      "loss": 2.2327,
      "step": 7400
    },
    {
      "epoch": 0.9025270758122743,
      "grad_norm": 1.3734016418457031,
      "learning_rate": 5.004019925700921e-07,
      "loss": 2.2116,
      "step": 7500
    },
    {
      "epoch": 0.914560770156438,
      "grad_norm": 0.7956872582435608,
      "learning_rate": 3.852238501678751e-07,
      "loss": 2.2349,
      "step": 7600
    },
    {
      "epoch": 0.9265944645006017,
      "grad_norm": 0.7709358930587769,
      "learning_rate": 2.8483400659624737e-07,
      "loss": 2.2159,
      "step": 7700
    },
    {
      "epoch": 0.9386281588447654,
      "grad_norm": 0.9854726195335388,
      "learning_rate": 1.993868695068457e-07,
      "loss": 2.2324,
      "step": 7800
    },
    {
      "epoch": 0.950661853188929,
      "grad_norm": 0.957122802734375,
      "learning_rate": 1.290138634672089e-07,
      "loss": 2.2256,
      "step": 7900
    },
    {
      "epoch": 0.9626955475330926,
      "grad_norm": 1.1688376665115356,
      "learning_rate": 7.382322781923301e-08,
      "loss": 2.2578,
      "step": 8000
    },
    {
      "epoch": 0.9747292418772563,
      "grad_norm": 0.8519200682640076,
      "learning_rate": 3.38998501983534e-08,
      "loss": 2.2476,
      "step": 8100
    },
    {
      "epoch": 0.98676293622142,
      "grad_norm": 0.7625014185905457,
      "learning_rate": 9.475062435719828e-09,
      "loss": 2.2162,
      "step": 8200
    },
    {
      "epoch": 0.9987966305655837,
      "grad_norm": 0.9132429361343384,
      "learning_rate": 9.306564984878919e-11,
      "loss": 2.2016,
      "step": 8300
    },
    {
      "epoch": 1.0,
      "step": 8310,
      "total_flos": 1.510104094015488e+17,
      "train_loss": 2.265305694231201,
      "train_runtime": 2607.4148,
      "train_samples_per_second": 6.374,
      "train_steps_per_second": 3.187
    }
  ],
  "logging_steps": 100,
  "max_steps": 8310,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.510104094015488e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}