{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 8310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012033694344163659, "grad_norm": 0.10395803302526474, "learning_rate": 6.666666666666667e-06, "loss": 2.5454, "step": 100 }, { "epoch": 0.024067388688327317, "grad_norm": 0.1665215641260147, "learning_rate": 1.3333333333333333e-05, "loss": 2.5267, "step": 200 }, { "epoch": 0.036101083032490974, "grad_norm": 0.3333851397037506, "learning_rate": 2e-05, "loss": 2.4777, "step": 300 }, { "epoch": 0.048134777376654635, "grad_norm": 0.4827630817890167, "learning_rate": 1.9992309597956244e-05, "loss": 2.4484, "step": 400 }, { "epoch": 0.06016847172081829, "grad_norm": 0.4034655690193176, "learning_rate": 1.9969250220281687e-05, "loss": 2.374, "step": 500 }, { "epoch": 0.07220216606498195, "grad_norm": 0.4711494743824005, "learning_rate": 1.9930857334153374e-05, "loss": 2.3987, "step": 600 }, { "epoch": 0.0842358604091456, "grad_norm": 0.5272079110145569, "learning_rate": 1.987718999091729e-05, "loss": 2.3692, "step": 700 }, { "epoch": 0.09626955475330927, "grad_norm": 0.48402172327041626, "learning_rate": 1.9808330735262657e-05, "loss": 2.3804, "step": 800 }, { "epoch": 0.10830324909747292, "grad_norm": 0.5433884859085083, "learning_rate": 1.972438547826156e-05, "loss": 2.3637, "step": 900 }, { "epoch": 0.12033694344163658, "grad_norm": 0.4431970417499542, "learning_rate": 1.9625483334469198e-05, "loss": 2.3256, "step": 1000 }, { "epoch": 0.13237063778580024, "grad_norm": 0.659107506275177, "learning_rate": 1.9511776423335327e-05, "loss": 2.3063, "step": 1100 }, { "epoch": 0.1444043321299639, "grad_norm": 0.5573317408561707, "learning_rate": 1.9383439635232296e-05, "loss": 2.3277, "step": 1200 }, { "epoch": 0.15643802647412755, "grad_norm": 0.6801158785820007, "learning_rate": 1.924067036245961e-05, "loss": 2.2647, "step": 1300 }, { "epoch": 0.1684717208182912, "grad_norm": 0.6493859887123108, "learning_rate": 1.9083688195638694e-05, "loss": 2.3073, "step": 1400 }, { "epoch": 0.18050541516245489, "grad_norm": 0.5970639586448669, "learning_rate": 1.891273458596486e-05, "loss": 2.3006, "step": 1500 }, { "epoch": 0.19253910950661854, "grad_norm": 0.5887291431427002, "learning_rate": 1.8728072473835944e-05, "loss": 2.3492, "step": 1600 }, { "epoch": 0.2045728038507822, "grad_norm": 0.7891527414321899, "learning_rate": 1.8529985884428855e-05, "loss": 2.3155, "step": 1700 }, { "epoch": 0.21660649819494585, "grad_norm": 0.641476035118103, "learning_rate": 1.8318779490846005e-05, "loss": 2.2822, "step": 1800 }, { "epoch": 0.2286401925391095, "grad_norm": 0.6066290736198425, "learning_rate": 1.8094778145503555e-05, "loss": 2.2936, "step": 1900 }, { "epoch": 0.24067388688327315, "grad_norm": 0.620587944984436, "learning_rate": 1.7858326380482313e-05, "loss": 2.2576, "step": 2000 }, { "epoch": 0.2527075812274368, "grad_norm": 0.7826667428016663, "learning_rate": 1.7609787877609678e-05, "loss": 2.246, "step": 2100 }, { "epoch": 0.2647412755716005, "grad_norm": 0.9285653829574585, "learning_rate": 1.7349544909087737e-05, "loss": 2.2675, "step": 2200 }, { "epoch": 0.2767749699157641, "grad_norm": 0.8119534254074097, "learning_rate": 1.7077997749527884e-05, "loss": 2.2335, "step": 2300 }, { "epoch": 0.2888086642599278, "grad_norm": 0.7591002583503723, "learning_rate": 1.6795564060296295e-05, "loss": 2.2353, "step": 2400 }, { "epoch": 0.3008423586040915, "grad_norm": 0.8516184687614441, "learning_rate": 1.6502678247117146e-05, "loss": 2.266, "step": 2500 }, { "epoch": 0.3128760529482551, "grad_norm": 0.7212440967559814, "learning_rate": 1.6199790791921693e-05, "loss": 2.253, "step": 2600 }, { "epoch": 0.3249097472924188, "grad_norm": 0.6549546122550964, "learning_rate": 1.5887367559970825e-05, "loss": 2.2147, "step": 2700 }, { "epoch": 0.3369434416365824, "grad_norm": 0.7691039443016052, "learning_rate": 1.5565889083316847e-05, "loss": 2.2499, "step": 2800 }, { "epoch": 0.3489771359807461, "grad_norm": 0.7296893000602722, "learning_rate": 1.5235849821706531e-05, "loss": 2.2458, "step": 2900 }, { "epoch": 0.36101083032490977, "grad_norm": 0.6736641526222229, "learning_rate": 1.4897757402062285e-05, "loss": 2.2561, "step": 3000 }, { "epoch": 0.3730445246690734, "grad_norm": 0.711357593536377, "learning_rate": 1.4552131837711108e-05, "loss": 2.2529, "step": 3100 }, { "epoch": 0.3850782190132371, "grad_norm": 0.7297707796096802, "learning_rate": 1.4199504728562294e-05, "loss": 2.2368, "step": 3200 }, { "epoch": 0.3971119133574007, "grad_norm": 0.7659047842025757, "learning_rate": 1.3840418443464015e-05, "loss": 2.247, "step": 3300 }, { "epoch": 0.4091456077015644, "grad_norm": 1.0112534761428833, "learning_rate": 1.3475425285996438e-05, "loss": 2.2343, "step": 3400 }, { "epoch": 0.42117930204572807, "grad_norm": 0.8520541787147522, "learning_rate": 1.310508664498439e-05, "loss": 2.2708, "step": 3500 }, { "epoch": 0.4332129963898917, "grad_norm": 0.7974518537521362, "learning_rate": 1.2729972131036212e-05, "loss": 2.2687, "step": 3600 }, { "epoch": 0.4452466907340554, "grad_norm": 0.6873015761375427, "learning_rate": 1.2350658700436852e-05, "loss": 2.2502, "step": 3700 }, { "epoch": 0.457280385078219, "grad_norm": 0.9805155396461487, "learning_rate": 1.1967729767742688e-05, "loss": 2.2026, "step": 3800 }, { "epoch": 0.4693140794223827, "grad_norm": 1.0702319145202637, "learning_rate": 1.1581774308443042e-05, "loss": 2.2294, "step": 3900 }, { "epoch": 0.4813477737665463, "grad_norm": 0.818926990032196, "learning_rate": 1.1193385953068512e-05, "loss": 2.2566, "step": 4000 }, { "epoch": 0.49338146811071, "grad_norm": 0.7473201155662537, "learning_rate": 1.0803162074139489e-05, "loss": 2.2348, "step": 4100 }, { "epoch": 0.5054151624548736, "grad_norm": 0.9612985849380493, "learning_rate": 1.041170286735918e-05, "loss": 2.2055, "step": 4200 }, { "epoch": 0.5174488567990373, "grad_norm": 0.6838109493255615, "learning_rate": 1.0019610428464354e-05, "loss": 2.2991, "step": 4300 }, { "epoch": 0.529482551143201, "grad_norm": 0.7192391753196716, "learning_rate": 9.627487827153704e-06, "loss": 2.2438, "step": 4400 }, { "epoch": 0.5415162454873647, "grad_norm": 0.8669145703315735, "learning_rate": 9.235938179518131e-06, "loss": 2.2375, "step": 4500 }, { "epoch": 0.5535499398315282, "grad_norm": 0.8066568970680237, "learning_rate": 8.845563720399715e-06, "loss": 2.2517, "step": 4600 }, { "epoch": 0.5655836341756919, "grad_norm": 0.8181942701339722, "learning_rate": 8.456964877106104e-06, "loss": 2.2433, "step": 4700 }, { "epoch": 0.5776173285198556, "grad_norm": 0.7640101313591003, "learning_rate": 8.070739345905032e-06, "loss": 2.2323, "step": 4800 }, { "epoch": 0.5896510228640193, "grad_norm": 0.8630154728889465, "learning_rate": 7.687481172719402e-06, "loss": 2.2258, "step": 4900 }, { "epoch": 0.601684717208183, "grad_norm": 0.8093004822731018, "learning_rate": 7.307779839436878e-06, "loss": 2.1821, "step": 5000 }, { "epoch": 0.6137184115523465, "grad_norm": 0.831082820892334, "learning_rate": 6.932219357239362e-06, "loss": 2.208, "step": 5100 }, { "epoch": 0.6257521058965102, "grad_norm": 1.0010204315185547, "learning_rate": 6.561377368346824e-06, "loss": 2.2223, "step": 5200 }, { "epoch": 0.6377858002406739, "grad_norm": 0.7786392569541931, "learning_rate": 6.195824257557126e-06, "loss": 2.2135, "step": 5300 }, { "epoch": 0.6498194945848376, "grad_norm": 1.119176983833313, "learning_rate": 5.8361222749483246e-06, "loss": 2.2343, "step": 5400 }, { "epoch": 0.6618531889290012, "grad_norm": 1.0055018663406372, "learning_rate": 5.482824671092862e-06, "loss": 2.2361, "step": 5500 }, { "epoch": 0.6738868832731648, "grad_norm": 1.0241854190826416, "learning_rate": 5.136474846113688e-06, "loss": 2.2512, "step": 5600 }, { "epoch": 0.6859205776173285, "grad_norm": 0.792901337146759, "learning_rate": 4.797605513891179e-06, "loss": 2.2203, "step": 5700 }, { "epoch": 0.6979542719614922, "grad_norm": 1.4910557270050049, "learning_rate": 4.46673788270635e-06, "loss": 2.2261, "step": 5800 }, { "epoch": 0.7099879663056559, "grad_norm": 0.8699010610580444, "learning_rate": 4.1443808535806195e-06, "loss": 2.2282, "step": 5900 }, { "epoch": 0.7220216606498195, "grad_norm": 0.7998189330101013, "learning_rate": 3.83103023754511e-06, "loss": 2.2035, "step": 6000 }, { "epoch": 0.7340553549939831, "grad_norm": 0.8159136772155762, "learning_rate": 3.527167993043411e-06, "loss": 2.2378, "step": 6100 }, { "epoch": 0.7460890493381468, "grad_norm": 0.7632446885108948, "learning_rate": 3.233261484640753e-06, "loss": 2.2091, "step": 6200 }, { "epoch": 0.7581227436823105, "grad_norm": 0.9270642399787903, "learning_rate": 2.949762764179711e-06, "loss": 2.2278, "step": 6300 }, { "epoch": 0.7701564380264742, "grad_norm": 0.8804704546928406, "learning_rate": 2.6771078754881417e-06, "loss": 2.2409, "step": 6400 }, { "epoch": 0.7821901323706378, "grad_norm": 1.0230224132537842, "learning_rate": 2.415716183708684e-06, "loss": 2.2658, "step": 6500 }, { "epoch": 0.7942238267148014, "grad_norm": 0.7484972476959229, "learning_rate": 2.165989730281475e-06, "loss": 2.2396, "step": 6600 }, { "epoch": 0.8062575210589651, "grad_norm": 0.7870795130729675, "learning_rate": 1.928312614572083e-06, "loss": 2.206, "step": 6700 }, { "epoch": 0.8182912154031288, "grad_norm": 0.8243082761764526, "learning_rate": 1.703050403095783e-06, "loss": 2.2403, "step": 6800 }, { "epoch": 0.8303249097472925, "grad_norm": 0.88877934217453, "learning_rate": 1.4905495672468784e-06, "loss": 2.2549, "step": 6900 }, { "epoch": 0.8423586040914561, "grad_norm": 0.7939454317092896, "learning_rate": 1.2911369503978389e-06, "loss": 2.253, "step": 7000 }, { "epoch": 0.8543922984356197, "grad_norm": 0.845038652420044, "learning_rate": 1.1051192651878938e-06, "loss": 2.2124, "step": 7100 }, { "epoch": 0.8664259927797834, "grad_norm": 1.047326683998108, "learning_rate": 9.327826217743452e-07, "loss": 2.2202, "step": 7200 }, { "epoch": 0.8784596871239471, "grad_norm": 1.1349254846572876, "learning_rate": 7.743920877721378e-07, "loss": 2.2059, "step": 7300 }, { "epoch": 0.8904933814681107, "grad_norm": 0.9006612300872803, "learning_rate": 6.30191280558553e-07, "loss": 2.2327, "step": 7400 }, { "epoch": 0.9025270758122743, "grad_norm": 1.3734016418457031, "learning_rate": 5.004019925700921e-07, "loss": 2.2116, "step": 7500 }, { "epoch": 0.914560770156438, "grad_norm": 0.7956872582435608, "learning_rate": 3.852238501678751e-07, "loss": 2.2349, "step": 7600 }, { "epoch": 0.9265944645006017, "grad_norm": 0.7709358930587769, "learning_rate": 2.8483400659624737e-07, "loss": 2.2159, "step": 7700 }, { "epoch": 0.9386281588447654, "grad_norm": 0.9854726195335388, "learning_rate": 1.993868695068457e-07, "loss": 2.2324, "step": 7800 }, { "epoch": 0.950661853188929, "grad_norm": 0.957122802734375, "learning_rate": 1.290138634672089e-07, "loss": 2.2256, "step": 7900 }, { "epoch": 0.9626955475330926, "grad_norm": 1.1688376665115356, "learning_rate": 7.382322781923301e-08, "loss": 2.2578, "step": 8000 }, { "epoch": 0.9747292418772563, "grad_norm": 0.8519200682640076, "learning_rate": 3.38998501983534e-08, "loss": 2.2476, "step": 8100 }, { "epoch": 0.98676293622142, "grad_norm": 0.7625014185905457, "learning_rate": 9.475062435719828e-09, "loss": 2.2162, "step": 8200 }, { "epoch": 0.9987966305655837, "grad_norm": 0.9132429361343384, "learning_rate": 9.306564984878919e-11, "loss": 2.2016, "step": 8300 }, { "epoch": 1.0, "step": 8310, "total_flos": 1.510104094015488e+17, "train_loss": 2.265305694231201, "train_runtime": 2607.4148, "train_samples_per_second": 6.374, "train_steps_per_second": 3.187 } ], "logging_steps": 100, "max_steps": 8310, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.510104094015488e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }