{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.1008142690965492, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07754943776657619, "grad_norm": 0.41543584930406274, "learning_rate": 1.984472049689441e-05, "loss": 0.3571, "step": 50 }, { "epoch": 0.15509887553315238, "grad_norm": 0.2360007761618504, "learning_rate": 1.9689440993788823e-05, "loss": 0.1481, "step": 100 }, { "epoch": 0.23264831329972857, "grad_norm": 0.2459469865789585, "learning_rate": 1.9534161490683232e-05, "loss": 0.1413, "step": 150 }, { "epoch": 0.31019775106630476, "grad_norm": 0.1836701486527724, "learning_rate": 1.937888198757764e-05, "loss": 0.139, "step": 200 }, { "epoch": 0.38774718883288095, "grad_norm": 0.2053715213473303, "learning_rate": 1.922360248447205e-05, "loss": 0.1359, "step": 250 }, { "epoch": 0.46529662659945714, "grad_norm": 0.20210517044823975, "learning_rate": 1.906832298136646e-05, "loss": 0.1343, "step": 300 }, { "epoch": 0.5428460643660333, "grad_norm": 0.17133333163173686, "learning_rate": 1.891304347826087e-05, "loss": 0.1336, "step": 350 }, { "epoch": 0.6203955021326095, "grad_norm": 0.16513031282114732, "learning_rate": 1.875776397515528e-05, "loss": 0.1323, "step": 400 }, { "epoch": 0.6979449398991857, "grad_norm": 0.16414324471781971, "learning_rate": 1.8602484472049693e-05, "loss": 0.1318, "step": 450 }, { "epoch": 0.7754943776657619, "grad_norm": 0.16672201846671922, "learning_rate": 1.84472049689441e-05, "loss": 0.1307, "step": 500 }, { "epoch": 0.8530438154323381, "grad_norm": 0.1588831815209266, "learning_rate": 1.829192546583851e-05, "loss": 0.1301, "step": 550 }, { "epoch": 0.9305932531989143, "grad_norm": 0.17229438485787515, "learning_rate": 1.8136645962732923e-05, "loss": 0.13, "step": 600 }, { "epoch": 1.0077549437766575, "grad_norm": 0.1626649495069495, "learning_rate": 1.798136645962733e-05, "loss": 0.1284, "step": 650 }, { "epoch": 1.0853043815432337, "grad_norm": 0.16302373242598406, "learning_rate": 1.782608695652174e-05, "loss": 0.1256, "step": 700 }, { "epoch": 1.16285381930981, "grad_norm": 0.15938032051749196, "learning_rate": 1.767080745341615e-05, "loss": 0.1252, "step": 750 }, { "epoch": 1.240403257076386, "grad_norm": 0.19138770209482472, "learning_rate": 1.751552795031056e-05, "loss": 0.1244, "step": 800 }, { "epoch": 1.3179526948429623, "grad_norm": 0.15984339587089894, "learning_rate": 1.736024844720497e-05, "loss": 0.1253, "step": 850 }, { "epoch": 1.3955021326095385, "grad_norm": 0.1502502706654219, "learning_rate": 1.720496894409938e-05, "loss": 0.1248, "step": 900 }, { "epoch": 1.4730515703761147, "grad_norm": 0.13661135477508957, "learning_rate": 1.704968944099379e-05, "loss": 0.125, "step": 950 }, { "epoch": 1.5506010081426909, "grad_norm": 0.24839381800982097, "learning_rate": 1.68944099378882e-05, "loss": 0.1253, "step": 1000 }, { "epoch": 1.628150445909267, "grad_norm": 0.12815184233515442, "learning_rate": 1.673913043478261e-05, "loss": 0.1243, "step": 1050 }, { "epoch": 1.7056998836758432, "grad_norm": 0.13153520379094585, "learning_rate": 1.658385093167702e-05, "loss": 0.1237, "step": 1100 }, { "epoch": 1.7832493214424194, "grad_norm": 0.1189084339669079, "learning_rate": 1.642857142857143e-05, "loss": 0.1245, "step": 1150 }, { "epoch": 1.8607987592089956, "grad_norm": 0.15491708781159905, "learning_rate": 1.627329192546584e-05, "loss": 0.1235, "step": 1200 }, { "epoch": 1.9383481969755718, "grad_norm": 0.12739351593431672, "learning_rate": 1.611801242236025e-05, "loss": 0.1243, "step": 1250 }, { "epoch": 2.015509887553315, "grad_norm": 0.12465194041174449, "learning_rate": 1.596273291925466e-05, "loss": 0.1219, "step": 1300 }, { "epoch": 2.0930593253198913, "grad_norm": 0.1404295274618665, "learning_rate": 1.580745341614907e-05, "loss": 0.1186, "step": 1350 }, { "epoch": 2.1706087630864674, "grad_norm": 0.1359342551816161, "learning_rate": 1.565217391304348e-05, "loss": 0.1179, "step": 1400 }, { "epoch": 2.2481582008530436, "grad_norm": 0.15332233562241915, "learning_rate": 1.549689440993789e-05, "loss": 0.1185, "step": 1450 }, { "epoch": 2.32570763861962, "grad_norm": 0.11859966428735469, "learning_rate": 1.5341614906832298e-05, "loss": 0.1185, "step": 1500 }, { "epoch": 2.403257076386196, "grad_norm": 0.1493931915889296, "learning_rate": 1.5186335403726709e-05, "loss": 0.1186, "step": 1550 }, { "epoch": 2.480806514152772, "grad_norm": 0.1319324405407719, "learning_rate": 1.5031055900621118e-05, "loss": 0.1189, "step": 1600 }, { "epoch": 2.5583559519193484, "grad_norm": 0.12024679968154829, "learning_rate": 1.4875776397515529e-05, "loss": 0.1187, "step": 1650 }, { "epoch": 2.6359053896859246, "grad_norm": 0.11739796193835754, "learning_rate": 1.472049689440994e-05, "loss": 0.1187, "step": 1700 }, { "epoch": 2.7134548274525008, "grad_norm": 0.13178912063786213, "learning_rate": 1.456521739130435e-05, "loss": 0.1187, "step": 1750 }, { "epoch": 2.791004265219077, "grad_norm": 0.11853885559187641, "learning_rate": 1.4409937888198759e-05, "loss": 0.119, "step": 1800 }, { "epoch": 2.868553702985653, "grad_norm": 0.12827528414635014, "learning_rate": 1.425465838509317e-05, "loss": 0.1187, "step": 1850 }, { "epoch": 2.9461031407522293, "grad_norm": 0.13153122815676796, "learning_rate": 1.409937888198758e-05, "loss": 0.1188, "step": 1900 }, { "epoch": 3.023264831329973, "grad_norm": 0.15315655570709347, "learning_rate": 1.3944099378881988e-05, "loss": 0.1156, "step": 1950 }, { "epoch": 3.1008142690965492, "grad_norm": 0.11997442542975086, "learning_rate": 1.3788819875776398e-05, "loss": 0.1105, "step": 2000 } ], "logging_steps": 50, "max_steps": 6440, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0334530111995904e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }