{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021362956633198035, "grad_norm": 0.10347568243741989, "learning_rate": 6.666666666666667e-06, "loss": 2.4928, "step": 100 }, { "epoch": 0.04272591326639607, "grad_norm": 0.21450009942054749, "learning_rate": 1.3333333333333333e-05, "loss": 2.482, "step": 200 }, { "epoch": 0.0640888698995941, "grad_norm": 0.33770546317100525, "learning_rate": 2e-05, "loss": 2.4373, "step": 300 }, { "epoch": 0.08545182653279214, "grad_norm": 0.3730420470237732, "learning_rate": 1.9974299762831266e-05, "loss": 2.3748, "step": 400 }, { "epoch": 0.10681478316599018, "grad_norm": 0.40363097190856934, "learning_rate": 1.9897331151763162e-05, "loss": 2.3395, "step": 500 }, { "epoch": 0.1281777397991882, "grad_norm": 0.4664216935634613, "learning_rate": 1.9769489789107492e-05, "loss": 2.3456, "step": 600 }, { "epoch": 0.14954069643238624, "grad_norm": 0.4879497289657593, "learning_rate": 1.9591432785532302e-05, "loss": 2.3481, "step": 700 }, { "epoch": 0.17090365306558428, "grad_norm": 0.4861871004104614, "learning_rate": 1.9364075362481876e-05, "loss": 2.3305, "step": 800 }, { "epoch": 0.19226660969878231, "grad_norm": 0.5032399296760559, "learning_rate": 1.908858614789511e-05, "loss": 2.2901, "step": 900 }, { "epoch": 0.21362956633198035, "grad_norm": 0.5441191792488098, "learning_rate": 1.8766381169402465e-05, "loss": 2.3201, "step": 1000 }, { "epoch": 0.2349925229651784, "grad_norm": 0.4986329674720764, "learning_rate": 1.839911657587678e-05, "loss": 2.2981, "step": 1100 }, { "epoch": 0.2563554795983764, "grad_norm": 0.506102979183197, "learning_rate": 1.7988680124749516e-05, "loss": 2.2633, "step": 1200 }, { "epoch": 0.27771843623157444, "grad_norm": 0.5421627163887024, "learning_rate": 1.7537181478848007e-05, "loss": 2.2985, "step": 1300 }, { "epoch": 0.2990813928647725, "grad_norm": 0.6916339993476868, "learning_rate": 1.704694136262846e-05, "loss": 2.2716, "step": 1400 }, { "epoch": 0.3204443494979705, "grad_norm": 0.7127671837806702, "learning_rate": 1.6520479633542167e-05, "loss": 2.2697, "step": 1500 }, { "epoch": 0.34180730613116855, "grad_norm": 0.6163765788078308, "learning_rate": 1.5960502329848683e-05, "loss": 2.3061, "step": 1600 }, { "epoch": 0.3631702627643666, "grad_norm": 0.6348161697387695, "learning_rate": 1.5369887761450813e-05, "loss": 2.2746, "step": 1700 }, { "epoch": 0.38453321939756463, "grad_norm": 0.6660177111625671, "learning_rate": 1.475167171524519e-05, "loss": 2.2785, "step": 1800 }, { "epoch": 0.40589617603076267, "grad_norm": 0.7842796444892883, "learning_rate": 1.4109031851033612e-05, "loss": 2.2649, "step": 1900 }, { "epoch": 0.4272591326639607, "grad_norm": 0.6408088803291321, "learning_rate": 1.344527136820094e-05, "loss": 2.253, "step": 2000 }, { "epoch": 0.44862208929715874, "grad_norm": 0.6853693127632141, "learning_rate": 1.2763802027113587e-05, "loss": 2.2758, "step": 2100 }, { "epoch": 0.4699850459303568, "grad_norm": 0.5584832429885864, "learning_rate": 1.2068126612509384e-05, "loss": 2.2917, "step": 2200 }, { "epoch": 0.4913480025635548, "grad_norm": 0.5917975306510925, "learning_rate": 1.1361820929017884e-05, "loss": 2.2838, "step": 2300 }, { "epoch": 0.5127109591967528, "grad_norm": 0.6518538594245911, "learning_rate": 1.0648515421354968e-05, "loss": 2.2407, "step": 2400 }, { "epoch": 0.5340739158299509, "grad_norm": 0.6702529788017273, "learning_rate": 9.931876513664764e-06, "loss": 2.2241, "step": 2500 }, { "epoch": 0.5554368724631489, "grad_norm": 0.7369861602783203, "learning_rate": 9.215587763925683e-06, "loss": 2.2469, "step": 2600 }, { "epoch": 0.576799829096347, "grad_norm": 0.8747798204421997, "learning_rate": 8.503330930287628e-06, "loss": 2.2703, "step": 2700 }, { "epoch": 0.598162785729545, "grad_norm": 0.8590816855430603, "learning_rate": 7.798767046660521e-06, "loss": 2.2236, "step": 2800 }, { "epoch": 0.619525742362743, "grad_norm": 0.6715850830078125, "learning_rate": 7.1055176048263085e-06, "loss": 2.2296, "step": 2900 }, { "epoch": 0.640888698995941, "grad_norm": 0.8593222498893738, "learning_rate": 6.42714593979943e-06, "loss": 2.2466, "step": 3000 }, { "epoch": 0.6622516556291391, "grad_norm": 0.8626837134361267, "learning_rate": 5.767138914115842e-06, "loss": 2.2454, "step": 3100 }, { "epoch": 0.6836146122623371, "grad_norm": 0.7464703917503357, "learning_rate": 5.128888995194161e-06, "loss": 2.267, "step": 3200 }, { "epoch": 0.7049775688955351, "grad_norm": 0.813230037689209, "learning_rate": 4.515676817892231e-06, "loss": 2.2443, "step": 3300 }, { "epoch": 0.7263405255287332, "grad_norm": 0.6884306073188782, "learning_rate": 3.930654321888331e-06, "loss": 2.2186, "step": 3400 }, { "epoch": 0.7477034821619312, "grad_norm": 0.6503286957740784, "learning_rate": 3.3768285505617404e-06, "loss": 2.2633, "step": 3500 }, { "epoch": 0.7690664387951293, "grad_norm": 0.7110804319381714, "learning_rate": 2.8570461946470963e-06, "loss": 2.2259, "step": 3600 }, { "epoch": 0.7904293954283272, "grad_norm": 0.6175870299339294, "learning_rate": 2.3739789601090347e-06, "loss": 2.2678, "step": 3700 }, { "epoch": 0.8117923520615253, "grad_norm": 0.7111446261405945, "learning_rate": 1.9301098354467707e-06, "loss": 2.2305, "step": 3800 }, { "epoch": 0.8331553086947233, "grad_norm": 0.6799335479736328, "learning_rate": 1.5277203290154342e-06, "loss": 2.2314, "step": 3900 }, { "epoch": 0.8545182653279214, "grad_norm": 0.6913052797317505, "learning_rate": 1.1688787419649338e-06, "loss": 2.2394, "step": 4000 }, { "epoch": 0.8758812219611194, "grad_norm": 0.6040618419647217, "learning_rate": 8.554295370739074e-07, "loss": 2.2329, "step": 4100 }, { "epoch": 0.8972441785943175, "grad_norm": 1.026239275932312, "learning_rate": 5.889838581235641e-07, "loss": 2.2698, "step": 4200 }, { "epoch": 0.9186071352275155, "grad_norm": 0.7688636183738708, "learning_rate": 3.7091124854222613e-07, "loss": 2.2419, "step": 4300 }, { "epoch": 0.9399700918607136, "grad_norm": 0.8546819090843201, "learning_rate": 2.0233261188714491e-07, "loss": 2.2185, "step": 4400 }, { "epoch": 0.9613330484939115, "grad_norm": 0.940643310546875, "learning_rate": 8.411445034704258e-08, "loss": 2.1983, "step": 4500 }, { "epoch": 0.9826960051271096, "grad_norm": 0.8350269794464111, "learning_rate": 1.6864410879763316e-08, "loss": 2.2649, "step": 4600 }, { "epoch": 1.0, "step": 4681, "total_flos": 8.50597775081472e+16, "train_loss": 2.2806778850526896, "train_runtime": 1455.669, "train_samples_per_second": 6.431, "train_steps_per_second": 3.216 } ], "logging_steps": 100, "max_steps": 4681, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.50597775081472e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }