{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9976580796252925, "eval_steps": 500, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0624512099921936, "grad_norm": 5.695734079803832, "learning_rate": 5e-06, "loss": 0.7013, "step": 10 }, { "epoch": 0.1249024199843872, "grad_norm": 1.6672905613542388, "learning_rate": 5e-06, "loss": 0.6275, "step": 20 }, { "epoch": 0.1873536299765808, "grad_norm": 1.231437836117211, "learning_rate": 5e-06, "loss": 0.6011, "step": 30 }, { "epoch": 0.2498048399687744, "grad_norm": 0.6660382816468988, "learning_rate": 5e-06, "loss": 0.5905, "step": 40 }, { "epoch": 0.312256049960968, "grad_norm": 0.6878662625368892, "learning_rate": 5e-06, "loss": 0.5739, "step": 50 }, { "epoch": 0.3747072599531616, "grad_norm": 0.8735325294404173, "learning_rate": 5e-06, "loss": 0.5545, "step": 60 }, { "epoch": 0.4371584699453552, "grad_norm": 0.865826564748345, "learning_rate": 5e-06, "loss": 0.5532, "step": 70 }, { "epoch": 0.4996096799375488, "grad_norm": 0.6254769035827942, "learning_rate": 5e-06, "loss": 0.5474, "step": 80 }, { "epoch": 0.5620608899297423, "grad_norm": 0.633780736055842, "learning_rate": 5e-06, "loss": 0.5477, "step": 90 }, { "epoch": 0.624512099921936, "grad_norm": 0.429646086801005, "learning_rate": 5e-06, "loss": 0.5536, "step": 100 }, { "epoch": 0.6869633099141296, "grad_norm": 0.47226853053932544, "learning_rate": 5e-06, "loss": 0.547, "step": 110 }, { "epoch": 0.7494145199063232, "grad_norm": 0.5108448433850594, "learning_rate": 5e-06, "loss": 0.5456, "step": 120 }, { "epoch": 0.8118657298985168, "grad_norm": 0.5120890633578824, "learning_rate": 5e-06, "loss": 0.5338, "step": 130 }, { "epoch": 0.8743169398907104, "grad_norm": 0.5054688976629029, "learning_rate": 5e-06, "loss": 0.5415, "step": 140 }, { "epoch": 0.936768149882904, "grad_norm": 0.4546611078615034, "learning_rate": 5e-06, "loss": 0.5266, "step": 150 }, { "epoch": 0.9992193598750976, "grad_norm": 0.5777441256544157, "learning_rate": 5e-06, "loss": 0.5327, "step": 160 }, { "epoch": 0.9992193598750976, "eval_loss": 0.5309467315673828, "eval_runtime": 112.9263, "eval_samples_per_second": 38.193, "eval_steps_per_second": 0.602, "step": 160 }, { "epoch": 1.0616705698672912, "grad_norm": 0.56357097034926, "learning_rate": 5e-06, "loss": 0.5284, "step": 170 }, { "epoch": 1.1241217798594847, "grad_norm": 0.6406606004904944, "learning_rate": 5e-06, "loss": 0.491, "step": 180 }, { "epoch": 1.1865729898516784, "grad_norm": 0.4470486896482656, "learning_rate": 5e-06, "loss": 0.4924, "step": 190 }, { "epoch": 1.249024199843872, "grad_norm": 0.43090473351959857, "learning_rate": 5e-06, "loss": 0.4824, "step": 200 }, { "epoch": 1.3114754098360657, "grad_norm": 0.43197317838800425, "learning_rate": 5e-06, "loss": 0.4911, "step": 210 }, { "epoch": 1.3739266198282591, "grad_norm": 0.49491142532222065, "learning_rate": 5e-06, "loss": 0.4888, "step": 220 }, { "epoch": 1.4363778298204528, "grad_norm": 0.4582690739604794, "learning_rate": 5e-06, "loss": 0.4922, "step": 230 }, { "epoch": 1.4988290398126463, "grad_norm": 0.5179775405720543, "learning_rate": 5e-06, "loss": 0.4902, "step": 240 }, { "epoch": 1.56128024980484, "grad_norm": 0.46044165550988314, "learning_rate": 5e-06, "loss": 0.4853, "step": 250 }, { "epoch": 1.6237314597970336, "grad_norm": 0.49784346733707135, "learning_rate": 5e-06, "loss": 0.4885, "step": 260 }, { "epoch": 1.6861826697892273, "grad_norm": 0.45069349680457893, "learning_rate": 5e-06, "loss": 0.486, "step": 270 }, { "epoch": 1.748633879781421, "grad_norm": 0.44312447239988956, "learning_rate": 5e-06, "loss": 0.4865, "step": 280 }, { "epoch": 1.8110850897736144, "grad_norm": 0.5540661328732004, "learning_rate": 5e-06, "loss": 0.4917, "step": 290 }, { "epoch": 1.8735362997658078, "grad_norm": 0.4234046253558461, "learning_rate": 5e-06, "loss": 0.4828, "step": 300 }, { "epoch": 1.9359875097580015, "grad_norm": 0.4388004925712562, "learning_rate": 5e-06, "loss": 0.4789, "step": 310 }, { "epoch": 1.9984387197501952, "grad_norm": 0.5610659198792445, "learning_rate": 5e-06, "loss": 0.4787, "step": 320 }, { "epoch": 1.9984387197501952, "eval_loss": 0.5198609828948975, "eval_runtime": 110.8922, "eval_samples_per_second": 38.894, "eval_steps_per_second": 0.613, "step": 320 }, { "epoch": 2.060889929742389, "grad_norm": 0.5484717697155285, "learning_rate": 5e-06, "loss": 0.4873, "step": 330 }, { "epoch": 2.1233411397345825, "grad_norm": 0.45228929433011156, "learning_rate": 5e-06, "loss": 0.4362, "step": 340 }, { "epoch": 2.185792349726776, "grad_norm": 0.5293854629631661, "learning_rate": 5e-06, "loss": 0.4425, "step": 350 }, { "epoch": 2.2482435597189694, "grad_norm": 0.6002239962428959, "learning_rate": 5e-06, "loss": 0.4355, "step": 360 }, { "epoch": 2.310694769711163, "grad_norm": 0.5258587782431947, "learning_rate": 5e-06, "loss": 0.4384, "step": 370 }, { "epoch": 2.3731459797033567, "grad_norm": 0.5243329962737817, "learning_rate": 5e-06, "loss": 0.4381, "step": 380 }, { "epoch": 2.4355971896955504, "grad_norm": 0.5890309295737689, "learning_rate": 5e-06, "loss": 0.4425, "step": 390 }, { "epoch": 2.498048399687744, "grad_norm": 0.5358514650116517, "learning_rate": 5e-06, "loss": 0.4439, "step": 400 }, { "epoch": 2.5604996096799377, "grad_norm": 0.4710746244266771, "learning_rate": 5e-06, "loss": 0.4434, "step": 410 }, { "epoch": 2.6229508196721314, "grad_norm": 0.4783994299645445, "learning_rate": 5e-06, "loss": 0.4376, "step": 420 }, { "epoch": 2.6854020296643246, "grad_norm": 0.5532060358770908, "learning_rate": 5e-06, "loss": 0.4418, "step": 430 }, { "epoch": 2.7478532396565183, "grad_norm": 0.5357167310213664, "learning_rate": 5e-06, "loss": 0.4489, "step": 440 }, { "epoch": 2.810304449648712, "grad_norm": 0.49033181773774204, "learning_rate": 5e-06, "loss": 0.4446, "step": 450 }, { "epoch": 2.8727556596409056, "grad_norm": 0.4295027030082963, "learning_rate": 5e-06, "loss": 0.4427, "step": 460 }, { "epoch": 2.9352068696330993, "grad_norm": 0.5261820799303164, "learning_rate": 5e-06, "loss": 0.4448, "step": 470 }, { "epoch": 2.9976580796252925, "grad_norm": 0.45577050194466817, "learning_rate": 5e-06, "loss": 0.443, "step": 480 }, { "epoch": 2.9976580796252925, "eval_loss": 0.5204988121986389, "eval_runtime": 106.3066, "eval_samples_per_second": 40.571, "eval_steps_per_second": 0.64, "step": 480 }, { "epoch": 2.9976580796252925, "step": 480, "total_flos": 803808498155520.0, "train_loss": 0.5004926055669785, "train_runtime": 16013.1849, "train_samples_per_second": 15.351, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 480, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 803808498155520.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }