{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 1000, "global_step": 16570, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002414875633904854, "grad_norm": 7.886749267578125, "learning_rate": 0.0, "loss": 3.6713, "step": 1 }, { "epoch": 0.24148756339048538, "grad_norm": 6.218736171722412, "learning_rate": 9.647513278609368e-06, "loss": 2.8183, "step": 1000 }, { "epoch": 0.24148756339048538, "eval_cosine_accuracy": 0.9434220194816589, "eval_loss": 0.585781455039978, "eval_runtime": 34.9645, "eval_samples_per_second": 271.962, "eval_steps_per_second": 2.145, "step": 1000 }, { "epoch": 0.48297512678097076, "grad_norm": 6.299063682556152, "learning_rate": 1.9304683727667796e-05, "loss": 2.1179, "step": 2000 }, { "epoch": 0.48297512678097076, "eval_cosine_accuracy": 0.9497318267822266, "eval_loss": 0.5328050851821899, "eval_runtime": 34.2691, "eval_samples_per_second": 277.48, "eval_steps_per_second": 2.189, "step": 2000 }, { "epoch": 0.7244626901714561, "grad_norm": 9.58321475982666, "learning_rate": 1.9003971235376196e-05, "loss": 1.4826, "step": 3000 }, { "epoch": 0.7244626901714561, "eval_cosine_accuracy": 0.9538332223892212, "eval_loss": 0.49323785305023193, "eval_runtime": 34.0684, "eval_samples_per_second": 279.115, "eval_steps_per_second": 2.201, "step": 3000 }, { "epoch": 0.9659502535619415, "grad_norm": 15.6340970993042, "learning_rate": 1.79317376838038e-05, "loss": 0.949, "step": 4000 }, { "epoch": 0.9659502535619415, "eval_cosine_accuracy": 0.9546745419502258, "eval_loss": 0.4723624587059021, "eval_runtime": 33.7991, "eval_samples_per_second": 281.339, "eval_steps_per_second": 2.219, "step": 4000 }, { "epoch": 1.2073376780111031, "grad_norm": 6.6801862716674805, "learning_rate": 1.6858430825372976e-05, "loss": 1.1823, "step": 5000 }, { "epoch": 1.2073376780111031, "eval_cosine_accuracy": 0.960037887096405, "eval_loss": 0.4632853865623474, "eval_runtime": 34.2938, "eval_samples_per_second": 277.28, "eval_steps_per_second": 2.187, "step": 5000 }, { "epoch": 1.4487086652184407, "grad_norm": 6.39561653137207, "learning_rate": 1.578619727380058e-05, "loss": 1.1665, "step": 6000 }, { "epoch": 1.4487086652184407, "eval_cosine_accuracy": 0.9617204666137695, "eval_loss": 0.4432311952114105, "eval_runtime": 35.5061, "eval_samples_per_second": 267.813, "eval_steps_per_second": 2.112, "step": 6000 }, { "epoch": 1.6900796524257784, "grad_norm": 6.977872371673584, "learning_rate": 1.4712890415369757e-05, "loss": 1.1042, "step": 7000 }, { "epoch": 1.6900796524257784, "eval_cosine_accuracy": 0.9625617861747742, "eval_loss": 0.43880006670951843, "eval_runtime": 34.1311, "eval_samples_per_second": 278.603, "eval_steps_per_second": 2.197, "step": 7000 }, { "epoch": 1.9314506396331161, "grad_norm": 7.896651268005371, "learning_rate": 1.3640656863797362e-05, "loss": 1.0525, "step": 8000 }, { "epoch": 1.9314506396331161, "eval_cosine_accuracy": 0.9643495678901672, "eval_loss": 0.4344501495361328, "eval_runtime": 34.0153, "eval_samples_per_second": 279.55, "eval_steps_per_second": 2.205, "step": 8000 }, { "epoch": 2.1728216268404537, "grad_norm": 6.305137634277344, "learning_rate": 1.2567350005366534e-05, "loss": 0.9752, "step": 9000 }, { "epoch": 2.1728216268404537, "eval_cosine_accuracy": 0.9641392230987549, "eval_loss": 0.43460169434547424, "eval_runtime": 33.9573, "eval_samples_per_second": 280.028, "eval_steps_per_second": 2.209, "step": 9000 }, { "epoch": 2.4141926140477916, "grad_norm": 6.064307689666748, "learning_rate": 1.1495116453794142e-05, "loss": 0.9177, "step": 10000 }, { "epoch": 2.4141926140477916, "eval_cosine_accuracy": 0.9636133909225464, "eval_loss": 0.4276249408721924, "eval_runtime": 34.07, "eval_samples_per_second": 279.102, "eval_steps_per_second": 2.201, "step": 10000 }, { "epoch": 2.655563601255129, "grad_norm": 11.2512788772583, "learning_rate": 1.0421809595363315e-05, "loss": 0.9044, "step": 11000 }, { "epoch": 2.655563601255129, "eval_cosine_accuracy": 0.9652960300445557, "eval_loss": 0.42561274766921997, "eval_runtime": 34.6345, "eval_samples_per_second": 274.553, "eval_steps_per_second": 2.165, "step": 11000 }, { "epoch": 2.8969345884624667, "grad_norm": 5.970104694366455, "learning_rate": 9.34850273693249e-06, "loss": 0.8924, "step": 12000 }, { "epoch": 2.8969345884624667, "eval_cosine_accuracy": 0.9664528369903564, "eval_loss": 0.42230626940727234, "eval_runtime": 33.7931, "eval_samples_per_second": 281.389, "eval_steps_per_second": 2.219, "step": 12000 }, { "epoch": 3.1383055756698046, "grad_norm": 6.617201805114746, "learning_rate": 8.275195878501664e-06, "loss": 0.8378, "step": 13000 }, { "epoch": 3.1383055756698046, "eval_cosine_accuracy": 0.9656115174293518, "eval_loss": 0.42513710260391235, "eval_runtime": 33.8615, "eval_samples_per_second": 280.821, "eval_steps_per_second": 2.215, "step": 13000 }, { "epoch": 3.379676562877142, "grad_norm": 5.853105545043945, "learning_rate": 7.201889020070839e-06, "loss": 0.831, "step": 14000 }, { "epoch": 3.379676562877142, "eval_cosine_accuracy": 0.9662424921989441, "eval_loss": 0.42466071248054504, "eval_runtime": 33.8525, "eval_samples_per_second": 280.895, "eval_steps_per_second": 2.215, "step": 14000 }, { "epoch": 3.62104755008448, "grad_norm": 5.566030025482178, "learning_rate": 6.128582161640013e-06, "loss": 0.8012, "step": 15000 }, { "epoch": 3.62104755008448, "eval_cosine_accuracy": 0.9660322070121765, "eval_loss": 0.4249342978000641, "eval_runtime": 34.3475, "eval_samples_per_second": 276.847, "eval_steps_per_second": 2.184, "step": 15000 }, { "epoch": 3.8624185372918176, "grad_norm": 5.444942474365234, "learning_rate": 5.056348610067619e-06, "loss": 0.7952, "step": 16000 }, { "epoch": 3.8624185372918176, "eval_cosine_accuracy": 0.9661373496055603, "eval_loss": 0.42102375626564026, "eval_runtime": 33.7901, "eval_samples_per_second": 281.414, "eval_steps_per_second": 2.22, "step": 16000 } ], "logging_steps": 1000, "max_steps": 20705, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }