{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 1000, "global_step": 3360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08928571428571429, "grad_norm": 0.42009279659859, "learning_rate": 2.9761904761904763e-06, "loss": 0.5182, "step": 100 }, { "epoch": 0.17857142857142858, "grad_norm": 0.38633756820312576, "learning_rate": 5.9523809523809525e-06, "loss": 0.4851, "step": 200 }, { "epoch": 0.26785714285714285, "grad_norm": 0.45962638823829793, "learning_rate": 8.92857142857143e-06, "loss": 0.4816, "step": 300 }, { "epoch": 0.35714285714285715, "grad_norm": 0.4226066822293375, "learning_rate": 9.988952191691925e-06, "loss": 0.4868, "step": 400 }, { "epoch": 0.44642857142857145, "grad_norm": 0.3827199419252431, "learning_rate": 9.927604254015586e-06, "loss": 0.4895, "step": 500 }, { "epoch": 0.5357142857142857, "grad_norm": 0.446330637822688, "learning_rate": 9.81312123475006e-06, "loss": 0.49, "step": 600 }, { "epoch": 0.625, "grad_norm": 0.38997662785153187, "learning_rate": 9.646737621134112e-06, "loss": 0.4875, "step": 700 }, { "epoch": 0.7142857142857143, "grad_norm": 0.3660382818183833, "learning_rate": 9.430247552150673e-06, "loss": 0.4856, "step": 800 }, { "epoch": 0.8035714285714286, "grad_norm": 0.3644345773145445, "learning_rate": 9.165985472062245e-06, "loss": 0.4846, "step": 900 }, { "epoch": 0.8928571428571429, "grad_norm": 0.3563534770415374, "learning_rate": 8.856800957755e-06, "loss": 0.4844, "step": 1000 }, { "epoch": 0.8928571428571429, "eval_loss": 0.4723494052886963, "eval_runtime": 527.1023, "eval_samples_per_second": 3.777, "eval_steps_per_second": 0.472, "step": 1000 }, { "epoch": 0.9821428571428571, "grad_norm": 0.3915169737374782, "learning_rate": 8.50602799133199e-06, "loss": 0.4835, "step": 1100 }, { "epoch": 1.0714285714285714, "grad_norm": 0.3986100916298589, "learning_rate": 8.117449009293668e-06, "loss": 0.4388, "step": 1200 }, { "epoch": 1.1607142857142858, "grad_norm": 0.38571453527776084, "learning_rate": 7.69525411596865e-06, "loss": 0.4311, "step": 1300 }, { "epoch": 1.25, "grad_norm": 0.3530259346548572, "learning_rate": 7.243995901002312e-06, "loss": 0.427, "step": 1400 }, { "epoch": 1.3392857142857144, "grad_norm": 0.4069536036588859, "learning_rate": 6.768540348112908e-06, "loss": 0.429, "step": 1500 }, { "epoch": 1.4285714285714286, "grad_norm": 0.40040431306856655, "learning_rate": 6.274014364473274e-06, "loss": 0.4282, "step": 1600 }, { "epoch": 1.5178571428571428, "grad_norm": 0.34105714911676593, "learning_rate": 5.765750496516547e-06, "loss": 0.4318, "step": 1700 }, { "epoch": 1.6071428571428572, "grad_norm": 0.3679992579349683, "learning_rate": 5.249229428303486e-06, "loss": 0.4272, "step": 1800 }, { "epoch": 1.6964285714285714, "grad_norm": 0.37091945305243507, "learning_rate": 4.730020882499964e-06, "loss": 0.4292, "step": 1900 }, { "epoch": 1.7857142857142856, "grad_norm": 0.4210486903676345, "learning_rate": 4.213723561238074e-06, "loss": 0.4258, "step": 2000 }, { "epoch": 1.7857142857142856, "eval_loss": 0.472212553024292, "eval_runtime": 526.9195, "eval_samples_per_second": 3.779, "eval_steps_per_second": 0.473, "step": 2000 }, { "epoch": 1.875, "grad_norm": 0.4028542180886112, "learning_rate": 3.705904774487396e-06, "loss": 0.4307, "step": 2100 }, { "epoch": 1.9642857142857144, "grad_norm": 0.4621878586501629, "learning_rate": 3.2120404069325695e-06, "loss": 0.4271, "step": 2200 }, { "epoch": 2.0535714285714284, "grad_norm": 0.4678503190324282, "learning_rate": 2.737455870703155e-06, "loss": 0.3889, "step": 2300 }, { "epoch": 2.142857142857143, "grad_norm": 0.4575407722156448, "learning_rate": 2.2872686806712037e-06, "loss": 0.374, "step": 2400 }, { "epoch": 2.232142857142857, "grad_norm": 0.4241549942755108, "learning_rate": 1.8663332715355399e-06, "loss": 0.3694, "step": 2500 }, { "epoch": 2.3214285714285716, "grad_norm": 0.4261640971882884, "learning_rate": 1.4791886517382415e-06, "loss": 0.3695, "step": 2600 }, { "epoch": 2.4107142857142856, "grad_norm": 0.4640732620056673, "learning_rate": 1.1300094586688632e-06, "loss": 0.3719, "step": 2700 }, { "epoch": 2.5, "grad_norm": 0.3661173371298237, "learning_rate": 8.225609429353187e-07, "loss": 0.3749, "step": 2800 }, { "epoch": 2.5892857142857144, "grad_norm": 0.4357767645022839, "learning_rate": 5.601583671126532e-07, "loss": 0.3702, "step": 2900 }, { "epoch": 2.678571428571429, "grad_norm": 0.41225890414805616, "learning_rate": 3.4563125677897936e-07, "loss": 0.3709, "step": 3000 }, { "epoch": 2.678571428571429, "eval_loss": 0.48118627071380615, "eval_runtime": 526.9164, "eval_samples_per_second": 3.779, "eval_steps_per_second": 0.473, "step": 3000 }, { "epoch": 2.767857142857143, "grad_norm": 0.5194673179252723, "learning_rate": 1.8129288932490276e-07, "loss": 0.3689, "step": 3100 }, { "epoch": 2.857142857142857, "grad_norm": 0.4488783086501226, "learning_rate": 6.891534954310886e-08, "loss": 0.3666, "step": 3200 }, { "epoch": 2.946428571428571, "grad_norm": 0.3580886076427873, "learning_rate": 9.710420977340763e-09, "loss": 0.3702, "step": 3300 }, { "epoch": 3.0, "step": 3360, "total_flos": 3000194903834624.0, "train_loss": 0.4290710630871001, "train_runtime": 89946.115, "train_samples_per_second": 0.597, "train_steps_per_second": 0.037 } ], "logging_steps": 100, "max_steps": 3360, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3000194903834624.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }