{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 1000,
  "global_step": 3360,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08928571428571429,
      "grad_norm": 0.42009279659859,
      "learning_rate": 2.9761904761904763e-06,
      "loss": 0.5182,
      "step": 100
    },
    {
      "epoch": 0.17857142857142858,
      "grad_norm": 0.38633756820312576,
      "learning_rate": 5.9523809523809525e-06,
      "loss": 0.4851,
      "step": 200
    },
    {
      "epoch": 0.26785714285714285,
      "grad_norm": 0.45962638823829793,
      "learning_rate": 8.92857142857143e-06,
      "loss": 0.4816,
      "step": 300
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 0.4226066822293375,
      "learning_rate": 9.988952191691925e-06,
      "loss": 0.4868,
      "step": 400
    },
    {
      "epoch": 0.44642857142857145,
      "grad_norm": 0.3827199419252431,
      "learning_rate": 9.927604254015586e-06,
      "loss": 0.4895,
      "step": 500
    },
    {
      "epoch": 0.5357142857142857,
      "grad_norm": 0.446330637822688,
      "learning_rate": 9.81312123475006e-06,
      "loss": 0.49,
      "step": 600
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.38997662785153187,
      "learning_rate": 9.646737621134112e-06,
      "loss": 0.4875,
      "step": 700
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.3660382818183833,
      "learning_rate": 9.430247552150673e-06,
      "loss": 0.4856,
      "step": 800
    },
    {
      "epoch": 0.8035714285714286,
      "grad_norm": 0.3644345773145445,
      "learning_rate": 9.165985472062245e-06,
      "loss": 0.4846,
      "step": 900
    },
    {
      "epoch": 0.8928571428571429,
      "grad_norm": 0.3563534770415374,
      "learning_rate": 8.856800957755e-06,
      "loss": 0.4844,
      "step": 1000
    },
    {
      "epoch": 0.8928571428571429,
      "eval_loss": 0.4723494052886963,
      "eval_runtime": 527.1023,
      "eval_samples_per_second": 3.777,
      "eval_steps_per_second": 0.472,
      "step": 1000
    },
    {
      "epoch": 0.9821428571428571,
      "grad_norm": 0.3915169737374782,
      "learning_rate": 8.50602799133199e-06,
      "loss": 0.4835,
      "step": 1100
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 0.3986100916298589,
      "learning_rate": 8.117449009293668e-06,
      "loss": 0.4388,
      "step": 1200
    },
    {
      "epoch": 1.1607142857142858,
      "grad_norm": 0.38571453527776084,
      "learning_rate": 7.69525411596865e-06,
      "loss": 0.4311,
      "step": 1300
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.3530259346548572,
      "learning_rate": 7.243995901002312e-06,
      "loss": 0.427,
      "step": 1400
    },
    {
      "epoch": 1.3392857142857144,
      "grad_norm": 0.4069536036588859,
      "learning_rate": 6.768540348112908e-06,
      "loss": 0.429,
      "step": 1500
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.40040431306856655,
      "learning_rate": 6.274014364473274e-06,
      "loss": 0.4282,
      "step": 1600
    },
    {
      "epoch": 1.5178571428571428,
      "grad_norm": 0.34105714911676593,
      "learning_rate": 5.765750496516547e-06,
      "loss": 0.4318,
      "step": 1700
    },
    {
      "epoch": 1.6071428571428572,
      "grad_norm": 0.3679992579349683,
      "learning_rate": 5.249229428303486e-06,
      "loss": 0.4272,
      "step": 1800
    },
    {
      "epoch": 1.6964285714285714,
      "grad_norm": 0.37091945305243507,
      "learning_rate": 4.730020882499964e-06,
      "loss": 0.4292,
      "step": 1900
    },
    {
      "epoch": 1.7857142857142856,
      "grad_norm": 0.4210486903676345,
      "learning_rate": 4.213723561238074e-06,
      "loss": 0.4258,
      "step": 2000
    },
    {
      "epoch": 1.7857142857142856,
      "eval_loss": 0.472212553024292,
      "eval_runtime": 526.9195,
      "eval_samples_per_second": 3.779,
      "eval_steps_per_second": 0.473,
      "step": 2000
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.4028542180886112,
      "learning_rate": 3.705904774487396e-06,
      "loss": 0.4307,
      "step": 2100
    },
    {
      "epoch": 1.9642857142857144,
      "grad_norm": 0.4621878586501629,
      "learning_rate": 3.2120404069325695e-06,
      "loss": 0.4271,
      "step": 2200
    },
    {
      "epoch": 2.0535714285714284,
      "grad_norm": 0.4678503190324282,
      "learning_rate": 2.737455870703155e-06,
      "loss": 0.3889,
      "step": 2300
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.4575407722156448,
      "learning_rate": 2.2872686806712037e-06,
      "loss": 0.374,
      "step": 2400
    },
    {
      "epoch": 2.232142857142857,
      "grad_norm": 0.4241549942755108,
      "learning_rate": 1.8663332715355399e-06,
      "loss": 0.3694,
      "step": 2500
    },
    {
      "epoch": 2.3214285714285716,
      "grad_norm": 0.4261640971882884,
      "learning_rate": 1.4791886517382415e-06,
      "loss": 0.3695,
      "step": 2600
    },
    {
      "epoch": 2.4107142857142856,
      "grad_norm": 0.4640732620056673,
      "learning_rate": 1.1300094586688632e-06,
      "loss": 0.3719,
      "step": 2700
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.3661173371298237,
      "learning_rate": 8.225609429353187e-07,
      "loss": 0.3749,
      "step": 2800
    },
    {
      "epoch": 2.5892857142857144,
      "grad_norm": 0.4357767645022839,
      "learning_rate": 5.601583671126532e-07,
      "loss": 0.3702,
      "step": 2900
    },
    {
      "epoch": 2.678571428571429,
      "grad_norm": 0.41225890414805616,
      "learning_rate": 3.4563125677897936e-07,
      "loss": 0.3709,
      "step": 3000
    },
    {
      "epoch": 2.678571428571429,
      "eval_loss": 0.48118627071380615,
      "eval_runtime": 526.9164,
      "eval_samples_per_second": 3.779,
      "eval_steps_per_second": 0.473,
      "step": 3000
    },
    {
      "epoch": 2.767857142857143,
      "grad_norm": 0.5194673179252723,
      "learning_rate": 1.8129288932490276e-07,
      "loss": 0.3689,
      "step": 3100
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.4488783086501226,
      "learning_rate": 6.891534954310886e-08,
      "loss": 0.3666,
      "step": 3200
    },
    {
      "epoch": 2.946428571428571,
      "grad_norm": 0.3580886076427873,
      "learning_rate": 9.710420977340763e-09,
      "loss": 0.3702,
      "step": 3300
    },
    {
      "epoch": 3.0,
      "step": 3360,
      "total_flos": 3000194903834624.0,
      "train_loss": 0.4290710630871001,
      "train_runtime": 89946.115,
      "train_samples_per_second": 0.597,
      "train_steps_per_second": 0.037
    }
  ],
  "logging_steps": 100,
  "max_steps": 3360,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3000194903834624.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}