{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 28450,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.351493848857645,
      "grad_norm": 12501.5126953125,
      "learning_rate": 0.0005993999999999999,
      "loss": 0.6616,
      "step": 1000
    },
    {
      "epoch": 0.70298769771529,
      "grad_norm": 11799.552734375,
      "learning_rate": 0.0005781639344262295,
      "loss": 0.5298,
      "step": 2000
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.39387861735030005,
      "eval_loss": 1.9969476461410522,
      "eval_runtime": 10.2477,
      "eval_samples_per_second": 58.843,
      "eval_steps_per_second": 1.854,
      "step": 2845
    },
    {
      "epoch": 1.054481546572935,
      "grad_norm": 10105.4248046875,
      "learning_rate": 0.0005563060109289616,
      "loss": 0.4896,
      "step": 3000
    },
    {
      "epoch": 1.40597539543058,
      "grad_norm": 9273.8095703125,
      "learning_rate": 0.0005344480874316939,
      "loss": 0.4687,
      "step": 4000
    },
    {
      "epoch": 1.757469244288225,
      "grad_norm": 8524.5703125,
      "learning_rate": 0.0005125901639344262,
      "loss": 0.4512,
      "step": 5000
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.43839510860569947,
      "eval_loss": 1.821701169013977,
      "eval_runtime": 9.7306,
      "eval_samples_per_second": 61.97,
      "eval_steps_per_second": 1.953,
      "step": 5690
    },
    {
      "epoch": 2.10896309314587,
      "grad_norm": 8911.6494140625,
      "learning_rate": 0.0004907322404371584,
      "loss": 0.437,
      "step": 6000
    },
    {
      "epoch": 2.460456942003515,
      "grad_norm": 6983.92236328125,
      "learning_rate": 0.00046887431693989066,
      "loss": 0.4253,
      "step": 7000
    },
    {
      "epoch": 2.81195079086116,
      "grad_norm": 7203.26025390625,
      "learning_rate": 0.00044701639344262294,
      "loss": 0.4173,
      "step": 8000
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.4640041800131761,
      "eval_loss": 1.729548692703247,
      "eval_runtime": 9.8999,
      "eval_samples_per_second": 60.909,
      "eval_steps_per_second": 1.919,
      "step": 8535
    },
    {
      "epoch": 3.1634446397188047,
      "grad_norm": 7449.33203125,
      "learning_rate": 0.00042515846994535517,
      "loss": 0.4092,
      "step": 9000
    },
    {
      "epoch": 3.51493848857645,
      "grad_norm": 7554.9697265625,
      "learning_rate": 0.0004033005464480874,
      "loss": 0.4028,
      "step": 10000
    },
    {
      "epoch": 3.866432337434095,
      "grad_norm": 7199.42626953125,
      "learning_rate": 0.0003814426229508197,
      "loss": 0.3977,
      "step": 11000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.4796013409793823,
      "eval_loss": 1.6785619258880615,
      "eval_runtime": 10.0375,
      "eval_samples_per_second": 60.075,
      "eval_steps_per_second": 1.893,
      "step": 11380
    },
    {
      "epoch": 4.21792618629174,
      "grad_norm": 7292.9921875,
      "learning_rate": 0.0003595846994535519,
      "loss": 0.392,
      "step": 12000
    },
    {
      "epoch": 4.569420035149385,
      "grad_norm": 6805.3798828125,
      "learning_rate": 0.00033772677595628414,
      "loss": 0.3879,
      "step": 13000
    },
    {
      "epoch": 4.92091388400703,
      "grad_norm": 7368.5224609375,
      "learning_rate": 0.00031586885245901637,
      "loss": 0.3841,
      "step": 14000
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.49063553725177117,
      "eval_loss": 1.6362360715866089,
      "eval_runtime": 9.729,
      "eval_samples_per_second": 61.98,
      "eval_steps_per_second": 1.953,
      "step": 14225
    },
    {
      "epoch": 5.272407732864675,
      "grad_norm": 7300.4189453125,
      "learning_rate": 0.0002940109289617486,
      "loss": 0.3799,
      "step": 15000
    },
    {
      "epoch": 5.62390158172232,
      "grad_norm": 6178.0908203125,
      "learning_rate": 0.0002721530054644809,
      "loss": 0.3764,
      "step": 16000
    },
    {
      "epoch": 5.975395430579965,
      "grad_norm": 6489.77490234375,
      "learning_rate": 0.0002502950819672131,
      "loss": 0.3747,
      "step": 17000
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.4991123962704417,
      "eval_loss": 1.6064575910568237,
      "eval_runtime": 9.7095,
      "eval_samples_per_second": 62.104,
      "eval_steps_per_second": 1.957,
      "step": 17070
    },
    {
      "epoch": 6.3268892794376095,
      "grad_norm": 7217.43701171875,
      "learning_rate": 0.00022843715846994535,
      "loss": 0.3693,
      "step": 18000
    },
    {
      "epoch": 6.678383128295255,
      "grad_norm": 6669.8017578125,
      "learning_rate": 0.00020657923497267757,
      "loss": 0.3677,
      "step": 19000
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.5070570175865616,
      "eval_loss": 1.5771993398666382,
      "eval_runtime": 9.7124,
      "eval_samples_per_second": 62.086,
      "eval_steps_per_second": 1.956,
      "step": 19915
    },
    {
      "epoch": 7.0298769771529,
      "grad_norm": 7067.46533203125,
      "learning_rate": 0.00018472131147540983,
      "loss": 0.3655,
      "step": 20000
    },
    {
      "epoch": 7.381370826010545,
      "grad_norm": 6705.837890625,
      "learning_rate": 0.00016286338797814206,
      "loss": 0.3605,
      "step": 21000
    },
    {
      "epoch": 7.73286467486819,
      "grad_norm": 6395.66162109375,
      "learning_rate": 0.00014100546448087432,
      "loss": 0.3586,
      "step": 22000
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.5143298510708039,
      "eval_loss": 1.5533959865570068,
      "eval_runtime": 9.7631,
      "eval_samples_per_second": 61.763,
      "eval_steps_per_second": 1.946,
      "step": 22760
    },
    {
      "epoch": 8.084358523725834,
      "grad_norm": 7270.5849609375,
      "learning_rate": 0.00011914754098360655,
      "loss": 0.3549,
      "step": 23000
    },
    {
      "epoch": 8.43585237258348,
      "grad_norm": 7109.36328125,
      "learning_rate": 9.728961748633879e-05,
      "loss": 0.3524,
      "step": 24000
    },
    {
      "epoch": 8.787346221441124,
      "grad_norm": 6891.5283203125,
      "learning_rate": 7.543169398907103e-05,
      "loss": 0.3504,
      "step": 25000
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.5215475135736841,
      "eval_loss": 1.5324053764343262,
      "eval_runtime": 9.718,
      "eval_samples_per_second": 62.05,
      "eval_steps_per_second": 1.955,
      "step": 25605
    },
    {
      "epoch": 9.13884007029877,
      "grad_norm": 7154.31298828125,
      "learning_rate": 5.357377049180328e-05,
      "loss": 0.3472,
      "step": 26000
    },
    {
      "epoch": 9.490333919156415,
      "grad_norm": 7199.95068359375,
      "learning_rate": 3.171584699453552e-05,
      "loss": 0.3449,
      "step": 27000
    },
    {
      "epoch": 9.84182776801406,
      "grad_norm": 6995.279296875,
      "learning_rate": 9.857923497267758e-06,
      "loss": 0.3437,
      "step": 28000
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.5249648690662798,
      "eval_loss": 1.523409128189087,
      "eval_runtime": 9.7518,
      "eval_samples_per_second": 61.835,
      "eval_steps_per_second": 1.948,
      "step": 28450
    },
    {
      "epoch": 10.0,
      "step": 28450,
      "total_flos": 2.3783323336704e+17,
      "train_loss": 0.40263440499196784,
      "train_runtime": 24635.8124,
      "train_samples_per_second": 36.947,
      "train_steps_per_second": 1.155
    }
  ],
  "logging_steps": 1000,
  "max_steps": 28450,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.3783323336704e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}