{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 28560,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.35014005602240894,
      "grad_norm": 13124.1923828125,
      "learning_rate": 0.0005993999999999999,
      "loss": 0.6555,
      "step": 1000
    },
    {
      "epoch": 0.7002801120448179,
      "grad_norm": 9263.9365234375,
      "learning_rate": 0.0005782510885341073,
      "loss": 0.485,
      "step": 2000
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.4941692129868049,
      "eval_loss": 1.695952296257019,
      "eval_runtime": 15.3882,
      "eval_samples_per_second": 44.45,
      "eval_steps_per_second": 1.43,
      "step": 2856
    },
    {
      "epoch": 1.050420168067227,
      "grad_norm": 8019.04931640625,
      "learning_rate": 0.0005564804063860668,
      "loss": 0.4275,
      "step": 3000
    },
    {
      "epoch": 1.4005602240896358,
      "grad_norm": 7421.74853515625,
      "learning_rate": 0.0005347097242380261,
      "loss": 0.4009,
      "step": 4000
    },
    {
      "epoch": 1.7507002801120448,
      "grad_norm": 6680.69140625,
      "learning_rate": 0.0005129390420899855,
      "loss": 0.3846,
      "step": 5000
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.5348302262505579,
      "eval_loss": 1.5430383682250977,
      "eval_runtime": 10.493,
      "eval_samples_per_second": 65.186,
      "eval_steps_per_second": 2.097,
      "step": 5712
    },
    {
      "epoch": 2.100840336134454,
      "grad_norm": 6414.6875,
      "learning_rate": 0.0004911683599419448,
      "loss": 0.3745,
      "step": 6000
    },
    {
      "epoch": 2.450980392156863,
      "grad_norm": 6100.486328125,
      "learning_rate": 0.00046939767779390415,
      "loss": 0.3654,
      "step": 7000
    },
    {
      "epoch": 2.8011204481792715,
      "grad_norm": 6541.07275390625,
      "learning_rate": 0.00044762699564586357,
      "loss": 0.3593,
      "step": 8000
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.5546629129902382,
      "eval_loss": 1.4664632081985474,
      "eval_runtime": 10.4709,
      "eval_samples_per_second": 65.324,
      "eval_steps_per_second": 2.101,
      "step": 8568
    },
    {
      "epoch": 3.1512605042016806,
      "grad_norm": 6968.591796875,
      "learning_rate": 0.00042585631349782287,
      "loss": 0.3535,
      "step": 9000
    },
    {
      "epoch": 3.5014005602240896,
      "grad_norm": 6259.02685546875,
      "learning_rate": 0.00040408563134978223,
      "loss": 0.3476,
      "step": 10000
    },
    {
      "epoch": 3.8515406162464987,
      "grad_norm": 5834.7548828125,
      "learning_rate": 0.0003823149492017416,
      "loss": 0.3438,
      "step": 11000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.5707419233014042,
      "eval_loss": 1.4174976348876953,
      "eval_runtime": 15.0661,
      "eval_samples_per_second": 45.4,
      "eval_steps_per_second": 1.46,
      "step": 11424
    },
    {
      "epoch": 4.201680672268908,
      "grad_norm": 5648.84814453125,
      "learning_rate": 0.000360544267053701,
      "loss": 0.3403,
      "step": 12000
    },
    {
      "epoch": 4.551820728291316,
      "grad_norm": 6200.31298828125,
      "learning_rate": 0.00033877358490566037,
      "loss": 0.3361,
      "step": 13000
    },
    {
      "epoch": 4.901960784313726,
      "grad_norm": 5265.44775390625,
      "learning_rate": 0.00031700290275761973,
      "loss": 0.3341,
      "step": 14000
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.5773938270333367,
      "eval_loss": 1.3927898406982422,
      "eval_runtime": 15.378,
      "eval_samples_per_second": 44.479,
      "eval_steps_per_second": 1.431,
      "step": 14280
    },
    {
      "epoch": 5.2521008403361344,
      "grad_norm": 5555.48486328125,
      "learning_rate": 0.0002952322206095791,
      "loss": 0.3302,
      "step": 15000
    },
    {
      "epoch": 5.602240896358543,
      "grad_norm": 5694.87890625,
      "learning_rate": 0.00027346153846153845,
      "loss": 0.3271,
      "step": 16000
    },
    {
      "epoch": 5.9523809523809526,
      "grad_norm": 5353.783203125,
      "learning_rate": 0.0002516908563134978,
      "loss": 0.3252,
      "step": 17000
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.5860970920451815,
      "eval_loss": 1.3625472784042358,
      "eval_runtime": 15.2445,
      "eval_samples_per_second": 44.869,
      "eval_steps_per_second": 1.443,
      "step": 17136
    },
    {
      "epoch": 6.302521008403361,
      "grad_norm": 5537.6591796875,
      "learning_rate": 0.00022992017416545717,
      "loss": 0.3209,
      "step": 18000
    },
    {
      "epoch": 6.652661064425771,
      "grad_norm": 5149.9501953125,
      "learning_rate": 0.00020814949201741653,
      "loss": 0.3192,
      "step": 19000
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.5942910930293771,
      "eval_loss": 1.334036111831665,
      "eval_runtime": 15.3529,
      "eval_samples_per_second": 44.552,
      "eval_steps_per_second": 1.433,
      "step": 19992
    },
    {
      "epoch": 7.002801120448179,
      "grad_norm": 6287.0166015625,
      "learning_rate": 0.0001863788098693759,
      "loss": 0.3174,
      "step": 20000
    },
    {
      "epoch": 7.352941176470588,
      "grad_norm": 5925.09228515625,
      "learning_rate": 0.00016460812772133528,
      "loss": 0.3133,
      "step": 21000
    },
    {
      "epoch": 7.703081232492997,
      "grad_norm": 5756.06787109375,
      "learning_rate": 0.0001428374455732946,
      "loss": 0.3112,
      "step": 22000
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.5998243325207997,
      "eval_loss": 1.3174924850463867,
      "eval_runtime": 15.3544,
      "eval_samples_per_second": 44.547,
      "eval_steps_per_second": 1.433,
      "step": 22848
    },
    {
      "epoch": 8.053221288515406,
      "grad_norm": 5942.7607421875,
      "learning_rate": 0.00012106676342525399,
      "loss": 0.3097,
      "step": 23000
    },
    {
      "epoch": 8.403361344537815,
      "grad_norm": 6329.1103515625,
      "learning_rate": 9.929608127721335e-05,
      "loss": 0.306,
      "step": 24000
    },
    {
      "epoch": 8.753501400560225,
      "grad_norm": 6101.3408203125,
      "learning_rate": 7.75253991291727e-05,
      "loss": 0.3038,
      "step": 25000
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.6055206509424246,
      "eval_loss": 1.3009241819381714,
      "eval_runtime": 10.6137,
      "eval_samples_per_second": 64.445,
      "eval_steps_per_second": 2.073,
      "step": 25704
    },
    {
      "epoch": 9.103641456582633,
      "grad_norm": 6770.501953125,
      "learning_rate": 5.5754716981132066e-05,
      "loss": 0.3025,
      "step": 26000
    },
    {
      "epoch": 9.453781512605042,
      "grad_norm": 6355.373046875,
      "learning_rate": 3.3984034833091434e-05,
      "loss": 0.2991,
      "step": 27000
    },
    {
      "epoch": 9.803921568627452,
      "grad_norm": 5946.85693359375,
      "learning_rate": 1.2213352685050797e-05,
      "loss": 0.298,
      "step": 28000
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.6093458532175187,
      "eval_loss": 1.288467288017273,
      "eval_runtime": 10.3972,
      "eval_samples_per_second": 65.787,
      "eval_steps_per_second": 2.116,
      "step": 28560
    },
    {
      "epoch": 10.0,
      "step": 28560,
      "total_flos": 2.387294650368e+17,
      "train_loss": 0.35219172576562363,
      "train_runtime": 24797.7753,
      "train_samples_per_second": 36.844,
      "train_steps_per_second": 1.152
    }
  ],
  "logging_steps": 1000,
  "max_steps": 28560,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.387294650368e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}