| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 1000, |
| "global_step": 19760, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.025303643724696356, |
| "grad_norm": 1.5727399587631226, |
| "learning_rate": 4.8734817813765186e-05, |
| "loss": 4.2638, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05060728744939271, |
| "grad_norm": 1.6128734350204468, |
| "learning_rate": 4.746963562753037e-05, |
| "loss": 2.8323, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.05060728744939271, |
| "eval_accuracy": 0.4558127658658102, |
| "eval_loss": 2.4024298191070557, |
| "eval_runtime": 54.1352, |
| "eval_samples_per_second": 117.391, |
| "eval_steps_per_second": 3.676, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.07591093117408906, |
| "grad_norm": 2.72668719291687, |
| "learning_rate": 4.6204453441295545e-05, |
| "loss": 2.2432, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.10121457489878542, |
| "grad_norm": 1.4283599853515625, |
| "learning_rate": 4.4939271255060735e-05, |
| "loss": 1.9289, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.10121457489878542, |
| "eval_accuracy": 0.5789619245166059, |
| "eval_loss": 1.7509254217147827, |
| "eval_runtime": 54.2605, |
| "eval_samples_per_second": 117.12, |
| "eval_steps_per_second": 3.667, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.12651821862348178, |
| "grad_norm": 1.8954753875732422, |
| "learning_rate": 4.367408906882591e-05, |
| "loss": 1.7775, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.15182186234817813, |
| "grad_norm": 1.2609208822250366, |
| "learning_rate": 4.2408906882591095e-05, |
| "loss": 1.6823, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.15182186234817813, |
| "eval_accuracy": 0.6104347451572141, |
| "eval_loss": 1.572474718093872, |
| "eval_runtime": 54.4114, |
| "eval_samples_per_second": 116.795, |
| "eval_steps_per_second": 3.657, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.1771255060728745, |
| "grad_norm": 1.3487361669540405, |
| "learning_rate": 4.114372469635628e-05, |
| "loss": 1.6154, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.20242914979757085, |
| "grad_norm": 1.4665900468826294, |
| "learning_rate": 3.9878542510121455e-05, |
| "loss": 1.5643, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.20242914979757085, |
| "eval_accuracy": 0.6285182732633305, |
| "eval_loss": 1.4746477603912354, |
| "eval_runtime": 54.1648, |
| "eval_samples_per_second": 117.327, |
| "eval_steps_per_second": 3.674, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.22773279352226722, |
| "grad_norm": 1.2297464609146118, |
| "learning_rate": 3.8613360323886645e-05, |
| "loss": 1.5269, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.25303643724696356, |
| "grad_norm": 1.3741837739944458, |
| "learning_rate": 3.734817813765182e-05, |
| "loss": 1.4919, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.25303643724696356, |
| "eval_accuracy": 0.6401629554087613, |
| "eval_loss": 1.4118025302886963, |
| "eval_runtime": 54.4887, |
| "eval_samples_per_second": 116.63, |
| "eval_steps_per_second": 3.652, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2783400809716599, |
| "grad_norm": 1.1640024185180664, |
| "learning_rate": 3.6082995951417005e-05, |
| "loss": 1.4683, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.30364372469635625, |
| "grad_norm": 1.2266194820404053, |
| "learning_rate": 3.481781376518219e-05, |
| "loss": 1.443, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.30364372469635625, |
| "eval_accuracy": 0.6483573328780303, |
| "eval_loss": 1.369125247001648, |
| "eval_runtime": 54.587, |
| "eval_samples_per_second": 116.42, |
| "eval_steps_per_second": 3.646, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.32894736842105265, |
| "grad_norm": 1.2047063112258911, |
| "learning_rate": 3.355263157894737e-05, |
| "loss": 1.4232, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.354251012145749, |
| "grad_norm": 1.114903211593628, |
| "learning_rate": 3.2287449392712554e-05, |
| "loss": 1.4054, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.354251012145749, |
| "eval_accuracy": 0.6552170264867913, |
| "eval_loss": 1.3325867652893066, |
| "eval_runtime": 54.126, |
| "eval_samples_per_second": 117.411, |
| "eval_steps_per_second": 3.677, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.37955465587044535, |
| "grad_norm": 1.2796965837478638, |
| "learning_rate": 3.102226720647773e-05, |
| "loss": 1.387, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.4048582995951417, |
| "grad_norm": 1.2552096843719482, |
| "learning_rate": 2.9757085020242914e-05, |
| "loss": 1.3716, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.4048582995951417, |
| "eval_accuracy": 0.6604371677999251, |
| "eval_loss": 1.3062158823013306, |
| "eval_runtime": 54.4456, |
| "eval_samples_per_second": 116.722, |
| "eval_steps_per_second": 3.655, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.43016194331983804, |
| "grad_norm": 1.0873684883117676, |
| "learning_rate": 2.84919028340081e-05, |
| "loss": 1.3627, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.45546558704453444, |
| "grad_norm": 1.127432107925415, |
| "learning_rate": 2.722672064777328e-05, |
| "loss": 1.3477, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.45546558704453444, |
| "eval_accuracy": 0.6648636052153729, |
| "eval_loss": 1.2838101387023926, |
| "eval_runtime": 54.7412, |
| "eval_samples_per_second": 116.092, |
| "eval_steps_per_second": 3.635, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.4807692307692308, |
| "grad_norm": 1.1843162775039673, |
| "learning_rate": 2.5961538461538464e-05, |
| "loss": 1.3365, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.5060728744939271, |
| "grad_norm": 1.1098982095718384, |
| "learning_rate": 2.4696356275303644e-05, |
| "loss": 1.3298, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5060728744939271, |
| "eval_accuracy": 0.6686581558843684, |
| "eval_loss": 1.2636867761611938, |
| "eval_runtime": 53.8567, |
| "eval_samples_per_second": 117.998, |
| "eval_steps_per_second": 3.695, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5313765182186235, |
| "grad_norm": 1.098755121231079, |
| "learning_rate": 2.3431174089068827e-05, |
| "loss": 1.321, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.5566801619433198, |
| "grad_norm": 1.1298458576202393, |
| "learning_rate": 2.216599190283401e-05, |
| "loss": 1.3088, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5566801619433198, |
| "eval_accuracy": 0.6723328818757869, |
| "eval_loss": 1.2462453842163086, |
| "eval_runtime": 53.514, |
| "eval_samples_per_second": 118.754, |
| "eval_steps_per_second": 3.719, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5819838056680162, |
| "grad_norm": 1.0954270362854004, |
| "learning_rate": 2.090080971659919e-05, |
| "loss": 1.3024, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.6072874493927125, |
| "grad_norm": 1.0527666807174683, |
| "learning_rate": 1.9635627530364373e-05, |
| "loss": 1.2944, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6072874493927125, |
| "eval_accuracy": 0.6748384328039667, |
| "eval_loss": 1.2335460186004639, |
| "eval_runtime": 53.6987, |
| "eval_samples_per_second": 118.345, |
| "eval_steps_per_second": 3.706, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6325910931174089, |
| "grad_norm": 1.1186504364013672, |
| "learning_rate": 1.8370445344129557e-05, |
| "loss": 1.2908, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.6578947368421053, |
| "grad_norm": 1.1064029932022095, |
| "learning_rate": 1.7105263157894737e-05, |
| "loss": 1.278, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6578947368421053, |
| "eval_accuracy": 0.6772795337451056, |
| "eval_loss": 1.220670223236084, |
| "eval_runtime": 53.6254, |
| "eval_samples_per_second": 118.507, |
| "eval_steps_per_second": 3.711, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6831983805668016, |
| "grad_norm": 1.0741119384765625, |
| "learning_rate": 1.584008097165992e-05, |
| "loss": 1.2757, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.708502024291498, |
| "grad_norm": 1.0686042308807373, |
| "learning_rate": 1.4574898785425101e-05, |
| "loss": 1.2658, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.708502024291498, |
| "eval_accuracy": 0.6795963492697078, |
| "eval_loss": 1.209822177886963, |
| "eval_runtime": 53.4731, |
| "eval_samples_per_second": 118.845, |
| "eval_steps_per_second": 3.721, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7338056680161943, |
| "grad_norm": 1.1075104475021362, |
| "learning_rate": 1.3309716599190283e-05, |
| "loss": 1.2628, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.7591093117408907, |
| "grad_norm": 1.0740045309066772, |
| "learning_rate": 1.2044534412955466e-05, |
| "loss": 1.2567, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7591093117408907, |
| "eval_accuracy": 0.6815859926643917, |
| "eval_loss": 1.2005436420440674, |
| "eval_runtime": 54.2582, |
| "eval_samples_per_second": 117.125, |
| "eval_steps_per_second": 3.668, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7844129554655871, |
| "grad_norm": 1.0769619941711426, |
| "learning_rate": 1.077935222672065e-05, |
| "loss": 1.2551, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.8097165991902834, |
| "grad_norm": 1.1006090641021729, |
| "learning_rate": 9.51417004048583e-06, |
| "loss": 1.2506, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8097165991902834, |
| "eval_accuracy": 0.6832303133361483, |
| "eval_loss": 1.1920998096466064, |
| "eval_runtime": 53.7111, |
| "eval_samples_per_second": 118.318, |
| "eval_steps_per_second": 3.705, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8350202429149798, |
| "grad_norm": 1.109384536743164, |
| "learning_rate": 8.248987854251013e-06, |
| "loss": 1.2438, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.8603238866396761, |
| "grad_norm": 1.1033892631530762, |
| "learning_rate": 6.983805668016195e-06, |
| "loss": 1.241, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8603238866396761, |
| "eval_accuracy": 0.6846748236662198, |
| "eval_loss": 1.1846812963485718, |
| "eval_runtime": 53.9126, |
| "eval_samples_per_second": 117.876, |
| "eval_steps_per_second": 3.691, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8856275303643725, |
| "grad_norm": 1.1079332828521729, |
| "learning_rate": 5.718623481781377e-06, |
| "loss": 1.2366, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.9109311740890689, |
| "grad_norm": 1.0562292337417603, |
| "learning_rate": 4.453441295546559e-06, |
| "loss": 1.2338, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9109311740890689, |
| "eval_accuracy": 0.6860184290046476, |
| "eval_loss": 1.178944706916809, |
| "eval_runtime": 53.6984, |
| "eval_samples_per_second": 118.346, |
| "eval_steps_per_second": 3.706, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9362348178137652, |
| "grad_norm": 1.1196858882904053, |
| "learning_rate": 3.1882591093117414e-06, |
| "loss": 1.2353, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.9615384615384616, |
| "grad_norm": 1.0644774436950684, |
| "learning_rate": 1.9230769230769234e-06, |
| "loss": 1.2306, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.9615384615384616, |
| "eval_accuracy": 0.6870910059966175, |
| "eval_loss": 1.1743125915527344, |
| "eval_runtime": 54.6413, |
| "eval_samples_per_second": 116.304, |
| "eval_steps_per_second": 3.642, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.9868421052631579, |
| "grad_norm": 1.0787763595581055, |
| "learning_rate": 6.578947368421053e-07, |
| "loss": 1.2287, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 19760, |
| "total_flos": 3.30439310180352e+17, |
| "train_loss": 1.504390393770658, |
| "train_runtime": 7655.2302, |
| "train_samples_per_second": 82.599, |
| "train_steps_per_second": 2.581 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 19760, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.30439310180352e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|