| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 1000, |
| "global_step": 19122, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02614789247986612, |
| "grad_norm": 1.8049263954162598, |
| "learning_rate": 4.8692605376006696e-05, |
| "loss": 4.386, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05229578495973224, |
| "grad_norm": 2.1683125495910645, |
| "learning_rate": 4.738521075201339e-05, |
| "loss": 2.8993, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.05229578495973224, |
| "eval_accuracy": 0.4572555901412006, |
| "eval_loss": 2.412806749343872, |
| "eval_runtime": 55.3639, |
| "eval_samples_per_second": 111.137, |
| "eval_steps_per_second": 3.486, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.07844367743959837, |
| "grad_norm": 2.7267353534698486, |
| "learning_rate": 4.6077816128020084e-05, |
| "loss": 2.2597, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.10459156991946449, |
| "grad_norm": 1.5406957864761353, |
| "learning_rate": 4.477042150402678e-05, |
| "loss": 1.9582, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.10459156991946449, |
| "eval_accuracy": 0.573149274789702, |
| "eval_loss": 1.7776833772659302, |
| "eval_runtime": 53.5813, |
| "eval_samples_per_second": 114.835, |
| "eval_steps_per_second": 3.602, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.13073946239933062, |
| "grad_norm": 1.8701914548873901, |
| "learning_rate": 4.346302688003347e-05, |
| "loss": 1.8076, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.15688735487919675, |
| "grad_norm": 1.4391542673110962, |
| "learning_rate": 4.2155632256040165e-05, |
| "loss": 1.7164, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.15688735487919675, |
| "eval_accuracy": 0.6048887929323908, |
| "eval_loss": 1.5985183715820312, |
| "eval_runtime": 53.9723, |
| "eval_samples_per_second": 114.003, |
| "eval_steps_per_second": 3.576, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.18303524735906285, |
| "grad_norm": 1.450477957725525, |
| "learning_rate": 4.084823763204686e-05, |
| "loss": 1.6474, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.20918313983892897, |
| "grad_norm": 1.2564880847930908, |
| "learning_rate": 3.954084300805355e-05, |
| "loss": 1.5976, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.20918313983892897, |
| "eval_accuracy": 0.6224453369669708, |
| "eval_loss": 1.5014071464538574, |
| "eval_runtime": 53.8551, |
| "eval_samples_per_second": 114.251, |
| "eval_steps_per_second": 3.584, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.2353310323187951, |
| "grad_norm": 1.2992570400238037, |
| "learning_rate": 3.8233448384060246e-05, |
| "loss": 1.5568, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.26147892479866125, |
| "grad_norm": 1.233372688293457, |
| "learning_rate": 3.692605376006694e-05, |
| "loss": 1.5257, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.26147892479866125, |
| "eval_accuracy": 0.634163627117497, |
| "eval_loss": 1.439920425415039, |
| "eval_runtime": 52.748, |
| "eval_samples_per_second": 116.649, |
| "eval_steps_per_second": 3.659, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.28762681727852735, |
| "grad_norm": 1.2047041654586792, |
| "learning_rate": 3.5618659136073633e-05, |
| "loss": 1.4979, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.3137747097583935, |
| "grad_norm": 1.3004554510116577, |
| "learning_rate": 3.431126451208033e-05, |
| "loss": 1.4723, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.3137747097583935, |
| "eval_accuracy": 0.6424584626720485, |
| "eval_loss": 1.3954555988311768, |
| "eval_runtime": 52.3736, |
| "eval_samples_per_second": 117.483, |
| "eval_steps_per_second": 3.685, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.3399226022382596, |
| "grad_norm": 1.1953301429748535, |
| "learning_rate": 3.300386988808703e-05, |
| "loss": 1.453, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.3660704947181257, |
| "grad_norm": 1.2482521533966064, |
| "learning_rate": 3.1696475264093715e-05, |
| "loss": 1.4337, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3660704947181257, |
| "eval_accuracy": 0.6488618113631875, |
| "eval_loss": 1.3617639541625977, |
| "eval_runtime": 53.7239, |
| "eval_samples_per_second": 114.53, |
| "eval_steps_per_second": 3.592, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.39221838719799185, |
| "grad_norm": 1.1894769668579102, |
| "learning_rate": 3.038908064010041e-05, |
| "loss": 1.417, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.41836627967785794, |
| "grad_norm": 1.125042200088501, |
| "learning_rate": 2.9081686016107102e-05, |
| "loss": 1.4068, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.41836627967785794, |
| "eval_accuracy": 0.6549491708580115, |
| "eval_loss": 1.3317630290985107, |
| "eval_runtime": 53.4037, |
| "eval_samples_per_second": 115.217, |
| "eval_steps_per_second": 3.614, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.4445141721577241, |
| "grad_norm": 1.270693063735962, |
| "learning_rate": 2.77742913921138e-05, |
| "loss": 1.388, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.4706620646375902, |
| "grad_norm": 1.1014580726623535, |
| "learning_rate": 2.6466896768120493e-05, |
| "loss": 1.3766, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.4706620646375902, |
| "eval_accuracy": 0.6593696833705641, |
| "eval_loss": 1.3081881999969482, |
| "eval_runtime": 53.9417, |
| "eval_samples_per_second": 114.068, |
| "eval_steps_per_second": 3.578, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.49680995711745635, |
| "grad_norm": 1.1594932079315186, |
| "learning_rate": 2.5159502144127183e-05, |
| "loss": 1.3686, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.5229578495973225, |
| "grad_norm": 1.1389472484588623, |
| "learning_rate": 2.385210752013388e-05, |
| "loss": 1.3567, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5229578495973225, |
| "eval_accuracy": 0.6632432120706919, |
| "eval_loss": 1.2883645296096802, |
| "eval_runtime": 53.6247, |
| "eval_samples_per_second": 114.742, |
| "eval_steps_per_second": 3.599, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5491057420771885, |
| "grad_norm": 1.08192777633667, |
| "learning_rate": 2.254471289614057e-05, |
| "loss": 1.3455, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.5752536345570547, |
| "grad_norm": 1.1092888116836548, |
| "learning_rate": 2.1237318272147268e-05, |
| "loss": 1.3373, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5752536345570547, |
| "eval_accuracy": 0.6666822357673398, |
| "eval_loss": 1.2716636657714844, |
| "eval_runtime": 53.5985, |
| "eval_samples_per_second": 114.798, |
| "eval_steps_per_second": 3.601, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.6014015270369208, |
| "grad_norm": 1.1305238008499146, |
| "learning_rate": 1.9929923648153958e-05, |
| "loss": 1.3273, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.627549419516787, |
| "grad_norm": 1.1033620834350586, |
| "learning_rate": 1.8622529024160655e-05, |
| "loss": 1.3231, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.627549419516787, |
| "eval_accuracy": 0.6691524801180201, |
| "eval_loss": 1.2592648267745972, |
| "eval_runtime": 54.0954, |
| "eval_samples_per_second": 113.743, |
| "eval_steps_per_second": 3.568, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.653697311996653, |
| "grad_norm": 1.108879566192627, |
| "learning_rate": 1.7315134400167346e-05, |
| "loss": 1.3152, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.6798452044765192, |
| "grad_norm": 1.1043877601623535, |
| "learning_rate": 1.6007739776174043e-05, |
| "loss": 1.3101, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6798452044765192, |
| "eval_accuracy": 0.671657040037531, |
| "eval_loss": 1.2451061010360718, |
| "eval_runtime": 53.6863, |
| "eval_samples_per_second": 114.61, |
| "eval_steps_per_second": 3.595, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.7059930969563853, |
| "grad_norm": 1.0998560190200806, |
| "learning_rate": 1.4700345152180736e-05, |
| "loss": 1.3042, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.7321409894362514, |
| "grad_norm": 1.1624592542648315, |
| "learning_rate": 1.3392950528187429e-05, |
| "loss": 1.2962, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7321409894362514, |
| "eval_accuracy": 0.6740324399688046, |
| "eval_loss": 1.2344375848770142, |
| "eval_runtime": 53.7465, |
| "eval_samples_per_second": 114.482, |
| "eval_steps_per_second": 3.591, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7582888819161175, |
| "grad_norm": 1.0827239751815796, |
| "learning_rate": 1.2085555904194122e-05, |
| "loss": 1.2915, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.7844367743959837, |
| "grad_norm": 1.1423588991165161, |
| "learning_rate": 1.0778161280200816e-05, |
| "loss": 1.2842, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7844367743959837, |
| "eval_accuracy": 0.6758603794825307, |
| "eval_loss": 1.22589910030365, |
| "eval_runtime": 53.8871, |
| "eval_samples_per_second": 114.183, |
| "eval_steps_per_second": 3.582, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.8105846668758498, |
| "grad_norm": 1.1340547800064087, |
| "learning_rate": 9.47076665620751e-06, |
| "loss": 1.2796, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.8367325593557159, |
| "grad_norm": 1.1284784078598022, |
| "learning_rate": 8.163372032214205e-06, |
| "loss": 1.2752, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8367325593557159, |
| "eval_accuracy": 0.6775672612951045, |
| "eval_loss": 1.2177867889404297, |
| "eval_runtime": 53.7311, |
| "eval_samples_per_second": 114.515, |
| "eval_steps_per_second": 3.592, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.862880451835582, |
| "grad_norm": 1.138634443283081, |
| "learning_rate": 6.855977408220898e-06, |
| "loss": 1.2715, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.8890283443154482, |
| "grad_norm": 1.1163477897644043, |
| "learning_rate": 5.548582784227591e-06, |
| "loss": 1.2718, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8890283443154482, |
| "eval_accuracy": 0.6789395663115799, |
| "eval_loss": 1.2103002071380615, |
| "eval_runtime": 53.645, |
| "eval_samples_per_second": 114.699, |
| "eval_steps_per_second": 3.598, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.9151762367953143, |
| "grad_norm": 1.133549451828003, |
| "learning_rate": 4.241188160234285e-06, |
| "loss": 1.2698, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.9413241292751804, |
| "grad_norm": 1.1056196689605713, |
| "learning_rate": 2.933793536240979e-06, |
| "loss": 1.2628, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9413241292751804, |
| "eval_accuracy": 0.6802642108157907, |
| "eval_loss": 1.204982876777649, |
| "eval_runtime": 53.7116, |
| "eval_samples_per_second": 114.556, |
| "eval_steps_per_second": 3.593, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9674720217550465, |
| "grad_norm": 1.126265525817871, |
| "learning_rate": 1.6263989122476732e-06, |
| "loss": 1.2634, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.9936199142349127, |
| "grad_norm": 1.1292709112167358, |
| "learning_rate": 3.190042882543667e-07, |
| "loss": 1.2576, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.9936199142349127, |
| "eval_accuracy": 0.6809181130440626, |
| "eval_loss": 1.2015681266784668, |
| "eval_runtime": 53.7272, |
| "eval_samples_per_second": 114.523, |
| "eval_steps_per_second": 3.592, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 19122, |
| "total_flos": 3.19770233929728e+17, |
| "train_loss": 1.5458110461810084, |
| "train_runtime": 7443.7007, |
| "train_samples_per_second": 82.204, |
| "train_steps_per_second": 2.569 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 19122, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.19770233929728e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|