| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 30.979827089337174, | |
| "eval_steps": 1200, | |
| "global_step": 21500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.7204610951008645, | |
| "grad_norm": 0.19696219265460968, | |
| "learning_rate": 0.0004967269843558504, | |
| "loss": 0.115, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.440922190201729, | |
| "grad_norm": 0.18757623434066772, | |
| "learning_rate": 0.0004931223415759498, | |
| "loss": 0.0605, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.729106628242075, | |
| "eval_loss": 0.051301125437021255, | |
| "eval_runtime": 20.8394, | |
| "eval_samples_per_second": 111.039, | |
| "eval_steps_per_second": 0.096, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.161383285302594, | |
| "grad_norm": 0.23383216559886932, | |
| "learning_rate": 0.0004895176987960493, | |
| "loss": 0.06, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.881844380403458, | |
| "grad_norm": 0.8578475713729858, | |
| "learning_rate": 0.00048591305601614884, | |
| "loss": 0.045, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.4582132564841497, | |
| "eval_loss": 0.048763249069452286, | |
| "eval_runtime": 21.4687, | |
| "eval_samples_per_second": 107.785, | |
| "eval_steps_per_second": 0.093, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 3.602305475504323, | |
| "grad_norm": 0.5358479619026184, | |
| "learning_rate": 0.0004823084132362483, | |
| "loss": 0.034, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.322766570605188, | |
| "grad_norm": 0.3984196186065674, | |
| "learning_rate": 0.0004787037704563478, | |
| "loss": 0.0278, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 5.043227665706052, | |
| "grad_norm": 0.5611603856086731, | |
| "learning_rate": 0.00047509912767644725, | |
| "loss": 0.021, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 5.187319884726225, | |
| "eval_loss": 0.05793336406350136, | |
| "eval_runtime": 21.4122, | |
| "eval_samples_per_second": 108.069, | |
| "eval_steps_per_second": 0.093, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 5.763688760806916, | |
| "grad_norm": 0.33251306414604187, | |
| "learning_rate": 0.0004714944848965468, | |
| "loss": 0.019, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 6.484149855907781, | |
| "grad_norm": 0.5683927536010742, | |
| "learning_rate": 0.00046788984211664625, | |
| "loss": 0.0168, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 6.916426512968299, | |
| "eval_loss": 0.047444652765989304, | |
| "eval_runtime": 21.5287, | |
| "eval_samples_per_second": 107.484, | |
| "eval_steps_per_second": 0.093, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 7.204610951008646, | |
| "grad_norm": 1.6696492433547974, | |
| "learning_rate": 0.0004642851993367457, | |
| "loss": 0.0153, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 7.92507204610951, | |
| "grad_norm": 0.6783491373062134, | |
| "learning_rate": 0.0004606805565568452, | |
| "loss": 0.0116, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 8.645533141210375, | |
| "grad_norm": 0.4771524667739868, | |
| "learning_rate": 0.0004570759137769447, | |
| "loss": 0.0118, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 8.645533141210375, | |
| "eval_loss": 0.06448203325271606, | |
| "eval_runtime": 20.7245, | |
| "eval_samples_per_second": 111.655, | |
| "eval_steps_per_second": 0.097, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 9.36599423631124, | |
| "grad_norm": 0.45867717266082764, | |
| "learning_rate": 0.0004534712709970442, | |
| "loss": 0.0095, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 10.086455331412104, | |
| "grad_norm": 1.0143071413040161, | |
| "learning_rate": 0.0004498666282171437, | |
| "loss": 0.0081, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 10.37463976945245, | |
| "eval_loss": 0.059642400592565536, | |
| "eval_runtime": 20.5013, | |
| "eval_samples_per_second": 112.871, | |
| "eval_steps_per_second": 0.098, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 10.806916426512968, | |
| "grad_norm": 0.34545987844467163, | |
| "learning_rate": 0.0004462619854372432, | |
| "loss": 0.0077, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 11.527377521613833, | |
| "grad_norm": 0.6745367050170898, | |
| "learning_rate": 0.00044265734265734266, | |
| "loss": 0.0073, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 12.103746397694524, | |
| "eval_loss": 0.057360123842954636, | |
| "eval_runtime": 21.5407, | |
| "eval_samples_per_second": 107.425, | |
| "eval_steps_per_second": 0.093, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 12.247838616714697, | |
| "grad_norm": 0.3190229535102844, | |
| "learning_rate": 0.0004390526998774422, | |
| "loss": 0.0065, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 12.968299711815561, | |
| "grad_norm": 0.20763935148715973, | |
| "learning_rate": 0.00043544805709754166, | |
| "loss": 0.0064, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 13.688760806916427, | |
| "grad_norm": 0.11372426152229309, | |
| "learning_rate": 0.00043184341431764113, | |
| "loss": 0.0059, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 13.832853025936599, | |
| "eval_loss": 0.08313994109630585, | |
| "eval_runtime": 21.5595, | |
| "eval_samples_per_second": 107.331, | |
| "eval_steps_per_second": 0.093, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 14.409221902017292, | |
| "grad_norm": 0.6901423335075378, | |
| "learning_rate": 0.0004282387715377406, | |
| "loss": 0.0055, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 15.129682997118156, | |
| "grad_norm": 0.5882952213287354, | |
| "learning_rate": 0.0004246341287578401, | |
| "loss": 0.005, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 15.561959654178674, | |
| "eval_loss": 0.06821350008249283, | |
| "eval_runtime": 20.3166, | |
| "eval_samples_per_second": 113.897, | |
| "eval_steps_per_second": 0.098, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 15.85014409221902, | |
| "grad_norm": 0.4642440676689148, | |
| "learning_rate": 0.0004210294859779396, | |
| "loss": 0.0049, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 16.570605187319885, | |
| "grad_norm": 0.9032358527183533, | |
| "learning_rate": 0.00041742484319803907, | |
| "loss": 0.0048, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 17.29106628242075, | |
| "grad_norm": 0.5521640777587891, | |
| "learning_rate": 0.00041382020041813854, | |
| "loss": 0.0046, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 17.29106628242075, | |
| "eval_loss": 0.08423992991447449, | |
| "eval_runtime": 21.1812, | |
| "eval_samples_per_second": 109.248, | |
| "eval_steps_per_second": 0.094, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 18.011527377521613, | |
| "grad_norm": 0.7376463413238525, | |
| "learning_rate": 0.000410215557638238, | |
| "loss": 0.0044, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 18.73198847262248, | |
| "grad_norm": 1.1471983194351196, | |
| "learning_rate": 0.0004066109148583376, | |
| "loss": 0.0045, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 19.020172910662826, | |
| "eval_loss": 0.07880275696516037, | |
| "eval_runtime": 21.5701, | |
| "eval_samples_per_second": 107.278, | |
| "eval_steps_per_second": 0.093, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 19.45244956772334, | |
| "grad_norm": 0.053835347294807434, | |
| "learning_rate": 0.00040300627207843706, | |
| "loss": 0.0041, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 20.172910662824208, | |
| "grad_norm": 0.7777488231658936, | |
| "learning_rate": 0.00039940162929853653, | |
| "loss": 0.0042, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 20.7492795389049, | |
| "eval_loss": 0.062229253351688385, | |
| "eval_runtime": 20.4938, | |
| "eval_samples_per_second": 112.912, | |
| "eval_steps_per_second": 0.098, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 20.89337175792507, | |
| "grad_norm": 0.14320553839206696, | |
| "learning_rate": 0.000395796986518636, | |
| "loss": 0.004, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 21.613832853025936, | |
| "grad_norm": 0.3327866494655609, | |
| "learning_rate": 0.00039219234373873553, | |
| "loss": 0.004, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 22.334293948126803, | |
| "grad_norm": 0.29509493708610535, | |
| "learning_rate": 0.000388587700958835, | |
| "loss": 0.0037, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 22.478386167146976, | |
| "eval_loss": 0.07450389117002487, | |
| "eval_runtime": 21.8716, | |
| "eval_samples_per_second": 105.799, | |
| "eval_steps_per_second": 0.091, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 23.054755043227665, | |
| "grad_norm": 0.5017435550689697, | |
| "learning_rate": 0.00038498305817893447, | |
| "loss": 0.0038, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 23.77521613832853, | |
| "grad_norm": 0.05931377038359642, | |
| "learning_rate": 0.00038137841539903394, | |
| "loss": 0.0038, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 24.207492795389047, | |
| "eval_loss": 0.09549176692962646, | |
| "eval_runtime": 21.5513, | |
| "eval_samples_per_second": 107.372, | |
| "eval_steps_per_second": 0.093, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 24.495677233429394, | |
| "grad_norm": 0.13349242508411407, | |
| "learning_rate": 0.0003777737726191334, | |
| "loss": 0.0034, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 25.21613832853026, | |
| "grad_norm": 0.19320227205753326, | |
| "learning_rate": 0.00037416912983923294, | |
| "loss": 0.0034, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 25.936599423631122, | |
| "grad_norm": 0.24608492851257324, | |
| "learning_rate": 0.0003705644870593324, | |
| "loss": 0.0034, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 25.936599423631122, | |
| "eval_loss": 0.10036125034093857, | |
| "eval_runtime": 22.0387, | |
| "eval_samples_per_second": 104.997, | |
| "eval_steps_per_second": 0.091, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 26.65706051873199, | |
| "grad_norm": 0.11887585371732712, | |
| "learning_rate": 0.0003669598442794319, | |
| "loss": 0.0033, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 27.377521613832855, | |
| "grad_norm": 0.5103694796562195, | |
| "learning_rate": 0.0003633552014995314, | |
| "loss": 0.0031, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 27.665706051873197, | |
| "eval_loss": 0.0853080227971077, | |
| "eval_runtime": 21.6671, | |
| "eval_samples_per_second": 106.798, | |
| "eval_steps_per_second": 0.092, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 28.097982708933717, | |
| "grad_norm": 0.9122279286384583, | |
| "learning_rate": 0.00035975055871963093, | |
| "loss": 0.0034, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 28.818443804034583, | |
| "grad_norm": 0.028490234166383743, | |
| "learning_rate": 0.0003561459159397304, | |
| "loss": 0.0035, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 29.394812680115272, | |
| "eval_loss": 0.05787323787808418, | |
| "eval_runtime": 21.1854, | |
| "eval_samples_per_second": 109.226, | |
| "eval_steps_per_second": 0.094, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 29.538904899135446, | |
| "grad_norm": 0.32352131605148315, | |
| "learning_rate": 0.0003525412731598299, | |
| "loss": 0.0036, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 30.259365994236312, | |
| "grad_norm": 0.43146830797195435, | |
| "learning_rate": 0.00034893663037992935, | |
| "loss": 0.0032, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 30.979827089337174, | |
| "grad_norm": 0.22915582358837128, | |
| "learning_rate": 0.0003453319876000288, | |
| "loss": 0.0026, | |
| "step": 21500 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 69400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 100, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.477693523839612e+17, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |