{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 30.979827089337174, "eval_steps": 1200, "global_step": 21500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.7204610951008645, "grad_norm": 0.19696219265460968, "learning_rate": 0.0004967269843558504, "loss": 0.115, "step": 500 }, { "epoch": 1.440922190201729, "grad_norm": 0.18757623434066772, "learning_rate": 0.0004931223415759498, "loss": 0.0605, "step": 1000 }, { "epoch": 1.729106628242075, "eval_loss": 0.051301125437021255, "eval_runtime": 20.8394, "eval_samples_per_second": 111.039, "eval_steps_per_second": 0.096, "step": 1200 }, { "epoch": 2.161383285302594, "grad_norm": 0.23383216559886932, "learning_rate": 0.0004895176987960493, "loss": 0.06, "step": 1500 }, { "epoch": 2.881844380403458, "grad_norm": 0.8578475713729858, "learning_rate": 0.00048591305601614884, "loss": 0.045, "step": 2000 }, { "epoch": 3.4582132564841497, "eval_loss": 0.048763249069452286, "eval_runtime": 21.4687, "eval_samples_per_second": 107.785, "eval_steps_per_second": 0.093, "step": 2400 }, { "epoch": 3.602305475504323, "grad_norm": 0.5358479619026184, "learning_rate": 0.0004823084132362483, "loss": 0.034, "step": 2500 }, { "epoch": 4.322766570605188, "grad_norm": 0.3984196186065674, "learning_rate": 0.0004787037704563478, "loss": 0.0278, "step": 3000 }, { "epoch": 5.043227665706052, "grad_norm": 0.5611603856086731, "learning_rate": 0.00047509912767644725, "loss": 0.021, "step": 3500 }, { "epoch": 5.187319884726225, "eval_loss": 0.05793336406350136, "eval_runtime": 21.4122, "eval_samples_per_second": 108.069, "eval_steps_per_second": 0.093, "step": 3600 }, { "epoch": 5.763688760806916, "grad_norm": 0.33251306414604187, "learning_rate": 0.0004714944848965468, "loss": 0.019, "step": 4000 }, { "epoch": 6.484149855907781, "grad_norm": 0.5683927536010742, "learning_rate": 0.00046788984211664625, "loss": 0.0168, "step": 4500 }, { "epoch": 6.916426512968299, "eval_loss": 0.047444652765989304, "eval_runtime": 21.5287, "eval_samples_per_second": 107.484, "eval_steps_per_second": 0.093, "step": 4800 }, { "epoch": 7.204610951008646, "grad_norm": 1.6696492433547974, "learning_rate": 0.0004642851993367457, "loss": 0.0153, "step": 5000 }, { "epoch": 7.92507204610951, "grad_norm": 0.6783491373062134, "learning_rate": 0.0004606805565568452, "loss": 0.0116, "step": 5500 }, { "epoch": 8.645533141210375, "grad_norm": 0.4771524667739868, "learning_rate": 0.0004570759137769447, "loss": 0.0118, "step": 6000 }, { "epoch": 8.645533141210375, "eval_loss": 0.06448203325271606, "eval_runtime": 20.7245, "eval_samples_per_second": 111.655, "eval_steps_per_second": 0.097, "step": 6000 }, { "epoch": 9.36599423631124, "grad_norm": 0.45867717266082764, "learning_rate": 0.0004534712709970442, "loss": 0.0095, "step": 6500 }, { "epoch": 10.086455331412104, "grad_norm": 1.0143071413040161, "learning_rate": 0.0004498666282171437, "loss": 0.0081, "step": 7000 }, { "epoch": 10.37463976945245, "eval_loss": 0.059642400592565536, "eval_runtime": 20.5013, "eval_samples_per_second": 112.871, "eval_steps_per_second": 0.098, "step": 7200 }, { "epoch": 10.806916426512968, "grad_norm": 0.34545987844467163, "learning_rate": 0.0004462619854372432, "loss": 0.0077, "step": 7500 }, { "epoch": 11.527377521613833, "grad_norm": 0.6745367050170898, "learning_rate": 0.00044265734265734266, "loss": 0.0073, "step": 8000 }, { "epoch": 12.103746397694524, "eval_loss": 0.057360123842954636, "eval_runtime": 21.5407, "eval_samples_per_second": 107.425, "eval_steps_per_second": 0.093, "step": 8400 }, { "epoch": 12.247838616714697, "grad_norm": 0.3190229535102844, "learning_rate": 0.0004390526998774422, "loss": 0.0065, "step": 8500 }, { "epoch": 12.968299711815561, "grad_norm": 0.20763935148715973, "learning_rate": 0.00043544805709754166, "loss": 0.0064, "step": 9000 }, { "epoch": 13.688760806916427, "grad_norm": 0.11372426152229309, "learning_rate": 0.00043184341431764113, "loss": 0.0059, "step": 9500 }, { "epoch": 13.832853025936599, "eval_loss": 0.08313994109630585, "eval_runtime": 21.5595, "eval_samples_per_second": 107.331, "eval_steps_per_second": 0.093, "step": 9600 }, { "epoch": 14.409221902017292, "grad_norm": 0.6901423335075378, "learning_rate": 0.0004282387715377406, "loss": 0.0055, "step": 10000 }, { "epoch": 15.129682997118156, "grad_norm": 0.5882952213287354, "learning_rate": 0.0004246341287578401, "loss": 0.005, "step": 10500 }, { "epoch": 15.561959654178674, "eval_loss": 0.06821350008249283, "eval_runtime": 20.3166, "eval_samples_per_second": 113.897, "eval_steps_per_second": 0.098, "step": 10800 }, { "epoch": 15.85014409221902, "grad_norm": 0.4642440676689148, "learning_rate": 0.0004210294859779396, "loss": 0.0049, "step": 11000 }, { "epoch": 16.570605187319885, "grad_norm": 0.9032358527183533, "learning_rate": 0.00041742484319803907, "loss": 0.0048, "step": 11500 }, { "epoch": 17.29106628242075, "grad_norm": 0.5521640777587891, "learning_rate": 0.00041382020041813854, "loss": 0.0046, "step": 12000 }, { "epoch": 17.29106628242075, "eval_loss": 0.08423992991447449, "eval_runtime": 21.1812, "eval_samples_per_second": 109.248, "eval_steps_per_second": 0.094, "step": 12000 }, { "epoch": 18.011527377521613, "grad_norm": 0.7376463413238525, "learning_rate": 0.000410215557638238, "loss": 0.0044, "step": 12500 }, { "epoch": 18.73198847262248, "grad_norm": 1.1471983194351196, "learning_rate": 0.0004066109148583376, "loss": 0.0045, "step": 13000 }, { "epoch": 19.020172910662826, "eval_loss": 0.07880275696516037, "eval_runtime": 21.5701, "eval_samples_per_second": 107.278, "eval_steps_per_second": 0.093, "step": 13200 }, { "epoch": 19.45244956772334, "grad_norm": 0.053835347294807434, "learning_rate": 0.00040300627207843706, "loss": 0.0041, "step": 13500 }, { "epoch": 20.172910662824208, "grad_norm": 0.7777488231658936, "learning_rate": 0.00039940162929853653, "loss": 0.0042, "step": 14000 }, { "epoch": 20.7492795389049, "eval_loss": 0.062229253351688385, "eval_runtime": 20.4938, "eval_samples_per_second": 112.912, "eval_steps_per_second": 0.098, "step": 14400 }, { "epoch": 20.89337175792507, "grad_norm": 0.14320553839206696, "learning_rate": 0.000395796986518636, "loss": 0.004, "step": 14500 }, { "epoch": 21.613832853025936, "grad_norm": 0.3327866494655609, "learning_rate": 0.00039219234373873553, "loss": 0.004, "step": 15000 }, { "epoch": 22.334293948126803, "grad_norm": 0.29509493708610535, "learning_rate": 0.000388587700958835, "loss": 0.0037, "step": 15500 }, { "epoch": 22.478386167146976, "eval_loss": 0.07450389117002487, "eval_runtime": 21.8716, "eval_samples_per_second": 105.799, "eval_steps_per_second": 0.091, "step": 15600 }, { "epoch": 23.054755043227665, "grad_norm": 0.5017435550689697, "learning_rate": 0.00038498305817893447, "loss": 0.0038, "step": 16000 }, { "epoch": 23.77521613832853, "grad_norm": 0.05931377038359642, "learning_rate": 0.00038137841539903394, "loss": 0.0038, "step": 16500 }, { "epoch": 24.207492795389047, "eval_loss": 0.09549176692962646, "eval_runtime": 21.5513, "eval_samples_per_second": 107.372, "eval_steps_per_second": 0.093, "step": 16800 }, { "epoch": 24.495677233429394, "grad_norm": 0.13349242508411407, "learning_rate": 0.0003777737726191334, "loss": 0.0034, "step": 17000 }, { "epoch": 25.21613832853026, "grad_norm": 0.19320227205753326, "learning_rate": 0.00037416912983923294, "loss": 0.0034, "step": 17500 }, { "epoch": 25.936599423631122, "grad_norm": 0.24608492851257324, "learning_rate": 0.0003705644870593324, "loss": 0.0034, "step": 18000 }, { "epoch": 25.936599423631122, "eval_loss": 0.10036125034093857, "eval_runtime": 22.0387, "eval_samples_per_second": 104.997, "eval_steps_per_second": 0.091, "step": 18000 }, { "epoch": 26.65706051873199, "grad_norm": 0.11887585371732712, "learning_rate": 0.0003669598442794319, "loss": 0.0033, "step": 18500 }, { "epoch": 27.377521613832855, "grad_norm": 0.5103694796562195, "learning_rate": 0.0003633552014995314, "loss": 0.0031, "step": 19000 }, { "epoch": 27.665706051873197, "eval_loss": 0.0853080227971077, "eval_runtime": 21.6671, "eval_samples_per_second": 106.798, "eval_steps_per_second": 0.092, "step": 19200 }, { "epoch": 28.097982708933717, "grad_norm": 0.9122279286384583, "learning_rate": 0.00035975055871963093, "loss": 0.0034, "step": 19500 }, { "epoch": 28.818443804034583, "grad_norm": 0.028490234166383743, "learning_rate": 0.0003561459159397304, "loss": 0.0035, "step": 20000 }, { "epoch": 29.394812680115272, "eval_loss": 0.05787323787808418, "eval_runtime": 21.1854, "eval_samples_per_second": 109.226, "eval_steps_per_second": 0.094, "step": 20400 }, { "epoch": 29.538904899135446, "grad_norm": 0.32352131605148315, "learning_rate": 0.0003525412731598299, "loss": 0.0036, "step": 20500 }, { "epoch": 30.259365994236312, "grad_norm": 0.43146830797195435, "learning_rate": 0.00034893663037992935, "loss": 0.0032, "step": 21000 }, { "epoch": 30.979827089337174, "grad_norm": 0.22915582358837128, "learning_rate": 0.0003453319876000288, "loss": 0.0026, "step": 21500 } ], "logging_steps": 500, "max_steps": 69400, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.477693523839612e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }