| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 11270, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.0909643173217773, |
| "epoch": 0.00044365572315882877, |
| "grad_norm": 56.12086486816406, |
| "learning_rate": 0.0, |
| "loss": 3.0658, |
| "mean_token_accuracy": 0.48502805829048157, |
| "num_tokens": 3210.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.3591915201806353, |
| "epoch": 0.22182786157941436, |
| "grad_norm": 19.471004486083984, |
| "learning_rate": 2.9905649405771675e-06, |
| "loss": 1.4761, |
| "mean_token_accuracy": 0.6628800271030418, |
| "num_tokens": 206054.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.22182786157941436, |
| "eval_entropy": 1.2849119341542534, |
| "eval_loss": 1.3277472257614136, |
| "eval_mean_token_accuracy": 0.6782622895342239, |
| "eval_num_tokens": 206054.0, |
| "eval_runtime": 33.7353, |
| "eval_samples_per_second": 33.407, |
| "eval_steps_per_second": 8.359, |
| "step": 500 |
| }, |
| { |
| "entropy": 1.3819819614887237, |
| "epoch": 0.44365572315882873, |
| "grad_norm": 15.931497573852539, |
| "learning_rate": 2.9523065141902646e-06, |
| "loss": 1.3619, |
| "mean_token_accuracy": 0.6711735002994538, |
| "num_tokens": 417292.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.44365572315882873, |
| "eval_entropy": 1.2668320029339892, |
| "eval_loss": 1.3029544353485107, |
| "eval_mean_token_accuracy": 0.6819594891358775, |
| "eval_num_tokens": 417292.0, |
| "eval_runtime": 33.7783, |
| "eval_samples_per_second": 33.365, |
| "eval_steps_per_second": 8.349, |
| "step": 1000 |
| }, |
| { |
| "entropy": 1.3488010201454164, |
| "epoch": 0.6654835847382431, |
| "grad_norm": 15.746204376220703, |
| "learning_rate": 2.885374907463648e-06, |
| "loss": 1.3193, |
| "mean_token_accuracy": 0.6784288173913956, |
| "num_tokens": 621936.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.6654835847382431, |
| "eval_entropy": 1.278814361661884, |
| "eval_loss": 1.2931554317474365, |
| "eval_mean_token_accuracy": 0.6839457212399084, |
| "eval_num_tokens": 621936.0, |
| "eval_runtime": 34.1849, |
| "eval_samples_per_second": 32.968, |
| "eval_steps_per_second": 8.249, |
| "step": 1500 |
| }, |
| { |
| "entropy": 1.3254250472784042, |
| "epoch": 0.8873114463176575, |
| "grad_norm": 14.376286506652832, |
| "learning_rate": 2.7910915646044115e-06, |
| "loss": 1.2938, |
| "mean_token_accuracy": 0.6840605319142342, |
| "num_tokens": 830692.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.8873114463176575, |
| "eval_entropy": 1.2728644338905388, |
| "eval_loss": 1.2851120233535767, |
| "eval_mean_token_accuracy": 0.6846338094126249, |
| "eval_num_tokens": 830692.0, |
| "eval_runtime": 34.3989, |
| "eval_samples_per_second": 32.763, |
| "eval_steps_per_second": 8.198, |
| "step": 2000 |
| }, |
| { |
| "entropy": 1.17150056040287, |
| "epoch": 1.109139307897072, |
| "grad_norm": 17.192914962768555, |
| "learning_rate": 2.671317940661071e-06, |
| "loss": 1.1239, |
| "mean_token_accuracy": 0.7167075086236, |
| "num_tokens": 1036856.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.109139307897072, |
| "eval_entropy": 0.9878316506849113, |
| "eval_loss": 1.3588604927062988, |
| "eval_mean_token_accuracy": 0.6802238186834552, |
| "eval_num_tokens": 1036856.0, |
| "eval_runtime": 34.9345, |
| "eval_samples_per_second": 32.26, |
| "eval_steps_per_second": 8.072, |
| "step": 2500 |
| }, |
| { |
| "entropy": 1.0019279054403305, |
| "epoch": 1.3309671694764862, |
| "grad_norm": 15.853100776672363, |
| "learning_rate": 2.5284187504412197e-06, |
| "loss": 0.9613, |
| "mean_token_accuracy": 0.751466910123825, |
| "num_tokens": 1236950.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.3309671694764862, |
| "eval_entropy": 1.062070348161332, |
| "eval_loss": 1.3535487651824951, |
| "eval_mean_token_accuracy": 0.6781793389971374, |
| "eval_num_tokens": 1236950.0, |
| "eval_runtime": 34.1483, |
| "eval_samples_per_second": 33.003, |
| "eval_steps_per_second": 8.258, |
| "step": 3000 |
| }, |
| { |
| "entropy": 1.0105634088516235, |
| "epoch": 1.5527950310559007, |
| "grad_norm": 18.761213302612305, |
| "learning_rate": 2.365215281470278e-06, |
| "loss": 0.974, |
| "mean_token_accuracy": 0.7476956839561463, |
| "num_tokens": 1445121.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.5527950310559007, |
| "eval_entropy": 1.0477840112033465, |
| "eval_loss": 1.3453229665756226, |
| "eval_mean_token_accuracy": 0.6804118481933648, |
| "eval_num_tokens": 1445121.0, |
| "eval_runtime": 33.969, |
| "eval_samples_per_second": 33.177, |
| "eval_steps_per_second": 8.302, |
| "step": 3500 |
| }, |
| { |
| "entropy": 1.030442102789879, |
| "epoch": 1.774622892635315, |
| "grad_norm": 18.44453239440918, |
| "learning_rate": 2.184929692743022e-06, |
| "loss": 0.9963, |
| "mean_token_accuracy": 0.743294857621193, |
| "num_tokens": 1656262.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.774622892635315, |
| "eval_entropy": 1.0542190546262349, |
| "eval_loss": 1.3501262664794922, |
| "eval_mean_token_accuracy": 0.6793523312460447, |
| "eval_num_tokens": 1656262.0, |
| "eval_runtime": 34.4167, |
| "eval_samples_per_second": 32.746, |
| "eval_steps_per_second": 8.194, |
| "step": 4000 |
| }, |
| { |
| "entropy": 1.0064435719251632, |
| "epoch": 1.9964507542147294, |
| "grad_norm": 11.64708137512207, |
| "learning_rate": 1.9911213989888633e-06, |
| "loss": 0.9697, |
| "mean_token_accuracy": 0.7490394617319107, |
| "num_tokens": 1867079.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.9964507542147294, |
| "eval_entropy": 1.0442877196251077, |
| "eval_loss": 1.3445523977279663, |
| "eval_mean_token_accuracy": 0.6808659062317922, |
| "eval_num_tokens": 1867079.0, |
| "eval_runtime": 34.1214, |
| "eval_samples_per_second": 33.029, |
| "eval_steps_per_second": 8.265, |
| "step": 4500 |
| }, |
| { |
| "entropy": 0.7014432374835015, |
| "epoch": 2.218278615794144, |
| "grad_norm": 16.21389389038086, |
| "learning_rate": 1.7876167964291556e-06, |
| "loss": 0.6614, |
| "mean_token_accuracy": 0.824757103562355, |
| "num_tokens": 2077059.0, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.218278615794144, |
| "eval_entropy": 0.8500108015029988, |
| "eval_loss": 1.556144118309021, |
| "eval_mean_token_accuracy": 0.6692640443220206, |
| "eval_num_tokens": 2077059.0, |
| "eval_runtime": 35.0607, |
| "eval_samples_per_second": 32.144, |
| "eval_steps_per_second": 8.043, |
| "step": 5000 |
| }, |
| { |
| "entropy": 0.6947842536568641, |
| "epoch": 2.440106477373558, |
| "grad_norm": 14.940324783325195, |
| "learning_rate": 1.5784337174650764e-06, |
| "loss": 0.6639, |
| "mean_token_accuracy": 0.8239517654180527, |
| "num_tokens": 2287905.0, |
| "step": 5500 |
| }, |
| { |
| "epoch": 2.440106477373558, |
| "eval_entropy": 0.8482754902636751, |
| "eval_loss": 1.5751625299453735, |
| "eval_mean_token_accuracy": 0.6686883025558282, |
| "eval_num_tokens": 2287905.0, |
| "eval_runtime": 34.8507, |
| "eval_samples_per_second": 32.338, |
| "eval_steps_per_second": 8.092, |
| "step": 5500 |
| }, |
| { |
| "entropy": 0.7052372665405273, |
| "epoch": 2.6619343389529724, |
| "grad_norm": 22.80525779724121, |
| "learning_rate": 1.3677021058024131e-06, |
| "loss": 0.6723, |
| "mean_token_accuracy": 0.8226301606893539, |
| "num_tokens": 2491365.0, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.6619343389529724, |
| "eval_entropy": 0.8458687763476203, |
| "eval_loss": 1.5676090717315674, |
| "eval_mean_token_accuracy": 0.6686736252290982, |
| "eval_num_tokens": 2491365.0, |
| "eval_runtime": 34.7445, |
| "eval_samples_per_second": 32.437, |
| "eval_steps_per_second": 8.116, |
| "step": 6000 |
| }, |
| { |
| "entropy": 0.6978846169710159, |
| "epoch": 2.883762200532387, |
| "grad_norm": 60.48308181762695, |
| "learning_rate": 1.1595824781402537e-06, |
| "loss": 0.6656, |
| "mean_token_accuracy": 0.8238938546180725, |
| "num_tokens": 2695935.0, |
| "step": 6500 |
| }, |
| { |
| "epoch": 2.883762200532387, |
| "eval_entropy": 0.8410944103771913, |
| "eval_loss": 1.569481611251831, |
| "eval_mean_token_accuracy": 0.6696258430362593, |
| "eval_num_tokens": 2695935.0, |
| "eval_runtime": 33.9566, |
| "eval_samples_per_second": 33.189, |
| "eval_steps_per_second": 8.305, |
| "step": 6500 |
| }, |
| { |
| "entropy": 0.5982560626268387, |
| "epoch": 3.1055900621118013, |
| "grad_norm": 20.41059112548828, |
| "learning_rate": 9.581837822509056e-07, |
| "loss": 0.5633, |
| "mean_token_accuracy": 0.8522541173696518, |
| "num_tokens": 2905707.0, |
| "step": 7000 |
| }, |
| { |
| "epoch": 3.1055900621118013, |
| "eval_entropy": 0.6810612559107179, |
| "eval_loss": 1.9356529712677002, |
| "eval_mean_token_accuracy": 0.6580131651027828, |
| "eval_num_tokens": 2905707.0, |
| "eval_runtime": 34.3377, |
| "eval_samples_per_second": 32.821, |
| "eval_steps_per_second": 8.213, |
| "step": 7000 |
| }, |
| { |
| "entropy": 0.4691380001306534, |
| "epoch": 3.3274179236912156, |
| "grad_norm": 12.475813865661621, |
| "learning_rate": 7.674822731955381e-07, |
| "loss": 0.4464, |
| "mean_token_accuracy": 0.8821721758842468, |
| "num_tokens": 3119436.0, |
| "step": 7500 |
| }, |
| { |
| "epoch": 3.3274179236912156, |
| "eval_entropy": 0.679166760639096, |
| "eval_loss": 1.958307147026062, |
| "eval_mean_token_accuracy": 0.6585667126990379, |
| "eval_num_tokens": 3119436.0, |
| "eval_runtime": 34.2034, |
| "eval_samples_per_second": 32.95, |
| "eval_steps_per_second": 8.245, |
| "step": 7500 |
| }, |
| { |
| "entropy": 0.4722338750064373, |
| "epoch": 3.54924578527063, |
| "grad_norm": 29.548173904418945, |
| "learning_rate": 5.912430093187734e-07, |
| "loss": 0.4503, |
| "mean_token_accuracy": 0.8828264862298966, |
| "num_tokens": 3323865.0, |
| "step": 8000 |
| }, |
| { |
| "epoch": 3.54924578527063, |
| "eval_entropy": 0.6674613842727445, |
| "eval_loss": 2.0193898677825928, |
| "eval_mean_token_accuracy": 0.6584047967871876, |
| "eval_num_tokens": 3323865.0, |
| "eval_runtime": 34.1338, |
| "eval_samples_per_second": 33.017, |
| "eval_steps_per_second": 8.262, |
| "step": 8000 |
| }, |
| { |
| "entropy": 0.4721588716506958, |
| "epoch": 3.771073646850044, |
| "grad_norm": 13.878427505493164, |
| "learning_rate": 4.329455179426337e-07, |
| "loss": 0.4503, |
| "mean_token_accuracy": 0.8825598682165146, |
| "num_tokens": 3528694.0, |
| "step": 8500 |
| }, |
| { |
| "epoch": 3.771073646850044, |
| "eval_entropy": 0.6828910686234211, |
| "eval_loss": 1.9711394309997559, |
| "eval_mean_token_accuracy": 0.6582514842351278, |
| "eval_num_tokens": 3528694.0, |
| "eval_runtime": 15.9432, |
| "eval_samples_per_second": 70.688, |
| "eval_steps_per_second": 17.688, |
| "step": 8500 |
| }, |
| { |
| "entropy": 0.4666064047217369, |
| "epoch": 3.992901508429459, |
| "grad_norm": 12.754748344421387, |
| "learning_rate": 2.957150983570442e-07, |
| "loss": 0.4459, |
| "mean_token_accuracy": 0.8827585883140564, |
| "num_tokens": 3734266.0, |
| "step": 9000 |
| }, |
| { |
| "epoch": 3.992901508429459, |
| "eval_entropy": 0.6604011273341821, |
| "eval_loss": 2.0170204639434814, |
| "eval_mean_token_accuracy": 0.6589750934999885, |
| "eval_num_tokens": 3734266.0, |
| "eval_runtime": 16.004, |
| "eval_samples_per_second": 70.42, |
| "eval_steps_per_second": 17.621, |
| "step": 9000 |
| }, |
| { |
| "entropy": 0.3668721870481968, |
| "epoch": 4.2147293700088735, |
| "grad_norm": 12.95799446105957, |
| "learning_rate": 1.8226111840579329e-07, |
| "loss": 0.3403, |
| "mean_token_accuracy": 0.9130678927898407, |
| "num_tokens": 3949370.0, |
| "step": 9500 |
| }, |
| { |
| "epoch": 4.2147293700088735, |
| "eval_entropy": 0.5831356482936981, |
| "eval_loss": 2.341752052307129, |
| "eval_mean_token_accuracy": 0.6525708041715284, |
| "eval_num_tokens": 3949370.0, |
| "eval_runtime": 16.1748, |
| "eval_samples_per_second": 69.676, |
| "eval_steps_per_second": 17.435, |
| "step": 9500 |
| }, |
| { |
| "entropy": 0.3616520670354366, |
| "epoch": 4.436557231588288, |
| "grad_norm": 10.559103965759277, |
| "learning_rate": 9.482352289090136e-08, |
| "loss": 0.3364, |
| "mean_token_accuracy": 0.913170881986618, |
| "num_tokens": 4152059.0, |
| "step": 10000 |
| }, |
| { |
| "epoch": 4.436557231588288, |
| "eval_entropy": 0.5844697422803716, |
| "eval_loss": 2.3465816974639893, |
| "eval_mean_token_accuracy": 0.6521602052111998, |
| "eval_num_tokens": 4152059.0, |
| "eval_runtime": 16.0213, |
| "eval_samples_per_second": 70.344, |
| "eval_steps_per_second": 17.602, |
| "step": 10000 |
| }, |
| { |
| "entropy": 0.354730488717556, |
| "epoch": 4.658385093167702, |
| "grad_norm": 11.545758247375488, |
| "learning_rate": 3.512860989075112e-08, |
| "loss": 0.3285, |
| "mean_token_accuracy": 0.9157207467556, |
| "num_tokens": 4357778.0, |
| "step": 10500 |
| }, |
| { |
| "epoch": 4.658385093167702, |
| "eval_entropy": 0.5713372949167346, |
| "eval_loss": 2.3891968727111816, |
| "eval_mean_token_accuracy": 0.652075339293649, |
| "eval_num_tokens": 4357778.0, |
| "eval_runtime": 15.9489, |
| "eval_samples_per_second": 70.663, |
| "eval_steps_per_second": 17.681, |
| "step": 10500 |
| }, |
| { |
| "entropy": 0.34955123990774156, |
| "epoch": 4.880212954747116, |
| "grad_norm": 11.0723237991333, |
| "learning_rate": 4.354948109051016e-09, |
| "loss": 0.3294, |
| "mean_token_accuracy": 0.9162505613565445, |
| "num_tokens": 4563930.0, |
| "step": 11000 |
| }, |
| { |
| "epoch": 4.880212954747116, |
| "eval_entropy": 0.5768938803292335, |
| "eval_loss": 2.3820862770080566, |
| "eval_mean_token_accuracy": 0.6512273330215022, |
| "eval_num_tokens": 4563930.0, |
| "eval_runtime": 16.1542, |
| "eval_samples_per_second": 69.765, |
| "eval_steps_per_second": 17.457, |
| "step": 11000 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 11270, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 20000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.097310295674384e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|