{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.962962962962963, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.7838703233003617, "epoch": 0.07407407407407407, "grad_norm": 2.2315239906311035, "learning_rate": 8.032786885245902e-05, "loss": 1.5419, "mean_token_accuracy": 0.675746806114912, "num_tokens": 406250.0, "step": 50 }, { "entropy": 0.42496320378035307, "epoch": 0.14814814814814814, "grad_norm": 0.3177862763404846, "learning_rate": 9.990765991730485e-05, "loss": 0.3491, "mean_token_accuracy": 0.9115325964987278, "num_tokens": 810885.0, "step": 100 }, { "entropy": 0.17912146519869565, "epoch": 0.2222222222222222, "grad_norm": 0.30210021138191223, "learning_rate": 9.950545603782162e-05, "loss": 0.1608, "mean_token_accuracy": 0.9592751894891262, "num_tokens": 1216948.0, "step": 150 }, { "entropy": 0.13433791074901819, "epoch": 0.2962962962962963, "grad_norm": 0.4639471173286438, "learning_rate": 9.878674879048427e-05, "loss": 0.1177, "mean_token_accuracy": 0.9702847249805927, "num_tokens": 1623039.0, "step": 200 }, { "entropy": 0.11072772483341396, "epoch": 0.37037037037037035, "grad_norm": 0.239689439535141, "learning_rate": 9.775613308830824e-05, "loss": 0.0996, "mean_token_accuracy": 0.9741994588077069, "num_tokens": 2029168.0, "step": 250 }, { "entropy": 0.1015299869235605, "epoch": 0.4444444444444444, "grad_norm": 0.14000816643238068, "learning_rate": 9.642019796948866e-05, "loss": 0.0946, "mean_token_accuracy": 0.9754497842490673, "num_tokens": 2433618.0, "step": 300 }, { "entropy": 0.0944658778142184, "epoch": 0.5185185185185185, "grad_norm": 0.2911929786205292, "learning_rate": 9.478748447168449e-05, "loss": 0.0888, "mean_token_accuracy": 0.9764833557605743, "num_tokens": 2837007.0, "step": 350 }, { "entropy": 0.09286680690012872, "epoch": 0.5925925925925926, "grad_norm": 0.1825980246067047, "learning_rate": 9.28684310265789e-05, "loss": 0.0884, "mean_token_accuracy": 0.9767837685346603, "num_tokens": 3241067.0, "step": 400 }, { "entropy": 0.09173870420083403, "epoch": 0.6666666666666666, "grad_norm": 0.16845248639583588, "learning_rate": 9.067530672382544e-05, "loss": 0.0871, "mean_token_accuracy": 0.9771137611567974, "num_tokens": 3644774.0, "step": 450 }, { "entropy": 0.08837487244978547, "epoch": 0.7407407407407407, "grad_norm": 0.15667857229709625, "learning_rate": 8.822213287104348e-05, "loss": 0.0846, "mean_token_accuracy": 0.9784404304623604, "num_tokens": 4050472.0, "step": 500 }, { "entropy": 0.0885874280706048, "epoch": 0.8148148148148148, "grad_norm": 0.10147374123334885, "learning_rate": 8.552459335135381e-05, "loss": 0.0848, "mean_token_accuracy": 0.977893346697092, "num_tokens": 4453374.0, "step": 550 }, { "entropy": 0.08721992008388042, "epoch": 0.8888888888888888, "grad_norm": 0.08062940090894699, "learning_rate": 8.259993435156559e-05, "loss": 0.0844, "mean_token_accuracy": 0.9785151568055153, "num_tokens": 4859149.0, "step": 600 }, { "entropy": 0.08216898602433503, "epoch": 0.9629629629629629, "grad_norm": 0.1335250586271286, "learning_rate": 7.946685410208296e-05, "loss": 0.0798, "mean_token_accuracy": 0.9796955060958862, "num_tokens": 5264437.0, "step": 650 }, { "entropy": 0.08460669645108282, "epoch": 1.037037037037037, "grad_norm": 0.0986652821302414, "learning_rate": 7.614538333345735e-05, "loss": 0.0822, "mean_token_accuracy": 0.9785672229528427, "num_tokens": 5669023.0, "step": 700 }, { "entropy": 0.0850414677709341, "epoch": 1.1111111111111112, "grad_norm": 0.07220367342233658, "learning_rate": 7.265675721386285e-05, "loss": 0.0824, "mean_token_accuracy": 0.9784497334063054, "num_tokens": 6073001.0, "step": 750 }, { "entropy": 0.08401141031645239, "epoch": 1.1851851851851851, "grad_norm": 0.1240311786532402, "learning_rate": 6.902327958623736e-05, "loss": 0.0826, "mean_token_accuracy": 0.9786691051721573, "num_tokens": 6478325.0, "step": 800 }, { "entropy": 0.08358457050286233, "epoch": 1.2592592592592593, "grad_norm": 0.10120349377393723, "learning_rate": 6.526818037306228e-05, "loss": 0.0811, "mean_token_accuracy": 0.9787746147811413, "num_tokens": 6882747.0, "step": 850 }, { "entropy": 0.0835177150182426, "epoch": 1.3333333333333333, "grad_norm": 0.10029594600200653, "learning_rate": 6.14154670604355e-05, "loss": 0.0818, "mean_token_accuracy": 0.9788197261095047, "num_tokens": 7287039.0, "step": 900 }, { "entropy": 0.08201941348612309, "epoch": 1.4074074074074074, "grad_norm": 0.09068141877651215, "learning_rate": 5.7489771210944564e-05, "loss": 0.0802, "mean_token_accuracy": 0.9791601756215096, "num_tokens": 7692281.0, "step": 950 }, { "entropy": 0.08500135038048029, "epoch": 1.4814814814814814, "grad_norm": 0.10811195522546768, "learning_rate": 5.351619098663021e-05, "loss": 0.0829, "mean_token_accuracy": 0.9783452861011028, "num_tokens": 8096455.0, "step": 1000 }, { "entropy": 0.08261076767928899, "epoch": 1.5555555555555556, "grad_norm": 0.07817448675632477, "learning_rate": 4.952013068883795e-05, "loss": 0.0807, "mean_token_accuracy": 0.9787566863000393, "num_tokens": 8501481.0, "step": 1050 }, { "entropy": 0.08088674335740506, "epoch": 1.6296296296296298, "grad_norm": 0.0741722360253334, "learning_rate": 4.5527138340828776e-05, "loss": 0.0794, "mean_token_accuracy": 0.9796176181733608, "num_tokens": 8907814.0, "step": 1100 }, { "entropy": 0.08092430792748928, "epoch": 1.7037037037037037, "grad_norm": 0.0863470658659935, "learning_rate": 4.156274235153189e-05, "loss": 0.0792, "mean_token_accuracy": 0.9792905601859093, "num_tokens": 9312142.0, "step": 1150 }, { "entropy": 0.07963837143965065, "epoch": 1.7777777777777777, "grad_norm": 0.10823621600866318, "learning_rate": 3.765228830469794e-05, "loss": 0.0791, "mean_token_accuracy": 0.9794147987663746, "num_tokens": 9716258.0, "step": 1200 }, { "entropy": 0.08162923349067569, "epoch": 1.8518518518518519, "grad_norm": 0.1495106816291809, "learning_rate": 3.3820776916908857e-05, "loss": 0.0801, "mean_token_accuracy": 0.9793653392791748, "num_tokens": 10121713.0, "step": 1250 }, { "entropy": 0.08032218031585217, "epoch": 1.925925925925926, "grad_norm": 0.08044654875993729, "learning_rate": 3.0092704200428058e-05, "loss": 0.079, "mean_token_accuracy": 0.9795299915969372, "num_tokens": 10526002.0, "step": 1300 }, { "entropy": 0.07872624884359539, "epoch": 2.0, "grad_norm": 0.07752422988414764, "learning_rate": 2.649190485277792e-05, "loss": 0.0775, "mean_token_accuracy": 0.980090646147728, "num_tokens": 10932428.0, "step": 1350 }, { "entropy": 0.08063295830972493, "epoch": 2.074074074074074, "grad_norm": 0.09133461862802505, "learning_rate": 2.3041399874302905e-05, "loss": 0.0793, "mean_token_accuracy": 0.9794050461053848, "num_tokens": 11337209.0, "step": 1400 }, { "entropy": 0.08033578357659281, "epoch": 2.148148148148148, "grad_norm": 0.06361774355173111, "learning_rate": 1.976324938794482e-05, "loss": 0.0792, "mean_token_accuracy": 0.9797105365991592, "num_tokens": 11741968.0, "step": 1450 }, { "entropy": 0.07973854598589242, "epoch": 2.2222222222222223, "grad_norm": 0.09148402512073517, "learning_rate": 1.667841160219835e-05, "loss": 0.0778, "mean_token_accuracy": 0.9796544459462165, "num_tokens": 12147108.0, "step": 1500 }, { "entropy": 0.07991634771227836, "epoch": 2.2962962962962963, "grad_norm": 0.058334823697805405, "learning_rate": 1.3806608818939203e-05, "loss": 0.0787, "mean_token_accuracy": 0.9793905445933342, "num_tokens": 12551885.0, "step": 1550 }, { "entropy": 0.07991615429520607, "epoch": 2.3703703703703702, "grad_norm": 0.07122901827096939, "learning_rate": 1.1166201342777438e-05, "loss": 0.0785, "mean_token_accuracy": 0.979671506434679, "num_tokens": 12956475.0, "step": 1600 }, { "entropy": 0.07969259418547153, "epoch": 2.4444444444444446, "grad_norm": 0.11193029582500458, "learning_rate": 8.774070098071668e-06, "loss": 0.0787, "mean_token_accuracy": 0.979515576660633, "num_tokens": 13362716.0, "step": 1650 }, { "entropy": 0.08067716302350164, "epoch": 2.5185185185185186, "grad_norm": 0.09012539684772491, "learning_rate": 6.645508704069003e-06, "loss": 0.0802, "mean_token_accuracy": 0.9791687172651291, "num_tokens": 13766986.0, "step": 1700 }, { "entropy": 0.07974634082056582, "epoch": 2.5925925925925926, "grad_norm": 0.09309827536344528, "learning_rate": 4.794125698167262e-06, "loss": 0.0787, "mean_token_accuracy": 0.9794514080882073, "num_tokens": 14171018.0, "step": 1750 }, { "entropy": 0.08121541824191808, "epoch": 2.6666666666666665, "grad_norm": 0.06920253485441208, "learning_rate": 3.231757532415458e-06, "loss": 0.0794, "mean_token_accuracy": 0.9792174778878688, "num_tokens": 14575902.0, "step": 1800 }, { "entropy": 0.07990395256318152, "epoch": 2.7407407407407405, "grad_norm": 0.05824149027466774, "learning_rate": 1.9683928994924385e-06, "loss": 0.0781, "mean_token_accuracy": 0.9798404219746589, "num_tokens": 14980838.0, "step": 1850 }, { "entropy": 0.08023261365480722, "epoch": 2.814814814814815, "grad_norm": 0.08101186901330948, "learning_rate": 1.0121088719706296e-06, "loss": 0.0795, "mean_token_accuracy": 0.9791903717815876, "num_tokens": 15385944.0, "step": 1900 }, { "entropy": 0.08040859408676625, "epoch": 2.888888888888889, "grad_norm": 0.07091067731380463, "learning_rate": 3.6901926314575894e-07, "loss": 0.0797, "mean_token_accuracy": 0.9792876356840133, "num_tokens": 15791071.0, "step": 1950 }, { "entropy": 0.0779489404708147, "epoch": 2.962962962962963, "grad_norm": 0.05987590551376343, "learning_rate": 4.323553957759629e-08, "loss": 0.0778, "mean_token_accuracy": 0.9800369493663311, "num_tokens": 16196052.0, "step": 2000 } ], "logging_steps": 50, "max_steps": 2025, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.765523181366723e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }