{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.962962962962963,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.7838703233003617,
      "epoch": 0.07407407407407407,
      "grad_norm": 2.2315239906311035,
      "learning_rate": 8.032786885245902e-05,
      "loss": 1.5419,
      "mean_token_accuracy": 0.675746806114912,
      "num_tokens": 406250.0,
      "step": 50
    },
    {
      "entropy": 0.42496320378035307,
      "epoch": 0.14814814814814814,
      "grad_norm": 0.3177862763404846,
      "learning_rate": 9.990765991730485e-05,
      "loss": 0.3491,
      "mean_token_accuracy": 0.9115325964987278,
      "num_tokens": 810885.0,
      "step": 100
    },
    {
      "entropy": 0.17912146519869565,
      "epoch": 0.2222222222222222,
      "grad_norm": 0.30210021138191223,
      "learning_rate": 9.950545603782162e-05,
      "loss": 0.1608,
      "mean_token_accuracy": 0.9592751894891262,
      "num_tokens": 1216948.0,
      "step": 150
    },
    {
      "entropy": 0.13433791074901819,
      "epoch": 0.2962962962962963,
      "grad_norm": 0.4639471173286438,
      "learning_rate": 9.878674879048427e-05,
      "loss": 0.1177,
      "mean_token_accuracy": 0.9702847249805927,
      "num_tokens": 1623039.0,
      "step": 200
    },
    {
      "entropy": 0.11072772483341396,
      "epoch": 0.37037037037037035,
      "grad_norm": 0.239689439535141,
      "learning_rate": 9.775613308830824e-05,
      "loss": 0.0996,
      "mean_token_accuracy": 0.9741994588077069,
      "num_tokens": 2029168.0,
      "step": 250
    },
    {
      "entropy": 0.1015299869235605,
      "epoch": 0.4444444444444444,
      "grad_norm": 0.14000816643238068,
      "learning_rate": 9.642019796948866e-05,
      "loss": 0.0946,
      "mean_token_accuracy": 0.9754497842490673,
      "num_tokens": 2433618.0,
      "step": 300
    },
    {
      "entropy": 0.0944658778142184,
      "epoch": 0.5185185185185185,
      "grad_norm": 0.2911929786205292,
      "learning_rate": 9.478748447168449e-05,
      "loss": 0.0888,
      "mean_token_accuracy": 0.9764833557605743,
      "num_tokens": 2837007.0,
      "step": 350
    },
    {
      "entropy": 0.09286680690012872,
      "epoch": 0.5925925925925926,
      "grad_norm": 0.1825980246067047,
      "learning_rate": 9.28684310265789e-05,
      "loss": 0.0884,
      "mean_token_accuracy": 0.9767837685346603,
      "num_tokens": 3241067.0,
      "step": 400
    },
    {
      "entropy": 0.09173870420083403,
      "epoch": 0.6666666666666666,
      "grad_norm": 0.16845248639583588,
      "learning_rate": 9.067530672382544e-05,
      "loss": 0.0871,
      "mean_token_accuracy": 0.9771137611567974,
      "num_tokens": 3644774.0,
      "step": 450
    },
    {
      "entropy": 0.08837487244978547,
      "epoch": 0.7407407407407407,
      "grad_norm": 0.15667857229709625,
      "learning_rate": 8.822213287104348e-05,
      "loss": 0.0846,
      "mean_token_accuracy": 0.9784404304623604,
      "num_tokens": 4050472.0,
      "step": 500
    },
    {
      "entropy": 0.0885874280706048,
      "epoch": 0.8148148148148148,
      "grad_norm": 0.10147374123334885,
      "learning_rate": 8.552459335135381e-05,
      "loss": 0.0848,
      "mean_token_accuracy": 0.977893346697092,
      "num_tokens": 4453374.0,
      "step": 550
    },
    {
      "entropy": 0.08721992008388042,
      "epoch": 0.8888888888888888,
      "grad_norm": 0.08062940090894699,
      "learning_rate": 8.259993435156559e-05,
      "loss": 0.0844,
      "mean_token_accuracy": 0.9785151568055153,
      "num_tokens": 4859149.0,
      "step": 600
    },
    {
      "entropy": 0.08216898602433503,
      "epoch": 0.9629629629629629,
      "grad_norm": 0.1335250586271286,
      "learning_rate": 7.946685410208296e-05,
      "loss": 0.0798,
      "mean_token_accuracy": 0.9796955060958862,
      "num_tokens": 5264437.0,
      "step": 650
    },
    {
      "entropy": 0.08460669645108282,
      "epoch": 1.037037037037037,
      "grad_norm": 0.0986652821302414,
      "learning_rate": 7.614538333345735e-05,
      "loss": 0.0822,
      "mean_token_accuracy": 0.9785672229528427,
      "num_tokens": 5669023.0,
      "step": 700
    },
    {
      "entropy": 0.0850414677709341,
      "epoch": 1.1111111111111112,
      "grad_norm": 0.07220367342233658,
      "learning_rate": 7.265675721386285e-05,
      "loss": 0.0824,
      "mean_token_accuracy": 0.9784497334063054,
      "num_tokens": 6073001.0,
      "step": 750
    },
    {
      "entropy": 0.08401141031645239,
      "epoch": 1.1851851851851851,
      "grad_norm": 0.1240311786532402,
      "learning_rate": 6.902327958623736e-05,
      "loss": 0.0826,
      "mean_token_accuracy": 0.9786691051721573,
      "num_tokens": 6478325.0,
      "step": 800
    },
    {
      "entropy": 0.08358457050286233,
      "epoch": 1.2592592592592593,
      "grad_norm": 0.10120349377393723,
      "learning_rate": 6.526818037306228e-05,
      "loss": 0.0811,
      "mean_token_accuracy": 0.9787746147811413,
      "num_tokens": 6882747.0,
      "step": 850
    },
    {
      "entropy": 0.0835177150182426,
      "epoch": 1.3333333333333333,
      "grad_norm": 0.10029594600200653,
      "learning_rate": 6.14154670604355e-05,
      "loss": 0.0818,
      "mean_token_accuracy": 0.9788197261095047,
      "num_tokens": 7287039.0,
      "step": 900
    },
    {
      "entropy": 0.08201941348612309,
      "epoch": 1.4074074074074074,
      "grad_norm": 0.09068141877651215,
      "learning_rate": 5.7489771210944564e-05,
      "loss": 0.0802,
      "mean_token_accuracy": 0.9791601756215096,
      "num_tokens": 7692281.0,
      "step": 950
    },
    {
      "entropy": 0.08500135038048029,
      "epoch": 1.4814814814814814,
      "grad_norm": 0.10811195522546768,
      "learning_rate": 5.351619098663021e-05,
      "loss": 0.0829,
      "mean_token_accuracy": 0.9783452861011028,
      "num_tokens": 8096455.0,
      "step": 1000
    },
    {
      "entropy": 0.08261076767928899,
      "epoch": 1.5555555555555556,
      "grad_norm": 0.07817448675632477,
      "learning_rate": 4.952013068883795e-05,
      "loss": 0.0807,
      "mean_token_accuracy": 0.9787566863000393,
      "num_tokens": 8501481.0,
      "step": 1050
    },
    {
      "entropy": 0.08088674335740506,
      "epoch": 1.6296296296296298,
      "grad_norm": 0.0741722360253334,
      "learning_rate": 4.5527138340828776e-05,
      "loss": 0.0794,
      "mean_token_accuracy": 0.9796176181733608,
      "num_tokens": 8907814.0,
      "step": 1100
    },
    {
      "entropy": 0.08092430792748928,
      "epoch": 1.7037037037037037,
      "grad_norm": 0.0863470658659935,
      "learning_rate": 4.156274235153189e-05,
      "loss": 0.0792,
      "mean_token_accuracy": 0.9792905601859093,
      "num_tokens": 9312142.0,
      "step": 1150
    },
    {
      "entropy": 0.07963837143965065,
      "epoch": 1.7777777777777777,
      "grad_norm": 0.10823621600866318,
      "learning_rate": 3.765228830469794e-05,
      "loss": 0.0791,
      "mean_token_accuracy": 0.9794147987663746,
      "num_tokens": 9716258.0,
      "step": 1200
    },
    {
      "entropy": 0.08162923349067569,
      "epoch": 1.8518518518518519,
      "grad_norm": 0.1495106816291809,
      "learning_rate": 3.3820776916908857e-05,
      "loss": 0.0801,
      "mean_token_accuracy": 0.9793653392791748,
      "num_tokens": 10121713.0,
      "step": 1250
    },
    {
      "entropy": 0.08032218031585217,
      "epoch": 1.925925925925926,
      "grad_norm": 0.08044654875993729,
      "learning_rate": 3.0092704200428058e-05,
      "loss": 0.079,
      "mean_token_accuracy": 0.9795299915969372,
      "num_tokens": 10526002.0,
      "step": 1300
    },
    {
      "entropy": 0.07872624884359539,
      "epoch": 2.0,
      "grad_norm": 0.07752422988414764,
      "learning_rate": 2.649190485277792e-05,
      "loss": 0.0775,
      "mean_token_accuracy": 0.980090646147728,
      "num_tokens": 10932428.0,
      "step": 1350
    },
    {
      "entropy": 0.08063295830972493,
      "epoch": 2.074074074074074,
      "grad_norm": 0.09133461862802505,
      "learning_rate": 2.3041399874302905e-05,
      "loss": 0.0793,
      "mean_token_accuracy": 0.9794050461053848,
      "num_tokens": 11337209.0,
      "step": 1400
    },
    {
      "entropy": 0.08033578357659281,
      "epoch": 2.148148148148148,
      "grad_norm": 0.06361774355173111,
      "learning_rate": 1.976324938794482e-05,
      "loss": 0.0792,
      "mean_token_accuracy": 0.9797105365991592,
      "num_tokens": 11741968.0,
      "step": 1450
    },
    {
      "entropy": 0.07973854598589242,
      "epoch": 2.2222222222222223,
      "grad_norm": 0.09148402512073517,
      "learning_rate": 1.667841160219835e-05,
      "loss": 0.0778,
      "mean_token_accuracy": 0.9796544459462165,
      "num_tokens": 12147108.0,
      "step": 1500
    },
    {
      "entropy": 0.07991634771227836,
      "epoch": 2.2962962962962963,
      "grad_norm": 0.058334823697805405,
      "learning_rate": 1.3806608818939203e-05,
      "loss": 0.0787,
      "mean_token_accuracy": 0.9793905445933342,
      "num_tokens": 12551885.0,
      "step": 1550
    },
    {
      "entropy": 0.07991615429520607,
      "epoch": 2.3703703703703702,
      "grad_norm": 0.07122901827096939,
      "learning_rate": 1.1166201342777438e-05,
      "loss": 0.0785,
      "mean_token_accuracy": 0.979671506434679,
      "num_tokens": 12956475.0,
      "step": 1600
    },
    {
      "entropy": 0.07969259418547153,
      "epoch": 2.4444444444444446,
      "grad_norm": 0.11193029582500458,
      "learning_rate": 8.774070098071668e-06,
      "loss": 0.0787,
      "mean_token_accuracy": 0.979515576660633,
      "num_tokens": 13362716.0,
      "step": 1650
    },
    {
      "entropy": 0.08067716302350164,
      "epoch": 2.5185185185185186,
      "grad_norm": 0.09012539684772491,
      "learning_rate": 6.645508704069003e-06,
      "loss": 0.0802,
      "mean_token_accuracy": 0.9791687172651291,
      "num_tokens": 13766986.0,
      "step": 1700
    },
    {
      "entropy": 0.07974634082056582,
      "epoch": 2.5925925925925926,
      "grad_norm": 0.09309827536344528,
      "learning_rate": 4.794125698167262e-06,
      "loss": 0.0787,
      "mean_token_accuracy": 0.9794514080882073,
      "num_tokens": 14171018.0,
      "step": 1750
    },
    {
      "entropy": 0.08121541824191808,
      "epoch": 2.6666666666666665,
      "grad_norm": 0.06920253485441208,
      "learning_rate": 3.231757532415458e-06,
      "loss": 0.0794,
      "mean_token_accuracy": 0.9792174778878688,
      "num_tokens": 14575902.0,
      "step": 1800
    },
    {
      "entropy": 0.07990395256318152,
      "epoch": 2.7407407407407405,
      "grad_norm": 0.05824149027466774,
      "learning_rate": 1.9683928994924385e-06,
      "loss": 0.0781,
      "mean_token_accuracy": 0.9798404219746589,
      "num_tokens": 14980838.0,
      "step": 1850
    },
    {
      "entropy": 0.08023261365480722,
      "epoch": 2.814814814814815,
      "grad_norm": 0.08101186901330948,
      "learning_rate": 1.0121088719706296e-06,
      "loss": 0.0795,
      "mean_token_accuracy": 0.9791903717815876,
      "num_tokens": 15385944.0,
      "step": 1900
    },
    {
      "entropy": 0.08040859408676625,
      "epoch": 2.888888888888889,
      "grad_norm": 0.07091067731380463,
      "learning_rate": 3.6901926314575894e-07,
      "loss": 0.0797,
      "mean_token_accuracy": 0.9792876356840133,
      "num_tokens": 15791071.0,
      "step": 1950
    },
    {
      "entropy": 0.0779489404708147,
      "epoch": 2.962962962962963,
      "grad_norm": 0.05987590551376343,
      "learning_rate": 4.323553957759629e-08,
      "loss": 0.0778,
      "mean_token_accuracy": 0.9800369493663311,
      "num_tokens": 16196052.0,
      "step": 2000
    }
  ],
  "logging_steps": 50,
  "max_steps": 2025,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.765523181366723e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}