{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1456, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.7024799610674382, "epoch": 0.034352456200618345, "grad_norm": 0.3786041736602783, "learning_rate": 0.00019999381220544408, "loss": 3.4501, "mean_token_accuracy": 0.8102489286661148, "num_tokens": 4778405.0, "step": 50 }, { "entropy": 0.4085597662627697, "epoch": 0.06870491240123669, "grad_norm": 0.45050907135009766, "learning_rate": 0.0001992522030006205, "loss": 1.6395, "mean_token_accuracy": 0.8959537535905838, "num_tokens": 9520461.0, "step": 100 }, { "entropy": 0.3411180632561445, "epoch": 0.10305736860185503, "grad_norm": 0.541092574596405, "learning_rate": 0.00019728354289827674, "loss": 1.3659, "mean_token_accuracy": 0.9108871224522591, "num_tokens": 14289882.0, "step": 150 }, { "entropy": 0.3092808402329683, "epoch": 0.13740982480247338, "grad_norm": 0.5023065805435181, "learning_rate": 0.0001941121703621694, "loss": 1.2382, "mean_token_accuracy": 0.9177950304746628, "num_tokens": 19043686.0, "step": 200 }, { "entropy": 0.28368816956877707, "epoch": 0.1717622810030917, "grad_norm": 0.6615238189697266, "learning_rate": 0.0001897772929403342, "loss": 1.1313, "mean_token_accuracy": 0.9239873242378235, "num_tokens": 23799258.0, "step": 250 }, { "entropy": 0.2734688103199005, "epoch": 0.20611473720371007, "grad_norm": 0.5904623866081238, "learning_rate": 0.00018433250254378466, "loss": 1.0942, "mean_token_accuracy": 0.9259539729356766, "num_tokens": 28560023.0, "step": 300 }, { "entropy": 0.2526649332791567, "epoch": 0.2404671934043284, "grad_norm": 0.5767380595207214, "learning_rate": 0.00017784511289192364, "loss": 1.0103, "mean_token_accuracy": 0.930404528081417, "num_tokens": 33319338.0, "step": 350 }, { "entropy": 0.23849865928292274, "epoch": 0.27481964960494676, "grad_norm": 0.5684878826141357, "learning_rate": 0.00017039532731580285, "loss": 0.9525, "mean_token_accuracy": 0.9337890791893005, "num_tokens": 38077374.0, "step": 400 }, { "entropy": 0.2240120116621256, "epoch": 0.3091721058055651, "grad_norm": 0.6722640991210938, "learning_rate": 0.00016207524720764495, "loss": 0.8956, "mean_token_accuracy": 0.9374561330676079, "num_tokens": 42835757.0, "step": 450 }, { "entropy": 0.21341249123215675, "epoch": 0.3435245620061834, "grad_norm": 0.7091155052185059, "learning_rate": 0.0001529877333751247, "loss": 0.851, "mean_token_accuracy": 0.9395769771933555, "num_tokens": 47599437.0, "step": 500 }, { "entropy": 0.20303959287703038, "epoch": 0.3778770182068018, "grad_norm": 0.6120350956916809, "learning_rate": 0.00014324513437744094, "loss": 0.8086, "mean_token_accuracy": 0.9425579416751861, "num_tokens": 52365150.0, "step": 550 }, { "entropy": 0.190861323364079, "epoch": 0.41222947440742014, "grad_norm": 0.6448450088500977, "learning_rate": 0.00013296789756470645, "loss": 0.7598, "mean_token_accuracy": 0.9453012818098068, "num_tokens": 57104680.0, "step": 600 }, { "entropy": 0.18692783065140248, "epoch": 0.44658193060803847, "grad_norm": 0.611838698387146, "learning_rate": 0.00012228307999232003, "loss": 0.7456, "mean_token_accuracy": 0.9463415560126305, "num_tokens": 61869876.0, "step": 650 }, { "entropy": 0.18007670246064664, "epoch": 0.4809343868086568, "grad_norm": 0.628646969795227, "learning_rate": 0.00011132277761982368, "loss": 0.7165, "mean_token_accuracy": 0.9481008306145668, "num_tokens": 66633630.0, "step": 700 }, { "entropy": 0.17309307347983122, "epoch": 0.5152868430092752, "grad_norm": 0.5906488299369812, "learning_rate": 0.0001002224922139962, "loss": 0.6884, "mean_token_accuracy": 0.9499120843410492, "num_tokens": 71378556.0, "step": 750 }, { "entropy": 0.16796923141926526, "epoch": 0.5496392992098935, "grad_norm": 0.5354777574539185, "learning_rate": 8.911945614609149e-05, "loss": 0.667, "mean_token_accuracy": 0.9511926004290581, "num_tokens": 76120380.0, "step": 800 }, { "entropy": 0.1641952520981431, "epoch": 0.5839917554105118, "grad_norm": 0.5642014145851135, "learning_rate": 7.815093579368544e-05, "loss": 0.6517, "mean_token_accuracy": 0.9521191772818566, "num_tokens": 80892616.0, "step": 850 }, { "entropy": 0.1647306801378727, "epoch": 0.6183442116111302, "grad_norm": 0.6312394142150879, "learning_rate": 6.74525345221039e-05, "loss": 0.6563, "mean_token_accuracy": 0.9517807376384735, "num_tokens": 85659341.0, "step": 900 }, { "entropy": 0.16419454056769608, "epoch": 0.6526966678117485, "grad_norm": 0.522739827632904, "learning_rate": 5.7156516225602575e-05, "loss": 0.6513, "mean_token_accuracy": 0.9520121681690216, "num_tokens": 90396690.0, "step": 950 }, { "entropy": 0.15384907972067594, "epoch": 0.6870491240123668, "grad_norm": 0.5194680690765381, "learning_rate": 4.739017015429047e-05, "loss": 0.6125, "mean_token_accuracy": 0.9544967138767242, "num_tokens": 95165173.0, "step": 1000 }, { "entropy": 0.1492863280326128, "epoch": 0.7214015802129853, "grad_norm": 0.5211461782455444, "learning_rate": 3.8274237242373465e-05, "loss": 0.5918, "mean_token_accuracy": 0.9560691934823989, "num_tokens": 99907260.0, "step": 1050 }, { "entropy": 0.15369875568896532, "epoch": 0.7557540364136036, "grad_norm": 0.6547698378562927, "learning_rate": 2.992141739295844e-05, "loss": 0.6108, "mean_token_accuracy": 0.9546975934505463, "num_tokens": 104664377.0, "step": 1100 }, { "entropy": 0.14807064708322287, "epoch": 0.790106492614222, "grad_norm": 0.5042953491210938, "learning_rate": 2.2434976173792755e-05, "loss": 0.5875, "mean_token_accuracy": 0.9564316233992577, "num_tokens": 109428518.0, "step": 1150 }, { "entropy": 0.15006819926202297, "epoch": 0.8244589488148403, "grad_norm": 0.4698965549468994, "learning_rate": 1.5907468149304162e-05, "loss": 0.5941, "mean_token_accuracy": 0.9558491191267967, "num_tokens": 114191645.0, "step": 1200 }, { "entropy": 0.13952667724341153, "epoch": 0.8588114050154586, "grad_norm": 0.5912120938301086, "learning_rate": 1.0419592632332752e-05, "loss": 0.5543, "mean_token_accuracy": 0.9583651649951935, "num_tokens": 118945985.0, "step": 1250 }, { "entropy": 0.14712075240910052, "epoch": 0.8931638612160769, "grad_norm": 0.49117299914360046, "learning_rate": 6.039196001850489e-06, "loss": 0.5828, "mean_token_accuracy": 0.9564252620935441, "num_tokens": 123701124.0, "step": 1300 }, { "entropy": 0.14798750165849925, "epoch": 0.9275163174166953, "grad_norm": 0.5953987240791321, "learning_rate": 2.8204329209724155e-06, "loss": 0.5874, "mean_token_accuracy": 0.9561514291167259, "num_tokens": 128465077.0, "step": 1350 }, { "entropy": 0.1426806965842843, "epoch": 0.9618687736173136, "grad_norm": 0.5978220105171204, "learning_rate": 8.030968250873128e-07, "loss": 0.5633, "mean_token_accuracy": 0.957860791683197, "num_tokens": 133220292.0, "step": 1400 }, { "entropy": 0.14601790152490138, "epoch": 0.9962212298179319, "grad_norm": 0.4329601526260376, "learning_rate": 1.2127957254959298e-08, "loss": 0.5793, "mean_token_accuracy": 0.9565826177597045, "num_tokens": 137969995.0, "step": 1450 } ], "logging_steps": 50, "max_steps": 1456, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.342671846811982e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }