| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 1456, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 0.7024799610674382, | |
| "epoch": 0.034352456200618345, | |
| "grad_norm": 0.3786041736602783, | |
| "learning_rate": 0.00019999381220544408, | |
| "loss": 3.4501, | |
| "mean_token_accuracy": 0.8102489286661148, | |
| "num_tokens": 4778405.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.4085597662627697, | |
| "epoch": 0.06870491240123669, | |
| "grad_norm": 0.45050907135009766, | |
| "learning_rate": 0.0001992522030006205, | |
| "loss": 1.6395, | |
| "mean_token_accuracy": 0.8959537535905838, | |
| "num_tokens": 9520461.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.3411180632561445, | |
| "epoch": 0.10305736860185503, | |
| "grad_norm": 0.541092574596405, | |
| "learning_rate": 0.00019728354289827674, | |
| "loss": 1.3659, | |
| "mean_token_accuracy": 0.9108871224522591, | |
| "num_tokens": 14289882.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.3092808402329683, | |
| "epoch": 0.13740982480247338, | |
| "grad_norm": 0.5023065805435181, | |
| "learning_rate": 0.0001941121703621694, | |
| "loss": 1.2382, | |
| "mean_token_accuracy": 0.9177950304746628, | |
| "num_tokens": 19043686.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.28368816956877707, | |
| "epoch": 0.1717622810030917, | |
| "grad_norm": 0.6615238189697266, | |
| "learning_rate": 0.0001897772929403342, | |
| "loss": 1.1313, | |
| "mean_token_accuracy": 0.9239873242378235, | |
| "num_tokens": 23799258.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.2734688103199005, | |
| "epoch": 0.20611473720371007, | |
| "grad_norm": 0.5904623866081238, | |
| "learning_rate": 0.00018433250254378466, | |
| "loss": 1.0942, | |
| "mean_token_accuracy": 0.9259539729356766, | |
| "num_tokens": 28560023.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.2526649332791567, | |
| "epoch": 0.2404671934043284, | |
| "grad_norm": 0.5767380595207214, | |
| "learning_rate": 0.00017784511289192364, | |
| "loss": 1.0103, | |
| "mean_token_accuracy": 0.930404528081417, | |
| "num_tokens": 33319338.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.23849865928292274, | |
| "epoch": 0.27481964960494676, | |
| "grad_norm": 0.5684878826141357, | |
| "learning_rate": 0.00017039532731580285, | |
| "loss": 0.9525, | |
| "mean_token_accuracy": 0.9337890791893005, | |
| "num_tokens": 38077374.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.2240120116621256, | |
| "epoch": 0.3091721058055651, | |
| "grad_norm": 0.6722640991210938, | |
| "learning_rate": 0.00016207524720764495, | |
| "loss": 0.8956, | |
| "mean_token_accuracy": 0.9374561330676079, | |
| "num_tokens": 42835757.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.21341249123215675, | |
| "epoch": 0.3435245620061834, | |
| "grad_norm": 0.7091155052185059, | |
| "learning_rate": 0.0001529877333751247, | |
| "loss": 0.851, | |
| "mean_token_accuracy": 0.9395769771933555, | |
| "num_tokens": 47599437.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.20303959287703038, | |
| "epoch": 0.3778770182068018, | |
| "grad_norm": 0.6120350956916809, | |
| "learning_rate": 0.00014324513437744094, | |
| "loss": 0.8086, | |
| "mean_token_accuracy": 0.9425579416751861, | |
| "num_tokens": 52365150.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.190861323364079, | |
| "epoch": 0.41222947440742014, | |
| "grad_norm": 0.6448450088500977, | |
| "learning_rate": 0.00013296789756470645, | |
| "loss": 0.7598, | |
| "mean_token_accuracy": 0.9453012818098068, | |
| "num_tokens": 57104680.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.18692783065140248, | |
| "epoch": 0.44658193060803847, | |
| "grad_norm": 0.611838698387146, | |
| "learning_rate": 0.00012228307999232003, | |
| "loss": 0.7456, | |
| "mean_token_accuracy": 0.9463415560126305, | |
| "num_tokens": 61869876.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.18007670246064664, | |
| "epoch": 0.4809343868086568, | |
| "grad_norm": 0.628646969795227, | |
| "learning_rate": 0.00011132277761982368, | |
| "loss": 0.7165, | |
| "mean_token_accuracy": 0.9481008306145668, | |
| "num_tokens": 66633630.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.17309307347983122, | |
| "epoch": 0.5152868430092752, | |
| "grad_norm": 0.5906488299369812, | |
| "learning_rate": 0.0001002224922139962, | |
| "loss": 0.6884, | |
| "mean_token_accuracy": 0.9499120843410492, | |
| "num_tokens": 71378556.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.16796923141926526, | |
| "epoch": 0.5496392992098935, | |
| "grad_norm": 0.5354777574539185, | |
| "learning_rate": 8.911945614609149e-05, | |
| "loss": 0.667, | |
| "mean_token_accuracy": 0.9511926004290581, | |
| "num_tokens": 76120380.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.1641952520981431, | |
| "epoch": 0.5839917554105118, | |
| "grad_norm": 0.5642014145851135, | |
| "learning_rate": 7.815093579368544e-05, | |
| "loss": 0.6517, | |
| "mean_token_accuracy": 0.9521191772818566, | |
| "num_tokens": 80892616.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.1647306801378727, | |
| "epoch": 0.6183442116111302, | |
| "grad_norm": 0.6312394142150879, | |
| "learning_rate": 6.74525345221039e-05, | |
| "loss": 0.6563, | |
| "mean_token_accuracy": 0.9517807376384735, | |
| "num_tokens": 85659341.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.16419454056769608, | |
| "epoch": 0.6526966678117485, | |
| "grad_norm": 0.522739827632904, | |
| "learning_rate": 5.7156516225602575e-05, | |
| "loss": 0.6513, | |
| "mean_token_accuracy": 0.9520121681690216, | |
| "num_tokens": 90396690.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 0.15384907972067594, | |
| "epoch": 0.6870491240123668, | |
| "grad_norm": 0.5194680690765381, | |
| "learning_rate": 4.739017015429047e-05, | |
| "loss": 0.6125, | |
| "mean_token_accuracy": 0.9544967138767242, | |
| "num_tokens": 95165173.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 0.1492863280326128, | |
| "epoch": 0.7214015802129853, | |
| "grad_norm": 0.5211461782455444, | |
| "learning_rate": 3.8274237242373465e-05, | |
| "loss": 0.5918, | |
| "mean_token_accuracy": 0.9560691934823989, | |
| "num_tokens": 99907260.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 0.15369875568896532, | |
| "epoch": 0.7557540364136036, | |
| "grad_norm": 0.6547698378562927, | |
| "learning_rate": 2.992141739295844e-05, | |
| "loss": 0.6108, | |
| "mean_token_accuracy": 0.9546975934505463, | |
| "num_tokens": 104664377.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 0.14807064708322287, | |
| "epoch": 0.790106492614222, | |
| "grad_norm": 0.5042953491210938, | |
| "learning_rate": 2.2434976173792755e-05, | |
| "loss": 0.5875, | |
| "mean_token_accuracy": 0.9564316233992577, | |
| "num_tokens": 109428518.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 0.15006819926202297, | |
| "epoch": 0.8244589488148403, | |
| "grad_norm": 0.4698965549468994, | |
| "learning_rate": 1.5907468149304162e-05, | |
| "loss": 0.5941, | |
| "mean_token_accuracy": 0.9558491191267967, | |
| "num_tokens": 114191645.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 0.13952667724341153, | |
| "epoch": 0.8588114050154586, | |
| "grad_norm": 0.5912120938301086, | |
| "learning_rate": 1.0419592632332752e-05, | |
| "loss": 0.5543, | |
| "mean_token_accuracy": 0.9583651649951935, | |
| "num_tokens": 118945985.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 0.14712075240910052, | |
| "epoch": 0.8931638612160769, | |
| "grad_norm": 0.49117299914360046, | |
| "learning_rate": 6.039196001850489e-06, | |
| "loss": 0.5828, | |
| "mean_token_accuracy": 0.9564252620935441, | |
| "num_tokens": 123701124.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 0.14798750165849925, | |
| "epoch": 0.9275163174166953, | |
| "grad_norm": 0.5953987240791321, | |
| "learning_rate": 2.8204329209724155e-06, | |
| "loss": 0.5874, | |
| "mean_token_accuracy": 0.9561514291167259, | |
| "num_tokens": 128465077.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 0.1426806965842843, | |
| "epoch": 0.9618687736173136, | |
| "grad_norm": 0.5978220105171204, | |
| "learning_rate": 8.030968250873128e-07, | |
| "loss": 0.5633, | |
| "mean_token_accuracy": 0.957860791683197, | |
| "num_tokens": 133220292.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 0.14601790152490138, | |
| "epoch": 0.9962212298179319, | |
| "grad_norm": 0.4329601526260376, | |
| "learning_rate": 1.2127957254959298e-08, | |
| "loss": 0.5793, | |
| "mean_token_accuracy": 0.9565826177597045, | |
| "num_tokens": 137969995.0, | |
| "step": 1450 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 1456, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.342671846811982e+18, | |
| "train_batch_size": 3, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |