{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1332, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.4947163558006284, "epoch": 0.07507507507507508, "grad_norm": 2.2692031860351562, "learning_rate": 1.99976055134129e-05, "loss": 9.666, "mean_token_accuracy": 0.3714864462614059, "num_tokens": 175430.0, "step": 50 }, { "entropy": 4.957374696731567, "epoch": 0.15015015015015015, "grad_norm": 0.4004897177219391, "learning_rate": 1.98972684724999e-05, "loss": 3.687, "mean_token_accuracy": 0.5028938430547715, "num_tokens": 350404.0, "step": 100 }, { "entropy": 4.707881135940552, "epoch": 0.22522522522522523, "grad_norm": 0.4421725571155548, "learning_rate": 1.9650816346189154e-05, "loss": 3.4827, "mean_token_accuracy": 0.5126789313554764, "num_tokens": 526514.0, "step": 150 }, { "entropy": 4.64449327468872, "epoch": 0.3003003003003003, "grad_norm": 0.4718657433986664, "learning_rate": 1.926188754982551e-05, "loss": 3.4128, "mean_token_accuracy": 0.5216181075572968, "num_tokens": 701236.0, "step": 200 }, { "entropy": 4.598890199661255, "epoch": 0.37537537537537535, "grad_norm": 0.4660639762878418, "learning_rate": 1.8736223906463698e-05, "loss": 3.3675, "mean_token_accuracy": 0.528342125415802, "num_tokens": 876499.0, "step": 250 }, { "entropy": 4.603140239715576, "epoch": 0.45045045045045046, "grad_norm": 0.47321391105651855, "learning_rate": 1.8081585879342008e-05, "loss": 3.3821, "mean_token_accuracy": 0.5266077440977096, "num_tokens": 1051864.0, "step": 300 }, { "entropy": 4.587698755264282, "epoch": 0.5255255255255256, "grad_norm": 0.5947113633155823, "learning_rate": 1.7307638002821942e-05, "loss": 3.3509, "mean_token_accuracy": 0.5320688891410827, "num_tokens": 1226112.0, "step": 350 }, { "entropy": 4.596508531570435, "epoch": 0.6006006006006006, "grad_norm": 0.5403749942779541, "learning_rate": 1.6425806203196734e-05, "loss": 3.3672, "mean_token_accuracy": 0.5314443480968475, "num_tokens": 1401199.0, "step": 400 }, { "entropy": 4.5638793849945065, "epoch": 0.6756756756756757, "grad_norm": 0.5813198685646057, "learning_rate": 1.544910911576629e-05, "loss": 3.3328, "mean_token_accuracy": 0.5359585750102996, "num_tokens": 1576517.0, "step": 450 }, { "entropy": 4.571099786758423, "epoch": 0.7507507507507507, "grad_norm": 0.5679024457931519, "learning_rate": 1.4391965888473705e-05, "loss": 3.3438, "mean_token_accuracy": 0.5345500099658966, "num_tokens": 1751444.0, "step": 500 }, { "entropy": 4.5622615623474125, "epoch": 0.8258258258258259, "grad_norm": 0.6713469624519348, "learning_rate": 1.3269983309531584e-05, "loss": 3.3411, "mean_token_accuracy": 0.5363436281681061, "num_tokens": 1926907.0, "step": 550 }, { "entropy": 4.562127561569214, "epoch": 0.9009009009009009, "grad_norm": 0.6330661177635193, "learning_rate": 1.2099725401709685e-05, "loss": 3.3382, "mean_token_accuracy": 0.5368253135681152, "num_tokens": 2102132.0, "step": 600 }, { "entropy": 4.561514482498169, "epoch": 0.975975975975976, "grad_norm": 0.7525476813316345, "learning_rate": 1.0898468884803366e-05, "loss": 3.3411, "mean_token_accuracy": 0.537065578699112, "num_tokens": 2277537.0, "step": 650 }, { "entropy": 4.548706264495849, "epoch": 1.0510510510510511, "grad_norm": 0.6428335309028625, "learning_rate": 9.683948116432609e-06, "loss": 3.3205, "mean_token_accuracy": 0.5413484466075897, "num_tokens": 2449566.0, "step": 700 }, { "entropy": 4.5432649517059325, "epoch": 1.1261261261261262, "grad_norm": 0.6926820278167725, "learning_rate": 8.474093276654764e-06, "loss": 3.3025, "mean_token_accuracy": 0.5445451009273529, "num_tokens": 2623865.0, "step": 750 }, { "entropy": 4.537814807891846, "epoch": 1.2012012012012012, "grad_norm": 0.6473987698554993, "learning_rate": 7.286765661616761e-06, "loss": 3.3031, "mean_token_accuracy": 0.5444674134254456, "num_tokens": 2798793.0, "step": 800 }, { "entropy": 4.533882551193237, "epoch": 1.2762762762762763, "grad_norm": 0.7260850667953491, "learning_rate": 6.139493994152428e-06, "loss": 3.3086, "mean_token_accuracy": 0.5432299053668976, "num_tokens": 2975077.0, "step": 850 }, { "entropy": 4.509051198959351, "epoch": 1.3513513513513513, "grad_norm": 0.7649775743484497, "learning_rate": 5.0492156442170914e-06, "loss": 3.262, "mean_token_accuracy": 0.5496996784210205, "num_tokens": 3149867.0, "step": 900 }, { "entropy": 4.533229093551636, "epoch": 1.4264264264264264, "grad_norm": 0.7427539229393005, "learning_rate": 4.0320265795669815e-06, "loss": 3.2888, "mean_token_accuracy": 0.5470581102371216, "num_tokens": 3324230.0, "step": 950 }, { "entropy": 4.5299335289001466, "epoch": 1.5015015015015014, "grad_norm": 0.7631803154945374, "learning_rate": 3.1029437382047368e-06, "loss": 3.2959, "mean_token_accuracy": 0.5465954422950745, "num_tokens": 3499600.0, "step": 1000 }, { "entropy": 4.519344367980957, "epoch": 1.5765765765765765, "grad_norm": 0.7936219573020935, "learning_rate": 2.275683330727697e-06, "loss": 3.2927, "mean_token_accuracy": 0.5452944958209991, "num_tokens": 3676089.0, "step": 1050 }, { "entropy": 4.532338409423828, "epoch": 1.6516516516516515, "grad_norm": 0.735207200050354, "learning_rate": 1.562458345539739e-06, "loss": 3.3022, "mean_token_accuracy": 0.5458044326305389, "num_tokens": 3851878.0, "step": 1100 }, { "entropy": 4.519860811233521, "epoch": 1.7267267267267268, "grad_norm": 0.9440169930458069, "learning_rate": 9.737982463922102e-07, "loss": 3.2767, "mean_token_accuracy": 0.5487184035778045, "num_tokens": 4026785.0, "step": 1150 }, { "entropy": 4.5121307468414305, "epoch": 1.8018018018018018, "grad_norm": 0.8288739919662476, "learning_rate": 5.183935240903415e-07, "loss": 3.2591, "mean_token_accuracy": 0.5512541651725769, "num_tokens": 4200908.0, "step": 1200 }, { "entropy": 4.517954025268555, "epoch": 1.8768768768768769, "grad_norm": 0.7847370505332947, "learning_rate": 2.0296739727517335e-07, "loss": 3.2798, "mean_token_accuracy": 0.5484513938426971, "num_tokens": 4376428.0, "step": 1250 }, { "entropy": 4.50932373046875, "epoch": 1.951951951951952, "grad_norm": 0.7920858860015869, "learning_rate": 3.217655638451112e-08, "loss": 3.2683, "mean_token_accuracy": 0.5490582883358002, "num_tokens": 4551678.0, "step": 1300 } ], "logging_steps": 50, "max_steps": 1332, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.767706644103712e+17, "train_batch_size": 24, "trial_name": null, "trial_params": null }