{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 244, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.0842015266418457, "epoch": 0.0823045267489712, "grad_norm": 1.2566404342651367, "learning_rate": 1.999911398855782e-05, "loss": 1.1726, "mean_token_accuracy": 0.6686171144247055, "num_tokens": 609981.0, "step": 10 }, { "entropy": 0.9032916635274887, "epoch": 0.1646090534979424, "grad_norm": 0.7735289931297302, "learning_rate": 1.9892982458192286e-05, "loss": 0.8736, "mean_token_accuracy": 0.7255816429853439, "num_tokens": 1206011.0, "step": 20 }, { "entropy": 0.8239103972911834, "epoch": 0.24691358024691357, "grad_norm": 0.6542304158210754, "learning_rate": 1.9611801152462715e-05, "loss": 0.7927, "mean_token_accuracy": 0.7448168516159057, "num_tokens": 1810330.0, "step": 30 }, { "entropy": 0.7995721608400345, "epoch": 0.3292181069958848, "grad_norm": 0.6543221473693848, "learning_rate": 1.916054538842971e-05, "loss": 0.7673, "mean_token_accuracy": 0.7500824242830276, "num_tokens": 2411257.0, "step": 40 }, { "entropy": 0.7843733370304108, "epoch": 0.411522633744856, "grad_norm": 0.6735917329788208, "learning_rate": 1.8547199838102904e-05, "loss": 0.7481, "mean_token_accuracy": 0.7548356115818023, "num_tokens": 3022014.0, "step": 50 }, { "entropy": 0.76707524061203, "epoch": 0.49382716049382713, "grad_norm": 0.6267600655555725, "learning_rate": 1.778261724495566e-05, "loss": 0.7368, "mean_token_accuracy": 0.7574090182781219, "num_tokens": 3637233.0, "step": 60 }, { "entropy": 0.7542366564273835, "epoch": 0.5761316872427984, "grad_norm": 0.6589949131011963, "learning_rate": 1.6880326391813917e-05, "loss": 0.7172, "mean_token_accuracy": 0.7609850108623505, "num_tokens": 4242671.0, "step": 70 }, { "entropy": 0.74325290620327, "epoch": 0.6584362139917695, "grad_norm": 0.6847052574157715, "learning_rate": 1.5856292718000235e-05, "loss": 0.7053, "mean_token_accuracy": 0.7654363572597503, "num_tokens": 4842730.0, "step": 80 }, { "entropy": 0.7401553928852082, "epoch": 0.7407407407407407, "grad_norm": 0.6277093291282654, "learning_rate": 1.4728635821454255e-05, "loss": 0.7037, "mean_token_accuracy": 0.7650831431150437, "num_tokens": 5460571.0, "step": 90 }, { "entropy": 0.7254371553659439, "epoch": 0.823045267489712, "grad_norm": 0.6459594368934631, "learning_rate": 1.351730884444245e-05, "loss": 0.6869, "mean_token_accuracy": 0.7702283948659897, "num_tokens": 6058130.0, "step": 100 }, { "entropy": 0.7227081388235093, "epoch": 0.9053497942386831, "grad_norm": 0.6610374450683594, "learning_rate": 1.2243745415914882e-05, "loss": 0.6827, "mean_token_accuracy": 0.7705969363451004, "num_tokens": 6665354.0, "step": 110 }, { "entropy": 0.7182952612638474, "epoch": 0.9876543209876543, "grad_norm": 0.6171860098838806, "learning_rate": 1.0930480397630146e-05, "loss": 0.6793, "mean_token_accuracy": 0.7711377799510956, "num_tokens": 7279924.0, "step": 120 }, { "entropy": 0.6223280147502297, "epoch": 1.0658436213991769, "grad_norm": 0.6846635341644287, "learning_rate": 9.600751144694827e-06, "loss": 0.5587, "mean_token_accuracy": 0.8087636044150904, "num_tokens": 7855601.0, "step": 130 }, { "entropy": 0.5616398304700851, "epoch": 1.1481481481481481, "grad_norm": 0.6427133083343506, "learning_rate": 8.278086335948191e-06, "loss": 0.5056, "mean_token_accuracy": 0.821845856308937, "num_tokens": 8458600.0, "step": 140 }, { "entropy": 0.5549788802862168, "epoch": 1.2304526748971194, "grad_norm": 0.623350203037262, "learning_rate": 6.9858896495663046e-06, "loss": 0.5165, "mean_token_accuracy": 0.8178219944238663, "num_tokens": 9066971.0, "step": 150 }, { "entropy": 0.556912761926651, "epoch": 1.3127572016460904, "grad_norm": 0.6478646397590637, "learning_rate": 5.747025650470135e-06, "loss": 0.5115, "mean_token_accuracy": 0.8192351371049881, "num_tokens": 9676134.0, "step": 160 }, { "entropy": 0.5518900156021118, "epoch": 1.3950617283950617, "grad_norm": 0.6248000264167786, "learning_rate": 4.583415216985791e-06, "loss": 0.5095, "mean_token_accuracy": 0.8200697928667069, "num_tokens": 10281801.0, "step": 170 }, { "entropy": 0.5520358562469483, "epoch": 1.477366255144033, "grad_norm": 0.6373401284217834, "learning_rate": 3.51564766541435e-06, "loss": 0.5063, "mean_token_accuracy": 0.8212656050920486, "num_tokens": 10894137.0, "step": 180 }, { "entropy": 0.5422273725271225, "epoch": 1.5596707818930042, "grad_norm": 0.6336973905563354, "learning_rate": 2.5626164357101857e-06, "loss": 0.5016, "mean_token_accuracy": 0.8229126304388046, "num_tokens": 11497132.0, "step": 190 }, { "entropy": 0.5529572516679764, "epoch": 1.6419753086419753, "grad_norm": 0.6020342707633972, "learning_rate": 1.7411847845686082e-06, "loss": 0.5072, "mean_token_accuracy": 0.8205246955156327, "num_tokens": 12111651.0, "step": 200 }, { "entropy": 0.5367681249976158, "epoch": 1.7242798353909465, "grad_norm": 0.6049624681472778, "learning_rate": 1.0658874012622244e-06, "loss": 0.4914, "mean_token_accuracy": 0.8258270412683487, "num_tokens": 12715771.0, "step": 210 }, { "entropy": 0.5322598591446877, "epoch": 1.8065843621399176, "grad_norm": 0.6055402755737305, "learning_rate": 5.486732259363647e-07, "loss": 0.4899, "mean_token_accuracy": 0.8261870980262757, "num_tokens": 13321360.0, "step": 220 }, { "entropy": 0.537242217361927, "epoch": 1.8888888888888888, "grad_norm": 0.6070998311042786, "learning_rate": 1.986940210234922e-07, "loss": 0.4928, "mean_token_accuracy": 0.8254955172538757, "num_tokens": 13923829.0, "step": 230 }, { "entropy": 0.5378372967243195, "epoch": 1.97119341563786, "grad_norm": 0.6216578483581543, "learning_rate": 2.2142436865499884e-08, "loss": 0.4978, "mean_token_accuracy": 0.823452839255333, "num_tokens": 14530681.0, "step": 240 } ], "logging_steps": 10, "max_steps": 244, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.30369816421335e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }