{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9009009009009009, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.7490966856479644, "epoch": 0.09009009009009009, "grad_norm": 4.318613052368164, "learning_rate": 9e-05, "loss": 6.8516, "mean_token_accuracy": 0.31817560344934465, "num_tokens": 5273.0, "step": 10 }, { "entropy": 2.358074116706848, "epoch": 0.18018018018018017, "grad_norm": 2.591895341873169, "learning_rate": 0.00019, "loss": 4.7791, "mean_token_accuracy": 0.4064402043819427, "num_tokens": 10323.0, "step": 20 }, { "entropy": 2.7442060232162477, "epoch": 0.2702702702702703, "grad_norm": 2.085822105407715, "learning_rate": 0.00019424920127795528, "loss": 2.528, "mean_token_accuracy": 0.6114133253693581, "num_tokens": 15657.0, "step": 30 }, { "entropy": 1.6624038934707641, "epoch": 0.36036036036036034, "grad_norm": 1.898437261581421, "learning_rate": 0.0001878594249201278, "loss": 1.5967, "mean_token_accuracy": 0.75713110268116, "num_tokens": 20566.0, "step": 40 }, { "entropy": 1.3454128205776215, "epoch": 0.45045045045045046, "grad_norm": 2.2532074451446533, "learning_rate": 0.00018146964856230032, "loss": 1.3224, "mean_token_accuracy": 0.7765294492244721, "num_tokens": 25625.0, "step": 50 }, { "entropy": 1.1165625154972076, "epoch": 0.5405405405405406, "grad_norm": 2.300675868988037, "learning_rate": 0.00017507987220447287, "loss": 1.0734, "mean_token_accuracy": 0.803844365477562, "num_tokens": 30624.0, "step": 60 }, { "entropy": 0.975060197710991, "epoch": 0.6306306306306306, "grad_norm": 1.954689383506775, "learning_rate": 0.00016869009584664536, "loss": 0.9733, "mean_token_accuracy": 0.818100169301033, "num_tokens": 35746.0, "step": 70 }, { "entropy": 0.8530668556690216, "epoch": 0.7207207207207207, "grad_norm": 2.083409309387207, "learning_rate": 0.0001623003194888179, "loss": 0.8131, "mean_token_accuracy": 0.8403199791908265, "num_tokens": 41031.0, "step": 80 }, { "entropy": 0.8242652952671051, "epoch": 0.8108108108108109, "grad_norm": 2.1543054580688477, "learning_rate": 0.00015591054313099042, "loss": 0.7285, "mean_token_accuracy": 0.8522787302732467, "num_tokens": 45910.0, "step": 90 }, { "entropy": 0.7216496258974076, "epoch": 0.9009009009009009, "grad_norm": 1.8068711757659912, "learning_rate": 0.00014952076677316297, "loss": 0.632, "mean_token_accuracy": 0.8595692455768585, "num_tokens": 51027.0, "step": 100 } ], "logging_steps": 10, "max_steps": 333, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 561296602595328.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }