{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 50, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.9739606760442256, "epoch": 0.12598425196850394, "grad_norm": 0.968841016292572, "learning_rate": 7.6e-05, "loss": 2.2682, "mean_token_accuracy": 0.5962976351380348, "num_tokens": 99995.0, "step": 20 }, { "entropy": 0.9236388027667999, "epoch": 0.25196850393700787, "grad_norm": 0.30187129974365234, "learning_rate": 0.00015600000000000002, "loss": 0.9381, "mean_token_accuracy": 0.7860403966158629, "num_tokens": 200771.0, "step": 40 }, { "epoch": 0.31496062992125984, "eval_entropy": 0.7079527728587577, "eval_loss": 0.6960555911064148, "eval_mean_token_accuracy": 0.8306159124059497, "eval_num_tokens": 251130.0, "eval_runtime": 45.7017, "eval_samples_per_second": 13.894, "eval_steps_per_second": 6.958, "step": 50 }, { "entropy": 0.7168553218245506, "epoch": 0.3779527559055118, "grad_norm": 0.28825637698173523, "learning_rate": 0.0001997808505782075, "loss": 0.7182, "mean_token_accuracy": 0.8267303571105004, "num_tokens": 300943.0, "step": 60 }, { "entropy": 0.647084547393024, "epoch": 0.5039370078740157, "grad_norm": 0.3307661712169647, "learning_rate": 0.00019773242425416768, "loss": 0.6472, "mean_token_accuracy": 0.8401378501206637, "num_tokens": 401448.0, "step": 80 }, { "entropy": 0.6049567876383662, "epoch": 0.6299212598425197, "grad_norm": 0.3357570469379425, "learning_rate": 0.00019357168190404936, "loss": 0.6097, "mean_token_accuracy": 0.8464727878570557, "num_tokens": 501464.0, "step": 100 }, { "epoch": 0.6299212598425197, "eval_entropy": 0.60639403676087, "eval_loss": 0.5814101696014404, "eval_mean_token_accuracy": 0.8527061408795651, "eval_num_tokens": 501464.0, "eval_runtime": 45.6184, "eval_samples_per_second": 13.92, "eval_steps_per_second": 6.971, "step": 100 }, { "entropy": 0.580510495044291, "epoch": 0.7559055118110236, "grad_norm": 0.35506585240364075, "learning_rate": 0.0001873885507225743, "loss": 0.5852, "mean_token_accuracy": 0.8522923283278943, "num_tokens": 602072.0, "step": 120 }, { "entropy": 0.5525572836399079, "epoch": 0.8818897637795275, "grad_norm": 0.3671925663948059, "learning_rate": 0.00017931666831451536, "loss": 0.5538, "mean_token_accuracy": 0.8564509227871895, "num_tokens": 701808.0, "step": 140 }, { "epoch": 0.9448818897637795, "eval_entropy": 0.5495308028452052, "eval_loss": 0.5309950113296509, "eval_mean_token_accuracy": 0.8605339890755948, "eval_num_tokens": 751756.0, "eval_runtime": 46.259, "eval_samples_per_second": 13.727, "eval_steps_per_second": 6.874, "step": 150 }, { "entropy": 0.5359916405964501, "epoch": 1.0062992125984251, "grad_norm": 0.3443862497806549, "learning_rate": 0.0001695304943507677, "loss": 0.5351, "mean_token_accuracy": 0.8600816813450826, "num_tokens": 800169.0, "step": 160 }, { "entropy": 0.47207170017063615, "epoch": 1.132283464566929, "grad_norm": 0.4181714951992035, "learning_rate": 0.0001582415399266036, "loss": 0.4724, "mean_token_accuracy": 0.8726930443197489, "num_tokens": 899852.0, "step": 180 }, { "entropy": 0.4565439449623227, "epoch": 1.258267716535433, "grad_norm": 0.45549947023391724, "learning_rate": 0.00014569379611796137, "loss": 0.4599, "mean_token_accuracy": 0.875334644690156, "num_tokens": 999653.0, "step": 200 }, { "epoch": 1.258267716535433, "eval_entropy": 0.4601464904141876, "eval_loss": 0.5056809782981873, "eval_mean_token_accuracy": 0.8665201171014294, "eval_num_tokens": 999653.0, "eval_runtime": 45.7221, "eval_samples_per_second": 13.888, "eval_steps_per_second": 6.955, "step": 200 }, { "entropy": 0.4486727064475417, "epoch": 1.384251968503937, "grad_norm": 0.4510684311389923, "learning_rate": 0.00013215846053955683, "loss": 0.4513, "mean_token_accuracy": 0.878431486710906, "num_tokens": 1100570.0, "step": 220 }, { "entropy": 0.4511976413428783, "epoch": 1.510236220472441, "grad_norm": 0.44284847378730774, "learning_rate": 0.00011792807588107357, "loss": 0.4521, "mean_token_accuracy": 0.8780730802565813, "num_tokens": 1201178.0, "step": 240 }, { "epoch": 1.573228346456693, "eval_entropy": 0.46230017211077346, "eval_loss": 0.4856663942337036, "eval_mean_token_accuracy": 0.8700332990232503, "eval_num_tokens": 1251074.0, "eval_runtime": 45.5388, "eval_samples_per_second": 13.944, "eval_steps_per_second": 6.983, "step": 250 }, { "entropy": 0.451329773850739, "epoch": 1.6362204724409448, "grad_norm": 0.4431185722351074, "learning_rate": 0.00010331020710675729, "loss": 0.4531, "mean_token_accuracy": 0.8772743601351977, "num_tokens": 1301346.0, "step": 260 }, { "entropy": 0.42816779650747777, "epoch": 1.762204724409449, "grad_norm": 0.45769160985946655, "learning_rate": 8.862079397472553e-05, "loss": 0.4285, "mean_token_accuracy": 0.8827753882855177, "num_tokens": 1401547.0, "step": 280 }, { "entropy": 0.4320015098899603, "epoch": 1.8881889763779527, "grad_norm": 0.44558027386665344, "learning_rate": 7.417732254970317e-05, "loss": 0.4326, "mean_token_accuracy": 0.8826998364180326, "num_tokens": 1502107.0, "step": 300 }, { "epoch": 1.8881889763779527, "eval_entropy": 0.42231335453454805, "eval_loss": 0.4666292071342468, "eval_mean_token_accuracy": 0.875962255885766, "eval_num_tokens": 1502107.0, "eval_runtime": 46.1255, "eval_samples_per_second": 13.767, "eval_steps_per_second": 6.894, "step": 300 }, { "entropy": 0.4177426643009427, "epoch": 2.0125984251968503, "grad_norm": 0.42371708154678345, "learning_rate": 6.0291963295035484e-05, "loss": 0.4157, "mean_token_accuracy": 0.8861905902246886, "num_tokens": 1600674.0, "step": 320 }, { "entropy": 0.3515403226017952, "epoch": 2.1385826771653544, "grad_norm": 0.5003436803817749, "learning_rate": 4.726482405216125e-05, "loss": 0.3405, "mean_token_accuracy": 0.9042681198567152, "num_tokens": 1700569.0, "step": 340 }, { "epoch": 2.2015748031496063, "eval_entropy": 0.3585739609955242, "eval_loss": 0.46917417645454407, "eval_mean_token_accuracy": 0.8779247264442204, "eval_num_tokens": 1750522.0, "eval_runtime": 46.0421, "eval_samples_per_second": 13.792, "eval_steps_per_second": 6.907, "step": 350 }, { "entropy": 0.33075640462338923, "epoch": 2.264566929133858, "grad_norm": 0.5350939035415649, "learning_rate": 3.537746373263589e-05, "loss": 0.3258, "mean_token_accuracy": 0.9079407677054405, "num_tokens": 1800230.0, "step": 360 }, { "entropy": 0.3440810150466859, "epoch": 2.3905511811023623, "grad_norm": 0.5540388822555542, "learning_rate": 2.4886806912948035e-05, "loss": 0.3391, "mean_token_accuracy": 0.9038707241415977, "num_tokens": 1900074.0, "step": 380 }, { "entropy": 0.3350882642902434, "epoch": 2.516535433070866, "grad_norm": 0.630711019039154, "learning_rate": 1.601959085755641e-05, "loss": 0.3338, "mean_token_accuracy": 0.9055462963879108, "num_tokens": 2000844.0, "step": 400 }, { "epoch": 2.516535433070866, "eval_entropy": 0.3513495825559088, "eval_loss": 0.46354979276657104, "eval_mean_token_accuracy": 0.8799726563804554, "eval_num_tokens": 2000844.0, "eval_runtime": 45.6998, "eval_samples_per_second": 13.895, "eval_steps_per_second": 6.958, "step": 400 }, { "entropy": 0.3295320746488869, "epoch": 2.64251968503937, "grad_norm": 0.5727705359458923, "learning_rate": 8.967464988067475e-06, "loss": 0.3217, "mean_token_accuracy": 0.9084840539842844, "num_tokens": 2100721.0, "step": 420 }, { "entropy": 0.3233298393897712, "epoch": 2.768503937007874, "grad_norm": 0.5514662265777588, "learning_rate": 3.882848714986243e-06, "loss": 0.3262, "mean_token_accuracy": 0.9086818728595972, "num_tokens": 2201603.0, "step": 440 }, { "epoch": 2.8314960629921258, "eval_entropy": 0.34981531504565067, "eval_loss": 0.4621703028678894, "eval_mean_token_accuracy": 0.8799344349957112, "eval_num_tokens": 2251922.0, "eval_runtime": 46.0709, "eval_samples_per_second": 13.783, "eval_steps_per_second": 6.902, "step": 450 }, { "entropy": 0.32763075120747087, "epoch": 2.894488188976378, "grad_norm": 0.5462954044342041, "learning_rate": 8.75637157788689e-07, "loss": 0.3236, "mean_token_accuracy": 0.9094593167304993, "num_tokens": 2301918.0, "step": 460 } ], "logging_steps": 20, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2172041861270528e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }