| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 50, | |
| "global_step": 477, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.9739606760442256, | |
| "epoch": 0.12598425196850394, | |
| "grad_norm": 0.968841016292572, | |
| "learning_rate": 7.6e-05, | |
| "loss": 2.2682, | |
| "mean_token_accuracy": 0.5962976351380348, | |
| "num_tokens": 99995.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.9236388027667999, | |
| "epoch": 0.25196850393700787, | |
| "grad_norm": 0.30187129974365234, | |
| "learning_rate": 0.00015600000000000002, | |
| "loss": 0.9381, | |
| "mean_token_accuracy": 0.7860403966158629, | |
| "num_tokens": 200771.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.31496062992125984, | |
| "eval_entropy": 0.7079527728587577, | |
| "eval_loss": 0.6960555911064148, | |
| "eval_mean_token_accuracy": 0.8306159124059497, | |
| "eval_num_tokens": 251130.0, | |
| "eval_runtime": 45.7017, | |
| "eval_samples_per_second": 13.894, | |
| "eval_steps_per_second": 6.958, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.7168553218245506, | |
| "epoch": 0.3779527559055118, | |
| "grad_norm": 0.28825637698173523, | |
| "learning_rate": 0.0001997808505782075, | |
| "loss": 0.7182, | |
| "mean_token_accuracy": 0.8267303571105004, | |
| "num_tokens": 300943.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.647084547393024, | |
| "epoch": 0.5039370078740157, | |
| "grad_norm": 0.3307661712169647, | |
| "learning_rate": 0.00019773242425416768, | |
| "loss": 0.6472, | |
| "mean_token_accuracy": 0.8401378501206637, | |
| "num_tokens": 401448.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.6049567876383662, | |
| "epoch": 0.6299212598425197, | |
| "grad_norm": 0.3357570469379425, | |
| "learning_rate": 0.00019357168190404936, | |
| "loss": 0.6097, | |
| "mean_token_accuracy": 0.8464727878570557, | |
| "num_tokens": 501464.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6299212598425197, | |
| "eval_entropy": 0.60639403676087, | |
| "eval_loss": 0.5814101696014404, | |
| "eval_mean_token_accuracy": 0.8527061408795651, | |
| "eval_num_tokens": 501464.0, | |
| "eval_runtime": 45.6184, | |
| "eval_samples_per_second": 13.92, | |
| "eval_steps_per_second": 6.971, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.580510495044291, | |
| "epoch": 0.7559055118110236, | |
| "grad_norm": 0.35506585240364075, | |
| "learning_rate": 0.0001873885507225743, | |
| "loss": 0.5852, | |
| "mean_token_accuracy": 0.8522923283278943, | |
| "num_tokens": 602072.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5525572836399079, | |
| "epoch": 0.8818897637795275, | |
| "grad_norm": 0.3671925663948059, | |
| "learning_rate": 0.00017931666831451536, | |
| "loss": 0.5538, | |
| "mean_token_accuracy": 0.8564509227871895, | |
| "num_tokens": 701808.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.9448818897637795, | |
| "eval_entropy": 0.5495308028452052, | |
| "eval_loss": 0.5309950113296509, | |
| "eval_mean_token_accuracy": 0.8605339890755948, | |
| "eval_num_tokens": 751756.0, | |
| "eval_runtime": 46.259, | |
| "eval_samples_per_second": 13.727, | |
| "eval_steps_per_second": 6.874, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5359916405964501, | |
| "epoch": 1.0062992125984251, | |
| "grad_norm": 0.3443862497806549, | |
| "learning_rate": 0.0001695304943507677, | |
| "loss": 0.5351, | |
| "mean_token_accuracy": 0.8600816813450826, | |
| "num_tokens": 800169.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.47207170017063615, | |
| "epoch": 1.132283464566929, | |
| "grad_norm": 0.4181714951992035, | |
| "learning_rate": 0.0001582415399266036, | |
| "loss": 0.4724, | |
| "mean_token_accuracy": 0.8726930443197489, | |
| "num_tokens": 899852.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.4565439449623227, | |
| "epoch": 1.258267716535433, | |
| "grad_norm": 0.45549947023391724, | |
| "learning_rate": 0.00014569379611796137, | |
| "loss": 0.4599, | |
| "mean_token_accuracy": 0.875334644690156, | |
| "num_tokens": 999653.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.258267716535433, | |
| "eval_entropy": 0.4601464904141876, | |
| "eval_loss": 0.5056809782981873, | |
| "eval_mean_token_accuracy": 0.8665201171014294, | |
| "eval_num_tokens": 999653.0, | |
| "eval_runtime": 45.7221, | |
| "eval_samples_per_second": 13.888, | |
| "eval_steps_per_second": 6.955, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.4486727064475417, | |
| "epoch": 1.384251968503937, | |
| "grad_norm": 0.4510684311389923, | |
| "learning_rate": 0.00013215846053955683, | |
| "loss": 0.4513, | |
| "mean_token_accuracy": 0.878431486710906, | |
| "num_tokens": 1100570.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.4511976413428783, | |
| "epoch": 1.510236220472441, | |
| "grad_norm": 0.44284847378730774, | |
| "learning_rate": 0.00011792807588107357, | |
| "loss": 0.4521, | |
| "mean_token_accuracy": 0.8780730802565813, | |
| "num_tokens": 1201178.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.573228346456693, | |
| "eval_entropy": 0.46230017211077346, | |
| "eval_loss": 0.4856663942337036, | |
| "eval_mean_token_accuracy": 0.8700332990232503, | |
| "eval_num_tokens": 1251074.0, | |
| "eval_runtime": 45.5388, | |
| "eval_samples_per_second": 13.944, | |
| "eval_steps_per_second": 6.983, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.451329773850739, | |
| "epoch": 1.6362204724409448, | |
| "grad_norm": 0.4431185722351074, | |
| "learning_rate": 0.00010331020710675729, | |
| "loss": 0.4531, | |
| "mean_token_accuracy": 0.8772743601351977, | |
| "num_tokens": 1301346.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.42816779650747777, | |
| "epoch": 1.762204724409449, | |
| "grad_norm": 0.45769160985946655, | |
| "learning_rate": 8.862079397472553e-05, | |
| "loss": 0.4285, | |
| "mean_token_accuracy": 0.8827753882855177, | |
| "num_tokens": 1401547.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.4320015098899603, | |
| "epoch": 1.8881889763779527, | |
| "grad_norm": 0.44558027386665344, | |
| "learning_rate": 7.417732254970317e-05, | |
| "loss": 0.4326, | |
| "mean_token_accuracy": 0.8826998364180326, | |
| "num_tokens": 1502107.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.8881889763779527, | |
| "eval_entropy": 0.42231335453454805, | |
| "eval_loss": 0.4666292071342468, | |
| "eval_mean_token_accuracy": 0.875962255885766, | |
| "eval_num_tokens": 1502107.0, | |
| "eval_runtime": 46.1255, | |
| "eval_samples_per_second": 13.767, | |
| "eval_steps_per_second": 6.894, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.4177426643009427, | |
| "epoch": 2.0125984251968503, | |
| "grad_norm": 0.42371708154678345, | |
| "learning_rate": 6.0291963295035484e-05, | |
| "loss": 0.4157, | |
| "mean_token_accuracy": 0.8861905902246886, | |
| "num_tokens": 1600674.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.3515403226017952, | |
| "epoch": 2.1385826771653544, | |
| "grad_norm": 0.5003436803817749, | |
| "learning_rate": 4.726482405216125e-05, | |
| "loss": 0.3405, | |
| "mean_token_accuracy": 0.9042681198567152, | |
| "num_tokens": 1700569.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.2015748031496063, | |
| "eval_entropy": 0.3585739609955242, | |
| "eval_loss": 0.46917417645454407, | |
| "eval_mean_token_accuracy": 0.8779247264442204, | |
| "eval_num_tokens": 1750522.0, | |
| "eval_runtime": 46.0421, | |
| "eval_samples_per_second": 13.792, | |
| "eval_steps_per_second": 6.907, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.33075640462338923, | |
| "epoch": 2.264566929133858, | |
| "grad_norm": 0.5350939035415649, | |
| "learning_rate": 3.537746373263589e-05, | |
| "loss": 0.3258, | |
| "mean_token_accuracy": 0.9079407677054405, | |
| "num_tokens": 1800230.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.3440810150466859, | |
| "epoch": 2.3905511811023623, | |
| "grad_norm": 0.5540388822555542, | |
| "learning_rate": 2.4886806912948035e-05, | |
| "loss": 0.3391, | |
| "mean_token_accuracy": 0.9038707241415977, | |
| "num_tokens": 1900074.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.3350882642902434, | |
| "epoch": 2.516535433070866, | |
| "grad_norm": 0.630711019039154, | |
| "learning_rate": 1.601959085755641e-05, | |
| "loss": 0.3338, | |
| "mean_token_accuracy": 0.9055462963879108, | |
| "num_tokens": 2000844.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.516535433070866, | |
| "eval_entropy": 0.3513495825559088, | |
| "eval_loss": 0.46354979276657104, | |
| "eval_mean_token_accuracy": 0.8799726563804554, | |
| "eval_num_tokens": 2000844.0, | |
| "eval_runtime": 45.6998, | |
| "eval_samples_per_second": 13.895, | |
| "eval_steps_per_second": 6.958, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.3295320746488869, | |
| "epoch": 2.64251968503937, | |
| "grad_norm": 0.5727705359458923, | |
| "learning_rate": 8.967464988067475e-06, | |
| "loss": 0.3217, | |
| "mean_token_accuracy": 0.9084840539842844, | |
| "num_tokens": 2100721.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.3233298393897712, | |
| "epoch": 2.768503937007874, | |
| "grad_norm": 0.5514662265777588, | |
| "learning_rate": 3.882848714986243e-06, | |
| "loss": 0.3262, | |
| "mean_token_accuracy": 0.9086818728595972, | |
| "num_tokens": 2201603.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.8314960629921258, | |
| "eval_entropy": 0.34981531504565067, | |
| "eval_loss": 0.4621703028678894, | |
| "eval_mean_token_accuracy": 0.8799344349957112, | |
| "eval_num_tokens": 2251922.0, | |
| "eval_runtime": 46.0709, | |
| "eval_samples_per_second": 13.783, | |
| "eval_steps_per_second": 6.902, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.32763075120747087, | |
| "epoch": 2.894488188976378, | |
| "grad_norm": 0.5462954044342041, | |
| "learning_rate": 8.75637157788689e-07, | |
| "loss": 0.3236, | |
| "mean_token_accuracy": 0.9094593167304993, | |
| "num_tokens": 2301918.0, | |
| "step": 460 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 477, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2172041861270528e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |