{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.992414100847836, "eval_steps": 500, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.6259768772125245, "epoch": 0.0892458723784025, "grad_norm": 0.9921875, "learning_rate": 9.702970297029703e-05, "loss": 1.5974, "mean_token_accuracy": 0.6319892483949662, "num_tokens": 204800.0, "step": 50 }, { "entropy": 1.1296514749526978, "epoch": 0.178491744756805, "grad_norm": 0.7734375, "learning_rate": 0.00019603960396039606, "loss": 1.095, "mean_token_accuracy": 0.7227028369903564, "num_tokens": 409600.0, "step": 100 }, { "entropy": 1.0354267311096192, "epoch": 0.2677376171352075, "grad_norm": 0.640625, "learning_rate": 0.0001998929698787665, "loss": 1.0001, "mean_token_accuracy": 0.738949174284935, "num_tokens": 614400.0, "step": 150 }, { "entropy": 0.9883857735991478, "epoch": 0.35698348951361, "grad_norm": 0.609375, "learning_rate": 0.00019955410749920795, "loss": 0.9576, "mean_token_accuracy": 0.7486803528666496, "num_tokens": 819200.0, "step": 200 }, { "entropy": 0.9801795700192452, "epoch": 0.4462293618920125, "grad_norm": 0.54296875, "learning_rate": 0.00019898401407651969, "loss": 0.9485, "mean_token_accuracy": 0.7502834811806679, "num_tokens": 1024000.0, "step": 250 }, { "entropy": 0.976861891746521, "epoch": 0.535475234270415, "grad_norm": 0.58984375, "learning_rate": 0.00019818401374789826, "loss": 0.942, "mean_token_accuracy": 0.7504301071166992, "num_tokens": 1228800.0, "step": 300 }, { "entropy": 0.9461924117803574, "epoch": 0.6247211066488175, "grad_norm": 0.53125, "learning_rate": 0.00019715596464773042, "loss": 0.9121, "mean_token_accuracy": 0.7569012671709061, "num_tokens": 1433600.0, "step": 350 }, { "entropy": 0.9457061448693276, "epoch": 0.71396697902722, "grad_norm": 0.57421875, "learning_rate": 0.00019590225459176582, "loss": 0.9115, "mean_token_accuracy": 0.7561534664034844, "num_tokens": 1638400.0, "step": 400 }, { "entropy": 0.9444361236691475, "epoch": 0.8032128514056225, "grad_norm": 0.5625, "learning_rate": 0.00019442579553101584, "loss": 0.9109, "mean_token_accuracy": 0.7562658813595772, "num_tokens": 1843200.0, "step": 450 }, { "entropy": 0.9286397603154183, "epoch": 0.892458723784025, "grad_norm": 0.60546875, "learning_rate": 0.00019273001678826114, "loss": 0.8951, "mean_token_accuracy": 0.7600635370612144, "num_tokens": 2048000.0, "step": 500 }, { "entropy": 0.9270413678884506, "epoch": 0.9817045961624274, "grad_norm": 0.55859375, "learning_rate": 0.00019081885709287667, "loss": 0.8924, "mean_token_accuracy": 0.7599315717816353, "num_tokens": 2252800.0, "step": 550 }, { "entropy": 0.871931555307456, "epoch": 1.069611780455154, "grad_norm": 0.5390625, "learning_rate": 0.00018869675543247482, "loss": 0.8402, "mean_token_accuracy": 0.7711419060750662, "num_tokens": 2454528.0, "step": 600 }, { "entropy": 0.866090109050274, "epoch": 1.1588576528335564, "grad_norm": 0.640625, "learning_rate": 0.00018636864074261523, "loss": 0.8309, "mean_token_accuracy": 0.7731769263744355, "num_tokens": 2659328.0, "step": 650 }, { "entropy": 0.8680649262666702, "epoch": 1.248103525211959, "grad_norm": 0.6015625, "learning_rate": 0.00018383992045852872, "loss": 0.8363, "mean_token_accuracy": 0.7720087966322899, "num_tokens": 2864128.0, "step": 700 }, { "entropy": 0.8708215129375457, "epoch": 1.3373493975903614, "grad_norm": 0.578125, "learning_rate": 0.0001811164679554457, "loss": 0.8353, "mean_token_accuracy": 0.7726930573582649, "num_tokens": 3068928.0, "step": 750 }, { "entropy": 0.8597633948922158, "epoch": 1.426595269968764, "grad_norm": 0.5703125, "learning_rate": 0.0001782046089067012, "loss": 0.8265, "mean_token_accuracy": 0.7739687168598175, "num_tokens": 3273728.0, "step": 800 }, { "entropy": 0.85432891279459, "epoch": 1.5158411423471665, "grad_norm": 0.59375, "learning_rate": 0.000175111106591302, "loss": 0.8231, "mean_token_accuracy": 0.7762707683444023, "num_tokens": 3478528.0, "step": 850 }, { "entropy": 0.8619935244321824, "epoch": 1.605087014725569, "grad_norm": 0.61328125, "learning_rate": 0.00017184314618508148, "loss": 0.8254, "mean_token_accuracy": 0.7741544449329376, "num_tokens": 3683328.0, "step": 900 }, { "entropy": 0.8650275564193726, "epoch": 1.6943328871039713, "grad_norm": 0.58984375, "learning_rate": 0.00016840831807192854, "loss": 0.8275, "mean_token_accuracy": 0.7748044946789742, "num_tokens": 3888128.0, "step": 950 }, { "entropy": 0.860476321876049, "epoch": 1.783578759482374, "grad_norm": 0.57421875, "learning_rate": 0.00016481460021385323, "loss": 0.8255, "mean_token_accuracy": 0.7732942277193069, "num_tokens": 4092928.0, "step": 1000 }, { "entropy": 0.8491663599014282, "epoch": 1.8728246318607764, "grad_norm": 0.57421875, "learning_rate": 0.0001610703396208375, "loss": 0.8172, "mean_token_accuracy": 0.7759139758348464, "num_tokens": 4297728.0, "step": 1050 }, { "entropy": 0.8503634676337242, "epoch": 1.962070504239179, "grad_norm": 0.58203125, "learning_rate": 0.0001571842329635102, "loss": 0.8165, "mean_token_accuracy": 0.7770869982242584, "num_tokens": 4502528.0, "step": 1100 }, { "entropy": 0.7979691500591143, "epoch": 2.0499776885319054, "grad_norm": 0.625, "learning_rate": 0.00015316530637367708, "loss": 0.7612, "mean_token_accuracy": 0.7890002040693602, "num_tokens": 4704256.0, "step": 1150 }, { "entropy": 0.7788776361942291, "epoch": 2.139223560910308, "grad_norm": 0.62890625, "learning_rate": 0.00014902289447962187, "loss": 0.7431, "mean_token_accuracy": 0.7926588499546051, "num_tokens": 4909056.0, "step": 1200 }, { "entropy": 0.7778185418248177, "epoch": 2.2284694332887103, "grad_norm": 0.71484375, "learning_rate": 0.0001447666187248731, "loss": 0.7431, "mean_token_accuracy": 0.7919403752684593, "num_tokens": 5113856.0, "step": 1250 }, { "entropy": 0.7770297473669052, "epoch": 2.3177153056671127, "grad_norm": 0.671875, "learning_rate": 0.00014040636502079434, "loss": 0.7421, "mean_token_accuracy": 0.7937096789479255, "num_tokens": 5318656.0, "step": 1300 }, { "entropy": 0.774446559548378, "epoch": 2.4069611780455156, "grad_norm": 0.69921875, "learning_rate": 0.00013595226078490395, "loss": 0.7405, "mean_token_accuracy": 0.7925708714127541, "num_tokens": 5523456.0, "step": 1350 }, { "entropy": 0.7771885851025582, "epoch": 2.496207050423918, "grad_norm": 0.61328125, "learning_rate": 0.00013141465141825603, "loss": 0.7402, "mean_token_accuracy": 0.7931915977597237, "num_tokens": 5728256.0, "step": 1400 }, { "entropy": 0.7688560289144516, "epoch": 2.5854529228023204, "grad_norm": 0.66796875, "learning_rate": 0.0001268040762765189, "loss": 0.7369, "mean_token_accuracy": 0.7952443835139275, "num_tokens": 5933056.0, "step": 1450 }, { "entropy": 0.78252092897892, "epoch": 2.674698795180723, "grad_norm": 0.70703125, "learning_rate": 0.00012213124419056074, "loss": 0.7474, "mean_token_accuracy": 0.7915493679046631, "num_tokens": 6137856.0, "step": 1500 }, { "entropy": 0.7763542786240578, "epoch": 2.7639446675591253, "grad_norm": 0.625, "learning_rate": 0.00011740700859340161, "loss": 0.7383, "mean_token_accuracy": 0.7942082145810128, "num_tokens": 6342656.0, "step": 1550 }, { "entropy": 0.7749826022982598, "epoch": 2.853190539937528, "grad_norm": 0.6796875, "learning_rate": 0.00011264234231130209, "loss": 0.7394, "mean_token_accuracy": 0.794452593922615, "num_tokens": 6547456.0, "step": 1600 }, { "entropy": 0.7712367391586303, "epoch": 2.9424364123159306, "grad_norm": 0.65234375, "learning_rate": 0.00010784831207754171, "loss": 0.7352, "mean_token_accuracy": 0.793088955283165, "num_tokens": 6752256.0, "step": 1650 }, { "entropy": 0.7491341354278138, "epoch": 3.030343596608657, "grad_norm": 0.71875, "learning_rate": 0.00010303605282808242, "loss": 0.7141, "mean_token_accuracy": 0.7993410486860324, "num_tokens": 6953984.0, "step": 1700 }, { "entropy": 0.7142950230836869, "epoch": 3.1195894689870594, "grad_norm": 0.66796875, "learning_rate": 9.821674183881982e-05, "loss": 0.6733, "mean_token_accuracy": 0.809511242210865, "num_tokens": 7158784.0, "step": 1750 }, { "entropy": 0.7075088465213776, "epoch": 3.208835341365462, "grad_norm": 0.734375, "learning_rate": 9.34015727644931e-05, "loss": 0.6705, "mean_token_accuracy": 0.80923753708601, "num_tokens": 7363584.0, "step": 1800 }, { "entropy": 0.7165163627266884, "epoch": 3.298081213743864, "grad_norm": 0.734375, "learning_rate": 8.860172963955215e-05, "loss": 0.683, "mean_token_accuracy": 0.8069452607631683, "num_tokens": 7568384.0, "step": 1850 }, { "entropy": 0.7122274199128151, "epoch": 3.3873270861222666, "grad_norm": 0.75390625, "learning_rate": 8.382836090136962e-05, "loss": 0.6751, "mean_token_accuracy": 0.8079178902506828, "num_tokens": 7773184.0, "step": 1900 }, { "entropy": 0.7034698343276977, "epoch": 3.4765729585006695, "grad_norm": 0.6953125, "learning_rate": 7.909255349613283e-05, "loss": 0.6673, "mean_token_accuracy": 0.8113440865278244, "num_tokens": 7977984.0, "step": 1950 }, { "entropy": 0.7057863634824753, "epoch": 3.565818830879072, "grad_norm": 0.76171875, "learning_rate": 7.440530712755951e-05, "loss": 0.6688, "mean_token_accuracy": 0.8105962842702865, "num_tokens": 8182784.0, "step": 2000 }, { "entropy": 0.7125656777620315, "epoch": 3.6550647032574743, "grad_norm": 0.76953125, "learning_rate": 6.977750870824863e-05, "loss": 0.6761, "mean_token_accuracy": 0.8088660803437233, "num_tokens": 8387584.0, "step": 2050 }, { "entropy": 0.70076868891716, "epoch": 3.7443105756358768, "grad_norm": 0.765625, "learning_rate": 6.521990707300736e-05, "loss": 0.6634, "mean_token_accuracy": 0.8110703819990158, "num_tokens": 8592384.0, "step": 2100 }, { "entropy": 0.700417303442955, "epoch": 3.833556448014279, "grad_norm": 0.73828125, "learning_rate": 6.074308801288713e-05, "loss": 0.6631, "mean_token_accuracy": 0.8109530797600746, "num_tokens": 8797184.0, "step": 2150 }, { "entropy": 0.7015222778916359, "epoch": 3.922802320392682, "grad_norm": 0.77734375, "learning_rate": 5.6357449687915386e-05, "loss": 0.6665, "mean_token_accuracy": 0.8110703811049461, "num_tokens": 9001984.0, "step": 2200 }, { "entropy": 0.7122828648780203, "epoch": 4.010709504685408, "grad_norm": 0.703125, "learning_rate": 5.207317847563248e-05, "loss": 0.6758, "mean_token_accuracy": 0.8095082153523634, "num_tokens": 9203712.0, "step": 2250 }, { "entropy": 0.6674378645420075, "epoch": 4.099955377063811, "grad_norm": 0.74609375, "learning_rate": 4.7900225311528094e-05, "loss": 0.6269, "mean_token_accuracy": 0.8208357748389244, "num_tokens": 9408512.0, "step": 2300 }, { "entropy": 0.6730345389246941, "epoch": 4.189201249442213, "grad_norm": 0.75390625, "learning_rate": 4.384828257633177e-05, "loss": 0.6365, "mean_token_accuracy": 0.8188856270909309, "num_tokens": 9613312.0, "step": 2350 }, { "entropy": 0.6662819012999535, "epoch": 4.278447121820616, "grad_norm": 0.8515625, "learning_rate": 3.992676158383957e-05, "loss": 0.6271, "mean_token_accuracy": 0.8203372398018837, "num_tokens": 9818112.0, "step": 2400 }, { "entropy": 0.664862583577633, "epoch": 4.367692994199018, "grad_norm": 0.80078125, "learning_rate": 3.6144770721565844e-05, "loss": 0.6261, "mean_token_accuracy": 0.8207477974891663, "num_tokens": 10022912.0, "step": 2450 }, { "entropy": 0.6621513772010803, "epoch": 4.4569388665774206, "grad_norm": 0.77734375, "learning_rate": 3.251109429499194e-05, "loss": 0.6238, "mean_token_accuracy": 0.8212072286009788, "num_tokens": 10227712.0, "step": 2500 }, { "entropy": 0.6667330291867256, "epoch": 4.546184738955823, "grad_norm": 0.77734375, "learning_rate": 2.9034172124549263e-05, "loss": 0.6275, "mean_token_accuracy": 0.8213147559762001, "num_tokens": 10432512.0, "step": 2550 }, { "entropy": 0.6676543334126472, "epoch": 4.635430611334225, "grad_norm": 0.78125, "learning_rate": 2.5722079942726964e-05, "loss": 0.6295, "mean_token_accuracy": 0.8205034193396569, "num_tokens": 10637312.0, "step": 2600 }, { "entropy": 0.6684669059514999, "epoch": 4.724676483712628, "grad_norm": 0.796875, "learning_rate": 2.2582510636834064e-05, "loss": 0.6347, "mean_token_accuracy": 0.8190029296278953, "num_tokens": 10842112.0, "step": 2650 }, { "entropy": 0.6723845577239991, "epoch": 4.813922356091031, "grad_norm": 0.8125, "learning_rate": 1.9622756380983887e-05, "loss": 0.6353, "mean_token_accuracy": 0.8182209166884422, "num_tokens": 11046912.0, "step": 2700 }, { "entropy": 0.6706709080934524, "epoch": 4.903168228469434, "grad_norm": 0.75390625, "learning_rate": 1.684969169880165e-05, "loss": 0.6291, "mean_token_accuracy": 0.8204301059246063, "num_tokens": 11251712.0, "step": 2750 }, { "entropy": 0.6609847331047058, "epoch": 4.992414100847836, "grad_norm": 0.7734375, "learning_rate": 1.4269757496194991e-05, "loss": 0.6283, "mean_token_accuracy": 0.8204398784041405, "num_tokens": 11456512.0, "step": 2800 } ], "logging_steps": 50, "max_steps": 3360, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 560, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.897183183625257e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }