| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.992414100847836, |
| "eval_steps": 500, |
| "global_step": 2800, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.6259768772125245, |
| "epoch": 0.0892458723784025, |
| "grad_norm": 0.9921875, |
| "learning_rate": 9.702970297029703e-05, |
| "loss": 1.5974, |
| "mean_token_accuracy": 0.6319892483949662, |
| "num_tokens": 204800.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.1296514749526978, |
| "epoch": 0.178491744756805, |
| "grad_norm": 0.7734375, |
| "learning_rate": 0.00019603960396039606, |
| "loss": 1.095, |
| "mean_token_accuracy": 0.7227028369903564, |
| "num_tokens": 409600.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.0354267311096192, |
| "epoch": 0.2677376171352075, |
| "grad_norm": 0.640625, |
| "learning_rate": 0.0001998929698787665, |
| "loss": 1.0001, |
| "mean_token_accuracy": 0.738949174284935, |
| "num_tokens": 614400.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.9883857735991478, |
| "epoch": 0.35698348951361, |
| "grad_norm": 0.609375, |
| "learning_rate": 0.00019955410749920795, |
| "loss": 0.9576, |
| "mean_token_accuracy": 0.7486803528666496, |
| "num_tokens": 819200.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.9801795700192452, |
| "epoch": 0.4462293618920125, |
| "grad_norm": 0.54296875, |
| "learning_rate": 0.00019898401407651969, |
| "loss": 0.9485, |
| "mean_token_accuracy": 0.7502834811806679, |
| "num_tokens": 1024000.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.976861891746521, |
| "epoch": 0.535475234270415, |
| "grad_norm": 0.58984375, |
| "learning_rate": 0.00019818401374789826, |
| "loss": 0.942, |
| "mean_token_accuracy": 0.7504301071166992, |
| "num_tokens": 1228800.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.9461924117803574, |
| "epoch": 0.6247211066488175, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.00019715596464773042, |
| "loss": 0.9121, |
| "mean_token_accuracy": 0.7569012671709061, |
| "num_tokens": 1433600.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.9457061448693276, |
| "epoch": 0.71396697902722, |
| "grad_norm": 0.57421875, |
| "learning_rate": 0.00019590225459176582, |
| "loss": 0.9115, |
| "mean_token_accuracy": 0.7561534664034844, |
| "num_tokens": 1638400.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.9444361236691475, |
| "epoch": 0.8032128514056225, |
| "grad_norm": 0.5625, |
| "learning_rate": 0.00019442579553101584, |
| "loss": 0.9109, |
| "mean_token_accuracy": 0.7562658813595772, |
| "num_tokens": 1843200.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.9286397603154183, |
| "epoch": 0.892458723784025, |
| "grad_norm": 0.60546875, |
| "learning_rate": 0.00019273001678826114, |
| "loss": 0.8951, |
| "mean_token_accuracy": 0.7600635370612144, |
| "num_tokens": 2048000.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.9270413678884506, |
| "epoch": 0.9817045961624274, |
| "grad_norm": 0.55859375, |
| "learning_rate": 0.00019081885709287667, |
| "loss": 0.8924, |
| "mean_token_accuracy": 0.7599315717816353, |
| "num_tokens": 2252800.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.871931555307456, |
| "epoch": 1.069611780455154, |
| "grad_norm": 0.5390625, |
| "learning_rate": 0.00018869675543247482, |
| "loss": 0.8402, |
| "mean_token_accuracy": 0.7711419060750662, |
| "num_tokens": 2454528.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.866090109050274, |
| "epoch": 1.1588576528335564, |
| "grad_norm": 0.640625, |
| "learning_rate": 0.00018636864074261523, |
| "loss": 0.8309, |
| "mean_token_accuracy": 0.7731769263744355, |
| "num_tokens": 2659328.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.8680649262666702, |
| "epoch": 1.248103525211959, |
| "grad_norm": 0.6015625, |
| "learning_rate": 0.00018383992045852872, |
| "loss": 0.8363, |
| "mean_token_accuracy": 0.7720087966322899, |
| "num_tokens": 2864128.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.8708215129375457, |
| "epoch": 1.3373493975903614, |
| "grad_norm": 0.578125, |
| "learning_rate": 0.0001811164679554457, |
| "loss": 0.8353, |
| "mean_token_accuracy": 0.7726930573582649, |
| "num_tokens": 3068928.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.8597633948922158, |
| "epoch": 1.426595269968764, |
| "grad_norm": 0.5703125, |
| "learning_rate": 0.0001782046089067012, |
| "loss": 0.8265, |
| "mean_token_accuracy": 0.7739687168598175, |
| "num_tokens": 3273728.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.85432891279459, |
| "epoch": 1.5158411423471665, |
| "grad_norm": 0.59375, |
| "learning_rate": 0.000175111106591302, |
| "loss": 0.8231, |
| "mean_token_accuracy": 0.7762707683444023, |
| "num_tokens": 3478528.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.8619935244321824, |
| "epoch": 1.605087014725569, |
| "grad_norm": 0.61328125, |
| "learning_rate": 0.00017184314618508148, |
| "loss": 0.8254, |
| "mean_token_accuracy": 0.7741544449329376, |
| "num_tokens": 3683328.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.8650275564193726, |
| "epoch": 1.6943328871039713, |
| "grad_norm": 0.58984375, |
| "learning_rate": 0.00016840831807192854, |
| "loss": 0.8275, |
| "mean_token_accuracy": 0.7748044946789742, |
| "num_tokens": 3888128.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.860476321876049, |
| "epoch": 1.783578759482374, |
| "grad_norm": 0.57421875, |
| "learning_rate": 0.00016481460021385323, |
| "loss": 0.8255, |
| "mean_token_accuracy": 0.7732942277193069, |
| "num_tokens": 4092928.0, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.8491663599014282, |
| "epoch": 1.8728246318607764, |
| "grad_norm": 0.57421875, |
| "learning_rate": 0.0001610703396208375, |
| "loss": 0.8172, |
| "mean_token_accuracy": 0.7759139758348464, |
| "num_tokens": 4297728.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.8503634676337242, |
| "epoch": 1.962070504239179, |
| "grad_norm": 0.58203125, |
| "learning_rate": 0.0001571842329635102, |
| "loss": 0.8165, |
| "mean_token_accuracy": 0.7770869982242584, |
| "num_tokens": 4502528.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.7979691500591143, |
| "epoch": 2.0499776885319054, |
| "grad_norm": 0.625, |
| "learning_rate": 0.00015316530637367708, |
| "loss": 0.7612, |
| "mean_token_accuracy": 0.7890002040693602, |
| "num_tokens": 4704256.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.7788776361942291, |
| "epoch": 2.139223560910308, |
| "grad_norm": 0.62890625, |
| "learning_rate": 0.00014902289447962187, |
| "loss": 0.7431, |
| "mean_token_accuracy": 0.7926588499546051, |
| "num_tokens": 4909056.0, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.7778185418248177, |
| "epoch": 2.2284694332887103, |
| "grad_norm": 0.71484375, |
| "learning_rate": 0.0001447666187248731, |
| "loss": 0.7431, |
| "mean_token_accuracy": 0.7919403752684593, |
| "num_tokens": 5113856.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.7770297473669052, |
| "epoch": 2.3177153056671127, |
| "grad_norm": 0.671875, |
| "learning_rate": 0.00014040636502079434, |
| "loss": 0.7421, |
| "mean_token_accuracy": 0.7937096789479255, |
| "num_tokens": 5318656.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.774446559548378, |
| "epoch": 2.4069611780455156, |
| "grad_norm": 0.69921875, |
| "learning_rate": 0.00013595226078490395, |
| "loss": 0.7405, |
| "mean_token_accuracy": 0.7925708714127541, |
| "num_tokens": 5523456.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.7771885851025582, |
| "epoch": 2.496207050423918, |
| "grad_norm": 0.61328125, |
| "learning_rate": 0.00013141465141825603, |
| "loss": 0.7402, |
| "mean_token_accuracy": 0.7931915977597237, |
| "num_tokens": 5728256.0, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.7688560289144516, |
| "epoch": 2.5854529228023204, |
| "grad_norm": 0.66796875, |
| "learning_rate": 0.0001268040762765189, |
| "loss": 0.7369, |
| "mean_token_accuracy": 0.7952443835139275, |
| "num_tokens": 5933056.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.78252092897892, |
| "epoch": 2.674698795180723, |
| "grad_norm": 0.70703125, |
| "learning_rate": 0.00012213124419056074, |
| "loss": 0.7474, |
| "mean_token_accuracy": 0.7915493679046631, |
| "num_tokens": 6137856.0, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.7763542786240578, |
| "epoch": 2.7639446675591253, |
| "grad_norm": 0.625, |
| "learning_rate": 0.00011740700859340161, |
| "loss": 0.7383, |
| "mean_token_accuracy": 0.7942082145810128, |
| "num_tokens": 6342656.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.7749826022982598, |
| "epoch": 2.853190539937528, |
| "grad_norm": 0.6796875, |
| "learning_rate": 0.00011264234231130209, |
| "loss": 0.7394, |
| "mean_token_accuracy": 0.794452593922615, |
| "num_tokens": 6547456.0, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.7712367391586303, |
| "epoch": 2.9424364123159306, |
| "grad_norm": 0.65234375, |
| "learning_rate": 0.00010784831207754171, |
| "loss": 0.7352, |
| "mean_token_accuracy": 0.793088955283165, |
| "num_tokens": 6752256.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 0.7491341354278138, |
| "epoch": 3.030343596608657, |
| "grad_norm": 0.71875, |
| "learning_rate": 0.00010303605282808242, |
| "loss": 0.7141, |
| "mean_token_accuracy": 0.7993410486860324, |
| "num_tokens": 6953984.0, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.7142950230836869, |
| "epoch": 3.1195894689870594, |
| "grad_norm": 0.66796875, |
| "learning_rate": 9.821674183881982e-05, |
| "loss": 0.6733, |
| "mean_token_accuracy": 0.809511242210865, |
| "num_tokens": 7158784.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 0.7075088465213776, |
| "epoch": 3.208835341365462, |
| "grad_norm": 0.734375, |
| "learning_rate": 9.34015727644931e-05, |
| "loss": 0.6705, |
| "mean_token_accuracy": 0.80923753708601, |
| "num_tokens": 7363584.0, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.7165163627266884, |
| "epoch": 3.298081213743864, |
| "grad_norm": 0.734375, |
| "learning_rate": 8.860172963955215e-05, |
| "loss": 0.683, |
| "mean_token_accuracy": 0.8069452607631683, |
| "num_tokens": 7568384.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 0.7122274199128151, |
| "epoch": 3.3873270861222666, |
| "grad_norm": 0.75390625, |
| "learning_rate": 8.382836090136962e-05, |
| "loss": 0.6751, |
| "mean_token_accuracy": 0.8079178902506828, |
| "num_tokens": 7773184.0, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.7034698343276977, |
| "epoch": 3.4765729585006695, |
| "grad_norm": 0.6953125, |
| "learning_rate": 7.909255349613283e-05, |
| "loss": 0.6673, |
| "mean_token_accuracy": 0.8113440865278244, |
| "num_tokens": 7977984.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 0.7057863634824753, |
| "epoch": 3.565818830879072, |
| "grad_norm": 0.76171875, |
| "learning_rate": 7.440530712755951e-05, |
| "loss": 0.6688, |
| "mean_token_accuracy": 0.8105962842702865, |
| "num_tokens": 8182784.0, |
| "step": 2000 |
| }, |
| { |
| "entropy": 0.7125656777620315, |
| "epoch": 3.6550647032574743, |
| "grad_norm": 0.76953125, |
| "learning_rate": 6.977750870824863e-05, |
| "loss": 0.6761, |
| "mean_token_accuracy": 0.8088660803437233, |
| "num_tokens": 8387584.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 0.70076868891716, |
| "epoch": 3.7443105756358768, |
| "grad_norm": 0.765625, |
| "learning_rate": 6.521990707300736e-05, |
| "loss": 0.6634, |
| "mean_token_accuracy": 0.8110703819990158, |
| "num_tokens": 8592384.0, |
| "step": 2100 |
| }, |
| { |
| "entropy": 0.700417303442955, |
| "epoch": 3.833556448014279, |
| "grad_norm": 0.73828125, |
| "learning_rate": 6.074308801288713e-05, |
| "loss": 0.6631, |
| "mean_token_accuracy": 0.8109530797600746, |
| "num_tokens": 8797184.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 0.7015222778916359, |
| "epoch": 3.922802320392682, |
| "grad_norm": 0.77734375, |
| "learning_rate": 5.6357449687915386e-05, |
| "loss": 0.6665, |
| "mean_token_accuracy": 0.8110703811049461, |
| "num_tokens": 9001984.0, |
| "step": 2200 |
| }, |
| { |
| "entropy": 0.7122828648780203, |
| "epoch": 4.010709504685408, |
| "grad_norm": 0.703125, |
| "learning_rate": 5.207317847563248e-05, |
| "loss": 0.6758, |
| "mean_token_accuracy": 0.8095082153523634, |
| "num_tokens": 9203712.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 0.6674378645420075, |
| "epoch": 4.099955377063811, |
| "grad_norm": 0.74609375, |
| "learning_rate": 4.7900225311528094e-05, |
| "loss": 0.6269, |
| "mean_token_accuracy": 0.8208357748389244, |
| "num_tokens": 9408512.0, |
| "step": 2300 |
| }, |
| { |
| "entropy": 0.6730345389246941, |
| "epoch": 4.189201249442213, |
| "grad_norm": 0.75390625, |
| "learning_rate": 4.384828257633177e-05, |
| "loss": 0.6365, |
| "mean_token_accuracy": 0.8188856270909309, |
| "num_tokens": 9613312.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 0.6662819012999535, |
| "epoch": 4.278447121820616, |
| "grad_norm": 0.8515625, |
| "learning_rate": 3.992676158383957e-05, |
| "loss": 0.6271, |
| "mean_token_accuracy": 0.8203372398018837, |
| "num_tokens": 9818112.0, |
| "step": 2400 |
| }, |
| { |
| "entropy": 0.664862583577633, |
| "epoch": 4.367692994199018, |
| "grad_norm": 0.80078125, |
| "learning_rate": 3.6144770721565844e-05, |
| "loss": 0.6261, |
| "mean_token_accuracy": 0.8207477974891663, |
| "num_tokens": 10022912.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 0.6621513772010803, |
| "epoch": 4.4569388665774206, |
| "grad_norm": 0.77734375, |
| "learning_rate": 3.251109429499194e-05, |
| "loss": 0.6238, |
| "mean_token_accuracy": 0.8212072286009788, |
| "num_tokens": 10227712.0, |
| "step": 2500 |
| }, |
| { |
| "entropy": 0.6667330291867256, |
| "epoch": 4.546184738955823, |
| "grad_norm": 0.77734375, |
| "learning_rate": 2.9034172124549263e-05, |
| "loss": 0.6275, |
| "mean_token_accuracy": 0.8213147559762001, |
| "num_tokens": 10432512.0, |
| "step": 2550 |
| }, |
| { |
| "entropy": 0.6676543334126472, |
| "epoch": 4.635430611334225, |
| "grad_norm": 0.78125, |
| "learning_rate": 2.5722079942726964e-05, |
| "loss": 0.6295, |
| "mean_token_accuracy": 0.8205034193396569, |
| "num_tokens": 10637312.0, |
| "step": 2600 |
| }, |
| { |
| "entropy": 0.6684669059514999, |
| "epoch": 4.724676483712628, |
| "grad_norm": 0.796875, |
| "learning_rate": 2.2582510636834064e-05, |
| "loss": 0.6347, |
| "mean_token_accuracy": 0.8190029296278953, |
| "num_tokens": 10842112.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 0.6723845577239991, |
| "epoch": 4.813922356091031, |
| "grad_norm": 0.8125, |
| "learning_rate": 1.9622756380983887e-05, |
| "loss": 0.6353, |
| "mean_token_accuracy": 0.8182209166884422, |
| "num_tokens": 11046912.0, |
| "step": 2700 |
| }, |
| { |
| "entropy": 0.6706709080934524, |
| "epoch": 4.903168228469434, |
| "grad_norm": 0.75390625, |
| "learning_rate": 1.684969169880165e-05, |
| "loss": 0.6291, |
| "mean_token_accuracy": 0.8204301059246063, |
| "num_tokens": 11251712.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 0.6609847331047058, |
| "epoch": 4.992414100847836, |
| "grad_norm": 0.7734375, |
| "learning_rate": 1.4269757496194991e-05, |
| "loss": 0.6283, |
| "mean_token_accuracy": 0.8204398784041405, |
| "num_tokens": 11456512.0, |
| "step": 2800 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 3360, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 560, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.897183183625257e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|