{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 2.332974433898926, "learning_rate": 1.9804000000000002e-05, "loss": 9.66635009765625, "step": 50 }, { "epoch": 0.02, "grad_norm": 1.6665964126586914, "learning_rate": 1.9604e-05, "loss": 8.088862915039062, "step": 100 }, { "epoch": 0.03, "grad_norm": 2.068279504776001, "learning_rate": 1.9404e-05, "loss": 7.215781860351562, "step": 150 }, { "epoch": 0.04, "grad_norm": 1.76992666721344, "learning_rate": 1.9204e-05, "loss": 6.690926513671875, "step": 200 }, { "epoch": 0.05, "grad_norm": 1.480324149131775, "learning_rate": 1.9004000000000003e-05, "loss": 6.430185546875, "step": 250 }, { "epoch": 0.06, "grad_norm": 1.1323567628860474, "learning_rate": 1.8804e-05, "loss": 6.076779174804687, "step": 300 }, { "epoch": 0.07, "grad_norm": 1.248739242553711, "learning_rate": 1.8604000000000003e-05, "loss": 6.090475463867188, "step": 350 }, { "epoch": 0.08, "grad_norm": 1.2340136766433716, "learning_rate": 1.8404000000000002e-05, "loss": 6.04998046875, "step": 400 }, { "epoch": 0.09, "grad_norm": 1.1271188259124756, "learning_rate": 1.8204e-05, "loss": 5.8744744873046875, "step": 450 }, { "epoch": 0.1, "grad_norm": 1.7444993257522583, "learning_rate": 1.8004000000000002e-05, "loss": 5.79928466796875, "step": 500 }, { "epoch": 0.11, "grad_norm": 1.294396162033081, "learning_rate": 1.7804e-05, "loss": 5.830994873046875, "step": 550 }, { "epoch": 0.12, "grad_norm": 1.1351591348648071, "learning_rate": 1.7604e-05, "loss": 5.764346313476563, "step": 600 }, { "epoch": 0.13, "grad_norm": 1.6005209684371948, "learning_rate": 1.7404e-05, "loss": 5.688011474609375, "step": 650 }, { "epoch": 0.14, "grad_norm": 1.3918774127960205, "learning_rate": 1.7204e-05, "loss": 5.67168212890625, "step": 700 }, { "epoch": 0.15, "grad_norm": 1.0998594760894775, "learning_rate": 1.7004000000000002e-05, "loss": 5.563177490234375, "step": 750 }, { "epoch": 0.16, "grad_norm": 1.2702234983444214, "learning_rate": 1.6804e-05, "loss": 5.638896484375, "step": 800 }, { "epoch": 0.17, "grad_norm": 1.2025340795516968, "learning_rate": 1.6604000000000002e-05, "loss": 5.447022705078125, "step": 850 }, { "epoch": 0.18, "grad_norm": 1.0838000774383545, "learning_rate": 1.6404e-05, "loss": 5.563392944335938, "step": 900 }, { "epoch": 0.19, "grad_norm": 1.6105167865753174, "learning_rate": 1.6204000000000003e-05, "loss": 5.441434326171875, "step": 950 }, { "epoch": 0.2, "grad_norm": 1.195156455039978, "learning_rate": 1.6004e-05, "loss": 5.535233154296875, "step": 1000 }, { "epoch": 0.21, "grad_norm": 1.3313252925872803, "learning_rate": 1.5804000000000003e-05, "loss": 5.4950341796875, "step": 1050 }, { "epoch": 0.22, "grad_norm": 1.2405613660812378, "learning_rate": 1.5604000000000002e-05, "loss": 5.525492553710937, "step": 1100 }, { "epoch": 0.23, "grad_norm": 1.3697841167449951, "learning_rate": 1.5404e-05, "loss": 5.394120483398438, "step": 1150 }, { "epoch": 0.24, "grad_norm": 1.6689515113830566, "learning_rate": 1.5204e-05, "loss": 5.42336181640625, "step": 1200 }, { "epoch": 0.25, "grad_norm": 1.6169382333755493, "learning_rate": 1.5004e-05, "loss": 5.263718872070313, "step": 1250 }, { "epoch": 0.26, "grad_norm": 1.3931348323822021, "learning_rate": 1.4804000000000001e-05, "loss": 5.332710571289063, "step": 1300 }, { "epoch": 0.27, "grad_norm": 1.3986672163009644, "learning_rate": 1.4604000000000001e-05, "loss": 5.300538330078125, "step": 1350 }, { "epoch": 0.28, "grad_norm": 1.2856848239898682, "learning_rate": 1.4404e-05, "loss": 5.265980834960938, "step": 1400 }, { "epoch": 0.29, "grad_norm": 1.4729372262954712, "learning_rate": 1.4204000000000002e-05, "loss": 5.241402587890625, "step": 1450 }, { "epoch": 0.3, "grad_norm": 1.5896576642990112, "learning_rate": 1.4004000000000002e-05, "loss": 5.286559448242188, "step": 1500 }, { "epoch": 0.31, "grad_norm": 1.5870860815048218, "learning_rate": 1.3804000000000002e-05, "loss": 5.2646923828125, "step": 1550 }, { "epoch": 0.32, "grad_norm": 1.9024347066879272, "learning_rate": 1.3604000000000002e-05, "loss": 5.213834838867188, "step": 1600 }, { "epoch": 0.33, "grad_norm": 1.545648217201233, "learning_rate": 1.3404e-05, "loss": 5.284638671875, "step": 1650 }, { "epoch": 0.34, "grad_norm": 1.7494972944259644, "learning_rate": 1.3204000000000001e-05, "loss": 5.29470703125, "step": 1700 }, { "epoch": 0.35, "grad_norm": 1.534767746925354, "learning_rate": 1.3004000000000001e-05, "loss": 5.17380126953125, "step": 1750 }, { "epoch": 0.36, "grad_norm": 1.9378814697265625, "learning_rate": 1.2804000000000001e-05, "loss": 5.2462841796875, "step": 1800 }, { "epoch": 0.37, "grad_norm": 1.7414913177490234, "learning_rate": 1.2604e-05, "loss": 5.184336547851562, "step": 1850 }, { "epoch": 0.38, "grad_norm": 1.312013030052185, "learning_rate": 1.2404e-05, "loss": 5.2285107421875, "step": 1900 }, { "epoch": 0.39, "grad_norm": 2.7188756465911865, "learning_rate": 1.2204e-05, "loss": 5.183685302734375, "step": 1950 }, { "epoch": 0.4, "grad_norm": 1.5776464939117432, "learning_rate": 1.2004e-05, "loss": 5.212086181640625, "step": 2000 }, { "epoch": 0.41, "grad_norm": 1.8002684116363525, "learning_rate": 1.1803999999999999e-05, "loss": 5.187786865234375, "step": 2050 }, { "epoch": 0.42, "grad_norm": 1.690958023071289, "learning_rate": 1.1604000000000003e-05, "loss": 5.070486145019531, "step": 2100 }, { "epoch": 0.43, "grad_norm": 1.566103219985962, "learning_rate": 1.1404000000000001e-05, "loss": 5.091676330566406, "step": 2150 }, { "epoch": 0.44, "grad_norm": 1.6997359991073608, "learning_rate": 1.1204000000000001e-05, "loss": 5.151467895507812, "step": 2200 }, { "epoch": 0.45, "grad_norm": 1.4722249507904053, "learning_rate": 1.1004000000000002e-05, "loss": 5.095538024902344, "step": 2250 }, { "epoch": 0.46, "grad_norm": 1.7524068355560303, "learning_rate": 1.0804000000000002e-05, "loss": 5.041690673828125, "step": 2300 }, { "epoch": 0.47, "grad_norm": 1.56508207321167, "learning_rate": 1.0604e-05, "loss": 5.047481689453125, "step": 2350 }, { "epoch": 0.48, "grad_norm": 1.5206327438354492, "learning_rate": 1.0404e-05, "loss": 5.138922119140625, "step": 2400 }, { "epoch": 0.49, "grad_norm": 1.7860289812088013, "learning_rate": 1.0204000000000001e-05, "loss": 5.1789208984375, "step": 2450 }, { "epoch": 0.5, "grad_norm": 1.762772798538208, "learning_rate": 1.0004000000000001e-05, "loss": 4.9995901489257815, "step": 2500 }, { "epoch": 0.51, "grad_norm": 1.9011698961257935, "learning_rate": 9.804000000000001e-06, "loss": 5.030536804199219, "step": 2550 }, { "epoch": 0.52, "grad_norm": 1.7511317729949951, "learning_rate": 9.604000000000002e-06, "loss": 5.054608459472656, "step": 2600 }, { "epoch": 0.53, "grad_norm": 1.454971194267273, "learning_rate": 9.404e-06, "loss": 5.037495422363281, "step": 2650 }, { "epoch": 0.54, "grad_norm": 1.5396337509155273, "learning_rate": 9.204e-06, "loss": 5.031622314453125, "step": 2700 }, { "epoch": 0.55, "grad_norm": 1.467636227607727, "learning_rate": 9.004e-06, "loss": 5.0833673095703125, "step": 2750 }, { "epoch": 0.56, "grad_norm": 1.5613443851470947, "learning_rate": 8.804e-06, "loss": 4.928188171386719, "step": 2800 }, { "epoch": 0.57, "grad_norm": 1.7963453531265259, "learning_rate": 8.604000000000001e-06, "loss": 5.066728515625, "step": 2850 }, { "epoch": 0.58, "grad_norm": 1.9899804592132568, "learning_rate": 8.404000000000001e-06, "loss": 4.9547052001953125, "step": 2900 }, { "epoch": 0.59, "grad_norm": 1.502890944480896, "learning_rate": 8.204000000000001e-06, "loss": 5.096377868652343, "step": 2950 }, { "epoch": 0.6, "grad_norm": 1.554468035697937, "learning_rate": 8.004e-06, "loss": 5.010852966308594, "step": 3000 }, { "epoch": 0.61, "grad_norm": 1.768509030342102, "learning_rate": 7.804e-06, "loss": 4.954717102050782, "step": 3050 }, { "epoch": 0.62, "grad_norm": 1.5894496440887451, "learning_rate": 7.604e-06, "loss": 4.946051025390625, "step": 3100 }, { "epoch": 0.63, "grad_norm": 1.7610846757888794, "learning_rate": 7.404e-06, "loss": 4.855561828613281, "step": 3150 }, { "epoch": 0.64, "grad_norm": 1.7571163177490234, "learning_rate": 7.204000000000001e-06, "loss": 4.930430297851562, "step": 3200 }, { "epoch": 0.65, "grad_norm": 1.6853914260864258, "learning_rate": 7.004000000000001e-06, "loss": 4.855824584960938, "step": 3250 }, { "epoch": 0.66, "grad_norm": 1.4607149362564087, "learning_rate": 6.804e-06, "loss": 5.078975524902344, "step": 3300 }, { "epoch": 0.67, "grad_norm": 1.9196784496307373, "learning_rate": 6.604000000000001e-06, "loss": 4.9388720703125, "step": 3350 }, { "epoch": 0.68, "grad_norm": 1.8639936447143555, "learning_rate": 6.404e-06, "loss": 4.99400146484375, "step": 3400 }, { "epoch": 0.69, "grad_norm": 1.7838945388793945, "learning_rate": 6.204e-06, "loss": 4.887441101074219, "step": 3450 }, { "epoch": 0.7, "grad_norm": 1.659825086593628, "learning_rate": 6.004000000000001e-06, "loss": 4.677823181152344, "step": 3500 }, { "epoch": 0.71, "grad_norm": 1.4775367975234985, "learning_rate": 5.804000000000001e-06, "loss": 4.8725653076171875, "step": 3550 }, { "epoch": 0.72, "grad_norm": 1.4383823871612549, "learning_rate": 5.604000000000001e-06, "loss": 4.973861389160156, "step": 3600 }, { "epoch": 0.73, "grad_norm": 1.7640321254730225, "learning_rate": 5.404e-06, "loss": 4.926856994628906, "step": 3650 }, { "epoch": 0.74, "grad_norm": 1.3515348434448242, "learning_rate": 5.2040000000000005e-06, "loss": 4.904393005371094, "step": 3700 }, { "epoch": 0.75, "grad_norm": 1.3852506875991821, "learning_rate": 5.004e-06, "loss": 4.93823486328125, "step": 3750 }, { "epoch": 0.76, "grad_norm": 1.5573221445083618, "learning_rate": 4.804e-06, "loss": 4.901606140136718, "step": 3800 }, { "epoch": 0.77, "grad_norm": 1.7575520277023315, "learning_rate": 4.604e-06, "loss": 4.8115087890625, "step": 3850 }, { "epoch": 0.78, "grad_norm": 1.5037921667099, "learning_rate": 4.4040000000000005e-06, "loss": 4.910712890625, "step": 3900 }, { "epoch": 0.79, "grad_norm": 2.2725980281829834, "learning_rate": 4.204e-06, "loss": 4.771448364257813, "step": 3950 }, { "epoch": 0.8, "grad_norm": 1.4463030099868774, "learning_rate": 4.004e-06, "loss": 4.809368286132813, "step": 4000 }, { "epoch": 0.81, "grad_norm": 1.726515293121338, "learning_rate": 3.8040000000000003e-06, "loss": 4.944158935546875, "step": 4050 }, { "epoch": 0.82, "grad_norm": 1.6566334962844849, "learning_rate": 3.604e-06, "loss": 4.9215576171875, "step": 4100 }, { "epoch": 0.83, "grad_norm": 1.5525240898132324, "learning_rate": 3.404e-06, "loss": 4.8733984375, "step": 4150 }, { "epoch": 0.84, "grad_norm": 1.4844027757644653, "learning_rate": 3.2040000000000006e-06, "loss": 4.825590515136719, "step": 4200 }, { "epoch": 0.85, "grad_norm": 1.5394399166107178, "learning_rate": 3.0040000000000004e-06, "loss": 4.887457580566406, "step": 4250 }, { "epoch": 0.86, "grad_norm": 1.463205099105835, "learning_rate": 2.804e-06, "loss": 4.960470275878906, "step": 4300 }, { "epoch": 0.87, "grad_norm": 1.674668550491333, "learning_rate": 2.6040000000000004e-06, "loss": 4.964275207519531, "step": 4350 }, { "epoch": 0.88, "grad_norm": 1.597741723060608, "learning_rate": 2.404e-06, "loss": 4.844057312011719, "step": 4400 }, { "epoch": 0.89, "grad_norm": 1.4939367771148682, "learning_rate": 2.2040000000000004e-06, "loss": 4.818087463378906, "step": 4450 }, { "epoch": 0.9, "grad_norm": 1.5718450546264648, "learning_rate": 2.004e-06, "loss": 4.905419006347656, "step": 4500 }, { "epoch": 0.91, "grad_norm": 1.5969394445419312, "learning_rate": 1.8040000000000002e-06, "loss": 4.818899230957031, "step": 4550 }, { "epoch": 0.92, "grad_norm": 1.6721904277801514, "learning_rate": 1.604e-06, "loss": 4.901571044921875, "step": 4600 }, { "epoch": 0.93, "grad_norm": 1.3588262796401978, "learning_rate": 1.404e-06, "loss": 4.869562377929688, "step": 4650 }, { "epoch": 0.94, "grad_norm": 1.6911239624023438, "learning_rate": 1.204e-06, "loss": 4.782047729492188, "step": 4700 }, { "epoch": 0.95, "grad_norm": 1.359279990196228, "learning_rate": 1.004e-06, "loss": 4.815296936035156, "step": 4750 }, { "epoch": 0.96, "grad_norm": 1.4526113271713257, "learning_rate": 8.04e-07, "loss": 4.827503051757812, "step": 4800 }, { "epoch": 0.97, "grad_norm": 1.6256376504898071, "learning_rate": 6.040000000000001e-07, "loss": 4.810068359375, "step": 4850 }, { "epoch": 0.98, "grad_norm": 1.2820252180099487, "learning_rate": 4.04e-07, "loss": 4.8670956420898435, "step": 4900 }, { "epoch": 0.99, "grad_norm": 1.3670392036437988, "learning_rate": 2.0400000000000003e-07, "loss": 4.911158447265625, "step": 4950 }, { "epoch": 1.0, "grad_norm": 1.4537466764450073, "learning_rate": 4e-09, "loss": 4.827361145019531, "step": 5000 } ], "logging_steps": 50, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.090336256e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }