| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01, |
| "grad_norm": 2.332974433898926, |
| "learning_rate": 1.9804000000000002e-05, |
| "loss": 9.66635009765625, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 1.6665964126586914, |
| "learning_rate": 1.9604e-05, |
| "loss": 8.088862915039062, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 2.068279504776001, |
| "learning_rate": 1.9404e-05, |
| "loss": 7.215781860351562, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 1.76992666721344, |
| "learning_rate": 1.9204e-05, |
| "loss": 6.690926513671875, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 1.480324149131775, |
| "learning_rate": 1.9004000000000003e-05, |
| "loss": 6.430185546875, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 1.1323567628860474, |
| "learning_rate": 1.8804e-05, |
| "loss": 6.076779174804687, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 1.248739242553711, |
| "learning_rate": 1.8604000000000003e-05, |
| "loss": 6.090475463867188, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 1.2340136766433716, |
| "learning_rate": 1.8404000000000002e-05, |
| "loss": 6.04998046875, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 1.1271188259124756, |
| "learning_rate": 1.8204e-05, |
| "loss": 5.8744744873046875, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 1.7444993257522583, |
| "learning_rate": 1.8004000000000002e-05, |
| "loss": 5.79928466796875, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 1.294396162033081, |
| "learning_rate": 1.7804e-05, |
| "loss": 5.830994873046875, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 1.1351591348648071, |
| "learning_rate": 1.7604e-05, |
| "loss": 5.764346313476563, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 1.6005209684371948, |
| "learning_rate": 1.7404e-05, |
| "loss": 5.688011474609375, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 1.3918774127960205, |
| "learning_rate": 1.7204e-05, |
| "loss": 5.67168212890625, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 1.0998594760894775, |
| "learning_rate": 1.7004000000000002e-05, |
| "loss": 5.563177490234375, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 1.2702234983444214, |
| "learning_rate": 1.6804e-05, |
| "loss": 5.638896484375, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 1.2025340795516968, |
| "learning_rate": 1.6604000000000002e-05, |
| "loss": 5.447022705078125, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 1.0838000774383545, |
| "learning_rate": 1.6404e-05, |
| "loss": 5.563392944335938, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 1.6105167865753174, |
| "learning_rate": 1.6204000000000003e-05, |
| "loss": 5.441434326171875, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 1.195156455039978, |
| "learning_rate": 1.6004e-05, |
| "loss": 5.535233154296875, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 1.3313252925872803, |
| "learning_rate": 1.5804000000000003e-05, |
| "loss": 5.4950341796875, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 1.2405613660812378, |
| "learning_rate": 1.5604000000000002e-05, |
| "loss": 5.525492553710937, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 1.3697841167449951, |
| "learning_rate": 1.5404e-05, |
| "loss": 5.394120483398438, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 1.6689515113830566, |
| "learning_rate": 1.5204e-05, |
| "loss": 5.42336181640625, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 1.6169382333755493, |
| "learning_rate": 1.5004e-05, |
| "loss": 5.263718872070313, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 1.3931348323822021, |
| "learning_rate": 1.4804000000000001e-05, |
| "loss": 5.332710571289063, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 1.3986672163009644, |
| "learning_rate": 1.4604000000000001e-05, |
| "loss": 5.300538330078125, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 1.2856848239898682, |
| "learning_rate": 1.4404e-05, |
| "loss": 5.265980834960938, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 1.4729372262954712, |
| "learning_rate": 1.4204000000000002e-05, |
| "loss": 5.241402587890625, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 1.5896576642990112, |
| "learning_rate": 1.4004000000000002e-05, |
| "loss": 5.286559448242188, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 1.5870860815048218, |
| "learning_rate": 1.3804000000000002e-05, |
| "loss": 5.2646923828125, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 1.9024347066879272, |
| "learning_rate": 1.3604000000000002e-05, |
| "loss": 5.213834838867188, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 1.545648217201233, |
| "learning_rate": 1.3404e-05, |
| "loss": 5.284638671875, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 1.7494972944259644, |
| "learning_rate": 1.3204000000000001e-05, |
| "loss": 5.29470703125, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 1.534767746925354, |
| "learning_rate": 1.3004000000000001e-05, |
| "loss": 5.17380126953125, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 1.9378814697265625, |
| "learning_rate": 1.2804000000000001e-05, |
| "loss": 5.2462841796875, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 1.7414913177490234, |
| "learning_rate": 1.2604e-05, |
| "loss": 5.184336547851562, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 1.312013030052185, |
| "learning_rate": 1.2404e-05, |
| "loss": 5.2285107421875, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 2.7188756465911865, |
| "learning_rate": 1.2204e-05, |
| "loss": 5.183685302734375, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 1.5776464939117432, |
| "learning_rate": 1.2004e-05, |
| "loss": 5.212086181640625, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 1.8002684116363525, |
| "learning_rate": 1.1803999999999999e-05, |
| "loss": 5.187786865234375, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 1.690958023071289, |
| "learning_rate": 1.1604000000000003e-05, |
| "loss": 5.070486145019531, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 1.566103219985962, |
| "learning_rate": 1.1404000000000001e-05, |
| "loss": 5.091676330566406, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 1.6997359991073608, |
| "learning_rate": 1.1204000000000001e-05, |
| "loss": 5.151467895507812, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 1.4722249507904053, |
| "learning_rate": 1.1004000000000002e-05, |
| "loss": 5.095538024902344, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 1.7524068355560303, |
| "learning_rate": 1.0804000000000002e-05, |
| "loss": 5.041690673828125, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 1.56508207321167, |
| "learning_rate": 1.0604e-05, |
| "loss": 5.047481689453125, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 1.5206327438354492, |
| "learning_rate": 1.0404e-05, |
| "loss": 5.138922119140625, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 1.7860289812088013, |
| "learning_rate": 1.0204000000000001e-05, |
| "loss": 5.1789208984375, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 1.762772798538208, |
| "learning_rate": 1.0004000000000001e-05, |
| "loss": 4.9995901489257815, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 1.9011698961257935, |
| "learning_rate": 9.804000000000001e-06, |
| "loss": 5.030536804199219, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 1.7511317729949951, |
| "learning_rate": 9.604000000000002e-06, |
| "loss": 5.054608459472656, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 1.454971194267273, |
| "learning_rate": 9.404e-06, |
| "loss": 5.037495422363281, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 1.5396337509155273, |
| "learning_rate": 9.204e-06, |
| "loss": 5.031622314453125, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 1.467636227607727, |
| "learning_rate": 9.004e-06, |
| "loss": 5.0833673095703125, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 1.5613443851470947, |
| "learning_rate": 8.804e-06, |
| "loss": 4.928188171386719, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 1.7963453531265259, |
| "learning_rate": 8.604000000000001e-06, |
| "loss": 5.066728515625, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 1.9899804592132568, |
| "learning_rate": 8.404000000000001e-06, |
| "loss": 4.9547052001953125, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 1.502890944480896, |
| "learning_rate": 8.204000000000001e-06, |
| "loss": 5.096377868652343, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 1.554468035697937, |
| "learning_rate": 8.004e-06, |
| "loss": 5.010852966308594, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 1.768509030342102, |
| "learning_rate": 7.804e-06, |
| "loss": 4.954717102050782, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 1.5894496440887451, |
| "learning_rate": 7.604e-06, |
| "loss": 4.946051025390625, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 1.7610846757888794, |
| "learning_rate": 7.404e-06, |
| "loss": 4.855561828613281, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 1.7571163177490234, |
| "learning_rate": 7.204000000000001e-06, |
| "loss": 4.930430297851562, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 1.6853914260864258, |
| "learning_rate": 7.004000000000001e-06, |
| "loss": 4.855824584960938, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 1.4607149362564087, |
| "learning_rate": 6.804e-06, |
| "loss": 5.078975524902344, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 1.9196784496307373, |
| "learning_rate": 6.604000000000001e-06, |
| "loss": 4.9388720703125, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 1.8639936447143555, |
| "learning_rate": 6.404e-06, |
| "loss": 4.99400146484375, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 1.7838945388793945, |
| "learning_rate": 6.204e-06, |
| "loss": 4.887441101074219, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 1.659825086593628, |
| "learning_rate": 6.004000000000001e-06, |
| "loss": 4.677823181152344, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 1.4775367975234985, |
| "learning_rate": 5.804000000000001e-06, |
| "loss": 4.8725653076171875, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 1.4383823871612549, |
| "learning_rate": 5.604000000000001e-06, |
| "loss": 4.973861389160156, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 1.7640321254730225, |
| "learning_rate": 5.404e-06, |
| "loss": 4.926856994628906, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 1.3515348434448242, |
| "learning_rate": 5.2040000000000005e-06, |
| "loss": 4.904393005371094, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 1.3852506875991821, |
| "learning_rate": 5.004e-06, |
| "loss": 4.93823486328125, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 1.5573221445083618, |
| "learning_rate": 4.804e-06, |
| "loss": 4.901606140136718, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 1.7575520277023315, |
| "learning_rate": 4.604e-06, |
| "loss": 4.8115087890625, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 1.5037921667099, |
| "learning_rate": 4.4040000000000005e-06, |
| "loss": 4.910712890625, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 2.2725980281829834, |
| "learning_rate": 4.204e-06, |
| "loss": 4.771448364257813, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 1.4463030099868774, |
| "learning_rate": 4.004e-06, |
| "loss": 4.809368286132813, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 1.726515293121338, |
| "learning_rate": 3.8040000000000003e-06, |
| "loss": 4.944158935546875, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 1.6566334962844849, |
| "learning_rate": 3.604e-06, |
| "loss": 4.9215576171875, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 1.5525240898132324, |
| "learning_rate": 3.404e-06, |
| "loss": 4.8733984375, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 1.4844027757644653, |
| "learning_rate": 3.2040000000000006e-06, |
| "loss": 4.825590515136719, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 1.5394399166107178, |
| "learning_rate": 3.0040000000000004e-06, |
| "loss": 4.887457580566406, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 1.463205099105835, |
| "learning_rate": 2.804e-06, |
| "loss": 4.960470275878906, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 1.674668550491333, |
| "learning_rate": 2.6040000000000004e-06, |
| "loss": 4.964275207519531, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 1.597741723060608, |
| "learning_rate": 2.404e-06, |
| "loss": 4.844057312011719, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 1.4939367771148682, |
| "learning_rate": 2.2040000000000004e-06, |
| "loss": 4.818087463378906, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 1.5718450546264648, |
| "learning_rate": 2.004e-06, |
| "loss": 4.905419006347656, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 1.5969394445419312, |
| "learning_rate": 1.8040000000000002e-06, |
| "loss": 4.818899230957031, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 1.6721904277801514, |
| "learning_rate": 1.604e-06, |
| "loss": 4.901571044921875, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 1.3588262796401978, |
| "learning_rate": 1.404e-06, |
| "loss": 4.869562377929688, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 1.6911239624023438, |
| "learning_rate": 1.204e-06, |
| "loss": 4.782047729492188, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 1.359279990196228, |
| "learning_rate": 1.004e-06, |
| "loss": 4.815296936035156, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 1.4526113271713257, |
| "learning_rate": 8.04e-07, |
| "loss": 4.827503051757812, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 1.6256376504898071, |
| "learning_rate": 6.040000000000001e-07, |
| "loss": 4.810068359375, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 1.2820252180099487, |
| "learning_rate": 4.04e-07, |
| "loss": 4.8670956420898435, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 1.3670392036437988, |
| "learning_rate": 2.0400000000000003e-07, |
| "loss": 4.911158447265625, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.4537466764450073, |
| "learning_rate": 4e-09, |
| "loss": 4.827361145019531, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 5000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.090336256e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|