{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 97.44590163934426,
  "eval_steps": 500,
  "global_step": 3800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.2885245901639344,
      "grad_norm": 49.25,
      "learning_rate": 0.00019747235387045816,
      "loss": 6.9218,
      "step": 50
    },
    {
      "epoch": 2.577049180327869,
      "grad_norm": 73.5,
      "learning_rate": 0.0001948393891521854,
      "loss": 3.5446,
      "step": 100
    },
    {
      "epoch": 3.865573770491803,
      "grad_norm": 58.25,
      "learning_rate": 0.0001922064244339126,
      "loss": 3.191,
      "step": 150
    },
    {
      "epoch": 5.131147540983607,
      "grad_norm": 49.0,
      "learning_rate": 0.00018957345971563983,
      "loss": 2.9104,
      "step": 200
    },
    {
      "epoch": 6.419672131147541,
      "grad_norm": 66.5,
      "learning_rate": 0.00018694049499736707,
      "loss": 2.0795,
      "step": 250
    },
    {
      "epoch": 7.7081967213114755,
      "grad_norm": 45.75,
      "learning_rate": 0.00018430753027909427,
      "loss": 2.3055,
      "step": 300
    },
    {
      "epoch": 8.99672131147541,
      "grad_norm": 56.25,
      "learning_rate": 0.0001816745655608215,
      "loss": 1.8394,
      "step": 350
    },
    {
      "epoch": 10.262295081967213,
      "grad_norm": 50.75,
      "learning_rate": 0.00017904160084254874,
      "loss": 1.5723,
      "step": 400
    },
    {
      "epoch": 11.550819672131148,
      "grad_norm": 48.5,
      "learning_rate": 0.00017640863612427594,
      "loss": 1.4006,
      "step": 450
    },
    {
      "epoch": 12.839344262295082,
      "grad_norm": 39.75,
      "learning_rate": 0.00017377567140600318,
      "loss": 1.363,
      "step": 500
    },
    {
      "epoch": 14.104918032786886,
      "grad_norm": 38.5,
      "learning_rate": 0.0001711427066877304,
      "loss": 1.3352,
      "step": 550
    },
    {
      "epoch": 15.39344262295082,
      "grad_norm": 45.0,
      "learning_rate": 0.00016850974196945762,
      "loss": 1.1165,
      "step": 600
    },
    {
      "epoch": 16.681967213114753,
      "grad_norm": 44.0,
      "learning_rate": 0.00016587677725118485,
      "loss": 0.8736,
      "step": 650
    },
    {
      "epoch": 17.970491803278687,
      "grad_norm": 43.5,
      "learning_rate": 0.00016324381253291208,
      "loss": 1.0635,
      "step": 700
    },
    {
      "epoch": 19.236065573770492,
      "grad_norm": 34.25,
      "learning_rate": 0.0001606108478146393,
      "loss": 0.7858,
      "step": 750
    },
    {
      "epoch": 20.524590163934427,
      "grad_norm": 37.25,
      "learning_rate": 0.00015797788309636652,
      "loss": 0.8236,
      "step": 800
    },
    {
      "epoch": 21.81311475409836,
      "grad_norm": 35.5,
      "learning_rate": 0.00015534491837809376,
      "loss": 0.7766,
      "step": 850
    },
    {
      "epoch": 23.078688524590163,
      "grad_norm": 33.0,
      "learning_rate": 0.00015271195365982096,
      "loss": 0.6612,
      "step": 900
    },
    {
      "epoch": 24.367213114754097,
      "grad_norm": 33.75,
      "learning_rate": 0.0001500789889415482,
      "loss": 0.6364,
      "step": 950
    },
    {
      "epoch": 25.65573770491803,
      "grad_norm": 38.25,
      "learning_rate": 0.00014744602422327543,
      "loss": 0.6553,
      "step": 1000
    },
    {
      "epoch": 26.944262295081966,
      "grad_norm": 29.25,
      "learning_rate": 0.00014481305950500263,
      "loss": 0.5468,
      "step": 1050
    },
    {
      "epoch": 28.20983606557377,
      "grad_norm": 35.25,
      "learning_rate": 0.00014218009478672987,
      "loss": 0.5311,
      "step": 1100
    },
    {
      "epoch": 29.498360655737706,
      "grad_norm": 27.75,
      "learning_rate": 0.0001395471300684571,
      "loss": 0.5019,
      "step": 1150
    },
    {
      "epoch": 30.78688524590164,
      "grad_norm": 28.125,
      "learning_rate": 0.0001369141653501843,
      "loss": 0.6387,
      "step": 1200
    },
    {
      "epoch": 32.05245901639344,
      "grad_norm": 38.75,
      "learning_rate": 0.00013428120063191154,
      "loss": 0.5054,
      "step": 1250
    },
    {
      "epoch": 33.34098360655738,
      "grad_norm": 21.875,
      "learning_rate": 0.00013164823591363877,
      "loss": 0.4805,
      "step": 1300
    },
    {
      "epoch": 34.62950819672131,
      "grad_norm": 29.375,
      "learning_rate": 0.00012901527119536598,
      "loss": 0.5118,
      "step": 1350
    },
    {
      "epoch": 35.91803278688525,
      "grad_norm": 36.0,
      "learning_rate": 0.0001263823064770932,
      "loss": 0.447,
      "step": 1400
    },
    {
      "epoch": 37.18360655737705,
      "grad_norm": 24.125,
      "learning_rate": 0.00012374934175882045,
      "loss": 0.3921,
      "step": 1450
    },
    {
      "epoch": 38.472131147540985,
      "grad_norm": 21.875,
      "learning_rate": 0.00012111637704054765,
      "loss": 0.4268,
      "step": 1500
    },
    {
      "epoch": 39.760655737704916,
      "grad_norm": 22.25,
      "learning_rate": 0.00011848341232227489,
      "loss": 0.3317,
      "step": 1550
    },
    {
      "epoch": 41.02622950819672,
      "grad_norm": 15.0625,
      "learning_rate": 0.00011585044760400212,
      "loss": 0.387,
      "step": 1600
    },
    {
      "epoch": 42.31475409836066,
      "grad_norm": 20.875,
      "learning_rate": 0.00011321748288572934,
      "loss": 0.3285,
      "step": 1650
    },
    {
      "epoch": 43.60327868852459,
      "grad_norm": 21.375,
      "learning_rate": 0.00011058451816745656,
      "loss": 0.3281,
      "step": 1700
    },
    {
      "epoch": 44.89180327868853,
      "grad_norm": 22.75,
      "learning_rate": 0.00010795155344918379,
      "loss": 0.3148,
      "step": 1750
    },
    {
      "epoch": 46.157377049180326,
      "grad_norm": 18.75,
      "learning_rate": 0.00010531858873091101,
      "loss": 0.2567,
      "step": 1800
    },
    {
      "epoch": 47.445901639344264,
      "grad_norm": 23.75,
      "learning_rate": 0.00010268562401263824,
      "loss": 0.2609,
      "step": 1850
    },
    {
      "epoch": 48.734426229508195,
      "grad_norm": 18.75,
      "learning_rate": 0.00010005265929436546,
      "loss": 0.2365,
      "step": 1900
    },
    {
      "epoch": 50.0,
      "grad_norm": 5.09375,
      "learning_rate": 9.74196945760927e-05,
      "loss": 0.2555,
      "step": 1950
    },
    {
      "epoch": 51.28852459016394,
      "grad_norm": 11.9375,
      "learning_rate": 9.478672985781992e-05,
      "loss": 0.2184,
      "step": 2000
    },
    {
      "epoch": 52.57704918032787,
      "grad_norm": 12.8125,
      "learning_rate": 9.215376513954714e-05,
      "loss": 0.2279,
      "step": 2050
    },
    {
      "epoch": 53.86557377049181,
      "grad_norm": 13.8125,
      "learning_rate": 8.952080042127437e-05,
      "loss": 0.202,
      "step": 2100
    },
    {
      "epoch": 55.131147540983605,
      "grad_norm": 13.0625,
      "learning_rate": 8.688783570300159e-05,
      "loss": 0.1651,
      "step": 2150
    },
    {
      "epoch": 56.41967213114754,
      "grad_norm": 11.4375,
      "learning_rate": 8.425487098472881e-05,
      "loss": 0.2015,
      "step": 2200
    },
    {
      "epoch": 57.708196721311474,
      "grad_norm": 16.375,
      "learning_rate": 8.162190626645604e-05,
      "loss": 0.1504,
      "step": 2250
    },
    {
      "epoch": 58.99672131147541,
      "grad_norm": 13.0625,
      "learning_rate": 7.898894154818326e-05,
      "loss": 0.1725,
      "step": 2300
    },
    {
      "epoch": 60.26229508196721,
      "grad_norm": 13.6875,
      "learning_rate": 7.635597682991048e-05,
      "loss": 0.1499,
      "step": 2350
    },
    {
      "epoch": 61.55081967213115,
      "grad_norm": 7.59375,
      "learning_rate": 7.372301211163771e-05,
      "loss": 0.145,
      "step": 2400
    },
    {
      "epoch": 62.83934426229508,
      "grad_norm": 7.0625,
      "learning_rate": 7.109004739336493e-05,
      "loss": 0.1379,
      "step": 2450
    },
    {
      "epoch": 64.10491803278688,
      "grad_norm": 4.15625,
      "learning_rate": 6.845708267509215e-05,
      "loss": 0.1244,
      "step": 2500
    },
    {
      "epoch": 65.39344262295081,
      "grad_norm": 7.0,
      "learning_rate": 6.582411795681939e-05,
      "loss": 0.1214,
      "step": 2550
    },
    {
      "epoch": 66.68196721311476,
      "grad_norm": 9.3125,
      "learning_rate": 6.31911532385466e-05,
      "loss": 0.1341,
      "step": 2600
    },
    {
      "epoch": 67.97049180327869,
      "grad_norm": 7.09375,
      "learning_rate": 6.0558188520273826e-05,
      "loss": 0.1201,
      "step": 2650
    },
    {
      "epoch": 69.23606557377049,
      "grad_norm": 20.5,
      "learning_rate": 5.792522380200106e-05,
      "loss": 0.1049,
      "step": 2700
    },
    {
      "epoch": 70.52459016393442,
      "grad_norm": 5.90625,
      "learning_rate": 5.529225908372828e-05,
      "loss": 0.1033,
      "step": 2750
    },
    {
      "epoch": 71.81311475409836,
      "grad_norm": 3.25,
      "learning_rate": 5.2659294365455505e-05,
      "loss": 0.1028,
      "step": 2800
    },
    {
      "epoch": 73.07868852459016,
      "grad_norm": 8.0625,
      "learning_rate": 5.002632964718273e-05,
      "loss": 0.1003,
      "step": 2850
    },
    {
      "epoch": 74.3672131147541,
      "grad_norm": 6.9375,
      "learning_rate": 4.739336492890996e-05,
      "loss": 0.0993,
      "step": 2900
    },
    {
      "epoch": 75.65573770491804,
      "grad_norm": 3.734375,
      "learning_rate": 4.4760400210637185e-05,
      "loss": 0.0988,
      "step": 2950
    },
    {
      "epoch": 76.94426229508197,
      "grad_norm": 4.84375,
      "learning_rate": 4.2127435492364404e-05,
      "loss": 0.0885,
      "step": 3000
    },
    {
      "epoch": 78.20983606557377,
      "grad_norm": 2.71875,
      "learning_rate": 3.949447077409163e-05,
      "loss": 0.0816,
      "step": 3050
    },
    {
      "epoch": 79.4983606557377,
      "grad_norm": 1.59375,
      "learning_rate": 3.686150605581886e-05,
      "loss": 0.0969,
      "step": 3100
    },
    {
      "epoch": 80.78688524590164,
      "grad_norm": 2.5625,
      "learning_rate": 3.422854133754608e-05,
      "loss": 0.0886,
      "step": 3150
    },
    {
      "epoch": 82.05245901639344,
      "grad_norm": 4.6875,
      "learning_rate": 3.15955766192733e-05,
      "loss": 0.0801,
      "step": 3200
    },
    {
      "epoch": 83.34098360655737,
      "grad_norm": 2.53125,
      "learning_rate": 2.896261190100053e-05,
      "loss": 0.0888,
      "step": 3250
    },
    {
      "epoch": 84.62950819672132,
      "grad_norm": 4.1875,
      "learning_rate": 2.6329647182727753e-05,
      "loss": 0.0872,
      "step": 3300
    },
    {
      "epoch": 85.91803278688525,
      "grad_norm": 2.9375,
      "learning_rate": 2.369668246445498e-05,
      "loss": 0.0807,
      "step": 3350
    },
    {
      "epoch": 87.18360655737705,
      "grad_norm": 2.84375,
      "learning_rate": 2.1063717746182202e-05,
      "loss": 0.0779,
      "step": 3400
    },
    {
      "epoch": 88.47213114754098,
      "grad_norm": 1.8125,
      "learning_rate": 1.843075302790943e-05,
      "loss": 0.0741,
      "step": 3450
    },
    {
      "epoch": 89.76065573770492,
      "grad_norm": 2.171875,
      "learning_rate": 1.579778830963665e-05,
      "loss": 0.0833,
      "step": 3500
    },
    {
      "epoch": 91.02622950819672,
      "grad_norm": 2.484375,
      "learning_rate": 1.3164823591363876e-05,
      "loss": 0.0861,
      "step": 3550
    },
    {
      "epoch": 92.31475409836065,
      "grad_norm": 2.046875,
      "learning_rate": 1.0531858873091101e-05,
      "loss": 0.08,
      "step": 3600
    },
    {
      "epoch": 93.6032786885246,
      "grad_norm": 2.84375,
      "learning_rate": 7.898894154818326e-06,
      "loss": 0.0785,
      "step": 3650
    },
    {
      "epoch": 94.89180327868853,
      "grad_norm": 2.28125,
      "learning_rate": 5.2659294365455505e-06,
      "loss": 0.0936,
      "step": 3700
    },
    {
      "epoch": 96.15737704918033,
      "grad_norm": 1.7734375,
      "learning_rate": 2.6329647182727753e-06,
      "loss": 0.0741,
      "step": 3750
    },
    {
      "epoch": 97.44590163934426,
      "grad_norm": 2.21875,
      "learning_rate": 0.0,
      "loss": 0.0927,
      "step": 3800
    }
  ],
  "logging_steps": 50,
  "max_steps": 3800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.470967617037125e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}