{ "best_metric": 0.33792173862457275, "best_model_checkpoint": "/workspace/result/modern_bert/checkpoint-15000", "epoch": 1.7065449270415565, "eval_steps": 5000, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028443837642574736, "grad_norm": 10.82409381866455, "learning_rate": 5e-06, "loss": 9.665, "step": 250 }, { "epoch": 0.05688767528514947, "grad_norm": 11.049951553344727, "learning_rate": 1e-05, "loss": 4.056, "step": 500 }, { "epoch": 0.08533151292772422, "grad_norm": 10.50940227508545, "learning_rate": 1.5e-05, "loss": 2.8264, "step": 750 }, { "epoch": 0.11377535057029894, "grad_norm": 8.805991172790527, "learning_rate": 2e-05, "loss": 2.3758, "step": 1000 }, { "epoch": 0.1422191882128737, "grad_norm": 8.621254920959473, "learning_rate": 2.5e-05, "loss": 2.1501, "step": 1250 }, { "epoch": 0.17066302585544843, "grad_norm": 9.690339088439941, "learning_rate": 3e-05, "loss": 2.0101, "step": 1500 }, { "epoch": 0.19910686349802315, "grad_norm": 8.770583152770996, "learning_rate": 3.5e-05, "loss": 1.8995, "step": 1750 }, { "epoch": 0.2275507011405979, "grad_norm": 8.335855484008789, "learning_rate": 4e-05, "loss": 1.881, "step": 2000 }, { "epoch": 0.25599453878317263, "grad_norm": 8.480774879455566, "learning_rate": 4.5e-05, "loss": 1.8097, "step": 2250 }, { "epoch": 0.2844383764257474, "grad_norm": 7.207503318786621, "learning_rate": 5e-05, "loss": 1.7579, "step": 2500 }, { "epoch": 0.3128822140683221, "grad_norm": 6.785529613494873, "learning_rate": 5.500000000000001e-05, "loss": 1.7239, "step": 2750 }, { "epoch": 0.34132605171089686, "grad_norm": 5.305034637451172, "learning_rate": 6e-05, "loss": 1.7191, "step": 3000 }, { "epoch": 0.36976988935347155, "grad_norm": 5.429253101348877, "learning_rate": 6.500000000000001e-05, "loss": 1.6932, "step": 3250 }, { "epoch": 0.3982137269960463, "grad_norm": 5.111564636230469, "learning_rate": 7e-05, "loss": 1.6629, "step": 3500 }, { "epoch": 0.42665756463862103, "grad_norm": 4.902778625488281, "learning_rate": 7.500000000000001e-05, "loss": 1.6665, "step": 3750 }, { "epoch": 0.4551014022811958, "grad_norm": 4.092489242553711, "learning_rate": 8e-05, "loss": 1.6507, "step": 4000 }, { "epoch": 0.4835452399237705, "grad_norm": 4.722158432006836, "learning_rate": 8.5e-05, "loss": 1.6289, "step": 4250 }, { "epoch": 0.5119890775663453, "grad_norm": 5.072029113769531, "learning_rate": 9e-05, "loss": 1.6367, "step": 4500 }, { "epoch": 0.54043291520892, "grad_norm": 4.800993919372559, "learning_rate": 9.5e-05, "loss": 1.6001, "step": 4750 }, { "epoch": 0.5688767528514947, "grad_norm": 3.870645761489868, "learning_rate": 0.0001, "loss": 1.6057, "step": 5000 }, { "epoch": 0.5688767528514947, "eval_loss": 0.4018039107322693, "eval_runtime": 575.4905, "eval_samples_per_second": 868.824, "eval_steps_per_second": 6.789, "step": 5000 }, { "epoch": 0.5973205904940695, "grad_norm": 4.046791076660156, "learning_rate": 0.000105, "loss": 1.5728, "step": 5250 }, { "epoch": 0.6257644281366442, "grad_norm": 3.785233736038208, "learning_rate": 0.00011000000000000002, "loss": 1.5712, "step": 5500 }, { "epoch": 0.654208265779219, "grad_norm": 3.365227460861206, "learning_rate": 0.00011499999999999999, "loss": 1.5575, "step": 5750 }, { "epoch": 0.6826521034217937, "grad_norm": 3.3776111602783203, "learning_rate": 0.00012, "loss": 1.5616, "step": 6000 }, { "epoch": 0.7110959410643684, "grad_norm": 3.636517286300659, "learning_rate": 0.000125, "loss": 1.5548, "step": 6250 }, { "epoch": 0.7395397787069431, "grad_norm": 3.1290106773376465, "learning_rate": 0.00013000000000000002, "loss": 1.5377, "step": 6500 }, { "epoch": 0.7679836163495178, "grad_norm": 3.2603325843811035, "learning_rate": 0.00013500000000000003, "loss": 1.5418, "step": 6750 }, { "epoch": 0.7964274539920926, "grad_norm": 2.521440029144287, "learning_rate": 0.00014, "loss": 1.5444, "step": 7000 }, { "epoch": 0.8248712916346673, "grad_norm": 3.360039710998535, "learning_rate": 0.000145, "loss": 1.5442, "step": 7250 }, { "epoch": 0.8533151292772421, "grad_norm": 3.529284715652466, "learning_rate": 0.00015000000000000001, "loss": 1.5225, "step": 7500 }, { "epoch": 0.8817589669198168, "grad_norm": 2.673048257827759, "learning_rate": 0.000155, "loss": 1.5402, "step": 7750 }, { "epoch": 0.9102028045623916, "grad_norm": 3.269730806350708, "learning_rate": 0.00016, "loss": 1.5309, "step": 8000 }, { "epoch": 0.9386466422049663, "grad_norm": 2.5737032890319824, "learning_rate": 0.000165, "loss": 1.5179, "step": 8250 }, { "epoch": 0.967090479847541, "grad_norm": 2.7734227180480957, "learning_rate": 0.00017, "loss": 1.5027, "step": 8500 }, { "epoch": 0.9955343174901158, "grad_norm": 2.7340924739837646, "learning_rate": 0.000175, "loss": 1.5224, "step": 8750 }, { "epoch": 1.0238928236197629, "grad_norm": 2.8657028675079346, "learning_rate": 0.00018, "loss": 1.4871, "step": 9000 }, { "epoch": 1.0523366612623375, "grad_norm": 2.7939274311065674, "learning_rate": 0.00018500000000000002, "loss": 1.4865, "step": 9250 }, { "epoch": 1.0807804989049123, "grad_norm": 2.5338730812072754, "learning_rate": 0.00019, "loss": 1.4753, "step": 9500 }, { "epoch": 1.109224336547487, "grad_norm": 2.5085394382476807, "learning_rate": 0.000195, "loss": 1.4732, "step": 9750 }, { "epoch": 1.1376681741900616, "grad_norm": 2.2118537425994873, "learning_rate": 0.0002, "loss": 1.4932, "step": 10000 }, { "epoch": 1.1376681741900616, "eval_loss": 0.37532275915145874, "eval_runtime": 575.1259, "eval_samples_per_second": 869.375, "eval_steps_per_second": 6.793, "step": 10000 }, { "epoch": 1.1661120118326365, "grad_norm": 2.238839864730835, "learning_rate": 0.00019694507240178408, "loss": 1.5027, "step": 10250 }, { "epoch": 1.1945558494752113, "grad_norm": 2.1695666313171387, "learning_rate": 0.00019389014480356817, "loss": 1.4716, "step": 10500 }, { "epoch": 1.222999687117786, "grad_norm": 2.2739689350128174, "learning_rate": 0.00019083521720535224, "loss": 1.4586, "step": 10750 }, { "epoch": 1.2514435247603606, "grad_norm": 2.2888431549072266, "learning_rate": 0.0001877802896071363, "loss": 1.4584, "step": 11000 }, { "epoch": 1.2798873624029354, "grad_norm": 1.8000621795654297, "learning_rate": 0.00018472536200892038, "loss": 1.4447, "step": 11250 }, { "epoch": 1.30833120004551, "grad_norm": 2.0863656997680664, "learning_rate": 0.00018167043441070447, "loss": 1.4604, "step": 11500 }, { "epoch": 1.336775037688085, "grad_norm": 2.6197357177734375, "learning_rate": 0.00017861550681248857, "loss": 1.4261, "step": 11750 }, { "epoch": 1.3652188753306596, "grad_norm": 2.4154961109161377, "learning_rate": 0.00017556057921427263, "loss": 1.4352, "step": 12000 }, { "epoch": 1.3936627129732344, "grad_norm": 2.2526612281799316, "learning_rate": 0.00017250565161605673, "loss": 1.417, "step": 12250 }, { "epoch": 1.422106550615809, "grad_norm": 2.244547128677368, "learning_rate": 0.0001694507240178408, "loss": 1.404, "step": 12500 }, { "epoch": 1.450550388258384, "grad_norm": 2.0988707542419434, "learning_rate": 0.00016639579641962486, "loss": 1.4072, "step": 12750 }, { "epoch": 1.4789942259009585, "grad_norm": 2.1393070220947266, "learning_rate": 0.00016334086882140893, "loss": 1.3902, "step": 13000 }, { "epoch": 1.5074380635435332, "grad_norm": 2.1057634353637695, "learning_rate": 0.00016028594122319303, "loss": 1.384, "step": 13250 }, { "epoch": 1.535881901186108, "grad_norm": 2.112243175506592, "learning_rate": 0.0001572310136249771, "loss": 1.3577, "step": 13500 }, { "epoch": 1.5643257388286829, "grad_norm": 2.0681891441345215, "learning_rate": 0.00015417608602676116, "loss": 1.3657, "step": 13750 }, { "epoch": 1.5927695764712575, "grad_norm": 1.8423341512680054, "learning_rate": 0.00015112115842854523, "loss": 1.3538, "step": 14000 }, { "epoch": 1.6212134141138321, "grad_norm": 2.0194766521453857, "learning_rate": 0.00014806623083032933, "loss": 1.3563, "step": 14250 }, { "epoch": 1.649657251756407, "grad_norm": 2.087153911590576, "learning_rate": 0.0001450113032321134, "loss": 1.3666, "step": 14500 }, { "epoch": 1.6781010893989818, "grad_norm": 1.9989159107208252, "learning_rate": 0.0001419563756338975, "loss": 1.3584, "step": 14750 }, { "epoch": 1.7065449270415565, "grad_norm": 1.8271609544754028, "learning_rate": 0.00013890144803568156, "loss": 1.3198, "step": 15000 }, { "epoch": 1.7065449270415565, "eval_loss": 0.33792173862457275, "eval_runtime": 576.4758, "eval_samples_per_second": 867.339, "eval_steps_per_second": 6.777, "step": 15000 } ], "logging_steps": 250, "max_steps": 26367, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.6171614759862927e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }