{ "best_global_step": 350, "best_metric": 0.9981072077528771, "best_model_checkpoint": "saves/qwen2_5-coder-1.5b/freeze/sft/checkpoint-200", "epoch": 0.48470160557406844, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012117540139351712, "grad_norm": 1.1778391599655151, "learning_rate": 1.0843373493975905e-06, "loss": 0.0668, "step": 10 }, { "epoch": 0.024235080278703424, "grad_norm": 0.807381808757782, "learning_rate": 2.2891566265060243e-06, "loss": 0.0256, "step": 20 }, { "epoch": 0.036352620418055134, "grad_norm": 0.5498034954071045, "learning_rate": 3.4939759036144583e-06, "loss": 0.0519, "step": 30 }, { "epoch": 0.04847016055740685, "grad_norm": 1.5650098323822021, "learning_rate": 4.698795180722892e-06, "loss": 0.0133, "step": 40 }, { "epoch": 0.060587700696758555, "grad_norm": 2.43689227104187, "learning_rate": 5.9036144578313255e-06, "loss": 0.0147, "step": 50 }, { "epoch": 0.060587700696758555, "eval_accuracy": 0.9958737129012719, "eval_loss": 0.014296969398856163, "eval_runtime": 528.9314, "eval_samples_per_second": 3.121, "eval_steps_per_second": 3.121, "step": 50 }, { "epoch": 0.07270524083611027, "grad_norm": 0.7519652247428894, "learning_rate": 7.1084337349397595e-06, "loss": 0.0091, "step": 60 }, { "epoch": 0.08482278097546198, "grad_norm": 0.13313020765781403, "learning_rate": 8.313253012048194e-06, "loss": 0.0072, "step": 70 }, { "epoch": 0.0969403211148137, "grad_norm": 1.2990883588790894, "learning_rate": 9.518072289156628e-06, "loss": 0.0065, "step": 80 }, { "epoch": 0.1090578612541654, "grad_norm": 3.4033000469207764, "learning_rate": 1.0722891566265062e-05, "loss": 0.0178, "step": 90 }, { "epoch": 0.12117540139351711, "grad_norm": 1.2542734146118164, "learning_rate": 1.1927710843373494e-05, "loss": 0.0081, "step": 100 }, { "epoch": 0.12117540139351711, "eval_accuracy": 0.996290127195639, "eval_loss": 0.013640574179589748, "eval_runtime": 528.0812, "eval_samples_per_second": 3.126, "eval_steps_per_second": 3.126, "step": 100 }, { "epoch": 0.13329294153286883, "grad_norm": 0.1944456845521927, "learning_rate": 1.3132530120481928e-05, "loss": 0.0109, "step": 110 }, { "epoch": 0.14541048167222054, "grad_norm": 0.07844550907611847, "learning_rate": 1.4337349397590364e-05, "loss": 0.0074, "step": 120 }, { "epoch": 0.15752802181157224, "grad_norm": 1.4879134893417358, "learning_rate": 1.5542168674698796e-05, "loss": 0.0138, "step": 130 }, { "epoch": 0.16964556195092395, "grad_norm": 0.02974073402583599, "learning_rate": 1.6746987951807228e-05, "loss": 0.0054, "step": 140 }, { "epoch": 0.18176310209027569, "grad_norm": 1.4775711297988892, "learning_rate": 1.7951807228915664e-05, "loss": 0.0301, "step": 150 }, { "epoch": 0.18176310209027569, "eval_accuracy": 0.9965172622652938, "eval_loss": 0.012703897431492805, "eval_runtime": 528.1973, "eval_samples_per_second": 3.126, "eval_steps_per_second": 3.126, "step": 150 }, { "epoch": 0.1938806422296274, "grad_norm": 0.1594158411026001, "learning_rate": 1.91566265060241e-05, "loss": 0.0099, "step": 160 }, { "epoch": 0.2059981823689791, "grad_norm": 0.12540395557880402, "learning_rate": 1.9999798871699227e-05, "loss": 0.0072, "step": 170 }, { "epoch": 0.2181157225083308, "grad_norm": 0.7899434566497803, "learning_rate": 1.9996223482528378e-05, "loss": 0.0125, "step": 180 }, { "epoch": 0.2302332626476825, "grad_norm": 6.3130903244018555, "learning_rate": 1.9988180414914266e-05, "loss": 0.0198, "step": 190 }, { "epoch": 0.24235080278703422, "grad_norm": 0.9658088684082031, "learning_rate": 1.997567326359842e-05, "loss": 0.0106, "step": 200 }, { "epoch": 0.24235080278703422, "eval_accuracy": 0.997198667474258, "eval_loss": 0.013726573437452316, "eval_runtime": 528.9048, "eval_samples_per_second": 3.122, "eval_steps_per_second": 3.122, "step": 200 }, { "epoch": 0.2544683429263859, "grad_norm": 0.9940381646156311, "learning_rate": 1.995870761848492e-05, "loss": 0.0155, "step": 210 }, { "epoch": 0.26658588306573766, "grad_norm": 0.3996841013431549, "learning_rate": 1.993729106214203e-05, "loss": 0.0041, "step": 220 }, { "epoch": 0.27870342320508934, "grad_norm": 0.31791451573371887, "learning_rate": 1.9911433166413277e-05, "loss": 0.004, "step": 230 }, { "epoch": 0.2908209633444411, "grad_norm": 0.010878289118409157, "learning_rate": 1.988114548813946e-05, "loss": 0.0044, "step": 240 }, { "epoch": 0.3029385034837928, "grad_norm": 0.741184413433075, "learning_rate": 1.9846441563993465e-05, "loss": 0.0567, "step": 250 }, { "epoch": 0.3029385034837928, "eval_accuracy": 0.9970472440944882, "eval_loss": 0.010016725398600101, "eval_runtime": 527.5691, "eval_samples_per_second": 3.129, "eval_steps_per_second": 3.129, "step": 250 }, { "epoch": 0.3150560436231445, "grad_norm": 1.0590384006500244, "learning_rate": 1.980733690443021e-05, "loss": 0.0121, "step": 260 }, { "epoch": 0.3271735837624962, "grad_norm": 0.07605874538421631, "learning_rate": 1.9763848986754495e-05, "loss": 0.0072, "step": 270 }, { "epoch": 0.3392911239018479, "grad_norm": 1.010120153427124, "learning_rate": 1.971599724730972e-05, "loss": 0.0078, "step": 280 }, { "epoch": 0.35140866404119964, "grad_norm": 0.3639002740383148, "learning_rate": 1.966380307279109e-05, "loss": 0.0169, "step": 290 }, { "epoch": 0.36352620418055137, "grad_norm": 0.0576571561396122, "learning_rate": 1.9607289790687104e-05, "loss": 0.0062, "step": 300 }, { "epoch": 0.36352620418055137, "eval_accuracy": 0.9971608116293156, "eval_loss": 0.009564626030623913, "eval_runtime": 528.2806, "eval_samples_per_second": 3.125, "eval_steps_per_second": 3.125, "step": 300 }, { "epoch": 0.37564374431990305, "grad_norm": 0.06373932957649231, "learning_rate": 1.954648265885366e-05, "loss": 0.0176, "step": 310 }, { "epoch": 0.3877612844592548, "grad_norm": 0.22646072506904602, "learning_rate": 1.948140885422538e-05, "loss": 0.0042, "step": 320 }, { "epoch": 0.39987882459860646, "grad_norm": 0.009251746349036694, "learning_rate": 1.9412097460669258e-05, "loss": 0.002, "step": 330 }, { "epoch": 0.4119963647379582, "grad_norm": 0.04042113944888115, "learning_rate": 1.9338579455986e-05, "loss": 0.0083, "step": 340 }, { "epoch": 0.42411390487730993, "grad_norm": 0.2013320028781891, "learning_rate": 1.9260887698064912e-05, "loss": 0.0078, "step": 350 }, { "epoch": 0.42411390487730993, "eval_accuracy": 0.9981072077528771, "eval_loss": 0.006962946616113186, "eval_runtime": 527.8415, "eval_samples_per_second": 3.128, "eval_steps_per_second": 3.128, "step": 350 }, { "epoch": 0.4362314450166616, "grad_norm": 0.06966914236545563, "learning_rate": 1.9179056910198515e-05, "loss": 0.0072, "step": 360 }, { "epoch": 0.44834898515601335, "grad_norm": 0.25409865379333496, "learning_rate": 1.9093123665563434e-05, "loss": 0.006, "step": 370 }, { "epoch": 0.460466525295365, "grad_norm": 0.8325411081314087, "learning_rate": 1.9003126370874493e-05, "loss": 0.0105, "step": 380 }, { "epoch": 0.47258406543471676, "grad_norm": 0.9408934116363525, "learning_rate": 1.8909105249219345e-05, "loss": 0.0096, "step": 390 }, { "epoch": 0.48470160557406844, "grad_norm": 0.14342345297336578, "learning_rate": 1.881110232208133e-05, "loss": 0.0085, "step": 400 }, { "epoch": 0.48470160557406844, "eval_accuracy": 0.9978800726832223, "eval_loss": 0.006634953897446394, "eval_runtime": 528.8373, "eval_samples_per_second": 3.122, "eval_steps_per_second": 3.122, "step": 400 } ], "logging_steps": 10, "max_steps": 1652, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5031896623767552.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }