{
  "best_global_step": 350,
  "best_metric": 0.9981072077528771,
  "best_model_checkpoint": "saves/qwen2_5-coder-1.5b/freeze/sft/checkpoint-200",
  "epoch": 0.48470160557406844,
  "eval_steps": 50,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012117540139351712,
      "grad_norm": 1.1778391599655151,
      "learning_rate": 1.0843373493975905e-06,
      "loss": 0.0668,
      "step": 10
    },
    {
      "epoch": 0.024235080278703424,
      "grad_norm": 0.807381808757782,
      "learning_rate": 2.2891566265060243e-06,
      "loss": 0.0256,
      "step": 20
    },
    {
      "epoch": 0.036352620418055134,
      "grad_norm": 0.5498034954071045,
      "learning_rate": 3.4939759036144583e-06,
      "loss": 0.0519,
      "step": 30
    },
    {
      "epoch": 0.04847016055740685,
      "grad_norm": 1.5650098323822021,
      "learning_rate": 4.698795180722892e-06,
      "loss": 0.0133,
      "step": 40
    },
    {
      "epoch": 0.060587700696758555,
      "grad_norm": 2.43689227104187,
      "learning_rate": 5.9036144578313255e-06,
      "loss": 0.0147,
      "step": 50
    },
    {
      "epoch": 0.060587700696758555,
      "eval_accuracy": 0.9958737129012719,
      "eval_loss": 0.014296969398856163,
      "eval_runtime": 528.9314,
      "eval_samples_per_second": 3.121,
      "eval_steps_per_second": 3.121,
      "step": 50
    },
    {
      "epoch": 0.07270524083611027,
      "grad_norm": 0.7519652247428894,
      "learning_rate": 7.1084337349397595e-06,
      "loss": 0.0091,
      "step": 60
    },
    {
      "epoch": 0.08482278097546198,
      "grad_norm": 0.13313020765781403,
      "learning_rate": 8.313253012048194e-06,
      "loss": 0.0072,
      "step": 70
    },
    {
      "epoch": 0.0969403211148137,
      "grad_norm": 1.2990883588790894,
      "learning_rate": 9.518072289156628e-06,
      "loss": 0.0065,
      "step": 80
    },
    {
      "epoch": 0.1090578612541654,
      "grad_norm": 3.4033000469207764,
      "learning_rate": 1.0722891566265062e-05,
      "loss": 0.0178,
      "step": 90
    },
    {
      "epoch": 0.12117540139351711,
      "grad_norm": 1.2542734146118164,
      "learning_rate": 1.1927710843373494e-05,
      "loss": 0.0081,
      "step": 100
    },
    {
      "epoch": 0.12117540139351711,
      "eval_accuracy": 0.996290127195639,
      "eval_loss": 0.013640574179589748,
      "eval_runtime": 528.0812,
      "eval_samples_per_second": 3.126,
      "eval_steps_per_second": 3.126,
      "step": 100
    },
    {
      "epoch": 0.13329294153286883,
      "grad_norm": 0.1944456845521927,
      "learning_rate": 1.3132530120481928e-05,
      "loss": 0.0109,
      "step": 110
    },
    {
      "epoch": 0.14541048167222054,
      "grad_norm": 0.07844550907611847,
      "learning_rate": 1.4337349397590364e-05,
      "loss": 0.0074,
      "step": 120
    },
    {
      "epoch": 0.15752802181157224,
      "grad_norm": 1.4879134893417358,
      "learning_rate": 1.5542168674698796e-05,
      "loss": 0.0138,
      "step": 130
    },
    {
      "epoch": 0.16964556195092395,
      "grad_norm": 0.02974073402583599,
      "learning_rate": 1.6746987951807228e-05,
      "loss": 0.0054,
      "step": 140
    },
    {
      "epoch": 0.18176310209027569,
      "grad_norm": 1.4775711297988892,
      "learning_rate": 1.7951807228915664e-05,
      "loss": 0.0301,
      "step": 150
    },
    {
      "epoch": 0.18176310209027569,
      "eval_accuracy": 0.9965172622652938,
      "eval_loss": 0.012703897431492805,
      "eval_runtime": 528.1973,
      "eval_samples_per_second": 3.126,
      "eval_steps_per_second": 3.126,
      "step": 150
    },
    {
      "epoch": 0.1938806422296274,
      "grad_norm": 0.1594158411026001,
      "learning_rate": 1.91566265060241e-05,
      "loss": 0.0099,
      "step": 160
    },
    {
      "epoch": 0.2059981823689791,
      "grad_norm": 0.12540395557880402,
      "learning_rate": 1.9999798871699227e-05,
      "loss": 0.0072,
      "step": 170
    },
    {
      "epoch": 0.2181157225083308,
      "grad_norm": 0.7899434566497803,
      "learning_rate": 1.9996223482528378e-05,
      "loss": 0.0125,
      "step": 180
    },
    {
      "epoch": 0.2302332626476825,
      "grad_norm": 6.3130903244018555,
      "learning_rate": 1.9988180414914266e-05,
      "loss": 0.0198,
      "step": 190
    },
    {
      "epoch": 0.24235080278703422,
      "grad_norm": 0.9658088684082031,
      "learning_rate": 1.997567326359842e-05,
      "loss": 0.0106,
      "step": 200
    },
    {
      "epoch": 0.24235080278703422,
      "eval_accuracy": 0.997198667474258,
      "eval_loss": 0.013726573437452316,
      "eval_runtime": 528.9048,
      "eval_samples_per_second": 3.122,
      "eval_steps_per_second": 3.122,
      "step": 200
    },
    {
      "epoch": 0.2544683429263859,
      "grad_norm": 0.9940381646156311,
      "learning_rate": 1.995870761848492e-05,
      "loss": 0.0155,
      "step": 210
    },
    {
      "epoch": 0.26658588306573766,
      "grad_norm": 0.3996841013431549,
      "learning_rate": 1.993729106214203e-05,
      "loss": 0.0041,
      "step": 220
    },
    {
      "epoch": 0.27870342320508934,
      "grad_norm": 0.31791451573371887,
      "learning_rate": 1.9911433166413277e-05,
      "loss": 0.004,
      "step": 230
    },
    {
      "epoch": 0.2908209633444411,
      "grad_norm": 0.010878289118409157,
      "learning_rate": 1.988114548813946e-05,
      "loss": 0.0044,
      "step": 240
    },
    {
      "epoch": 0.3029385034837928,
      "grad_norm": 0.741184413433075,
      "learning_rate": 1.9846441563993465e-05,
      "loss": 0.0567,
      "step": 250
    },
    {
      "epoch": 0.3029385034837928,
      "eval_accuracy": 0.9970472440944882,
      "eval_loss": 0.010016725398600101,
      "eval_runtime": 527.5691,
      "eval_samples_per_second": 3.129,
      "eval_steps_per_second": 3.129,
      "step": 250
    },
    {
      "epoch": 0.3150560436231445,
      "grad_norm": 1.0590384006500244,
      "learning_rate": 1.980733690443021e-05,
      "loss": 0.0121,
      "step": 260
    },
    {
      "epoch": 0.3271735837624962,
      "grad_norm": 0.07605874538421631,
      "learning_rate": 1.9763848986754495e-05,
      "loss": 0.0072,
      "step": 270
    },
    {
      "epoch": 0.3392911239018479,
      "grad_norm": 1.010120153427124,
      "learning_rate": 1.971599724730972e-05,
      "loss": 0.0078,
      "step": 280
    },
    {
      "epoch": 0.35140866404119964,
      "grad_norm": 0.3639002740383148,
      "learning_rate": 1.966380307279109e-05,
      "loss": 0.0169,
      "step": 290
    },
    {
      "epoch": 0.36352620418055137,
      "grad_norm": 0.0576571561396122,
      "learning_rate": 1.9607289790687104e-05,
      "loss": 0.0062,
      "step": 300
    },
    {
      "epoch": 0.36352620418055137,
      "eval_accuracy": 0.9971608116293156,
      "eval_loss": 0.009564626030623913,
      "eval_runtime": 528.2806,
      "eval_samples_per_second": 3.125,
      "eval_steps_per_second": 3.125,
      "step": 300
    },
    {
      "epoch": 0.37564374431990305,
      "grad_norm": 0.06373932957649231,
      "learning_rate": 1.954648265885366e-05,
      "loss": 0.0176,
      "step": 310
    },
    {
      "epoch": 0.3877612844592548,
      "grad_norm": 0.22646072506904602,
      "learning_rate": 1.948140885422538e-05,
      "loss": 0.0042,
      "step": 320
    },
    {
      "epoch": 0.39987882459860646,
      "grad_norm": 0.009251746349036694,
      "learning_rate": 1.9412097460669258e-05,
      "loss": 0.002,
      "step": 330
    },
    {
      "epoch": 0.4119963647379582,
      "grad_norm": 0.04042113944888115,
      "learning_rate": 1.9338579455986e-05,
      "loss": 0.0083,
      "step": 340
    },
    {
      "epoch": 0.42411390487730993,
      "grad_norm": 0.2013320028781891,
      "learning_rate": 1.9260887698064912e-05,
      "loss": 0.0078,
      "step": 350
    },
    {
      "epoch": 0.42411390487730993,
      "eval_accuracy": 0.9981072077528771,
      "eval_loss": 0.006962946616113186,
      "eval_runtime": 527.8415,
      "eval_samples_per_second": 3.128,
      "eval_steps_per_second": 3.128,
      "step": 350
    },
    {
      "epoch": 0.4362314450166616,
      "grad_norm": 0.06966914236545563,
      "learning_rate": 1.9179056910198515e-05,
      "loss": 0.0072,
      "step": 360
    },
    {
      "epoch": 0.44834898515601335,
      "grad_norm": 0.25409865379333496,
      "learning_rate": 1.9093123665563434e-05,
      "loss": 0.006,
      "step": 370
    },
    {
      "epoch": 0.460466525295365,
      "grad_norm": 0.8325411081314087,
      "learning_rate": 1.9003126370874493e-05,
      "loss": 0.0105,
      "step": 380
    },
    {
      "epoch": 0.47258406543471676,
      "grad_norm": 0.9408934116363525,
      "learning_rate": 1.8909105249219345e-05,
      "loss": 0.0096,
      "step": 390
    },
    {
      "epoch": 0.48470160557406844,
      "grad_norm": 0.14342345297336578,
      "learning_rate": 1.881110232208133e-05,
      "loss": 0.0085,
      "step": 400
    },
    {
      "epoch": 0.48470160557406844,
      "eval_accuracy": 0.9978800726832223,
      "eval_loss": 0.006634953897446394,
      "eval_runtime": 528.8373,
      "eval_samples_per_second": 3.122,
      "eval_steps_per_second": 3.122,
      "step": 400
    }
  ],
  "logging_steps": 10,
  "max_steps": 1652,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5031896623767552.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}