{ "best_global_step": 140, "best_metric": 0.22836367785930634, "best_model_checkpoint": "/content/models/gemma_jigsaw_lmh/checkpoint-140", "epoch": 2.7450980392156863, "eval_steps": 20, "global_step": 140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.560735213756561, "epoch": 0.39215686274509803, "grad_norm": 6.186038494110107, "learning_rate": 8.758169934640524e-06, "loss": 0.5541, "mean_token_accuracy": 0.7453125, "num_tokens": 67412.0, "step": 20 }, { "epoch": 0.39215686274509803, "eval_entropy": 2.491304580981915, "eval_loss": 0.3785918951034546, "eval_mean_token_accuracy": 0.7578671345343957, "eval_num_tokens": 67412.0, "eval_runtime": 2.6932, "eval_samples_per_second": 75.376, "eval_steps_per_second": 4.827, "step": 20 }, { "entropy": 2.419952464103699, "epoch": 0.7843137254901961, "grad_norm": 2.4226012229919434, "learning_rate": 7.450980392156863e-06, "loss": 0.3293, "mean_token_accuracy": 0.81640625, "num_tokens": 134808.0, "step": 40 }, { "epoch": 0.7843137254901961, "eval_entropy": 2.4435020043299747, "eval_loss": 0.2946617007255554, "eval_mean_token_accuracy": 0.8513986009817857, "eval_num_tokens": 134808.0, "eval_runtime": 2.6642, "eval_samples_per_second": 76.195, "eval_steps_per_second": 4.879, "step": 40 }, { "entropy": 2.423317462205887, "epoch": 1.1764705882352942, "grad_norm": 10.875091552734375, "learning_rate": 6.143790849673204e-06, "loss": 0.2871, "mean_token_accuracy": 0.8487723216414451, "num_tokens": 201046.0, "step": 60 }, { "epoch": 1.1764705882352942, "eval_entropy": 2.448130937723013, "eval_loss": 0.2594100534915924, "eval_mean_token_accuracy": 0.8621066441902747, "eval_num_tokens": 201046.0, "eval_runtime": 2.6633, "eval_samples_per_second": 76.221, "eval_steps_per_second": 4.881, "step": 60 }, { "entropy": 2.3511528968811035, "epoch": 1.5686274509803921, "grad_norm": 5.017323970794678, "learning_rate": 4.836601307189543e-06, "loss": 0.2503, "mean_token_accuracy": 0.87109375, "num_tokens": 269334.0, "step": 80 }, { "epoch": 1.5686274509803921, "eval_entropy": 2.3352334682758036, "eval_loss": 0.2594275176525116, "eval_mean_token_accuracy": 0.8824300720141485, "eval_num_tokens": 269334.0, "eval_runtime": 2.6613, "eval_samples_per_second": 76.278, "eval_steps_per_second": 4.885, "step": 80 }, { "entropy": 2.3079891920089723, "epoch": 1.9607843137254903, "grad_norm": 7.2991743087768555, "learning_rate": 3.529411764705883e-06, "loss": 0.247, "mean_token_accuracy": 0.87890625, "num_tokens": 337424.0, "step": 100 }, { "epoch": 1.9607843137254903, "eval_entropy": 2.3540270145122824, "eval_loss": 0.23124322295188904, "eval_mean_token_accuracy": 0.8861451057287363, "eval_num_tokens": 337424.0, "eval_runtime": 2.7035, "eval_samples_per_second": 75.087, "eval_steps_per_second": 4.809, "step": 100 }, { "entropy": 2.3495707869529725, "epoch": 2.3529411764705883, "grad_norm": 5.988176345825195, "learning_rate": 2.222222222222222e-06, "loss": 0.2009, "mean_token_accuracy": 0.9171875, "num_tokens": 404955.0, "step": 120 }, { "epoch": 2.3529411764705883, "eval_entropy": 2.4085900966937723, "eval_loss": 0.23444519937038422, "eval_mean_token_accuracy": 0.8957604903441209, "eval_num_tokens": 404955.0, "eval_runtime": 2.6365, "eval_samples_per_second": 76.995, "eval_steps_per_second": 4.931, "step": 120 }, { "entropy": 2.3811947822570803, "epoch": 2.7450980392156863, "grad_norm": 9.767471313476562, "learning_rate": 9.150326797385621e-07, "loss": 0.1973, "mean_token_accuracy": 0.91328125, "num_tokens": 472246.0, "step": 140 }, { "epoch": 2.7450980392156863, "eval_entropy": 2.4008009983943057, "eval_loss": 0.22836367785930634, "eval_mean_token_accuracy": 0.9040646873987638, "eval_num_tokens": 472246.0, "eval_runtime": 2.645, "eval_samples_per_second": 76.748, "eval_steps_per_second": 4.915, "step": 140 } ], "logging_steps": 20, "max_steps": 153, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0222031863807488e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }