engels / trainer_state.json
jpena-173's picture
Upload phase 1 epoch 1 - finetuned gemma-4-e4b-it teacher
d2c050c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 234,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.042735042735042736,
"grad_norm": 25.421951293945312,
"learning_rate": 3.888888888888889e-05,
"loss": 14.74276123046875,
"step": 10
},
{
"epoch": 0.08547008547008547,
"grad_norm": 12.622594833374023,
"learning_rate": 9.444444444444444e-05,
"loss": 9.163805389404297,
"step": 20
},
{
"epoch": 0.1282051282051282,
"grad_norm": 8.006195068359375,
"learning_rate": 0.00015000000000000001,
"loss": 5.28853759765625,
"step": 30
},
{
"epoch": 0.17094017094017094,
"grad_norm": 3.498350143432617,
"learning_rate": 0.00019999888744757143,
"loss": 4.076284027099609,
"step": 40
},
{
"epoch": 0.21367521367521367,
"grad_norm": 4.330902576446533,
"learning_rate": 0.00019986541110764565,
"loss": 3.210728073120117,
"step": 50
},
{
"epoch": 0.2564102564102564,
"grad_norm": 5.267370700836182,
"learning_rate": 0.0001995097645450266,
"loss": 2.6838237762451174,
"step": 60
},
{
"epoch": 0.29914529914529914,
"grad_norm": 3.2072625160217285,
"learning_rate": 0.00019893273896534936,
"loss": 2.4369382858276367,
"step": 70
},
{
"epoch": 0.3418803418803419,
"grad_norm": 3.1016528606414795,
"learning_rate": 0.00019813561807535598,
"loss": 2.205874443054199,
"step": 80
},
{
"epoch": 0.38461538461538464,
"grad_norm": 3.8450214862823486,
"learning_rate": 0.00019712017522703764,
"loss": 1.9279813766479492,
"step": 90
},
{
"epoch": 0.42735042735042733,
"grad_norm": 2.348071575164795,
"learning_rate": 0.00019588866947246498,
"loss": 1.8235645294189453,
"step": 100
},
{
"epoch": 0.4700854700854701,
"grad_norm": 3.2652463912963867,
"learning_rate": 0.00019444384053808288,
"loss": 1.8220790863037108,
"step": 110
},
{
"epoch": 0.5128205128205128,
"grad_norm": 2.6423192024230957,
"learning_rate": 0.00019278890272965096,
"loss": 1.7959518432617188,
"step": 120
},
{
"epoch": 0.5555555555555556,
"grad_norm": 2.6279354095458984,
"learning_rate": 0.00019092753778138886,
"loss": 1.7804344177246094,
"step": 130
},
{
"epoch": 0.5982905982905983,
"grad_norm": 2.6313953399658203,
"learning_rate": 0.0001888638866652356,
"loss": 1.642679214477539,
"step": 140
},
{
"epoch": 0.6410256410256411,
"grad_norm": 2.1009438037872314,
"learning_rate": 0.00018660254037844388,
"loss": 1.545415496826172,
"step": 150
},
{
"epoch": 0.6837606837606838,
"grad_norm": 2.672374963760376,
"learning_rate": 0.00018414852973000503,
"loss": 1.5645628929138184,
"step": 160
},
{
"epoch": 0.7264957264957265,
"grad_norm": 2.6783759593963623,
"learning_rate": 0.00018150731414862622,
"loss": 1.5343215942382813,
"step": 170
},
{
"epoch": 0.7692307692307693,
"grad_norm": 2.3677117824554443,
"learning_rate": 0.000178684769537159,
"loss": 1.5453574180603027,
"step": 180
},
{
"epoch": 0.811965811965812,
"grad_norm": 2.3082728385925293,
"learning_rate": 0.0001756871752004992,
"loss": 1.5324308395385742,
"step": 190
},
{
"epoch": 0.8547008547008547,
"grad_norm": 1.969205617904663,
"learning_rate": 0.00017252119987603973,
"loss": 1.5409900665283203,
"step": 200
},
{
"epoch": 0.8974358974358975,
"grad_norm": 2.5397582054138184,
"learning_rate": 0.00016919388689775464,
"loss": 1.4344990730285645,
"step": 210
},
{
"epoch": 0.9401709401709402,
"grad_norm": 2.0636305809020996,
"learning_rate": 0.00016571263852691888,
"loss": 1.4311028480529786,
"step": 220
},
{
"epoch": 0.9829059829059829,
"grad_norm": 2.4687087535858154,
"learning_rate": 0.0001620851994843244,
"loss": 1.461498737335205,
"step": 230
}
],
"logging_steps": 10,
"max_steps": 702,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.961214772268672e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}