gemma-3-4b-pt_cfact_01 / trainer_state.json
auroresearch's picture
Upload trained model
823b3f0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 11270,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 2.1398513317108154,
"epoch": 0.00044365572315882877,
"grad_norm": 8.540080070495605,
"learning_rate": 0.0,
"loss": 2.1688,
"mean_token_accuracy": 0.5243293642997742,
"num_tokens": 3210.0,
"step": 1
},
{
"entropy": 1.3569303842966924,
"epoch": 0.22182786157941436,
"grad_norm": 15.995442390441895,
"learning_rate": 2.9905649405771675e-06,
"loss": 1.3462,
"mean_token_accuracy": 0.6747287360126365,
"num_tokens": 204845.0,
"step": 500
},
{
"epoch": 0.22182786157941436,
"eval_entropy": 1.3003936431086656,
"eval_loss": 1.2755075693130493,
"eval_mean_token_accuracy": 0.6848681526826629,
"eval_num_tokens": 204845.0,
"eval_runtime": 25.1794,
"eval_samples_per_second": 44.759,
"eval_steps_per_second": 11.2,
"step": 500
},
{
"entropy": 1.3297723724842072,
"epoch": 0.44365572315882873,
"grad_norm": 16.810760498046875,
"learning_rate": 2.9523065141902646e-06,
"loss": 1.2976,
"mean_token_accuracy": 0.6786080609560012,
"num_tokens": 413967.0,
"step": 1000
},
{
"epoch": 0.44365572315882873,
"eval_entropy": 1.2563454367167561,
"eval_loss": 1.2633785009384155,
"eval_mean_token_accuracy": 0.6865419322717274,
"eval_num_tokens": 413967.0,
"eval_runtime": 25.7837,
"eval_samples_per_second": 43.71,
"eval_steps_per_second": 10.937,
"step": 1000
},
{
"entropy": 1.3071968636512756,
"epoch": 0.6654835847382431,
"grad_norm": 16.702791213989258,
"learning_rate": 2.885374907463648e-06,
"loss": 1.2762,
"mean_token_accuracy": 0.6824663616418839,
"num_tokens": 617323.0,
"step": 1500
},
{
"epoch": 0.6654835847382431,
"eval_entropy": 1.2650799679417981,
"eval_loss": 1.2535133361816406,
"eval_mean_token_accuracy": 0.6874359858796951,
"eval_num_tokens": 617323.0,
"eval_runtime": 25.5547,
"eval_samples_per_second": 44.101,
"eval_steps_per_second": 11.035,
"step": 1500
},
{
"entropy": 1.2826146301031112,
"epoch": 0.8873114463176575,
"grad_norm": 14.513501167297363,
"learning_rate": 2.7910915646044115e-06,
"loss": 1.2478,
"mean_token_accuracy": 0.6892818967103959,
"num_tokens": 824392.0,
"step": 2000
},
{
"epoch": 0.8873114463176575,
"eval_entropy": 1.2778702333886574,
"eval_loss": 1.250695824623108,
"eval_mean_token_accuracy": 0.6861607891025273,
"eval_num_tokens": 824392.0,
"eval_runtime": 25.7423,
"eval_samples_per_second": 43.78,
"eval_steps_per_second": 10.955,
"step": 2000
},
{
"entropy": 1.138995201587677,
"epoch": 1.109139307897072,
"grad_norm": 16.4141845703125,
"learning_rate": 2.671317940661071e-06,
"loss": 1.0831,
"mean_token_accuracy": 0.7227783533334732,
"num_tokens": 1029377.0,
"step": 2500
},
{
"epoch": 1.109139307897072,
"eval_entropy": 0.9666957462087591,
"eval_loss": 1.338200330734253,
"eval_mean_token_accuracy": 0.6823811653658007,
"eval_num_tokens": 1029377.0,
"eval_runtime": 25.7279,
"eval_samples_per_second": 43.805,
"eval_steps_per_second": 10.961,
"step": 2500
},
{
"entropy": 0.972771362900734,
"epoch": 1.3309671694764862,
"grad_norm": 16.756074905395508,
"learning_rate": 2.5284187504412197e-06,
"loss": 0.9187,
"mean_token_accuracy": 0.7588569446802139,
"num_tokens": 1228420.0,
"step": 3000
},
{
"epoch": 1.3309671694764862,
"eval_entropy": 1.04000264871205,
"eval_loss": 1.3259884119033813,
"eval_mean_token_accuracy": 0.6818445779330341,
"eval_num_tokens": 1228420.0,
"eval_runtime": 25.9704,
"eval_samples_per_second": 43.396,
"eval_steps_per_second": 10.859,
"step": 3000
},
{
"entropy": 0.9834260470867157,
"epoch": 1.5527950310559007,
"grad_norm": 30.475343704223633,
"learning_rate": 2.365215281470278e-06,
"loss": 0.935,
"mean_token_accuracy": 0.7537981454133987,
"num_tokens": 1435031.0,
"step": 3500
},
{
"epoch": 1.5527950310559007,
"eval_entropy": 1.0289501801450203,
"eval_loss": 1.3225187063217163,
"eval_mean_token_accuracy": 0.6830336544530612,
"eval_num_tokens": 1435031.0,
"eval_runtime": 27.4967,
"eval_samples_per_second": 40.987,
"eval_steps_per_second": 10.256,
"step": 3500
},
{
"entropy": 1.0000405538082122,
"epoch": 1.774622892635315,
"grad_norm": 21.4952392578125,
"learning_rate": 2.184929692743022e-06,
"loss": 0.9525,
"mean_token_accuracy": 0.7497001212835311,
"num_tokens": 1644240.0,
"step": 4000
},
{
"epoch": 1.774622892635315,
"eval_entropy": 1.0376262810636074,
"eval_loss": 1.326087474822998,
"eval_mean_token_accuracy": 0.6824532375268056,
"eval_num_tokens": 1644240.0,
"eval_runtime": 29.5174,
"eval_samples_per_second": 38.181,
"eval_steps_per_second": 9.554,
"step": 4000
},
{
"entropy": 0.9729628480672836,
"epoch": 1.9964507542147294,
"grad_norm": 13.072418212890625,
"learning_rate": 1.9911213989888633e-06,
"loss": 0.925,
"mean_token_accuracy": 0.7565750381946563,
"num_tokens": 1852665.0,
"step": 4500
},
{
"epoch": 1.9964507542147294,
"eval_entropy": 1.030793750539739,
"eval_loss": 1.3192322254180908,
"eval_mean_token_accuracy": 0.6828778567889058,
"eval_num_tokens": 1852665.0,
"eval_runtime": 26.6237,
"eval_samples_per_second": 42.331,
"eval_steps_per_second": 10.592,
"step": 4500
},
{
"entropy": 0.6869870555996895,
"epoch": 2.218278615794144,
"grad_norm": 19.351865768432617,
"learning_rate": 1.7876167964291556e-06,
"loss": 0.6228,
"mean_token_accuracy": 0.8327721027135849,
"num_tokens": 2061170.0,
"step": 5000
},
{
"epoch": 2.218278615794144,
"eval_entropy": 0.8354786801422741,
"eval_loss": 1.5526158809661865,
"eval_mean_token_accuracy": 0.671358603534969,
"eval_num_tokens": 2061170.0,
"eval_runtime": 27.1345,
"eval_samples_per_second": 41.534,
"eval_steps_per_second": 10.393,
"step": 5000
},
{
"entropy": 0.674837928891182,
"epoch": 2.440106477373558,
"grad_norm": 27.183914184570312,
"learning_rate": 1.5784337174650764e-06,
"loss": 0.6227,
"mean_token_accuracy": 0.8330682731866836,
"num_tokens": 2270310.0,
"step": 5500
},
{
"epoch": 2.440106477373558,
"eval_entropy": 0.8246719059369243,
"eval_loss": 1.5887655019760132,
"eval_mean_token_accuracy": 0.6711076922873234,
"eval_num_tokens": 2270310.0,
"eval_runtime": 27.2625,
"eval_samples_per_second": 41.339,
"eval_steps_per_second": 10.344,
"step": 5500
},
{
"entropy": 0.6849963802099228,
"epoch": 2.6619343389529724,
"grad_norm": 18.668190002441406,
"learning_rate": 1.3677021058024131e-06,
"loss": 0.6311,
"mean_token_accuracy": 0.831780132651329,
"num_tokens": 2472299.0,
"step": 6000
},
{
"epoch": 2.6619343389529724,
"eval_entropy": 0.8398415027780736,
"eval_loss": 1.5611518621444702,
"eval_mean_token_accuracy": 0.6720390224710424,
"eval_num_tokens": 2472299.0,
"eval_runtime": 27.3302,
"eval_samples_per_second": 41.236,
"eval_steps_per_second": 10.318,
"step": 6000
},
{
"entropy": 0.6844089294672012,
"epoch": 2.883762200532387,
"grad_norm": 22.124780654907227,
"learning_rate": 1.1595824781402537e-06,
"loss": 0.6286,
"mean_token_accuracy": 0.8318599998950958,
"num_tokens": 2675700.0,
"step": 6500
},
{
"epoch": 2.883762200532387,
"eval_entropy": 0.8314323486588525,
"eval_loss": 1.5601654052734375,
"eval_mean_token_accuracy": 0.6732265980953865,
"eval_num_tokens": 2675700.0,
"eval_runtime": 26.5088,
"eval_samples_per_second": 42.514,
"eval_steps_per_second": 10.638,
"step": 6500
},
{
"entropy": 0.5888768406510353,
"epoch": 3.1055900621118013,
"grad_norm": 10.715188026428223,
"learning_rate": 9.581837822509056e-07,
"loss": 0.5211,
"mean_token_accuracy": 0.8616769021749496,
"num_tokens": 2883705.0,
"step": 7000
},
{
"epoch": 3.1055900621118013,
"eval_entropy": 0.6851836236867499,
"eval_loss": 1.921167016029358,
"eval_mean_token_accuracy": 0.6621406837135342,
"eval_num_tokens": 2883705.0,
"eval_runtime": 27.1046,
"eval_samples_per_second": 41.58,
"eval_steps_per_second": 10.404,
"step": 7000
},
{
"entropy": 0.47865216064453125,
"epoch": 3.3274179236912156,
"grad_norm": 12.066122055053711,
"learning_rate": 7.674822731955381e-07,
"loss": 0.4101,
"mean_token_accuracy": 0.8913237881660462,
"num_tokens": 3095397.0,
"step": 7500
},
{
"epoch": 3.3274179236912156,
"eval_entropy": 0.6766338091581425,
"eval_loss": 1.936546802520752,
"eval_mean_token_accuracy": 0.6625262957515446,
"eval_num_tokens": 3095397.0,
"eval_runtime": 27.6428,
"eval_samples_per_second": 40.77,
"eval_steps_per_second": 10.202,
"step": 7500
},
{
"entropy": 0.4799597150683403,
"epoch": 3.54924578527063,
"grad_norm": 36.101654052734375,
"learning_rate": 5.912430093187734e-07,
"loss": 0.4147,
"mean_token_accuracy": 0.8905937492847442,
"num_tokens": 3298070.0,
"step": 8000
},
{
"epoch": 3.54924578527063,
"eval_entropy": 0.6814008391072565,
"eval_loss": 1.9532058238983154,
"eval_mean_token_accuracy": 0.6622793906969382,
"eval_num_tokens": 3298070.0,
"eval_runtime": 27.6698,
"eval_samples_per_second": 40.73,
"eval_steps_per_second": 10.192,
"step": 8000
},
{
"entropy": 0.48215484571456907,
"epoch": 3.771073646850044,
"grad_norm": 18.890460968017578,
"learning_rate": 4.329455179426337e-07,
"loss": 0.4135,
"mean_token_accuracy": 0.8913494249582291,
"num_tokens": 3501201.0,
"step": 8500
},
{
"epoch": 3.771073646850044,
"eval_entropy": 0.6757127725063486,
"eval_loss": 1.955000400543213,
"eval_mean_token_accuracy": 0.6628981636348346,
"eval_num_tokens": 3501201.0,
"eval_runtime": 25.0963,
"eval_samples_per_second": 44.907,
"eval_steps_per_second": 11.237,
"step": 8500
},
{
"entropy": 0.478462237238884,
"epoch": 3.992901508429459,
"grad_norm": 13.628124237060547,
"learning_rate": 2.957150983570442e-07,
"loss": 0.4124,
"mean_token_accuracy": 0.8902216546535492,
"num_tokens": 3705447.0,
"step": 9000
},
{
"epoch": 3.992901508429459,
"eval_entropy": 0.6740765650855735,
"eval_loss": 1.9665794372558594,
"eval_mean_token_accuracy": 0.6630828968176605,
"eval_num_tokens": 3705447.0,
"eval_runtime": 25.9015,
"eval_samples_per_second": 43.511,
"eval_steps_per_second": 10.887,
"step": 9000
},
{
"entropy": 0.40061669325828553,
"epoch": 4.2147293700088735,
"grad_norm": 13.977828979492188,
"learning_rate": 1.8226111840579329e-07,
"loss": 0.3101,
"mean_token_accuracy": 0.9211609426736832,
"num_tokens": 3918593.0,
"step": 9500
},
{
"epoch": 4.2147293700088735,
"eval_entropy": 0.616510918377139,
"eval_loss": 2.1938819885253906,
"eval_mean_token_accuracy": 0.6570417981409857,
"eval_num_tokens": 3918593.0,
"eval_runtime": 26.7149,
"eval_samples_per_second": 42.186,
"eval_steps_per_second": 10.556,
"step": 9500
},
{
"entropy": 0.39905927059054375,
"epoch": 4.436557231588288,
"grad_norm": 19.7631893157959,
"learning_rate": 9.482352289090136e-08,
"loss": 0.31,
"mean_token_accuracy": 0.9201671047210693,
"num_tokens": 4120134.0,
"step": 10000
},
{
"epoch": 4.436557231588288,
"eval_entropy": 0.6110973414165753,
"eval_loss": 2.213070869445801,
"eval_mean_token_accuracy": 0.6563809237158891,
"eval_num_tokens": 4120134.0,
"eval_runtime": 26.0758,
"eval_samples_per_second": 43.22,
"eval_steps_per_second": 10.815,
"step": 10000
},
{
"entropy": 0.39375011357665063,
"epoch": 4.658385093167702,
"grad_norm": 16.576738357543945,
"learning_rate": 3.512860989075112e-08,
"loss": 0.304,
"mean_token_accuracy": 0.9224952676296234,
"num_tokens": 4324661.0,
"step": 10500
},
{
"epoch": 4.658385093167702,
"eval_entropy": 0.6000370889479387,
"eval_loss": 2.239130735397339,
"eval_mean_token_accuracy": 0.6568194350875016,
"eval_num_tokens": 4324661.0,
"eval_runtime": 25.5058,
"eval_samples_per_second": 44.186,
"eval_steps_per_second": 11.056,
"step": 10500
},
{
"entropy": 0.38745686200261115,
"epoch": 4.880212954747116,
"grad_norm": 12.541617393493652,
"learning_rate": 4.354948109051016e-09,
"loss": 0.3047,
"mean_token_accuracy": 0.9224670052528381,
"num_tokens": 4529096.0,
"step": 11000
},
{
"epoch": 4.880212954747116,
"eval_entropy": 0.6041199495183661,
"eval_loss": 2.2367258071899414,
"eval_mean_token_accuracy": 0.6559004331311435,
"eval_num_tokens": 4529096.0,
"eval_runtime": 25.881,
"eval_samples_per_second": 43.545,
"eval_steps_per_second": 10.896,
"step": 11000
}
],
"logging_steps": 500,
"max_steps": 11270,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.088531545414896e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}