Gemma3_4B_Instruct_01_CharFlipped / trainer_state.json
auroresearch's picture
Upload trained model
c70e395 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 11270,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.0909643173217773,
"epoch": 0.00044365572315882877,
"grad_norm": 56.12086486816406,
"learning_rate": 0.0,
"loss": 3.0658,
"mean_token_accuracy": 0.48502805829048157,
"num_tokens": 3210.0,
"step": 1
},
{
"entropy": 1.3591915201806353,
"epoch": 0.22182786157941436,
"grad_norm": 19.471004486083984,
"learning_rate": 2.9905649405771675e-06,
"loss": 1.4761,
"mean_token_accuracy": 0.6628800271030418,
"num_tokens": 206054.0,
"step": 500
},
{
"epoch": 0.22182786157941436,
"eval_entropy": 1.2849119341542534,
"eval_loss": 1.3277472257614136,
"eval_mean_token_accuracy": 0.6782622895342239,
"eval_num_tokens": 206054.0,
"eval_runtime": 33.7353,
"eval_samples_per_second": 33.407,
"eval_steps_per_second": 8.359,
"step": 500
},
{
"entropy": 1.3819819614887237,
"epoch": 0.44365572315882873,
"grad_norm": 15.931497573852539,
"learning_rate": 2.9523065141902646e-06,
"loss": 1.3619,
"mean_token_accuracy": 0.6711735002994538,
"num_tokens": 417292.0,
"step": 1000
},
{
"epoch": 0.44365572315882873,
"eval_entropy": 1.2668320029339892,
"eval_loss": 1.3029544353485107,
"eval_mean_token_accuracy": 0.6819594891358775,
"eval_num_tokens": 417292.0,
"eval_runtime": 33.7783,
"eval_samples_per_second": 33.365,
"eval_steps_per_second": 8.349,
"step": 1000
},
{
"entropy": 1.3488010201454164,
"epoch": 0.6654835847382431,
"grad_norm": 15.746204376220703,
"learning_rate": 2.885374907463648e-06,
"loss": 1.3193,
"mean_token_accuracy": 0.6784288173913956,
"num_tokens": 621936.0,
"step": 1500
},
{
"epoch": 0.6654835847382431,
"eval_entropy": 1.278814361661884,
"eval_loss": 1.2931554317474365,
"eval_mean_token_accuracy": 0.6839457212399084,
"eval_num_tokens": 621936.0,
"eval_runtime": 34.1849,
"eval_samples_per_second": 32.968,
"eval_steps_per_second": 8.249,
"step": 1500
},
{
"entropy": 1.3254250472784042,
"epoch": 0.8873114463176575,
"grad_norm": 14.376286506652832,
"learning_rate": 2.7910915646044115e-06,
"loss": 1.2938,
"mean_token_accuracy": 0.6840605319142342,
"num_tokens": 830692.0,
"step": 2000
},
{
"epoch": 0.8873114463176575,
"eval_entropy": 1.2728644338905388,
"eval_loss": 1.2851120233535767,
"eval_mean_token_accuracy": 0.6846338094126249,
"eval_num_tokens": 830692.0,
"eval_runtime": 34.3989,
"eval_samples_per_second": 32.763,
"eval_steps_per_second": 8.198,
"step": 2000
},
{
"entropy": 1.17150056040287,
"epoch": 1.109139307897072,
"grad_norm": 17.192914962768555,
"learning_rate": 2.671317940661071e-06,
"loss": 1.1239,
"mean_token_accuracy": 0.7167075086236,
"num_tokens": 1036856.0,
"step": 2500
},
{
"epoch": 1.109139307897072,
"eval_entropy": 0.9878316506849113,
"eval_loss": 1.3588604927062988,
"eval_mean_token_accuracy": 0.6802238186834552,
"eval_num_tokens": 1036856.0,
"eval_runtime": 34.9345,
"eval_samples_per_second": 32.26,
"eval_steps_per_second": 8.072,
"step": 2500
},
{
"entropy": 1.0019279054403305,
"epoch": 1.3309671694764862,
"grad_norm": 15.853100776672363,
"learning_rate": 2.5284187504412197e-06,
"loss": 0.9613,
"mean_token_accuracy": 0.751466910123825,
"num_tokens": 1236950.0,
"step": 3000
},
{
"epoch": 1.3309671694764862,
"eval_entropy": 1.062070348161332,
"eval_loss": 1.3535487651824951,
"eval_mean_token_accuracy": 0.6781793389971374,
"eval_num_tokens": 1236950.0,
"eval_runtime": 34.1483,
"eval_samples_per_second": 33.003,
"eval_steps_per_second": 8.258,
"step": 3000
},
{
"entropy": 1.0105634088516235,
"epoch": 1.5527950310559007,
"grad_norm": 18.761213302612305,
"learning_rate": 2.365215281470278e-06,
"loss": 0.974,
"mean_token_accuracy": 0.7476956839561463,
"num_tokens": 1445121.0,
"step": 3500
},
{
"epoch": 1.5527950310559007,
"eval_entropy": 1.0477840112033465,
"eval_loss": 1.3453229665756226,
"eval_mean_token_accuracy": 0.6804118481933648,
"eval_num_tokens": 1445121.0,
"eval_runtime": 33.969,
"eval_samples_per_second": 33.177,
"eval_steps_per_second": 8.302,
"step": 3500
},
{
"entropy": 1.030442102789879,
"epoch": 1.774622892635315,
"grad_norm": 18.44453239440918,
"learning_rate": 2.184929692743022e-06,
"loss": 0.9963,
"mean_token_accuracy": 0.743294857621193,
"num_tokens": 1656262.0,
"step": 4000
},
{
"epoch": 1.774622892635315,
"eval_entropy": 1.0542190546262349,
"eval_loss": 1.3501262664794922,
"eval_mean_token_accuracy": 0.6793523312460447,
"eval_num_tokens": 1656262.0,
"eval_runtime": 34.4167,
"eval_samples_per_second": 32.746,
"eval_steps_per_second": 8.194,
"step": 4000
},
{
"entropy": 1.0064435719251632,
"epoch": 1.9964507542147294,
"grad_norm": 11.64708137512207,
"learning_rate": 1.9911213989888633e-06,
"loss": 0.9697,
"mean_token_accuracy": 0.7490394617319107,
"num_tokens": 1867079.0,
"step": 4500
},
{
"epoch": 1.9964507542147294,
"eval_entropy": 1.0442877196251077,
"eval_loss": 1.3445523977279663,
"eval_mean_token_accuracy": 0.6808659062317922,
"eval_num_tokens": 1867079.0,
"eval_runtime": 34.1214,
"eval_samples_per_second": 33.029,
"eval_steps_per_second": 8.265,
"step": 4500
},
{
"entropy": 0.7014432374835015,
"epoch": 2.218278615794144,
"grad_norm": 16.21389389038086,
"learning_rate": 1.7876167964291556e-06,
"loss": 0.6614,
"mean_token_accuracy": 0.824757103562355,
"num_tokens": 2077059.0,
"step": 5000
},
{
"epoch": 2.218278615794144,
"eval_entropy": 0.8500108015029988,
"eval_loss": 1.556144118309021,
"eval_mean_token_accuracy": 0.6692640443220206,
"eval_num_tokens": 2077059.0,
"eval_runtime": 35.0607,
"eval_samples_per_second": 32.144,
"eval_steps_per_second": 8.043,
"step": 5000
},
{
"entropy": 0.6947842536568641,
"epoch": 2.440106477373558,
"grad_norm": 14.940324783325195,
"learning_rate": 1.5784337174650764e-06,
"loss": 0.6639,
"mean_token_accuracy": 0.8239517654180527,
"num_tokens": 2287905.0,
"step": 5500
},
{
"epoch": 2.440106477373558,
"eval_entropy": 0.8482754902636751,
"eval_loss": 1.5751625299453735,
"eval_mean_token_accuracy": 0.6686883025558282,
"eval_num_tokens": 2287905.0,
"eval_runtime": 34.8507,
"eval_samples_per_second": 32.338,
"eval_steps_per_second": 8.092,
"step": 5500
},
{
"entropy": 0.7052372665405273,
"epoch": 2.6619343389529724,
"grad_norm": 22.80525779724121,
"learning_rate": 1.3677021058024131e-06,
"loss": 0.6723,
"mean_token_accuracy": 0.8226301606893539,
"num_tokens": 2491365.0,
"step": 6000
},
{
"epoch": 2.6619343389529724,
"eval_entropy": 0.8458687763476203,
"eval_loss": 1.5676090717315674,
"eval_mean_token_accuracy": 0.6686736252290982,
"eval_num_tokens": 2491365.0,
"eval_runtime": 34.7445,
"eval_samples_per_second": 32.437,
"eval_steps_per_second": 8.116,
"step": 6000
},
{
"entropy": 0.6978846169710159,
"epoch": 2.883762200532387,
"grad_norm": 60.48308181762695,
"learning_rate": 1.1595824781402537e-06,
"loss": 0.6656,
"mean_token_accuracy": 0.8238938546180725,
"num_tokens": 2695935.0,
"step": 6500
},
{
"epoch": 2.883762200532387,
"eval_entropy": 0.8410944103771913,
"eval_loss": 1.569481611251831,
"eval_mean_token_accuracy": 0.6696258430362593,
"eval_num_tokens": 2695935.0,
"eval_runtime": 33.9566,
"eval_samples_per_second": 33.189,
"eval_steps_per_second": 8.305,
"step": 6500
},
{
"entropy": 0.5982560626268387,
"epoch": 3.1055900621118013,
"grad_norm": 20.41059112548828,
"learning_rate": 9.581837822509056e-07,
"loss": 0.5633,
"mean_token_accuracy": 0.8522541173696518,
"num_tokens": 2905707.0,
"step": 7000
},
{
"epoch": 3.1055900621118013,
"eval_entropy": 0.6810612559107179,
"eval_loss": 1.9356529712677002,
"eval_mean_token_accuracy": 0.6580131651027828,
"eval_num_tokens": 2905707.0,
"eval_runtime": 34.3377,
"eval_samples_per_second": 32.821,
"eval_steps_per_second": 8.213,
"step": 7000
},
{
"entropy": 0.4691380001306534,
"epoch": 3.3274179236912156,
"grad_norm": 12.475813865661621,
"learning_rate": 7.674822731955381e-07,
"loss": 0.4464,
"mean_token_accuracy": 0.8821721758842468,
"num_tokens": 3119436.0,
"step": 7500
},
{
"epoch": 3.3274179236912156,
"eval_entropy": 0.679166760639096,
"eval_loss": 1.958307147026062,
"eval_mean_token_accuracy": 0.6585667126990379,
"eval_num_tokens": 3119436.0,
"eval_runtime": 34.2034,
"eval_samples_per_second": 32.95,
"eval_steps_per_second": 8.245,
"step": 7500
},
{
"entropy": 0.4722338750064373,
"epoch": 3.54924578527063,
"grad_norm": 29.548173904418945,
"learning_rate": 5.912430093187734e-07,
"loss": 0.4503,
"mean_token_accuracy": 0.8828264862298966,
"num_tokens": 3323865.0,
"step": 8000
},
{
"epoch": 3.54924578527063,
"eval_entropy": 0.6674613842727445,
"eval_loss": 2.0193898677825928,
"eval_mean_token_accuracy": 0.6584047967871876,
"eval_num_tokens": 3323865.0,
"eval_runtime": 34.1338,
"eval_samples_per_second": 33.017,
"eval_steps_per_second": 8.262,
"step": 8000
},
{
"entropy": 0.4721588716506958,
"epoch": 3.771073646850044,
"grad_norm": 13.878427505493164,
"learning_rate": 4.329455179426337e-07,
"loss": 0.4503,
"mean_token_accuracy": 0.8825598682165146,
"num_tokens": 3528694.0,
"step": 8500
},
{
"epoch": 3.771073646850044,
"eval_entropy": 0.6828910686234211,
"eval_loss": 1.9711394309997559,
"eval_mean_token_accuracy": 0.6582514842351278,
"eval_num_tokens": 3528694.0,
"eval_runtime": 15.9432,
"eval_samples_per_second": 70.688,
"eval_steps_per_second": 17.688,
"step": 8500
},
{
"entropy": 0.4666064047217369,
"epoch": 3.992901508429459,
"grad_norm": 12.754748344421387,
"learning_rate": 2.957150983570442e-07,
"loss": 0.4459,
"mean_token_accuracy": 0.8827585883140564,
"num_tokens": 3734266.0,
"step": 9000
},
{
"epoch": 3.992901508429459,
"eval_entropy": 0.6604011273341821,
"eval_loss": 2.0170204639434814,
"eval_mean_token_accuracy": 0.6589750934999885,
"eval_num_tokens": 3734266.0,
"eval_runtime": 16.004,
"eval_samples_per_second": 70.42,
"eval_steps_per_second": 17.621,
"step": 9000
},
{
"entropy": 0.3668721870481968,
"epoch": 4.2147293700088735,
"grad_norm": 12.95799446105957,
"learning_rate": 1.8226111840579329e-07,
"loss": 0.3403,
"mean_token_accuracy": 0.9130678927898407,
"num_tokens": 3949370.0,
"step": 9500
},
{
"epoch": 4.2147293700088735,
"eval_entropy": 0.5831356482936981,
"eval_loss": 2.341752052307129,
"eval_mean_token_accuracy": 0.6525708041715284,
"eval_num_tokens": 3949370.0,
"eval_runtime": 16.1748,
"eval_samples_per_second": 69.676,
"eval_steps_per_second": 17.435,
"step": 9500
},
{
"entropy": 0.3616520670354366,
"epoch": 4.436557231588288,
"grad_norm": 10.559103965759277,
"learning_rate": 9.482352289090136e-08,
"loss": 0.3364,
"mean_token_accuracy": 0.913170881986618,
"num_tokens": 4152059.0,
"step": 10000
},
{
"epoch": 4.436557231588288,
"eval_entropy": 0.5844697422803716,
"eval_loss": 2.3465816974639893,
"eval_mean_token_accuracy": 0.6521602052111998,
"eval_num_tokens": 4152059.0,
"eval_runtime": 16.0213,
"eval_samples_per_second": 70.344,
"eval_steps_per_second": 17.602,
"step": 10000
},
{
"entropy": 0.354730488717556,
"epoch": 4.658385093167702,
"grad_norm": 11.545758247375488,
"learning_rate": 3.512860989075112e-08,
"loss": 0.3285,
"mean_token_accuracy": 0.9157207467556,
"num_tokens": 4357778.0,
"step": 10500
},
{
"epoch": 4.658385093167702,
"eval_entropy": 0.5713372949167346,
"eval_loss": 2.3891968727111816,
"eval_mean_token_accuracy": 0.652075339293649,
"eval_num_tokens": 4357778.0,
"eval_runtime": 15.9489,
"eval_samples_per_second": 70.663,
"eval_steps_per_second": 17.681,
"step": 10500
},
{
"entropy": 0.34955123990774156,
"epoch": 4.880212954747116,
"grad_norm": 11.0723237991333,
"learning_rate": 4.354948109051016e-09,
"loss": 0.3294,
"mean_token_accuracy": 0.9162505613565445,
"num_tokens": 4563930.0,
"step": 11000
},
{
"epoch": 4.880212954747116,
"eval_entropy": 0.5768938803292335,
"eval_loss": 2.3820862770080566,
"eval_mean_token_accuracy": 0.6512273330215022,
"eval_num_tokens": 4563930.0,
"eval_runtime": 16.1542,
"eval_samples_per_second": 69.765,
"eval_steps_per_second": 17.457,
"step": 11000
}
],
"logging_steps": 500,
"max_steps": 11270,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.097310295674384e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}