{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9948364888123926,
"eval_steps": 500,
"global_step": 870,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03442340791738382,
"grad_norm": 3.6819635678440528,
"learning_rate": 5e-06,
"loss": 1.0121,
"step": 10
},
{
"epoch": 0.06884681583476764,
"grad_norm": 5.925274879357907,
"learning_rate": 5e-06,
"loss": 0.8959,
"step": 20
},
{
"epoch": 0.10327022375215146,
"grad_norm": 1.9452350190286591,
"learning_rate": 5e-06,
"loss": 0.877,
"step": 30
},
{
"epoch": 0.13769363166953527,
"grad_norm": 1.180696985048175,
"learning_rate": 5e-06,
"loss": 0.8454,
"step": 40
},
{
"epoch": 0.1721170395869191,
"grad_norm": 0.9223333997999451,
"learning_rate": 5e-06,
"loss": 0.8155,
"step": 50
},
{
"epoch": 0.20654044750430292,
"grad_norm": 0.9307430421223478,
"learning_rate": 5e-06,
"loss": 0.7988,
"step": 60
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.791390062841487,
"learning_rate": 5e-06,
"loss": 0.7889,
"step": 70
},
{
"epoch": 0.27538726333907054,
"grad_norm": 0.6218724507386446,
"learning_rate": 5e-06,
"loss": 0.7806,
"step": 80
},
{
"epoch": 0.3098106712564544,
"grad_norm": 0.7264494941987614,
"learning_rate": 5e-06,
"loss": 0.7705,
"step": 90
},
{
"epoch": 0.3442340791738382,
"grad_norm": 0.6924488359878446,
"learning_rate": 5e-06,
"loss": 0.7613,
"step": 100
},
{
"epoch": 0.37865748709122204,
"grad_norm": 0.8803235860035055,
"learning_rate": 5e-06,
"loss": 0.761,
"step": 110
},
{
"epoch": 0.41308089500860584,
"grad_norm": 0.6841880149421407,
"learning_rate": 5e-06,
"loss": 0.7555,
"step": 120
},
{
"epoch": 0.4475043029259897,
"grad_norm": 0.667216275591224,
"learning_rate": 5e-06,
"loss": 0.7507,
"step": 130
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.6640712451607944,
"learning_rate": 5e-06,
"loss": 0.7474,
"step": 140
},
{
"epoch": 0.5163511187607573,
"grad_norm": 0.5645446586749623,
"learning_rate": 5e-06,
"loss": 0.7452,
"step": 150
},
{
"epoch": 0.5507745266781411,
"grad_norm": 0.7052276498344197,
"learning_rate": 5e-06,
"loss": 0.7461,
"step": 160
},
{
"epoch": 0.5851979345955249,
"grad_norm": 0.7026553647920556,
"learning_rate": 5e-06,
"loss": 0.7467,
"step": 170
},
{
"epoch": 0.6196213425129088,
"grad_norm": 0.5956245872933223,
"learning_rate": 5e-06,
"loss": 0.7483,
"step": 180
},
{
"epoch": 0.6540447504302926,
"grad_norm": 0.5945615019725103,
"learning_rate": 5e-06,
"loss": 0.7424,
"step": 190
},
{
"epoch": 0.6884681583476764,
"grad_norm": 0.5926282356688969,
"learning_rate": 5e-06,
"loss": 0.7393,
"step": 200
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.529261150364574,
"learning_rate": 5e-06,
"loss": 0.734,
"step": 210
},
{
"epoch": 0.7573149741824441,
"grad_norm": 0.70244337869977,
"learning_rate": 5e-06,
"loss": 0.7344,
"step": 220
},
{
"epoch": 0.7917383820998278,
"grad_norm": 0.5208460618800276,
"learning_rate": 5e-06,
"loss": 0.7332,
"step": 230
},
{
"epoch": 0.8261617900172117,
"grad_norm": 0.6781034767797038,
"learning_rate": 5e-06,
"loss": 0.7323,
"step": 240
},
{
"epoch": 0.8605851979345955,
"grad_norm": 0.6364816529741125,
"learning_rate": 5e-06,
"loss": 0.7311,
"step": 250
},
{
"epoch": 0.8950086058519794,
"grad_norm": 0.6857634900258707,
"learning_rate": 5e-06,
"loss": 0.73,
"step": 260
},
{
"epoch": 0.9294320137693631,
"grad_norm": 0.7533537266440626,
"learning_rate": 5e-06,
"loss": 0.7274,
"step": 270
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.668483865607749,
"learning_rate": 5e-06,
"loss": 0.7281,
"step": 280
},
{
"epoch": 0.9982788296041308,
"grad_norm": 0.640731101658078,
"learning_rate": 5e-06,
"loss": 0.73,
"step": 290
},
{
"epoch": 0.9982788296041308,
"eval_loss": 0.7240723967552185,
"eval_runtime": 311.5856,
"eval_samples_per_second": 25.123,
"eval_steps_per_second": 0.395,
"step": 290
},
{
"epoch": 1.0327022375215147,
"grad_norm": 0.8232801616508892,
"learning_rate": 5e-06,
"loss": 0.7164,
"step": 300
},
{
"epoch": 1.0671256454388984,
"grad_norm": 0.6877231272007057,
"learning_rate": 5e-06,
"loss": 0.6796,
"step": 310
},
{
"epoch": 1.1015490533562822,
"grad_norm": 0.7867017260974334,
"learning_rate": 5e-06,
"loss": 0.6731,
"step": 320
},
{
"epoch": 1.1359724612736661,
"grad_norm": 0.6102991857765998,
"learning_rate": 5e-06,
"loss": 0.6804,
"step": 330
},
{
"epoch": 1.1703958691910499,
"grad_norm": 0.7250816197036796,
"learning_rate": 5e-06,
"loss": 0.676,
"step": 340
},
{
"epoch": 1.2048192771084336,
"grad_norm": 0.6971293258638788,
"learning_rate": 5e-06,
"loss": 0.6758,
"step": 350
},
{
"epoch": 1.2392426850258176,
"grad_norm": 0.6980055976515607,
"learning_rate": 5e-06,
"loss": 0.6812,
"step": 360
},
{
"epoch": 1.2736660929432013,
"grad_norm": 0.6257924181521026,
"learning_rate": 5e-06,
"loss": 0.6829,
"step": 370
},
{
"epoch": 1.3080895008605853,
"grad_norm": 0.6620444223829324,
"learning_rate": 5e-06,
"loss": 0.6787,
"step": 380
},
{
"epoch": 1.342512908777969,
"grad_norm": 0.6019054885784155,
"learning_rate": 5e-06,
"loss": 0.6793,
"step": 390
},
{
"epoch": 1.3769363166953528,
"grad_norm": 0.6430051610733118,
"learning_rate": 5e-06,
"loss": 0.6774,
"step": 400
},
{
"epoch": 1.4113597246127367,
"grad_norm": 0.5807368932507306,
"learning_rate": 5e-06,
"loss": 0.6812,
"step": 410
},
{
"epoch": 1.4457831325301205,
"grad_norm": 0.6470925978408152,
"learning_rate": 5e-06,
"loss": 0.6747,
"step": 420
},
{
"epoch": 1.4802065404475044,
"grad_norm": 0.6423216146537339,
"learning_rate": 5e-06,
"loss": 0.6764,
"step": 430
},
{
"epoch": 1.5146299483648882,
"grad_norm": 0.5134608684735672,
"learning_rate": 5e-06,
"loss": 0.6744,
"step": 440
},
{
"epoch": 1.549053356282272,
"grad_norm": 0.5563124728753217,
"learning_rate": 5e-06,
"loss": 0.6728,
"step": 450
},
{
"epoch": 1.5834767641996557,
"grad_norm": 0.6269436233978866,
"learning_rate": 5e-06,
"loss": 0.6761,
"step": 460
},
{
"epoch": 1.6179001721170396,
"grad_norm": 0.589734978264397,
"learning_rate": 5e-06,
"loss": 0.6792,
"step": 470
},
{
"epoch": 1.6523235800344234,
"grad_norm": 0.6327759222361318,
"learning_rate": 5e-06,
"loss": 0.6768,
"step": 480
},
{
"epoch": 1.6867469879518073,
"grad_norm": 0.6962103362892431,
"learning_rate": 5e-06,
"loss": 0.677,
"step": 490
},
{
"epoch": 1.721170395869191,
"grad_norm": 0.5760289071453567,
"learning_rate": 5e-06,
"loss": 0.6799,
"step": 500
},
{
"epoch": 1.7555938037865748,
"grad_norm": 0.6442600102377914,
"learning_rate": 5e-06,
"loss": 0.6773,
"step": 510
},
{
"epoch": 1.7900172117039586,
"grad_norm": 0.7715377748849698,
"learning_rate": 5e-06,
"loss": 0.6761,
"step": 520
},
{
"epoch": 1.8244406196213425,
"grad_norm": 0.5533000553027299,
"learning_rate": 5e-06,
"loss": 0.6736,
"step": 530
},
{
"epoch": 1.8588640275387265,
"grad_norm": 0.6543045883003663,
"learning_rate": 5e-06,
"loss": 0.6724,
"step": 540
},
{
"epoch": 1.8932874354561102,
"grad_norm": 0.7812179906299692,
"learning_rate": 5e-06,
"loss": 0.6745,
"step": 550
},
{
"epoch": 1.927710843373494,
"grad_norm": 0.7706494630311692,
"learning_rate": 5e-06,
"loss": 0.6744,
"step": 560
},
{
"epoch": 1.9621342512908777,
"grad_norm": 0.6182434646754749,
"learning_rate": 5e-06,
"loss": 0.6755,
"step": 570
},
{
"epoch": 1.9965576592082617,
"grad_norm": 0.6295557645635617,
"learning_rate": 5e-06,
"loss": 0.6787,
"step": 580
},
{
"epoch": 2.0,
"eval_loss": 0.7113586664199829,
"eval_runtime": 311.8588,
"eval_samples_per_second": 25.101,
"eval_steps_per_second": 0.394,
"step": 581
},
{
"epoch": 2.0309810671256456,
"grad_norm": 1.2061368930293643,
"learning_rate": 5e-06,
"loss": 0.6659,
"step": 590
},
{
"epoch": 2.0654044750430294,
"grad_norm": 1.3280429163631766,
"learning_rate": 5e-06,
"loss": 0.6232,
"step": 600
},
{
"epoch": 2.099827882960413,
"grad_norm": 0.8615634723401497,
"learning_rate": 5e-06,
"loss": 0.6239,
"step": 610
},
{
"epoch": 2.134251290877797,
"grad_norm": 0.7137137740055365,
"learning_rate": 5e-06,
"loss": 0.6196,
"step": 620
},
{
"epoch": 2.1686746987951806,
"grad_norm": 0.7012119673623688,
"learning_rate": 5e-06,
"loss": 0.6257,
"step": 630
},
{
"epoch": 2.2030981067125643,
"grad_norm": 0.7539553553577881,
"learning_rate": 5e-06,
"loss": 0.6232,
"step": 640
},
{
"epoch": 2.2375215146299485,
"grad_norm": 0.7635231238603634,
"learning_rate": 5e-06,
"loss": 0.6203,
"step": 650
},
{
"epoch": 2.2719449225473323,
"grad_norm": 0.6908410296367468,
"learning_rate": 5e-06,
"loss": 0.6254,
"step": 660
},
{
"epoch": 2.306368330464716,
"grad_norm": 0.6587745940287006,
"learning_rate": 5e-06,
"loss": 0.6301,
"step": 670
},
{
"epoch": 2.3407917383820998,
"grad_norm": 0.5798868468674587,
"learning_rate": 5e-06,
"loss": 0.6279,
"step": 680
},
{
"epoch": 2.3752151462994835,
"grad_norm": 0.8440728118550425,
"learning_rate": 5e-06,
"loss": 0.6287,
"step": 690
},
{
"epoch": 2.4096385542168672,
"grad_norm": 0.6066489275997706,
"learning_rate": 5e-06,
"loss": 0.6295,
"step": 700
},
{
"epoch": 2.4440619621342514,
"grad_norm": 0.7165812340817078,
"learning_rate": 5e-06,
"loss": 0.6277,
"step": 710
},
{
"epoch": 2.478485370051635,
"grad_norm": 0.6122168594678861,
"learning_rate": 5e-06,
"loss": 0.6293,
"step": 720
},
{
"epoch": 2.512908777969019,
"grad_norm": 0.81573767147419,
"learning_rate": 5e-06,
"loss": 0.6287,
"step": 730
},
{
"epoch": 2.5473321858864026,
"grad_norm": 0.5606648215554753,
"learning_rate": 5e-06,
"loss": 0.6294,
"step": 740
},
{
"epoch": 2.581755593803787,
"grad_norm": 0.6406975384981994,
"learning_rate": 5e-06,
"loss": 0.6232,
"step": 750
},
{
"epoch": 2.6161790017211706,
"grad_norm": 0.6856546267607884,
"learning_rate": 5e-06,
"loss": 0.6291,
"step": 760
},
{
"epoch": 2.6506024096385543,
"grad_norm": 0.6347450877099359,
"learning_rate": 5e-06,
"loss": 0.6283,
"step": 770
},
{
"epoch": 2.685025817555938,
"grad_norm": 0.6621950677045059,
"learning_rate": 5e-06,
"loss": 0.6305,
"step": 780
},
{
"epoch": 2.719449225473322,
"grad_norm": 0.6174426541448764,
"learning_rate": 5e-06,
"loss": 0.6255,
"step": 790
},
{
"epoch": 2.7538726333907055,
"grad_norm": 0.6772601640104119,
"learning_rate": 5e-06,
"loss": 0.6314,
"step": 800
},
{
"epoch": 2.7882960413080893,
"grad_norm": 0.5940690265376317,
"learning_rate": 5e-06,
"loss": 0.6261,
"step": 810
},
{
"epoch": 2.8227194492254735,
"grad_norm": 0.5557807625472435,
"learning_rate": 5e-06,
"loss": 0.6266,
"step": 820
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.7023723168127282,
"learning_rate": 5e-06,
"loss": 0.6278,
"step": 830
},
{
"epoch": 2.891566265060241,
"grad_norm": 0.5869122563169644,
"learning_rate": 5e-06,
"loss": 0.6272,
"step": 840
},
{
"epoch": 2.9259896729776247,
"grad_norm": 0.6112033798118853,
"learning_rate": 5e-06,
"loss": 0.6304,
"step": 850
},
{
"epoch": 2.960413080895009,
"grad_norm": 0.6445615202118182,
"learning_rate": 5e-06,
"loss": 0.631,
"step": 860
},
{
"epoch": 2.9948364888123926,
"grad_norm": 0.6223406143063472,
"learning_rate": 5e-06,
"loss": 0.6305,
"step": 870
},
{
"epoch": 2.9948364888123926,
"eval_loss": 0.7126539349555969,
"eval_runtime": 315.6621,
"eval_samples_per_second": 24.799,
"eval_steps_per_second": 0.39,
"step": 870
},
{
"epoch": 2.9948364888123926,
"step": 870,
"total_flos": 1457073023877120.0,
"train_loss": 0.6931865083760229,
"train_runtime": 51465.7493,
"train_samples_per_second": 8.669,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 870,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1457073023877120.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}