{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.995899772209567,
"eval_steps": 500,
"global_step": 822,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03644646924829157,
"grad_norm": 13.273536409408566,
"learning_rate": 5e-06,
"loss": 1.0215,
"step": 10
},
{
"epoch": 0.07289293849658314,
"grad_norm": 2.0256934340222124,
"learning_rate": 5e-06,
"loss": 0.901,
"step": 20
},
{
"epoch": 0.10933940774487472,
"grad_norm": 1.1110745496419128,
"learning_rate": 5e-06,
"loss": 0.8577,
"step": 30
},
{
"epoch": 0.14578587699316628,
"grad_norm": 1.2500399664289437,
"learning_rate": 5e-06,
"loss": 0.8319,
"step": 40
},
{
"epoch": 0.18223234624145787,
"grad_norm": 0.993614371192878,
"learning_rate": 5e-06,
"loss": 0.8081,
"step": 50
},
{
"epoch": 0.21867881548974943,
"grad_norm": 0.97363008151794,
"learning_rate": 5e-06,
"loss": 0.7963,
"step": 60
},
{
"epoch": 0.255125284738041,
"grad_norm": 0.9129099469883597,
"learning_rate": 5e-06,
"loss": 0.7888,
"step": 70
},
{
"epoch": 0.29157175398633256,
"grad_norm": 0.9344193654869811,
"learning_rate": 5e-06,
"loss": 0.7758,
"step": 80
},
{
"epoch": 0.32801822323462415,
"grad_norm": 1.0188210165751896,
"learning_rate": 5e-06,
"loss": 0.775,
"step": 90
},
{
"epoch": 0.36446469248291574,
"grad_norm": 0.9232347440747942,
"learning_rate": 5e-06,
"loss": 0.7692,
"step": 100
},
{
"epoch": 0.4009111617312073,
"grad_norm": 0.6783140292643164,
"learning_rate": 5e-06,
"loss": 0.7621,
"step": 110
},
{
"epoch": 0.43735763097949887,
"grad_norm": 0.6184031681315292,
"learning_rate": 5e-06,
"loss": 0.7556,
"step": 120
},
{
"epoch": 0.47380410022779046,
"grad_norm": 0.5469349607429109,
"learning_rate": 5e-06,
"loss": 0.7551,
"step": 130
},
{
"epoch": 0.510250569476082,
"grad_norm": 0.8877445271536203,
"learning_rate": 5e-06,
"loss": 0.7554,
"step": 140
},
{
"epoch": 0.5466970387243736,
"grad_norm": 1.0291306012367956,
"learning_rate": 5e-06,
"loss": 0.7495,
"step": 150
},
{
"epoch": 0.5831435079726651,
"grad_norm": 0.6607692936776239,
"learning_rate": 5e-06,
"loss": 0.7459,
"step": 160
},
{
"epoch": 0.6195899772209568,
"grad_norm": 0.6336927673267501,
"learning_rate": 5e-06,
"loss": 0.748,
"step": 170
},
{
"epoch": 0.6560364464692483,
"grad_norm": 0.8944901762240539,
"learning_rate": 5e-06,
"loss": 0.7452,
"step": 180
},
{
"epoch": 0.6924829157175398,
"grad_norm": 0.8843897330408937,
"learning_rate": 5e-06,
"loss": 0.7445,
"step": 190
},
{
"epoch": 0.7289293849658315,
"grad_norm": 0.6972008986400734,
"learning_rate": 5e-06,
"loss": 0.7385,
"step": 200
},
{
"epoch": 0.765375854214123,
"grad_norm": 0.5894622066220608,
"learning_rate": 5e-06,
"loss": 0.7375,
"step": 210
},
{
"epoch": 0.8018223234624146,
"grad_norm": 0.5755055010916849,
"learning_rate": 5e-06,
"loss": 0.7366,
"step": 220
},
{
"epoch": 0.8382687927107062,
"grad_norm": 0.6956357933104967,
"learning_rate": 5e-06,
"loss": 0.7418,
"step": 230
},
{
"epoch": 0.8747152619589977,
"grad_norm": 0.7222014717098794,
"learning_rate": 5e-06,
"loss": 0.7289,
"step": 240
},
{
"epoch": 0.9111617312072893,
"grad_norm": 0.6509301599056833,
"learning_rate": 5e-06,
"loss": 0.7353,
"step": 250
},
{
"epoch": 0.9476082004555809,
"grad_norm": 0.5946415096003963,
"learning_rate": 5e-06,
"loss": 0.739,
"step": 260
},
{
"epoch": 0.9840546697038725,
"grad_norm": 0.6399983053851734,
"learning_rate": 5e-06,
"loss": 0.7315,
"step": 270
},
{
"epoch": 0.9986332574031891,
"eval_loss": 0.7315455079078674,
"eval_runtime": 290.5947,
"eval_samples_per_second": 25.437,
"eval_steps_per_second": 0.399,
"step": 274
},
{
"epoch": 1.020501138952164,
"grad_norm": 0.8395093526707935,
"learning_rate": 5e-06,
"loss": 0.747,
"step": 280
},
{
"epoch": 1.0569476082004556,
"grad_norm": 0.9199729633560787,
"learning_rate": 5e-06,
"loss": 0.6787,
"step": 290
},
{
"epoch": 1.0933940774487472,
"grad_norm": 0.6628677057491944,
"learning_rate": 5e-06,
"loss": 0.6791,
"step": 300
},
{
"epoch": 1.1298405466970387,
"grad_norm": 0.6614989831751948,
"learning_rate": 5e-06,
"loss": 0.6854,
"step": 310
},
{
"epoch": 1.1662870159453302,
"grad_norm": 0.6964522514874895,
"learning_rate": 5e-06,
"loss": 0.6763,
"step": 320
},
{
"epoch": 1.2027334851936218,
"grad_norm": 0.7090306269606215,
"learning_rate": 5e-06,
"loss": 0.6753,
"step": 330
},
{
"epoch": 1.2391799544419135,
"grad_norm": 0.648532712130652,
"learning_rate": 5e-06,
"loss": 0.68,
"step": 340
},
{
"epoch": 1.275626423690205,
"grad_norm": 0.7822954196339824,
"learning_rate": 5e-06,
"loss": 0.6817,
"step": 350
},
{
"epoch": 1.3120728929384966,
"grad_norm": 0.6766423459315555,
"learning_rate": 5e-06,
"loss": 0.6803,
"step": 360
},
{
"epoch": 1.3485193621867881,
"grad_norm": 0.7731309625470634,
"learning_rate": 5e-06,
"loss": 0.6788,
"step": 370
},
{
"epoch": 1.3849658314350797,
"grad_norm": 0.6229285700860081,
"learning_rate": 5e-06,
"loss": 0.6856,
"step": 380
},
{
"epoch": 1.4214123006833712,
"grad_norm": 0.6927410350677501,
"learning_rate": 5e-06,
"loss": 0.6808,
"step": 390
},
{
"epoch": 1.4578587699316627,
"grad_norm": 0.834486739783265,
"learning_rate": 5e-06,
"loss": 0.6772,
"step": 400
},
{
"epoch": 1.4943052391799545,
"grad_norm": 0.7099676513539387,
"learning_rate": 5e-06,
"loss": 0.6803,
"step": 410
},
{
"epoch": 1.530751708428246,
"grad_norm": 0.6104516289365347,
"learning_rate": 5e-06,
"loss": 0.683,
"step": 420
},
{
"epoch": 1.5671981776765376,
"grad_norm": 0.5971848121166693,
"learning_rate": 5e-06,
"loss": 0.6787,
"step": 430
},
{
"epoch": 1.603644646924829,
"grad_norm": 0.6649414637192727,
"learning_rate": 5e-06,
"loss": 0.6849,
"step": 440
},
{
"epoch": 1.6400911161731209,
"grad_norm": 0.7320907085872882,
"learning_rate": 5e-06,
"loss": 0.6817,
"step": 450
},
{
"epoch": 1.6765375854214124,
"grad_norm": 0.5705453457499549,
"learning_rate": 5e-06,
"loss": 0.6836,
"step": 460
},
{
"epoch": 1.712984054669704,
"grad_norm": 0.6288020854363963,
"learning_rate": 5e-06,
"loss": 0.6788,
"step": 470
},
{
"epoch": 1.7494305239179955,
"grad_norm": 0.5726327402033801,
"learning_rate": 5e-06,
"loss": 0.6808,
"step": 480
},
{
"epoch": 1.785876993166287,
"grad_norm": 0.5173548522448698,
"learning_rate": 5e-06,
"loss": 0.6799,
"step": 490
},
{
"epoch": 1.8223234624145785,
"grad_norm": 0.5790339638087626,
"learning_rate": 5e-06,
"loss": 0.6747,
"step": 500
},
{
"epoch": 1.85876993166287,
"grad_norm": 0.6584239869836397,
"learning_rate": 5e-06,
"loss": 0.677,
"step": 510
},
{
"epoch": 1.8952164009111616,
"grad_norm": 0.5311060458966043,
"learning_rate": 5e-06,
"loss": 0.6854,
"step": 520
},
{
"epoch": 1.9316628701594531,
"grad_norm": 0.6512560331845895,
"learning_rate": 5e-06,
"loss": 0.6784,
"step": 530
},
{
"epoch": 1.968109339407745,
"grad_norm": 0.584794911589519,
"learning_rate": 5e-06,
"loss": 0.6805,
"step": 540
},
{
"epoch": 1.9972665148063782,
"eval_loss": 0.7187947630882263,
"eval_runtime": 291.938,
"eval_samples_per_second": 25.32,
"eval_steps_per_second": 0.397,
"step": 548
},
{
"epoch": 2.0045558086560367,
"grad_norm": 0.885264185792607,
"learning_rate": 5e-06,
"loss": 0.7193,
"step": 550
},
{
"epoch": 2.041002277904328,
"grad_norm": 0.7654881044126012,
"learning_rate": 5e-06,
"loss": 0.6233,
"step": 560
},
{
"epoch": 2.0774487471526197,
"grad_norm": 0.6151945001911823,
"learning_rate": 5e-06,
"loss": 0.6283,
"step": 570
},
{
"epoch": 2.1138952164009113,
"grad_norm": 0.8374365216828517,
"learning_rate": 5e-06,
"loss": 0.6207,
"step": 580
},
{
"epoch": 2.150341685649203,
"grad_norm": 0.7031485699411321,
"learning_rate": 5e-06,
"loss": 0.6244,
"step": 590
},
{
"epoch": 2.1867881548974943,
"grad_norm": 0.8633299586157721,
"learning_rate": 5e-06,
"loss": 0.6258,
"step": 600
},
{
"epoch": 2.223234624145786,
"grad_norm": 0.7458455143129973,
"learning_rate": 5e-06,
"loss": 0.628,
"step": 610
},
{
"epoch": 2.2596810933940774,
"grad_norm": 1.0109682737601884,
"learning_rate": 5e-06,
"loss": 0.6264,
"step": 620
},
{
"epoch": 2.296127562642369,
"grad_norm": 0.6211087319945944,
"learning_rate": 5e-06,
"loss": 0.6242,
"step": 630
},
{
"epoch": 2.3325740318906605,
"grad_norm": 0.7036393621519607,
"learning_rate": 5e-06,
"loss": 0.6305,
"step": 640
},
{
"epoch": 2.369020501138952,
"grad_norm": 0.6058323256112293,
"learning_rate": 5e-06,
"loss": 0.6297,
"step": 650
},
{
"epoch": 2.4054669703872436,
"grad_norm": 0.6608686938446593,
"learning_rate": 5e-06,
"loss": 0.631,
"step": 660
},
{
"epoch": 2.4419134396355355,
"grad_norm": 0.6199043838308076,
"learning_rate": 5e-06,
"loss": 0.6264,
"step": 670
},
{
"epoch": 2.478359908883827,
"grad_norm": 0.607673754805363,
"learning_rate": 5e-06,
"loss": 0.6266,
"step": 680
},
{
"epoch": 2.5148063781321186,
"grad_norm": 0.9277091135129097,
"learning_rate": 5e-06,
"loss": 0.6263,
"step": 690
},
{
"epoch": 2.55125284738041,
"grad_norm": 0.9381891238069503,
"learning_rate": 5e-06,
"loss": 0.6317,
"step": 700
},
{
"epoch": 2.5876993166287017,
"grad_norm": 0.6592786383334494,
"learning_rate": 5e-06,
"loss": 0.6264,
"step": 710
},
{
"epoch": 2.624145785876993,
"grad_norm": 0.7421181566721138,
"learning_rate": 5e-06,
"loss": 0.6295,
"step": 720
},
{
"epoch": 2.6605922551252847,
"grad_norm": 0.6781081672896357,
"learning_rate": 5e-06,
"loss": 0.6273,
"step": 730
},
{
"epoch": 2.6970387243735763,
"grad_norm": 0.609137054982541,
"learning_rate": 5e-06,
"loss": 0.6328,
"step": 740
},
{
"epoch": 2.733485193621868,
"grad_norm": 0.6919361244155826,
"learning_rate": 5e-06,
"loss": 0.6333,
"step": 750
},
{
"epoch": 2.7699316628701594,
"grad_norm": 0.6379259386020866,
"learning_rate": 5e-06,
"loss": 0.6306,
"step": 760
},
{
"epoch": 2.806378132118451,
"grad_norm": 0.6035608731746878,
"learning_rate": 5e-06,
"loss": 0.6338,
"step": 770
},
{
"epoch": 2.8428246013667424,
"grad_norm": 0.7325417971133363,
"learning_rate": 5e-06,
"loss": 0.6352,
"step": 780
},
{
"epoch": 2.879271070615034,
"grad_norm": 0.8532590605538493,
"learning_rate": 5e-06,
"loss": 0.6284,
"step": 790
},
{
"epoch": 2.9157175398633255,
"grad_norm": 0.6185281977585761,
"learning_rate": 5e-06,
"loss": 0.6306,
"step": 800
},
{
"epoch": 2.9521640091116175,
"grad_norm": 0.6806046770942457,
"learning_rate": 5e-06,
"loss": 0.6402,
"step": 810
},
{
"epoch": 2.988610478359909,
"grad_norm": 0.6802410015239903,
"learning_rate": 5e-06,
"loss": 0.6348,
"step": 820
},
{
"epoch": 2.995899772209567,
"eval_loss": 0.7211272120475769,
"eval_runtime": 291.822,
"eval_samples_per_second": 25.331,
"eval_steps_per_second": 0.398,
"step": 822
},
{
"epoch": 2.995899772209567,
"step": 822,
"total_flos": 1376671236096000.0,
"train_loss": 0.6964337152866261,
"train_runtime": 48380.1477,
"train_samples_per_second": 8.709,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 822,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1376671236096000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}