{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.684931506849315,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00684931506849315,
"grad_norm": 166.6063995361328,
"learning_rate": 5e-06,
"loss": 3.913,
"step": 1
},
{
"epoch": 0.0136986301369863,
"grad_norm": 88.24701690673828,
"learning_rate": 1e-05,
"loss": 3.7453,
"step": 2
},
{
"epoch": 0.02054794520547945,
"grad_norm": 78.30535125732422,
"learning_rate": 9.89795918367347e-06,
"loss": 2.1474,
"step": 3
},
{
"epoch": 0.0273972602739726,
"grad_norm": 83.51199340820312,
"learning_rate": 9.795918367346939e-06,
"loss": 2.2588,
"step": 4
},
{
"epoch": 0.03424657534246575,
"grad_norm": 49.122493743896484,
"learning_rate": 9.693877551020408e-06,
"loss": 1.8187,
"step": 5
},
{
"epoch": 0.0410958904109589,
"grad_norm": 31.191837310791016,
"learning_rate": 9.591836734693878e-06,
"loss": 1.8402,
"step": 6
},
{
"epoch": 0.04794520547945205,
"grad_norm": 43.579193115234375,
"learning_rate": 9.489795918367348e-06,
"loss": 1.8214,
"step": 7
},
{
"epoch": 0.0547945205479452,
"grad_norm": 62.58639144897461,
"learning_rate": 9.387755102040818e-06,
"loss": 2.3602,
"step": 8
},
{
"epoch": 0.06164383561643835,
"grad_norm": 40.35031509399414,
"learning_rate": 9.285714285714288e-06,
"loss": 1.8161,
"step": 9
},
{
"epoch": 0.0684931506849315,
"grad_norm": 39.441307067871094,
"learning_rate": 9.183673469387756e-06,
"loss": 1.8453,
"step": 10
},
{
"epoch": 0.07534246575342465,
"grad_norm": 36.04607391357422,
"learning_rate": 9.081632653061225e-06,
"loss": 1.4184,
"step": 11
},
{
"epoch": 0.0821917808219178,
"grad_norm": 37.545467376708984,
"learning_rate": 8.979591836734695e-06,
"loss": 1.4609,
"step": 12
},
{
"epoch": 0.08904109589041095,
"grad_norm": 33.20708465576172,
"learning_rate": 8.877551020408163e-06,
"loss": 1.8817,
"step": 13
},
{
"epoch": 0.0958904109589041,
"grad_norm": 40.35082244873047,
"learning_rate": 8.775510204081633e-06,
"loss": 1.6072,
"step": 14
},
{
"epoch": 0.10273972602739725,
"grad_norm": 52.9511604309082,
"learning_rate": 8.673469387755103e-06,
"loss": 1.4684,
"step": 15
},
{
"epoch": 0.1095890410958904,
"grad_norm": 41.79618453979492,
"learning_rate": 8.571428571428571e-06,
"loss": 1.7434,
"step": 16
},
{
"epoch": 0.11643835616438356,
"grad_norm": 39.11565017700195,
"learning_rate": 8.469387755102042e-06,
"loss": 1.7047,
"step": 17
},
{
"epoch": 0.1232876712328767,
"grad_norm": 45.29304122924805,
"learning_rate": 8.36734693877551e-06,
"loss": 1.9334,
"step": 18
},
{
"epoch": 0.13013698630136986,
"grad_norm": 24.95364761352539,
"learning_rate": 8.26530612244898e-06,
"loss": 2.2292,
"step": 19
},
{
"epoch": 0.136986301369863,
"grad_norm": 35.88187789916992,
"learning_rate": 8.16326530612245e-06,
"loss": 2.1195,
"step": 20
},
{
"epoch": 0.14383561643835616,
"grad_norm": 30.69417381286621,
"learning_rate": 8.06122448979592e-06,
"loss": 1.4519,
"step": 21
},
{
"epoch": 0.1506849315068493,
"grad_norm": 33.624210357666016,
"learning_rate": 7.959183673469388e-06,
"loss": 1.1561,
"step": 22
},
{
"epoch": 0.15753424657534246,
"grad_norm": 29.83182144165039,
"learning_rate": 7.857142857142858e-06,
"loss": 1.9838,
"step": 23
},
{
"epoch": 0.1643835616438356,
"grad_norm": 30.646284103393555,
"learning_rate": 7.755102040816327e-06,
"loss": 1.9414,
"step": 24
},
{
"epoch": 0.17123287671232876,
"grad_norm": 32.19529724121094,
"learning_rate": 7.653061224489796e-06,
"loss": 1.4498,
"step": 25
},
{
"epoch": 0.1780821917808219,
"grad_norm": 41.54957580566406,
"learning_rate": 7.551020408163265e-06,
"loss": 2.3265,
"step": 26
},
{
"epoch": 0.18493150684931506,
"grad_norm": 62.288414001464844,
"learning_rate": 7.448979591836736e-06,
"loss": 3.0618,
"step": 27
},
{
"epoch": 0.1917808219178082,
"grad_norm": 32.133243560791016,
"learning_rate": 7.346938775510205e-06,
"loss": 1.7778,
"step": 28
},
{
"epoch": 0.19863013698630136,
"grad_norm": 37.86830520629883,
"learning_rate": 7.244897959183675e-06,
"loss": 1.9299,
"step": 29
},
{
"epoch": 0.2054794520547945,
"grad_norm": 27.436555862426758,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.485,
"step": 30
},
{
"epoch": 0.21232876712328766,
"grad_norm": 27.29859161376953,
"learning_rate": 7.0408163265306125e-06,
"loss": 1.7159,
"step": 31
},
{
"epoch": 0.2191780821917808,
"grad_norm": 33.742835998535156,
"learning_rate": 6.938775510204082e-06,
"loss": 1.7625,
"step": 32
},
{
"epoch": 0.22602739726027396,
"grad_norm": 38.45572280883789,
"learning_rate": 6.836734693877551e-06,
"loss": 1.8054,
"step": 33
},
{
"epoch": 0.2328767123287671,
"grad_norm": 29.832292556762695,
"learning_rate": 6.734693877551021e-06,
"loss": 1.6,
"step": 34
},
{
"epoch": 0.23972602739726026,
"grad_norm": 32.1478157043457,
"learning_rate": 6.63265306122449e-06,
"loss": 1.9878,
"step": 35
},
{
"epoch": 0.2465753424657534,
"grad_norm": 21.848527908325195,
"learning_rate": 6.530612244897959e-06,
"loss": 1.9843,
"step": 36
},
{
"epoch": 0.2534246575342466,
"grad_norm": 35.852169036865234,
"learning_rate": 6.4285714285714295e-06,
"loss": 1.7578,
"step": 37
},
{
"epoch": 0.2602739726027397,
"grad_norm": 27.202524185180664,
"learning_rate": 6.326530612244899e-06,
"loss": 1.6341,
"step": 38
},
{
"epoch": 0.2671232876712329,
"grad_norm": 28.326839447021484,
"learning_rate": 6.224489795918368e-06,
"loss": 2.0204,
"step": 39
},
{
"epoch": 0.273972602739726,
"grad_norm": 20.435285568237305,
"learning_rate": 6.122448979591837e-06,
"loss": 1.6218,
"step": 40
},
{
"epoch": 0.2808219178082192,
"grad_norm": 81.60685729980469,
"learning_rate": 6.020408163265307e-06,
"loss": 1.6894,
"step": 41
},
{
"epoch": 0.2876712328767123,
"grad_norm": 31.553621292114258,
"learning_rate": 5.918367346938776e-06,
"loss": 1.7057,
"step": 42
},
{
"epoch": 0.2945205479452055,
"grad_norm": 28.031139373779297,
"learning_rate": 5.816326530612246e-06,
"loss": 1.5383,
"step": 43
},
{
"epoch": 0.3013698630136986,
"grad_norm": 23.66860008239746,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.5447,
"step": 44
},
{
"epoch": 0.3082191780821918,
"grad_norm": 34.923824310302734,
"learning_rate": 5.6122448979591834e-06,
"loss": 1.8337,
"step": 45
},
{
"epoch": 0.3150684931506849,
"grad_norm": 20.53199577331543,
"learning_rate": 5.510204081632653e-06,
"loss": 2.0762,
"step": 46
},
{
"epoch": 0.3219178082191781,
"grad_norm": 22.510568618774414,
"learning_rate": 5.408163265306123e-06,
"loss": 1.7565,
"step": 47
},
{
"epoch": 0.3287671232876712,
"grad_norm": 24.311054229736328,
"learning_rate": 5.306122448979593e-06,
"loss": 1.8084,
"step": 48
},
{
"epoch": 0.3356164383561644,
"grad_norm": 28.965652465820312,
"learning_rate": 5.204081632653062e-06,
"loss": 1.6366,
"step": 49
},
{
"epoch": 0.3424657534246575,
"grad_norm": 14.990488052368164,
"learning_rate": 5.1020408163265315e-06,
"loss": 0.8044,
"step": 50
},
{
"epoch": 0.3493150684931507,
"grad_norm": 30.92911148071289,
"learning_rate": 5e-06,
"loss": 0.9733,
"step": 51
},
{
"epoch": 0.3561643835616438,
"grad_norm": 25.58120346069336,
"learning_rate": 4.897959183673469e-06,
"loss": 1.8185,
"step": 52
},
{
"epoch": 0.363013698630137,
"grad_norm": 20.015125274658203,
"learning_rate": 4.795918367346939e-06,
"loss": 1.8311,
"step": 53
},
{
"epoch": 0.3698630136986301,
"grad_norm": 31.811553955078125,
"learning_rate": 4.693877551020409e-06,
"loss": 2.0963,
"step": 54
},
{
"epoch": 0.3767123287671233,
"grad_norm": 20.206634521484375,
"learning_rate": 4.591836734693878e-06,
"loss": 0.6103,
"step": 55
},
{
"epoch": 0.3835616438356164,
"grad_norm": 19.538440704345703,
"learning_rate": 4.489795918367348e-06,
"loss": 2.4183,
"step": 56
},
{
"epoch": 0.3904109589041096,
"grad_norm": 77.06710052490234,
"learning_rate": 4.3877551020408165e-06,
"loss": 2.3644,
"step": 57
},
{
"epoch": 0.3972602739726027,
"grad_norm": 31.162639617919922,
"learning_rate": 4.2857142857142855e-06,
"loss": 2.1601,
"step": 58
},
{
"epoch": 0.4041095890410959,
"grad_norm": 22.61947250366211,
"learning_rate": 4.183673469387755e-06,
"loss": 1.1591,
"step": 59
},
{
"epoch": 0.410958904109589,
"grad_norm": 37.861270904541016,
"learning_rate": 4.081632653061225e-06,
"loss": 2.2028,
"step": 60
},
{
"epoch": 0.4178082191780822,
"grad_norm": 17.079059600830078,
"learning_rate": 3.979591836734694e-06,
"loss": 1.7653,
"step": 61
},
{
"epoch": 0.4246575342465753,
"grad_norm": 28.447805404663086,
"learning_rate": 3.877551020408164e-06,
"loss": 1.8112,
"step": 62
},
{
"epoch": 0.4315068493150685,
"grad_norm": 33.241146087646484,
"learning_rate": 3.7755102040816327e-06,
"loss": 1.6641,
"step": 63
},
{
"epoch": 0.4383561643835616,
"grad_norm": 30.07863998413086,
"learning_rate": 3.6734693877551024e-06,
"loss": 2.1975,
"step": 64
},
{
"epoch": 0.4452054794520548,
"grad_norm": 37.52287292480469,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.9424,
"step": 65
},
{
"epoch": 0.4520547945205479,
"grad_norm": 28.23394203186035,
"learning_rate": 3.469387755102041e-06,
"loss": 1.8685,
"step": 66
},
{
"epoch": 0.4589041095890411,
"grad_norm": 28.84389305114746,
"learning_rate": 3.3673469387755105e-06,
"loss": 1.1797,
"step": 67
},
{
"epoch": 0.4657534246575342,
"grad_norm": 29.903711318969727,
"learning_rate": 3.2653061224489794e-06,
"loss": 2.0214,
"step": 68
},
{
"epoch": 0.4726027397260274,
"grad_norm": 36.350582122802734,
"learning_rate": 3.1632653061224496e-06,
"loss": 2.2569,
"step": 69
},
{
"epoch": 0.4794520547945205,
"grad_norm": 31.127033233642578,
"learning_rate": 3.0612244897959185e-06,
"loss": 1.3549,
"step": 70
},
{
"epoch": 0.4863013698630137,
"grad_norm": 47.7249755859375,
"learning_rate": 2.959183673469388e-06,
"loss": 2.2439,
"step": 71
},
{
"epoch": 0.4931506849315068,
"grad_norm": 39.26190185546875,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.8277,
"step": 72
},
{
"epoch": 0.5,
"grad_norm": 25.255807876586914,
"learning_rate": 2.7551020408163266e-06,
"loss": 1.8571,
"step": 73
},
{
"epoch": 0.5068493150684932,
"grad_norm": 25.397775650024414,
"learning_rate": 2.6530612244897964e-06,
"loss": 1.5798,
"step": 74
},
{
"epoch": 0.5136986301369864,
"grad_norm": 18.698389053344727,
"learning_rate": 2.5510204081632657e-06,
"loss": 1.8486,
"step": 75
},
{
"epoch": 0.5205479452054794,
"grad_norm": 17.65583610534668,
"learning_rate": 2.4489795918367347e-06,
"loss": 0.3439,
"step": 76
},
{
"epoch": 0.5273972602739726,
"grad_norm": 29.23255729675293,
"learning_rate": 2.3469387755102044e-06,
"loss": 1.4103,
"step": 77
},
{
"epoch": 0.5342465753424658,
"grad_norm": 20.359149932861328,
"learning_rate": 2.244897959183674e-06,
"loss": 1.8338,
"step": 78
},
{
"epoch": 0.541095890410959,
"grad_norm": 30.629518508911133,
"learning_rate": 2.1428571428571427e-06,
"loss": 1.9468,
"step": 79
},
{
"epoch": 0.547945205479452,
"grad_norm": 18.888212203979492,
"learning_rate": 2.0408163265306125e-06,
"loss": 1.6648,
"step": 80
},
{
"epoch": 0.5547945205479452,
"grad_norm": 32.43148422241211,
"learning_rate": 1.938775510204082e-06,
"loss": 1.2884,
"step": 81
},
{
"epoch": 0.5616438356164384,
"grad_norm": 23.12215805053711,
"learning_rate": 1.8367346938775512e-06,
"loss": 1.6872,
"step": 82
},
{
"epoch": 0.5684931506849316,
"grad_norm": 17.43967056274414,
"learning_rate": 1.7346938775510206e-06,
"loss": 1.7704,
"step": 83
},
{
"epoch": 0.5753424657534246,
"grad_norm": 23.49708366394043,
"learning_rate": 1.6326530612244897e-06,
"loss": 1.4206,
"step": 84
},
{
"epoch": 0.5821917808219178,
"grad_norm": 23.952125549316406,
"learning_rate": 1.5306122448979593e-06,
"loss": 1.2965,
"step": 85
},
{
"epoch": 0.589041095890411,
"grad_norm": 26.057159423828125,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.5459,
"step": 86
},
{
"epoch": 0.5958904109589042,
"grad_norm": 28.730058670043945,
"learning_rate": 1.3265306122448982e-06,
"loss": 1.2233,
"step": 87
},
{
"epoch": 0.6027397260273972,
"grad_norm": 25.128461837768555,
"learning_rate": 1.2244897959183673e-06,
"loss": 1.677,
"step": 88
},
{
"epoch": 0.6095890410958904,
"grad_norm": 21.31721305847168,
"learning_rate": 1.122448979591837e-06,
"loss": 1.7159,
"step": 89
},
{
"epoch": 0.6164383561643836,
"grad_norm": 24.109394073486328,
"learning_rate": 1.0204081632653063e-06,
"loss": 1.3831,
"step": 90
},
{
"epoch": 0.6232876712328768,
"grad_norm": 32.16295623779297,
"learning_rate": 9.183673469387756e-07,
"loss": 1.4889,
"step": 91
},
{
"epoch": 0.6301369863013698,
"grad_norm": 23.5495548248291,
"learning_rate": 8.163265306122449e-07,
"loss": 1.6685,
"step": 92
},
{
"epoch": 0.636986301369863,
"grad_norm": 31.10687828063965,
"learning_rate": 7.142857142857143e-07,
"loss": 1.1017,
"step": 93
},
{
"epoch": 0.6438356164383562,
"grad_norm": 27.091115951538086,
"learning_rate": 6.122448979591837e-07,
"loss": 1.9554,
"step": 94
},
{
"epoch": 0.6506849315068494,
"grad_norm": 29.03687286376953,
"learning_rate": 5.102040816326531e-07,
"loss": 1.4978,
"step": 95
},
{
"epoch": 0.6575342465753424,
"grad_norm": 21.225196838378906,
"learning_rate": 4.0816326530612243e-07,
"loss": 1.8585,
"step": 96
},
{
"epoch": 0.6643835616438356,
"grad_norm": 27.35712242126465,
"learning_rate": 3.0612244897959183e-07,
"loss": 1.3636,
"step": 97
},
{
"epoch": 0.6712328767123288,
"grad_norm": 26.40558624267578,
"learning_rate": 2.0408163265306121e-07,
"loss": 1.5249,
"step": 98
},
{
"epoch": 0.678082191780822,
"grad_norm": 29.14642906188965,
"learning_rate": 1.0204081632653061e-07,
"loss": 2.0083,
"step": 99
},
{
"epoch": 0.684931506849315,
"grad_norm": 18.334836959838867,
"learning_rate": 0.0,
"loss": 1.4028,
"step": 100
},
{
"epoch": 0.684931506849315,
"step": 100,
"total_flos": 2305515375820800.0,
"train_loss": 1.771125696003437,
"train_runtime": 5623.6023,
"train_samples_per_second": 0.018,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2305515375820800.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}