model_25124293 / checkpoint-460 /trainer_state.json
ugaoo's picture
Upload folder using huggingface_hub
0709a1c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 460,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008695652173913044,
"grad_norm": 33.915157318115234,
"learning_rate": 5.0000000000000004e-08,
"loss": 2.3126,
"step": 1
},
{
"epoch": 0.017391304347826087,
"grad_norm": 35.09430694580078,
"learning_rate": 1.0000000000000001e-07,
"loss": 2.4328,
"step": 2
},
{
"epoch": 0.02608695652173913,
"grad_norm": 33.54511260986328,
"learning_rate": 1.5000000000000002e-07,
"loss": 2.2895,
"step": 3
},
{
"epoch": 0.034782608695652174,
"grad_norm": 34.5639762878418,
"learning_rate": 2.0000000000000002e-07,
"loss": 2.376,
"step": 4
},
{
"epoch": 0.043478260869565216,
"grad_norm": 34.895896911621094,
"learning_rate": 2.5000000000000004e-07,
"loss": 2.4092,
"step": 5
},
{
"epoch": 0.05217391304347826,
"grad_norm": 33.44582748413086,
"learning_rate": 3.0000000000000004e-07,
"loss": 2.3196,
"step": 6
},
{
"epoch": 0.06086956521739131,
"grad_norm": 34.687496185302734,
"learning_rate": 3.5000000000000004e-07,
"loss": 2.3925,
"step": 7
},
{
"epoch": 0.06956521739130435,
"grad_norm": 34.72901153564453,
"learning_rate": 4.0000000000000003e-07,
"loss": 2.3679,
"step": 8
},
{
"epoch": 0.0782608695652174,
"grad_norm": 34.008853912353516,
"learning_rate": 4.5000000000000003e-07,
"loss": 2.2935,
"step": 9
},
{
"epoch": 0.08695652173913043,
"grad_norm": 33.60919189453125,
"learning_rate": 5.000000000000001e-07,
"loss": 2.2791,
"step": 10
},
{
"epoch": 0.09565217391304348,
"grad_norm": 32.73677444458008,
"learning_rate": 5.5e-07,
"loss": 2.172,
"step": 11
},
{
"epoch": 0.10434782608695652,
"grad_norm": 32.384212493896484,
"learning_rate": 6.000000000000001e-07,
"loss": 2.1622,
"step": 12
},
{
"epoch": 0.11304347826086956,
"grad_norm": 34.02764129638672,
"learning_rate": 6.5e-07,
"loss": 2.2014,
"step": 13
},
{
"epoch": 0.12173913043478261,
"grad_norm": 33.0348014831543,
"learning_rate": 7.000000000000001e-07,
"loss": 2.1775,
"step": 14
},
{
"epoch": 0.13043478260869565,
"grad_norm": 32.571834564208984,
"learning_rate": 7.5e-07,
"loss": 2.0714,
"step": 15
},
{
"epoch": 0.1391304347826087,
"grad_norm": 33.0487174987793,
"learning_rate": 8.000000000000001e-07,
"loss": 2.0866,
"step": 16
},
{
"epoch": 0.14782608695652175,
"grad_norm": 30.354747772216797,
"learning_rate": 8.500000000000001e-07,
"loss": 1.8247,
"step": 17
},
{
"epoch": 0.1565217391304348,
"grad_norm": 29.680463790893555,
"learning_rate": 9.000000000000001e-07,
"loss": 1.7154,
"step": 18
},
{
"epoch": 0.16521739130434782,
"grad_norm": 29.8133544921875,
"learning_rate": 9.500000000000001e-07,
"loss": 1.6374,
"step": 19
},
{
"epoch": 0.17391304347826086,
"grad_norm": 30.196664810180664,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.5918,
"step": 20
},
{
"epoch": 0.1826086956521739,
"grad_norm": 29.475982666015625,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.3912,
"step": 21
},
{
"epoch": 0.19130434782608696,
"grad_norm": 30.726896286010742,
"learning_rate": 1.1e-06,
"loss": 1.2958,
"step": 22
},
{
"epoch": 0.2,
"grad_norm": 30.79201889038086,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.1146,
"step": 23
},
{
"epoch": 0.20869565217391303,
"grad_norm": 30.13853645324707,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.9209,
"step": 24
},
{
"epoch": 0.21739130434782608,
"grad_norm": 29.31069564819336,
"learning_rate": 1.25e-06,
"loss": 0.7969,
"step": 25
},
{
"epoch": 0.22608695652173913,
"grad_norm": 27.00128746032715,
"learning_rate": 1.3e-06,
"loss": 0.653,
"step": 26
},
{
"epoch": 0.23478260869565218,
"grad_norm": 27.202844619750977,
"learning_rate": 1.3500000000000002e-06,
"loss": 0.4821,
"step": 27
},
{
"epoch": 0.24347826086956523,
"grad_norm": 23.356842041015625,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.3932,
"step": 28
},
{
"epoch": 0.25217391304347825,
"grad_norm": 16.253108978271484,
"learning_rate": 1.45e-06,
"loss": 0.2794,
"step": 29
},
{
"epoch": 0.2608695652173913,
"grad_norm": 11.343944549560547,
"learning_rate": 1.5e-06,
"loss": 0.2122,
"step": 30
},
{
"epoch": 0.26956521739130435,
"grad_norm": 6.002540111541748,
"learning_rate": 1.5500000000000002e-06,
"loss": 0.1371,
"step": 31
},
{
"epoch": 0.2782608695652174,
"grad_norm": 4.205584526062012,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.1406,
"step": 32
},
{
"epoch": 0.28695652173913044,
"grad_norm": 3.3316493034362793,
"learning_rate": 1.6500000000000003e-06,
"loss": 0.1172,
"step": 33
},
{
"epoch": 0.2956521739130435,
"grad_norm": 2.546919822692871,
"learning_rate": 1.7000000000000002e-06,
"loss": 0.1013,
"step": 34
},
{
"epoch": 0.30434782608695654,
"grad_norm": 1.871219515800476,
"learning_rate": 1.75e-06,
"loss": 0.1,
"step": 35
},
{
"epoch": 0.3130434782608696,
"grad_norm": 1.7010533809661865,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.0903,
"step": 36
},
{
"epoch": 0.3217391304347826,
"grad_norm": 1.7138007879257202,
"learning_rate": 1.85e-06,
"loss": 0.0882,
"step": 37
},
{
"epoch": 0.33043478260869563,
"grad_norm": 1.897299885749817,
"learning_rate": 1.9000000000000002e-06,
"loss": 0.0926,
"step": 38
},
{
"epoch": 0.3391304347826087,
"grad_norm": 1.2912218570709229,
"learning_rate": 1.9500000000000004e-06,
"loss": 0.08,
"step": 39
},
{
"epoch": 0.34782608695652173,
"grad_norm": 1.2403124570846558,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0851,
"step": 40
},
{
"epoch": 0.3565217391304348,
"grad_norm": 1.0497726202011108,
"learning_rate": 2.05e-06,
"loss": 0.0854,
"step": 41
},
{
"epoch": 0.3652173913043478,
"grad_norm": 1.2289705276489258,
"learning_rate": 2.1000000000000002e-06,
"loss": 0.0706,
"step": 42
},
{
"epoch": 0.3739130434782609,
"grad_norm": 1.0778898000717163,
"learning_rate": 2.15e-06,
"loss": 0.0735,
"step": 43
},
{
"epoch": 0.3826086956521739,
"grad_norm": 1.101493239402771,
"learning_rate": 2.2e-06,
"loss": 0.0736,
"step": 44
},
{
"epoch": 0.391304347826087,
"grad_norm": 1.746185064315796,
"learning_rate": 2.25e-06,
"loss": 0.0973,
"step": 45
},
{
"epoch": 0.4,
"grad_norm": 1.2611403465270996,
"learning_rate": 2.3000000000000004e-06,
"loss": 0.0684,
"step": 46
},
{
"epoch": 0.40869565217391307,
"grad_norm": 1.09405517578125,
"learning_rate": 2.35e-06,
"loss": 0.0698,
"step": 47
},
{
"epoch": 0.41739130434782606,
"grad_norm": 1.1591057777404785,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.0757,
"step": 48
},
{
"epoch": 0.4260869565217391,
"grad_norm": 0.9820723533630371,
"learning_rate": 2.4500000000000003e-06,
"loss": 0.0721,
"step": 49
},
{
"epoch": 0.43478260869565216,
"grad_norm": 1.1460777521133423,
"learning_rate": 2.5e-06,
"loss": 0.0849,
"step": 50
},
{
"epoch": 0.4434782608695652,
"grad_norm": 0.951232373714447,
"learning_rate": 2.55e-06,
"loss": 0.0707,
"step": 51
},
{
"epoch": 0.45217391304347826,
"grad_norm": 1.0160707235336304,
"learning_rate": 2.6e-06,
"loss": 0.0695,
"step": 52
},
{
"epoch": 0.4608695652173913,
"grad_norm": 1.0926896333694458,
"learning_rate": 2.6500000000000005e-06,
"loss": 0.0815,
"step": 53
},
{
"epoch": 0.46956521739130436,
"grad_norm": 0.8407694697380066,
"learning_rate": 2.7000000000000004e-06,
"loss": 0.0718,
"step": 54
},
{
"epoch": 0.4782608695652174,
"grad_norm": 0.9152198433876038,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0733,
"step": 55
},
{
"epoch": 0.48695652173913045,
"grad_norm": 0.8152011632919312,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.0827,
"step": 56
},
{
"epoch": 0.4956521739130435,
"grad_norm": 0.8188056349754333,
"learning_rate": 2.85e-06,
"loss": 0.0707,
"step": 57
},
{
"epoch": 0.5043478260869565,
"grad_norm": 0.8655344843864441,
"learning_rate": 2.9e-06,
"loss": 0.0696,
"step": 58
},
{
"epoch": 0.5130434782608696,
"grad_norm": 0.8174591660499573,
"learning_rate": 2.95e-06,
"loss": 0.0718,
"step": 59
},
{
"epoch": 0.5217391304347826,
"grad_norm": 1.2075238227844238,
"learning_rate": 3e-06,
"loss": 0.0817,
"step": 60
},
{
"epoch": 0.5304347826086957,
"grad_norm": 0.7282372117042542,
"learning_rate": 3.05e-06,
"loss": 0.0675,
"step": 61
},
{
"epoch": 0.5391304347826087,
"grad_norm": 0.8066464066505432,
"learning_rate": 3.1000000000000004e-06,
"loss": 0.0638,
"step": 62
},
{
"epoch": 0.5478260869565217,
"grad_norm": 0.9105910062789917,
"learning_rate": 3.1500000000000003e-06,
"loss": 0.0714,
"step": 63
},
{
"epoch": 0.5565217391304348,
"grad_norm": 1.090287208557129,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.0742,
"step": 64
},
{
"epoch": 0.5652173913043478,
"grad_norm": 0.7674099802970886,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0594,
"step": 65
},
{
"epoch": 0.5739130434782609,
"grad_norm": 0.9247289299964905,
"learning_rate": 3.3000000000000006e-06,
"loss": 0.0671,
"step": 66
},
{
"epoch": 0.5826086956521739,
"grad_norm": 0.7552309632301331,
"learning_rate": 3.3500000000000005e-06,
"loss": 0.0609,
"step": 67
},
{
"epoch": 0.591304347826087,
"grad_norm": 1.1822036504745483,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.07,
"step": 68
},
{
"epoch": 0.6,
"grad_norm": 0.805238664150238,
"learning_rate": 3.45e-06,
"loss": 0.0637,
"step": 69
},
{
"epoch": 0.6086956521739131,
"grad_norm": 1.0068074464797974,
"learning_rate": 3.5e-06,
"loss": 0.0659,
"step": 70
},
{
"epoch": 0.6173913043478261,
"grad_norm": 0.7666197419166565,
"learning_rate": 3.5500000000000003e-06,
"loss": 0.0641,
"step": 71
},
{
"epoch": 0.6260869565217392,
"grad_norm": 0.8774266242980957,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.0639,
"step": 72
},
{
"epoch": 0.6347826086956522,
"grad_norm": 1.4135913848876953,
"learning_rate": 3.65e-06,
"loss": 0.0716,
"step": 73
},
{
"epoch": 0.6434782608695652,
"grad_norm": 1.052467942237854,
"learning_rate": 3.7e-06,
"loss": 0.0617,
"step": 74
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.6905954480171204,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0655,
"step": 75
},
{
"epoch": 0.6608695652173913,
"grad_norm": 0.8480051755905151,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.055,
"step": 76
},
{
"epoch": 0.6695652173913044,
"grad_norm": 0.8274970054626465,
"learning_rate": 3.85e-06,
"loss": 0.0634,
"step": 77
},
{
"epoch": 0.6782608695652174,
"grad_norm": 0.8180427551269531,
"learning_rate": 3.900000000000001e-06,
"loss": 0.0591,
"step": 78
},
{
"epoch": 0.6869565217391305,
"grad_norm": 1.117491602897644,
"learning_rate": 3.95e-06,
"loss": 0.0596,
"step": 79
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.80575031042099,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0587,
"step": 80
},
{
"epoch": 0.7043478260869566,
"grad_norm": 1.1225630044937134,
"learning_rate": 4.05e-06,
"loss": 0.0571,
"step": 81
},
{
"epoch": 0.7130434782608696,
"grad_norm": 0.9180626273155212,
"learning_rate": 4.1e-06,
"loss": 0.0514,
"step": 82
},
{
"epoch": 0.7217391304347827,
"grad_norm": 0.9540777206420898,
"learning_rate": 4.15e-06,
"loss": 0.0539,
"step": 83
},
{
"epoch": 0.7304347826086957,
"grad_norm": 1.032495379447937,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.0593,
"step": 84
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.8488896489143372,
"learning_rate": 4.25e-06,
"loss": 0.0508,
"step": 85
},
{
"epoch": 0.7478260869565218,
"grad_norm": 0.7575100660324097,
"learning_rate": 4.3e-06,
"loss": 0.0604,
"step": 86
},
{
"epoch": 0.7565217391304347,
"grad_norm": 0.8140726089477539,
"learning_rate": 4.350000000000001e-06,
"loss": 0.0538,
"step": 87
},
{
"epoch": 0.7652173913043478,
"grad_norm": 0.8753012418746948,
"learning_rate": 4.4e-06,
"loss": 0.0627,
"step": 88
},
{
"epoch": 0.7739130434782608,
"grad_norm": 0.7341794967651367,
"learning_rate": 4.450000000000001e-06,
"loss": 0.0517,
"step": 89
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.8983039259910583,
"learning_rate": 4.5e-06,
"loss": 0.0553,
"step": 90
},
{
"epoch": 0.7913043478260869,
"grad_norm": 0.8660378456115723,
"learning_rate": 4.5500000000000005e-06,
"loss": 0.0577,
"step": 91
},
{
"epoch": 0.8,
"grad_norm": 0.8682013154029846,
"learning_rate": 4.600000000000001e-06,
"loss": 0.0504,
"step": 92
},
{
"epoch": 0.808695652173913,
"grad_norm": 1.1238961219787598,
"learning_rate": 4.65e-06,
"loss": 0.0466,
"step": 93
},
{
"epoch": 0.8173913043478261,
"grad_norm": 0.9690432548522949,
"learning_rate": 4.7e-06,
"loss": 0.0506,
"step": 94
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.8743138313293457,
"learning_rate": 4.75e-06,
"loss": 0.052,
"step": 95
},
{
"epoch": 0.8347826086956521,
"grad_norm": 1.1654411554336548,
"learning_rate": 4.800000000000001e-06,
"loss": 0.0541,
"step": 96
},
{
"epoch": 0.8434782608695652,
"grad_norm": 0.9813936948776245,
"learning_rate": 4.85e-06,
"loss": 0.0511,
"step": 97
},
{
"epoch": 0.8521739130434782,
"grad_norm": 0.8186525702476501,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.0524,
"step": 98
},
{
"epoch": 0.8608695652173913,
"grad_norm": 0.7779029607772827,
"learning_rate": 4.95e-06,
"loss": 0.0414,
"step": 99
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.7967627048492432,
"learning_rate": 5e-06,
"loss": 0.0424,
"step": 100
},
{
"epoch": 0.8782608695652174,
"grad_norm": 0.9512422680854797,
"learning_rate": 4.999964559102694e-06,
"loss": 0.0459,
"step": 101
},
{
"epoch": 0.8869565217391304,
"grad_norm": 1.0316367149353027,
"learning_rate": 4.999858237415621e-06,
"loss": 0.0415,
"step": 102
},
{
"epoch": 0.8956521739130435,
"grad_norm": 0.955311119556427,
"learning_rate": 4.999681037953289e-06,
"loss": 0.0462,
"step": 103
},
{
"epoch": 0.9043478260869565,
"grad_norm": 0.8455808758735657,
"learning_rate": 4.999432965739786e-06,
"loss": 0.045,
"step": 104
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.9638617038726807,
"learning_rate": 4.999114027808632e-06,
"loss": 0.0454,
"step": 105
},
{
"epoch": 0.9217391304347826,
"grad_norm": 0.9750456213951111,
"learning_rate": 4.998724233202585e-06,
"loss": 0.0411,
"step": 106
},
{
"epoch": 0.9304347826086956,
"grad_norm": 1.0203043222427368,
"learning_rate": 4.998263592973382e-06,
"loss": 0.0467,
"step": 107
},
{
"epoch": 0.9391304347826087,
"grad_norm": 1.0116546154022217,
"learning_rate": 4.9977321201814235e-06,
"loss": 0.0432,
"step": 108
},
{
"epoch": 0.9478260869565217,
"grad_norm": 0.8726145029067993,
"learning_rate": 4.997129829895409e-06,
"loss": 0.0482,
"step": 109
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.7527220845222473,
"learning_rate": 4.996456739191905e-06,
"loss": 0.04,
"step": 110
},
{
"epoch": 0.9652173913043478,
"grad_norm": 0.8785611391067505,
"learning_rate": 4.995712867154863e-06,
"loss": 0.0483,
"step": 111
},
{
"epoch": 0.9739130434782609,
"grad_norm": 0.9951519966125488,
"learning_rate": 4.994898234875075e-06,
"loss": 0.0454,
"step": 112
},
{
"epoch": 0.9826086956521739,
"grad_norm": 1.0761841535568237,
"learning_rate": 4.9940128654495826e-06,
"loss": 0.0375,
"step": 113
},
{
"epoch": 0.991304347826087,
"grad_norm": 0.8400607109069824,
"learning_rate": 4.9930567839810125e-06,
"loss": 0.0369,
"step": 114
},
{
"epoch": 1.0,
"grad_norm": 0.9820336103439331,
"learning_rate": 4.992030017576876e-06,
"loss": 0.0323,
"step": 115
},
{
"epoch": 1.008695652173913,
"grad_norm": 0.8266397714614868,
"learning_rate": 4.990932595348788e-06,
"loss": 0.0301,
"step": 116
},
{
"epoch": 1.017391304347826,
"grad_norm": 0.9163568615913391,
"learning_rate": 4.989764548411654e-06,
"loss": 0.0284,
"step": 117
},
{
"epoch": 1.0260869565217392,
"grad_norm": 1.0225435495376587,
"learning_rate": 4.988525909882779e-06,
"loss": 0.0229,
"step": 118
},
{
"epoch": 1.0347826086956522,
"grad_norm": 1.2318713665008545,
"learning_rate": 4.987216714880929e-06,
"loss": 0.0304,
"step": 119
},
{
"epoch": 1.0434782608695652,
"grad_norm": 1.0533114671707153,
"learning_rate": 4.9858370005253435e-06,
"loss": 0.0333,
"step": 120
},
{
"epoch": 1.0521739130434782,
"grad_norm": 1.0429824590682983,
"learning_rate": 4.9843868059346725e-06,
"loss": 0.0265,
"step": 121
},
{
"epoch": 1.0608695652173914,
"grad_norm": 0.9843570590019226,
"learning_rate": 4.982866172225876e-06,
"loss": 0.0306,
"step": 122
},
{
"epoch": 1.0695652173913044,
"grad_norm": 0.8079569935798645,
"learning_rate": 4.981275142513049e-06,
"loss": 0.0227,
"step": 123
},
{
"epoch": 1.0782608695652174,
"grad_norm": 0.8699679970741272,
"learning_rate": 4.979613761906212e-06,
"loss": 0.0244,
"step": 124
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.87395179271698,
"learning_rate": 4.977882077510018e-06,
"loss": 0.0238,
"step": 125
},
{
"epoch": 1.0956521739130434,
"grad_norm": 1.1012191772460938,
"learning_rate": 4.9760801384224274e-06,
"loss": 0.0265,
"step": 126
},
{
"epoch": 1.1043478260869566,
"grad_norm": 0.6216080784797668,
"learning_rate": 4.97420799573331e-06,
"loss": 0.0161,
"step": 127
},
{
"epoch": 1.1130434782608696,
"grad_norm": 0.9586366415023804,
"learning_rate": 4.972265702523001e-06,
"loss": 0.0226,
"step": 128
},
{
"epoch": 1.1217391304347826,
"grad_norm": 1.3069987297058105,
"learning_rate": 4.970253313860788e-06,
"loss": 0.0213,
"step": 129
},
{
"epoch": 1.1304347826086956,
"grad_norm": 1.4542529582977295,
"learning_rate": 4.968170886803361e-06,
"loss": 0.0242,
"step": 130
},
{
"epoch": 1.1391304347826088,
"grad_norm": 1.221802830696106,
"learning_rate": 4.966018480393189e-06,
"loss": 0.0244,
"step": 131
},
{
"epoch": 1.1478260869565218,
"grad_norm": 0.8851010203361511,
"learning_rate": 4.9637961556568405e-06,
"loss": 0.0231,
"step": 132
},
{
"epoch": 1.1565217391304348,
"grad_norm": 0.7256351709365845,
"learning_rate": 4.961503975603263e-06,
"loss": 0.016,
"step": 133
},
{
"epoch": 1.1652173913043478,
"grad_norm": 0.9153109192848206,
"learning_rate": 4.959142005221991e-06,
"loss": 0.0212,
"step": 134
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.9752025604248047,
"learning_rate": 4.956710311481303e-06,
"loss": 0.0213,
"step": 135
},
{
"epoch": 1.182608695652174,
"grad_norm": 0.9055240154266357,
"learning_rate": 4.954208963326327e-06,
"loss": 0.0147,
"step": 136
},
{
"epoch": 1.191304347826087,
"grad_norm": 0.6810442805290222,
"learning_rate": 4.951638031677081e-06,
"loss": 0.0218,
"step": 137
},
{
"epoch": 1.2,
"grad_norm": 0.8518972396850586,
"learning_rate": 4.948997589426463e-06,
"loss": 0.0161,
"step": 138
},
{
"epoch": 1.208695652173913,
"grad_norm": 0.9990325570106506,
"learning_rate": 4.94628771143819e-06,
"loss": 0.0134,
"step": 139
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.86704421043396,
"learning_rate": 4.943508474544667e-06,
"loss": 0.0152,
"step": 140
},
{
"epoch": 1.2260869565217392,
"grad_norm": 0.7871954441070557,
"learning_rate": 4.940659957544813e-06,
"loss": 0.014,
"step": 141
},
{
"epoch": 1.2347826086956522,
"grad_norm": 1.038091778755188,
"learning_rate": 4.937742241201826e-06,
"loss": 0.0191,
"step": 142
},
{
"epoch": 1.2434782608695651,
"grad_norm": 0.9917914867401123,
"learning_rate": 4.934755408240896e-06,
"loss": 0.0156,
"step": 143
},
{
"epoch": 1.2521739130434781,
"grad_norm": 0.9815549254417419,
"learning_rate": 4.931699543346854e-06,
"loss": 0.0164,
"step": 144
},
{
"epoch": 1.2608695652173914,
"grad_norm": 1.0097852945327759,
"learning_rate": 4.928574733161775e-06,
"loss": 0.0229,
"step": 145
},
{
"epoch": 1.2695652173913043,
"grad_norm": 0.767440676689148,
"learning_rate": 4.925381066282522e-06,
"loss": 0.0144,
"step": 146
},
{
"epoch": 1.2782608695652173,
"grad_norm": 0.883701503276825,
"learning_rate": 4.922118633258229e-06,
"loss": 0.0177,
"step": 147
},
{
"epoch": 1.2869565217391306,
"grad_norm": 0.9894592761993408,
"learning_rate": 4.918787526587739e-06,
"loss": 0.0217,
"step": 148
},
{
"epoch": 1.2956521739130435,
"grad_norm": 0.8317010998725891,
"learning_rate": 4.9153878407169815e-06,
"loss": 0.0162,
"step": 149
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.4255380928516388,
"learning_rate": 4.911919672036291e-06,
"loss": 0.0107,
"step": 150
},
{
"epoch": 1.3130434782608695,
"grad_norm": 0.614287793636322,
"learning_rate": 4.908383118877672e-06,
"loss": 0.0165,
"step": 151
},
{
"epoch": 1.3217391304347825,
"grad_norm": 0.6787184476852417,
"learning_rate": 4.904778281512022e-06,
"loss": 0.0144,
"step": 152
},
{
"epoch": 1.3304347826086955,
"grad_norm": 0.8526840209960938,
"learning_rate": 4.901105262146275e-06,
"loss": 0.0089,
"step": 153
},
{
"epoch": 1.3391304347826087,
"grad_norm": 1.5631215572357178,
"learning_rate": 4.897364164920515e-06,
"loss": 0.0168,
"step": 154
},
{
"epoch": 1.3478260869565217,
"grad_norm": 1.2826169729232788,
"learning_rate": 4.8935550959050135e-06,
"loss": 0.017,
"step": 155
},
{
"epoch": 1.3565217391304347,
"grad_norm": 0.875330924987793,
"learning_rate": 4.889678163097233e-06,
"loss": 0.0144,
"step": 156
},
{
"epoch": 1.365217391304348,
"grad_norm": 0.8514088988304138,
"learning_rate": 4.885733476418752e-06,
"loss": 0.0142,
"step": 157
},
{
"epoch": 1.373913043478261,
"grad_norm": 1.1872806549072266,
"learning_rate": 4.8817211477121615e-06,
"loss": 0.0153,
"step": 158
},
{
"epoch": 1.382608695652174,
"grad_norm": 0.7295001745223999,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.0085,
"step": 159
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.7863972187042236,
"learning_rate": 4.8734940211709535e-06,
"loss": 0.0121,
"step": 160
},
{
"epoch": 1.4,
"grad_norm": 1.0080970525741577,
"learning_rate": 4.8692794565977335e-06,
"loss": 0.0107,
"step": 161
},
{
"epoch": 1.4086956521739131,
"grad_norm": 1.3618495464324951,
"learning_rate": 4.864997716512584e-06,
"loss": 0.0157,
"step": 162
},
{
"epoch": 1.4173913043478261,
"grad_norm": 1.3909306526184082,
"learning_rate": 4.8606489223144744e-06,
"loss": 0.0127,
"step": 163
},
{
"epoch": 1.4260869565217391,
"grad_norm": 0.8300015330314636,
"learning_rate": 4.8562331973035396e-06,
"loss": 0.0114,
"step": 164
},
{
"epoch": 1.434782608695652,
"grad_norm": 1.1622614860534668,
"learning_rate": 4.851750666677583e-06,
"loss": 0.0155,
"step": 165
},
{
"epoch": 1.4434782608695653,
"grad_norm": 0.9247199296951294,
"learning_rate": 4.847201457528533e-06,
"loss": 0.0121,
"step": 166
},
{
"epoch": 1.4521739130434783,
"grad_norm": 1.0048704147338867,
"learning_rate": 4.842585698838832e-06,
"loss": 0.0154,
"step": 167
},
{
"epoch": 1.4608695652173913,
"grad_norm": 1.0037480592727661,
"learning_rate": 4.837903521477784e-06,
"loss": 0.0138,
"step": 168
},
{
"epoch": 1.4695652173913043,
"grad_norm": 0.8620800971984863,
"learning_rate": 4.833155058197842e-06,
"loss": 0.0145,
"step": 169
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.8210186958312988,
"learning_rate": 4.828340443630847e-06,
"loss": 0.0175,
"step": 170
},
{
"epoch": 1.4869565217391305,
"grad_norm": 0.5821270942687988,
"learning_rate": 4.823459814284205e-06,
"loss": 0.0103,
"step": 171
},
{
"epoch": 1.4956521739130435,
"grad_norm": 0.622521698474884,
"learning_rate": 4.818513308537025e-06,
"loss": 0.008,
"step": 172
},
{
"epoch": 1.5043478260869565,
"grad_norm": 1.004116415977478,
"learning_rate": 4.813501066636188e-06,
"loss": 0.011,
"step": 173
},
{
"epoch": 1.5130434782608697,
"grad_norm": 0.7552922964096069,
"learning_rate": 4.808423230692374e-06,
"loss": 0.0073,
"step": 174
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.5907073616981506,
"learning_rate": 4.8032799446760326e-06,
"loss": 0.0083,
"step": 175
},
{
"epoch": 1.5304347826086957,
"grad_norm": 0.40509092807769775,
"learning_rate": 4.798071354413302e-06,
"loss": 0.0067,
"step": 176
},
{
"epoch": 1.5391304347826087,
"grad_norm": 1.027583122253418,
"learning_rate": 4.792797607581872e-06,
"loss": 0.0023,
"step": 177
},
{
"epoch": 1.5478260869565217,
"grad_norm": 1.1601897478103638,
"learning_rate": 4.787458853706798e-06,
"loss": 0.0091,
"step": 178
},
{
"epoch": 1.5565217391304347,
"grad_norm": 0.9880419373512268,
"learning_rate": 4.7820552441562625e-06,
"loss": 0.009,
"step": 179
},
{
"epoch": 1.5652173913043477,
"grad_norm": 1.218587875366211,
"learning_rate": 4.7765869321372835e-06,
"loss": 0.0082,
"step": 180
},
{
"epoch": 1.5739130434782609,
"grad_norm": 1.20319664478302,
"learning_rate": 4.771054072691367e-06,
"loss": 0.0123,
"step": 181
},
{
"epoch": 1.5826086956521739,
"grad_norm": 0.43793171644210815,
"learning_rate": 4.7654568226901165e-06,
"loss": 0.0028,
"step": 182
},
{
"epoch": 1.591304347826087,
"grad_norm": 0.5504114627838135,
"learning_rate": 4.759795340830782e-06,
"loss": 0.0054,
"step": 183
},
{
"epoch": 1.6,
"grad_norm": 2.2305221557617188,
"learning_rate": 4.754069787631761e-06,
"loss": 0.0094,
"step": 184
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.8716424703598022,
"learning_rate": 4.7482803254280485e-06,
"loss": 0.0093,
"step": 185
},
{
"epoch": 1.617391304347826,
"grad_norm": 1.02736496925354,
"learning_rate": 4.742427118366632e-06,
"loss": 0.01,
"step": 186
},
{
"epoch": 1.626086956521739,
"grad_norm": 0.6050184965133667,
"learning_rate": 4.736510332401841e-06,
"loss": 0.0054,
"step": 187
},
{
"epoch": 1.634782608695652,
"grad_norm": 0.6020815372467041,
"learning_rate": 4.730530135290638e-06,
"loss": 0.0075,
"step": 188
},
{
"epoch": 1.643478260869565,
"grad_norm": 1.0702062845230103,
"learning_rate": 4.724486696587862e-06,
"loss": 0.0058,
"step": 189
},
{
"epoch": 1.6521739130434783,
"grad_norm": 1.1906393766403198,
"learning_rate": 4.718380187641429e-06,
"loss": 0.0061,
"step": 190
},
{
"epoch": 1.6608695652173913,
"grad_norm": 0.36051103472709656,
"learning_rate": 4.712210781587463e-06,
"loss": 0.0027,
"step": 191
},
{
"epoch": 1.6695652173913045,
"grad_norm": 0.5215916037559509,
"learning_rate": 4.705978653345392e-06,
"loss": 0.0061,
"step": 192
},
{
"epoch": 1.6782608695652175,
"grad_norm": 0.2960876524448395,
"learning_rate": 4.699683979612991e-06,
"loss": 0.0038,
"step": 193
},
{
"epoch": 1.6869565217391305,
"grad_norm": 0.48366671800613403,
"learning_rate": 4.693326938861367e-06,
"loss": 0.0045,
"step": 194
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.4141710698604584,
"learning_rate": 4.686907711329903e-06,
"loss": 0.0055,
"step": 195
},
{
"epoch": 1.7043478260869565,
"grad_norm": 1.1449170112609863,
"learning_rate": 4.680426479021147e-06,
"loss": 0.0081,
"step": 196
},
{
"epoch": 1.7130434782608694,
"grad_norm": 0.2535400390625,
"learning_rate": 4.67388342569565e-06,
"loss": 0.0027,
"step": 197
},
{
"epoch": 1.7217391304347827,
"grad_norm": 0.2910863161087036,
"learning_rate": 4.667278736866755e-06,
"loss": 0.0026,
"step": 198
},
{
"epoch": 1.7304347826086957,
"grad_norm": 0.7556977272033691,
"learning_rate": 4.660612599795343e-06,
"loss": 0.0059,
"step": 199
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.45072904229164124,
"learning_rate": 4.653885203484516e-06,
"loss": 0.0039,
"step": 200
},
{
"epoch": 1.7478260869565219,
"grad_norm": 0.45628419518470764,
"learning_rate": 4.647096738674243e-06,
"loss": 0.0017,
"step": 201
},
{
"epoch": 1.7565217391304349,
"grad_norm": 0.31578192114830017,
"learning_rate": 4.640247397835953e-06,
"loss": 0.0021,
"step": 202
},
{
"epoch": 1.7652173913043478,
"grad_norm": 0.6553907990455627,
"learning_rate": 4.633337375167074e-06,
"loss": 0.0035,
"step": 203
},
{
"epoch": 1.7739130434782608,
"grad_norm": 0.39887183904647827,
"learning_rate": 4.626366866585528e-06,
"loss": 0.0027,
"step": 204
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.5914686918258667,
"learning_rate": 4.619336069724177e-06,
"loss": 0.0052,
"step": 205
},
{
"epoch": 1.7913043478260868,
"grad_norm": 0.47106510400772095,
"learning_rate": 4.612245183925225e-06,
"loss": 0.0041,
"step": 206
},
{
"epoch": 1.8,
"grad_norm": 0.4898064136505127,
"learning_rate": 4.605094410234551e-06,
"loss": 0.0016,
"step": 207
},
{
"epoch": 1.808695652173913,
"grad_norm": 0.584793210029602,
"learning_rate": 4.597883951396027e-06,
"loss": 0.002,
"step": 208
},
{
"epoch": 1.8173913043478263,
"grad_norm": 0.5560891628265381,
"learning_rate": 4.590614011845758e-06,
"loss": 0.0039,
"step": 209
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.5522558689117432,
"learning_rate": 4.583284797706288e-06,
"loss": 0.0023,
"step": 210
},
{
"epoch": 1.8347826086956522,
"grad_norm": 0.6028457283973694,
"learning_rate": 4.575896516780757e-06,
"loss": 0.0033,
"step": 211
},
{
"epoch": 1.8434782608695652,
"grad_norm": 0.7641825079917908,
"learning_rate": 4.568449378547011e-06,
"loss": 0.0061,
"step": 212
},
{
"epoch": 1.8521739130434782,
"grad_norm": 2.1374592781066895,
"learning_rate": 4.560943594151657e-06,
"loss": 0.0064,
"step": 213
},
{
"epoch": 1.8608695652173912,
"grad_norm": 1.233405351638794,
"learning_rate": 4.553379376404085e-06,
"loss": 0.003,
"step": 214
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.3678194582462311,
"learning_rate": 4.5457569397704226e-06,
"loss": 0.0028,
"step": 215
},
{
"epoch": 1.8782608695652174,
"grad_norm": 0.21607166528701782,
"learning_rate": 4.538076500367469e-06,
"loss": 0.0011,
"step": 216
},
{
"epoch": 1.8869565217391304,
"grad_norm": 0.13464906811714172,
"learning_rate": 4.530338275956553e-06,
"loss": 0.0006,
"step": 217
},
{
"epoch": 1.8956521739130436,
"grad_norm": 2.7032434940338135,
"learning_rate": 4.522542485937369e-06,
"loss": 0.0057,
"step": 218
},
{
"epoch": 1.9043478260869566,
"grad_norm": 0.549523651599884,
"learning_rate": 4.514689351341751e-06,
"loss": 0.0026,
"step": 219
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.26894959807395935,
"learning_rate": 4.506779094827409e-06,
"loss": 0.0017,
"step": 220
},
{
"epoch": 1.9217391304347826,
"grad_norm": 0.09632059931755066,
"learning_rate": 4.498811940671615e-06,
"loss": 0.001,
"step": 221
},
{
"epoch": 1.9304347826086956,
"grad_norm": 0.23288941383361816,
"learning_rate": 4.49078811476484e-06,
"loss": 0.0017,
"step": 222
},
{
"epoch": 1.9391304347826086,
"grad_norm": 0.3012441396713257,
"learning_rate": 4.482707844604359e-06,
"loss": 0.0038,
"step": 223
},
{
"epoch": 1.9478260869565216,
"grad_norm": 0.2756352126598358,
"learning_rate": 4.474571359287791e-06,
"loss": 0.0012,
"step": 224
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.24359366297721863,
"learning_rate": 4.466378889506607e-06,
"loss": 0.0013,
"step": 225
},
{
"epoch": 1.9652173913043478,
"grad_norm": 0.1803927719593048,
"learning_rate": 4.458130667539592e-06,
"loss": 0.0017,
"step": 226
},
{
"epoch": 1.973913043478261,
"grad_norm": 0.5478237271308899,
"learning_rate": 4.449826927246257e-06,
"loss": 0.0024,
"step": 227
},
{
"epoch": 1.982608695652174,
"grad_norm": 0.1454983353614807,
"learning_rate": 4.441467904060207e-06,
"loss": 0.0007,
"step": 228
},
{
"epoch": 1.991304347826087,
"grad_norm": 0.7362433671951294,
"learning_rate": 4.4330538349824684e-06,
"loss": 0.0086,
"step": 229
},
{
"epoch": 2.0,
"grad_norm": 0.12488367408514023,
"learning_rate": 4.424584958574766e-06,
"loss": 0.0014,
"step": 230
},
{
"epoch": 2.008695652173913,
"grad_norm": 0.15276969969272614,
"learning_rate": 4.4160615149527646e-06,
"loss": 0.0015,
"step": 231
},
{
"epoch": 2.017391304347826,
"grad_norm": 0.19061294198036194,
"learning_rate": 4.407483745779256e-06,
"loss": 0.0007,
"step": 232
},
{
"epoch": 2.026086956521739,
"grad_norm": 0.31356436014175415,
"learning_rate": 4.39885189425731e-06,
"loss": 0.0008,
"step": 233
},
{
"epoch": 2.034782608695652,
"grad_norm": 0.3470340669155121,
"learning_rate": 4.3901662051233755e-06,
"loss": 0.002,
"step": 234
},
{
"epoch": 2.0434782608695654,
"grad_norm": 0.37595614790916443,
"learning_rate": 4.381426924640346e-06,
"loss": 0.0013,
"step": 235
},
{
"epoch": 2.0521739130434784,
"grad_norm": 0.08062059432268143,
"learning_rate": 4.372634300590578e-06,
"loss": 0.0004,
"step": 236
},
{
"epoch": 2.0608695652173914,
"grad_norm": 0.643486499786377,
"learning_rate": 4.363788582268857e-06,
"loss": 0.0012,
"step": 237
},
{
"epoch": 2.0695652173913044,
"grad_norm": 0.06273315846920013,
"learning_rate": 4.35489002047534e-06,
"loss": 0.0003,
"step": 238
},
{
"epoch": 2.0782608695652174,
"grad_norm": 0.20653115212917328,
"learning_rate": 4.345938867508439e-06,
"loss": 0.0013,
"step": 239
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.09760613739490509,
"learning_rate": 4.336935377157668e-06,
"loss": 0.0004,
"step": 240
},
{
"epoch": 2.0956521739130434,
"grad_norm": 0.31116411089897156,
"learning_rate": 4.32787980469645e-06,
"loss": 0.0008,
"step": 241
},
{
"epoch": 2.1043478260869564,
"grad_norm": 0.24201999604701996,
"learning_rate": 4.318772406874873e-06,
"loss": 0.0005,
"step": 242
},
{
"epoch": 2.1130434782608694,
"grad_norm": 0.3692140579223633,
"learning_rate": 4.309613441912421e-06,
"loss": 0.0028,
"step": 243
},
{
"epoch": 2.121739130434783,
"grad_norm": 0.2164740115404129,
"learning_rate": 4.30040316949064e-06,
"loss": 0.0014,
"step": 244
},
{
"epoch": 2.130434782608696,
"grad_norm": 0.05342044681310654,
"learning_rate": 4.291141850745788e-06,
"loss": 0.0002,
"step": 245
},
{
"epoch": 2.139130434782609,
"grad_norm": 0.19637945294380188,
"learning_rate": 4.281829748261422e-06,
"loss": 0.0014,
"step": 246
},
{
"epoch": 2.1478260869565218,
"grad_norm": 0.12176893651485443,
"learning_rate": 4.272467126060954e-06,
"loss": 0.0003,
"step": 247
},
{
"epoch": 2.1565217391304348,
"grad_norm": 0.15314780175685883,
"learning_rate": 4.263054249600172e-06,
"loss": 0.0005,
"step": 248
},
{
"epoch": 2.1652173913043478,
"grad_norm": 0.08518808335065842,
"learning_rate": 4.253591385759705e-06,
"loss": 0.0003,
"step": 249
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.2676379680633545,
"learning_rate": 4.244078802837462e-06,
"loss": 0.0008,
"step": 250
},
{
"epoch": 2.1826086956521737,
"grad_norm": 0.16651524603366852,
"learning_rate": 4.234516770541023e-06,
"loss": 0.0004,
"step": 251
},
{
"epoch": 2.1913043478260867,
"grad_norm": 0.10508158802986145,
"learning_rate": 4.224905559979991e-06,
"loss": 0.0005,
"step": 252
},
{
"epoch": 2.2,
"grad_norm": 0.18296688795089722,
"learning_rate": 4.215245443658307e-06,
"loss": 0.0008,
"step": 253
},
{
"epoch": 2.208695652173913,
"grad_norm": 0.1019248366355896,
"learning_rate": 4.205536695466524e-06,
"loss": 0.0004,
"step": 254
},
{
"epoch": 2.217391304347826,
"grad_norm": 0.31611302495002747,
"learning_rate": 4.1957795906740404e-06,
"loss": 0.0031,
"step": 255
},
{
"epoch": 2.226086956521739,
"grad_norm": 0.1458199918270111,
"learning_rate": 4.1859744059212945e-06,
"loss": 0.0006,
"step": 256
},
{
"epoch": 2.234782608695652,
"grad_norm": 0.11204175651073456,
"learning_rate": 4.176121419211924e-06,
"loss": 0.0001,
"step": 257
},
{
"epoch": 2.243478260869565,
"grad_norm": 0.1589597463607788,
"learning_rate": 4.16622090990488e-06,
"loss": 0.0004,
"step": 258
},
{
"epoch": 2.252173913043478,
"grad_norm": 0.17105410993099213,
"learning_rate": 4.15627315870651e-06,
"loss": 0.001,
"step": 259
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.025157850235700607,
"learning_rate": 4.146278447662597e-06,
"loss": 0.0001,
"step": 260
},
{
"epoch": 2.269565217391304,
"grad_norm": 0.3840286135673523,
"learning_rate": 4.136237060150363e-06,
"loss": 0.0013,
"step": 261
},
{
"epoch": 2.2782608695652176,
"grad_norm": 0.4167248606681824,
"learning_rate": 4.126149280870434e-06,
"loss": 0.0009,
"step": 262
},
{
"epoch": 2.2869565217391306,
"grad_norm": 0.07005563378334045,
"learning_rate": 4.116015395838772e-06,
"loss": 0.0001,
"step": 263
},
{
"epoch": 2.2956521739130435,
"grad_norm": 0.33618828654289246,
"learning_rate": 4.105835692378557e-06,
"loss": 0.0005,
"step": 264
},
{
"epoch": 2.3043478260869565,
"grad_norm": 0.22151310741901398,
"learning_rate": 4.095610459112051e-06,
"loss": 0.0007,
"step": 265
},
{
"epoch": 2.3130434782608695,
"grad_norm": 0.6059427261352539,
"learning_rate": 4.0853399859524066e-06,
"loss": 0.0043,
"step": 266
},
{
"epoch": 2.3217391304347825,
"grad_norm": 0.28740134835243225,
"learning_rate": 4.075024564095452e-06,
"loss": 0.0005,
"step": 267
},
{
"epoch": 2.3304347826086955,
"grad_norm": 0.3348411023616791,
"learning_rate": 4.064664486011433e-06,
"loss": 0.0005,
"step": 268
},
{
"epoch": 2.3391304347826085,
"grad_norm": 0.024883409962058067,
"learning_rate": 4.05426004543672e-06,
"loss": 0.0001,
"step": 269
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.01836530677974224,
"learning_rate": 4.04381153736548e-06,
"loss": 0.0001,
"step": 270
},
{
"epoch": 2.356521739130435,
"grad_norm": 0.09869615733623505,
"learning_rate": 4.033319258041316e-06,
"loss": 0.0002,
"step": 271
},
{
"epoch": 2.365217391304348,
"grad_norm": 0.026213495060801506,
"learning_rate": 4.022783504948862e-06,
"loss": 0.0001,
"step": 272
},
{
"epoch": 2.373913043478261,
"grad_norm": 0.010184271261096,
"learning_rate": 4.012204576805352e-06,
"loss": 0.0,
"step": 273
},
{
"epoch": 2.382608695652174,
"grad_norm": 0.5611284971237183,
"learning_rate": 4.001582773552153e-06,
"loss": 0.0026,
"step": 274
},
{
"epoch": 2.391304347826087,
"grad_norm": 0.16219332814216614,
"learning_rate": 3.990918396346254e-06,
"loss": 0.0002,
"step": 275
},
{
"epoch": 2.4,
"grad_norm": 0.016482144594192505,
"learning_rate": 3.9802117475517335e-06,
"loss": 0.0,
"step": 276
},
{
"epoch": 2.408695652173913,
"grad_norm": 0.7226945757865906,
"learning_rate": 3.969463130731183e-06,
"loss": 0.0036,
"step": 277
},
{
"epoch": 2.417391304347826,
"grad_norm": 0.14231452345848083,
"learning_rate": 3.958672850637104e-06,
"loss": 0.0001,
"step": 278
},
{
"epoch": 2.426086956521739,
"grad_norm": 0.3705194890499115,
"learning_rate": 3.947841213203262e-06,
"loss": 0.0007,
"step": 279
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.2774112820625305,
"learning_rate": 3.936968525536018e-06,
"loss": 0.0025,
"step": 280
},
{
"epoch": 2.4434782608695653,
"grad_norm": 0.4041725695133209,
"learning_rate": 3.926055095905616e-06,
"loss": 0.0015,
"step": 281
},
{
"epoch": 2.4521739130434783,
"grad_norm": 0.2777579128742218,
"learning_rate": 3.9151012337374495e-06,
"loss": 0.0013,
"step": 282
},
{
"epoch": 2.4608695652173913,
"grad_norm": 0.2716004550457001,
"learning_rate": 3.9041072496032805e-06,
"loss": 0.001,
"step": 283
},
{
"epoch": 2.4695652173913043,
"grad_norm": 0.06586720794439316,
"learning_rate": 3.893073455212438e-06,
"loss": 0.0003,
"step": 284
},
{
"epoch": 2.4782608695652173,
"grad_norm": 0.22377079725265503,
"learning_rate": 3.882000163402984e-06,
"loss": 0.0006,
"step": 285
},
{
"epoch": 2.4869565217391303,
"grad_norm": 0.6000126600265503,
"learning_rate": 3.870887688132834e-06,
"loss": 0.0015,
"step": 286
},
{
"epoch": 2.4956521739130437,
"grad_norm": 0.2133096158504486,
"learning_rate": 3.859736344470866e-06,
"loss": 0.0008,
"step": 287
},
{
"epoch": 2.5043478260869563,
"grad_norm": 0.4249497652053833,
"learning_rate": 3.8485464485879785e-06,
"loss": 0.0024,
"step": 288
},
{
"epoch": 2.5130434782608697,
"grad_norm": 0.1991584748029709,
"learning_rate": 3.837318317748134e-06,
"loss": 0.0017,
"step": 289
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.21226167678833008,
"learning_rate": 3.826052270299356e-06,
"loss": 0.0021,
"step": 290
},
{
"epoch": 2.5304347826086957,
"grad_norm": 0.1490064263343811,
"learning_rate": 3.814748625664711e-06,
"loss": 0.0005,
"step": 291
},
{
"epoch": 2.5391304347826087,
"grad_norm": 0.05340861901640892,
"learning_rate": 3.8034077043332463e-06,
"loss": 0.0002,
"step": 292
},
{
"epoch": 2.5478260869565217,
"grad_norm": 0.05676361918449402,
"learning_rate": 3.7920298278509028e-06,
"loss": 0.0002,
"step": 293
},
{
"epoch": 2.5565217391304347,
"grad_norm": 0.033280279487371445,
"learning_rate": 3.7806153188114027e-06,
"loss": 0.0001,
"step": 294
},
{
"epoch": 2.5652173913043477,
"grad_norm": 0.13904324173927307,
"learning_rate": 3.7691645008471e-06,
"loss": 0.0008,
"step": 295
},
{
"epoch": 2.573913043478261,
"grad_norm": 0.2639399766921997,
"learning_rate": 3.7576776986198064e-06,
"loss": 0.0012,
"step": 296
},
{
"epoch": 2.5826086956521737,
"grad_norm": 0.5102531909942627,
"learning_rate": 3.7461552378115833e-06,
"loss": 0.0018,
"step": 297
},
{
"epoch": 2.591304347826087,
"grad_norm": 0.34142470359802246,
"learning_rate": 3.734597445115511e-06,
"loss": 0.0013,
"step": 298
},
{
"epoch": 2.6,
"grad_norm": 0.13264404237270355,
"learning_rate": 3.7230046482264256e-06,
"loss": 0.0004,
"step": 299
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.036736320704221725,
"learning_rate": 3.711377175831626e-06,
"loss": 0.0002,
"step": 300
},
{
"epoch": 2.617391304347826,
"grad_norm": 0.1738046258687973,
"learning_rate": 3.6997153576015552e-06,
"loss": 0.0014,
"step": 301
},
{
"epoch": 2.626086956521739,
"grad_norm": 0.12404807657003403,
"learning_rate": 3.6880195241804567e-06,
"loss": 0.0006,
"step": 302
},
{
"epoch": 2.634782608695652,
"grad_norm": 0.02482556365430355,
"learning_rate": 3.676290007176994e-06,
"loss": 0.0001,
"step": 303
},
{
"epoch": 2.643478260869565,
"grad_norm": 0.05156688019633293,
"learning_rate": 3.6645271391548542e-06,
"loss": 0.0004,
"step": 304
},
{
"epoch": 2.6521739130434785,
"grad_norm": 0.08253592997789383,
"learning_rate": 3.652731253623315e-06,
"loss": 0.0007,
"step": 305
},
{
"epoch": 2.660869565217391,
"grad_norm": 0.10988730192184448,
"learning_rate": 3.6409026850277908e-06,
"loss": 0.0008,
"step": 306
},
{
"epoch": 2.6695652173913045,
"grad_norm": 0.13908977806568146,
"learning_rate": 3.6290417687403485e-06,
"loss": 0.0009,
"step": 307
},
{
"epoch": 2.6782608695652175,
"grad_norm": 0.0466957651078701,
"learning_rate": 3.617148841050202e-06,
"loss": 0.0002,
"step": 308
},
{
"epoch": 2.6869565217391305,
"grad_norm": 0.18122495710849762,
"learning_rate": 3.6052242391541746e-06,
"loss": 0.0017,
"step": 309
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.1811421513557434,
"learning_rate": 3.593268301147139e-06,
"loss": 0.0007,
"step": 310
},
{
"epoch": 2.7043478260869565,
"grad_norm": 0.11010754853487015,
"learning_rate": 3.5812813660124313e-06,
"loss": 0.0004,
"step": 311
},
{
"epoch": 2.7130434782608694,
"grad_norm": 0.19528372585773468,
"learning_rate": 3.5692637736122427e-06,
"loss": 0.0011,
"step": 312
},
{
"epoch": 2.7217391304347824,
"grad_norm": 0.29341739416122437,
"learning_rate": 3.5572158646779787e-06,
"loss": 0.0009,
"step": 313
},
{
"epoch": 2.730434782608696,
"grad_norm": 0.13021045923233032,
"learning_rate": 3.5451379808006014e-06,
"loss": 0.0003,
"step": 314
},
{
"epoch": 2.7391304347826084,
"grad_norm": 0.17587915062904358,
"learning_rate": 3.5330304644209456e-06,
"loss": 0.0002,
"step": 315
},
{
"epoch": 2.747826086956522,
"grad_norm": 0.06901020556688309,
"learning_rate": 3.520893658820007e-06,
"loss": 0.0003,
"step": 316
},
{
"epoch": 2.756521739130435,
"grad_norm": 0.06725924462080002,
"learning_rate": 3.50872790810921e-06,
"loss": 0.0002,
"step": 317
},
{
"epoch": 2.765217391304348,
"grad_norm": 0.49958744645118713,
"learning_rate": 3.4965335572206516e-06,
"loss": 0.0011,
"step": 318
},
{
"epoch": 2.773913043478261,
"grad_norm": 0.04429077357053757,
"learning_rate": 3.484310951897323e-06,
"loss": 0.0001,
"step": 319
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.027020800858736038,
"learning_rate": 3.4720604386833024e-06,
"loss": 0.0001,
"step": 320
},
{
"epoch": 2.791304347826087,
"grad_norm": 0.7644746899604797,
"learning_rate": 3.459782364913935e-06,
"loss": 0.0015,
"step": 321
},
{
"epoch": 2.8,
"grad_norm": 0.07157191634178162,
"learning_rate": 3.447477078705983e-06,
"loss": 0.0003,
"step": 322
},
{
"epoch": 2.8086956521739133,
"grad_norm": 0.009031714871525764,
"learning_rate": 3.4351449289477543e-06,
"loss": 0.0,
"step": 323
},
{
"epoch": 2.8173913043478263,
"grad_norm": 0.02938353642821312,
"learning_rate": 3.4227862652892106e-06,
"loss": 0.0001,
"step": 324
},
{
"epoch": 2.8260869565217392,
"grad_norm": 0.2537288963794708,
"learning_rate": 3.410401438132056e-06,
"loss": 0.0009,
"step": 325
},
{
"epoch": 2.8347826086956522,
"grad_norm": 0.2668483555316925,
"learning_rate": 3.3979907986197996e-06,
"loss": 0.0004,
"step": 326
},
{
"epoch": 2.8434782608695652,
"grad_norm": 0.276434987783432,
"learning_rate": 3.385554698627803e-06,
"loss": 0.0005,
"step": 327
},
{
"epoch": 2.8521739130434782,
"grad_norm": 0.01908428780734539,
"learning_rate": 3.3730934907532997e-06,
"loss": 0.0,
"step": 328
},
{
"epoch": 2.860869565217391,
"grad_norm": 0.09876787662506104,
"learning_rate": 3.3606075283054005e-06,
"loss": 0.0002,
"step": 329
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.24581114947795868,
"learning_rate": 3.3480971652950757e-06,
"loss": 0.0001,
"step": 330
},
{
"epoch": 2.878260869565217,
"grad_norm": 0.028530647978186607,
"learning_rate": 3.3355627564251185e-06,
"loss": 0.0001,
"step": 331
},
{
"epoch": 2.8869565217391306,
"grad_norm": 0.036605849862098694,
"learning_rate": 3.3230046570800866e-06,
"loss": 0.0001,
"step": 332
},
{
"epoch": 2.8956521739130436,
"grad_norm": 0.19084055721759796,
"learning_rate": 3.3104232233162272e-06,
"loss": 0.0003,
"step": 333
},
{
"epoch": 2.9043478260869566,
"grad_norm": 0.02695636637508869,
"learning_rate": 3.2978188118513814e-06,
"loss": 0.0001,
"step": 334
},
{
"epoch": 2.9130434782608696,
"grad_norm": 0.046026572585105896,
"learning_rate": 3.2851917800548726e-06,
"loss": 0.0001,
"step": 335
},
{
"epoch": 2.9217391304347826,
"grad_norm": 0.22791646420955658,
"learning_rate": 3.272542485937369e-06,
"loss": 0.0003,
"step": 336
},
{
"epoch": 2.9304347826086956,
"grad_norm": 0.021017853170633316,
"learning_rate": 3.259871288140738e-06,
"loss": 0.0,
"step": 337
},
{
"epoch": 2.9391304347826086,
"grad_norm": 0.009931406937539577,
"learning_rate": 3.247178545927876e-06,
"loss": 0.0,
"step": 338
},
{
"epoch": 2.9478260869565216,
"grad_norm": 0.2339964658021927,
"learning_rate": 3.234464619172522e-06,
"loss": 0.0002,
"step": 339
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.018152186647057533,
"learning_rate": 3.221729868349053e-06,
"loss": 0.0,
"step": 340
},
{
"epoch": 2.965217391304348,
"grad_norm": 0.013054460287094116,
"learning_rate": 3.208974654522266e-06,
"loss": 0.0,
"step": 341
},
{
"epoch": 2.973913043478261,
"grad_norm": 0.24946050345897675,
"learning_rate": 3.1961993393371405e-06,
"loss": 0.0003,
"step": 342
},
{
"epoch": 2.982608695652174,
"grad_norm": 0.30698102712631226,
"learning_rate": 3.183404285008582e-06,
"loss": 0.001,
"step": 343
},
{
"epoch": 2.991304347826087,
"grad_norm": 0.1075379028916359,
"learning_rate": 3.1705898543111576e-06,
"loss": 0.0005,
"step": 344
},
{
"epoch": 3.0,
"grad_norm": 0.2779870927333832,
"learning_rate": 3.157756410568803e-06,
"loss": 0.0008,
"step": 345
},
{
"epoch": 3.008695652173913,
"grad_norm": 0.1857200264930725,
"learning_rate": 3.14490431764453e-06,
"loss": 0.0002,
"step": 346
},
{
"epoch": 3.017391304347826,
"grad_norm": 0.05824211612343788,
"learning_rate": 3.132033939930101e-06,
"loss": 0.0001,
"step": 347
},
{
"epoch": 3.026086956521739,
"grad_norm": 0.032336920499801636,
"learning_rate": 3.1191456423357047e-06,
"loss": 0.0001,
"step": 348
},
{
"epoch": 3.034782608695652,
"grad_norm": 0.14255207777023315,
"learning_rate": 3.106239790279606e-06,
"loss": 0.0002,
"step": 349
},
{
"epoch": 3.0434782608695654,
"grad_norm": 0.047609731554985046,
"learning_rate": 3.093316749677788e-06,
"loss": 0.0,
"step": 350
},
{
"epoch": 3.0521739130434784,
"grad_norm": 0.06523387879133224,
"learning_rate": 3.0803768869335726e-06,
"loss": 0.0001,
"step": 351
},
{
"epoch": 3.0608695652173914,
"grad_norm": 0.01587139070034027,
"learning_rate": 3.0674205689272378e-06,
"loss": 0.0,
"step": 352
},
{
"epoch": 3.0695652173913044,
"grad_norm": 0.02261805161833763,
"learning_rate": 3.054448163005613e-06,
"loss": 0.0001,
"step": 353
},
{
"epoch": 3.0782608695652174,
"grad_norm": 0.01384738925844431,
"learning_rate": 3.041460036971664e-06,
"loss": 0.0,
"step": 354
},
{
"epoch": 3.0869565217391304,
"grad_norm": 0.03285490348935127,
"learning_rate": 3.028456559074061e-06,
"loss": 0.0001,
"step": 355
},
{
"epoch": 3.0956521739130434,
"grad_norm": 0.0027383833657950163,
"learning_rate": 3.0154380979967456e-06,
"loss": 0.0,
"step": 356
},
{
"epoch": 3.1043478260869564,
"grad_norm": 0.18999581038951874,
"learning_rate": 3.0024050228484713e-06,
"loss": 0.0002,
"step": 357
},
{
"epoch": 3.1130434782608694,
"grad_norm": 0.09708679467439651,
"learning_rate": 2.9893577031523403e-06,
"loss": 0.0005,
"step": 358
},
{
"epoch": 3.121739130434783,
"grad_norm": 0.1416860669851303,
"learning_rate": 2.976296508835326e-06,
"loss": 0.0001,
"step": 359
},
{
"epoch": 3.130434782608696,
"grad_norm": 0.08077018707990646,
"learning_rate": 2.963221810217786e-06,
"loss": 0.0001,
"step": 360
},
{
"epoch": 3.139130434782609,
"grad_norm": 0.0910787582397461,
"learning_rate": 2.9501339780029614e-06,
"loss": 0.0005,
"step": 361
},
{
"epoch": 3.1478260869565218,
"grad_norm": 0.08710524439811707,
"learning_rate": 2.937033383266466e-06,
"loss": 0.0004,
"step": 362
},
{
"epoch": 3.1565217391304348,
"grad_norm": 0.00581876328215003,
"learning_rate": 2.923920397445766e-06,
"loss": 0.0,
"step": 363
},
{
"epoch": 3.1652173913043478,
"grad_norm": 0.2737768292427063,
"learning_rate": 2.910795392329649e-06,
"loss": 0.0005,
"step": 364
},
{
"epoch": 3.1739130434782608,
"grad_norm": 0.003364310134202242,
"learning_rate": 2.8976587400476804e-06,
"loss": 0.0,
"step": 365
},
{
"epoch": 3.1826086956521737,
"grad_norm": 0.004156508948653936,
"learning_rate": 2.884510813059657e-06,
"loss": 0.0,
"step": 366
},
{
"epoch": 3.1913043478260867,
"grad_norm": 0.01153493206948042,
"learning_rate": 2.871351984145042e-06,
"loss": 0.0,
"step": 367
},
{
"epoch": 3.2,
"grad_norm": 0.003978746011853218,
"learning_rate": 2.8581826263923993e-06,
"loss": 0.0,
"step": 368
},
{
"epoch": 3.208695652173913,
"grad_norm": 0.007011134643107653,
"learning_rate": 2.8450031131888147e-06,
"loss": 0.0,
"step": 369
},
{
"epoch": 3.217391304347826,
"grad_norm": 0.0030873024370521307,
"learning_rate": 2.8318138182093053e-06,
"loss": 0.0,
"step": 370
},
{
"epoch": 3.226086956521739,
"grad_norm": 0.005853947252035141,
"learning_rate": 2.8186151154062314e-06,
"loss": 0.0,
"step": 371
},
{
"epoch": 3.234782608695652,
"grad_norm": 0.005672338884323835,
"learning_rate": 2.8054073789986884e-06,
"loss": 0.0,
"step": 372
},
{
"epoch": 3.243478260869565,
"grad_norm": 0.002228250727057457,
"learning_rate": 2.792190983461902e-06,
"loss": 0.0,
"step": 373
},
{
"epoch": 3.252173913043478,
"grad_norm": 0.03767574205994606,
"learning_rate": 2.7789663035166035e-06,
"loss": 0.0001,
"step": 374
},
{
"epoch": 3.260869565217391,
"grad_norm": 0.04303343966603279,
"learning_rate": 2.7657337141184137e-06,
"loss": 0.0,
"step": 375
},
{
"epoch": 3.269565217391304,
"grad_norm": 1.011793613433838,
"learning_rate": 2.7524935904472056e-06,
"loss": 0.0003,
"step": 376
},
{
"epoch": 3.2782608695652176,
"grad_norm": 0.003984061535447836,
"learning_rate": 2.73924630789647e-06,
"loss": 0.0,
"step": 377
},
{
"epoch": 3.2869565217391306,
"grad_norm": 0.0033289589919149876,
"learning_rate": 2.7259922420626705e-06,
"loss": 0.0,
"step": 378
},
{
"epoch": 3.2956521739130435,
"grad_norm": 0.008105777204036713,
"learning_rate": 2.7127317687345973e-06,
"loss": 0.0,
"step": 379
},
{
"epoch": 3.3043478260869565,
"grad_norm": 0.0022414308041334152,
"learning_rate": 2.699465263882708e-06,
"loss": 0.0,
"step": 380
},
{
"epoch": 3.3130434782608695,
"grad_norm": 0.004690825939178467,
"learning_rate": 2.686193103648472e-06,
"loss": 0.0,
"step": 381
},
{
"epoch": 3.3217391304347825,
"grad_norm": 0.07751515507698059,
"learning_rate": 2.672915664333704e-06,
"loss": 0.0004,
"step": 382
},
{
"epoch": 3.3304347826086955,
"grad_norm": 0.0041257767006754875,
"learning_rate": 2.6596333223898934e-06,
"loss": 0.0,
"step": 383
},
{
"epoch": 3.3391304347826085,
"grad_norm": 0.001377386855892837,
"learning_rate": 2.6463464544075344e-06,
"loss": 0.0,
"step": 384
},
{
"epoch": 3.3478260869565215,
"grad_norm": 0.08914496004581451,
"learning_rate": 2.6330554371054466e-06,
"loss": 0.0001,
"step": 385
},
{
"epoch": 3.356521739130435,
"grad_norm": 0.2693479359149933,
"learning_rate": 2.6197606473200924e-06,
"loss": 0.0002,
"step": 386
},
{
"epoch": 3.365217391304348,
"grad_norm": 0.009363976307213306,
"learning_rate": 2.6064624619948966e-06,
"loss": 0.0,
"step": 387
},
{
"epoch": 3.373913043478261,
"grad_norm": 0.0025562141090631485,
"learning_rate": 2.593161258169554e-06,
"loss": 0.0,
"step": 388
},
{
"epoch": 3.382608695652174,
"grad_norm": 0.031549256294965744,
"learning_rate": 2.579857412969345e-06,
"loss": 0.0,
"step": 389
},
{
"epoch": 3.391304347826087,
"grad_norm": 0.1631297767162323,
"learning_rate": 2.5665513035944373e-06,
"loss": 0.0004,
"step": 390
},
{
"epoch": 3.4,
"grad_norm": 0.0037248400039970875,
"learning_rate": 2.5532433073091967e-06,
"loss": 0.0,
"step": 391
},
{
"epoch": 3.408695652173913,
"grad_norm": 0.010381661355495453,
"learning_rate": 2.539933801431487e-06,
"loss": 0.0,
"step": 392
},
{
"epoch": 3.417391304347826,
"grad_norm": 0.01955062709748745,
"learning_rate": 2.5266231633219733e-06,
"loss": 0.0,
"step": 393
},
{
"epoch": 3.426086956521739,
"grad_norm": 0.00635969964787364,
"learning_rate": 2.513311770373421e-06,
"loss": 0.0,
"step": 394
},
{
"epoch": 3.4347826086956523,
"grad_norm": 0.07282191514968872,
"learning_rate": 2.5e-06,
"loss": 0.0001,
"step": 395
},
{
"epoch": 3.4434782608695653,
"grad_norm": 0.00482774805277586,
"learning_rate": 2.4866882296265797e-06,
"loss": 0.0,
"step": 396
},
{
"epoch": 3.4521739130434783,
"grad_norm": 0.006832276936620474,
"learning_rate": 2.473376836678028e-06,
"loss": 0.0,
"step": 397
},
{
"epoch": 3.4608695652173913,
"grad_norm": 0.0018247866537421942,
"learning_rate": 2.4600661985685132e-06,
"loss": 0.0,
"step": 398
},
{
"epoch": 3.4695652173913043,
"grad_norm": 0.056768789887428284,
"learning_rate": 2.446756692690804e-06,
"loss": 0.0001,
"step": 399
},
{
"epoch": 3.4782608695652173,
"grad_norm": 0.013176783919334412,
"learning_rate": 2.4334486964055635e-06,
"loss": 0.0,
"step": 400
},
{
"epoch": 3.4869565217391303,
"grad_norm": 0.012747037224471569,
"learning_rate": 2.4201425870306566e-06,
"loss": 0.0,
"step": 401
},
{
"epoch": 3.4956521739130437,
"grad_norm": 0.013687407597899437,
"learning_rate": 2.406838741830446e-06,
"loss": 0.0,
"step": 402
},
{
"epoch": 3.5043478260869563,
"grad_norm": 0.23240718245506287,
"learning_rate": 2.393537538005104e-06,
"loss": 0.0001,
"step": 403
},
{
"epoch": 3.5130434782608697,
"grad_norm": 0.008161540143191814,
"learning_rate": 2.380239352679908e-06,
"loss": 0.0,
"step": 404
},
{
"epoch": 3.5217391304347827,
"grad_norm": 0.05203310027718544,
"learning_rate": 2.3669445628945543e-06,
"loss": 0.0,
"step": 405
},
{
"epoch": 3.5304347826086957,
"grad_norm": 0.007062139920890331,
"learning_rate": 2.3536535455924656e-06,
"loss": 0.0,
"step": 406
},
{
"epoch": 3.5391304347826087,
"grad_norm": 0.01181522011756897,
"learning_rate": 2.340366677610107e-06,
"loss": 0.0,
"step": 407
},
{
"epoch": 3.5478260869565217,
"grad_norm": 0.00283506466075778,
"learning_rate": 2.327084335666297e-06,
"loss": 0.0,
"step": 408
},
{
"epoch": 3.5565217391304347,
"grad_norm": 0.0024429778568446636,
"learning_rate": 2.313806896351529e-06,
"loss": 0.0,
"step": 409
},
{
"epoch": 3.5652173913043477,
"grad_norm": 0.004740845412015915,
"learning_rate": 2.300534736117292e-06,
"loss": 0.0,
"step": 410
},
{
"epoch": 3.573913043478261,
"grad_norm": 0.04463421553373337,
"learning_rate": 2.2872682312654035e-06,
"loss": 0.0002,
"step": 411
},
{
"epoch": 3.5826086956521737,
"grad_norm": 0.005315855145454407,
"learning_rate": 2.2740077579373303e-06,
"loss": 0.0,
"step": 412
},
{
"epoch": 3.591304347826087,
"grad_norm": 0.002299713436514139,
"learning_rate": 2.2607536921035313e-06,
"loss": 0.0,
"step": 413
},
{
"epoch": 3.6,
"grad_norm": 0.01868036948144436,
"learning_rate": 2.247506409552795e-06,
"loss": 0.0,
"step": 414
},
{
"epoch": 3.608695652173913,
"grad_norm": 0.01010120939463377,
"learning_rate": 2.234266285881587e-06,
"loss": 0.0,
"step": 415
},
{
"epoch": 3.617391304347826,
"grad_norm": 0.005287417210638523,
"learning_rate": 2.221033696483397e-06,
"loss": 0.0,
"step": 416
},
{
"epoch": 3.626086956521739,
"grad_norm": 0.005691695027053356,
"learning_rate": 2.2078090165380992e-06,
"loss": 0.0,
"step": 417
},
{
"epoch": 3.634782608695652,
"grad_norm": 0.0028878487646579742,
"learning_rate": 2.194592621001311e-06,
"loss": 0.0,
"step": 418
},
{
"epoch": 3.643478260869565,
"grad_norm": 0.0035773522686213255,
"learning_rate": 2.1813848845937695e-06,
"loss": 0.0,
"step": 419
},
{
"epoch": 3.6521739130434785,
"grad_norm": 0.0027882216963917017,
"learning_rate": 2.1681861817906955e-06,
"loss": 0.0,
"step": 420
},
{
"epoch": 3.660869565217391,
"grad_norm": 0.00462096743285656,
"learning_rate": 2.1549968868111866e-06,
"loss": 0.0,
"step": 421
},
{
"epoch": 3.6695652173913045,
"grad_norm": 0.00283925817348063,
"learning_rate": 2.141817373607601e-06,
"loss": 0.0,
"step": 422
},
{
"epoch": 3.6782608695652175,
"grad_norm": 0.002093710470944643,
"learning_rate": 2.1286480158549583e-06,
"loss": 0.0,
"step": 423
},
{
"epoch": 3.6869565217391305,
"grad_norm": 0.0020851960871368647,
"learning_rate": 2.1154891869403436e-06,
"loss": 0.0,
"step": 424
},
{
"epoch": 3.6956521739130435,
"grad_norm": 0.005665747448801994,
"learning_rate": 2.1023412599523204e-06,
"loss": 0.0,
"step": 425
},
{
"epoch": 3.7043478260869565,
"grad_norm": 0.00797071773558855,
"learning_rate": 2.089204607670352e-06,
"loss": 0.0,
"step": 426
},
{
"epoch": 3.7130434782608694,
"grad_norm": 0.0011778065236285329,
"learning_rate": 2.0760796025542342e-06,
"loss": 0.0,
"step": 427
},
{
"epoch": 3.7217391304347824,
"grad_norm": 0.0023867737036198378,
"learning_rate": 2.0629666167335344e-06,
"loss": 0.0,
"step": 428
},
{
"epoch": 3.730434782608696,
"grad_norm": 0.00638067489489913,
"learning_rate": 2.0498660219970395e-06,
"loss": 0.0,
"step": 429
},
{
"epoch": 3.7391304347826084,
"grad_norm": 0.012454311363399029,
"learning_rate": 2.0367781897822147e-06,
"loss": 0.0,
"step": 430
},
{
"epoch": 3.747826086956522,
"grad_norm": 0.002240461064502597,
"learning_rate": 2.0237034911646745e-06,
"loss": 0.0,
"step": 431
},
{
"epoch": 3.756521739130435,
"grad_norm": 0.001390191144309938,
"learning_rate": 2.0106422968476606e-06,
"loss": 0.0,
"step": 432
},
{
"epoch": 3.765217391304348,
"grad_norm": 0.003424042835831642,
"learning_rate": 1.9975949771515296e-06,
"loss": 0.0,
"step": 433
},
{
"epoch": 3.773913043478261,
"grad_norm": 0.0024630806874483824,
"learning_rate": 1.9845619020032552e-06,
"loss": 0.0,
"step": 434
},
{
"epoch": 3.782608695652174,
"grad_norm": 0.0031529609113931656,
"learning_rate": 1.9715434409259393e-06,
"loss": 0.0,
"step": 435
},
{
"epoch": 3.791304347826087,
"grad_norm": 0.005373111926019192,
"learning_rate": 1.958539963028337e-06,
"loss": 0.0,
"step": 436
},
{
"epoch": 3.8,
"grad_norm": 0.0037128408439457417,
"learning_rate": 1.9455518369943873e-06,
"loss": 0.0,
"step": 437
},
{
"epoch": 3.8086956521739133,
"grad_norm": 0.002308671362698078,
"learning_rate": 1.9325794310727626e-06,
"loss": 0.0,
"step": 438
},
{
"epoch": 3.8173913043478263,
"grad_norm": 0.002887170063331723,
"learning_rate": 1.9196231130664282e-06,
"loss": 0.0,
"step": 439
},
{
"epoch": 3.8260869565217392,
"grad_norm": 0.003565647406503558,
"learning_rate": 1.906683250322213e-06,
"loss": 0.0,
"step": 440
},
{
"epoch": 3.8347826086956522,
"grad_norm": 0.0023865115363150835,
"learning_rate": 1.8937602097203945e-06,
"loss": 0.0,
"step": 441
},
{
"epoch": 3.8434782608695652,
"grad_norm": 0.004892145283520222,
"learning_rate": 1.8808543576642966e-06,
"loss": 0.0,
"step": 442
},
{
"epoch": 3.8521739130434782,
"grad_norm": 0.0018583799246698618,
"learning_rate": 1.8679660600698996e-06,
"loss": 0.0,
"step": 443
},
{
"epoch": 3.860869565217391,
"grad_norm": 0.15015952289104462,
"learning_rate": 1.8550956823554708e-06,
"loss": 0.0003,
"step": 444
},
{
"epoch": 3.869565217391304,
"grad_norm": 0.0015204929513856769,
"learning_rate": 1.8422435894311973e-06,
"loss": 0.0,
"step": 445
},
{
"epoch": 3.878260869565217,
"grad_norm": 0.0579560324549675,
"learning_rate": 1.8294101456888433e-06,
"loss": 0.0001,
"step": 446
},
{
"epoch": 3.8869565217391306,
"grad_norm": 0.002745965728536248,
"learning_rate": 1.8165957149914182e-06,
"loss": 0.0,
"step": 447
},
{
"epoch": 3.8956521739130436,
"grad_norm": 0.00269194389693439,
"learning_rate": 1.8038006606628599e-06,
"loss": 0.0,
"step": 448
},
{
"epoch": 3.9043478260869566,
"grad_norm": 0.003470352618023753,
"learning_rate": 1.7910253454777346e-06,
"loss": 0.0,
"step": 449
},
{
"epoch": 3.9130434782608696,
"grad_norm": 0.007889937609434128,
"learning_rate": 1.7782701316509482e-06,
"loss": 0.0,
"step": 450
},
{
"epoch": 3.9217391304347826,
"grad_norm": 0.002997358562424779,
"learning_rate": 1.7655353808274795e-06,
"loss": 0.0,
"step": 451
},
{
"epoch": 3.9304347826086956,
"grad_norm": 0.06782340258359909,
"learning_rate": 1.752821454072124e-06,
"loss": 0.0003,
"step": 452
},
{
"epoch": 3.9391304347826086,
"grad_norm": 0.001661222893744707,
"learning_rate": 1.7401287118592626e-06,
"loss": 0.0,
"step": 453
},
{
"epoch": 3.9478260869565216,
"grad_norm": 0.0512867271900177,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.0,
"step": 454
},
{
"epoch": 3.9565217391304346,
"grad_norm": 0.0020310841500759125,
"learning_rate": 1.7148082199451288e-06,
"loss": 0.0,
"step": 455
},
{
"epoch": 3.965217391304348,
"grad_norm": 0.0014185906620696187,
"learning_rate": 1.7021811881486186e-06,
"loss": 0.0,
"step": 456
},
{
"epoch": 3.973913043478261,
"grad_norm": 0.00783043447881937,
"learning_rate": 1.6895767766837734e-06,
"loss": 0.0,
"step": 457
},
{
"epoch": 3.982608695652174,
"grad_norm": 0.015537451021373272,
"learning_rate": 1.6769953429199142e-06,
"loss": 0.0,
"step": 458
},
{
"epoch": 3.991304347826087,
"grad_norm": 0.003787545021623373,
"learning_rate": 1.6644372435748823e-06,
"loss": 0.0,
"step": 459
},
{
"epoch": 4.0,
"grad_norm": 0.0017769169062376022,
"learning_rate": 1.6519028347049242e-06,
"loss": 0.0,
"step": 460
}
],
"logging_steps": 1,
"max_steps": 690,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 115,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1611348682774938e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}