{
"best_global_step": 300,
"best_metric": 0.11345648020505905,
"best_model_checkpoint": "./qwen-math-lora/checkpoint-300",
"epoch": 1.5800922874093606,
"eval_steps": 50,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005273566249176005,
"grad_norm": 1.5119972229003906,
"learning_rate": 0.0,
"loss": 1.6366,
"step": 1
},
{
"epoch": 0.01054713249835201,
"grad_norm": 1.4765245914459229,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.6667,
"step": 2
},
{
"epoch": 0.015820698747528016,
"grad_norm": 1.4236232042312622,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.6576,
"step": 3
},
{
"epoch": 0.02109426499670402,
"grad_norm": 1.5235203504562378,
"learning_rate": 3e-06,
"loss": 1.6572,
"step": 4
},
{
"epoch": 0.026367831245880026,
"grad_norm": 1.4115797281265259,
"learning_rate": 4.000000000000001e-06,
"loss": 1.5819,
"step": 5
},
{
"epoch": 0.03164139749505603,
"grad_norm": 1.4249836206436157,
"learning_rate": 5e-06,
"loss": 1.6502,
"step": 6
},
{
"epoch": 0.03691496374423204,
"grad_norm": 1.3071649074554443,
"learning_rate": 6e-06,
"loss": 1.6807,
"step": 7
},
{
"epoch": 0.04218852999340804,
"grad_norm": 1.2610119581222534,
"learning_rate": 7.000000000000001e-06,
"loss": 1.5825,
"step": 8
},
{
"epoch": 0.047462096242584045,
"grad_norm": 1.235508918762207,
"learning_rate": 8.000000000000001e-06,
"loss": 1.5464,
"step": 9
},
{
"epoch": 0.05273566249176005,
"grad_norm": 1.151235580444336,
"learning_rate": 9e-06,
"loss": 1.5856,
"step": 10
},
{
"epoch": 0.05800922874093606,
"grad_norm": 1.058812141418457,
"learning_rate": 1e-05,
"loss": 1.542,
"step": 11
},
{
"epoch": 0.06328279499011207,
"grad_norm": 0.935869038105011,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.5216,
"step": 12
},
{
"epoch": 0.06855636123928807,
"grad_norm": 0.8530864715576172,
"learning_rate": 1.2e-05,
"loss": 1.4271,
"step": 13
},
{
"epoch": 0.07382992748846408,
"grad_norm": 0.836365282535553,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.4095,
"step": 14
},
{
"epoch": 0.07910349373764008,
"grad_norm": 0.7369374632835388,
"learning_rate": 1.4000000000000001e-05,
"loss": 1.3664,
"step": 15
},
{
"epoch": 0.08437705998681608,
"grad_norm": 0.741736888885498,
"learning_rate": 1.5e-05,
"loss": 1.4031,
"step": 16
},
{
"epoch": 0.08965062623599208,
"grad_norm": 0.6538688540458679,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.3768,
"step": 17
},
{
"epoch": 0.09492419248516809,
"grad_norm": 0.6126262545585632,
"learning_rate": 1.7000000000000003e-05,
"loss": 1.3554,
"step": 18
},
{
"epoch": 0.1001977587343441,
"grad_norm": 0.5822679996490479,
"learning_rate": 1.8e-05,
"loss": 1.2992,
"step": 19
},
{
"epoch": 0.1054713249835201,
"grad_norm": 0.5410017967224121,
"learning_rate": 1.9e-05,
"loss": 1.2494,
"step": 20
},
{
"epoch": 0.11074489123269611,
"grad_norm": 0.5416837334632874,
"learning_rate": 2e-05,
"loss": 1.2604,
"step": 21
},
{
"epoch": 0.11601845748187212,
"grad_norm": 0.5807645320892334,
"learning_rate": 2.1e-05,
"loss": 1.178,
"step": 22
},
{
"epoch": 0.12129202373104812,
"grad_norm": 0.5549229383468628,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.1861,
"step": 23
},
{
"epoch": 0.12656558998022413,
"grad_norm": 0.5763499736785889,
"learning_rate": 2.3000000000000003e-05,
"loss": 1.2242,
"step": 24
},
{
"epoch": 0.13183915622940012,
"grad_norm": 0.5674681663513184,
"learning_rate": 2.4e-05,
"loss": 1.1442,
"step": 25
},
{
"epoch": 0.13711272247857614,
"grad_norm": 0.5441560745239258,
"learning_rate": 2.5e-05,
"loss": 1.0821,
"step": 26
},
{
"epoch": 0.14238628872775214,
"grad_norm": 0.58034348487854,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.1068,
"step": 27
},
{
"epoch": 0.14765985497692816,
"grad_norm": 0.563574492931366,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.0316,
"step": 28
},
{
"epoch": 0.15293342122610415,
"grad_norm": 0.5922898054122925,
"learning_rate": 2.8000000000000003e-05,
"loss": 1.0707,
"step": 29
},
{
"epoch": 0.15820698747528017,
"grad_norm": 0.46859392523765564,
"learning_rate": 2.9e-05,
"loss": 0.97,
"step": 30
},
{
"epoch": 0.16348055372445616,
"grad_norm": 0.7508406639099121,
"learning_rate": 3e-05,
"loss": 0.9857,
"step": 31
},
{
"epoch": 0.16875411997363216,
"grad_norm": 0.6806529760360718,
"learning_rate": 3.1e-05,
"loss": 0.9589,
"step": 32
},
{
"epoch": 0.17402768622280818,
"grad_norm": 0.35177281498908997,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.9319,
"step": 33
},
{
"epoch": 0.17930125247198417,
"grad_norm": 0.35340362787246704,
"learning_rate": 3.3e-05,
"loss": 0.9878,
"step": 34
},
{
"epoch": 0.1845748187211602,
"grad_norm": 0.3041383624076843,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.9501,
"step": 35
},
{
"epoch": 0.18984838497033618,
"grad_norm": 0.29335305094718933,
"learning_rate": 3.5e-05,
"loss": 0.8826,
"step": 36
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.2781873345375061,
"learning_rate": 3.6e-05,
"loss": 0.9757,
"step": 37
},
{
"epoch": 0.2003955174686882,
"grad_norm": 0.3608724772930145,
"learning_rate": 3.7e-05,
"loss": 0.9229,
"step": 38
},
{
"epoch": 0.20566908371786422,
"grad_norm": 0.2756713032722473,
"learning_rate": 3.8e-05,
"loss": 0.8868,
"step": 39
},
{
"epoch": 0.2109426499670402,
"grad_norm": 0.3764660060405731,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.9301,
"step": 40
},
{
"epoch": 0.21621621621621623,
"grad_norm": 0.27100852131843567,
"learning_rate": 4e-05,
"loss": 0.9014,
"step": 41
},
{
"epoch": 0.22148978246539222,
"grad_norm": 0.27153897285461426,
"learning_rate": 4.1e-05,
"loss": 0.8569,
"step": 42
},
{
"epoch": 0.2267633487145682,
"grad_norm": 0.2656016945838928,
"learning_rate": 4.2e-05,
"loss": 0.8353,
"step": 43
},
{
"epoch": 0.23203691496374423,
"grad_norm": 0.30224132537841797,
"learning_rate": 4.3e-05,
"loss": 0.8531,
"step": 44
},
{
"epoch": 0.23731048121292023,
"grad_norm": 0.2992110252380371,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.9029,
"step": 45
},
{
"epoch": 0.24258404746209625,
"grad_norm": 0.2795073091983795,
"learning_rate": 4.5e-05,
"loss": 0.8582,
"step": 46
},
{
"epoch": 0.24785761371127224,
"grad_norm": 0.27543389797210693,
"learning_rate": 4.600000000000001e-05,
"loss": 0.7899,
"step": 47
},
{
"epoch": 0.25313117996044826,
"grad_norm": 0.26102226972579956,
"learning_rate": 4.7e-05,
"loss": 0.7705,
"step": 48
},
{
"epoch": 0.2584047462096243,
"grad_norm": 0.32240045070648193,
"learning_rate": 4.8e-05,
"loss": 0.7833,
"step": 49
},
{
"epoch": 0.26367831245880025,
"grad_norm": 0.2760595679283142,
"learning_rate": 4.9e-05,
"loss": 0.8035,
"step": 50
},
{
"epoch": 0.26367831245880025,
"eval_loss": 0.8153137564659119,
"eval_runtime": 133.8438,
"eval_samples_per_second": 8.002,
"eval_steps_per_second": 2.002,
"step": 50
},
{
"epoch": 0.26895187870797627,
"grad_norm": 0.29733768105506897,
"learning_rate": 5e-05,
"loss": 0.8767,
"step": 51
},
{
"epoch": 0.2742254449571523,
"grad_norm": 0.4476633667945862,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.7695,
"step": 52
},
{
"epoch": 0.2794990112063283,
"grad_norm": 0.3744952380657196,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.8464,
"step": 53
},
{
"epoch": 0.28477257745550427,
"grad_norm": 0.2564408779144287,
"learning_rate": 5.300000000000001e-05,
"loss": 0.8191,
"step": 54
},
{
"epoch": 0.2900461437046803,
"grad_norm": 0.2613051235675812,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.7771,
"step": 55
},
{
"epoch": 0.2953197099538563,
"grad_norm": 0.4838894307613373,
"learning_rate": 5.500000000000001e-05,
"loss": 0.7751,
"step": 56
},
{
"epoch": 0.3005932762030323,
"grad_norm": 0.28951677680015564,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.7704,
"step": 57
},
{
"epoch": 0.3058668424522083,
"grad_norm": 0.2760978043079376,
"learning_rate": 5.6999999999999996e-05,
"loss": 0.8058,
"step": 58
},
{
"epoch": 0.3111404087013843,
"grad_norm": 0.2781215310096741,
"learning_rate": 5.8e-05,
"loss": 0.7634,
"step": 59
},
{
"epoch": 0.31641397495056034,
"grad_norm": 0.25308936834335327,
"learning_rate": 5.9e-05,
"loss": 0.7516,
"step": 60
},
{
"epoch": 0.3216875411997363,
"grad_norm": 0.3314322531223297,
"learning_rate": 6e-05,
"loss": 0.7492,
"step": 61
},
{
"epoch": 0.3269611074489123,
"grad_norm": 0.26924365758895874,
"learning_rate": 6.1e-05,
"loss": 0.7459,
"step": 62
},
{
"epoch": 0.33223467369808835,
"grad_norm": 0.26491013169288635,
"learning_rate": 6.2e-05,
"loss": 0.7338,
"step": 63
},
{
"epoch": 0.3375082399472643,
"grad_norm": 0.28656676411628723,
"learning_rate": 6.3e-05,
"loss": 0.7454,
"step": 64
},
{
"epoch": 0.34278180619644033,
"grad_norm": 0.3129251301288605,
"learning_rate": 6.400000000000001e-05,
"loss": 0.751,
"step": 65
},
{
"epoch": 0.34805537244561635,
"grad_norm": 0.3116537928581238,
"learning_rate": 6.500000000000001e-05,
"loss": 0.6941,
"step": 66
},
{
"epoch": 0.35332893869479237,
"grad_norm": 0.3021077513694763,
"learning_rate": 6.6e-05,
"loss": 0.7172,
"step": 67
},
{
"epoch": 0.35860250494396834,
"grad_norm": 0.2933245599269867,
"learning_rate": 6.7e-05,
"loss": 0.7293,
"step": 68
},
{
"epoch": 0.36387607119314436,
"grad_norm": 0.32778868079185486,
"learning_rate": 6.800000000000001e-05,
"loss": 0.6935,
"step": 69
},
{
"epoch": 0.3691496374423204,
"grad_norm": 0.286576509475708,
"learning_rate": 6.9e-05,
"loss": 0.6441,
"step": 70
},
{
"epoch": 0.3744232036914964,
"grad_norm": 0.27806833386421204,
"learning_rate": 7e-05,
"loss": 0.7246,
"step": 71
},
{
"epoch": 0.37969676994067236,
"grad_norm": 0.31078678369522095,
"learning_rate": 7.1e-05,
"loss": 0.6322,
"step": 72
},
{
"epoch": 0.3849703361898484,
"grad_norm": 0.3146444261074066,
"learning_rate": 7.2e-05,
"loss": 0.6872,
"step": 73
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.3151572346687317,
"learning_rate": 7.3e-05,
"loss": 0.693,
"step": 74
},
{
"epoch": 0.39551746868820037,
"grad_norm": 0.33185523748397827,
"learning_rate": 7.4e-05,
"loss": 0.6937,
"step": 75
},
{
"epoch": 0.4007910349373764,
"grad_norm": 0.3287936747074127,
"learning_rate": 7.500000000000001e-05,
"loss": 0.7058,
"step": 76
},
{
"epoch": 0.4060646011865524,
"grad_norm": 0.34535712003707886,
"learning_rate": 7.6e-05,
"loss": 0.6538,
"step": 77
},
{
"epoch": 0.41133816743572843,
"grad_norm": 0.34255126118659973,
"learning_rate": 7.7e-05,
"loss": 0.674,
"step": 78
},
{
"epoch": 0.4166117336849044,
"grad_norm": 0.7276009321212769,
"learning_rate": 7.800000000000001e-05,
"loss": 0.6221,
"step": 79
},
{
"epoch": 0.4218852999340804,
"grad_norm": 0.41575613617897034,
"learning_rate": 7.900000000000001e-05,
"loss": 0.5828,
"step": 80
},
{
"epoch": 0.42715886618325644,
"grad_norm": 0.33262866735458374,
"learning_rate": 8e-05,
"loss": 0.5782,
"step": 81
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.33510202169418335,
"learning_rate": 8.1e-05,
"loss": 0.5731,
"step": 82
},
{
"epoch": 0.4377059986816084,
"grad_norm": 0.3654046058654785,
"learning_rate": 8.2e-05,
"loss": 0.5739,
"step": 83
},
{
"epoch": 0.44297956493078444,
"grad_norm": 0.3834691643714905,
"learning_rate": 8.3e-05,
"loss": 0.5629,
"step": 84
},
{
"epoch": 0.44825313117996046,
"grad_norm": 0.3804622292518616,
"learning_rate": 8.4e-05,
"loss": 0.5433,
"step": 85
},
{
"epoch": 0.4535266974291364,
"grad_norm": 0.3488738238811493,
"learning_rate": 8.5e-05,
"loss": 0.5517,
"step": 86
},
{
"epoch": 0.45880026367831245,
"grad_norm": 0.38344502449035645,
"learning_rate": 8.6e-05,
"loss": 0.576,
"step": 87
},
{
"epoch": 0.46407382992748847,
"grad_norm": 0.3855077624320984,
"learning_rate": 8.7e-05,
"loss": 0.527,
"step": 88
},
{
"epoch": 0.4693473961766645,
"grad_norm": 0.3912067711353302,
"learning_rate": 8.800000000000001e-05,
"loss": 0.5165,
"step": 89
},
{
"epoch": 0.47462096242584045,
"grad_norm": 0.4480763077735901,
"learning_rate": 8.900000000000001e-05,
"loss": 0.5103,
"step": 90
},
{
"epoch": 0.4798945286750165,
"grad_norm": 0.4126266539096832,
"learning_rate": 9e-05,
"loss": 0.5246,
"step": 91
},
{
"epoch": 0.4851680949241925,
"grad_norm": 0.41678905487060547,
"learning_rate": 9.1e-05,
"loss": 0.55,
"step": 92
},
{
"epoch": 0.4904416611733685,
"grad_norm": 0.42350953817367554,
"learning_rate": 9.200000000000001e-05,
"loss": 0.493,
"step": 93
},
{
"epoch": 0.4957152274225445,
"grad_norm": 0.44608232378959656,
"learning_rate": 9.300000000000001e-05,
"loss": 0.5233,
"step": 94
},
{
"epoch": 0.5009887936717206,
"grad_norm": 0.44571366906166077,
"learning_rate": 9.4e-05,
"loss": 0.56,
"step": 95
},
{
"epoch": 0.5062623599208965,
"grad_norm": 0.44927364587783813,
"learning_rate": 9.5e-05,
"loss": 0.5191,
"step": 96
},
{
"epoch": 0.5115359261700725,
"grad_norm": 0.5781615376472473,
"learning_rate": 9.6e-05,
"loss": 0.5259,
"step": 97
},
{
"epoch": 0.5168094924192486,
"grad_norm": 0.4781758785247803,
"learning_rate": 9.7e-05,
"loss": 0.5061,
"step": 98
},
{
"epoch": 0.5220830586684245,
"grad_norm": 0.46505609154701233,
"learning_rate": 9.8e-05,
"loss": 0.4804,
"step": 99
},
{
"epoch": 0.5273566249176005,
"grad_norm": 0.7176192998886108,
"learning_rate": 9.900000000000001e-05,
"loss": 0.4287,
"step": 100
},
{
"epoch": 0.5273566249176005,
"eval_loss": 0.4786125123500824,
"eval_runtime": 133.8519,
"eval_samples_per_second": 8.001,
"eval_steps_per_second": 2.002,
"step": 100
},
{
"epoch": 0.5326301911667766,
"grad_norm": 0.4295816421508789,
"learning_rate": 0.0001,
"loss": 0.4786,
"step": 101
},
{
"epoch": 0.5379037574159525,
"grad_norm": 0.47132614254951477,
"learning_rate": 0.000101,
"loss": 0.4703,
"step": 102
},
{
"epoch": 0.5431773236651285,
"grad_norm": 0.5543473958969116,
"learning_rate": 0.00010200000000000001,
"loss": 0.4401,
"step": 103
},
{
"epoch": 0.5484508899143046,
"grad_norm": 0.498334139585495,
"learning_rate": 0.00010300000000000001,
"loss": 0.4324,
"step": 104
},
{
"epoch": 0.5537244561634805,
"grad_norm": 0.49423035979270935,
"learning_rate": 0.00010400000000000001,
"loss": 0.4316,
"step": 105
},
{
"epoch": 0.5589980224126566,
"grad_norm": 5.340365409851074,
"learning_rate": 0.000105,
"loss": 0.4713,
"step": 106
},
{
"epoch": 0.5642715886618326,
"grad_norm": 0.5593706965446472,
"learning_rate": 0.00010600000000000002,
"loss": 0.4143,
"step": 107
},
{
"epoch": 0.5695451549110085,
"grad_norm": 0.4752410054206848,
"learning_rate": 0.00010700000000000001,
"loss": 0.419,
"step": 108
},
{
"epoch": 0.5748187211601846,
"grad_norm": 0.6359984278678894,
"learning_rate": 0.00010800000000000001,
"loss": 0.4151,
"step": 109
},
{
"epoch": 0.5800922874093606,
"grad_norm": 0.5052346587181091,
"learning_rate": 0.000109,
"loss": 0.3952,
"step": 110
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.49212637543678284,
"learning_rate": 0.00011000000000000002,
"loss": 0.4617,
"step": 111
},
{
"epoch": 0.5906394199077126,
"grad_norm": 0.5236564874649048,
"learning_rate": 0.00011100000000000001,
"loss": 0.4325,
"step": 112
},
{
"epoch": 0.5959129861568886,
"grad_norm": 0.6041468381881714,
"learning_rate": 0.00011200000000000001,
"loss": 0.4009,
"step": 113
},
{
"epoch": 0.6011865524060646,
"grad_norm": 0.5389513969421387,
"learning_rate": 0.000113,
"loss": 0.4263,
"step": 114
},
{
"epoch": 0.6064601186552406,
"grad_norm": 0.5749898552894592,
"learning_rate": 0.00011399999999999999,
"loss": 0.4105,
"step": 115
},
{
"epoch": 0.6117336849044166,
"grad_norm": 0.5574321150779724,
"learning_rate": 0.00011499999999999999,
"loss": 0.3967,
"step": 116
},
{
"epoch": 0.6170072511535926,
"grad_norm": 0.5891500115394592,
"learning_rate": 0.000116,
"loss": 0.3991,
"step": 117
},
{
"epoch": 0.6222808174027686,
"grad_norm": 0.5306826829910278,
"learning_rate": 0.000117,
"loss": 0.3726,
"step": 118
},
{
"epoch": 0.6275543836519446,
"grad_norm": 0.4786357581615448,
"learning_rate": 0.000118,
"loss": 0.3541,
"step": 119
},
{
"epoch": 0.6328279499011207,
"grad_norm": 0.47434163093566895,
"learning_rate": 0.000119,
"loss": 0.3471,
"step": 120
},
{
"epoch": 0.6381015161502966,
"grad_norm": 0.5113804340362549,
"learning_rate": 0.00012,
"loss": 0.3519,
"step": 121
},
{
"epoch": 0.6433750823994726,
"grad_norm": 0.5574295520782471,
"learning_rate": 0.000121,
"loss": 0.3591,
"step": 122
},
{
"epoch": 0.6486486486486487,
"grad_norm": 1.5176341533660889,
"learning_rate": 0.000122,
"loss": 0.331,
"step": 123
},
{
"epoch": 0.6539222148978246,
"grad_norm": 0.5883108377456665,
"learning_rate": 0.000123,
"loss": 0.3621,
"step": 124
},
{
"epoch": 0.6591957811470006,
"grad_norm": 0.5086923837661743,
"learning_rate": 0.000124,
"loss": 0.3719,
"step": 125
},
{
"epoch": 0.6644693473961767,
"grad_norm": 0.5057904124259949,
"learning_rate": 0.000125,
"loss": 0.3418,
"step": 126
},
{
"epoch": 0.6697429136453527,
"grad_norm": 0.5942703485488892,
"learning_rate": 0.000126,
"loss": 0.305,
"step": 127
},
{
"epoch": 0.6750164798945286,
"grad_norm": 0.4942289888858795,
"learning_rate": 0.000127,
"loss": 0.3509,
"step": 128
},
{
"epoch": 0.6802900461437047,
"grad_norm": 0.6494962573051453,
"learning_rate": 0.00012800000000000002,
"loss": 0.3425,
"step": 129
},
{
"epoch": 0.6855636123928807,
"grad_norm": 1.0529124736785889,
"learning_rate": 0.00012900000000000002,
"loss": 0.294,
"step": 130
},
{
"epoch": 0.6908371786420567,
"grad_norm": 0.6346781253814697,
"learning_rate": 0.00013000000000000002,
"loss": 0.325,
"step": 131
},
{
"epoch": 0.6961107448912327,
"grad_norm": 0.5200821161270142,
"learning_rate": 0.000131,
"loss": 0.3484,
"step": 132
},
{
"epoch": 0.7013843111404087,
"grad_norm": 0.49618640542030334,
"learning_rate": 0.000132,
"loss": 0.301,
"step": 133
},
{
"epoch": 0.7066578773895847,
"grad_norm": 0.4997330904006958,
"learning_rate": 0.000133,
"loss": 0.2953,
"step": 134
},
{
"epoch": 0.7119314436387607,
"grad_norm": 0.5263347625732422,
"learning_rate": 0.000134,
"loss": 0.2767,
"step": 135
},
{
"epoch": 0.7172050098879367,
"grad_norm": 0.560567319393158,
"learning_rate": 0.00013500000000000003,
"loss": 0.3286,
"step": 136
},
{
"epoch": 0.7224785761371127,
"grad_norm": 0.4766915738582611,
"learning_rate": 0.00013600000000000003,
"loss": 0.3108,
"step": 137
},
{
"epoch": 0.7277521423862887,
"grad_norm": 0.47753745317459106,
"learning_rate": 0.00013700000000000002,
"loss": 0.2282,
"step": 138
},
{
"epoch": 0.7330257086354647,
"grad_norm": 0.5010929107666016,
"learning_rate": 0.000138,
"loss": 0.2731,
"step": 139
},
{
"epoch": 0.7382992748846408,
"grad_norm": 0.5264869928359985,
"learning_rate": 0.000139,
"loss": 0.2598,
"step": 140
},
{
"epoch": 0.7435728411338167,
"grad_norm": 0.47988757491111755,
"learning_rate": 0.00014,
"loss": 0.2637,
"step": 141
},
{
"epoch": 0.7488464073829928,
"grad_norm": 0.48291894793510437,
"learning_rate": 0.000141,
"loss": 0.2739,
"step": 142
},
{
"epoch": 0.7541199736321688,
"grad_norm": 0.5980640649795532,
"learning_rate": 0.000142,
"loss": 0.3233,
"step": 143
},
{
"epoch": 0.7593935398813447,
"grad_norm": 0.46733126044273376,
"learning_rate": 0.000143,
"loss": 0.2315,
"step": 144
},
{
"epoch": 0.7646671061305208,
"grad_norm": 0.4654427766799927,
"learning_rate": 0.000144,
"loss": 0.2479,
"step": 145
},
{
"epoch": 0.7699406723796968,
"grad_norm": 0.46202385425567627,
"learning_rate": 0.000145,
"loss": 0.3064,
"step": 146
},
{
"epoch": 0.7752142386288727,
"grad_norm": 0.47191861271858215,
"learning_rate": 0.000146,
"loss": 0.2139,
"step": 147
},
{
"epoch": 0.7804878048780488,
"grad_norm": 0.5178374648094177,
"learning_rate": 0.000147,
"loss": 0.2304,
"step": 148
},
{
"epoch": 0.7857613711272248,
"grad_norm": 0.3869185149669647,
"learning_rate": 0.000148,
"loss": 0.2772,
"step": 149
},
{
"epoch": 0.7910349373764007,
"grad_norm": 0.4422077238559723,
"learning_rate": 0.00014900000000000002,
"loss": 0.2469,
"step": 150
},
{
"epoch": 0.7910349373764007,
"eval_loss": 0.2504952847957611,
"eval_runtime": 133.8524,
"eval_samples_per_second": 8.001,
"eval_steps_per_second": 2.002,
"step": 150
},
{
"epoch": 0.7963085036255768,
"grad_norm": 0.4492229223251343,
"learning_rate": 0.00015000000000000001,
"loss": 0.2298,
"step": 151
},
{
"epoch": 0.8015820698747528,
"grad_norm": 0.5070360898971558,
"learning_rate": 0.000151,
"loss": 0.2389,
"step": 152
},
{
"epoch": 0.8068556361239289,
"grad_norm": 0.39493462443351746,
"learning_rate": 0.000152,
"loss": 0.206,
"step": 153
},
{
"epoch": 0.8121292023731048,
"grad_norm": 0.44301116466522217,
"learning_rate": 0.000153,
"loss": 0.2592,
"step": 154
},
{
"epoch": 0.8174027686222808,
"grad_norm": 0.4067859351634979,
"learning_rate": 0.000154,
"loss": 0.2242,
"step": 155
},
{
"epoch": 0.8226763348714569,
"grad_norm": 0.43918946385383606,
"learning_rate": 0.000155,
"loss": 0.2127,
"step": 156
},
{
"epoch": 0.8279499011206328,
"grad_norm": 0.5059219598770142,
"learning_rate": 0.00015600000000000002,
"loss": 0.2561,
"step": 157
},
{
"epoch": 0.8332234673698088,
"grad_norm": 0.4179636836051941,
"learning_rate": 0.00015700000000000002,
"loss": 0.201,
"step": 158
},
{
"epoch": 0.8384970336189849,
"grad_norm": 0.4800855219364166,
"learning_rate": 0.00015800000000000002,
"loss": 0.2486,
"step": 159
},
{
"epoch": 0.8437705998681608,
"grad_norm": 0.4267498254776001,
"learning_rate": 0.00015900000000000002,
"loss": 0.2054,
"step": 160
},
{
"epoch": 0.8490441661173368,
"grad_norm": 0.4868602156639099,
"learning_rate": 0.00016,
"loss": 0.2206,
"step": 161
},
{
"epoch": 0.8543177323665129,
"grad_norm": 0.4100910723209381,
"learning_rate": 0.000161,
"loss": 0.2076,
"step": 162
},
{
"epoch": 0.8595912986156888,
"grad_norm": 0.3785172402858734,
"learning_rate": 0.000162,
"loss": 0.2175,
"step": 163
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.4334642291069031,
"learning_rate": 0.000163,
"loss": 0.1801,
"step": 164
},
{
"epoch": 0.8701384311140409,
"grad_norm": 0.3873803913593292,
"learning_rate": 0.000164,
"loss": 0.203,
"step": 165
},
{
"epoch": 0.8754119973632168,
"grad_norm": 0.43101224303245544,
"learning_rate": 0.000165,
"loss": 0.2021,
"step": 166
},
{
"epoch": 0.8806855636123929,
"grad_norm": 0.43550118803977966,
"learning_rate": 0.000166,
"loss": 0.2096,
"step": 167
},
{
"epoch": 0.8859591298615689,
"grad_norm": 0.44657325744628906,
"learning_rate": 0.000167,
"loss": 0.2052,
"step": 168
},
{
"epoch": 0.8912326961107448,
"grad_norm": 0.4124061167240143,
"learning_rate": 0.000168,
"loss": 0.2199,
"step": 169
},
{
"epoch": 0.8965062623599209,
"grad_norm": 0.5452592372894287,
"learning_rate": 0.00016900000000000002,
"loss": 0.2295,
"step": 170
},
{
"epoch": 0.9017798286090969,
"grad_norm": 0.41437071561813354,
"learning_rate": 0.00017,
"loss": 0.1891,
"step": 171
},
{
"epoch": 0.9070533948582729,
"grad_norm": 0.3778395354747772,
"learning_rate": 0.000171,
"loss": 0.2194,
"step": 172
},
{
"epoch": 0.9123269611074489,
"grad_norm": 0.37173032760620117,
"learning_rate": 0.000172,
"loss": 0.1594,
"step": 173
},
{
"epoch": 0.9176005273566249,
"grad_norm": 0.38124048709869385,
"learning_rate": 0.000173,
"loss": 0.1975,
"step": 174
},
{
"epoch": 0.922874093605801,
"grad_norm": 0.48111554980278015,
"learning_rate": 0.000174,
"loss": 0.2017,
"step": 175
},
{
"epoch": 0.9281476598549769,
"grad_norm": 0.44690003991127014,
"learning_rate": 0.000175,
"loss": 0.1859,
"step": 176
},
{
"epoch": 0.9334212261041529,
"grad_norm": 0.3716354966163635,
"learning_rate": 0.00017600000000000002,
"loss": 0.1964,
"step": 177
},
{
"epoch": 0.938694792353329,
"grad_norm": 0.36687999963760376,
"learning_rate": 0.00017700000000000002,
"loss": 0.1982,
"step": 178
},
{
"epoch": 0.9439683586025049,
"grad_norm": 0.4883500337600708,
"learning_rate": 0.00017800000000000002,
"loss": 0.2219,
"step": 179
},
{
"epoch": 0.9492419248516809,
"grad_norm": 0.33809033036231995,
"learning_rate": 0.00017900000000000001,
"loss": 0.1812,
"step": 180
},
{
"epoch": 0.954515491100857,
"grad_norm": 0.3546331524848938,
"learning_rate": 0.00018,
"loss": 0.1767,
"step": 181
},
{
"epoch": 0.959789057350033,
"grad_norm": 0.357530802488327,
"learning_rate": 0.000181,
"loss": 0.1823,
"step": 182
},
{
"epoch": 0.9650626235992089,
"grad_norm": 0.34756705164909363,
"learning_rate": 0.000182,
"loss": 0.2015,
"step": 183
},
{
"epoch": 0.970336189848385,
"grad_norm": 0.36489251255989075,
"learning_rate": 0.000183,
"loss": 0.1799,
"step": 184
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.3720036745071411,
"learning_rate": 0.00018400000000000003,
"loss": 0.1852,
"step": 185
},
{
"epoch": 0.980883322346737,
"grad_norm": 0.3317737877368927,
"learning_rate": 0.00018500000000000002,
"loss": 0.1868,
"step": 186
},
{
"epoch": 0.986156888595913,
"grad_norm": 0.35604041814804077,
"learning_rate": 0.00018600000000000002,
"loss": 0.1787,
"step": 187
},
{
"epoch": 0.991430454845089,
"grad_norm": 0.37222427129745483,
"learning_rate": 0.00018700000000000002,
"loss": 0.1757,
"step": 188
},
{
"epoch": 0.996704021094265,
"grad_norm": 0.40133216977119446,
"learning_rate": 0.000188,
"loss": 0.1767,
"step": 189
},
{
"epoch": 1.0,
"grad_norm": 0.36636754870414734,
"learning_rate": 0.00018899999999999999,
"loss": 0.1186,
"step": 190
},
{
"epoch": 1.005273566249176,
"grad_norm": 0.3590473234653473,
"learning_rate": 0.00019,
"loss": 0.1585,
"step": 191
},
{
"epoch": 1.010547132498352,
"grad_norm": 0.3549407124519348,
"learning_rate": 0.000191,
"loss": 0.1512,
"step": 192
},
{
"epoch": 1.015820698747528,
"grad_norm": 0.3402779698371887,
"learning_rate": 0.000192,
"loss": 0.1317,
"step": 193
},
{
"epoch": 1.021094264996704,
"grad_norm": 0.27974751591682434,
"learning_rate": 0.000193,
"loss": 0.1492,
"step": 194
},
{
"epoch": 1.02636783124588,
"grad_norm": 0.2740594446659088,
"learning_rate": 0.000194,
"loss": 0.1614,
"step": 195
},
{
"epoch": 1.031641397495056,
"grad_norm": 0.3438091576099396,
"learning_rate": 0.000195,
"loss": 0.1639,
"step": 196
},
{
"epoch": 1.036914963744232,
"grad_norm": 0.35183268785476685,
"learning_rate": 0.000196,
"loss": 0.137,
"step": 197
},
{
"epoch": 1.042188529993408,
"grad_norm": 0.3638111352920532,
"learning_rate": 0.00019700000000000002,
"loss": 0.1896,
"step": 198
},
{
"epoch": 1.047462096242584,
"grad_norm": 0.3941810131072998,
"learning_rate": 0.00019800000000000002,
"loss": 0.1905,
"step": 199
},
{
"epoch": 1.05273566249176,
"grad_norm": 0.2802982032299042,
"learning_rate": 0.000199,
"loss": 0.1355,
"step": 200
},
{
"epoch": 1.05273566249176,
"eval_loss": 0.16099050641059875,
"eval_runtime": 133.9383,
"eval_samples_per_second": 7.996,
"eval_steps_per_second": 2.001,
"step": 200
},
{
"epoch": 1.0580092287409362,
"grad_norm": 0.33312973380088806,
"learning_rate": 0.0002,
"loss": 0.1296,
"step": 201
},
{
"epoch": 1.063282794990112,
"grad_norm": 0.291446328163147,
"learning_rate": 0.00019999842640648654,
"loss": 0.1354,
"step": 202
},
{
"epoch": 1.068556361239288,
"grad_norm": 0.3492049276828766,
"learning_rate": 0.00019999370567547008,
"loss": 0.1609,
"step": 203
},
{
"epoch": 1.0738299274884642,
"grad_norm": 0.36389562487602234,
"learning_rate": 0.00019998583795552083,
"loss": 0.1234,
"step": 204
},
{
"epoch": 1.07910349373764,
"grad_norm": 0.3225807845592499,
"learning_rate": 0.00019997482349425066,
"loss": 0.142,
"step": 205
},
{
"epoch": 1.084377059986816,
"grad_norm": 0.3185547888278961,
"learning_rate": 0.00019996066263830531,
"loss": 0.1493,
"step": 206
},
{
"epoch": 1.0896506262359922,
"grad_norm": 0.3187515139579773,
"learning_rate": 0.00019994335583335335,
"loss": 0.1595,
"step": 207
},
{
"epoch": 1.094924192485168,
"grad_norm": 0.3453561067581177,
"learning_rate": 0.0001999229036240723,
"loss": 0.1547,
"step": 208
},
{
"epoch": 1.1001977587343441,
"grad_norm": 0.3260701894760132,
"learning_rate": 0.00019989930665413147,
"loss": 0.1426,
"step": 209
},
{
"epoch": 1.1054713249835202,
"grad_norm": 0.3505662679672241,
"learning_rate": 0.00019987256566617162,
"loss": 0.1619,
"step": 210
},
{
"epoch": 1.110744891232696,
"grad_norm": 0.32154926657676697,
"learning_rate": 0.00019984268150178167,
"loss": 0.1474,
"step": 211
},
{
"epoch": 1.1160184574818721,
"grad_norm": 0.2730904817581177,
"learning_rate": 0.00019980965510147213,
"loss": 0.1307,
"step": 212
},
{
"epoch": 1.1212920237310482,
"grad_norm": 0.3337661921977997,
"learning_rate": 0.0001997734875046456,
"loss": 0.1584,
"step": 213
},
{
"epoch": 1.126565589980224,
"grad_norm": 0.3607318103313446,
"learning_rate": 0.00019973417984956403,
"loss": 0.1223,
"step": 214
},
{
"epoch": 1.1318391562294001,
"grad_norm": 0.27768680453300476,
"learning_rate": 0.0001996917333733128,
"loss": 0.1209,
"step": 215
},
{
"epoch": 1.1371127224785762,
"grad_norm": 0.2751491665840149,
"learning_rate": 0.00019964614941176195,
"loss": 0.1168,
"step": 216
},
{
"epoch": 1.142386288727752,
"grad_norm": 0.3006565570831299,
"learning_rate": 0.00019959742939952392,
"loss": 0.1295,
"step": 217
},
{
"epoch": 1.1476598549769281,
"grad_norm": 0.2547905743122101,
"learning_rate": 0.00019954557486990868,
"loss": 0.1247,
"step": 218
},
{
"epoch": 1.1529334212261042,
"grad_norm": 0.25938180088996887,
"learning_rate": 0.00019949058745487522,
"loss": 0.1247,
"step": 219
},
{
"epoch": 1.15820698747528,
"grad_norm": 0.3042941391468048,
"learning_rate": 0.00019943246888498041,
"loss": 0.1645,
"step": 220
},
{
"epoch": 1.1634805537244561,
"grad_norm": 0.25871893763542175,
"learning_rate": 0.00019937122098932428,
"loss": 0.1431,
"step": 221
},
{
"epoch": 1.1687541199736322,
"grad_norm": 0.24148327112197876,
"learning_rate": 0.00019930684569549264,
"loss": 0.1375,
"step": 222
},
{
"epoch": 1.174027686222808,
"grad_norm": 0.25406157970428467,
"learning_rate": 0.00019923934502949644,
"loss": 0.1524,
"step": 223
},
{
"epoch": 1.1793012524719841,
"grad_norm": 0.3008594512939453,
"learning_rate": 0.00019916872111570784,
"loss": 0.1353,
"step": 224
},
{
"epoch": 1.1845748187211602,
"grad_norm": 0.2584022879600525,
"learning_rate": 0.00019909497617679348,
"loss": 0.1147,
"step": 225
},
{
"epoch": 1.189848384970336,
"grad_norm": 0.2885512709617615,
"learning_rate": 0.00019901811253364456,
"loss": 0.1388,
"step": 226
},
{
"epoch": 1.1951219512195121,
"grad_norm": 0.3085253834724426,
"learning_rate": 0.00019893813260530368,
"loss": 0.1278,
"step": 227
},
{
"epoch": 1.2003955174686882,
"grad_norm": 0.23244811594486237,
"learning_rate": 0.00019885503890888876,
"loss": 0.1299,
"step": 228
},
{
"epoch": 1.2056690837178643,
"grad_norm": 0.21688468754291534,
"learning_rate": 0.00019876883405951377,
"loss": 0.1145,
"step": 229
},
{
"epoch": 1.2109426499670402,
"grad_norm": 0.2418506145477295,
"learning_rate": 0.00019867952077020666,
"loss": 0.1351,
"step": 230
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.27453094720840454,
"learning_rate": 0.0001985871018518236,
"loss": 0.1222,
"step": 231
},
{
"epoch": 1.2214897824653923,
"grad_norm": 0.20536746084690094,
"learning_rate": 0.00019849158021296081,
"loss": 0.1157,
"step": 232
},
{
"epoch": 1.2267633487145682,
"grad_norm": 0.2276519536972046,
"learning_rate": 0.00019839295885986296,
"loss": 0.1266,
"step": 233
},
{
"epoch": 1.2320369149637442,
"grad_norm": 0.2710774838924408,
"learning_rate": 0.00019829124089632845,
"loss": 0.1257,
"step": 234
},
{
"epoch": 1.2373104812129203,
"grad_norm": 0.2697718143463135,
"learning_rate": 0.00019818642952361187,
"loss": 0.1304,
"step": 235
},
{
"epoch": 1.2425840474620962,
"grad_norm": 0.21641883254051208,
"learning_rate": 0.00019807852804032305,
"loss": 0.1149,
"step": 236
},
{
"epoch": 1.2478576137112722,
"grad_norm": 0.23116011917591095,
"learning_rate": 0.00019796753984232358,
"loss": 0.1115,
"step": 237
},
{
"epoch": 1.2531311799604483,
"grad_norm": 0.23961959779262543,
"learning_rate": 0.00019785346842261957,
"loss": 0.1046,
"step": 238
},
{
"epoch": 1.2584047462096244,
"grad_norm": 0.2854941487312317,
"learning_rate": 0.00019773631737125192,
"loss": 0.1289,
"step": 239
},
{
"epoch": 1.2636783124588002,
"grad_norm": 0.2735542058944702,
"learning_rate": 0.0001976160903751834,
"loss": 0.1243,
"step": 240
},
{
"epoch": 1.2689518787079763,
"grad_norm": 0.2876754105091095,
"learning_rate": 0.00019749279121818235,
"loss": 0.1712,
"step": 241
},
{
"epoch": 1.2742254449571524,
"grad_norm": 0.21064290404319763,
"learning_rate": 0.00019736642378070392,
"loss": 0.1026,
"step": 242
},
{
"epoch": 1.2794990112063283,
"grad_norm": 0.2385692000389099,
"learning_rate": 0.00019723699203976766,
"loss": 0.1132,
"step": 243
},
{
"epoch": 1.2847725774555043,
"grad_norm": 0.2054402083158493,
"learning_rate": 0.00019710450006883256,
"loss": 0.1366,
"step": 244
},
{
"epoch": 1.2900461437046804,
"grad_norm": 0.25641337037086487,
"learning_rate": 0.0001969689520376687,
"loss": 0.1401,
"step": 245
},
{
"epoch": 1.2953197099538563,
"grad_norm": 0.21759799122810364,
"learning_rate": 0.00019683035221222618,
"loss": 0.1186,
"step": 246
},
{
"epoch": 1.3005932762030323,
"grad_norm": 0.20061059296131134,
"learning_rate": 0.00019668870495450066,
"loss": 0.1008,
"step": 247
},
{
"epoch": 1.3058668424522084,
"grad_norm": 0.22263573110103607,
"learning_rate": 0.0001965440147223963,
"loss": 0.1201,
"step": 248
},
{
"epoch": 1.3111404087013843,
"grad_norm": 0.22843922674655914,
"learning_rate": 0.00019639628606958533,
"loss": 0.1115,
"step": 249
},
{
"epoch": 1.3164139749505603,
"grad_norm": 0.20508253574371338,
"learning_rate": 0.00019624552364536473,
"loss": 0.1088,
"step": 250
},
{
"epoch": 1.3164139749505603,
"eval_loss": 0.12800532579421997,
"eval_runtime": 133.8769,
"eval_samples_per_second": 8.0,
"eval_steps_per_second": 2.002,
"step": 250
},
{
"epoch": 1.3216875411997364,
"grad_norm": 0.2247203290462494,
"learning_rate": 0.00019609173219450998,
"loss": 0.1406,
"step": 251
},
{
"epoch": 1.3269611074489123,
"grad_norm": 0.28306570649147034,
"learning_rate": 0.0001959349165571256,
"loss": 0.137,
"step": 252
},
{
"epoch": 1.3322346736980883,
"grad_norm": 0.21649472415447235,
"learning_rate": 0.00019577508166849304,
"loss": 0.1043,
"step": 253
},
{
"epoch": 1.3375082399472644,
"grad_norm": 0.23190827667713165,
"learning_rate": 0.0001956122325589152,
"loss": 0.1043,
"step": 254
},
{
"epoch": 1.3427818061964403,
"grad_norm": 0.21395829319953918,
"learning_rate": 0.00019544637435355808,
"loss": 0.1118,
"step": 255
},
{
"epoch": 1.3480553724456164,
"grad_norm": 0.20570361614227295,
"learning_rate": 0.00019527751227228963,
"loss": 0.1059,
"step": 256
},
{
"epoch": 1.3533289386947924,
"grad_norm": 0.22916211187839508,
"learning_rate": 0.00019510565162951537,
"loss": 0.109,
"step": 257
},
{
"epoch": 1.3586025049439683,
"grad_norm": 0.2180647999048233,
"learning_rate": 0.00019493079783401113,
"loss": 0.1272,
"step": 258
},
{
"epoch": 1.3638760711931444,
"grad_norm": 0.19418495893478394,
"learning_rate": 0.0001947529563887529,
"loss": 0.1288,
"step": 259
},
{
"epoch": 1.3691496374423204,
"grad_norm": 0.2715223431587219,
"learning_rate": 0.00019457213289074355,
"loss": 0.098,
"step": 260
},
{
"epoch": 1.3744232036914963,
"grad_norm": 0.19249342381954193,
"learning_rate": 0.00019438833303083678,
"loss": 0.1185,
"step": 261
},
{
"epoch": 1.3796967699406724,
"grad_norm": 0.1977251172065735,
"learning_rate": 0.00019420156259355791,
"loss": 0.1283,
"step": 262
},
{
"epoch": 1.3849703361898484,
"grad_norm": 0.19867144525051117,
"learning_rate": 0.0001940118274569219,
"loss": 0.1132,
"step": 263
},
{
"epoch": 1.3902439024390243,
"grad_norm": 0.17431101202964783,
"learning_rate": 0.00019381913359224842,
"loss": 0.0956,
"step": 264
},
{
"epoch": 1.3955174686882004,
"grad_norm": 0.2786570191383362,
"learning_rate": 0.00019362348706397373,
"loss": 0.1041,
"step": 265
},
{
"epoch": 1.4007910349373764,
"grad_norm": 0.2277083843946457,
"learning_rate": 0.00019342489402945998,
"loss": 0.1034,
"step": 266
},
{
"epoch": 1.4060646011865523,
"grad_norm": 0.18951818346977234,
"learning_rate": 0.00019322336073880142,
"loss": 0.1243,
"step": 267
},
{
"epoch": 1.4113381674357284,
"grad_norm": 0.18908710777759552,
"learning_rate": 0.00019301889353462762,
"loss": 0.1019,
"step": 268
},
{
"epoch": 1.4166117336849045,
"grad_norm": 0.24964019656181335,
"learning_rate": 0.0001928114988519039,
"loss": 0.1315,
"step": 269
},
{
"epoch": 1.4218852999340803,
"grad_norm": 0.22528688609600067,
"learning_rate": 0.0001926011832177288,
"loss": 0.0979,
"step": 270
},
{
"epoch": 1.4271588661832564,
"grad_norm": 0.21379578113555908,
"learning_rate": 0.0001923879532511287,
"loss": 0.1064,
"step": 271
},
{
"epoch": 1.4324324324324325,
"grad_norm": 0.21753202378749847,
"learning_rate": 0.0001921718156628494,
"loss": 0.0954,
"step": 272
},
{
"epoch": 1.4377059986816083,
"grad_norm": 0.20682744681835175,
"learning_rate": 0.0001919527772551451,
"loss": 0.1055,
"step": 273
},
{
"epoch": 1.4429795649307844,
"grad_norm": 0.19650743901729584,
"learning_rate": 0.00019173084492156407,
"loss": 0.1229,
"step": 274
},
{
"epoch": 1.4482531311799605,
"grad_norm": 0.19758552312850952,
"learning_rate": 0.00019150602564673198,
"loss": 0.1019,
"step": 275
},
{
"epoch": 1.4535266974291363,
"grad_norm": 0.20337599515914917,
"learning_rate": 0.00019127832650613189,
"loss": 0.0997,
"step": 276
},
{
"epoch": 1.4588002636783124,
"grad_norm": 0.23217317461967468,
"learning_rate": 0.00019104775466588161,
"loss": 0.1211,
"step": 277
},
{
"epoch": 1.4640738299274885,
"grad_norm": 0.20149654150009155,
"learning_rate": 0.00019081431738250814,
"loss": 0.0889,
"step": 278
},
{
"epoch": 1.4693473961766645,
"grad_norm": 0.19859851896762848,
"learning_rate": 0.00019057802200271942,
"loss": 0.1133,
"step": 279
},
{
"epoch": 1.4746209624258404,
"grad_norm": 0.2119692862033844,
"learning_rate": 0.00019033887596317298,
"loss": 0.1264,
"step": 280
},
{
"epoch": 1.4798945286750165,
"grad_norm": 0.1985294073820114,
"learning_rate": 0.0001900968867902419,
"loss": 0.0941,
"step": 281
},
{
"epoch": 1.4851680949241926,
"grad_norm": 0.22264046967029572,
"learning_rate": 0.00018985206209977813,
"loss": 0.1265,
"step": 282
},
{
"epoch": 1.4904416611733686,
"grad_norm": 0.17052385210990906,
"learning_rate": 0.00018960440959687254,
"loss": 0.0947,
"step": 283
},
{
"epoch": 1.4957152274225445,
"grad_norm": 0.17365668714046478,
"learning_rate": 0.00018935393707561251,
"loss": 0.1199,
"step": 284
},
{
"epoch": 1.5009887936717206,
"grad_norm": 0.23060303926467896,
"learning_rate": 0.0001891006524188368,
"loss": 0.0909,
"step": 285
},
{
"epoch": 1.5062623599208966,
"grad_norm": 0.18114161491394043,
"learning_rate": 0.00018884456359788724,
"loss": 0.1117,
"step": 286
},
{
"epoch": 1.5115359261700725,
"grad_norm": 0.22013631463050842,
"learning_rate": 0.000188585678672358,
"loss": 0.1164,
"step": 287
},
{
"epoch": 1.5168094924192486,
"grad_norm": 0.24089427292346954,
"learning_rate": 0.00018832400578984183,
"loss": 0.1177,
"step": 288
},
{
"epoch": 1.5220830586684246,
"grad_norm": 0.17679591476917267,
"learning_rate": 0.0001880595531856738,
"loss": 0.107,
"step": 289
},
{
"epoch": 1.5273566249176005,
"grad_norm": 0.15667003393173218,
"learning_rate": 0.00018779232918267195,
"loss": 0.1008,
"step": 290
},
{
"epoch": 1.5326301911667766,
"grad_norm": 0.21019265055656433,
"learning_rate": 0.00018752234219087538,
"loss": 0.1291,
"step": 291
},
{
"epoch": 1.5379037574159526,
"grad_norm": 0.1911863535642624,
"learning_rate": 0.00018724960070727972,
"loss": 0.1246,
"step": 292
},
{
"epoch": 1.5431773236651285,
"grad_norm": 0.16309945285320282,
"learning_rate": 0.00018697411331556956,
"loss": 0.1063,
"step": 293
},
{
"epoch": 1.5484508899143046,
"grad_norm": 0.15654757618904114,
"learning_rate": 0.0001866958886858483,
"loss": 0.1043,
"step": 294
},
{
"epoch": 1.5537244561634806,
"grad_norm": 0.17349812388420105,
"learning_rate": 0.0001864149355743655,
"loss": 0.0799,
"step": 295
},
{
"epoch": 1.5589980224126565,
"grad_norm": 0.19882531464099884,
"learning_rate": 0.00018613126282324092,
"loss": 0.0983,
"step": 296
},
{
"epoch": 1.5642715886618326,
"grad_norm": 0.1695946753025055,
"learning_rate": 0.00018584487936018661,
"loss": 0.0947,
"step": 297
},
{
"epoch": 1.5695451549110087,
"grad_norm": 0.2050606608390808,
"learning_rate": 0.00018555579419822583,
"loss": 0.1108,
"step": 298
},
{
"epoch": 1.5748187211601845,
"grad_norm": 0.18069462478160858,
"learning_rate": 0.00018526401643540922,
"loss": 0.1137,
"step": 299
},
{
"epoch": 1.5800922874093606,
"grad_norm": 0.2282589226961136,
"learning_rate": 0.00018496955525452874,
"loss": 0.1134,
"step": 300
},
{
"epoch": 1.5800922874093606,
"eval_loss": 0.11345648020505905,
"eval_runtime": 133.8392,
"eval_samples_per_second": 8.002,
"eval_steps_per_second": 2.002,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 760,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.968706055036672e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}