{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9968,
"eval_steps": 500,
"global_step": 312,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0064,
"grad_norm": 2.4969531033325163,
"learning_rate": 6.25e-08,
"loss": 0.1252,
"step": 1
},
{
"epoch": 0.0128,
"grad_norm": 2.008890344969409,
"learning_rate": 1.25e-07,
"loss": 0.1336,
"step": 2
},
{
"epoch": 0.0192,
"grad_norm": 2.1487872414082574,
"learning_rate": 1.875e-07,
"loss": 0.1239,
"step": 3
},
{
"epoch": 0.0256,
"grad_norm": 2.404877068082983,
"learning_rate": 2.5e-07,
"loss": 0.1534,
"step": 4
},
{
"epoch": 0.032,
"grad_norm": 1.9977341287068444,
"learning_rate": 3.1249999999999997e-07,
"loss": 0.127,
"step": 5
},
{
"epoch": 0.0384,
"grad_norm": 2.10253617342413,
"learning_rate": 3.75e-07,
"loss": 0.1295,
"step": 6
},
{
"epoch": 0.0448,
"grad_norm": 1.9813074191320224,
"learning_rate": 4.375e-07,
"loss": 0.1443,
"step": 7
},
{
"epoch": 0.0512,
"grad_norm": 2.1426342290851705,
"learning_rate": 5e-07,
"loss": 0.1489,
"step": 8
},
{
"epoch": 0.0576,
"grad_norm": 2.3977225328887846,
"learning_rate": 5.625e-07,
"loss": 0.1715,
"step": 9
},
{
"epoch": 0.064,
"grad_norm": 2.4910499854065375,
"learning_rate": 6.249999999999999e-07,
"loss": 0.1481,
"step": 10
},
{
"epoch": 0.0704,
"grad_norm": 2.0036101364507264,
"learning_rate": 6.875e-07,
"loss": 0.133,
"step": 11
},
{
"epoch": 0.0768,
"grad_norm": 2.31397843050416,
"learning_rate": 7.5e-07,
"loss": 0.1197,
"step": 12
},
{
"epoch": 0.0832,
"grad_norm": 2.789398930843854,
"learning_rate": 8.125e-07,
"loss": 0.1026,
"step": 13
},
{
"epoch": 0.0896,
"grad_norm": 2.3679931786396997,
"learning_rate": 8.75e-07,
"loss": 0.1473,
"step": 14
},
{
"epoch": 0.096,
"grad_norm": 2.146960838741869,
"learning_rate": 9.374999999999999e-07,
"loss": 0.1499,
"step": 15
},
{
"epoch": 0.1024,
"grad_norm": 2.8095519343937547,
"learning_rate": 1e-06,
"loss": 0.1312,
"step": 16
},
{
"epoch": 0.1088,
"grad_norm": 3.1559803771267987,
"learning_rate": 1.0625e-06,
"loss": 0.1332,
"step": 17
},
{
"epoch": 0.1152,
"grad_norm": 2.6394797568137442,
"learning_rate": 1.125e-06,
"loss": 0.1516,
"step": 18
},
{
"epoch": 0.1216,
"grad_norm": 2.0903056059241214,
"learning_rate": 1.1874999999999999e-06,
"loss": 0.1401,
"step": 19
},
{
"epoch": 0.128,
"grad_norm": 2.9496671534502785,
"learning_rate": 1.2499999999999999e-06,
"loss": 0.1422,
"step": 20
},
{
"epoch": 0.1344,
"grad_norm": 2.2510408864309746,
"learning_rate": 1.3125e-06,
"loss": 0.1339,
"step": 21
},
{
"epoch": 0.1408,
"grad_norm": 2.988822164056312,
"learning_rate": 1.375e-06,
"loss": 0.1372,
"step": 22
},
{
"epoch": 0.1472,
"grad_norm": 2.425664327575325,
"learning_rate": 1.4375e-06,
"loss": 0.1419,
"step": 23
},
{
"epoch": 0.1536,
"grad_norm": 2.2878647901959135,
"learning_rate": 1.5e-06,
"loss": 0.1643,
"step": 24
},
{
"epoch": 0.16,
"grad_norm": 2.373440045449061,
"learning_rate": 1.5624999999999999e-06,
"loss": 0.1611,
"step": 25
},
{
"epoch": 0.1664,
"grad_norm": 3.1720114517373186,
"learning_rate": 1.625e-06,
"loss": 0.1241,
"step": 26
},
{
"epoch": 0.1728,
"grad_norm": 2.888254088184939,
"learning_rate": 1.6875e-06,
"loss": 0.1432,
"step": 27
},
{
"epoch": 0.1792,
"grad_norm": 2.4223635511865513,
"learning_rate": 1.75e-06,
"loss": 0.1013,
"step": 28
},
{
"epoch": 0.1856,
"grad_norm": 2.540051698989579,
"learning_rate": 1.8125e-06,
"loss": 0.1524,
"step": 29
},
{
"epoch": 0.192,
"grad_norm": 3.0841788486084516,
"learning_rate": 1.8749999999999998e-06,
"loss": 0.1583,
"step": 30
},
{
"epoch": 0.1984,
"grad_norm": 2.961021534474499,
"learning_rate": 1.9375e-06,
"loss": 0.1331,
"step": 31
},
{
"epoch": 0.2048,
"grad_norm": 2.0666296501147112,
"learning_rate": 2e-06,
"loss": 0.1235,
"step": 32
},
{
"epoch": 0.2112,
"grad_norm": 2.8434105826490987,
"learning_rate": 1.9999370567547003e-06,
"loss": 0.1728,
"step": 33
},
{
"epoch": 0.2176,
"grad_norm": 3.2571488442302927,
"learning_rate": 1.9997482349425066e-06,
"loss": 0.1423,
"step": 34
},
{
"epoch": 0.224,
"grad_norm": 2.737978793809459,
"learning_rate": 1.9994335583335335e-06,
"loss": 0.1468,
"step": 35
},
{
"epoch": 0.2304,
"grad_norm": 3.1545505286005997,
"learning_rate": 1.9989930665413145e-06,
"loss": 0.1169,
"step": 36
},
{
"epoch": 0.2368,
"grad_norm": 2.797274179036167,
"learning_rate": 1.9984268150178167e-06,
"loss": 0.1319,
"step": 37
},
{
"epoch": 0.2432,
"grad_norm": 2.8094170725156715,
"learning_rate": 1.997734875046456e-06,
"loss": 0.1052,
"step": 38
},
{
"epoch": 0.2496,
"grad_norm": 3.265503821648089,
"learning_rate": 1.996917333733128e-06,
"loss": 0.1425,
"step": 39
},
{
"epoch": 0.256,
"grad_norm": 3.150913553614633,
"learning_rate": 1.995974293995239e-06,
"loss": 0.1335,
"step": 40
},
{
"epoch": 0.2624,
"grad_norm": 2.8775764914997604,
"learning_rate": 1.994905874548752e-06,
"loss": 0.1226,
"step": 41
},
{
"epoch": 0.2688,
"grad_norm": 3.2180785068208575,
"learning_rate": 1.9937122098932426e-06,
"loss": 0.1329,
"step": 42
},
{
"epoch": 0.2752,
"grad_norm": 5.857949780062274,
"learning_rate": 1.9923934502949643e-06,
"loss": 0.152,
"step": 43
},
{
"epoch": 0.2816,
"grad_norm": 4.521459145995853,
"learning_rate": 1.9909497617679347e-06,
"loss": 0.1747,
"step": 44
},
{
"epoch": 0.288,
"grad_norm": 3.1404514498803136,
"learning_rate": 1.9893813260530367e-06,
"loss": 0.11,
"step": 45
},
{
"epoch": 0.2944,
"grad_norm": 4.201009323986255,
"learning_rate": 1.9876883405951377e-06,
"loss": 0.1634,
"step": 46
},
{
"epoch": 0.3008,
"grad_norm": 2.9577036484677004,
"learning_rate": 1.9858710185182355e-06,
"loss": 0.1561,
"step": 47
},
{
"epoch": 0.3072,
"grad_norm": 3.535826754862616,
"learning_rate": 1.9839295885986295e-06,
"loss": 0.1619,
"step": 48
},
{
"epoch": 0.3136,
"grad_norm": 4.120602596813965,
"learning_rate": 1.9818642952361183e-06,
"loss": 0.1495,
"step": 49
},
{
"epoch": 0.32,
"grad_norm": 4.259764877810664,
"learning_rate": 1.9796753984232355e-06,
"loss": 0.1307,
"step": 50
},
{
"epoch": 0.3264,
"grad_norm": 4.242488888827966,
"learning_rate": 1.977363173712519e-06,
"loss": 0.1285,
"step": 51
},
{
"epoch": 0.3328,
"grad_norm": 4.069178515100502,
"learning_rate": 1.9749279121818236e-06,
"loss": 0.1352,
"step": 52
},
{
"epoch": 0.3392,
"grad_norm": 4.544140717519671,
"learning_rate": 1.9723699203976766e-06,
"loss": 0.1571,
"step": 53
},
{
"epoch": 0.3456,
"grad_norm": 3.9107597581959426,
"learning_rate": 1.9696895203766866e-06,
"loss": 0.1686,
"step": 54
},
{
"epoch": 0.352,
"grad_norm": 3.583429566299793,
"learning_rate": 1.966887049545006e-06,
"loss": 0.1378,
"step": 55
},
{
"epoch": 0.3584,
"grad_norm": 3.517952555975872,
"learning_rate": 1.9639628606958534e-06,
"loss": 0.1386,
"step": 56
},
{
"epoch": 0.3648,
"grad_norm": 3.579939014596777,
"learning_rate": 1.9609173219450997e-06,
"loss": 0.1448,
"step": 57
},
{
"epoch": 0.3712,
"grad_norm": 4.961408389397768,
"learning_rate": 1.9577508166849303e-06,
"loss": 0.1334,
"step": 58
},
{
"epoch": 0.3776,
"grad_norm": 3.881854573486764,
"learning_rate": 1.9544637435355806e-06,
"loss": 0.1453,
"step": 59
},
{
"epoch": 0.384,
"grad_norm": 4.160469207615619,
"learning_rate": 1.9510565162951534e-06,
"loss": 0.1514,
"step": 60
},
{
"epoch": 0.3904,
"grad_norm": 3.7376823245313995,
"learning_rate": 1.947529563887529e-06,
"loss": 0.1756,
"step": 61
},
{
"epoch": 0.3968,
"grad_norm": 4.745526341279435,
"learning_rate": 1.9438833303083674e-06,
"loss": 0.1903,
"step": 62
},
{
"epoch": 0.4032,
"grad_norm": 3.5872890144465877,
"learning_rate": 1.9401182745692187e-06,
"loss": 0.1327,
"step": 63
},
{
"epoch": 0.4096,
"grad_norm": 4.895892914029687,
"learning_rate": 1.936234870639737e-06,
"loss": 0.1348,
"step": 64
},
{
"epoch": 0.416,
"grad_norm": 3.4298662566625913,
"learning_rate": 1.9322336073880143e-06,
"loss": 0.1262,
"step": 65
},
{
"epoch": 0.4224,
"grad_norm": 6.841317138496493,
"learning_rate": 1.928114988519039e-06,
"loss": 0.172,
"step": 66
},
{
"epoch": 0.4288,
"grad_norm": 4.250165113109143,
"learning_rate": 1.9238795325112867e-06,
"loss": 0.1541,
"step": 67
},
{
"epoch": 0.4352,
"grad_norm": 4.4033621224434025,
"learning_rate": 1.9195277725514506e-06,
"loss": 0.1745,
"step": 68
},
{
"epoch": 0.4416,
"grad_norm": 3.98075383182803,
"learning_rate": 1.91506025646732e-06,
"loss": 0.1474,
"step": 69
},
{
"epoch": 0.448,
"grad_norm": 3.627291803014999,
"learning_rate": 1.9104775466588157e-06,
"loss": 0.1807,
"step": 70
},
{
"epoch": 0.4544,
"grad_norm": 4.766210491599441,
"learning_rate": 1.905780220027194e-06,
"loss": 0.1502,
"step": 71
},
{
"epoch": 0.4608,
"grad_norm": 3.6759268834811625,
"learning_rate": 1.9009688679024189e-06,
"loss": 0.1393,
"step": 72
},
{
"epoch": 0.4672,
"grad_norm": 4.101009698917842,
"learning_rate": 1.8960440959687252e-06,
"loss": 0.1369,
"step": 73
},
{
"epoch": 0.4736,
"grad_norm": 4.59569886944552,
"learning_rate": 1.8910065241883678e-06,
"loss": 0.1619,
"step": 74
},
{
"epoch": 0.48,
"grad_norm": 3.7413886299218797,
"learning_rate": 1.8858567867235798e-06,
"loss": 0.1666,
"step": 75
},
{
"epoch": 0.4864,
"grad_norm": 3.8867223584125457,
"learning_rate": 1.8805955318567379e-06,
"loss": 0.1704,
"step": 76
},
{
"epoch": 0.4928,
"grad_norm": 4.493023053104012,
"learning_rate": 1.8752234219087537e-06,
"loss": 0.1661,
"step": 77
},
{
"epoch": 0.4992,
"grad_norm": 4.759475180875903,
"learning_rate": 1.8697411331556953e-06,
"loss": 0.1488,
"step": 78
},
{
"epoch": 0.5056,
"grad_norm": 4.539234264514853,
"learning_rate": 1.8641493557436548e-06,
"loss": 0.1524,
"step": 79
},
{
"epoch": 0.512,
"grad_norm": 3.493235818987802,
"learning_rate": 1.858448793601866e-06,
"loss": 0.162,
"step": 80
},
{
"epoch": 0.5184,
"grad_norm": 6.5293887794567596,
"learning_rate": 1.852640164354092e-06,
"loss": 0.1533,
"step": 81
},
{
"epoch": 0.5248,
"grad_norm": 6.663555301541236,
"learning_rate": 1.8467241992282841e-06,
"loss": 0.1789,
"step": 82
},
{
"epoch": 0.5312,
"grad_norm": 4.9493441573099695,
"learning_rate": 1.8407016429645302e-06,
"loss": 0.1563,
"step": 83
},
{
"epoch": 0.5376,
"grad_norm": 3.5356024848568492,
"learning_rate": 1.8345732537213026e-06,
"loss": 0.1331,
"step": 84
},
{
"epoch": 0.544,
"grad_norm": 3.331011840972039,
"learning_rate": 1.8283398029800164e-06,
"loss": 0.1725,
"step": 85
},
{
"epoch": 0.5504,
"grad_norm": 3.1323607027132603,
"learning_rate": 1.82200207544791e-06,
"loss": 0.1399,
"step": 86
},
{
"epoch": 0.5568,
"grad_norm": 4.543709410888673,
"learning_rate": 1.8155608689592601e-06,
"loss": 0.14,
"step": 87
},
{
"epoch": 0.5632,
"grad_norm": 4.45673006932675,
"learning_rate": 1.8090169943749474e-06,
"loss": 0.1615,
"step": 88
},
{
"epoch": 0.5696,
"grad_norm": 4.705373269348066,
"learning_rate": 1.802371275480378e-06,
"loss": 0.1564,
"step": 89
},
{
"epoch": 0.576,
"grad_norm": 4.329301032695692,
"learning_rate": 1.795624548881781e-06,
"loss": 0.1924,
"step": 90
},
{
"epoch": 0.5824,
"grad_norm": 3.216831310097902,
"learning_rate": 1.7887776639008912e-06,
"loss": 0.1636,
"step": 91
},
{
"epoch": 0.5888,
"grad_norm": 4.125027796241628,
"learning_rate": 1.7818314824680298e-06,
"loss": 0.177,
"step": 92
},
{
"epoch": 0.5952,
"grad_norm": 4.962921250352651,
"learning_rate": 1.774786879013601e-06,
"loss": 0.1525,
"step": 93
},
{
"epoch": 0.6016,
"grad_norm": 3.8404707219541683,
"learning_rate": 1.767644740358011e-06,
"loss": 0.1332,
"step": 94
},
{
"epoch": 0.608,
"grad_norm": 4.711974795123313,
"learning_rate": 1.760405965600031e-06,
"loss": 0.1422,
"step": 95
},
{
"epoch": 0.6144,
"grad_norm": 4.199038511043031,
"learning_rate": 1.753071466003611e-06,
"loss": 0.13,
"step": 96
},
{
"epoch": 0.6208,
"grad_norm": 3.8291089013264115,
"learning_rate": 1.7456421648831654e-06,
"loss": 0.1715,
"step": 97
},
{
"epoch": 0.6272,
"grad_norm": 3.939969260526378,
"learning_rate": 1.7381189974873407e-06,
"loss": 0.1406,
"step": 98
},
{
"epoch": 0.6336,
"grad_norm": 5.4527369894058175,
"learning_rate": 1.7305029108812774e-06,
"loss": 0.1551,
"step": 99
},
{
"epoch": 0.64,
"grad_norm": 5.494861371187384,
"learning_rate": 1.7227948638273915e-06,
"loss": 0.1701,
"step": 100
},
{
"epoch": 0.6464,
"grad_norm": 4.232388610306294,
"learning_rate": 1.7149958266646754e-06,
"loss": 0.1796,
"step": 101
},
{
"epoch": 0.6528,
"grad_norm": 3.7860648199901012,
"learning_rate": 1.7071067811865474e-06,
"loss": 0.1466,
"step": 102
},
{
"epoch": 0.6592,
"grad_norm": 4.380310274735351,
"learning_rate": 1.6991287205172574e-06,
"loss": 0.1476,
"step": 103
},
{
"epoch": 0.6656,
"grad_norm": 6.315476074209258,
"learning_rate": 1.6910626489868648e-06,
"loss": 0.1676,
"step": 104
},
{
"epoch": 0.672,
"grad_norm": 4.236074270422003,
"learning_rate": 1.682909582004807e-06,
"loss": 0.1732,
"step": 105
},
{
"epoch": 0.6784,
"grad_norm": 3.887311488635536,
"learning_rate": 1.6746705459320744e-06,
"loss": 0.1819,
"step": 106
},
{
"epoch": 0.6848,
"grad_norm": 4.324784278862244,
"learning_rate": 1.6663465779520037e-06,
"loss": 0.1514,
"step": 107
},
{
"epoch": 0.6912,
"grad_norm": 3.5010867620695336,
"learning_rate": 1.6579387259397126e-06,
"loss": 0.1378,
"step": 108
},
{
"epoch": 0.6976,
"grad_norm": 4.547638459770409,
"learning_rate": 1.6494480483301835e-06,
"loss": 0.1245,
"step": 109
},
{
"epoch": 0.704,
"grad_norm": 4.154442920017231,
"learning_rate": 1.640875613985024e-06,
"loss": 0.1507,
"step": 110
},
{
"epoch": 0.7104,
"grad_norm": 4.2738669740461805,
"learning_rate": 1.6322225020579096e-06,
"loss": 0.1578,
"step": 111
},
{
"epoch": 0.7168,
"grad_norm": 3.030419612133566,
"learning_rate": 1.6234898018587336e-06,
"loss": 0.1248,
"step": 112
},
{
"epoch": 0.7232,
"grad_norm": 3.236869339406748,
"learning_rate": 1.6146786127164771e-06,
"loss": 0.1522,
"step": 113
},
{
"epoch": 0.7296,
"grad_norm": 3.0781522550717986,
"learning_rate": 1.6057900438408199e-06,
"loss": 0.1186,
"step": 114
},
{
"epoch": 0.736,
"grad_norm": 4.235359931610902,
"learning_rate": 1.5968252141825035e-06,
"loss": 0.181,
"step": 115
},
{
"epoch": 0.7424,
"grad_norm": 3.8393666889310145,
"learning_rate": 1.587785252292473e-06,
"loss": 0.1951,
"step": 116
},
{
"epoch": 0.7488,
"grad_norm": 4.533557115616342,
"learning_rate": 1.578671296179806e-06,
"loss": 0.1162,
"step": 117
},
{
"epoch": 0.7552,
"grad_norm": 4.155646517913304,
"learning_rate": 1.569484493168452e-06,
"loss": 0.1569,
"step": 118
},
{
"epoch": 0.7616,
"grad_norm": 3.612304887656086,
"learning_rate": 1.5602259997528027e-06,
"loss": 0.1657,
"step": 119
},
{
"epoch": 0.768,
"grad_norm": 3.4718761590803457,
"learning_rate": 1.5508969814521024e-06,
"loss": 0.1006,
"step": 120
},
{
"epoch": 0.7744,
"grad_norm": 2.749613882614241,
"learning_rate": 1.5414986126637257e-06,
"loss": 0.1225,
"step": 121
},
{
"epoch": 0.7808,
"grad_norm": 3.9202126219465185,
"learning_rate": 1.5320320765153365e-06,
"loss": 0.1651,
"step": 122
},
{
"epoch": 0.7872,
"grad_norm": 3.769859629804315,
"learning_rate": 1.5224985647159488e-06,
"loss": 0.1207,
"step": 123
},
{
"epoch": 0.7936,
"grad_norm": 4.258299529843038,
"learning_rate": 1.5128992774059062e-06,
"loss": 0.1485,
"step": 124
},
{
"epoch": 0.8,
"grad_norm": 3.5138242342415755,
"learning_rate": 1.5032354230058002e-06,
"loss": 0.1716,
"step": 125
},
{
"epoch": 0.8064,
"grad_norm": 3.1536366453296063,
"learning_rate": 1.4935082180643467e-06,
"loss": 0.1567,
"step": 126
},
{
"epoch": 0.8128,
"grad_norm": 3.8255471768086466,
"learning_rate": 1.4837188871052397e-06,
"loss": 0.1289,
"step": 127
},
{
"epoch": 0.8192,
"grad_norm": 3.998586103793301,
"learning_rate": 1.4738686624729987e-06,
"loss": 0.1708,
"step": 128
},
{
"epoch": 0.8256,
"grad_norm": 6.108329633585067,
"learning_rate": 1.463958784177834e-06,
"loss": 0.1513,
"step": 129
},
{
"epoch": 0.832,
"grad_norm": 3.5116343920977755,
"learning_rate": 1.4539904997395467e-06,
"loss": 0.1426,
"step": 130
},
{
"epoch": 0.8384,
"grad_norm": 3.8466991449364487,
"learning_rate": 1.4439650640304821e-06,
"loss": 0.1879,
"step": 131
},
{
"epoch": 0.8448,
"grad_norm": 3.145269853440717,
"learning_rate": 1.433883739117558e-06,
"loss": 0.116,
"step": 132
},
{
"epoch": 0.8512,
"grad_norm": 3.6531846559334458,
"learning_rate": 1.4237477941033886e-06,
"loss": 0.1667,
"step": 133
},
{
"epoch": 0.8576,
"grad_norm": 3.870433185715427,
"learning_rate": 1.4135585049665206e-06,
"loss": 0.1159,
"step": 134
},
{
"epoch": 0.864,
"grad_norm": 2.9424803019594252,
"learning_rate": 1.4033171544008051e-06,
"loss": 0.1345,
"step": 135
},
{
"epoch": 0.8704,
"grad_norm": 3.1917917000111022,
"learning_rate": 1.3930250316539235e-06,
"loss": 0.1809,
"step": 136
},
{
"epoch": 0.8768,
"grad_norm": 3.779080435520245,
"learning_rate": 1.3826834323650898e-06,
"loss": 0.1667,
"step": 137
},
{
"epoch": 0.8832,
"grad_norm": 4.015156839459903,
"learning_rate": 1.3722936584019451e-06,
"loss": 0.1435,
"step": 138
},
{
"epoch": 0.8896,
"grad_norm": 3.3828511532441867,
"learning_rate": 1.3618570176966722e-06,
"loss": 0.1763,
"step": 139
},
{
"epoch": 0.896,
"grad_norm": 2.7378078741101004,
"learning_rate": 1.3513748240813427e-06,
"loss": 0.124,
"step": 140
},
{
"epoch": 0.9024,
"grad_norm": 2.667510129275553,
"learning_rate": 1.3408483971225249e-06,
"loss": 0.1276,
"step": 141
},
{
"epoch": 0.9088,
"grad_norm": 3.4600760953915493,
"learning_rate": 1.3302790619551672e-06,
"loss": 0.1604,
"step": 142
},
{
"epoch": 0.9152,
"grad_norm": 2.949770798393727,
"learning_rate": 1.3196681491157816e-06,
"loss": 0.1327,
"step": 143
},
{
"epoch": 0.9216,
"grad_norm": 3.0805919917128564,
"learning_rate": 1.3090169943749473e-06,
"loss": 0.1466,
"step": 144
},
{
"epoch": 0.928,
"grad_norm": 3.869134321743603,
"learning_rate": 1.298326938569156e-06,
"loss": 0.1477,
"step": 145
},
{
"epoch": 0.9344,
"grad_norm": 3.273813944999775,
"learning_rate": 1.2875993274320173e-06,
"loss": 0.1535,
"step": 146
},
{
"epoch": 0.9408,
"grad_norm": 2.607928868194842,
"learning_rate": 1.2768355114248492e-06,
"loss": 0.1175,
"step": 147
},
{
"epoch": 0.9472,
"grad_norm": 3.5206710496602436,
"learning_rate": 1.266036845566675e-06,
"loss": 0.1447,
"step": 148
},
{
"epoch": 0.9536,
"grad_norm": 2.9796516371379327,
"learning_rate": 1.2552046892636426e-06,
"loss": 0.1509,
"step": 149
},
{
"epoch": 0.96,
"grad_norm": 2.9436385585618674,
"learning_rate": 1.244340406137894e-06,
"loss": 0.1506,
"step": 150
},
{
"epoch": 0.9664,
"grad_norm": 4.010655685945475,
"learning_rate": 1.2334453638559054e-06,
"loss": 0.1641,
"step": 151
},
{
"epoch": 0.9728,
"grad_norm": 3.808704120734902,
"learning_rate": 1.2225209339563143e-06,
"loss": 0.1409,
"step": 152
},
{
"epoch": 0.9792,
"grad_norm": 4.080366555503018,
"learning_rate": 1.211568491677263e-06,
"loss": 0.1542,
"step": 153
},
{
"epoch": 0.9856,
"grad_norm": 3.32431009022062,
"learning_rate": 1.2005894157832728e-06,
"loss": 0.1554,
"step": 154
},
{
"epoch": 0.992,
"grad_norm": 4.109905324890129,
"learning_rate": 1.1895850883916785e-06,
"loss": 0.1356,
"step": 155
},
{
"epoch": 0.9984,
"grad_norm": 3.1041868788537768,
"learning_rate": 1.1785568947986366e-06,
"loss": 0.1448,
"step": 156
},
{
"epoch": 1.0048,
"grad_norm": 3.9549654789059807,
"learning_rate": 1.1675062233047363e-06,
"loss": 0.1323,
"step": 157
},
{
"epoch": 1.0112,
"grad_norm": 5.265789444979294,
"learning_rate": 1.156434465040231e-06,
"loss": 0.1336,
"step": 158
},
{
"epoch": 1.0176,
"grad_norm": 2.278886843906884,
"learning_rate": 1.1453430137899128e-06,
"loss": 0.1036,
"step": 159
},
{
"epoch": 1.024,
"grad_norm": 2.994965707347646,
"learning_rate": 1.1342332658176555e-06,
"loss": 0.1081,
"step": 160
},
{
"epoch": 1.0304,
"grad_norm": 2.707133385634132,
"learning_rate": 1.123106619690643e-06,
"loss": 0.1407,
"step": 161
},
{
"epoch": 1.0368,
"grad_norm": 2.151215553915982,
"learning_rate": 1.1119644761033077e-06,
"loss": 0.1119,
"step": 162
},
{
"epoch": 1.0432,
"grad_norm": 2.0387940365716397,
"learning_rate": 1.1008082377010045e-06,
"loss": 0.1272,
"step": 163
},
{
"epoch": 1.0496,
"grad_norm": 2.0188887298089755,
"learning_rate": 1.0896393089034335e-06,
"loss": 0.1244,
"step": 164
},
{
"epoch": 1.056,
"grad_norm": 3.5460776700681955,
"learning_rate": 1.078459095727845e-06,
"loss": 0.1104,
"step": 165
},
{
"epoch": 1.0624,
"grad_norm": 2.360145390442641,
"learning_rate": 1.0672690056120398e-06,
"loss": 0.1368,
"step": 166
},
{
"epoch": 1.0688,
"grad_norm": 2.7286076638804198,
"learning_rate": 1.0560704472371917e-06,
"loss": 0.098,
"step": 167
},
{
"epoch": 1.0752,
"grad_norm": 3.9363201426551226,
"learning_rate": 1.044864830350515e-06,
"loss": 0.1683,
"step": 168
},
{
"epoch": 1.0816,
"grad_norm": 3.2428028503112993,
"learning_rate": 1.033653565587794e-06,
"loss": 0.1142,
"step": 169
},
{
"epoch": 1.088,
"grad_norm": 3.113578605505914,
"learning_rate": 1.022438064295805e-06,
"loss": 0.1129,
"step": 170
},
{
"epoch": 1.0944,
"grad_norm": 2.8114334244479684,
"learning_rate": 1.0112197383546459e-06,
"loss": 0.1279,
"step": 171
},
{
"epoch": 1.1008,
"grad_norm": 3.1925580191789242,
"learning_rate": 1e-06,
"loss": 0.1497,
"step": 172
},
{
"epoch": 1.1072,
"grad_norm": 2.9392401643163497,
"learning_rate": 9.88780261645354e-07,
"loss": 0.1281,
"step": 173
},
{
"epoch": 1.1136,
"grad_norm": 3.2004694564966583,
"learning_rate": 9.77561935704195e-07,
"loss": 0.113,
"step": 174
},
{
"epoch": 1.12,
"grad_norm": 2.7402823405153574,
"learning_rate": 9.663464344122063e-07,
"loss": 0.0722,
"step": 175
},
{
"epoch": 1.1264,
"grad_norm": 3.870543794283185,
"learning_rate": 9.551351696494853e-07,
"loss": 0.145,
"step": 176
},
{
"epoch": 1.1328,
"grad_norm": 3.12656262154436,
"learning_rate": 9.43929552762808e-07,
"loss": 0.1343,
"step": 177
},
{
"epoch": 1.1392,
"grad_norm": 3.1524992206404527,
"learning_rate": 9.327309943879603e-07,
"loss": 0.129,
"step": 178
},
{
"epoch": 1.1456,
"grad_norm": 3.693607421239943,
"learning_rate": 9.215409042721551e-07,
"loss": 0.1206,
"step": 179
},
{
"epoch": 1.152,
"grad_norm": 2.589250388541951,
"learning_rate": 9.103606910965665e-07,
"loss": 0.1046,
"step": 180
},
{
"epoch": 1.1584,
"grad_norm": 3.9569120653267245,
"learning_rate": 8.991917622989955e-07,
"loss": 0.1437,
"step": 181
},
{
"epoch": 1.1648,
"grad_norm": 3.0436417760418824,
"learning_rate": 8.880355238966921e-07,
"loss": 0.1131,
"step": 182
},
{
"epoch": 1.1712,
"grad_norm": 2.7404896523288156,
"learning_rate": 8.768933803093572e-07,
"loss": 0.1182,
"step": 183
},
{
"epoch": 1.1776,
"grad_norm": 2.978196221491147,
"learning_rate": 8.657667341823448e-07,
"loss": 0.1331,
"step": 184
},
{
"epoch": 1.184,
"grad_norm": 2.7953950740003948,
"learning_rate": 8.546569862100875e-07,
"loss": 0.1773,
"step": 185
},
{
"epoch": 1.1904,
"grad_norm": 3.505293558413482,
"learning_rate": 8.435655349597689e-07,
"loss": 0.1289,
"step": 186
},
{
"epoch": 1.1968,
"grad_norm": 3.017122332004861,
"learning_rate": 8.324937766952636e-07,
"loss": 0.1264,
"step": 187
},
{
"epoch": 1.2032,
"grad_norm": 3.4700813765221294,
"learning_rate": 8.214431052013634e-07,
"loss": 0.123,
"step": 188
},
{
"epoch": 1.2096,
"grad_norm": 3.3325698466760905,
"learning_rate": 8.104149116083216e-07,
"loss": 0.1323,
"step": 189
},
{
"epoch": 1.216,
"grad_norm": 3.3688329432245645,
"learning_rate": 7.994105842167272e-07,
"loss": 0.1445,
"step": 190
},
{
"epoch": 1.2224,
"grad_norm": 5.410904469002936,
"learning_rate": 7.884315083227372e-07,
"loss": 0.1348,
"step": 191
},
{
"epoch": 1.2288000000000001,
"grad_norm": 2.8934452953623695,
"learning_rate": 7.774790660436857e-07,
"loss": 0.1042,
"step": 192
},
{
"epoch": 1.2352,
"grad_norm": 4.515892407436011,
"learning_rate": 7.665546361440949e-07,
"loss": 0.1344,
"step": 193
},
{
"epoch": 1.2416,
"grad_norm": 3.3336860990470583,
"learning_rate": 7.556595938621058e-07,
"loss": 0.1132,
"step": 194
},
{
"epoch": 1.248,
"grad_norm": 3.3781317545242686,
"learning_rate": 7.447953107363574e-07,
"loss": 0.1223,
"step": 195
},
{
"epoch": 1.2544,
"grad_norm": 3.4052474284643175,
"learning_rate": 7.33963154433325e-07,
"loss": 0.1147,
"step": 196
},
{
"epoch": 1.2608,
"grad_norm": 3.260066938920247,
"learning_rate": 7.231644885751507e-07,
"loss": 0.1162,
"step": 197
},
{
"epoch": 1.2671999999999999,
"grad_norm": 2.075129912437651,
"learning_rate": 7.124006725679828e-07,
"loss": 0.0891,
"step": 198
},
{
"epoch": 1.2736,
"grad_norm": 2.5796448583935994,
"learning_rate": 7.016730614308439e-07,
"loss": 0.1453,
"step": 199
},
{
"epoch": 1.28,
"grad_norm": 3.3625938910552935,
"learning_rate": 6.909830056250526e-07,
"loss": 0.1136,
"step": 200
},
{
"epoch": 1.2864,
"grad_norm": 2.7459563636737707,
"learning_rate": 6.803318508842186e-07,
"loss": 0.135,
"step": 201
},
{
"epoch": 1.2928,
"grad_norm": 3.6833307558438695,
"learning_rate": 6.697209380448332e-07,
"loss": 0.0941,
"step": 202
},
{
"epoch": 1.2992,
"grad_norm": 2.9375208875067464,
"learning_rate": 6.59151602877475e-07,
"loss": 0.1244,
"step": 203
},
{
"epoch": 1.3056,
"grad_norm": 3.7877853756064708,
"learning_rate": 6.486251759186572e-07,
"loss": 0.0908,
"step": 204
},
{
"epoch": 1.312,
"grad_norm": 2.626489357460925,
"learning_rate": 6.381429823033279e-07,
"loss": 0.1147,
"step": 205
},
{
"epoch": 1.3184,
"grad_norm": 2.7246829179484475,
"learning_rate": 6.277063415980548e-07,
"loss": 0.1111,
"step": 206
},
{
"epoch": 1.3248,
"grad_norm": 2.793990841290057,
"learning_rate": 6.173165676349102e-07,
"loss": 0.1236,
"step": 207
},
{
"epoch": 1.3312,
"grad_norm": 3.067033719132832,
"learning_rate": 6.069749683460764e-07,
"loss": 0.1269,
"step": 208
},
{
"epoch": 1.3376000000000001,
"grad_norm": 2.62953934675259,
"learning_rate": 5.96682845599195e-07,
"loss": 0.1296,
"step": 209
},
{
"epoch": 1.3439999999999999,
"grad_norm": 2.295428084618355,
"learning_rate": 5.864414950334795e-07,
"loss": 0.1067,
"step": 210
},
{
"epoch": 1.3504,
"grad_norm": 2.494288637796771,
"learning_rate": 5.762522058966113e-07,
"loss": 0.1012,
"step": 211
},
{
"epoch": 1.3568,
"grad_norm": 2.7289466545252856,
"learning_rate": 5.661162608824419e-07,
"loss": 0.1263,
"step": 212
},
{
"epoch": 1.3632,
"grad_norm": 3.297503764337652,
"learning_rate": 5.56034935969518e-07,
"loss": 0.1175,
"step": 213
},
{
"epoch": 1.3696,
"grad_norm": 2.733287987985122,
"learning_rate": 5.460095002604532e-07,
"loss": 0.1125,
"step": 214
},
{
"epoch": 1.376,
"grad_norm": 2.7994559067104574,
"learning_rate": 5.36041215822166e-07,
"loss": 0.1519,
"step": 215
},
{
"epoch": 1.3824,
"grad_norm": 2.7375733905542643,
"learning_rate": 5.261313375270013e-07,
"loss": 0.1173,
"step": 216
},
{
"epoch": 1.3888,
"grad_norm": 3.492872700831482,
"learning_rate": 5.162811128947602e-07,
"loss": 0.1594,
"step": 217
},
{
"epoch": 1.3952,
"grad_norm": 3.538990737016252,
"learning_rate": 5.064917819356531e-07,
"loss": 0.1121,
"step": 218
},
{
"epoch": 1.4016,
"grad_norm": 3.456492027602956,
"learning_rate": 4.967645769941999e-07,
"loss": 0.1329,
"step": 219
},
{
"epoch": 1.408,
"grad_norm": 6.8845618461429225,
"learning_rate": 4.871007225940939e-07,
"loss": 0.1215,
"step": 220
},
{
"epoch": 1.4144,
"grad_norm": 4.538034399979344,
"learning_rate": 4.775014352840512e-07,
"loss": 0.1148,
"step": 221
},
{
"epoch": 1.4208,
"grad_norm": 3.381941347409583,
"learning_rate": 4.6796792348466353e-07,
"loss": 0.1102,
"step": 222
},
{
"epoch": 1.4272,
"grad_norm": 2.3402894784761856,
"learning_rate": 4.585013873362743e-07,
"loss": 0.1208,
"step": 223
},
{
"epoch": 1.4336,
"grad_norm": 2.92152467138924,
"learning_rate": 4.4910301854789755e-07,
"loss": 0.1377,
"step": 224
},
{
"epoch": 1.44,
"grad_norm": 2.5271124781868988,
"learning_rate": 4.397740002471972e-07,
"loss": 0.1004,
"step": 225
},
{
"epoch": 1.4464000000000001,
"grad_norm": 3.2747270637549777,
"learning_rate": 4.3051550683154804e-07,
"loss": 0.1281,
"step": 226
},
{
"epoch": 1.4527999999999999,
"grad_norm": 2.6095079208452185,
"learning_rate": 4.2132870382019427e-07,
"loss": 0.1055,
"step": 227
},
{
"epoch": 1.4592,
"grad_norm": 5.83458655577108,
"learning_rate": 4.1221474770752696e-07,
"loss": 0.1215,
"step": 228
},
{
"epoch": 1.4656,
"grad_norm": 3.1153974927503763,
"learning_rate": 4.031747858174964e-07,
"loss": 0.0968,
"step": 229
},
{
"epoch": 1.472,
"grad_norm": 4.929073484618664,
"learning_rate": 3.942099561591802e-07,
"loss": 0.1048,
"step": 230
},
{
"epoch": 1.4784,
"grad_norm": 2.5166547994261417,
"learning_rate": 3.853213872835228e-07,
"loss": 0.1058,
"step": 231
},
{
"epoch": 1.4848,
"grad_norm": 2.4668836440326865,
"learning_rate": 3.765101981412665e-07,
"loss": 0.1308,
"step": 232
},
{
"epoch": 1.4912,
"grad_norm": 1.9951943331950452,
"learning_rate": 3.677774979420903e-07,
"loss": 0.1018,
"step": 233
},
{
"epoch": 1.4976,
"grad_norm": 2.9908171754511437,
"learning_rate": 3.5912438601497584e-07,
"loss": 0.1161,
"step": 234
},
{
"epoch": 1.504,
"grad_norm": 3.3264145678441976,
"learning_rate": 3.5055195166981646e-07,
"loss": 0.128,
"step": 235
},
{
"epoch": 1.5104,
"grad_norm": 3.0003805177406733,
"learning_rate": 3.420612740602874e-07,
"loss": 0.1273,
"step": 236
},
{
"epoch": 1.5168,
"grad_norm": 4.210870037523149,
"learning_rate": 3.3365342204799606e-07,
"loss": 0.1269,
"step": 237
},
{
"epoch": 1.5232,
"grad_norm": 3.1058783196727666,
"learning_rate": 3.253294540679257e-07,
"loss": 0.1253,
"step": 238
},
{
"epoch": 1.5295999999999998,
"grad_norm": 2.7923304271341607,
"learning_rate": 3.170904179951931e-07,
"loss": 0.112,
"step": 239
},
{
"epoch": 1.536,
"grad_norm": 3.193465065928833,
"learning_rate": 3.0893735101313535e-07,
"loss": 0.1188,
"step": 240
},
{
"epoch": 1.5424,
"grad_norm": 3.204335563563906,
"learning_rate": 3.008712794827426e-07,
"loss": 0.1384,
"step": 241
},
{
"epoch": 1.5488,
"grad_norm": 2.5887872249352806,
"learning_rate": 2.9289321881345254e-07,
"loss": 0.124,
"step": 242
},
{
"epoch": 1.5552000000000001,
"grad_norm": 2.6928108437347698,
"learning_rate": 2.850041733353247e-07,
"loss": 0.1071,
"step": 243
},
{
"epoch": 1.5615999999999999,
"grad_norm": 2.6514260220506563,
"learning_rate": 2.7720513617260855e-07,
"loss": 0.1284,
"step": 244
},
{
"epoch": 1.568,
"grad_norm": 3.4173755675031807,
"learning_rate": 2.6949708911872247e-07,
"loss": 0.1106,
"step": 245
},
{
"epoch": 1.5744,
"grad_norm": 3.964433823626616,
"learning_rate": 2.6188100251265943e-07,
"loss": 0.1087,
"step": 246
},
{
"epoch": 1.5808,
"grad_norm": 4.629149748841336,
"learning_rate": 2.543578351168344e-07,
"loss": 0.1148,
"step": 247
},
{
"epoch": 1.5872000000000002,
"grad_norm": 3.4337305056813197,
"learning_rate": 2.4692853399638913e-07,
"loss": 0.1387,
"step": 248
},
{
"epoch": 1.5936,
"grad_norm": 3.71914004045554,
"learning_rate": 2.395940343999691e-07,
"loss": 0.1216,
"step": 249
},
{
"epoch": 1.6,
"grad_norm": 6.720706070128215,
"learning_rate": 2.3235525964198888e-07,
"loss": 0.1501,
"step": 250
},
{
"epoch": 1.6064,
"grad_norm": 3.30556531360598,
"learning_rate": 2.252131209863991e-07,
"loss": 0.1052,
"step": 251
},
{
"epoch": 1.6128,
"grad_norm": 4.4642941608850055,
"learning_rate": 2.181685175319702e-07,
"loss": 0.139,
"step": 252
},
{
"epoch": 1.6192,
"grad_norm": 2.6766521332963777,
"learning_rate": 2.11222336099109e-07,
"loss": 0.0887,
"step": 253
},
{
"epoch": 1.6256,
"grad_norm": 3.3097474207466444,
"learning_rate": 2.043754511182191e-07,
"loss": 0.1218,
"step": 254
},
{
"epoch": 1.6320000000000001,
"grad_norm": 3.062260478274747,
"learning_rate": 1.9762872451962208e-07,
"loss": 0.1226,
"step": 255
},
{
"epoch": 1.6383999999999999,
"grad_norm": 3.2630102453572882,
"learning_rate": 1.9098300562505264e-07,
"loss": 0.0844,
"step": 256
},
{
"epoch": 1.6448,
"grad_norm": 2.7218126147886945,
"learning_rate": 1.8443913104073982e-07,
"loss": 0.1163,
"step": 257
},
{
"epoch": 1.6512,
"grad_norm": 3.7971992271480186,
"learning_rate": 1.7799792455209016e-07,
"loss": 0.1181,
"step": 258
},
{
"epoch": 1.6576,
"grad_norm": 3.738441130140733,
"learning_rate": 1.716601970199836e-07,
"loss": 0.1301,
"step": 259
},
{
"epoch": 1.6640000000000001,
"grad_norm": 3.05018064730553,
"learning_rate": 1.6542674627869734e-07,
"loss": 0.0798,
"step": 260
},
{
"epoch": 1.6703999999999999,
"grad_norm": 3.2523350421667363,
"learning_rate": 1.592983570354699e-07,
"loss": 0.1082,
"step": 261
},
{
"epoch": 1.6768,
"grad_norm": 3.284724299450673,
"learning_rate": 1.5327580077171588e-07,
"loss": 0.1207,
"step": 262
},
{
"epoch": 1.6832,
"grad_norm": 3.92886529982871,
"learning_rate": 1.473598356459078e-07,
"loss": 0.0965,
"step": 263
},
{
"epoch": 1.6896,
"grad_norm": 3.288393589630192,
"learning_rate": 1.415512063981339e-07,
"loss": 0.1356,
"step": 264
},
{
"epoch": 1.696,
"grad_norm": 2.4891167660291917,
"learning_rate": 1.358506442563454e-07,
"loss": 0.1205,
"step": 265
},
{
"epoch": 1.7024,
"grad_norm": 2.749129320779883,
"learning_rate": 1.3025886684430465e-07,
"loss": 0.1114,
"step": 266
},
{
"epoch": 1.7088,
"grad_norm": 2.9939547397600887,
"learning_rate": 1.2477657809124632e-07,
"loss": 0.1267,
"step": 267
},
{
"epoch": 1.7151999999999998,
"grad_norm": 3.059481363897511,
"learning_rate": 1.19404468143262e-07,
"loss": 0.13,
"step": 268
},
{
"epoch": 1.7216,
"grad_norm": 3.172719990097923,
"learning_rate": 1.1414321327642019e-07,
"loss": 0.1003,
"step": 269
},
{
"epoch": 1.728,
"grad_norm": 2.763074619615047,
"learning_rate": 1.089934758116322e-07,
"loss": 0.107,
"step": 270
},
{
"epoch": 1.7344,
"grad_norm": 3.3513605159844477,
"learning_rate": 1.0395590403127486e-07,
"loss": 0.1,
"step": 271
},
{
"epoch": 1.7408000000000001,
"grad_norm": 3.0130218075200284,
"learning_rate": 9.903113209758096e-08,
"loss": 0.1014,
"step": 272
},
{
"epoch": 1.7471999999999999,
"grad_norm": 3.2984153616886442,
"learning_rate": 9.421977997280594e-08,
"loss": 0.1443,
"step": 273
},
{
"epoch": 1.7536,
"grad_norm": 2.4116503417935977,
"learning_rate": 8.952245334118413e-08,
"loss": 0.129,
"step": 274
},
{
"epoch": 1.76,
"grad_norm": 2.303377440533526,
"learning_rate": 8.493974353268019e-08,
"loss": 0.1433,
"step": 275
},
{
"epoch": 1.7664,
"grad_norm": 2.575473010066114,
"learning_rate": 8.047222744854942e-08,
"loss": 0.0928,
"step": 276
},
{
"epoch": 1.7728000000000002,
"grad_norm": 2.862390938762308,
"learning_rate": 7.612046748871326e-08,
"loss": 0.1003,
"step": 277
},
{
"epoch": 1.7792,
"grad_norm": 2.512371847069459,
"learning_rate": 7.188501148096116e-08,
"loss": 0.1068,
"step": 278
},
{
"epoch": 1.7856,
"grad_norm": 3.9557015566480285,
"learning_rate": 6.77663926119858e-08,
"loss": 0.111,
"step": 279
},
{
"epoch": 1.792,
"grad_norm": 3.237819756927862,
"learning_rate": 6.376512936026279e-08,
"loss": 0.1242,
"step": 280
},
{
"epoch": 1.7984,
"grad_norm": 2.3340318791785646,
"learning_rate": 5.988172543078096e-08,
"loss": 0.1061,
"step": 281
},
{
"epoch": 1.8048,
"grad_norm": 2.912775412360275,
"learning_rate": 5.611666969163242e-08,
"loss": 0.1288,
"step": 282
},
{
"epoch": 1.8112,
"grad_norm": 3.5993922569990877,
"learning_rate": 5.2470436112471264e-08,
"loss": 0.1502,
"step": 283
},
{
"epoch": 1.8176,
"grad_norm": 3.1906595269687066,
"learning_rate": 4.8943483704846465e-08,
"loss": 0.1129,
"step": 284
},
{
"epoch": 1.8239999999999998,
"grad_norm": 2.7886058393742177,
"learning_rate": 4.553625646441928e-08,
"loss": 0.1417,
"step": 285
},
{
"epoch": 1.8304,
"grad_norm": 4.576338749251723,
"learning_rate": 4.224918331506955e-08,
"loss": 0.1079,
"step": 286
},
{
"epoch": 1.8368,
"grad_norm": 2.3424965325338434,
"learning_rate": 3.908267805490051e-08,
"loss": 0.113,
"step": 287
},
{
"epoch": 1.8432,
"grad_norm": 2.8896940146945,
"learning_rate": 3.6037139304146756e-08,
"loss": 0.132,
"step": 288
},
{
"epoch": 1.8496000000000001,
"grad_norm": 3.051287182293896,
"learning_rate": 3.3112950454993625e-08,
"loss": 0.1092,
"step": 289
},
{
"epoch": 1.8559999999999999,
"grad_norm": 3.4677503211105645,
"learning_rate": 3.0310479623313125e-08,
"loss": 0.1255,
"step": 290
},
{
"epoch": 1.8624,
"grad_norm": 2.629396760631384,
"learning_rate": 2.7630079602323443e-08,
"loss": 0.1082,
"step": 291
},
{
"epoch": 1.8688,
"grad_norm": 3.060707277576519,
"learning_rate": 2.507208781817638e-08,
"loss": 0.1289,
"step": 292
},
{
"epoch": 1.8752,
"grad_norm": 4.498542316071858,
"learning_rate": 2.263682628748087e-08,
"loss": 0.1112,
"step": 293
},
{
"epoch": 1.8816000000000002,
"grad_norm": 3.3242615389567653,
"learning_rate": 2.032460157676452e-08,
"loss": 0.1402,
"step": 294
},
{
"epoch": 1.888,
"grad_norm": 2.117823787343076,
"learning_rate": 1.8135704763881598e-08,
"loss": 0.1214,
"step": 295
},
{
"epoch": 1.8944,
"grad_norm": 3.7999512049595867,
"learning_rate": 1.607041140137033e-08,
"loss": 0.106,
"step": 296
},
{
"epoch": 1.9008,
"grad_norm": 2.9706181280580086,
"learning_rate": 1.4128981481764113e-08,
"loss": 0.1121,
"step": 297
},
{
"epoch": 1.9072,
"grad_norm": 2.3832740990053702,
"learning_rate": 1.231165940486234e-08,
"loss": 0.1084,
"step": 298
},
{
"epoch": 1.9136,
"grad_norm": 2.686907690774645,
"learning_rate": 1.0618673946963364e-08,
"loss": 0.1138,
"step": 299
},
{
"epoch": 1.92,
"grad_norm": 2.632510645620075,
"learning_rate": 9.050238232065299e-09,
"loss": 0.1069,
"step": 300
},
{
"epoch": 1.9264000000000001,
"grad_norm": 3.3133984005584614,
"learning_rate": 7.606549705035935e-09,
"loss": 0.1158,
"step": 301
},
{
"epoch": 1.9327999999999999,
"grad_norm": 3.033325678592628,
"learning_rate": 6.2877901067573955e-09,
"loss": 0.109,
"step": 302
},
{
"epoch": 1.9392,
"grad_norm": 2.625891227751338,
"learning_rate": 5.094125451247655e-09,
"loss": 0.1198,
"step": 303
},
{
"epoch": 1.9456,
"grad_norm": 4.0371687332878645,
"learning_rate": 4.025706004760931e-09,
"loss": 0.1089,
"step": 304
},
{
"epoch": 1.952,
"grad_norm": 2.681798202229694,
"learning_rate": 3.082666266872036e-09,
"loss": 0.1012,
"step": 305
},
{
"epoch": 1.9584000000000001,
"grad_norm": 2.3816672358564093,
"learning_rate": 2.2651249535439177e-09,
"loss": 0.132,
"step": 306
},
{
"epoch": 1.9647999999999999,
"grad_norm": 2.731557615502218,
"learning_rate": 1.5731849821833953e-09,
"loss": 0.1205,
"step": 307
},
{
"epoch": 1.9712,
"grad_norm": 2.3980291292550673,
"learning_rate": 1.0069334586854105e-09,
"loss": 0.0922,
"step": 308
},
{
"epoch": 1.9776,
"grad_norm": 2.262332556662717,
"learning_rate": 5.664416664666882e-10,
"loss": 0.1116,
"step": 309
},
{
"epoch": 1.984,
"grad_norm": 3.7908540657365184,
"learning_rate": 2.517650574934693e-10,
"loss": 0.1022,
"step": 310
},
{
"epoch": 1.9904,
"grad_norm": 2.5746244022394906,
"learning_rate": 6.29432452994294e-11,
"loss": 0.1039,
"step": 311
},
{
"epoch": 1.9968,
"grad_norm": 2.1856958417394132,
"learning_rate": 0.0,
"loss": 0.1103,
"step": 312
},
{
"epoch": 1.9968,
"step": 312,
"total_flos": 211349000945664.0,
"train_loss": 0.13338581045182088,
"train_runtime": 10585.2413,
"train_samples_per_second": 7.556,
"train_steps_per_second": 0.029
}
],
"logging_steps": 1,
"max_steps": 312,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 211349000945664.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}