{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 315,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003177124702144559,
"grad_norm": 0.9960207343101501,
"learning_rate": 0.0,
"loss": 2.9881,
"step": 1
},
{
"epoch": 0.006354249404289118,
"grad_norm": 0.9263616800308228,
"learning_rate": 4e-05,
"loss": 2.9644,
"step": 2
},
{
"epoch": 0.009531374106433678,
"grad_norm": 0.9059100151062012,
"learning_rate": 8e-05,
"loss": 2.7584,
"step": 3
},
{
"epoch": 0.012708498808578236,
"grad_norm": 0.8163802027702332,
"learning_rate": 0.00012,
"loss": 2.7029,
"step": 4
},
{
"epoch": 0.015885623510722795,
"grad_norm": 0.7039979696273804,
"learning_rate": 0.00016,
"loss": 2.5991,
"step": 5
},
{
"epoch": 0.019062748212867357,
"grad_norm": 0.5652405023574829,
"learning_rate": 0.0002,
"loss": 2.4187,
"step": 6
},
{
"epoch": 0.022239872915011914,
"grad_norm": 0.7419441342353821,
"learning_rate": 0.00019968,
"loss": 2.2885,
"step": 7
},
{
"epoch": 0.025416997617156472,
"grad_norm": 0.6665301322937012,
"learning_rate": 0.00019936000000000002,
"loss": 2.3176,
"step": 8
},
{
"epoch": 0.028594122319301033,
"grad_norm": 0.5366690158843994,
"learning_rate": 0.00019904,
"loss": 2.2728,
"step": 9
},
{
"epoch": 0.03177124702144559,
"grad_norm": 0.5066737532615662,
"learning_rate": 0.00019872000000000002,
"loss": 2.068,
"step": 10
},
{
"epoch": 0.03494837172359015,
"grad_norm": 0.5810503959655762,
"learning_rate": 0.0001984,
"loss": 2.2197,
"step": 11
},
{
"epoch": 0.03812549642573471,
"grad_norm": 0.47433704137802124,
"learning_rate": 0.00019808,
"loss": 1.9871,
"step": 12
},
{
"epoch": 0.04130262112787927,
"grad_norm": 0.49652695655822754,
"learning_rate": 0.00019776,
"loss": 1.9139,
"step": 13
},
{
"epoch": 0.04447974583002383,
"grad_norm": 0.4280414879322052,
"learning_rate": 0.00019744,
"loss": 1.8852,
"step": 14
},
{
"epoch": 0.04765687053216839,
"grad_norm": 0.596341609954834,
"learning_rate": 0.00019712,
"loss": 1.9809,
"step": 15
},
{
"epoch": 0.050833995234312944,
"grad_norm": 0.5067018866539001,
"learning_rate": 0.0001968,
"loss": 1.9549,
"step": 16
},
{
"epoch": 0.054011119936457505,
"grad_norm": 0.4348883032798767,
"learning_rate": 0.00019648000000000002,
"loss": 1.9078,
"step": 17
},
{
"epoch": 0.057188244638602066,
"grad_norm": 0.4295920133590698,
"learning_rate": 0.00019616000000000002,
"loss": 1.8234,
"step": 18
},
{
"epoch": 0.06036536934074663,
"grad_norm": 0.43549808859825134,
"learning_rate": 0.00019584,
"loss": 1.9143,
"step": 19
},
{
"epoch": 0.06354249404289118,
"grad_norm": 0.4168950319290161,
"learning_rate": 0.00019552000000000003,
"loss": 1.7874,
"step": 20
},
{
"epoch": 0.06671961874503574,
"grad_norm": 0.4715218245983124,
"learning_rate": 0.0001952,
"loss": 1.8926,
"step": 21
},
{
"epoch": 0.0698967434471803,
"grad_norm": 0.38382261991500854,
"learning_rate": 0.00019488000000000003,
"loss": 1.7832,
"step": 22
},
{
"epoch": 0.07307386814932486,
"grad_norm": 0.43702301383018494,
"learning_rate": 0.00019456,
"loss": 1.7447,
"step": 23
},
{
"epoch": 0.07625099285146943,
"grad_norm": 0.4813467562198639,
"learning_rate": 0.00019424,
"loss": 1.8851,
"step": 24
},
{
"epoch": 0.07942811755361398,
"grad_norm": 0.4026224911212921,
"learning_rate": 0.00019392000000000001,
"loss": 1.8036,
"step": 25
},
{
"epoch": 0.08260524225575853,
"grad_norm": 0.39529579877853394,
"learning_rate": 0.00019360000000000002,
"loss": 1.7207,
"step": 26
},
{
"epoch": 0.0857823669579031,
"grad_norm": 0.4045431911945343,
"learning_rate": 0.00019328000000000002,
"loss": 1.7962,
"step": 27
},
{
"epoch": 0.08895949166004766,
"grad_norm": 0.3818039894104004,
"learning_rate": 0.00019296,
"loss": 1.68,
"step": 28
},
{
"epoch": 0.09213661636219221,
"grad_norm": 0.3767971098423004,
"learning_rate": 0.00019264,
"loss": 1.7949,
"step": 29
},
{
"epoch": 0.09531374106433678,
"grad_norm": 0.38719820976257324,
"learning_rate": 0.00019232,
"loss": 1.7069,
"step": 30
},
{
"epoch": 0.09849086576648133,
"grad_norm": 0.416172057390213,
"learning_rate": 0.000192,
"loss": 1.6897,
"step": 31
},
{
"epoch": 0.10166799046862589,
"grad_norm": 0.3721797466278076,
"learning_rate": 0.00019168,
"loss": 1.6996,
"step": 32
},
{
"epoch": 0.10484511517077046,
"grad_norm": 0.4110720753669739,
"learning_rate": 0.00019136,
"loss": 1.5606,
"step": 33
},
{
"epoch": 0.10802223987291501,
"grad_norm": 0.3850787580013275,
"learning_rate": 0.00019104000000000001,
"loss": 1.7615,
"step": 34
},
{
"epoch": 0.11119936457505956,
"grad_norm": 0.33883488178253174,
"learning_rate": 0.00019072000000000002,
"loss": 1.5231,
"step": 35
},
{
"epoch": 0.11437648927720413,
"grad_norm": 0.37157201766967773,
"learning_rate": 0.0001904,
"loss": 1.6246,
"step": 36
},
{
"epoch": 0.11755361397934869,
"grad_norm": 0.3907526433467865,
"learning_rate": 0.00019008000000000002,
"loss": 1.6537,
"step": 37
},
{
"epoch": 0.12073073868149325,
"grad_norm": 0.39647847414016724,
"learning_rate": 0.00018976,
"loss": 1.6193,
"step": 38
},
{
"epoch": 0.12390786338363781,
"grad_norm": 0.34513983130455017,
"learning_rate": 0.00018944000000000003,
"loss": 1.4992,
"step": 39
},
{
"epoch": 0.12708498808578236,
"grad_norm": 0.4174259305000305,
"learning_rate": 0.00018912,
"loss": 1.6547,
"step": 40
},
{
"epoch": 0.13026211278792693,
"grad_norm": 0.42140164971351624,
"learning_rate": 0.0001888,
"loss": 1.6165,
"step": 41
},
{
"epoch": 0.13343923749007147,
"grad_norm": 0.40260136127471924,
"learning_rate": 0.00018848,
"loss": 1.5583,
"step": 42
},
{
"epoch": 0.13661636219221604,
"grad_norm": 0.42584484815597534,
"learning_rate": 0.00018816000000000001,
"loss": 1.5742,
"step": 43
},
{
"epoch": 0.1397934868943606,
"grad_norm": 0.3613159954547882,
"learning_rate": 0.00018784000000000002,
"loss": 1.6282,
"step": 44
},
{
"epoch": 0.14297061159650518,
"grad_norm": 0.45315852761268616,
"learning_rate": 0.00018752,
"loss": 1.6274,
"step": 45
},
{
"epoch": 0.14614773629864972,
"grad_norm": 0.3528841733932495,
"learning_rate": 0.00018720000000000002,
"loss": 1.5076,
"step": 46
},
{
"epoch": 0.14932486100079428,
"grad_norm": 0.4335366487503052,
"learning_rate": 0.00018688,
"loss": 1.5561,
"step": 47
},
{
"epoch": 0.15250198570293885,
"grad_norm": 0.4157211184501648,
"learning_rate": 0.00018656,
"loss": 1.4963,
"step": 48
},
{
"epoch": 0.1556791104050834,
"grad_norm": 0.3932294249534607,
"learning_rate": 0.00018624,
"loss": 1.5381,
"step": 49
},
{
"epoch": 0.15885623510722796,
"grad_norm": 0.47142326831817627,
"learning_rate": 0.00018592,
"loss": 1.7029,
"step": 50
},
{
"epoch": 0.16203335980937253,
"grad_norm": 0.46922552585601807,
"learning_rate": 0.0001856,
"loss": 1.5722,
"step": 51
},
{
"epoch": 0.16521048451151707,
"grad_norm": 0.40435677766799927,
"learning_rate": 0.00018528000000000001,
"loss": 1.5885,
"step": 52
},
{
"epoch": 0.16838760921366164,
"grad_norm": 0.4449491500854492,
"learning_rate": 0.00018496,
"loss": 1.5691,
"step": 53
},
{
"epoch": 0.1715647339158062,
"grad_norm": 0.46489715576171875,
"learning_rate": 0.00018464000000000002,
"loss": 1.5736,
"step": 54
},
{
"epoch": 0.17474185861795075,
"grad_norm": 0.4461865723133087,
"learning_rate": 0.00018432,
"loss": 1.5359,
"step": 55
},
{
"epoch": 0.17791898332009531,
"grad_norm": 0.4674195349216461,
"learning_rate": 0.00018400000000000003,
"loss": 1.4617,
"step": 56
},
{
"epoch": 0.18109610802223988,
"grad_norm": 0.3901899755001068,
"learning_rate": 0.00018368,
"loss": 1.5141,
"step": 57
},
{
"epoch": 0.18427323272438442,
"grad_norm": 0.46142131090164185,
"learning_rate": 0.00018336,
"loss": 1.5436,
"step": 58
},
{
"epoch": 0.187450357426529,
"grad_norm": 0.4317268431186676,
"learning_rate": 0.00018304,
"loss": 1.56,
"step": 59
},
{
"epoch": 0.19062748212867356,
"grad_norm": 0.44914504885673523,
"learning_rate": 0.00018272,
"loss": 1.5477,
"step": 60
},
{
"epoch": 0.1938046068308181,
"grad_norm": 0.43380966782569885,
"learning_rate": 0.00018240000000000002,
"loss": 1.4989,
"step": 61
},
{
"epoch": 0.19698173153296267,
"grad_norm": 0.41798654198646545,
"learning_rate": 0.00018208000000000002,
"loss": 1.3978,
"step": 62
},
{
"epoch": 0.20015885623510724,
"grad_norm": 0.4322330355644226,
"learning_rate": 0.00018176000000000002,
"loss": 1.5094,
"step": 63
},
{
"epoch": 0.20333598093725178,
"grad_norm": 0.4732660949230194,
"learning_rate": 0.00018144,
"loss": 1.4756,
"step": 64
},
{
"epoch": 0.20651310563939634,
"grad_norm": 0.41877272725105286,
"learning_rate": 0.00018112,
"loss": 1.4598,
"step": 65
},
{
"epoch": 0.2096902303415409,
"grad_norm": 0.46112221479415894,
"learning_rate": 0.0001808,
"loss": 1.5562,
"step": 66
},
{
"epoch": 0.21286735504368545,
"grad_norm": 0.3946124017238617,
"learning_rate": 0.00018048,
"loss": 1.4517,
"step": 67
},
{
"epoch": 0.21604447974583002,
"grad_norm": 0.452828586101532,
"learning_rate": 0.00018016,
"loss": 1.5298,
"step": 68
},
{
"epoch": 0.2192216044479746,
"grad_norm": 0.45543792843818665,
"learning_rate": 0.00017984,
"loss": 1.5945,
"step": 69
},
{
"epoch": 0.22239872915011913,
"grad_norm": 0.4937468469142914,
"learning_rate": 0.00017952,
"loss": 1.5111,
"step": 70
},
{
"epoch": 0.2255758538522637,
"grad_norm": 0.43769434094429016,
"learning_rate": 0.00017920000000000002,
"loss": 1.5139,
"step": 71
},
{
"epoch": 0.22875297855440826,
"grad_norm": 0.42178353667259216,
"learning_rate": 0.00017888,
"loss": 1.3964,
"step": 72
},
{
"epoch": 0.23193010325655283,
"grad_norm": 0.4274325668811798,
"learning_rate": 0.00017856000000000003,
"loss": 1.4176,
"step": 73
},
{
"epoch": 0.23510722795869737,
"grad_norm": 0.4603947401046753,
"learning_rate": 0.00017824,
"loss": 1.549,
"step": 74
},
{
"epoch": 0.23828435266084194,
"grad_norm": 0.4948660731315613,
"learning_rate": 0.00017792,
"loss": 1.4564,
"step": 75
},
{
"epoch": 0.2414614773629865,
"grad_norm": 0.4219314455986023,
"learning_rate": 0.0001776,
"loss": 1.4946,
"step": 76
},
{
"epoch": 0.24463860206513105,
"grad_norm": 0.49445462226867676,
"learning_rate": 0.00017728,
"loss": 1.5655,
"step": 77
},
{
"epoch": 0.24781572676727562,
"grad_norm": 0.4661003053188324,
"learning_rate": 0.00017696,
"loss": 1.5347,
"step": 78
},
{
"epoch": 0.2509928514694202,
"grad_norm": 0.49738094210624695,
"learning_rate": 0.00017664000000000002,
"loss": 1.5218,
"step": 79
},
{
"epoch": 0.2541699761715647,
"grad_norm": 0.44844523072242737,
"learning_rate": 0.00017632000000000002,
"loss": 1.4657,
"step": 80
},
{
"epoch": 0.25734710087370927,
"grad_norm": 0.5222679972648621,
"learning_rate": 0.00017600000000000002,
"loss": 1.4799,
"step": 81
},
{
"epoch": 0.26052422557585386,
"grad_norm": 0.5003090500831604,
"learning_rate": 0.00017568,
"loss": 1.4998,
"step": 82
},
{
"epoch": 0.2637013502779984,
"grad_norm": 0.4072366952896118,
"learning_rate": 0.00017536,
"loss": 1.5213,
"step": 83
},
{
"epoch": 0.26687847498014294,
"grad_norm": 0.42663538455963135,
"learning_rate": 0.00017504,
"loss": 1.5446,
"step": 84
},
{
"epoch": 0.27005559968228754,
"grad_norm": 0.45552435517311096,
"learning_rate": 0.00017472,
"loss": 1.5624,
"step": 85
},
{
"epoch": 0.2732327243844321,
"grad_norm": 0.463173508644104,
"learning_rate": 0.0001744,
"loss": 1.4161,
"step": 86
},
{
"epoch": 0.2764098490865767,
"grad_norm": 0.4052661955356598,
"learning_rate": 0.00017408,
"loss": 1.5228,
"step": 87
},
{
"epoch": 0.2795869737887212,
"grad_norm": 0.3988233506679535,
"learning_rate": 0.00017376000000000002,
"loss": 1.3896,
"step": 88
},
{
"epoch": 0.28276409849086576,
"grad_norm": 0.3923889398574829,
"learning_rate": 0.00017344,
"loss": 1.4572,
"step": 89
},
{
"epoch": 0.28594122319301035,
"grad_norm": 0.46868669986724854,
"learning_rate": 0.00017312000000000002,
"loss": 1.5586,
"step": 90
},
{
"epoch": 0.2891183478951549,
"grad_norm": 0.43891963362693787,
"learning_rate": 0.0001728,
"loss": 1.5185,
"step": 91
},
{
"epoch": 0.29229547259729943,
"grad_norm": 0.4684846103191376,
"learning_rate": 0.00017248000000000003,
"loss": 1.4992,
"step": 92
},
{
"epoch": 0.29547259729944403,
"grad_norm": 0.4795592725276947,
"learning_rate": 0.00017216,
"loss": 1.4987,
"step": 93
},
{
"epoch": 0.29864972200158857,
"grad_norm": 0.4201822578907013,
"learning_rate": 0.00017184,
"loss": 1.4367,
"step": 94
},
{
"epoch": 0.3018268467037331,
"grad_norm": 0.443697065114975,
"learning_rate": 0.00017152,
"loss": 1.4436,
"step": 95
},
{
"epoch": 0.3050039714058777,
"grad_norm": 0.4432813823223114,
"learning_rate": 0.00017120000000000001,
"loss": 1.523,
"step": 96
},
{
"epoch": 0.30818109610802225,
"grad_norm": 0.43522974848747253,
"learning_rate": 0.00017088000000000002,
"loss": 1.3767,
"step": 97
},
{
"epoch": 0.3113582208101668,
"grad_norm": 0.396990031003952,
"learning_rate": 0.00017056000000000002,
"loss": 1.3021,
"step": 98
},
{
"epoch": 0.3145353455123114,
"grad_norm": 0.462819904088974,
"learning_rate": 0.00017024,
"loss": 1.455,
"step": 99
},
{
"epoch": 0.3177124702144559,
"grad_norm": 0.41220882534980774,
"learning_rate": 0.00016992,
"loss": 1.4471,
"step": 100
},
{
"epoch": 0.32088959491660046,
"grad_norm": 0.47001487016677856,
"learning_rate": 0.0001696,
"loss": 1.4668,
"step": 101
},
{
"epoch": 0.32406671961874506,
"grad_norm": 0.4349619150161743,
"learning_rate": 0.00016928,
"loss": 1.4413,
"step": 102
},
{
"epoch": 0.3272438443208896,
"grad_norm": 0.47175517678260803,
"learning_rate": 0.00016896,
"loss": 1.4546,
"step": 103
},
{
"epoch": 0.33042096902303414,
"grad_norm": 0.4192788600921631,
"learning_rate": 0.00016863999999999998,
"loss": 1.4569,
"step": 104
},
{
"epoch": 0.33359809372517873,
"grad_norm": 0.4177974462509155,
"learning_rate": 0.00016832000000000001,
"loss": 1.3423,
"step": 105
},
{
"epoch": 0.3367752184273233,
"grad_norm": 0.4190915524959564,
"learning_rate": 0.000168,
"loss": 1.3857,
"step": 106
},
{
"epoch": 0.3399523431294678,
"grad_norm": 0.42924079298973083,
"learning_rate": 0.00016768000000000002,
"loss": 1.4038,
"step": 107
},
{
"epoch": 0.3431294678316124,
"grad_norm": 0.425611287355423,
"learning_rate": 0.00016736,
"loss": 1.395,
"step": 108
},
{
"epoch": 0.34630659253375695,
"grad_norm": 0.4815029799938202,
"learning_rate": 0.00016704000000000003,
"loss": 1.4251,
"step": 109
},
{
"epoch": 0.3494837172359015,
"grad_norm": 0.45862439274787903,
"learning_rate": 0.00016672,
"loss": 1.3488,
"step": 110
},
{
"epoch": 0.3526608419380461,
"grad_norm": 0.46242061257362366,
"learning_rate": 0.0001664,
"loss": 1.393,
"step": 111
},
{
"epoch": 0.35583796664019063,
"grad_norm": 0.46360430121421814,
"learning_rate": 0.00016608,
"loss": 1.4168,
"step": 112
},
{
"epoch": 0.35901509134233517,
"grad_norm": 0.45501938462257385,
"learning_rate": 0.00016576,
"loss": 1.4718,
"step": 113
},
{
"epoch": 0.36219221604447976,
"grad_norm": 0.44886520504951477,
"learning_rate": 0.00016544000000000002,
"loss": 1.4206,
"step": 114
},
{
"epoch": 0.3653693407466243,
"grad_norm": 0.5432794690132141,
"learning_rate": 0.00016512000000000002,
"loss": 1.369,
"step": 115
},
{
"epoch": 0.36854646544876885,
"grad_norm": 0.40831825137138367,
"learning_rate": 0.0001648,
"loss": 1.3482,
"step": 116
},
{
"epoch": 0.37172359015091344,
"grad_norm": 0.4690685272216797,
"learning_rate": 0.00016448000000000002,
"loss": 1.4799,
"step": 117
},
{
"epoch": 0.374900714853058,
"grad_norm": 0.4517834782600403,
"learning_rate": 0.00016416,
"loss": 1.3597,
"step": 118
},
{
"epoch": 0.3780778395552025,
"grad_norm": 0.45939838886260986,
"learning_rate": 0.00016384,
"loss": 1.4188,
"step": 119
},
{
"epoch": 0.3812549642573471,
"grad_norm": 0.4444164037704468,
"learning_rate": 0.00016352,
"loss": 1.4501,
"step": 120
},
{
"epoch": 0.38443208895949166,
"grad_norm": 0.46045759320259094,
"learning_rate": 0.0001632,
"loss": 1.4315,
"step": 121
},
{
"epoch": 0.3876092136616362,
"grad_norm": 0.46573230624198914,
"learning_rate": 0.00016288,
"loss": 1.3214,
"step": 122
},
{
"epoch": 0.3907863383637808,
"grad_norm": 0.46668779850006104,
"learning_rate": 0.00016256,
"loss": 1.3008,
"step": 123
},
{
"epoch": 0.39396346306592533,
"grad_norm": 0.45954373478889465,
"learning_rate": 0.00016224000000000002,
"loss": 1.4209,
"step": 124
},
{
"epoch": 0.3971405877680699,
"grad_norm": 0.433923602104187,
"learning_rate": 0.00016192,
"loss": 1.4436,
"step": 125
},
{
"epoch": 0.40031771247021447,
"grad_norm": 0.49414584040641785,
"learning_rate": 0.00016160000000000002,
"loss": 1.3914,
"step": 126
},
{
"epoch": 0.403494837172359,
"grad_norm": 0.4280381202697754,
"learning_rate": 0.00016128,
"loss": 1.4302,
"step": 127
},
{
"epoch": 0.40667196187450355,
"grad_norm": 0.5049663782119751,
"learning_rate": 0.00016096,
"loss": 1.5463,
"step": 128
},
{
"epoch": 0.40984908657664815,
"grad_norm": 0.3671952486038208,
"learning_rate": 0.00016064,
"loss": 1.3469,
"step": 129
},
{
"epoch": 0.4130262112787927,
"grad_norm": 0.4638643264770508,
"learning_rate": 0.00016032,
"loss": 1.3772,
"step": 130
},
{
"epoch": 0.41620333598093723,
"grad_norm": 0.4278906583786011,
"learning_rate": 0.00016,
"loss": 1.4239,
"step": 131
},
{
"epoch": 0.4193804606830818,
"grad_norm": 0.45057350397109985,
"learning_rate": 0.00015968000000000002,
"loss": 1.3649,
"step": 132
},
{
"epoch": 0.42255758538522636,
"grad_norm": 0.4940052330493927,
"learning_rate": 0.00015936,
"loss": 1.4542,
"step": 133
},
{
"epoch": 0.4257347100873709,
"grad_norm": 0.48272138833999634,
"learning_rate": 0.00015904000000000002,
"loss": 1.4525,
"step": 134
},
{
"epoch": 0.4289118347895155,
"grad_norm": 0.4591176211833954,
"learning_rate": 0.00015872,
"loss": 1.372,
"step": 135
},
{
"epoch": 0.43208895949166004,
"grad_norm": 0.39564651250839233,
"learning_rate": 0.00015840000000000003,
"loss": 1.2387,
"step": 136
},
{
"epoch": 0.4352660841938046,
"grad_norm": 0.4640074670314789,
"learning_rate": 0.00015808,
"loss": 1.4381,
"step": 137
},
{
"epoch": 0.4384432088959492,
"grad_norm": 0.4523836374282837,
"learning_rate": 0.00015776,
"loss": 1.3681,
"step": 138
},
{
"epoch": 0.4416203335980937,
"grad_norm": 0.4463924765586853,
"learning_rate": 0.00015744,
"loss": 1.4002,
"step": 139
},
{
"epoch": 0.44479745830023826,
"grad_norm": 0.4263816177845001,
"learning_rate": 0.00015712000000000001,
"loss": 1.3452,
"step": 140
},
{
"epoch": 0.44797458300238285,
"grad_norm": 0.4039861857891083,
"learning_rate": 0.00015680000000000002,
"loss": 1.3888,
"step": 141
},
{
"epoch": 0.4511517077045274,
"grad_norm": 0.44540414214134216,
"learning_rate": 0.00015648,
"loss": 1.3827,
"step": 142
},
{
"epoch": 0.45432883240667193,
"grad_norm": 0.4521636664867401,
"learning_rate": 0.00015616000000000002,
"loss": 1.4448,
"step": 143
},
{
"epoch": 0.45750595710881653,
"grad_norm": 0.46087294816970825,
"learning_rate": 0.00015584,
"loss": 1.374,
"step": 144
},
{
"epoch": 0.46068308181096107,
"grad_norm": 0.43480321764945984,
"learning_rate": 0.00015552,
"loss": 1.4021,
"step": 145
},
{
"epoch": 0.46386020651310567,
"grad_norm": 0.48551246523857117,
"learning_rate": 0.0001552,
"loss": 1.3729,
"step": 146
},
{
"epoch": 0.4670373312152502,
"grad_norm": 0.44551774859428406,
"learning_rate": 0.00015488,
"loss": 1.3628,
"step": 147
},
{
"epoch": 0.47021445591739475,
"grad_norm": 0.43176624178886414,
"learning_rate": 0.00015456,
"loss": 1.4927,
"step": 148
},
{
"epoch": 0.47339158061953934,
"grad_norm": 0.47492435574531555,
"learning_rate": 0.00015424000000000001,
"loss": 1.4361,
"step": 149
},
{
"epoch": 0.4765687053216839,
"grad_norm": 0.46715089678764343,
"learning_rate": 0.00015392,
"loss": 1.382,
"step": 150
},
{
"epoch": 0.4797458300238284,
"grad_norm": 0.44686493277549744,
"learning_rate": 0.00015360000000000002,
"loss": 1.4603,
"step": 151
},
{
"epoch": 0.482922954725973,
"grad_norm": 0.429262638092041,
"learning_rate": 0.00015328,
"loss": 1.3358,
"step": 152
},
{
"epoch": 0.48610007942811756,
"grad_norm": 0.4371509850025177,
"learning_rate": 0.00015296000000000003,
"loss": 1.4007,
"step": 153
},
{
"epoch": 0.4892772041302621,
"grad_norm": 0.44418609142303467,
"learning_rate": 0.00015264,
"loss": 1.2738,
"step": 154
},
{
"epoch": 0.4924543288324067,
"grad_norm": 0.44011855125427246,
"learning_rate": 0.00015232,
"loss": 1.3724,
"step": 155
},
{
"epoch": 0.49563145353455124,
"grad_norm": 0.4057789742946625,
"learning_rate": 0.000152,
"loss": 1.3708,
"step": 156
},
{
"epoch": 0.4988085782366958,
"grad_norm": 0.462645024061203,
"learning_rate": 0.00015168,
"loss": 1.3377,
"step": 157
},
{
"epoch": 0.5019857029388404,
"grad_norm": 0.48834553360939026,
"learning_rate": 0.00015136000000000001,
"loss": 1.393,
"step": 158
},
{
"epoch": 0.5051628276409849,
"grad_norm": 0.4350643754005432,
"learning_rate": 0.00015104,
"loss": 1.3838,
"step": 159
},
{
"epoch": 0.5083399523431295,
"grad_norm": 0.529528796672821,
"learning_rate": 0.00015072000000000002,
"loss": 1.4754,
"step": 160
},
{
"epoch": 0.511517077045274,
"grad_norm": 0.43042004108428955,
"learning_rate": 0.0001504,
"loss": 1.2977,
"step": 161
},
{
"epoch": 0.5146942017474185,
"grad_norm": 0.4385746717453003,
"learning_rate": 0.00015008,
"loss": 1.3612,
"step": 162
},
{
"epoch": 0.5178713264495631,
"grad_norm": 0.4804169535636902,
"learning_rate": 0.00014976,
"loss": 1.3587,
"step": 163
},
{
"epoch": 0.5210484511517077,
"grad_norm": 0.43496274948120117,
"learning_rate": 0.00014944,
"loss": 1.3683,
"step": 164
},
{
"epoch": 0.5242255758538522,
"grad_norm": 0.4485699534416199,
"learning_rate": 0.00014912,
"loss": 1.3987,
"step": 165
},
{
"epoch": 0.5274027005559968,
"grad_norm": 0.4077622592449188,
"learning_rate": 0.0001488,
"loss": 1.3115,
"step": 166
},
{
"epoch": 0.5305798252581414,
"grad_norm": 0.480747789144516,
"learning_rate": 0.00014848,
"loss": 1.4257,
"step": 167
},
{
"epoch": 0.5337569499602859,
"grad_norm": 0.4339999854564667,
"learning_rate": 0.00014816000000000002,
"loss": 1.3689,
"step": 168
},
{
"epoch": 0.5369340746624305,
"grad_norm": 0.4264179766178131,
"learning_rate": 0.00014784,
"loss": 1.3877,
"step": 169
},
{
"epoch": 0.5401111993645751,
"grad_norm": 0.4252622127532959,
"learning_rate": 0.00014752000000000002,
"loss": 1.3887,
"step": 170
},
{
"epoch": 0.5432883240667196,
"grad_norm": 0.42078137397766113,
"learning_rate": 0.0001472,
"loss": 1.388,
"step": 171
},
{
"epoch": 0.5464654487688642,
"grad_norm": 0.4306480586528778,
"learning_rate": 0.00014688000000000003,
"loss": 1.3366,
"step": 172
},
{
"epoch": 0.5496425734710088,
"grad_norm": 0.4413485825061798,
"learning_rate": 0.00014656,
"loss": 1.4593,
"step": 173
},
{
"epoch": 0.5528196981731534,
"grad_norm": 0.5051458477973938,
"learning_rate": 0.00014624,
"loss": 1.3824,
"step": 174
},
{
"epoch": 0.5559968228752978,
"grad_norm": 0.42062053084373474,
"learning_rate": 0.00014592,
"loss": 1.2272,
"step": 175
},
{
"epoch": 0.5591739475774424,
"grad_norm": 0.46674421429634094,
"learning_rate": 0.00014560000000000002,
"loss": 1.4548,
"step": 176
},
{
"epoch": 0.562351072279587,
"grad_norm": 0.44679704308509827,
"learning_rate": 0.00014528000000000002,
"loss": 1.4101,
"step": 177
},
{
"epoch": 0.5655281969817315,
"grad_norm": 0.4165225327014923,
"learning_rate": 0.00014496,
"loss": 1.3625,
"step": 178
},
{
"epoch": 0.5687053216838761,
"grad_norm": 0.4735226631164551,
"learning_rate": 0.00014464,
"loss": 1.4569,
"step": 179
},
{
"epoch": 0.5718824463860207,
"grad_norm": 0.4978485405445099,
"learning_rate": 0.00014432,
"loss": 1.3913,
"step": 180
},
{
"epoch": 0.5750595710881652,
"grad_norm": 0.5753241181373596,
"learning_rate": 0.000144,
"loss": 1.3885,
"step": 181
},
{
"epoch": 0.5782366957903098,
"grad_norm": 0.4427070617675781,
"learning_rate": 0.00014368,
"loss": 1.4068,
"step": 182
},
{
"epoch": 0.5814138204924544,
"grad_norm": 0.4505138099193573,
"learning_rate": 0.00014336,
"loss": 1.3579,
"step": 183
},
{
"epoch": 0.5845909451945989,
"grad_norm": 0.5041322708129883,
"learning_rate": 0.00014303999999999999,
"loss": 1.4652,
"step": 184
},
{
"epoch": 0.5877680698967435,
"grad_norm": 0.4617048501968384,
"learning_rate": 0.00014272000000000002,
"loss": 1.3565,
"step": 185
},
{
"epoch": 0.5909451945988881,
"grad_norm": 0.4603610038757324,
"learning_rate": 0.0001424,
"loss": 1.3436,
"step": 186
},
{
"epoch": 0.5941223193010325,
"grad_norm": 0.5165044069290161,
"learning_rate": 0.00014208000000000002,
"loss": 1.4443,
"step": 187
},
{
"epoch": 0.5972994440031771,
"grad_norm": 0.3984765410423279,
"learning_rate": 0.00014176,
"loss": 1.2166,
"step": 188
},
{
"epoch": 0.6004765687053217,
"grad_norm": 0.5299299955368042,
"learning_rate": 0.00014144000000000003,
"loss": 1.446,
"step": 189
},
{
"epoch": 0.6036536934074662,
"grad_norm": 0.49046239256858826,
"learning_rate": 0.00014112,
"loss": 1.3844,
"step": 190
},
{
"epoch": 0.6068308181096108,
"grad_norm": 0.3969656825065613,
"learning_rate": 0.0001408,
"loss": 1.2135,
"step": 191
},
{
"epoch": 0.6100079428117554,
"grad_norm": 0.4312625527381897,
"learning_rate": 0.00014048,
"loss": 1.3431,
"step": 192
},
{
"epoch": 0.6131850675138999,
"grad_norm": 0.4357926547527313,
"learning_rate": 0.00014016,
"loss": 1.31,
"step": 193
},
{
"epoch": 0.6163621922160445,
"grad_norm": 0.4309421181678772,
"learning_rate": 0.00013984000000000002,
"loss": 1.3617,
"step": 194
},
{
"epoch": 0.6195393169181891,
"grad_norm": 0.4217104911804199,
"learning_rate": 0.00013952000000000002,
"loss": 1.3756,
"step": 195
},
{
"epoch": 0.6227164416203336,
"grad_norm": 0.5252110958099365,
"learning_rate": 0.0001392,
"loss": 1.3824,
"step": 196
},
{
"epoch": 0.6258935663224782,
"grad_norm": 0.4495287537574768,
"learning_rate": 0.00013888,
"loss": 1.3279,
"step": 197
},
{
"epoch": 0.6290706910246228,
"grad_norm": 0.4457398056983948,
"learning_rate": 0.00013856,
"loss": 1.385,
"step": 198
},
{
"epoch": 0.6322478157267672,
"grad_norm": 0.4607657790184021,
"learning_rate": 0.00013824,
"loss": 1.458,
"step": 199
},
{
"epoch": 0.6354249404289118,
"grad_norm": 0.43265438079833984,
"learning_rate": 0.00013792,
"loss": 1.3233,
"step": 200
},
{
"epoch": 0.6386020651310564,
"grad_norm": 0.4238455295562744,
"learning_rate": 0.00013759999999999998,
"loss": 1.3658,
"step": 201
},
{
"epoch": 0.6417791898332009,
"grad_norm": 0.4150598645210266,
"learning_rate": 0.00013728000000000001,
"loss": 1.406,
"step": 202
},
{
"epoch": 0.6449563145353455,
"grad_norm": 0.44659295678138733,
"learning_rate": 0.00013696,
"loss": 1.4752,
"step": 203
},
{
"epoch": 0.6481334392374901,
"grad_norm": 0.4836420714855194,
"learning_rate": 0.00013664000000000002,
"loss": 1.3478,
"step": 204
},
{
"epoch": 0.6513105639396346,
"grad_norm": 0.40945902466773987,
"learning_rate": 0.00013632,
"loss": 1.2042,
"step": 205
},
{
"epoch": 0.6544876886417792,
"grad_norm": 0.4980110228061676,
"learning_rate": 0.00013600000000000003,
"loss": 1.309,
"step": 206
},
{
"epoch": 0.6576648133439238,
"grad_norm": 0.4770593047142029,
"learning_rate": 0.00013568,
"loss": 1.4164,
"step": 207
},
{
"epoch": 0.6608419380460683,
"grad_norm": 0.4662317633628845,
"learning_rate": 0.00013536,
"loss": 1.4044,
"step": 208
},
{
"epoch": 0.6640190627482129,
"grad_norm": 0.4472275674343109,
"learning_rate": 0.00013504,
"loss": 1.3541,
"step": 209
},
{
"epoch": 0.6671961874503575,
"grad_norm": 0.45574310421943665,
"learning_rate": 0.00013472,
"loss": 1.406,
"step": 210
},
{
"epoch": 0.670373312152502,
"grad_norm": 0.4748678207397461,
"learning_rate": 0.00013440000000000001,
"loss": 1.2955,
"step": 211
},
{
"epoch": 0.6735504368546466,
"grad_norm": 0.4513389766216278,
"learning_rate": 0.00013408000000000002,
"loss": 1.3326,
"step": 212
},
{
"epoch": 0.6767275615567911,
"grad_norm": 0.4360558092594147,
"learning_rate": 0.00013376,
"loss": 1.2359,
"step": 213
},
{
"epoch": 0.6799046862589356,
"grad_norm": 0.41032615303993225,
"learning_rate": 0.00013344,
"loss": 1.3916,
"step": 214
},
{
"epoch": 0.6830818109610802,
"grad_norm": 0.46569857001304626,
"learning_rate": 0.00013312,
"loss": 1.3152,
"step": 215
},
{
"epoch": 0.6862589356632248,
"grad_norm": 0.4858649969100952,
"learning_rate": 0.0001328,
"loss": 1.3445,
"step": 216
},
{
"epoch": 0.6894360603653693,
"grad_norm": 0.4437476098537445,
"learning_rate": 0.00013248,
"loss": 1.3565,
"step": 217
},
{
"epoch": 0.6926131850675139,
"grad_norm": 0.47393283247947693,
"learning_rate": 0.00013216,
"loss": 1.3308,
"step": 218
},
{
"epoch": 0.6957903097696585,
"grad_norm": 0.446773499250412,
"learning_rate": 0.00013184,
"loss": 1.3424,
"step": 219
},
{
"epoch": 0.698967434471803,
"grad_norm": 0.4282335042953491,
"learning_rate": 0.00013152,
"loss": 1.3799,
"step": 220
},
{
"epoch": 0.7021445591739476,
"grad_norm": 0.36902791261672974,
"learning_rate": 0.00013120000000000002,
"loss": 1.2638,
"step": 221
},
{
"epoch": 0.7053216838760922,
"grad_norm": 0.4036352336406708,
"learning_rate": 0.00013088,
"loss": 1.2865,
"step": 222
},
{
"epoch": 0.7084988085782367,
"grad_norm": 0.4829830825328827,
"learning_rate": 0.00013056000000000002,
"loss": 1.3685,
"step": 223
},
{
"epoch": 0.7116759332803813,
"grad_norm": 0.425111323595047,
"learning_rate": 0.00013024,
"loss": 1.3151,
"step": 224
},
{
"epoch": 0.7148530579825259,
"grad_norm": 0.4299517869949341,
"learning_rate": 0.00012992,
"loss": 1.3412,
"step": 225
},
{
"epoch": 0.7180301826846703,
"grad_norm": 0.4297490417957306,
"learning_rate": 0.0001296,
"loss": 1.3244,
"step": 226
},
{
"epoch": 0.7212073073868149,
"grad_norm": 0.48203548789024353,
"learning_rate": 0.00012928,
"loss": 1.3428,
"step": 227
},
{
"epoch": 0.7243844320889595,
"grad_norm": 0.43935510516166687,
"learning_rate": 0.00012896,
"loss": 1.3323,
"step": 228
},
{
"epoch": 0.727561556791104,
"grad_norm": 0.4296364188194275,
"learning_rate": 0.00012864000000000002,
"loss": 1.3068,
"step": 229
},
{
"epoch": 0.7307386814932486,
"grad_norm": 0.44215404987335205,
"learning_rate": 0.00012832,
"loss": 1.3646,
"step": 230
},
{
"epoch": 0.7339158061953932,
"grad_norm": 0.4621836245059967,
"learning_rate": 0.00012800000000000002,
"loss": 1.3799,
"step": 231
},
{
"epoch": 0.7370929308975377,
"grad_norm": 0.4484768211841583,
"learning_rate": 0.00012768,
"loss": 1.4356,
"step": 232
},
{
"epoch": 0.7402700555996823,
"grad_norm": 0.4553694427013397,
"learning_rate": 0.00012736,
"loss": 1.2484,
"step": 233
},
{
"epoch": 0.7434471803018269,
"grad_norm": 0.40847110748291016,
"learning_rate": 0.00012704,
"loss": 1.2732,
"step": 234
},
{
"epoch": 0.7466243050039714,
"grad_norm": 0.4255897104740143,
"learning_rate": 0.00012672,
"loss": 1.2625,
"step": 235
},
{
"epoch": 0.749801429706116,
"grad_norm": 0.468524307012558,
"learning_rate": 0.0001264,
"loss": 1.4405,
"step": 236
},
{
"epoch": 0.7529785544082606,
"grad_norm": 0.48269885778427124,
"learning_rate": 0.00012607999999999999,
"loss": 1.3345,
"step": 237
},
{
"epoch": 0.756155679110405,
"grad_norm": 0.4441956877708435,
"learning_rate": 0.00012576000000000002,
"loss": 1.2967,
"step": 238
},
{
"epoch": 0.7593328038125496,
"grad_norm": 0.44516798853874207,
"learning_rate": 0.00012544,
"loss": 1.2502,
"step": 239
},
{
"epoch": 0.7625099285146942,
"grad_norm": 0.47978633642196655,
"learning_rate": 0.00012512000000000002,
"loss": 1.3275,
"step": 240
},
{
"epoch": 0.7656870532168387,
"grad_norm": 0.4489109218120575,
"learning_rate": 0.0001248,
"loss": 1.3594,
"step": 241
},
{
"epoch": 0.7688641779189833,
"grad_norm": 0.46123188734054565,
"learning_rate": 0.00012448,
"loss": 1.3014,
"step": 242
},
{
"epoch": 0.7720413026211279,
"grad_norm": 0.4514189064502716,
"learning_rate": 0.00012416,
"loss": 1.2148,
"step": 243
},
{
"epoch": 0.7752184273232724,
"grad_norm": 0.47698548436164856,
"learning_rate": 0.00012384,
"loss": 1.3562,
"step": 244
},
{
"epoch": 0.778395552025417,
"grad_norm": 0.4442936182022095,
"learning_rate": 0.00012352,
"loss": 1.3092,
"step": 245
},
{
"epoch": 0.7815726767275616,
"grad_norm": 0.48598411679267883,
"learning_rate": 0.0001232,
"loss": 1.418,
"step": 246
},
{
"epoch": 0.7847498014297061,
"grad_norm": 0.46551617980003357,
"learning_rate": 0.00012288,
"loss": 1.261,
"step": 247
},
{
"epoch": 0.7879269261318507,
"grad_norm": 0.4166944921016693,
"learning_rate": 0.00012256000000000002,
"loss": 1.213,
"step": 248
},
{
"epoch": 0.7911040508339953,
"grad_norm": 0.48919105529785156,
"learning_rate": 0.00012224,
"loss": 1.3328,
"step": 249
},
{
"epoch": 0.7942811755361397,
"grad_norm": 0.44160059094429016,
"learning_rate": 0.00012192000000000001,
"loss": 1.2758,
"step": 250
},
{
"epoch": 0.7974583002382843,
"grad_norm": 0.45591601729393005,
"learning_rate": 0.0001216,
"loss": 1.3771,
"step": 251
},
{
"epoch": 0.8006354249404289,
"grad_norm": 0.4620215892791748,
"learning_rate": 0.00012128000000000002,
"loss": 1.3787,
"step": 252
},
{
"epoch": 0.8038125496425734,
"grad_norm": 0.4968376159667969,
"learning_rate": 0.00012096000000000001,
"loss": 1.3798,
"step": 253
},
{
"epoch": 0.806989674344718,
"grad_norm": 0.4507087767124176,
"learning_rate": 0.00012064,
"loss": 1.3349,
"step": 254
},
{
"epoch": 0.8101667990468626,
"grad_norm": 0.46431511640548706,
"learning_rate": 0.00012032000000000001,
"loss": 1.359,
"step": 255
},
{
"epoch": 0.8133439237490071,
"grad_norm": 0.46496230363845825,
"learning_rate": 0.00012,
"loss": 1.2429,
"step": 256
},
{
"epoch": 0.8165210484511517,
"grad_norm": 0.46748438477516174,
"learning_rate": 0.00011968000000000002,
"loss": 1.3088,
"step": 257
},
{
"epoch": 0.8196981731532963,
"grad_norm": 0.45148542523384094,
"learning_rate": 0.00011936000000000001,
"loss": 1.2191,
"step": 258
},
{
"epoch": 0.8228752978554408,
"grad_norm": 0.4253683388233185,
"learning_rate": 0.00011904,
"loss": 1.2915,
"step": 259
},
{
"epoch": 0.8260524225575854,
"grad_norm": 0.506744384765625,
"learning_rate": 0.00011872000000000002,
"loss": 1.3271,
"step": 260
},
{
"epoch": 0.82922954725973,
"grad_norm": 0.4920015335083008,
"learning_rate": 0.0001184,
"loss": 1.3933,
"step": 261
},
{
"epoch": 0.8324066719618745,
"grad_norm": 0.4514538645744324,
"learning_rate": 0.00011808000000000001,
"loss": 1.356,
"step": 262
},
{
"epoch": 0.835583796664019,
"grad_norm": 0.5036830306053162,
"learning_rate": 0.00011776,
"loss": 1.2622,
"step": 263
},
{
"epoch": 0.8387609213661636,
"grad_norm": 0.5152455568313599,
"learning_rate": 0.00011744000000000001,
"loss": 1.3674,
"step": 264
},
{
"epoch": 0.8419380460683081,
"grad_norm": 0.4376108944416046,
"learning_rate": 0.00011712,
"loss": 1.3758,
"step": 265
},
{
"epoch": 0.8451151707704527,
"grad_norm": 0.4190007746219635,
"learning_rate": 0.00011679999999999999,
"loss": 1.2389,
"step": 266
},
{
"epoch": 0.8482922954725973,
"grad_norm": 0.5019193291664124,
"learning_rate": 0.00011648000000000001,
"loss": 1.3014,
"step": 267
},
{
"epoch": 0.8514694201747418,
"grad_norm": 0.47944578528404236,
"learning_rate": 0.00011616,
"loss": 1.3507,
"step": 268
},
{
"epoch": 0.8546465448768864,
"grad_norm": 0.4307346045970917,
"learning_rate": 0.00011584000000000002,
"loss": 1.2211,
"step": 269
},
{
"epoch": 0.857823669579031,
"grad_norm": 0.5099300742149353,
"learning_rate": 0.00011552,
"loss": 1.3172,
"step": 270
},
{
"epoch": 0.8610007942811755,
"grad_norm": 0.41971608996391296,
"learning_rate": 0.0001152,
"loss": 1.2691,
"step": 271
},
{
"epoch": 0.8641779189833201,
"grad_norm": 0.4612553119659424,
"learning_rate": 0.00011488000000000001,
"loss": 1.3128,
"step": 272
},
{
"epoch": 0.8673550436854647,
"grad_norm": 0.4589272141456604,
"learning_rate": 0.00011456,
"loss": 1.3275,
"step": 273
},
{
"epoch": 0.8705321683876092,
"grad_norm": 0.47001925110816956,
"learning_rate": 0.00011424000000000002,
"loss": 1.2607,
"step": 274
},
{
"epoch": 0.8737092930897538,
"grad_norm": 0.4315769672393799,
"learning_rate": 0.00011392000000000001,
"loss": 1.3215,
"step": 275
},
{
"epoch": 0.8768864177918984,
"grad_norm": 0.45138058066368103,
"learning_rate": 0.0001136,
"loss": 1.332,
"step": 276
},
{
"epoch": 0.8800635424940428,
"grad_norm": 0.4450497329235077,
"learning_rate": 0.00011328000000000001,
"loss": 1.3474,
"step": 277
},
{
"epoch": 0.8832406671961874,
"grad_norm": 0.4595153033733368,
"learning_rate": 0.00011296,
"loss": 1.27,
"step": 278
},
{
"epoch": 0.886417791898332,
"grad_norm": 0.42433419823646545,
"learning_rate": 0.00011264,
"loss": 1.3352,
"step": 279
},
{
"epoch": 0.8895949166004765,
"grad_norm": 0.44947418570518494,
"learning_rate": 0.00011232000000000001,
"loss": 1.2532,
"step": 280
},
{
"epoch": 0.8927720413026211,
"grad_norm": 0.4503403604030609,
"learning_rate": 0.00011200000000000001,
"loss": 1.3267,
"step": 281
},
{
"epoch": 0.8959491660047657,
"grad_norm": 0.418992280960083,
"learning_rate": 0.00011168,
"loss": 1.2699,
"step": 282
},
{
"epoch": 0.8991262907069102,
"grad_norm": 0.4266560971736908,
"learning_rate": 0.00011135999999999999,
"loss": 1.169,
"step": 283
},
{
"epoch": 0.9023034154090548,
"grad_norm": 0.5053189396858215,
"learning_rate": 0.00011104000000000001,
"loss": 1.4047,
"step": 284
},
{
"epoch": 0.9054805401111994,
"grad_norm": 0.5122870206832886,
"learning_rate": 0.00011072,
"loss": 1.3834,
"step": 285
},
{
"epoch": 0.9086576648133439,
"grad_norm": 0.43556493520736694,
"learning_rate": 0.00011040000000000001,
"loss": 1.2163,
"step": 286
},
{
"epoch": 0.9118347895154885,
"grad_norm": 0.4655609130859375,
"learning_rate": 0.00011008,
"loss": 1.2967,
"step": 287
},
{
"epoch": 0.9150119142176331,
"grad_norm": 0.4987747371196747,
"learning_rate": 0.00010975999999999999,
"loss": 1.2905,
"step": 288
},
{
"epoch": 0.9181890389197777,
"grad_norm": 0.4585645794868469,
"learning_rate": 0.00010944000000000001,
"loss": 1.2875,
"step": 289
},
{
"epoch": 0.9213661636219221,
"grad_norm": 0.5033825039863586,
"learning_rate": 0.00010912,
"loss": 1.2772,
"step": 290
},
{
"epoch": 0.9245432883240667,
"grad_norm": 0.4755001962184906,
"learning_rate": 0.00010880000000000002,
"loss": 1.4525,
"step": 291
},
{
"epoch": 0.9277204130262113,
"grad_norm": 0.43799713253974915,
"learning_rate": 0.00010848,
"loss": 1.3104,
"step": 292
},
{
"epoch": 0.9308975377283558,
"grad_norm": 0.43732205033302307,
"learning_rate": 0.00010816,
"loss": 1.2373,
"step": 293
},
{
"epoch": 0.9340746624305004,
"grad_norm": 0.45804721117019653,
"learning_rate": 0.00010784000000000001,
"loss": 1.3313,
"step": 294
},
{
"epoch": 0.937251787132645,
"grad_norm": 0.49885255098342896,
"learning_rate": 0.00010752,
"loss": 1.3134,
"step": 295
},
{
"epoch": 0.9404289118347895,
"grad_norm": 0.4742017090320587,
"learning_rate": 0.00010720000000000002,
"loss": 1.323,
"step": 296
},
{
"epoch": 0.9436060365369341,
"grad_norm": 0.4221518039703369,
"learning_rate": 0.00010688,
"loss": 1.3479,
"step": 297
},
{
"epoch": 0.9467831612390787,
"grad_norm": 0.4776606261730194,
"learning_rate": 0.00010656000000000001,
"loss": 1.265,
"step": 298
},
{
"epoch": 0.9499602859412232,
"grad_norm": 0.49409452080726624,
"learning_rate": 0.00010624000000000001,
"loss": 1.2697,
"step": 299
},
{
"epoch": 0.9531374106433678,
"grad_norm": 0.4598381817340851,
"learning_rate": 0.00010592,
"loss": 1.2518,
"step": 300
},
{
"epoch": 0.9563145353455124,
"grad_norm": 0.43075883388519287,
"learning_rate": 0.0001056,
"loss": 1.2483,
"step": 301
},
{
"epoch": 0.9594916600476568,
"grad_norm": 0.5096505880355835,
"learning_rate": 0.00010528,
"loss": 1.4291,
"step": 302
},
{
"epoch": 0.9626687847498014,
"grad_norm": 0.4315980076789856,
"learning_rate": 0.00010496000000000001,
"loss": 1.2332,
"step": 303
},
{
"epoch": 0.965845909451946,
"grad_norm": 0.47984281182289124,
"learning_rate": 0.00010464,
"loss": 1.3108,
"step": 304
},
{
"epoch": 0.9690230341540905,
"grad_norm": 0.4698749780654907,
"learning_rate": 0.00010431999999999999,
"loss": 1.3103,
"step": 305
},
{
"epoch": 0.9722001588562351,
"grad_norm": 0.465999960899353,
"learning_rate": 0.00010400000000000001,
"loss": 1.3343,
"step": 306
},
{
"epoch": 0.9753772835583797,
"grad_norm": 0.43465176224708557,
"learning_rate": 0.00010368,
"loss": 1.1649,
"step": 307
},
{
"epoch": 0.9785544082605242,
"grad_norm": 0.4245821237564087,
"learning_rate": 0.00010336000000000001,
"loss": 1.273,
"step": 308
},
{
"epoch": 0.9817315329626688,
"grad_norm": 0.43245622515678406,
"learning_rate": 0.00010304,
"loss": 1.2443,
"step": 309
},
{
"epoch": 0.9849086576648134,
"grad_norm": 0.4845837950706482,
"learning_rate": 0.00010271999999999999,
"loss": 1.2649,
"step": 310
},
{
"epoch": 0.9880857823669579,
"grad_norm": 0.424667090177536,
"learning_rate": 0.00010240000000000001,
"loss": 1.2708,
"step": 311
},
{
"epoch": 0.9912629070691025,
"grad_norm": 0.43120723962783813,
"learning_rate": 0.00010208,
"loss": 1.2844,
"step": 312
},
{
"epoch": 0.9944400317712471,
"grad_norm": 0.4800574481487274,
"learning_rate": 0.00010176000000000002,
"loss": 1.379,
"step": 313
},
{
"epoch": 0.9976171564733916,
"grad_norm": 0.5008915066719055,
"learning_rate": 0.00010144,
"loss": 1.3679,
"step": 314
},
{
"epoch": 1.0,
"grad_norm": 0.6201555132865906,
"learning_rate": 0.00010112000000000002,
"loss": 1.2757,
"step": 315
}
],
"logging_steps": 1,
"max_steps": 630,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.203279169783808e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}