MedQA_Mistral_7b_Instructive_KG3 / trainer_state.json
Manal0809's picture
Upload model
63d89e5 verified
{
"best_global_step": 638,
"best_metric": 0.313894122838974,
"best_model_checkpoint": "outputs/checkpoint-638",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 638,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003144036156415799,
"grad_norm": 2.18092679977417,
"learning_rate": 0.0,
"loss": 1.3508,
"step": 1
},
{
"epoch": 0.006288072312831598,
"grad_norm": 2.135878562927246,
"learning_rate": 2e-05,
"loss": 1.277,
"step": 2
},
{
"epoch": 0.009432108469247396,
"grad_norm": 1.8312653303146362,
"learning_rate": 4e-05,
"loss": 1.2121,
"step": 3
},
{
"epoch": 0.012576144625663196,
"grad_norm": 1.2045841217041016,
"learning_rate": 6e-05,
"loss": 1.1445,
"step": 4
},
{
"epoch": 0.015720180782078996,
"grad_norm": 0.8778926730155945,
"learning_rate": 8e-05,
"loss": 1.0126,
"step": 5
},
{
"epoch": 0.018864216938494792,
"grad_norm": 0.9060773849487305,
"learning_rate": 0.0001,
"loss": 0.8732,
"step": 6
},
{
"epoch": 0.02200825309491059,
"grad_norm": 0.8855406045913696,
"learning_rate": 9.995497523638001e-05,
"loss": 0.7976,
"step": 7
},
{
"epoch": 0.02515228925132639,
"grad_norm": 0.8502305746078491,
"learning_rate": 9.990995047276002e-05,
"loss": 0.6693,
"step": 8
},
{
"epoch": 0.028296325407742188,
"grad_norm": 2.569504737854004,
"learning_rate": 9.986492570914003e-05,
"loss": 0.5713,
"step": 9
},
{
"epoch": 0.03144036156415799,
"grad_norm": 0.60772705078125,
"learning_rate": 9.981990094552004e-05,
"loss": 0.5188,
"step": 10
},
{
"epoch": 0.034584397720573784,
"grad_norm": 0.3977140784263611,
"learning_rate": 9.977487618190005e-05,
"loss": 0.5009,
"step": 11
},
{
"epoch": 0.037728433876989584,
"grad_norm": 0.4928475022315979,
"learning_rate": 9.972985141828006e-05,
"loss": 0.4721,
"step": 12
},
{
"epoch": 0.040872470033405384,
"grad_norm": 0.3859867453575134,
"learning_rate": 9.968482665466006e-05,
"loss": 0.4744,
"step": 13
},
{
"epoch": 0.04401650618982118,
"grad_norm": 0.24680837988853455,
"learning_rate": 9.963980189104007e-05,
"loss": 0.4696,
"step": 14
},
{
"epoch": 0.04716054234623698,
"grad_norm": 0.23483239114284515,
"learning_rate": 9.95947771274201e-05,
"loss": 0.4319,
"step": 15
},
{
"epoch": 0.05030457850265278,
"grad_norm": 0.21595372259616852,
"learning_rate": 9.954975236380009e-05,
"loss": 0.4489,
"step": 16
},
{
"epoch": 0.053448614659068576,
"grad_norm": 0.5875914096832275,
"learning_rate": 9.95047276001801e-05,
"loss": 0.4522,
"step": 17
},
{
"epoch": 0.056592650815484376,
"grad_norm": 0.22523947060108185,
"learning_rate": 9.945970283656011e-05,
"loss": 0.4459,
"step": 18
},
{
"epoch": 0.059736686971900176,
"grad_norm": 0.20356932282447815,
"learning_rate": 9.941467807294013e-05,
"loss": 0.4596,
"step": 19
},
{
"epoch": 0.06288072312831598,
"grad_norm": 1.1203888654708862,
"learning_rate": 9.936965330932014e-05,
"loss": 0.446,
"step": 20
},
{
"epoch": 0.06602475928473177,
"grad_norm": 0.2615947723388672,
"learning_rate": 9.932462854570013e-05,
"loss": 0.4723,
"step": 21
},
{
"epoch": 0.06916879544114757,
"grad_norm": 0.21939712762832642,
"learning_rate": 9.927960378208014e-05,
"loss": 0.4203,
"step": 22
},
{
"epoch": 0.07231283159756337,
"grad_norm": 0.18068519234657288,
"learning_rate": 9.923457901846016e-05,
"loss": 0.4117,
"step": 23
},
{
"epoch": 0.07545686775397917,
"grad_norm": 0.1733531355857849,
"learning_rate": 9.918955425484017e-05,
"loss": 0.4386,
"step": 24
},
{
"epoch": 0.07860090391039497,
"grad_norm": 0.18101659417152405,
"learning_rate": 9.914452949122017e-05,
"loss": 0.4291,
"step": 25
},
{
"epoch": 0.08174494006681077,
"grad_norm": 0.18338626623153687,
"learning_rate": 9.909950472760019e-05,
"loss": 0.4189,
"step": 26
},
{
"epoch": 0.08488897622322657,
"grad_norm": 0.17645250260829926,
"learning_rate": 9.90544799639802e-05,
"loss": 0.3872,
"step": 27
},
{
"epoch": 0.08803301237964237,
"grad_norm": 0.1902536153793335,
"learning_rate": 9.900945520036021e-05,
"loss": 0.4055,
"step": 28
},
{
"epoch": 0.09117704853605817,
"grad_norm": 0.18971717357635498,
"learning_rate": 9.89644304367402e-05,
"loss": 0.4148,
"step": 29
},
{
"epoch": 0.09432108469247397,
"grad_norm": 0.1757958084344864,
"learning_rate": 9.891940567312022e-05,
"loss": 0.4019,
"step": 30
},
{
"epoch": 0.09746512084888977,
"grad_norm": 0.18116620182991028,
"learning_rate": 9.887438090950023e-05,
"loss": 0.409,
"step": 31
},
{
"epoch": 0.10060915700530557,
"grad_norm": 0.16721461713314056,
"learning_rate": 9.882935614588024e-05,
"loss": 0.4118,
"step": 32
},
{
"epoch": 0.10375319316172137,
"grad_norm": 0.18521425127983093,
"learning_rate": 9.878433138226025e-05,
"loss": 0.4358,
"step": 33
},
{
"epoch": 0.10689722931813715,
"grad_norm": 0.18263505399227142,
"learning_rate": 9.873930661864026e-05,
"loss": 0.3956,
"step": 34
},
{
"epoch": 0.11004126547455295,
"grad_norm": 0.1665913313627243,
"learning_rate": 9.869428185502027e-05,
"loss": 0.3826,
"step": 35
},
{
"epoch": 0.11318530163096875,
"grad_norm": 0.16498151421546936,
"learning_rate": 9.864925709140028e-05,
"loss": 0.3556,
"step": 36
},
{
"epoch": 0.11632933778738455,
"grad_norm": 0.17468655109405518,
"learning_rate": 9.860423232778028e-05,
"loss": 0.3702,
"step": 37
},
{
"epoch": 0.11947337394380035,
"grad_norm": 0.1699349582195282,
"learning_rate": 9.855920756416029e-05,
"loss": 0.3763,
"step": 38
},
{
"epoch": 0.12261741010021615,
"grad_norm": 0.17979387938976288,
"learning_rate": 9.85141828005403e-05,
"loss": 0.4059,
"step": 39
},
{
"epoch": 0.12576144625663196,
"grad_norm": 0.17460626363754272,
"learning_rate": 9.846915803692031e-05,
"loss": 0.4032,
"step": 40
},
{
"epoch": 0.12890548241304775,
"grad_norm": 0.1785130649805069,
"learning_rate": 9.842413327330032e-05,
"loss": 0.3939,
"step": 41
},
{
"epoch": 0.13204951856946354,
"grad_norm": 0.1852668821811676,
"learning_rate": 9.837910850968033e-05,
"loss": 0.3746,
"step": 42
},
{
"epoch": 0.13519355472587935,
"grad_norm": 0.18407292664051056,
"learning_rate": 9.833408374606034e-05,
"loss": 0.3851,
"step": 43
},
{
"epoch": 0.13833759088229514,
"grad_norm": 0.18785783648490906,
"learning_rate": 9.828905898244036e-05,
"loss": 0.3895,
"step": 44
},
{
"epoch": 0.14148162703871095,
"grad_norm": 0.18965557217597961,
"learning_rate": 9.824403421882035e-05,
"loss": 0.3746,
"step": 45
},
{
"epoch": 0.14462566319512674,
"grad_norm": 0.1771431416273117,
"learning_rate": 9.819900945520036e-05,
"loss": 0.3483,
"step": 46
},
{
"epoch": 0.14776969935154255,
"grad_norm": 0.18872253596782684,
"learning_rate": 9.815398469158037e-05,
"loss": 0.3533,
"step": 47
},
{
"epoch": 0.15091373550795834,
"grad_norm": 0.18482953310012817,
"learning_rate": 9.810895992796039e-05,
"loss": 0.3758,
"step": 48
},
{
"epoch": 0.15405777166437415,
"grad_norm": 0.18910518288612366,
"learning_rate": 9.806393516434039e-05,
"loss": 0.3804,
"step": 49
},
{
"epoch": 0.15720180782078993,
"grad_norm": 0.187296524643898,
"learning_rate": 9.80189104007204e-05,
"loss": 0.3676,
"step": 50
},
{
"epoch": 0.16034584397720575,
"grad_norm": 0.19214150309562683,
"learning_rate": 9.797388563710042e-05,
"loss": 0.3921,
"step": 51
},
{
"epoch": 0.16348988013362153,
"grad_norm": 0.19029422104358673,
"learning_rate": 9.792886087348043e-05,
"loss": 0.3778,
"step": 52
},
{
"epoch": 0.16663391629003735,
"grad_norm": 0.1980220377445221,
"learning_rate": 9.788383610986042e-05,
"loss": 0.3917,
"step": 53
},
{
"epoch": 0.16977795244645313,
"grad_norm": 0.19083669781684875,
"learning_rate": 9.783881134624043e-05,
"loss": 0.3798,
"step": 54
},
{
"epoch": 0.17292198860286892,
"grad_norm": 0.25795647501945496,
"learning_rate": 9.779378658262045e-05,
"loss": 0.3877,
"step": 55
},
{
"epoch": 0.17606602475928473,
"grad_norm": 0.19090382754802704,
"learning_rate": 9.774876181900046e-05,
"loss": 0.3696,
"step": 56
},
{
"epoch": 0.17921006091570052,
"grad_norm": 0.19982369244098663,
"learning_rate": 9.770373705538046e-05,
"loss": 0.3734,
"step": 57
},
{
"epoch": 0.18235409707211633,
"grad_norm": 0.1944751739501953,
"learning_rate": 9.765871229176046e-05,
"loss": 0.3572,
"step": 58
},
{
"epoch": 0.18549813322853212,
"grad_norm": 0.1942175179719925,
"learning_rate": 9.761368752814049e-05,
"loss": 0.4059,
"step": 59
},
{
"epoch": 0.18864216938494793,
"grad_norm": 0.177927166223526,
"learning_rate": 9.75686627645205e-05,
"loss": 0.3645,
"step": 60
},
{
"epoch": 0.19178620554136372,
"grad_norm": 0.18761321902275085,
"learning_rate": 9.752363800090049e-05,
"loss": 0.3735,
"step": 61
},
{
"epoch": 0.19493024169777953,
"grad_norm": 0.21108420193195343,
"learning_rate": 9.747861323728051e-05,
"loss": 0.3603,
"step": 62
},
{
"epoch": 0.19807427785419532,
"grad_norm": 0.18813803791999817,
"learning_rate": 9.743358847366052e-05,
"loss": 0.366,
"step": 63
},
{
"epoch": 0.20121831401061113,
"grad_norm": 0.1801685392856598,
"learning_rate": 9.738856371004053e-05,
"loss": 0.3585,
"step": 64
},
{
"epoch": 0.20436235016702692,
"grad_norm": 0.1869877278804779,
"learning_rate": 9.734353894642053e-05,
"loss": 0.3787,
"step": 65
},
{
"epoch": 0.20750638632344273,
"grad_norm": 0.18504877388477325,
"learning_rate": 9.729851418280055e-05,
"loss": 0.3442,
"step": 66
},
{
"epoch": 0.21065042247985852,
"grad_norm": 0.19591134786605835,
"learning_rate": 9.725348941918056e-05,
"loss": 0.3876,
"step": 67
},
{
"epoch": 0.2137944586362743,
"grad_norm": 0.1981891542673111,
"learning_rate": 9.720846465556056e-05,
"loss": 0.3507,
"step": 68
},
{
"epoch": 0.21693849479269012,
"grad_norm": 0.20417073369026184,
"learning_rate": 9.716343989194057e-05,
"loss": 0.3667,
"step": 69
},
{
"epoch": 0.2200825309491059,
"grad_norm": 0.19462363421916962,
"learning_rate": 9.711841512832058e-05,
"loss": 0.3595,
"step": 70
},
{
"epoch": 0.22322656710552172,
"grad_norm": 0.17222774028778076,
"learning_rate": 9.707339036470059e-05,
"loss": 0.3451,
"step": 71
},
{
"epoch": 0.2263706032619375,
"grad_norm": 0.1774955689907074,
"learning_rate": 9.70283656010806e-05,
"loss": 0.3386,
"step": 72
},
{
"epoch": 0.22951463941835332,
"grad_norm": 0.189998060464859,
"learning_rate": 9.698334083746061e-05,
"loss": 0.3522,
"step": 73
},
{
"epoch": 0.2326586755747691,
"grad_norm": 0.1920982450246811,
"learning_rate": 9.693831607384062e-05,
"loss": 0.3733,
"step": 74
},
{
"epoch": 0.23580271173118492,
"grad_norm": 0.1971607357263565,
"learning_rate": 9.689329131022062e-05,
"loss": 0.3504,
"step": 75
},
{
"epoch": 0.2389467478876007,
"grad_norm": 0.20512360334396362,
"learning_rate": 9.684826654660063e-05,
"loss": 0.3464,
"step": 76
},
{
"epoch": 0.24209078404401652,
"grad_norm": 0.2119520902633667,
"learning_rate": 9.680324178298064e-05,
"loss": 0.3686,
"step": 77
},
{
"epoch": 0.2452348202004323,
"grad_norm": 0.22858689725399017,
"learning_rate": 9.675821701936065e-05,
"loss": 0.3438,
"step": 78
},
{
"epoch": 0.24837885635684812,
"grad_norm": 0.1901649832725525,
"learning_rate": 9.671319225574066e-05,
"loss": 0.3607,
"step": 79
},
{
"epoch": 0.25152289251326393,
"grad_norm": 0.1896492838859558,
"learning_rate": 9.666816749212068e-05,
"loss": 0.3728,
"step": 80
},
{
"epoch": 0.2546669286696797,
"grad_norm": 0.18334272503852844,
"learning_rate": 9.662314272850068e-05,
"loss": 0.3617,
"step": 81
},
{
"epoch": 0.2578109648260955,
"grad_norm": 0.20095829665660858,
"learning_rate": 9.657811796488068e-05,
"loss": 0.3632,
"step": 82
},
{
"epoch": 0.2609550009825113,
"grad_norm": 0.17583882808685303,
"learning_rate": 9.653309320126069e-05,
"loss": 0.3582,
"step": 83
},
{
"epoch": 0.2640990371389271,
"grad_norm": 0.19473253190517426,
"learning_rate": 9.648806843764072e-05,
"loss": 0.3643,
"step": 84
},
{
"epoch": 0.2672430732953429,
"grad_norm": 0.1956205517053604,
"learning_rate": 9.644304367402071e-05,
"loss": 0.336,
"step": 85
},
{
"epoch": 0.2703871094517587,
"grad_norm": 0.19884824752807617,
"learning_rate": 9.639801891040072e-05,
"loss": 0.3626,
"step": 86
},
{
"epoch": 0.2735311456081745,
"grad_norm": 0.18607290089130402,
"learning_rate": 9.635299414678074e-05,
"loss": 0.3274,
"step": 87
},
{
"epoch": 0.27667518176459027,
"grad_norm": 0.18494442105293274,
"learning_rate": 9.630796938316075e-05,
"loss": 0.3437,
"step": 88
},
{
"epoch": 0.2798192179210061,
"grad_norm": 0.18413978815078735,
"learning_rate": 9.626294461954074e-05,
"loss": 0.3604,
"step": 89
},
{
"epoch": 0.2829632540774219,
"grad_norm": 0.19610458612442017,
"learning_rate": 9.621791985592075e-05,
"loss": 0.3729,
"step": 90
},
{
"epoch": 0.2861072902338377,
"grad_norm": 0.19458866119384766,
"learning_rate": 9.617289509230078e-05,
"loss": 0.3493,
"step": 91
},
{
"epoch": 0.28925132639025347,
"grad_norm": 0.20431379973888397,
"learning_rate": 9.612787032868078e-05,
"loss": 0.3464,
"step": 92
},
{
"epoch": 0.2923953625466693,
"grad_norm": 0.1833576112985611,
"learning_rate": 9.608284556506079e-05,
"loss": 0.3434,
"step": 93
},
{
"epoch": 0.2955393987030851,
"grad_norm": 0.18712273240089417,
"learning_rate": 9.603782080144079e-05,
"loss": 0.3497,
"step": 94
},
{
"epoch": 0.2986834348595009,
"grad_norm": 0.19049568474292755,
"learning_rate": 9.599279603782081e-05,
"loss": 0.3579,
"step": 95
},
{
"epoch": 0.30182747101591667,
"grad_norm": 0.18482261896133423,
"learning_rate": 9.594777127420082e-05,
"loss": 0.3714,
"step": 96
},
{
"epoch": 0.30497150717233246,
"grad_norm": 0.23206727206707,
"learning_rate": 9.590274651058083e-05,
"loss": 0.3457,
"step": 97
},
{
"epoch": 0.3081155433287483,
"grad_norm": 0.18018409609794617,
"learning_rate": 9.585772174696084e-05,
"loss": 0.3507,
"step": 98
},
{
"epoch": 0.3112595794851641,
"grad_norm": 0.1778680980205536,
"learning_rate": 9.581269698334084e-05,
"loss": 0.3455,
"step": 99
},
{
"epoch": 0.31440361564157987,
"grad_norm": 0.19341802597045898,
"learning_rate": 9.576767221972085e-05,
"loss": 0.3522,
"step": 100
},
{
"epoch": 0.31754765179799566,
"grad_norm": 0.18757164478302002,
"learning_rate": 9.572264745610086e-05,
"loss": 0.331,
"step": 101
},
{
"epoch": 0.3206916879544115,
"grad_norm": 0.1873527616262436,
"learning_rate": 9.567762269248087e-05,
"loss": 0.3341,
"step": 102
},
{
"epoch": 0.3238357241108273,
"grad_norm": 0.19451723992824554,
"learning_rate": 9.563259792886088e-05,
"loss": 0.3726,
"step": 103
},
{
"epoch": 0.32697976026724307,
"grad_norm": 0.19554930925369263,
"learning_rate": 9.558757316524089e-05,
"loss": 0.3568,
"step": 104
},
{
"epoch": 0.33012379642365886,
"grad_norm": 0.18672047555446625,
"learning_rate": 9.55425484016209e-05,
"loss": 0.3666,
"step": 105
},
{
"epoch": 0.3332678325800747,
"grad_norm": 0.18587937951087952,
"learning_rate": 9.54975236380009e-05,
"loss": 0.3239,
"step": 106
},
{
"epoch": 0.3364118687364905,
"grad_norm": 0.18024438619613647,
"learning_rate": 9.545249887438091e-05,
"loss": 0.3492,
"step": 107
},
{
"epoch": 0.33955590489290627,
"grad_norm": 0.18240226805210114,
"learning_rate": 9.540747411076092e-05,
"loss": 0.3351,
"step": 108
},
{
"epoch": 0.34269994104932205,
"grad_norm": 0.16186107695102692,
"learning_rate": 9.536244934714093e-05,
"loss": 0.3139,
"step": 109
},
{
"epoch": 0.34584397720573784,
"grad_norm": 0.16864165663719177,
"learning_rate": 9.531742458352094e-05,
"loss": 0.3321,
"step": 110
},
{
"epoch": 0.3489880133621537,
"grad_norm": 0.1825931966304779,
"learning_rate": 9.527239981990095e-05,
"loss": 0.3531,
"step": 111
},
{
"epoch": 0.35213204951856947,
"grad_norm": 0.17394313216209412,
"learning_rate": 9.522737505628096e-05,
"loss": 0.3626,
"step": 112
},
{
"epoch": 0.35527608567498525,
"grad_norm": 0.18193919956684113,
"learning_rate": 9.518235029266098e-05,
"loss": 0.342,
"step": 113
},
{
"epoch": 0.35842012183140104,
"grad_norm": 0.18491008877754211,
"learning_rate": 9.513732552904097e-05,
"loss": 0.348,
"step": 114
},
{
"epoch": 0.3615641579878169,
"grad_norm": 0.19338466227054596,
"learning_rate": 9.509230076542098e-05,
"loss": 0.3531,
"step": 115
},
{
"epoch": 0.36470819414423267,
"grad_norm": 0.18233619630336761,
"learning_rate": 9.5047276001801e-05,
"loss": 0.3147,
"step": 116
},
{
"epoch": 0.36785223030064845,
"grad_norm": 0.1774706244468689,
"learning_rate": 9.500225123818101e-05,
"loss": 0.3442,
"step": 117
},
{
"epoch": 0.37099626645706424,
"grad_norm": 0.1780499666929245,
"learning_rate": 9.495722647456101e-05,
"loss": 0.3368,
"step": 118
},
{
"epoch": 0.3741403026134801,
"grad_norm": 0.18548406660556793,
"learning_rate": 9.491220171094102e-05,
"loss": 0.3429,
"step": 119
},
{
"epoch": 0.37728433876989587,
"grad_norm": 0.18113106489181519,
"learning_rate": 9.486717694732104e-05,
"loss": 0.3278,
"step": 120
},
{
"epoch": 0.38042837492631165,
"grad_norm": 0.18168263137340546,
"learning_rate": 9.482215218370105e-05,
"loss": 0.3127,
"step": 121
},
{
"epoch": 0.38357241108272744,
"grad_norm": 0.17432525753974915,
"learning_rate": 9.477712742008104e-05,
"loss": 0.3594,
"step": 122
},
{
"epoch": 0.3867164472391432,
"grad_norm": 0.187408447265625,
"learning_rate": 9.473210265646106e-05,
"loss": 0.355,
"step": 123
},
{
"epoch": 0.38986048339555907,
"grad_norm": 0.17972330749034882,
"learning_rate": 9.468707789284107e-05,
"loss": 0.3493,
"step": 124
},
{
"epoch": 0.39300451955197485,
"grad_norm": 0.17262862622737885,
"learning_rate": 9.464205312922108e-05,
"loss": 0.323,
"step": 125
},
{
"epoch": 0.39614855570839064,
"grad_norm": 0.26467645168304443,
"learning_rate": 9.459702836560108e-05,
"loss": 0.3643,
"step": 126
},
{
"epoch": 0.3992925918648064,
"grad_norm": 0.24252085387706757,
"learning_rate": 9.45520036019811e-05,
"loss": 0.3573,
"step": 127
},
{
"epoch": 0.40243662802122226,
"grad_norm": 0.18881508708000183,
"learning_rate": 9.45069788383611e-05,
"loss": 0.325,
"step": 128
},
{
"epoch": 0.40558066417763805,
"grad_norm": 0.22236384451389313,
"learning_rate": 9.446195407474112e-05,
"loss": 0.3806,
"step": 129
},
{
"epoch": 0.40872470033405384,
"grad_norm": 0.19904322922229767,
"learning_rate": 9.441692931112111e-05,
"loss": 0.4056,
"step": 130
},
{
"epoch": 0.4118687364904696,
"grad_norm": 0.18705110251903534,
"learning_rate": 9.437190454750113e-05,
"loss": 0.3346,
"step": 131
},
{
"epoch": 0.41501277264688546,
"grad_norm": 0.18179073929786682,
"learning_rate": 9.432687978388114e-05,
"loss": 0.3136,
"step": 132
},
{
"epoch": 0.41815680880330125,
"grad_norm": 0.17651726305484772,
"learning_rate": 9.428185502026115e-05,
"loss": 0.3267,
"step": 133
},
{
"epoch": 0.42130084495971704,
"grad_norm": 0.1757514625787735,
"learning_rate": 9.423683025664116e-05,
"loss": 0.3442,
"step": 134
},
{
"epoch": 0.4244448811161328,
"grad_norm": 0.18630896508693695,
"learning_rate": 9.419180549302117e-05,
"loss": 0.3357,
"step": 135
},
{
"epoch": 0.4275889172725486,
"grad_norm": 0.17533083260059357,
"learning_rate": 9.414678072940118e-05,
"loss": 0.3253,
"step": 136
},
{
"epoch": 0.43073295342896445,
"grad_norm": 0.17959101498126984,
"learning_rate": 9.410175596578118e-05,
"loss": 0.3185,
"step": 137
},
{
"epoch": 0.43387698958538023,
"grad_norm": 0.1812899112701416,
"learning_rate": 9.405673120216119e-05,
"loss": 0.3502,
"step": 138
},
{
"epoch": 0.437021025741796,
"grad_norm": 0.18919287621974945,
"learning_rate": 9.40117064385412e-05,
"loss": 0.3326,
"step": 139
},
{
"epoch": 0.4401650618982118,
"grad_norm": 0.19101746380329132,
"learning_rate": 9.396668167492121e-05,
"loss": 0.2964,
"step": 140
},
{
"epoch": 0.44330909805462765,
"grad_norm": 7.04909086227417,
"learning_rate": 9.392165691130123e-05,
"loss": 0.366,
"step": 141
},
{
"epoch": 0.44645313421104343,
"grad_norm": 0.23961827158927917,
"learning_rate": 9.387663214768123e-05,
"loss": 0.3251,
"step": 142
},
{
"epoch": 0.4495971703674592,
"grad_norm": 0.18671870231628418,
"learning_rate": 9.383160738406124e-05,
"loss": 0.3513,
"step": 143
},
{
"epoch": 0.452741206523875,
"grad_norm": 0.25953730940818787,
"learning_rate": 9.378658262044124e-05,
"loss": 0.3189,
"step": 144
},
{
"epoch": 0.45588524268029085,
"grad_norm": 0.17804424464702606,
"learning_rate": 9.374155785682127e-05,
"loss": 0.3426,
"step": 145
},
{
"epoch": 0.45902927883670663,
"grad_norm": 0.19183290004730225,
"learning_rate": 9.369653309320126e-05,
"loss": 0.3388,
"step": 146
},
{
"epoch": 0.4621733149931224,
"grad_norm": 0.1751260757446289,
"learning_rate": 9.365150832958127e-05,
"loss": 0.3465,
"step": 147
},
{
"epoch": 0.4653173511495382,
"grad_norm": 0.17152872681617737,
"learning_rate": 9.360648356596128e-05,
"loss": 0.3009,
"step": 148
},
{
"epoch": 0.468461387305954,
"grad_norm": 0.17340736091136932,
"learning_rate": 9.35614588023413e-05,
"loss": 0.3377,
"step": 149
},
{
"epoch": 0.47160542346236983,
"grad_norm": 0.17497164011001587,
"learning_rate": 9.35164340387213e-05,
"loss": 0.3512,
"step": 150
},
{
"epoch": 0.4747494596187856,
"grad_norm": 0.18566282093524933,
"learning_rate": 9.34714092751013e-05,
"loss": 0.3208,
"step": 151
},
{
"epoch": 0.4778934957752014,
"grad_norm": 0.20263151824474335,
"learning_rate": 9.342638451148133e-05,
"loss": 0.3543,
"step": 152
},
{
"epoch": 0.4810375319316172,
"grad_norm": 0.19179081916809082,
"learning_rate": 9.338135974786133e-05,
"loss": 0.3387,
"step": 153
},
{
"epoch": 0.48418156808803303,
"grad_norm": 0.19308720529079437,
"learning_rate": 9.333633498424133e-05,
"loss": 0.3679,
"step": 154
},
{
"epoch": 0.4873256042444488,
"grad_norm": 0.1667911857366562,
"learning_rate": 9.329131022062134e-05,
"loss": 0.3243,
"step": 155
},
{
"epoch": 0.4904696404008646,
"grad_norm": 0.17789964377880096,
"learning_rate": 9.324628545700136e-05,
"loss": 0.3291,
"step": 156
},
{
"epoch": 0.4936136765572804,
"grad_norm": 0.17497336864471436,
"learning_rate": 9.320126069338137e-05,
"loss": 0.3347,
"step": 157
},
{
"epoch": 0.49675771271369623,
"grad_norm": 0.1668512523174286,
"learning_rate": 9.315623592976136e-05,
"loss": 0.3074,
"step": 158
},
{
"epoch": 0.499901748870112,
"grad_norm": 0.18032796680927277,
"learning_rate": 9.311121116614139e-05,
"loss": 0.3242,
"step": 159
},
{
"epoch": 0.5030457850265279,
"grad_norm": 0.19095478951931,
"learning_rate": 9.30661864025214e-05,
"loss": 0.3309,
"step": 160
},
{
"epoch": 0.5061898211829436,
"grad_norm": 0.17513571679592133,
"learning_rate": 9.30211616389014e-05,
"loss": 0.354,
"step": 161
},
{
"epoch": 0.5093338573393594,
"grad_norm": 0.17440561950206757,
"learning_rate": 9.29761368752814e-05,
"loss": 0.3447,
"step": 162
},
{
"epoch": 0.5124778934957752,
"grad_norm": 0.17587585747241974,
"learning_rate": 9.293111211166142e-05,
"loss": 0.347,
"step": 163
},
{
"epoch": 0.515621929652191,
"grad_norm": 0.17777486145496368,
"learning_rate": 9.288608734804143e-05,
"loss": 0.3216,
"step": 164
},
{
"epoch": 0.5187659658086068,
"grad_norm": 0.17235027253627777,
"learning_rate": 9.284106258442144e-05,
"loss": 0.342,
"step": 165
},
{
"epoch": 0.5219100019650226,
"grad_norm": 0.17032384872436523,
"learning_rate": 9.279603782080145e-05,
"loss": 0.3235,
"step": 166
},
{
"epoch": 0.5250540381214384,
"grad_norm": 0.1659417450428009,
"learning_rate": 9.275101305718146e-05,
"loss": 0.3294,
"step": 167
},
{
"epoch": 0.5281980742778541,
"grad_norm": 0.1650734841823578,
"learning_rate": 9.270598829356146e-05,
"loss": 0.3179,
"step": 168
},
{
"epoch": 0.53134211043427,
"grad_norm": 0.1897146999835968,
"learning_rate": 9.266096352994147e-05,
"loss": 0.3436,
"step": 169
},
{
"epoch": 0.5344861465906858,
"grad_norm": 0.18100985884666443,
"learning_rate": 9.261593876632148e-05,
"loss": 0.3378,
"step": 170
},
{
"epoch": 0.5376301827471016,
"grad_norm": 0.18976901471614838,
"learning_rate": 9.257091400270149e-05,
"loss": 0.3329,
"step": 171
},
{
"epoch": 0.5407742189035174,
"grad_norm": 0.18210701644420624,
"learning_rate": 9.25258892390815e-05,
"loss": 0.3566,
"step": 172
},
{
"epoch": 0.5439182550599332,
"grad_norm": 0.1779012680053711,
"learning_rate": 9.24808644754615e-05,
"loss": 0.3063,
"step": 173
},
{
"epoch": 0.547062291216349,
"grad_norm": 0.16529639065265656,
"learning_rate": 9.243583971184152e-05,
"loss": 0.3234,
"step": 174
},
{
"epoch": 0.5502063273727648,
"grad_norm": 0.24405060708522797,
"learning_rate": 9.239081494822152e-05,
"loss": 0.3321,
"step": 175
},
{
"epoch": 0.5533503635291805,
"grad_norm": 0.16497737169265747,
"learning_rate": 9.234579018460153e-05,
"loss": 0.3337,
"step": 176
},
{
"epoch": 0.5564943996855963,
"grad_norm": 0.47097891569137573,
"learning_rate": 9.230076542098155e-05,
"loss": 0.3378,
"step": 177
},
{
"epoch": 0.5596384358420122,
"grad_norm": 0.18612946569919586,
"learning_rate": 9.225574065736155e-05,
"loss": 0.3134,
"step": 178
},
{
"epoch": 0.562782471998428,
"grad_norm": 0.2161218822002411,
"learning_rate": 9.221071589374156e-05,
"loss": 0.3345,
"step": 179
},
{
"epoch": 0.5659265081548438,
"grad_norm": 0.19805237650871277,
"learning_rate": 9.216569113012157e-05,
"loss": 0.3342,
"step": 180
},
{
"epoch": 0.5690705443112596,
"grad_norm": 0.17592518031597137,
"learning_rate": 9.212066636650159e-05,
"loss": 0.3454,
"step": 181
},
{
"epoch": 0.5722145804676754,
"grad_norm": 0.18876737356185913,
"learning_rate": 9.207564160288158e-05,
"loss": 0.344,
"step": 182
},
{
"epoch": 0.5753586166240912,
"grad_norm": 0.18281705677509308,
"learning_rate": 9.203061683926159e-05,
"loss": 0.3277,
"step": 183
},
{
"epoch": 0.5785026527805069,
"grad_norm": 0.18671815097332,
"learning_rate": 9.19855920756416e-05,
"loss": 0.319,
"step": 184
},
{
"epoch": 0.5816466889369227,
"grad_norm": 0.1737174689769745,
"learning_rate": 9.194056731202162e-05,
"loss": 0.3554,
"step": 185
},
{
"epoch": 0.5847907250933386,
"grad_norm": 0.16264449059963226,
"learning_rate": 9.189554254840163e-05,
"loss": 0.3404,
"step": 186
},
{
"epoch": 0.5879347612497544,
"grad_norm": 0.16205957531929016,
"learning_rate": 9.185051778478163e-05,
"loss": 0.3345,
"step": 187
},
{
"epoch": 0.5910787974061702,
"grad_norm": 0.16299399733543396,
"learning_rate": 9.180549302116165e-05,
"loss": 0.3406,
"step": 188
},
{
"epoch": 0.594222833562586,
"grad_norm": 0.16929860413074493,
"learning_rate": 9.176046825754166e-05,
"loss": 0.3351,
"step": 189
},
{
"epoch": 0.5973668697190018,
"grad_norm": 0.18242709338665009,
"learning_rate": 9.171544349392167e-05,
"loss": 0.3277,
"step": 190
},
{
"epoch": 0.6005109058754176,
"grad_norm": 0.1715114861726761,
"learning_rate": 9.167041873030166e-05,
"loss": 0.3341,
"step": 191
},
{
"epoch": 0.6036549420318333,
"grad_norm": 0.1673378199338913,
"learning_rate": 9.162539396668168e-05,
"loss": 0.316,
"step": 192
},
{
"epoch": 0.6067989781882491,
"grad_norm": 0.1861652284860611,
"learning_rate": 9.158036920306169e-05,
"loss": 0.3425,
"step": 193
},
{
"epoch": 0.6099430143446649,
"grad_norm": 0.170218825340271,
"learning_rate": 9.15353444394417e-05,
"loss": 0.3495,
"step": 194
},
{
"epoch": 0.6130870505010808,
"grad_norm": 0.16409920156002045,
"learning_rate": 9.149031967582171e-05,
"loss": 0.3216,
"step": 195
},
{
"epoch": 0.6162310866574966,
"grad_norm": 0.1930875927209854,
"learning_rate": 9.144529491220172e-05,
"loss": 0.3221,
"step": 196
},
{
"epoch": 0.6193751228139124,
"grad_norm": 0.168474480509758,
"learning_rate": 9.140027014858173e-05,
"loss": 0.3483,
"step": 197
},
{
"epoch": 0.6225191589703282,
"grad_norm": 0.1649659276008606,
"learning_rate": 9.135524538496173e-05,
"loss": 0.3281,
"step": 198
},
{
"epoch": 0.625663195126744,
"grad_norm": 0.16725848615169525,
"learning_rate": 9.131022062134174e-05,
"loss": 0.349,
"step": 199
},
{
"epoch": 0.6288072312831597,
"grad_norm": 0.16848574578762054,
"learning_rate": 9.126519585772175e-05,
"loss": 0.3467,
"step": 200
},
{
"epoch": 0.6319512674395755,
"grad_norm": 0.17817632853984833,
"learning_rate": 9.122017109410176e-05,
"loss": 0.3468,
"step": 201
},
{
"epoch": 0.6350953035959913,
"grad_norm": 0.16884905099868774,
"learning_rate": 9.117514633048177e-05,
"loss": 0.3197,
"step": 202
},
{
"epoch": 0.6382393397524071,
"grad_norm": 0.16829445958137512,
"learning_rate": 9.113012156686178e-05,
"loss": 0.3495,
"step": 203
},
{
"epoch": 0.641383375908823,
"grad_norm": 0.1753387451171875,
"learning_rate": 9.108509680324179e-05,
"loss": 0.3549,
"step": 204
},
{
"epoch": 0.6445274120652388,
"grad_norm": 0.17498289048671722,
"learning_rate": 9.10400720396218e-05,
"loss": 0.3169,
"step": 205
},
{
"epoch": 0.6476714482216546,
"grad_norm": 0.17499548196792603,
"learning_rate": 9.09950472760018e-05,
"loss": 0.3226,
"step": 206
},
{
"epoch": 0.6508154843780704,
"grad_norm": 0.17783628404140472,
"learning_rate": 9.095002251238181e-05,
"loss": 0.3355,
"step": 207
},
{
"epoch": 0.6539595205344861,
"grad_norm": 0.16701580584049225,
"learning_rate": 9.090499774876182e-05,
"loss": 0.3348,
"step": 208
},
{
"epoch": 0.6571035566909019,
"grad_norm": 0.1692950576543808,
"learning_rate": 9.085997298514183e-05,
"loss": 0.3117,
"step": 209
},
{
"epoch": 0.6602475928473177,
"grad_norm": 0.17733407020568848,
"learning_rate": 9.081494822152185e-05,
"loss": 0.3254,
"step": 210
},
{
"epoch": 0.6633916290037335,
"grad_norm": 0.18444949388504028,
"learning_rate": 9.076992345790185e-05,
"loss": 0.3243,
"step": 211
},
{
"epoch": 0.6665356651601494,
"grad_norm": 0.1709858626127243,
"learning_rate": 9.072489869428186e-05,
"loss": 0.3437,
"step": 212
},
{
"epoch": 0.6696797013165652,
"grad_norm": 0.16070497035980225,
"learning_rate": 9.067987393066188e-05,
"loss": 0.3205,
"step": 213
},
{
"epoch": 0.672823737472981,
"grad_norm": 0.16293945908546448,
"learning_rate": 9.063484916704189e-05,
"loss": 0.3178,
"step": 214
},
{
"epoch": 0.6759677736293968,
"grad_norm": 0.17348802089691162,
"learning_rate": 9.058982440342188e-05,
"loss": 0.3434,
"step": 215
},
{
"epoch": 0.6791118097858125,
"grad_norm": 0.16067078709602356,
"learning_rate": 9.054479963980189e-05,
"loss": 0.3051,
"step": 216
},
{
"epoch": 0.6822558459422283,
"grad_norm": 0.1788797527551651,
"learning_rate": 9.049977487618191e-05,
"loss": 0.3311,
"step": 217
},
{
"epoch": 0.6853998820986441,
"grad_norm": 0.17016440629959106,
"learning_rate": 9.045475011256192e-05,
"loss": 0.3248,
"step": 218
},
{
"epoch": 0.6885439182550599,
"grad_norm": 0.18454566597938538,
"learning_rate": 9.040972534894192e-05,
"loss": 0.3144,
"step": 219
},
{
"epoch": 0.6916879544114757,
"grad_norm": 0.1694164127111435,
"learning_rate": 9.036470058532192e-05,
"loss": 0.3263,
"step": 220
},
{
"epoch": 0.6948319905678916,
"grad_norm": 0.1772613525390625,
"learning_rate": 9.031967582170195e-05,
"loss": 0.3468,
"step": 221
},
{
"epoch": 0.6979760267243074,
"grad_norm": 0.16401882469654083,
"learning_rate": 9.027465105808195e-05,
"loss": 0.342,
"step": 222
},
{
"epoch": 0.7011200628807231,
"grad_norm": 0.16261254251003265,
"learning_rate": 9.022962629446195e-05,
"loss": 0.3249,
"step": 223
},
{
"epoch": 0.7042640990371389,
"grad_norm": 0.17154066264629364,
"learning_rate": 9.018460153084197e-05,
"loss": 0.3631,
"step": 224
},
{
"epoch": 0.7074081351935547,
"grad_norm": 0.18076153099536896,
"learning_rate": 9.013957676722198e-05,
"loss": 0.3282,
"step": 225
},
{
"epoch": 0.7105521713499705,
"grad_norm": 0.15930242836475372,
"learning_rate": 9.009455200360199e-05,
"loss": 0.3233,
"step": 226
},
{
"epoch": 0.7136962075063863,
"grad_norm": 0.16669179499149323,
"learning_rate": 9.004952723998198e-05,
"loss": 0.307,
"step": 227
},
{
"epoch": 0.7168402436628021,
"grad_norm": 0.18358565866947174,
"learning_rate": 9.0004502476362e-05,
"loss": 0.3719,
"step": 228
},
{
"epoch": 0.719984279819218,
"grad_norm": 0.16769863665103912,
"learning_rate": 8.995947771274201e-05,
"loss": 0.3081,
"step": 229
},
{
"epoch": 0.7231283159756338,
"grad_norm": 0.1651238203048706,
"learning_rate": 8.991445294912202e-05,
"loss": 0.3229,
"step": 230
},
{
"epoch": 0.7262723521320495,
"grad_norm": 0.18452374637126923,
"learning_rate": 8.986942818550203e-05,
"loss": 0.3249,
"step": 231
},
{
"epoch": 0.7294163882884653,
"grad_norm": 0.17209681868553162,
"learning_rate": 8.982440342188204e-05,
"loss": 0.3444,
"step": 232
},
{
"epoch": 0.7325604244448811,
"grad_norm": 0.17528848350048065,
"learning_rate": 8.977937865826205e-05,
"loss": 0.3217,
"step": 233
},
{
"epoch": 0.7357044606012969,
"grad_norm": 0.16508957743644714,
"learning_rate": 8.973435389464206e-05,
"loss": 0.3098,
"step": 234
},
{
"epoch": 0.7388484967577127,
"grad_norm": 0.171140655875206,
"learning_rate": 8.968932913102207e-05,
"loss": 0.367,
"step": 235
},
{
"epoch": 0.7419925329141285,
"grad_norm": 0.16529837250709534,
"learning_rate": 8.964430436740207e-05,
"loss": 0.2911,
"step": 236
},
{
"epoch": 0.7451365690705443,
"grad_norm": 0.1798229068517685,
"learning_rate": 8.959927960378208e-05,
"loss": 0.3264,
"step": 237
},
{
"epoch": 0.7482806052269602,
"grad_norm": 0.17085868120193481,
"learning_rate": 8.955425484016209e-05,
"loss": 0.3109,
"step": 238
},
{
"epoch": 0.751424641383376,
"grad_norm": 0.17515264451503754,
"learning_rate": 8.95092300765421e-05,
"loss": 0.317,
"step": 239
},
{
"epoch": 0.7545686775397917,
"grad_norm": 0.18475565314292908,
"learning_rate": 8.946420531292211e-05,
"loss": 0.3683,
"step": 240
},
{
"epoch": 0.7577127136962075,
"grad_norm": 0.16714327037334442,
"learning_rate": 8.941918054930212e-05,
"loss": 0.3429,
"step": 241
},
{
"epoch": 0.7608567498526233,
"grad_norm": 0.15969350934028625,
"learning_rate": 8.937415578568214e-05,
"loss": 0.3331,
"step": 242
},
{
"epoch": 0.7640007860090391,
"grad_norm": 0.16738007962703705,
"learning_rate": 8.932913102206213e-05,
"loss": 0.3332,
"step": 243
},
{
"epoch": 0.7671448221654549,
"grad_norm": 0.15596827864646912,
"learning_rate": 8.928410625844214e-05,
"loss": 0.2864,
"step": 244
},
{
"epoch": 0.7702888583218707,
"grad_norm": 0.16769914329051971,
"learning_rate": 8.923908149482215e-05,
"loss": 0.3411,
"step": 245
},
{
"epoch": 0.7734328944782864,
"grad_norm": 0.1581619828939438,
"learning_rate": 8.919405673120217e-05,
"loss": 0.3271,
"step": 246
},
{
"epoch": 0.7765769306347023,
"grad_norm": 0.18669439852237701,
"learning_rate": 8.914903196758217e-05,
"loss": 0.3363,
"step": 247
},
{
"epoch": 0.7797209667911181,
"grad_norm": 0.1833750307559967,
"learning_rate": 8.910400720396218e-05,
"loss": 0.3425,
"step": 248
},
{
"epoch": 0.7828650029475339,
"grad_norm": 0.16842873394489288,
"learning_rate": 8.90589824403422e-05,
"loss": 0.3019,
"step": 249
},
{
"epoch": 0.7860090391039497,
"grad_norm": 0.1643659919500351,
"learning_rate": 8.901395767672221e-05,
"loss": 0.3301,
"step": 250
},
{
"epoch": 0.7891530752603655,
"grad_norm": 0.17208907008171082,
"learning_rate": 8.89689329131022e-05,
"loss": 0.3469,
"step": 251
},
{
"epoch": 0.7922971114167813,
"grad_norm": 0.16336563229560852,
"learning_rate": 8.892390814948221e-05,
"loss": 0.325,
"step": 252
},
{
"epoch": 0.7954411475731971,
"grad_norm": 0.17350764572620392,
"learning_rate": 8.887888338586223e-05,
"loss": 0.3486,
"step": 253
},
{
"epoch": 0.7985851837296128,
"grad_norm": 0.15856927633285522,
"learning_rate": 8.883385862224224e-05,
"loss": 0.3223,
"step": 254
},
{
"epoch": 0.8017292198860287,
"grad_norm": 0.16306869685649872,
"learning_rate": 8.878883385862224e-05,
"loss": 0.3164,
"step": 255
},
{
"epoch": 0.8048732560424445,
"grad_norm": 0.1610950231552124,
"learning_rate": 8.874380909500225e-05,
"loss": 0.3146,
"step": 256
},
{
"epoch": 0.8080172921988603,
"grad_norm": 0.18235592544078827,
"learning_rate": 8.869878433138227e-05,
"loss": 0.3259,
"step": 257
},
{
"epoch": 0.8111613283552761,
"grad_norm": 0.1566954404115677,
"learning_rate": 8.865375956776228e-05,
"loss": 0.29,
"step": 258
},
{
"epoch": 0.8143053645116919,
"grad_norm": 0.17046710848808289,
"learning_rate": 8.860873480414229e-05,
"loss": 0.3419,
"step": 259
},
{
"epoch": 0.8174494006681077,
"grad_norm": 0.1749659776687622,
"learning_rate": 8.85637100405223e-05,
"loss": 0.3038,
"step": 260
},
{
"epoch": 0.8205934368245235,
"grad_norm": 0.1782928705215454,
"learning_rate": 8.85186852769023e-05,
"loss": 0.3123,
"step": 261
},
{
"epoch": 0.8237374729809392,
"grad_norm": 0.16543257236480713,
"learning_rate": 8.847366051328231e-05,
"loss": 0.2995,
"step": 262
},
{
"epoch": 0.826881509137355,
"grad_norm": 0.17038169503211975,
"learning_rate": 8.842863574966232e-05,
"loss": 0.3437,
"step": 263
},
{
"epoch": 0.8300255452937709,
"grad_norm": 0.16956864297389984,
"learning_rate": 8.838361098604233e-05,
"loss": 0.3208,
"step": 264
},
{
"epoch": 0.8331695814501867,
"grad_norm": 0.16261757910251617,
"learning_rate": 8.833858622242234e-05,
"loss": 0.323,
"step": 265
},
{
"epoch": 0.8363136176066025,
"grad_norm": 0.1713312268257141,
"learning_rate": 8.829356145880235e-05,
"loss": 0.3261,
"step": 266
},
{
"epoch": 0.8394576537630183,
"grad_norm": 0.16358059644699097,
"learning_rate": 8.824853669518235e-05,
"loss": 0.3207,
"step": 267
},
{
"epoch": 0.8426016899194341,
"grad_norm": 0.1711219847202301,
"learning_rate": 8.820351193156236e-05,
"loss": 0.3343,
"step": 268
},
{
"epoch": 0.8457457260758499,
"grad_norm": 0.15430651605129242,
"learning_rate": 8.815848716794237e-05,
"loss": 0.3004,
"step": 269
},
{
"epoch": 0.8488897622322656,
"grad_norm": 0.1770448386669159,
"learning_rate": 8.811346240432238e-05,
"loss": 0.3483,
"step": 270
},
{
"epoch": 0.8520337983886814,
"grad_norm": 0.17468826472759247,
"learning_rate": 8.806843764070239e-05,
"loss": 0.3382,
"step": 271
},
{
"epoch": 0.8551778345450972,
"grad_norm": 0.16027683019638062,
"learning_rate": 8.80234128770824e-05,
"loss": 0.3178,
"step": 272
},
{
"epoch": 0.8583218707015131,
"grad_norm": 0.1797255426645279,
"learning_rate": 8.79783881134624e-05,
"loss": 0.339,
"step": 273
},
{
"epoch": 0.8614659068579289,
"grad_norm": 0.17427705228328705,
"learning_rate": 8.793336334984241e-05,
"loss": 0.334,
"step": 274
},
{
"epoch": 0.8646099430143447,
"grad_norm": 0.16854874789714813,
"learning_rate": 8.788833858622242e-05,
"loss": 0.3512,
"step": 275
},
{
"epoch": 0.8677539791707605,
"grad_norm": 0.1548936814069748,
"learning_rate": 8.784331382260243e-05,
"loss": 0.2863,
"step": 276
},
{
"epoch": 0.8708980153271763,
"grad_norm": 0.17063087224960327,
"learning_rate": 8.779828905898244e-05,
"loss": 0.3299,
"step": 277
},
{
"epoch": 0.874042051483592,
"grad_norm": 0.17423272132873535,
"learning_rate": 8.775326429536246e-05,
"loss": 0.3161,
"step": 278
},
{
"epoch": 0.8771860876400078,
"grad_norm": 0.16254863142967224,
"learning_rate": 8.770823953174246e-05,
"loss": 0.3236,
"step": 279
},
{
"epoch": 0.8803301237964236,
"grad_norm": 0.16803030669689178,
"learning_rate": 8.766321476812247e-05,
"loss": 0.3288,
"step": 280
},
{
"epoch": 0.8834741599528395,
"grad_norm": 0.1748911589384079,
"learning_rate": 8.761819000450247e-05,
"loss": 0.2984,
"step": 281
},
{
"epoch": 0.8866181961092553,
"grad_norm": 0.16277071833610535,
"learning_rate": 8.75731652408825e-05,
"loss": 0.2983,
"step": 282
},
{
"epoch": 0.8897622322656711,
"grad_norm": 0.16375455260276794,
"learning_rate": 8.75281404772625e-05,
"loss": 0.3339,
"step": 283
},
{
"epoch": 0.8929062684220869,
"grad_norm": 0.15943491458892822,
"learning_rate": 8.74831157136425e-05,
"loss": 0.3414,
"step": 284
},
{
"epoch": 0.8960503045785027,
"grad_norm": 0.16314157843589783,
"learning_rate": 8.743809095002252e-05,
"loss": 0.3207,
"step": 285
},
{
"epoch": 0.8991943407349184,
"grad_norm": 0.16236723959445953,
"learning_rate": 8.739306618640253e-05,
"loss": 0.3349,
"step": 286
},
{
"epoch": 0.9023383768913342,
"grad_norm": 0.17521819472312927,
"learning_rate": 8.734804142278254e-05,
"loss": 0.3275,
"step": 287
},
{
"epoch": 0.90548241304775,
"grad_norm": 0.2164747565984726,
"learning_rate": 8.730301665916253e-05,
"loss": 0.3288,
"step": 288
},
{
"epoch": 0.9086264492041658,
"grad_norm": 0.16042940318584442,
"learning_rate": 8.725799189554256e-05,
"loss": 0.3214,
"step": 289
},
{
"epoch": 0.9117704853605817,
"grad_norm": 0.16976606845855713,
"learning_rate": 8.721296713192257e-05,
"loss": 0.3112,
"step": 290
},
{
"epoch": 0.9149145215169975,
"grad_norm": 0.17349150776863098,
"learning_rate": 8.716794236830257e-05,
"loss": 0.2979,
"step": 291
},
{
"epoch": 0.9180585576734133,
"grad_norm": 0.15189234912395477,
"learning_rate": 8.712291760468257e-05,
"loss": 0.3083,
"step": 292
},
{
"epoch": 0.921202593829829,
"grad_norm": 0.15766362845897675,
"learning_rate": 8.707789284106259e-05,
"loss": 0.3337,
"step": 293
},
{
"epoch": 0.9243466299862448,
"grad_norm": 0.15773652493953705,
"learning_rate": 8.70328680774426e-05,
"loss": 0.3161,
"step": 294
},
{
"epoch": 0.9274906661426606,
"grad_norm": 0.15952229499816895,
"learning_rate": 8.698784331382261e-05,
"loss": 0.3131,
"step": 295
},
{
"epoch": 0.9306347022990764,
"grad_norm": 0.16705040633678436,
"learning_rate": 8.694281855020262e-05,
"loss": 0.321,
"step": 296
},
{
"epoch": 0.9337787384554922,
"grad_norm": 0.16729433834552765,
"learning_rate": 8.689779378658263e-05,
"loss": 0.2895,
"step": 297
},
{
"epoch": 0.936922774611908,
"grad_norm": 0.17739711701869965,
"learning_rate": 8.685276902296263e-05,
"loss": 0.3236,
"step": 298
},
{
"epoch": 0.9400668107683239,
"grad_norm": 0.16125445067882538,
"learning_rate": 8.680774425934264e-05,
"loss": 0.3227,
"step": 299
},
{
"epoch": 0.9432108469247397,
"grad_norm": 0.19061018526554108,
"learning_rate": 8.676271949572265e-05,
"loss": 0.3327,
"step": 300
},
{
"epoch": 0.9463548830811555,
"grad_norm": 0.17478956282138824,
"learning_rate": 8.671769473210266e-05,
"loss": 0.3325,
"step": 301
},
{
"epoch": 0.9494989192375712,
"grad_norm": 0.1599021852016449,
"learning_rate": 8.667266996848267e-05,
"loss": 0.3091,
"step": 302
},
{
"epoch": 0.952642955393987,
"grad_norm": 0.16696953773498535,
"learning_rate": 8.662764520486268e-05,
"loss": 0.3211,
"step": 303
},
{
"epoch": 0.9557869915504028,
"grad_norm": 0.16814808547496796,
"learning_rate": 8.658262044124269e-05,
"loss": 0.3284,
"step": 304
},
{
"epoch": 0.9589310277068186,
"grad_norm": 0.15857313573360443,
"learning_rate": 8.65375956776227e-05,
"loss": 0.3224,
"step": 305
},
{
"epoch": 0.9620750638632344,
"grad_norm": 0.15295511484146118,
"learning_rate": 8.64925709140027e-05,
"loss": 0.3315,
"step": 306
},
{
"epoch": 0.9652191000196503,
"grad_norm": 0.21399492025375366,
"learning_rate": 8.644754615038273e-05,
"loss": 0.2922,
"step": 307
},
{
"epoch": 0.9683631361760661,
"grad_norm": 0.17268632352352142,
"learning_rate": 8.640252138676272e-05,
"loss": 0.3564,
"step": 308
},
{
"epoch": 0.9715071723324818,
"grad_norm": 0.17499002814292908,
"learning_rate": 8.635749662314273e-05,
"loss": 0.3385,
"step": 309
},
{
"epoch": 0.9746512084888976,
"grad_norm": 0.170021191239357,
"learning_rate": 8.631247185952274e-05,
"loss": 0.3305,
"step": 310
},
{
"epoch": 0.9777952446453134,
"grad_norm": 0.17455638945102692,
"learning_rate": 8.626744709590276e-05,
"loss": 0.3107,
"step": 311
},
{
"epoch": 0.9809392808017292,
"grad_norm": 0.16129587590694427,
"learning_rate": 8.622242233228275e-05,
"loss": 0.3182,
"step": 312
},
{
"epoch": 0.984083316958145,
"grad_norm": 0.161848783493042,
"learning_rate": 8.617739756866276e-05,
"loss": 0.3368,
"step": 313
},
{
"epoch": 0.9872273531145608,
"grad_norm": 0.14891745150089264,
"learning_rate": 8.613237280504279e-05,
"loss": 0.2923,
"step": 314
},
{
"epoch": 0.9903713892709766,
"grad_norm": 0.1604168862104416,
"learning_rate": 8.60873480414228e-05,
"loss": 0.3197,
"step": 315
},
{
"epoch": 0.9935154254273925,
"grad_norm": 0.15211592614650726,
"learning_rate": 8.604232327780279e-05,
"loss": 0.3197,
"step": 316
},
{
"epoch": 0.9966594615838082,
"grad_norm": 0.1654754877090454,
"learning_rate": 8.59972985141828e-05,
"loss": 0.3108,
"step": 317
},
{
"epoch": 0.999803497740224,
"grad_norm": 0.1642957627773285,
"learning_rate": 8.595227375056282e-05,
"loss": 0.3337,
"step": 318
},
{
"epoch": 1.0,
"grad_norm": 0.6600321531295776,
"learning_rate": 8.590724898694283e-05,
"loss": 0.3815,
"step": 319
},
{
"epoch": 1.0,
"eval_loss": 0.3256175220012665,
"eval_runtime": 102.4846,
"eval_samples_per_second": 12.412,
"eval_steps_per_second": 12.412,
"step": 319
},
{
"epoch": 1.003144036156416,
"grad_norm": 0.15727423131465912,
"learning_rate": 8.586222422332282e-05,
"loss": 0.2867,
"step": 320
},
{
"epoch": 1.0062880723128316,
"grad_norm": 0.17558008432388306,
"learning_rate": 8.581719945970285e-05,
"loss": 0.3426,
"step": 321
},
{
"epoch": 1.0094321084692475,
"grad_norm": 0.1863006204366684,
"learning_rate": 8.577217469608285e-05,
"loss": 0.2946,
"step": 322
},
{
"epoch": 1.0125761446256631,
"grad_norm": 0.20207758247852325,
"learning_rate": 8.572714993246286e-05,
"loss": 0.2872,
"step": 323
},
{
"epoch": 1.015720180782079,
"grad_norm": 0.17089968919754028,
"learning_rate": 8.568212516884286e-05,
"loss": 0.3098,
"step": 324
},
{
"epoch": 1.0188642169384947,
"grad_norm": 0.18078500032424927,
"learning_rate": 8.563710040522288e-05,
"loss": 0.2901,
"step": 325
},
{
"epoch": 1.0220082530949106,
"grad_norm": 0.18292267620563507,
"learning_rate": 8.559207564160289e-05,
"loss": 0.332,
"step": 326
},
{
"epoch": 1.0251522892513263,
"grad_norm": 0.16480115056037903,
"learning_rate": 8.55470508779829e-05,
"loss": 0.3042,
"step": 327
},
{
"epoch": 1.0282963254077422,
"grad_norm": 0.16814446449279785,
"learning_rate": 8.550202611436289e-05,
"loss": 0.319,
"step": 328
},
{
"epoch": 1.031440361564158,
"grad_norm": 0.16136884689331055,
"learning_rate": 8.545700135074291e-05,
"loss": 0.291,
"step": 329
},
{
"epoch": 1.0345843977205738,
"grad_norm": 0.17144669592380524,
"learning_rate": 8.541197658712292e-05,
"loss": 0.2852,
"step": 330
},
{
"epoch": 1.0377284338769897,
"grad_norm": 0.1635693609714508,
"learning_rate": 8.536695182350293e-05,
"loss": 0.2684,
"step": 331
},
{
"epoch": 1.0408724700334053,
"grad_norm": 0.1604490727186203,
"learning_rate": 8.532192705988294e-05,
"loss": 0.3093,
"step": 332
},
{
"epoch": 1.0440165061898212,
"grad_norm": 0.15917396545410156,
"learning_rate": 8.527690229626295e-05,
"loss": 0.2697,
"step": 333
},
{
"epoch": 1.047160542346237,
"grad_norm": 0.17525093257427216,
"learning_rate": 8.523187753264296e-05,
"loss": 0.3084,
"step": 334
},
{
"epoch": 1.0503045785026528,
"grad_norm": 0.1677919328212738,
"learning_rate": 8.518685276902297e-05,
"loss": 0.2974,
"step": 335
},
{
"epoch": 1.0534486146590685,
"grad_norm": 0.17789426445960999,
"learning_rate": 8.514182800540297e-05,
"loss": 0.3062,
"step": 336
},
{
"epoch": 1.0565926508154844,
"grad_norm": 0.16536547243595123,
"learning_rate": 8.509680324178298e-05,
"loss": 0.2872,
"step": 337
},
{
"epoch": 1.0597366869719003,
"grad_norm": 0.17476080358028412,
"learning_rate": 8.505177847816299e-05,
"loss": 0.306,
"step": 338
},
{
"epoch": 1.062880723128316,
"grad_norm": 0.1762908548116684,
"learning_rate": 8.500675371454301e-05,
"loss": 0.2878,
"step": 339
},
{
"epoch": 1.0660247592847318,
"grad_norm": 0.17144866287708282,
"learning_rate": 8.496172895092301e-05,
"loss": 0.2915,
"step": 340
},
{
"epoch": 1.0691687954411475,
"grad_norm": 0.16622525453567505,
"learning_rate": 8.491670418730302e-05,
"loss": 0.3169,
"step": 341
},
{
"epoch": 1.0723128315975634,
"grad_norm": 0.17786164581775665,
"learning_rate": 8.487167942368303e-05,
"loss": 0.2731,
"step": 342
},
{
"epoch": 1.075456867753979,
"grad_norm": 0.17761558294296265,
"learning_rate": 8.482665466006305e-05,
"loss": 0.2967,
"step": 343
},
{
"epoch": 1.078600903910395,
"grad_norm": 0.16161416471004486,
"learning_rate": 8.478162989644304e-05,
"loss": 0.2887,
"step": 344
},
{
"epoch": 1.0817449400668107,
"grad_norm": 0.1788141131401062,
"learning_rate": 8.473660513282305e-05,
"loss": 0.3456,
"step": 345
},
{
"epoch": 1.0848889762232266,
"grad_norm": 0.17762236297130585,
"learning_rate": 8.469158036920306e-05,
"loss": 0.3078,
"step": 346
},
{
"epoch": 1.0880330123796425,
"grad_norm": 0.16463209688663483,
"learning_rate": 8.464655560558308e-05,
"loss": 0.285,
"step": 347
},
{
"epoch": 1.0911770485360581,
"grad_norm": 0.17661692202091217,
"learning_rate": 8.460153084196308e-05,
"loss": 0.3139,
"step": 348
},
{
"epoch": 1.094321084692474,
"grad_norm": 0.170676589012146,
"learning_rate": 8.455650607834309e-05,
"loss": 0.3009,
"step": 349
},
{
"epoch": 1.0974651208488897,
"grad_norm": 0.17777417600154877,
"learning_rate": 8.451148131472311e-05,
"loss": 0.2889,
"step": 350
},
{
"epoch": 1.1006091570053056,
"grad_norm": 0.17952531576156616,
"learning_rate": 8.446645655110312e-05,
"loss": 0.3181,
"step": 351
},
{
"epoch": 1.1037531931617213,
"grad_norm": 0.17377127707004547,
"learning_rate": 8.442143178748311e-05,
"loss": 0.3003,
"step": 352
},
{
"epoch": 1.1068972293181372,
"grad_norm": 0.17013375461101532,
"learning_rate": 8.437640702386312e-05,
"loss": 0.3086,
"step": 353
},
{
"epoch": 1.1100412654745528,
"grad_norm": 0.17330169677734375,
"learning_rate": 8.433138226024314e-05,
"loss": 0.314,
"step": 354
},
{
"epoch": 1.1131853016309687,
"grad_norm": 0.17327344417572021,
"learning_rate": 8.428635749662315e-05,
"loss": 0.3227,
"step": 355
},
{
"epoch": 1.1163293377873846,
"grad_norm": 0.17580825090408325,
"learning_rate": 8.424133273300316e-05,
"loss": 0.3104,
"step": 356
},
{
"epoch": 1.1194733739438003,
"grad_norm": 0.17389941215515137,
"learning_rate": 8.419630796938317e-05,
"loss": 0.285,
"step": 357
},
{
"epoch": 1.1226174101002162,
"grad_norm": 0.1775561273097992,
"learning_rate": 8.415128320576318e-05,
"loss": 0.2969,
"step": 358
},
{
"epoch": 1.1257614462566319,
"grad_norm": 0.18656259775161743,
"learning_rate": 8.410625844214319e-05,
"loss": 0.3193,
"step": 359
},
{
"epoch": 1.1289054824130478,
"grad_norm": 0.1777326762676239,
"learning_rate": 8.40612336785232e-05,
"loss": 0.3024,
"step": 360
},
{
"epoch": 1.1320495185694635,
"grad_norm": 0.17319585382938385,
"learning_rate": 8.40162089149032e-05,
"loss": 0.2978,
"step": 361
},
{
"epoch": 1.1351935547258794,
"grad_norm": 0.17653490602970123,
"learning_rate": 8.397118415128321e-05,
"loss": 0.308,
"step": 362
},
{
"epoch": 1.138337590882295,
"grad_norm": 0.160200297832489,
"learning_rate": 8.392615938766322e-05,
"loss": 0.2694,
"step": 363
},
{
"epoch": 1.141481627038711,
"grad_norm": 0.16492871940135956,
"learning_rate": 8.388113462404323e-05,
"loss": 0.2824,
"step": 364
},
{
"epoch": 1.1446256631951268,
"grad_norm": 0.17109255492687225,
"learning_rate": 8.383610986042324e-05,
"loss": 0.2841,
"step": 365
},
{
"epoch": 1.1477696993515425,
"grad_norm": 0.23076315224170685,
"learning_rate": 8.379108509680325e-05,
"loss": 0.314,
"step": 366
},
{
"epoch": 1.1509137355079584,
"grad_norm": 0.1704353392124176,
"learning_rate": 8.374606033318325e-05,
"loss": 0.2962,
"step": 367
},
{
"epoch": 1.154057771664374,
"grad_norm": 0.17220115661621094,
"learning_rate": 8.370103556956326e-05,
"loss": 0.2904,
"step": 368
},
{
"epoch": 1.15720180782079,
"grad_norm": 0.17528584599494934,
"learning_rate": 8.365601080594327e-05,
"loss": 0.3263,
"step": 369
},
{
"epoch": 1.1603458439772059,
"grad_norm": 0.18602944910526276,
"learning_rate": 8.361098604232328e-05,
"loss": 0.2989,
"step": 370
},
{
"epoch": 1.1634898801336215,
"grad_norm": 0.18739493191242218,
"learning_rate": 8.356596127870329e-05,
"loss": 0.3148,
"step": 371
},
{
"epoch": 1.1666339162900374,
"grad_norm": 0.1813725382089615,
"learning_rate": 8.35209365150833e-05,
"loss": 0.2989,
"step": 372
},
{
"epoch": 1.169777952446453,
"grad_norm": 0.1674114614725113,
"learning_rate": 8.34759117514633e-05,
"loss": 0.2668,
"step": 373
},
{
"epoch": 1.172921988602869,
"grad_norm": 0.1844543069601059,
"learning_rate": 8.343088698784331e-05,
"loss": 0.3003,
"step": 374
},
{
"epoch": 1.1760660247592847,
"grad_norm": 0.17155998945236206,
"learning_rate": 8.338586222422334e-05,
"loss": 0.2931,
"step": 375
},
{
"epoch": 1.1792100609157006,
"grad_norm": 0.1664140224456787,
"learning_rate": 8.334083746060334e-05,
"loss": 0.2896,
"step": 376
},
{
"epoch": 1.1823540970721163,
"grad_norm": 0.18443046510219574,
"learning_rate": 8.329581269698334e-05,
"loss": 0.3098,
"step": 377
},
{
"epoch": 1.1854981332285321,
"grad_norm": 0.16364677250385284,
"learning_rate": 8.325078793336335e-05,
"loss": 0.2849,
"step": 378
},
{
"epoch": 1.188642169384948,
"grad_norm": 0.1778181493282318,
"learning_rate": 8.320576316974337e-05,
"loss": 0.2951,
"step": 379
},
{
"epoch": 1.1917862055413637,
"grad_norm": 0.17129847407341003,
"learning_rate": 8.316073840612338e-05,
"loss": 0.2917,
"step": 380
},
{
"epoch": 1.1949302416977796,
"grad_norm": 0.17360500991344452,
"learning_rate": 8.311571364250337e-05,
"loss": 0.3078,
"step": 381
},
{
"epoch": 1.1980742778541953,
"grad_norm": 0.17020374536514282,
"learning_rate": 8.307068887888338e-05,
"loss": 0.2966,
"step": 382
},
{
"epoch": 1.2012183140106112,
"grad_norm": 0.1838023066520691,
"learning_rate": 8.30256641152634e-05,
"loss": 0.2991,
"step": 383
},
{
"epoch": 1.2043623501670269,
"grad_norm": 0.18513008952140808,
"learning_rate": 8.298063935164341e-05,
"loss": 0.3455,
"step": 384
},
{
"epoch": 1.2075063863234428,
"grad_norm": 0.17663338780403137,
"learning_rate": 8.293561458802341e-05,
"loss": 0.3059,
"step": 385
},
{
"epoch": 1.2106504224798584,
"grad_norm": 0.17648449540138245,
"learning_rate": 8.289058982440343e-05,
"loss": 0.2991,
"step": 386
},
{
"epoch": 1.2137944586362743,
"grad_norm": 0.18601331114768982,
"learning_rate": 8.284556506078344e-05,
"loss": 0.2936,
"step": 387
},
{
"epoch": 1.2169384947926902,
"grad_norm": 0.18048390746116638,
"learning_rate": 8.280054029716345e-05,
"loss": 0.2837,
"step": 388
},
{
"epoch": 1.220082530949106,
"grad_norm": 0.17065560817718506,
"learning_rate": 8.275551553354344e-05,
"loss": 0.283,
"step": 389
},
{
"epoch": 1.2232265671055218,
"grad_norm": 0.1708894670009613,
"learning_rate": 8.271049076992347e-05,
"loss": 0.2718,
"step": 390
},
{
"epoch": 1.2263706032619375,
"grad_norm": 0.17479634284973145,
"learning_rate": 8.266546600630347e-05,
"loss": 0.3003,
"step": 391
},
{
"epoch": 1.2295146394183534,
"grad_norm": 0.22091898322105408,
"learning_rate": 8.262044124268348e-05,
"loss": 0.3324,
"step": 392
},
{
"epoch": 1.232658675574769,
"grad_norm": 0.17981559038162231,
"learning_rate": 8.257541647906349e-05,
"loss": 0.3042,
"step": 393
},
{
"epoch": 1.235802711731185,
"grad_norm": 0.16127324104309082,
"learning_rate": 8.25303917154435e-05,
"loss": 0.2662,
"step": 394
},
{
"epoch": 1.2389467478876006,
"grad_norm": 0.18422247469425201,
"learning_rate": 8.248536695182351e-05,
"loss": 0.31,
"step": 395
},
{
"epoch": 1.2420907840440165,
"grad_norm": 0.18198904395103455,
"learning_rate": 8.244034218820352e-05,
"loss": 0.3242,
"step": 396
},
{
"epoch": 1.2452348202004324,
"grad_norm": 0.17157980799674988,
"learning_rate": 8.239531742458353e-05,
"loss": 0.3025,
"step": 397
},
{
"epoch": 1.248378856356848,
"grad_norm": 0.17674268782138824,
"learning_rate": 8.235029266096353e-05,
"loss": 0.3072,
"step": 398
},
{
"epoch": 1.251522892513264,
"grad_norm": 0.18540705740451813,
"learning_rate": 8.230526789734354e-05,
"loss": 0.3159,
"step": 399
},
{
"epoch": 1.2546669286696797,
"grad_norm": 0.18816250562667847,
"learning_rate": 8.226024313372355e-05,
"loss": 0.328,
"step": 400
},
{
"epoch": 1.2578109648260956,
"grad_norm": 0.1927611082792282,
"learning_rate": 8.221521837010356e-05,
"loss": 0.3032,
"step": 401
},
{
"epoch": 1.2609550009825112,
"grad_norm": 0.18845967948436737,
"learning_rate": 8.217019360648357e-05,
"loss": 0.2793,
"step": 402
},
{
"epoch": 1.2640990371389271,
"grad_norm": 0.19096308946609497,
"learning_rate": 8.212516884286358e-05,
"loss": 0.3369,
"step": 403
},
{
"epoch": 1.2672430732953428,
"grad_norm": 0.16917437314987183,
"learning_rate": 8.20801440792436e-05,
"loss": 0.2785,
"step": 404
},
{
"epoch": 1.2703871094517587,
"grad_norm": 0.16734306514263153,
"learning_rate": 8.20351193156236e-05,
"loss": 0.2914,
"step": 405
},
{
"epoch": 1.2735311456081746,
"grad_norm": 0.17491504549980164,
"learning_rate": 8.19900945520036e-05,
"loss": 0.2905,
"step": 406
},
{
"epoch": 1.2766751817645903,
"grad_norm": 0.18531963229179382,
"learning_rate": 8.194506978838361e-05,
"loss": 0.3365,
"step": 407
},
{
"epoch": 1.2798192179210062,
"grad_norm": 0.1812233179807663,
"learning_rate": 8.190004502476363e-05,
"loss": 0.3051,
"step": 408
},
{
"epoch": 1.2829632540774218,
"grad_norm": 0.17402727901935577,
"learning_rate": 8.185502026114363e-05,
"loss": 0.2906,
"step": 409
},
{
"epoch": 1.2861072902338377,
"grad_norm": 0.18160969018936157,
"learning_rate": 8.180999549752364e-05,
"loss": 0.295,
"step": 410
},
{
"epoch": 1.2892513263902534,
"grad_norm": 0.17364852130413055,
"learning_rate": 8.176497073390366e-05,
"loss": 0.2789,
"step": 411
},
{
"epoch": 1.2923953625466693,
"grad_norm": 0.17983028292655945,
"learning_rate": 8.171994597028367e-05,
"loss": 0.2976,
"step": 412
},
{
"epoch": 1.295539398703085,
"grad_norm": 0.18376639485359192,
"learning_rate": 8.167492120666366e-05,
"loss": 0.2984,
"step": 413
},
{
"epoch": 1.2986834348595009,
"grad_norm": 0.16966019570827484,
"learning_rate": 8.162989644304367e-05,
"loss": 0.2825,
"step": 414
},
{
"epoch": 1.3018274710159168,
"grad_norm": 0.18048398196697235,
"learning_rate": 8.15848716794237e-05,
"loss": 0.3105,
"step": 415
},
{
"epoch": 1.3049715071723325,
"grad_norm": 0.1738966405391693,
"learning_rate": 8.15398469158037e-05,
"loss": 0.27,
"step": 416
},
{
"epoch": 1.3081155433287484,
"grad_norm": 0.22127372026443481,
"learning_rate": 8.14948221521837e-05,
"loss": 0.3156,
"step": 417
},
{
"epoch": 1.311259579485164,
"grad_norm": 0.17313317954540253,
"learning_rate": 8.14497973885637e-05,
"loss": 0.2921,
"step": 418
},
{
"epoch": 1.31440361564158,
"grad_norm": 0.17622841894626617,
"learning_rate": 8.140477262494373e-05,
"loss": 0.3026,
"step": 419
},
{
"epoch": 1.3175476517979956,
"grad_norm": 0.17847168445587158,
"learning_rate": 8.135974786132374e-05,
"loss": 0.3021,
"step": 420
},
{
"epoch": 1.3206916879544115,
"grad_norm": 0.18637776374816895,
"learning_rate": 8.131472309770373e-05,
"loss": 0.3186,
"step": 421
},
{
"epoch": 1.3238357241108272,
"grad_norm": 0.16532807052135468,
"learning_rate": 8.126969833408375e-05,
"loss": 0.2878,
"step": 422
},
{
"epoch": 1.326979760267243,
"grad_norm": 0.16804370284080505,
"learning_rate": 8.122467357046376e-05,
"loss": 0.2868,
"step": 423
},
{
"epoch": 1.330123796423659,
"grad_norm": 0.1693575531244278,
"learning_rate": 8.117964880684377e-05,
"loss": 0.2898,
"step": 424
},
{
"epoch": 1.3332678325800746,
"grad_norm": 0.17773057520389557,
"learning_rate": 8.113462404322378e-05,
"loss": 0.2741,
"step": 425
},
{
"epoch": 1.3364118687364905,
"grad_norm": 0.1866486817598343,
"learning_rate": 8.108959927960379e-05,
"loss": 0.298,
"step": 426
},
{
"epoch": 1.3395559048929062,
"grad_norm": 0.18073201179504395,
"learning_rate": 8.10445745159838e-05,
"loss": 0.2933,
"step": 427
},
{
"epoch": 1.342699941049322,
"grad_norm": 0.17505986988544464,
"learning_rate": 8.09995497523638e-05,
"loss": 0.2936,
"step": 428
},
{
"epoch": 1.3458439772057378,
"grad_norm": 0.17242185771465302,
"learning_rate": 8.095452498874381e-05,
"loss": 0.2827,
"step": 429
},
{
"epoch": 1.3489880133621537,
"grad_norm": 0.16698665916919708,
"learning_rate": 8.090950022512382e-05,
"loss": 0.2742,
"step": 430
},
{
"epoch": 1.3521320495185694,
"grad_norm": 0.18763364851474762,
"learning_rate": 8.086447546150383e-05,
"loss": 0.3192,
"step": 431
},
{
"epoch": 1.3552760856749853,
"grad_norm": 0.18754689395427704,
"learning_rate": 8.081945069788384e-05,
"loss": 0.2932,
"step": 432
},
{
"epoch": 1.3584201218314012,
"grad_norm": 0.18708984553813934,
"learning_rate": 8.077442593426385e-05,
"loss": 0.328,
"step": 433
},
{
"epoch": 1.3615641579878168,
"grad_norm": 0.18035311996936798,
"learning_rate": 8.072940117064386e-05,
"loss": 0.2699,
"step": 434
},
{
"epoch": 1.3647081941442327,
"grad_norm": 0.17291460931301117,
"learning_rate": 8.068437640702387e-05,
"loss": 0.2729,
"step": 435
},
{
"epoch": 1.3678522303006484,
"grad_norm": 0.1894587129354477,
"learning_rate": 8.063935164340387e-05,
"loss": 0.2792,
"step": 436
},
{
"epoch": 1.3709962664570643,
"grad_norm": 0.17740470170974731,
"learning_rate": 8.059432687978388e-05,
"loss": 0.3022,
"step": 437
},
{
"epoch": 1.3741403026134802,
"grad_norm": 0.17968855798244476,
"learning_rate": 8.054930211616389e-05,
"loss": 0.3095,
"step": 438
},
{
"epoch": 1.3772843387698959,
"grad_norm": 0.1771247237920761,
"learning_rate": 8.05042773525439e-05,
"loss": 0.3052,
"step": 439
},
{
"epoch": 1.3804283749263115,
"grad_norm": 0.164938822388649,
"learning_rate": 8.045925258892392e-05,
"loss": 0.2912,
"step": 440
},
{
"epoch": 1.3835724110827274,
"grad_norm": 0.17572474479675293,
"learning_rate": 8.041422782530392e-05,
"loss": 0.2942,
"step": 441
},
{
"epoch": 1.3867164472391433,
"grad_norm": 0.16663512587547302,
"learning_rate": 8.036920306168393e-05,
"loss": 0.2838,
"step": 442
},
{
"epoch": 1.389860483395559,
"grad_norm": 0.1684209108352661,
"learning_rate": 8.032417829806393e-05,
"loss": 0.3024,
"step": 443
},
{
"epoch": 1.393004519551975,
"grad_norm": 0.1704261749982834,
"learning_rate": 8.027915353444396e-05,
"loss": 0.3055,
"step": 444
},
{
"epoch": 1.3961485557083906,
"grad_norm": 0.17855525016784668,
"learning_rate": 8.023412877082395e-05,
"loss": 0.3047,
"step": 445
},
{
"epoch": 1.3992925918648065,
"grad_norm": 0.16438795626163483,
"learning_rate": 8.018910400720396e-05,
"loss": 0.2833,
"step": 446
},
{
"epoch": 1.4024366280212224,
"grad_norm": 0.1803821176290512,
"learning_rate": 8.014407924358398e-05,
"loss": 0.2693,
"step": 447
},
{
"epoch": 1.405580664177638,
"grad_norm": 0.17037837207317352,
"learning_rate": 8.009905447996399e-05,
"loss": 0.2931,
"step": 448
},
{
"epoch": 1.4087247003340537,
"grad_norm": 0.17838133871555328,
"learning_rate": 8.0054029716344e-05,
"loss": 0.3126,
"step": 449
},
{
"epoch": 1.4118687364904696,
"grad_norm": 0.17596563696861267,
"learning_rate": 8.0009004952724e-05,
"loss": 0.3124,
"step": 450
},
{
"epoch": 1.4150127726468855,
"grad_norm": 0.17477372288703918,
"learning_rate": 7.996398018910402e-05,
"loss": 0.2846,
"step": 451
},
{
"epoch": 1.4181568088033012,
"grad_norm": 0.16831114888191223,
"learning_rate": 7.991895542548402e-05,
"loss": 0.2841,
"step": 452
},
{
"epoch": 1.421300844959717,
"grad_norm": 0.16885237395763397,
"learning_rate": 7.987393066186403e-05,
"loss": 0.2882,
"step": 453
},
{
"epoch": 1.4244448811161328,
"grad_norm": 0.1732211410999298,
"learning_rate": 7.982890589824403e-05,
"loss": 0.3065,
"step": 454
},
{
"epoch": 1.4275889172725487,
"grad_norm": 0.17489729821681976,
"learning_rate": 7.978388113462405e-05,
"loss": 0.2937,
"step": 455
},
{
"epoch": 1.4307329534289646,
"grad_norm": 0.1771242618560791,
"learning_rate": 7.973885637100406e-05,
"loss": 0.2963,
"step": 456
},
{
"epoch": 1.4338769895853802,
"grad_norm": 0.19036780297756195,
"learning_rate": 7.969383160738407e-05,
"loss": 0.3104,
"step": 457
},
{
"epoch": 1.437021025741796,
"grad_norm": 0.1863013058900833,
"learning_rate": 7.964880684376408e-05,
"loss": 0.3047,
"step": 458
},
{
"epoch": 1.4401650618982118,
"grad_norm": 0.1722109168767929,
"learning_rate": 7.960378208014408e-05,
"loss": 0.2828,
"step": 459
},
{
"epoch": 1.4433090980546277,
"grad_norm": 0.1802283078432083,
"learning_rate": 7.95587573165241e-05,
"loss": 0.2978,
"step": 460
},
{
"epoch": 1.4464531342110434,
"grad_norm": 0.17628727853298187,
"learning_rate": 7.95137325529041e-05,
"loss": 0.2963,
"step": 461
},
{
"epoch": 1.4495971703674593,
"grad_norm": 0.17598123848438263,
"learning_rate": 7.946870778928411e-05,
"loss": 0.3106,
"step": 462
},
{
"epoch": 1.452741206523875,
"grad_norm": 0.17388591170310974,
"learning_rate": 7.942368302566412e-05,
"loss": 0.3067,
"step": 463
},
{
"epoch": 1.4558852426802908,
"grad_norm": 0.17893949151039124,
"learning_rate": 7.937865826204413e-05,
"loss": 0.2756,
"step": 464
},
{
"epoch": 1.4590292788367067,
"grad_norm": 0.16779755055904388,
"learning_rate": 7.933363349842414e-05,
"loss": 0.2911,
"step": 465
},
{
"epoch": 1.4621733149931224,
"grad_norm": 0.19151651859283447,
"learning_rate": 7.928860873480414e-05,
"loss": 0.2919,
"step": 466
},
{
"epoch": 1.465317351149538,
"grad_norm": 0.17654001712799072,
"learning_rate": 7.924358397118415e-05,
"loss": 0.3021,
"step": 467
},
{
"epoch": 1.468461387305954,
"grad_norm": 0.17647038400173187,
"learning_rate": 7.919855920756416e-05,
"loss": 0.271,
"step": 468
},
{
"epoch": 1.4716054234623699,
"grad_norm": 0.17813007533550262,
"learning_rate": 7.915353444394417e-05,
"loss": 0.3031,
"step": 469
},
{
"epoch": 1.4747494596187856,
"grad_norm": 0.19432079792022705,
"learning_rate": 7.910850968032418e-05,
"loss": 0.3529,
"step": 470
},
{
"epoch": 1.4778934957752015,
"grad_norm": 0.18345120549201965,
"learning_rate": 7.906348491670419e-05,
"loss": 0.2832,
"step": 471
},
{
"epoch": 1.4810375319316171,
"grad_norm": 0.1722515970468521,
"learning_rate": 7.90184601530842e-05,
"loss": 0.3294,
"step": 472
},
{
"epoch": 1.484181568088033,
"grad_norm": 0.1815156191587448,
"learning_rate": 7.897343538946422e-05,
"loss": 0.3025,
"step": 473
},
{
"epoch": 1.487325604244449,
"grad_norm": 0.17528071999549866,
"learning_rate": 7.892841062584421e-05,
"loss": 0.3167,
"step": 474
},
{
"epoch": 1.4904696404008646,
"grad_norm": 0.1877971738576889,
"learning_rate": 7.888338586222422e-05,
"loss": 0.3088,
"step": 475
},
{
"epoch": 1.4936136765572803,
"grad_norm": 0.16935402154922485,
"learning_rate": 7.883836109860424e-05,
"loss": 0.3003,
"step": 476
},
{
"epoch": 1.4967577127136962,
"grad_norm": 0.1625109314918518,
"learning_rate": 7.879333633498425e-05,
"loss": 0.2853,
"step": 477
},
{
"epoch": 1.499901748870112,
"grad_norm": 0.17674805223941803,
"learning_rate": 7.874831157136425e-05,
"loss": 0.3047,
"step": 478
},
{
"epoch": 1.503045785026528,
"grad_norm": 0.168808251619339,
"learning_rate": 7.870328680774426e-05,
"loss": 0.301,
"step": 479
},
{
"epoch": 1.5061898211829436,
"grad_norm": 0.1753881871700287,
"learning_rate": 7.865826204412428e-05,
"loss": 0.2939,
"step": 480
},
{
"epoch": 1.5093338573393593,
"grad_norm": 0.16852906346321106,
"learning_rate": 7.861323728050429e-05,
"loss": 0.2989,
"step": 481
},
{
"epoch": 1.5124778934957752,
"grad_norm": 0.16612806916236877,
"learning_rate": 7.856821251688428e-05,
"loss": 0.2731,
"step": 482
},
{
"epoch": 1.5156219296521911,
"grad_norm": 0.17498096823692322,
"learning_rate": 7.85231877532643e-05,
"loss": 0.2903,
"step": 483
},
{
"epoch": 1.5187659658086068,
"grad_norm": 0.1843009740114212,
"learning_rate": 7.847816298964431e-05,
"loss": 0.3168,
"step": 484
},
{
"epoch": 1.5219100019650225,
"grad_norm": 0.17858386039733887,
"learning_rate": 7.843313822602432e-05,
"loss": 0.2954,
"step": 485
},
{
"epoch": 1.5250540381214384,
"grad_norm": 0.18993936479091644,
"learning_rate": 7.838811346240432e-05,
"loss": 0.3264,
"step": 486
},
{
"epoch": 1.5281980742778543,
"grad_norm": 0.1731633096933365,
"learning_rate": 7.834308869878434e-05,
"loss": 0.2982,
"step": 487
},
{
"epoch": 1.5313421104342702,
"grad_norm": 0.17727167904376984,
"learning_rate": 7.829806393516435e-05,
"loss": 0.2854,
"step": 488
},
{
"epoch": 1.5344861465906858,
"grad_norm": 0.17536379396915436,
"learning_rate": 7.825303917154436e-05,
"loss": 0.2788,
"step": 489
},
{
"epoch": 1.5376301827471015,
"grad_norm": 0.1785167157649994,
"learning_rate": 7.820801440792435e-05,
"loss": 0.3007,
"step": 490
},
{
"epoch": 1.5407742189035174,
"grad_norm": 0.1738578975200653,
"learning_rate": 7.816298964430437e-05,
"loss": 0.3015,
"step": 491
},
{
"epoch": 1.5439182550599333,
"grad_norm": 0.1737809181213379,
"learning_rate": 7.811796488068438e-05,
"loss": 0.3031,
"step": 492
},
{
"epoch": 1.547062291216349,
"grad_norm": 0.17526312172412872,
"learning_rate": 7.807294011706439e-05,
"loss": 0.3072,
"step": 493
},
{
"epoch": 1.5502063273727646,
"grad_norm": 0.17959162592887878,
"learning_rate": 7.80279153534444e-05,
"loss": 0.2699,
"step": 494
},
{
"epoch": 1.5533503635291805,
"grad_norm": 0.17218153178691864,
"learning_rate": 7.798289058982441e-05,
"loss": 0.2734,
"step": 495
},
{
"epoch": 1.5564943996855964,
"grad_norm": 0.17062252759933472,
"learning_rate": 7.793786582620442e-05,
"loss": 0.2426,
"step": 496
},
{
"epoch": 1.5596384358420123,
"grad_norm": 0.19795489311218262,
"learning_rate": 7.789284106258442e-05,
"loss": 0.3396,
"step": 497
},
{
"epoch": 1.562782471998428,
"grad_norm": 0.18899548053741455,
"learning_rate": 7.784781629896443e-05,
"loss": 0.295,
"step": 498
},
{
"epoch": 1.5659265081548437,
"grad_norm": 0.18889367580413818,
"learning_rate": 7.780279153534444e-05,
"loss": 0.2996,
"step": 499
},
{
"epoch": 1.5690705443112596,
"grad_norm": 0.184955894947052,
"learning_rate": 7.775776677172445e-05,
"loss": 0.2833,
"step": 500
},
{
"epoch": 1.5722145804676755,
"grad_norm": 0.16244037449359894,
"learning_rate": 7.771274200810447e-05,
"loss": 0.2677,
"step": 501
},
{
"epoch": 1.5753586166240912,
"grad_norm": 0.19440148770809174,
"learning_rate": 7.766771724448447e-05,
"loss": 0.3052,
"step": 502
},
{
"epoch": 1.5785026527805068,
"grad_norm": 0.1759510636329651,
"learning_rate": 7.762269248086448e-05,
"loss": 0.2824,
"step": 503
},
{
"epoch": 1.5816466889369227,
"grad_norm": 0.17166948318481445,
"learning_rate": 7.757766771724448e-05,
"loss": 0.2911,
"step": 504
},
{
"epoch": 1.5847907250933386,
"grad_norm": 0.17509418725967407,
"learning_rate": 7.753264295362451e-05,
"loss": 0.2845,
"step": 505
},
{
"epoch": 1.5879347612497545,
"grad_norm": 0.16338001191616058,
"learning_rate": 7.74876181900045e-05,
"loss": 0.2759,
"step": 506
},
{
"epoch": 1.5910787974061702,
"grad_norm": 0.1770390421152115,
"learning_rate": 7.744259342638451e-05,
"loss": 0.3137,
"step": 507
},
{
"epoch": 1.5942228335625859,
"grad_norm": 0.17159558832645416,
"learning_rate": 7.739756866276452e-05,
"loss": 0.2736,
"step": 508
},
{
"epoch": 1.5973668697190018,
"grad_norm": 0.18849338591098785,
"learning_rate": 7.735254389914454e-05,
"loss": 0.2775,
"step": 509
},
{
"epoch": 1.6005109058754177,
"grad_norm": 0.18084058165550232,
"learning_rate": 7.730751913552454e-05,
"loss": 0.2883,
"step": 510
},
{
"epoch": 1.6036549420318333,
"grad_norm": 0.1859467774629593,
"learning_rate": 7.726249437190454e-05,
"loss": 0.3053,
"step": 511
},
{
"epoch": 1.606798978188249,
"grad_norm": 0.18158085644245148,
"learning_rate": 7.721746960828457e-05,
"loss": 0.2923,
"step": 512
},
{
"epoch": 1.609943014344665,
"grad_norm": 0.18600253760814667,
"learning_rate": 7.717244484466458e-05,
"loss": 0.3107,
"step": 513
},
{
"epoch": 1.6130870505010808,
"grad_norm": 0.1869710236787796,
"learning_rate": 7.712742008104457e-05,
"loss": 0.2821,
"step": 514
},
{
"epoch": 1.6162310866574967,
"grad_norm": 0.1755673587322235,
"learning_rate": 7.708239531742458e-05,
"loss": 0.2621,
"step": 515
},
{
"epoch": 1.6193751228139124,
"grad_norm": 0.17789125442504883,
"learning_rate": 7.70373705538046e-05,
"loss": 0.3073,
"step": 516
},
{
"epoch": 1.622519158970328,
"grad_norm": 0.16756756603717804,
"learning_rate": 7.699234579018461e-05,
"loss": 0.3078,
"step": 517
},
{
"epoch": 1.625663195126744,
"grad_norm": 0.17822512984275818,
"learning_rate": 7.69473210265646e-05,
"loss": 0.2998,
"step": 518
},
{
"epoch": 1.6288072312831599,
"grad_norm": 0.16880451142787933,
"learning_rate": 7.690229626294463e-05,
"loss": 0.2918,
"step": 519
},
{
"epoch": 1.6319512674395755,
"grad_norm": 0.1791965365409851,
"learning_rate": 7.685727149932464e-05,
"loss": 0.2898,
"step": 520
},
{
"epoch": 1.6350953035959912,
"grad_norm": 0.17452813684940338,
"learning_rate": 7.681224673570464e-05,
"loss": 0.2885,
"step": 521
},
{
"epoch": 1.638239339752407,
"grad_norm": 0.18743397295475006,
"learning_rate": 7.676722197208465e-05,
"loss": 0.3007,
"step": 522
},
{
"epoch": 1.641383375908823,
"grad_norm": 0.18785692751407623,
"learning_rate": 7.672219720846466e-05,
"loss": 0.2928,
"step": 523
},
{
"epoch": 1.644527412065239,
"grad_norm": 0.19505468010902405,
"learning_rate": 7.667717244484467e-05,
"loss": 0.3206,
"step": 524
},
{
"epoch": 1.6476714482216546,
"grad_norm": 0.1750132143497467,
"learning_rate": 7.663214768122468e-05,
"loss": 0.2764,
"step": 525
},
{
"epoch": 1.6508154843780702,
"grad_norm": 0.18247836828231812,
"learning_rate": 7.658712291760469e-05,
"loss": 0.314,
"step": 526
},
{
"epoch": 1.6539595205344861,
"grad_norm": 0.1866837590932846,
"learning_rate": 7.65420981539847e-05,
"loss": 0.3158,
"step": 527
},
{
"epoch": 1.657103556690902,
"grad_norm": 0.17475096881389618,
"learning_rate": 7.64970733903647e-05,
"loss": 0.2964,
"step": 528
},
{
"epoch": 1.6602475928473177,
"grad_norm": 0.1679716855287552,
"learning_rate": 7.645204862674471e-05,
"loss": 0.3122,
"step": 529
},
{
"epoch": 1.6633916290037334,
"grad_norm": 0.16546200215816498,
"learning_rate": 7.640702386312472e-05,
"loss": 0.3125,
"step": 530
},
{
"epoch": 1.6665356651601493,
"grad_norm": 0.16651305556297302,
"learning_rate": 7.636199909950473e-05,
"loss": 0.2636,
"step": 531
},
{
"epoch": 1.6696797013165652,
"grad_norm": 0.16956521570682526,
"learning_rate": 7.631697433588474e-05,
"loss": 0.2956,
"step": 532
},
{
"epoch": 1.672823737472981,
"grad_norm": 0.17262689769268036,
"learning_rate": 7.627194957226475e-05,
"loss": 0.2889,
"step": 533
},
{
"epoch": 1.6759677736293968,
"grad_norm": 0.17842979729175568,
"learning_rate": 7.622692480864476e-05,
"loss": 0.3175,
"step": 534
},
{
"epoch": 1.6791118097858124,
"grad_norm": 0.18716371059417725,
"learning_rate": 7.618190004502476e-05,
"loss": 0.294,
"step": 535
},
{
"epoch": 1.6822558459422283,
"grad_norm": 0.17072086036205292,
"learning_rate": 7.613687528140477e-05,
"loss": 0.3001,
"step": 536
},
{
"epoch": 1.6853998820986442,
"grad_norm": 0.16700303554534912,
"learning_rate": 7.60918505177848e-05,
"loss": 0.2576,
"step": 537
},
{
"epoch": 1.68854391825506,
"grad_norm": 0.17436909675598145,
"learning_rate": 7.604682575416479e-05,
"loss": 0.2986,
"step": 538
},
{
"epoch": 1.6916879544114756,
"grad_norm": 0.1712087094783783,
"learning_rate": 7.60018009905448e-05,
"loss": 0.2824,
"step": 539
},
{
"epoch": 1.6948319905678915,
"grad_norm": 0.17220038175582886,
"learning_rate": 7.595677622692481e-05,
"loss": 0.2901,
"step": 540
},
{
"epoch": 1.6979760267243074,
"grad_norm": 0.18637694418430328,
"learning_rate": 7.591175146330483e-05,
"loss": 0.3375,
"step": 541
},
{
"epoch": 1.7011200628807233,
"grad_norm": 0.1691576987504959,
"learning_rate": 7.586672669968482e-05,
"loss": 0.2638,
"step": 542
},
{
"epoch": 1.704264099037139,
"grad_norm": 0.17768289148807526,
"learning_rate": 7.582170193606483e-05,
"loss": 0.2879,
"step": 543
},
{
"epoch": 1.7074081351935546,
"grad_norm": 0.1812208741903305,
"learning_rate": 7.577667717244484e-05,
"loss": 0.3129,
"step": 544
},
{
"epoch": 1.7105521713499705,
"grad_norm": 0.18346074223518372,
"learning_rate": 7.573165240882486e-05,
"loss": 0.302,
"step": 545
},
{
"epoch": 1.7136962075063864,
"grad_norm": 0.17309945821762085,
"learning_rate": 7.568662764520487e-05,
"loss": 0.255,
"step": 546
},
{
"epoch": 1.716840243662802,
"grad_norm": 0.1879347264766693,
"learning_rate": 7.564160288158487e-05,
"loss": 0.3124,
"step": 547
},
{
"epoch": 1.719984279819218,
"grad_norm": 0.1695443093776703,
"learning_rate": 7.559657811796489e-05,
"loss": 0.2809,
"step": 548
},
{
"epoch": 1.7231283159756337,
"grad_norm": 0.17476417124271393,
"learning_rate": 7.55515533543449e-05,
"loss": 0.3043,
"step": 549
},
{
"epoch": 1.7262723521320495,
"grad_norm": 0.1775609701871872,
"learning_rate": 7.550652859072491e-05,
"loss": 0.289,
"step": 550
},
{
"epoch": 1.7294163882884654,
"grad_norm": 0.17453855276107788,
"learning_rate": 7.54615038271049e-05,
"loss": 0.2914,
"step": 551
},
{
"epoch": 1.7325604244448811,
"grad_norm": 0.18414853513240814,
"learning_rate": 7.541647906348492e-05,
"loss": 0.3184,
"step": 552
},
{
"epoch": 1.7357044606012968,
"grad_norm": 0.18060451745986938,
"learning_rate": 7.537145429986493e-05,
"loss": 0.2998,
"step": 553
},
{
"epoch": 1.7388484967577127,
"grad_norm": 0.1735735535621643,
"learning_rate": 7.532642953624494e-05,
"loss": 0.3192,
"step": 554
},
{
"epoch": 1.7419925329141286,
"grad_norm": 0.17077748477458954,
"learning_rate": 7.528140477262495e-05,
"loss": 0.2912,
"step": 555
},
{
"epoch": 1.7451365690705443,
"grad_norm": 0.16513197124004364,
"learning_rate": 7.523638000900496e-05,
"loss": 0.3024,
"step": 556
},
{
"epoch": 1.7482806052269602,
"grad_norm": 0.1681637018918991,
"learning_rate": 7.519135524538497e-05,
"loss": 0.2764,
"step": 557
},
{
"epoch": 1.7514246413833758,
"grad_norm": 0.17060600221157074,
"learning_rate": 7.514633048176498e-05,
"loss": 0.272,
"step": 558
},
{
"epoch": 1.7545686775397917,
"grad_norm": 0.1727294623851776,
"learning_rate": 7.510130571814498e-05,
"loss": 0.2938,
"step": 559
},
{
"epoch": 1.7577127136962076,
"grad_norm": 0.16411182284355164,
"learning_rate": 7.505628095452499e-05,
"loss": 0.2864,
"step": 560
},
{
"epoch": 1.7608567498526233,
"grad_norm": 0.16701269149780273,
"learning_rate": 7.5011256190905e-05,
"loss": 0.276,
"step": 561
},
{
"epoch": 1.764000786009039,
"grad_norm": 0.16412830352783203,
"learning_rate": 7.496623142728501e-05,
"loss": 0.2836,
"step": 562
},
{
"epoch": 1.7671448221654549,
"grad_norm": 0.17730842530727386,
"learning_rate": 7.492120666366502e-05,
"loss": 0.2812,
"step": 563
},
{
"epoch": 1.7702888583218708,
"grad_norm": 0.16831046342849731,
"learning_rate": 7.487618190004503e-05,
"loss": 0.2832,
"step": 564
},
{
"epoch": 1.7734328944782864,
"grad_norm": 0.17002396285533905,
"learning_rate": 7.483115713642504e-05,
"loss": 0.2884,
"step": 565
},
{
"epoch": 1.7765769306347023,
"grad_norm": 0.181968092918396,
"learning_rate": 7.478613237280504e-05,
"loss": 0.32,
"step": 566
},
{
"epoch": 1.779720966791118,
"grad_norm": 0.18976394832134247,
"learning_rate": 7.474110760918505e-05,
"loss": 0.2993,
"step": 567
},
{
"epoch": 1.782865002947534,
"grad_norm": 0.1806926429271698,
"learning_rate": 7.469608284556506e-05,
"loss": 0.2914,
"step": 568
},
{
"epoch": 1.7860090391039498,
"grad_norm": 0.17822052538394928,
"learning_rate": 7.465105808194507e-05,
"loss": 0.2843,
"step": 569
},
{
"epoch": 1.7891530752603655,
"grad_norm": 0.18080289661884308,
"learning_rate": 7.460603331832509e-05,
"loss": 0.3121,
"step": 570
},
{
"epoch": 1.7922971114167812,
"grad_norm": 0.17676854133605957,
"learning_rate": 7.456100855470509e-05,
"loss": 0.2901,
"step": 571
},
{
"epoch": 1.795441147573197,
"grad_norm": 0.16959191858768463,
"learning_rate": 7.45159837910851e-05,
"loss": 0.3058,
"step": 572
},
{
"epoch": 1.798585183729613,
"grad_norm": 0.16757243871688843,
"learning_rate": 7.447095902746512e-05,
"loss": 0.2982,
"step": 573
},
{
"epoch": 1.8017292198860289,
"grad_norm": 0.1798073798418045,
"learning_rate": 7.442593426384513e-05,
"loss": 0.3097,
"step": 574
},
{
"epoch": 1.8048732560424445,
"grad_norm": 0.16888341307640076,
"learning_rate": 7.438090950022512e-05,
"loss": 0.2763,
"step": 575
},
{
"epoch": 1.8080172921988602,
"grad_norm": 0.17195682227611542,
"learning_rate": 7.433588473660513e-05,
"loss": 0.2778,
"step": 576
},
{
"epoch": 1.811161328355276,
"grad_norm": 0.17291922867298126,
"learning_rate": 7.429085997298515e-05,
"loss": 0.2894,
"step": 577
},
{
"epoch": 1.814305364511692,
"grad_norm": 0.17213907837867737,
"learning_rate": 7.424583520936516e-05,
"loss": 0.2998,
"step": 578
},
{
"epoch": 1.8174494006681077,
"grad_norm": 0.16969838738441467,
"learning_rate": 7.420081044574516e-05,
"loss": 0.2953,
"step": 579
},
{
"epoch": 1.8205934368245233,
"grad_norm": 0.16950733959674835,
"learning_rate": 7.415578568212516e-05,
"loss": 0.2774,
"step": 580
},
{
"epoch": 1.8237374729809392,
"grad_norm": 0.1866762787103653,
"learning_rate": 7.411076091850519e-05,
"loss": 0.3004,
"step": 581
},
{
"epoch": 1.8268815091373551,
"grad_norm": 0.18050317466259003,
"learning_rate": 7.40657361548852e-05,
"loss": 0.2867,
"step": 582
},
{
"epoch": 1.830025545293771,
"grad_norm": 0.19073279201984406,
"learning_rate": 7.402071139126519e-05,
"loss": 0.2874,
"step": 583
},
{
"epoch": 1.8331695814501867,
"grad_norm": 0.18162357807159424,
"learning_rate": 7.397568662764521e-05,
"loss": 0.3211,
"step": 584
},
{
"epoch": 1.8363136176066024,
"grad_norm": 0.17108604311943054,
"learning_rate": 7.393066186402522e-05,
"loss": 0.3021,
"step": 585
},
{
"epoch": 1.8394576537630183,
"grad_norm": 0.17849913239479065,
"learning_rate": 7.388563710040523e-05,
"loss": 0.3127,
"step": 586
},
{
"epoch": 1.8426016899194342,
"grad_norm": 0.16922686994075775,
"learning_rate": 7.384061233678522e-05,
"loss": 0.2732,
"step": 587
},
{
"epoch": 1.8457457260758499,
"grad_norm": 0.17308250069618225,
"learning_rate": 7.379558757316525e-05,
"loss": 0.2932,
"step": 588
},
{
"epoch": 1.8488897622322655,
"grad_norm": 0.18480746448040009,
"learning_rate": 7.375056280954526e-05,
"loss": 0.3036,
"step": 589
},
{
"epoch": 1.8520337983886814,
"grad_norm": 0.16831083595752716,
"learning_rate": 7.370553804592526e-05,
"loss": 0.2913,
"step": 590
},
{
"epoch": 1.8551778345450973,
"grad_norm": 0.1726708710193634,
"learning_rate": 7.366051328230527e-05,
"loss": 0.2683,
"step": 591
},
{
"epoch": 1.8583218707015132,
"grad_norm": 0.17040051519870758,
"learning_rate": 7.361548851868528e-05,
"loss": 0.2681,
"step": 592
},
{
"epoch": 1.861465906857929,
"grad_norm": 0.18175894021987915,
"learning_rate": 7.357046375506529e-05,
"loss": 0.2643,
"step": 593
},
{
"epoch": 1.8646099430143446,
"grad_norm": 0.18901702761650085,
"learning_rate": 7.35254389914453e-05,
"loss": 0.2859,
"step": 594
},
{
"epoch": 1.8677539791707605,
"grad_norm": 0.18690907955169678,
"learning_rate": 7.348041422782531e-05,
"loss": 0.3001,
"step": 595
},
{
"epoch": 1.8708980153271764,
"grad_norm": 0.16587451100349426,
"learning_rate": 7.343538946420532e-05,
"loss": 0.2833,
"step": 596
},
{
"epoch": 1.874042051483592,
"grad_norm": 0.170462504029274,
"learning_rate": 7.339036470058532e-05,
"loss": 0.2754,
"step": 597
},
{
"epoch": 1.8771860876400077,
"grad_norm": 0.17350532114505768,
"learning_rate": 7.334533993696533e-05,
"loss": 0.2956,
"step": 598
},
{
"epoch": 1.8803301237964236,
"grad_norm": 0.1863803118467331,
"learning_rate": 7.330031517334534e-05,
"loss": 0.3071,
"step": 599
},
{
"epoch": 1.8834741599528395,
"grad_norm": 0.17055153846740723,
"learning_rate": 7.325529040972535e-05,
"loss": 0.3082,
"step": 600
},
{
"epoch": 1.8866181961092554,
"grad_norm": 0.17581762373447418,
"learning_rate": 7.321026564610536e-05,
"loss": 0.2947,
"step": 601
},
{
"epoch": 1.889762232265671,
"grad_norm": 0.18630677461624146,
"learning_rate": 7.316524088248538e-05,
"loss": 0.315,
"step": 602
},
{
"epoch": 1.8929062684220868,
"grad_norm": 0.18152126669883728,
"learning_rate": 7.312021611886538e-05,
"loss": 0.312,
"step": 603
},
{
"epoch": 1.8960503045785027,
"grad_norm": 0.1875506043434143,
"learning_rate": 7.307519135524538e-05,
"loss": 0.3224,
"step": 604
},
{
"epoch": 1.8991943407349186,
"grad_norm": 0.18474234640598297,
"learning_rate": 7.303016659162539e-05,
"loss": 0.2944,
"step": 605
},
{
"epoch": 1.9023383768913342,
"grad_norm": 0.18096047639846802,
"learning_rate": 7.298514182800542e-05,
"loss": 0.3152,
"step": 606
},
{
"epoch": 1.90548241304775,
"grad_norm": 0.16774339973926544,
"learning_rate": 7.294011706438541e-05,
"loss": 0.2588,
"step": 607
},
{
"epoch": 1.9086264492041658,
"grad_norm": 0.17628465592861176,
"learning_rate": 7.289509230076542e-05,
"loss": 0.2923,
"step": 608
},
{
"epoch": 1.9117704853605817,
"grad_norm": 0.17404650151729584,
"learning_rate": 7.285006753714544e-05,
"loss": 0.2852,
"step": 609
},
{
"epoch": 1.9149145215169976,
"grad_norm": 0.1805901676416397,
"learning_rate": 7.280504277352545e-05,
"loss": 0.2866,
"step": 610
},
{
"epoch": 1.9180585576734133,
"grad_norm": 0.18428674340248108,
"learning_rate": 7.276001800990544e-05,
"loss": 0.3207,
"step": 611
},
{
"epoch": 1.921202593829829,
"grad_norm": 0.1807202845811844,
"learning_rate": 7.271499324628545e-05,
"loss": 0.2933,
"step": 612
},
{
"epoch": 1.9243466299862448,
"grad_norm": 0.17590177059173584,
"learning_rate": 7.266996848266548e-05,
"loss": 0.2732,
"step": 613
},
{
"epoch": 1.9274906661426607,
"grad_norm": 0.16720589995384216,
"learning_rate": 7.262494371904548e-05,
"loss": 0.3146,
"step": 614
},
{
"epoch": 1.9306347022990764,
"grad_norm": 0.1786167472600937,
"learning_rate": 7.257991895542548e-05,
"loss": 0.2892,
"step": 615
},
{
"epoch": 1.933778738455492,
"grad_norm": 0.17377831041812897,
"learning_rate": 7.253489419180549e-05,
"loss": 0.3105,
"step": 616
},
{
"epoch": 1.936922774611908,
"grad_norm": 0.18173402547836304,
"learning_rate": 7.248986942818551e-05,
"loss": 0.3436,
"step": 617
},
{
"epoch": 1.9400668107683239,
"grad_norm": 0.17383264005184174,
"learning_rate": 7.244484466456552e-05,
"loss": 0.2823,
"step": 618
},
{
"epoch": 1.9432108469247398,
"grad_norm": 0.18473853170871735,
"learning_rate": 7.239981990094553e-05,
"loss": 0.2867,
"step": 619
},
{
"epoch": 1.9463548830811555,
"grad_norm": 0.17817547917366028,
"learning_rate": 7.235479513732554e-05,
"loss": 0.3056,
"step": 620
},
{
"epoch": 1.9494989192375711,
"grad_norm": 0.17514194548130035,
"learning_rate": 7.230977037370554e-05,
"loss": 0.2951,
"step": 621
},
{
"epoch": 1.952642955393987,
"grad_norm": 0.17744790017604828,
"learning_rate": 7.226474561008555e-05,
"loss": 0.293,
"step": 622
},
{
"epoch": 1.955786991550403,
"grad_norm": 0.1766396164894104,
"learning_rate": 7.221972084646556e-05,
"loss": 0.2775,
"step": 623
},
{
"epoch": 1.9589310277068186,
"grad_norm": 0.17238113284111023,
"learning_rate": 7.217469608284557e-05,
"loss": 0.2873,
"step": 624
},
{
"epoch": 1.9620750638632343,
"grad_norm": 0.17035745084285736,
"learning_rate": 7.212967131922558e-05,
"loss": 0.2552,
"step": 625
},
{
"epoch": 1.9652191000196502,
"grad_norm": 0.17209386825561523,
"learning_rate": 7.208464655560559e-05,
"loss": 0.2826,
"step": 626
},
{
"epoch": 1.968363136176066,
"grad_norm": 0.17958694696426392,
"learning_rate": 7.20396217919856e-05,
"loss": 0.3108,
"step": 627
},
{
"epoch": 1.971507172332482,
"grad_norm": 0.18314975500106812,
"learning_rate": 7.19945970283656e-05,
"loss": 0.3155,
"step": 628
},
{
"epoch": 1.9746512084888976,
"grad_norm": 0.17581366002559662,
"learning_rate": 7.194957226474561e-05,
"loss": 0.2829,
"step": 629
},
{
"epoch": 1.9777952446453133,
"grad_norm": 0.1770240068435669,
"learning_rate": 7.190454750112562e-05,
"loss": 0.2861,
"step": 630
},
{
"epoch": 1.9809392808017292,
"grad_norm": 0.17571915686130524,
"learning_rate": 7.185952273750563e-05,
"loss": 0.2827,
"step": 631
},
{
"epoch": 1.984083316958145,
"grad_norm": 0.18270526826381683,
"learning_rate": 7.181449797388564e-05,
"loss": 0.325,
"step": 632
},
{
"epoch": 1.9872273531145608,
"grad_norm": 0.18204954266548157,
"learning_rate": 7.176947321026565e-05,
"loss": 0.3038,
"step": 633
},
{
"epoch": 1.9903713892709765,
"grad_norm": 0.16646772623062134,
"learning_rate": 7.172444844664566e-05,
"loss": 0.2859,
"step": 634
},
{
"epoch": 1.9935154254273924,
"grad_norm": 0.1777997761964798,
"learning_rate": 7.167942368302566e-05,
"loss": 0.2793,
"step": 635
},
{
"epoch": 1.9966594615838082,
"grad_norm": 0.1707630306482315,
"learning_rate": 7.163439891940567e-05,
"loss": 0.2845,
"step": 636
},
{
"epoch": 1.9998034977402241,
"grad_norm": 0.17496661841869354,
"learning_rate": 7.158937415578568e-05,
"loss": 0.2853,
"step": 637
},
{
"epoch": 2.0,
"grad_norm": 0.7054563760757446,
"learning_rate": 7.15443493921657e-05,
"loss": 0.3121,
"step": 638
},
{
"epoch": 2.0,
"eval_loss": 0.313894122838974,
"eval_runtime": 102.2414,
"eval_samples_per_second": 12.441,
"eval_steps_per_second": 12.441,
"step": 638
}
],
"logging_steps": 1,
"max_steps": 2226,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.882536262118605e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}