9b-55 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
638a27f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1098,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00546448087431694,
"grad_norm": 1.0290616750717163,
"learning_rate": 5.454545454545455e-07,
"loss": 2.5600106716156006,
"step": 2
},
{
"epoch": 0.01092896174863388,
"grad_norm": 0.9784867763519287,
"learning_rate": 1.6363636363636363e-06,
"loss": 1.935613989830017,
"step": 4
},
{
"epoch": 0.01639344262295082,
"grad_norm": 0.6667375564575195,
"learning_rate": 2.7272727272727272e-06,
"loss": 1.8673298358917236,
"step": 6
},
{
"epoch": 0.02185792349726776,
"grad_norm": 0.76627117395401,
"learning_rate": 3.818181818181818e-06,
"loss": 1.7078779935836792,
"step": 8
},
{
"epoch": 0.0273224043715847,
"grad_norm": 0.5361675024032593,
"learning_rate": 4.90909090909091e-06,
"loss": 1.6706163883209229,
"step": 10
},
{
"epoch": 0.03278688524590164,
"grad_norm": 0.8627806901931763,
"learning_rate": 6e-06,
"loss": 1.5410383939743042,
"step": 12
},
{
"epoch": 0.03825136612021858,
"grad_norm": 0.7060588598251343,
"learning_rate": 7.090909090909091e-06,
"loss": 1.4492840766906738,
"step": 14
},
{
"epoch": 0.04371584699453552,
"grad_norm": 0.43548259139060974,
"learning_rate": 8.181818181818181e-06,
"loss": 1.1622447967529297,
"step": 16
},
{
"epoch": 0.04918032786885246,
"grad_norm": 0.5143709182739258,
"learning_rate": 9.272727272727273e-06,
"loss": 1.1726503372192383,
"step": 18
},
{
"epoch": 0.0546448087431694,
"grad_norm": 0.36442285776138306,
"learning_rate": 1.0363636363636364e-05,
"loss": 1.3827235698699951,
"step": 20
},
{
"epoch": 0.060109289617486336,
"grad_norm": 0.39796414971351624,
"learning_rate": 1.1454545454545455e-05,
"loss": 1.30994713306427,
"step": 22
},
{
"epoch": 0.06557377049180328,
"grad_norm": 0.6557633280754089,
"learning_rate": 1.2545454545454545e-05,
"loss": 1.3867391347885132,
"step": 24
},
{
"epoch": 0.07103825136612021,
"grad_norm": 2.191542863845825,
"learning_rate": 1.3636363636363637e-05,
"loss": 1.5845718383789062,
"step": 26
},
{
"epoch": 0.07650273224043716,
"grad_norm": 0.48486751317977905,
"learning_rate": 1.4727272727272728e-05,
"loss": 1.3515061140060425,
"step": 28
},
{
"epoch": 0.08196721311475409,
"grad_norm": 0.39387887716293335,
"learning_rate": 1.5818181818181818e-05,
"loss": 1.2701536417007446,
"step": 30
},
{
"epoch": 0.08743169398907104,
"grad_norm": 0.36588501930236816,
"learning_rate": 1.6909090909090907e-05,
"loss": 1.2911758422851562,
"step": 32
},
{
"epoch": 0.09289617486338798,
"grad_norm": 0.541991651058197,
"learning_rate": 1.8e-05,
"loss": 1.014011263847351,
"step": 34
},
{
"epoch": 0.09836065573770492,
"grad_norm": 0.3574058413505554,
"learning_rate": 1.909090909090909e-05,
"loss": 1.1121183633804321,
"step": 36
},
{
"epoch": 0.10382513661202186,
"grad_norm": 0.5543901324272156,
"learning_rate": 2.0181818181818183e-05,
"loss": 1.3590247631072998,
"step": 38
},
{
"epoch": 0.1092896174863388,
"grad_norm": 0.28989821672439575,
"learning_rate": 2.1272727272727273e-05,
"loss": 1.3065768480300903,
"step": 40
},
{
"epoch": 0.11475409836065574,
"grad_norm": 0.8808363676071167,
"learning_rate": 2.2363636363636366e-05,
"loss": 0.9873176217079163,
"step": 42
},
{
"epoch": 0.12021857923497267,
"grad_norm": 0.40827518701553345,
"learning_rate": 2.3454545454545456e-05,
"loss": 1.0595043897628784,
"step": 44
},
{
"epoch": 0.12568306010928962,
"grad_norm": 0.3883642852306366,
"learning_rate": 2.454545454545455e-05,
"loss": 1.348591923713684,
"step": 46
},
{
"epoch": 0.13114754098360656,
"grad_norm": 0.3302466571331024,
"learning_rate": 2.5636363636363635e-05,
"loss": 1.3286434412002563,
"step": 48
},
{
"epoch": 0.1366120218579235,
"grad_norm": 0.5504478812217712,
"learning_rate": 2.6727272727272728e-05,
"loss": 1.2936590909957886,
"step": 50
},
{
"epoch": 0.14207650273224043,
"grad_norm": 0.39443132281303406,
"learning_rate": 2.7818181818181818e-05,
"loss": 1.2653636932373047,
"step": 52
},
{
"epoch": 0.14754098360655737,
"grad_norm": 1.2607563734054565,
"learning_rate": 2.890909090909091e-05,
"loss": 0.8410840630531311,
"step": 54
},
{
"epoch": 0.15300546448087432,
"grad_norm": 0.4433146119117737,
"learning_rate": 3e-05,
"loss": 1.4708025455474854,
"step": 56
},
{
"epoch": 0.15846994535519127,
"grad_norm": 0.6779677867889404,
"learning_rate": 2.9997491688899256e-05,
"loss": 1.4647767543792725,
"step": 58
},
{
"epoch": 0.16393442622950818,
"grad_norm": 0.6174322962760925,
"learning_rate": 2.998996768768956e-05,
"loss": 1.3616957664489746,
"step": 60
},
{
"epoch": 0.16939890710382513,
"grad_norm": 0.6061730980873108,
"learning_rate": 2.9977430792302124e-05,
"loss": 1.5163816213607788,
"step": 62
},
{
"epoch": 0.17486338797814208,
"grad_norm": 0.7894387245178223,
"learning_rate": 2.9959885661467903e-05,
"loss": 1.2052745819091797,
"step": 64
},
{
"epoch": 0.18032786885245902,
"grad_norm": 1.0945260524749756,
"learning_rate": 2.993733881498636e-05,
"loss": 1.3422211408615112,
"step": 66
},
{
"epoch": 0.18579234972677597,
"grad_norm": 0.4321194887161255,
"learning_rate": 2.9909798631302736e-05,
"loss": 1.3147833347320557,
"step": 68
},
{
"epoch": 0.1912568306010929,
"grad_norm": 0.9726943373680115,
"learning_rate": 2.987727534439457e-05,
"loss": 1.3389136791229248,
"step": 70
},
{
"epoch": 0.19672131147540983,
"grad_norm": 0.5656908750534058,
"learning_rate": 2.983978103996877e-05,
"loss": 1.1321386098861694,
"step": 72
},
{
"epoch": 0.20218579234972678,
"grad_norm": 0.39216819405555725,
"learning_rate": 2.9797329650970525e-05,
"loss": 1.3821684122085571,
"step": 74
},
{
"epoch": 0.20765027322404372,
"grad_norm": 0.6111781597137451,
"learning_rate": 2.974993695240579e-05,
"loss": 1.3336125612258911,
"step": 76
},
{
"epoch": 0.21311475409836064,
"grad_norm": 2.4362525939941406,
"learning_rate": 2.9697620555479297e-05,
"loss": 1.0982894897460938,
"step": 78
},
{
"epoch": 0.2185792349726776,
"grad_norm": 0.4370291531085968,
"learning_rate": 2.9640399901050182e-05,
"loss": 0.610292911529541,
"step": 80
},
{
"epoch": 0.22404371584699453,
"grad_norm": 0.46904757618904114,
"learning_rate": 2.9578296252407734e-05,
"loss": 1.3859418630599976,
"step": 82
},
{
"epoch": 0.22950819672131148,
"grad_norm": 0.4592580795288086,
"learning_rate": 2.9511332687369917e-05,
"loss": 0.7786137461662292,
"step": 84
},
{
"epoch": 0.23497267759562843,
"grad_norm": 0.2533802390098572,
"learning_rate": 2.9439534089707624e-05,
"loss": 1.3341573476791382,
"step": 86
},
{
"epoch": 0.24043715846994534,
"grad_norm": 0.4031212329864502,
"learning_rate": 2.9362927139897832e-05,
"loss": 1.2731207609176636,
"step": 88
},
{
"epoch": 0.2459016393442623,
"grad_norm": 1.3620141744613647,
"learning_rate": 2.9281540305209068e-05,
"loss": 1.0239676237106323,
"step": 90
},
{
"epoch": 0.25136612021857924,
"grad_norm": 0.9083771705627441,
"learning_rate": 2.919540382912294e-05,
"loss": 1.3299720287322998,
"step": 92
},
{
"epoch": 0.2568306010928962,
"grad_norm": 0.39796510338783264,
"learning_rate": 2.9104549720095634e-05,
"loss": 1.1993896961212158,
"step": 94
},
{
"epoch": 0.26229508196721313,
"grad_norm": 0.8477935194969177,
"learning_rate": 2.9009011739663467e-05,
"loss": 1.5670781135559082,
"step": 96
},
{
"epoch": 0.2677595628415301,
"grad_norm": 0.4144780933856964,
"learning_rate": 2.8908825389897094e-05,
"loss": 1.339431881904602,
"step": 98
},
{
"epoch": 0.273224043715847,
"grad_norm": 0.5135835409164429,
"learning_rate": 2.8804027900208843e-05,
"loss": 1.1886396408081055,
"step": 100
},
{
"epoch": 0.2786885245901639,
"grad_norm": 0.5162252187728882,
"learning_rate": 2.8694658213518226e-05,
"loss": 1.3355013132095337,
"step": 102
},
{
"epoch": 0.28415300546448086,
"grad_norm": 0.5908259749412537,
"learning_rate": 2.8580756971780686e-05,
"loss": 1.1072770357131958,
"step": 104
},
{
"epoch": 0.2896174863387978,
"grad_norm": 0.8564960360527039,
"learning_rate": 2.846236650088497e-05,
"loss": 1.297569990158081,
"step": 106
},
{
"epoch": 0.29508196721311475,
"grad_norm": 0.37959277629852295,
"learning_rate": 2.833953079492476e-05,
"loss": 1.4445881843566895,
"step": 108
},
{
"epoch": 0.3005464480874317,
"grad_norm": 0.48318997025489807,
"learning_rate": 2.82122954998504e-05,
"loss": 1.2766779661178589,
"step": 110
},
{
"epoch": 0.30601092896174864,
"grad_norm": 0.33517327904701233,
"learning_rate": 2.808070789650679e-05,
"loss": 1.2814470529556274,
"step": 112
},
{
"epoch": 0.3114754098360656,
"grad_norm": 0.9267619848251343,
"learning_rate": 2.7944816883063727e-05,
"loss": 1.4905495643615723,
"step": 114
},
{
"epoch": 0.31693989071038253,
"grad_norm": 0.5099186301231384,
"learning_rate": 2.7804672956845295e-05,
"loss": 1.2726738452911377,
"step": 116
},
{
"epoch": 0.3224043715846995,
"grad_norm": 0.472024142742157,
"learning_rate": 2.766032819556495e-05,
"loss": 1.0373400449752808,
"step": 118
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.705938458442688,
"learning_rate": 2.7511836237973366e-05,
"loss": 1.2890139818191528,
"step": 120
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.6098809838294983,
"learning_rate": 2.735925226392618e-05,
"loss": 1.3172739744186401,
"step": 122
},
{
"epoch": 0.33879781420765026,
"grad_norm": 0.3613099753856659,
"learning_rate": 2.7202632973879086e-05,
"loss": 1.0878937244415283,
"step": 124
},
{
"epoch": 0.3442622950819672,
"grad_norm": 0.274772971868515,
"learning_rate": 2.7042036567817838e-05,
"loss": 1.1205496788024902,
"step": 126
},
{
"epoch": 0.34972677595628415,
"grad_norm": 0.5086337327957153,
"learning_rate": 2.6877522723631036e-05,
"loss": 1.3235446214675903,
"step": 128
},
{
"epoch": 0.3551912568306011,
"grad_norm": 0.29697364568710327,
"learning_rate": 2.6709152574933727e-05,
"loss": 1.2779330015182495,
"step": 130
},
{
"epoch": 0.36065573770491804,
"grad_norm": 0.47989189624786377,
"learning_rate": 2.6536988688350067e-05,
"loss": 1.2393711805343628,
"step": 132
},
{
"epoch": 0.366120218579235,
"grad_norm": 0.7712145447731018,
"learning_rate": 2.6361095040263437e-05,
"loss": 1.1260766983032227,
"step": 134
},
{
"epoch": 0.37158469945355194,
"grad_norm": 1.069640874862671,
"learning_rate": 2.618153699304274e-05,
"loss": 1.049655795097351,
"step": 136
},
{
"epoch": 0.3770491803278688,
"grad_norm": 0.2968403398990631,
"learning_rate": 2.599838127075361e-05,
"loss": 1.030115008354187,
"step": 138
},
{
"epoch": 0.3825136612021858,
"grad_norm": 0.40293624997138977,
"learning_rate": 2.5811695934363666e-05,
"loss": 0.6581886410713196,
"step": 140
},
{
"epoch": 0.3879781420765027,
"grad_norm": 0.43892744183540344,
"learning_rate": 2.5621550356450914e-05,
"loss": 1.237864375114441,
"step": 142
},
{
"epoch": 0.39344262295081966,
"grad_norm": 0.41456156969070435,
"learning_rate": 2.5428015195424825e-05,
"loss": 1.2691991329193115,
"step": 144
},
{
"epoch": 0.3989071038251366,
"grad_norm": 0.5273171067237854,
"learning_rate": 2.5231162369269498e-05,
"loss": 1.226870059967041,
"step": 146
},
{
"epoch": 0.40437158469945356,
"grad_norm": 0.35689717531204224,
"learning_rate": 2.503106502881889e-05,
"loss": 1.220680832862854,
"step": 148
},
{
"epoch": 0.4098360655737705,
"grad_norm": 0.7350449562072754,
"learning_rate": 2.4827797530573762e-05,
"loss": 1.1639314889907837,
"step": 150
},
{
"epoch": 0.41530054644808745,
"grad_norm": 0.47809040546417236,
"learning_rate": 2.4621435409070757e-05,
"loss": 1.266118049621582,
"step": 152
},
{
"epoch": 0.4207650273224044,
"grad_norm": 3.797673463821411,
"learning_rate": 2.4412055348813602e-05,
"loss": 1.2429925203323364,
"step": 154
},
{
"epoch": 0.4262295081967213,
"grad_norm": 0.36509862542152405,
"learning_rate": 2.4199735155777017e-05,
"loss": 1.296124815940857,
"step": 156
},
{
"epoch": 0.43169398907103823,
"grad_norm": 0.6357290148735046,
"learning_rate": 2.3984553728493914e-05,
"loss": 1.0684611797332764,
"step": 158
},
{
"epoch": 0.4371584699453552,
"grad_norm": 0.4296596944332123,
"learning_rate": 2.3766591028736547e-05,
"loss": 1.2166671752929688,
"step": 160
},
{
"epoch": 0.4426229508196721,
"grad_norm": 1.0483851432800293,
"learning_rate": 2.3545928051802588e-05,
"loss": 0.7761582732200623,
"step": 162
},
{
"epoch": 0.44808743169398907,
"grad_norm": 0.4189485013484955,
"learning_rate": 2.332264679641717e-05,
"loss": 1.6205989122390747,
"step": 164
},
{
"epoch": 0.453551912568306,
"grad_norm": 0.7714853882789612,
"learning_rate": 2.3096830234261996e-05,
"loss": 0.7628769874572754,
"step": 166
},
{
"epoch": 0.45901639344262296,
"grad_norm": 0.6214097142219543,
"learning_rate": 2.2868562279142912e-05,
"loss": 1.1566860675811768,
"step": 168
},
{
"epoch": 0.4644808743169399,
"grad_norm": 0.5600799322128296,
"learning_rate": 2.2637927755807458e-05,
"loss": 1.2521134614944458,
"step": 170
},
{
"epoch": 0.46994535519125685,
"grad_norm": 1.0058698654174805,
"learning_rate": 2.2405012368423786e-05,
"loss": 1.3131451606750488,
"step": 172
},
{
"epoch": 0.47540983606557374,
"grad_norm": 0.9977296590805054,
"learning_rate": 2.2169902668732893e-05,
"loss": 1.4315543174743652,
"step": 174
},
{
"epoch": 0.4808743169398907,
"grad_norm": 0.2930877208709717,
"learning_rate": 2.193268602388583e-05,
"loss": 1.2214438915252686,
"step": 176
},
{
"epoch": 0.48633879781420764,
"grad_norm": 0.426462322473526,
"learning_rate": 2.1693450583977953e-05,
"loss": 1.1797651052474976,
"step": 178
},
{
"epoch": 0.4918032786885246,
"grad_norm": 0.28153467178344727,
"learning_rate": 2.1452285249292147e-05,
"loss": 1.1794407367706299,
"step": 180
},
{
"epoch": 0.4972677595628415,
"grad_norm": 0.5039904713630676,
"learning_rate": 2.12092796372634e-05,
"loss": 1.1637710332870483,
"step": 182
},
{
"epoch": 0.5027322404371585,
"grad_norm": 0.739661693572998,
"learning_rate": 2.096452404917679e-05,
"loss": 1.3269554376602173,
"step": 184
},
{
"epoch": 0.5081967213114754,
"grad_norm": 1.9196809530258179,
"learning_rate": 2.0718109436611348e-05,
"loss": 1.22355055809021,
"step": 186
},
{
"epoch": 0.5136612021857924,
"grad_norm": 0.6393678784370422,
"learning_rate": 2.0470127367642345e-05,
"loss": 1.1911969184875488,
"step": 188
},
{
"epoch": 0.5191256830601093,
"grad_norm": 0.3692255914211273,
"learning_rate": 2.022066999281444e-05,
"loss": 1.2380821704864502,
"step": 190
},
{
"epoch": 0.5245901639344263,
"grad_norm": 0.5736802816390991,
"learning_rate": 1.9969830010898358e-05,
"loss": 1.2054179906845093,
"step": 192
},
{
"epoch": 0.5300546448087432,
"grad_norm": 0.38136202096939087,
"learning_rate": 1.9717700634443903e-05,
"loss": 1.2205357551574707,
"step": 194
},
{
"epoch": 0.5355191256830601,
"grad_norm": 0.4319143295288086,
"learning_rate": 1.9464375555142e-05,
"loss": 1.278338074684143,
"step": 196
},
{
"epoch": 0.5409836065573771,
"grad_norm": 0.4084922969341278,
"learning_rate": 1.9209948909008734e-05,
"loss": 1.1067121028900146,
"step": 198
},
{
"epoch": 0.546448087431694,
"grad_norm": 0.738842785358429,
"learning_rate": 1.8954515241404218e-05,
"loss": 1.244836449623108,
"step": 200
},
{
"epoch": 0.5519125683060109,
"grad_norm": 0.3579850494861603,
"learning_rate": 1.8698169471899414e-05,
"loss": 1.069273591041565,
"step": 202
},
{
"epoch": 0.5573770491803278,
"grad_norm": 2.0719704627990723,
"learning_rate": 1.8441006859003842e-05,
"loss": 0.933051347732544,
"step": 204
},
{
"epoch": 0.5628415300546448,
"grad_norm": 0.381488174200058,
"learning_rate": 1.818312296476737e-05,
"loss": 1.243231177330017,
"step": 206
},
{
"epoch": 0.5683060109289617,
"grad_norm": 0.4084261655807495,
"learning_rate": 1.792461361926921e-05,
"loss": 1.0697021484375,
"step": 208
},
{
"epoch": 0.5737704918032787,
"grad_norm": 0.5111251473426819,
"learning_rate": 1.766557488500727e-05,
"loss": 1.199232816696167,
"step": 210
},
{
"epoch": 0.5792349726775956,
"grad_norm": 0.48198121786117554,
"learning_rate": 1.7406103021201212e-05,
"loss": 1.506237268447876,
"step": 212
},
{
"epoch": 0.5846994535519126,
"grad_norm": 0.4809795916080475,
"learning_rate": 1.7146294448022335e-05,
"loss": 1.2066627740859985,
"step": 214
},
{
"epoch": 0.5901639344262295,
"grad_norm": 0.7484958171844482,
"learning_rate": 1.688624571076371e-05,
"loss": 1.5664700269699097,
"step": 216
},
{
"epoch": 0.5956284153005464,
"grad_norm": 0.600191056728363,
"learning_rate": 1.6626053443963762e-05,
"loss": 1.1993545293807983,
"step": 218
},
{
"epoch": 0.6010928961748634,
"grad_norm": 0.40568259358406067,
"learning_rate": 1.636581433549674e-05,
"loss": 1.2419377565383911,
"step": 220
},
{
"epoch": 0.6065573770491803,
"grad_norm": 0.5247995257377625,
"learning_rate": 1.610562509064332e-05,
"loss": 1.111983060836792,
"step": 222
},
{
"epoch": 0.6120218579234973,
"grad_norm": 0.718986988067627,
"learning_rate": 1.5845582396154786e-05,
"loss": 1.018728256225586,
"step": 224
},
{
"epoch": 0.6174863387978142,
"grad_norm": 0.5365363359451294,
"learning_rate": 1.5585782884324064e-05,
"loss": 1.2150883674621582,
"step": 226
},
{
"epoch": 0.6229508196721312,
"grad_norm": 1.9675040245056152,
"learning_rate": 1.5326323097077015e-05,
"loss": 1.1990766525268555,
"step": 228
},
{
"epoch": 0.6284153005464481,
"grad_norm": 0.4135526418685913,
"learning_rate": 1.5067299450097261e-05,
"loss": 0.7576482892036438,
"step": 230
},
{
"epoch": 0.6338797814207651,
"grad_norm": 0.4321082532405853,
"learning_rate": 1.4808808196998006e-05,
"loss": 1.2172483205795288,
"step": 232
},
{
"epoch": 0.639344262295082,
"grad_norm": 0.6618289947509766,
"learning_rate": 1.4550945393554004e-05,
"loss": 0.907590925693512,
"step": 234
},
{
"epoch": 0.644808743169399,
"grad_norm": 1.15780770778656,
"learning_rate": 1.4293806862007085e-05,
"loss": 0.8806172013282776,
"step": 236
},
{
"epoch": 0.6502732240437158,
"grad_norm": 0.828869104385376,
"learning_rate": 1.4037488155458448e-05,
"loss": 1.2819329500198364,
"step": 238
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.6129389405250549,
"learning_rate": 1.3782084522360981e-05,
"loss": 1.1803516149520874,
"step": 240
},
{
"epoch": 0.6612021857923497,
"grad_norm": 1.0248076915740967,
"learning_rate": 1.3527690871124762e-05,
"loss": 1.1249548196792603,
"step": 242
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.5638490915298462,
"learning_rate": 1.3274401734848958e-05,
"loss": 1.0163003206253052,
"step": 244
},
{
"epoch": 0.6721311475409836,
"grad_norm": 0.5262971520423889,
"learning_rate": 1.3022311236193156e-05,
"loss": 1.1787549257278442,
"step": 246
},
{
"epoch": 0.6775956284153005,
"grad_norm": 0.39949288964271545,
"learning_rate": 1.2771513052401236e-05,
"loss": 1.225506067276001,
"step": 248
},
{
"epoch": 0.6830601092896175,
"grad_norm": 0.3623184263706207,
"learning_rate": 1.2522100380490744e-05,
"loss": 1.1974411010742188,
"step": 250
},
{
"epoch": 0.6885245901639344,
"grad_norm": 0.510343611240387,
"learning_rate": 1.2274165902620732e-05,
"loss": 1.1023813486099243,
"step": 252
},
{
"epoch": 0.6939890710382514,
"grad_norm": 0.5542072057723999,
"learning_rate": 1.2027801751650918e-05,
"loss": 1.1539828777313232,
"step": 254
},
{
"epoch": 0.6994535519125683,
"grad_norm": 0.4000115394592285,
"learning_rate": 1.1783099476904972e-05,
"loss": 1.1491655111312866,
"step": 256
},
{
"epoch": 0.7049180327868853,
"grad_norm": 0.4911912977695465,
"learning_rate": 1.1540150010150599e-05,
"loss": 1.0607776641845703,
"step": 258
},
{
"epoch": 0.7103825136612022,
"grad_norm": 0.43882060050964355,
"learning_rate": 1.1299043631809205e-05,
"loss": 1.1459139585494995,
"step": 260
},
{
"epoch": 0.7158469945355191,
"grad_norm": 1.452078938484192,
"learning_rate": 1.1059869937407486e-05,
"loss": 1.2439805269241333,
"step": 262
},
{
"epoch": 0.7213114754098361,
"grad_norm": 0.39000415802001953,
"learning_rate": 1.082271780428362e-05,
"loss": 1.1889139413833618,
"step": 264
},
{
"epoch": 0.726775956284153,
"grad_norm": 0.4281291365623474,
"learning_rate": 1.0587675358560278e-05,
"loss": 1.0366110801696777,
"step": 266
},
{
"epoch": 0.73224043715847,
"grad_norm": 0.8248846530914307,
"learning_rate": 1.0354829942396837e-05,
"loss": 1.159615159034729,
"step": 268
},
{
"epoch": 0.7377049180327869,
"grad_norm": 0.3553898334503174,
"learning_rate": 1.012426808153287e-05,
"loss": 1.1739202737808228,
"step": 270
},
{
"epoch": 0.7431693989071039,
"grad_norm": 0.46099165081977844,
"learning_rate": 9.896075453135039e-06,
"loss": 1.1385735273361206,
"step": 272
},
{
"epoch": 0.7486338797814208,
"grad_norm": 1.5474929809570312,
"learning_rate": 9.67033685395934e-06,
"loss": 1.158645749092102,
"step": 274
},
{
"epoch": 0.7540983606557377,
"grad_norm": 1.8595592975616455,
"learning_rate": 9.447136168840466e-06,
"loss": 1.2116674184799194,
"step": 276
},
{
"epoch": 0.7595628415300546,
"grad_norm": 0.9305843710899353,
"learning_rate": 9.226556339520069e-06,
"loss": 1.1799819469451904,
"step": 278
},
{
"epoch": 0.7650273224043715,
"grad_norm": 1.2089946269989014,
"learning_rate": 9.008679333825478e-06,
"loss": 0.8697497844696045,
"step": 280
},
{
"epoch": 0.7704918032786885,
"grad_norm": 0.561233639717102,
"learning_rate": 8.793586115210326e-06,
"loss": 1.1774860620498657,
"step": 282
},
{
"epoch": 0.7759562841530054,
"grad_norm": 0.5194445848464966,
"learning_rate": 8.581356612668382e-06,
"loss": 1.1665196418762207,
"step": 284
},
{
"epoch": 0.7814207650273224,
"grad_norm": 0.7688720226287842,
"learning_rate": 8.372069691031804e-06,
"loss": 1.1759231090545654,
"step": 286
},
{
"epoch": 0.7868852459016393,
"grad_norm": 0.272182822227478,
"learning_rate": 8.165803121664869e-06,
"loss": 1.145310401916504,
"step": 288
},
{
"epoch": 0.7923497267759563,
"grad_norm": 0.3153116703033447,
"learning_rate": 7.962633553563965e-06,
"loss": 1.1056978702545166,
"step": 290
},
{
"epoch": 0.7978142076502732,
"grad_norm": 2.426980495452881,
"learning_rate": 7.762636484874723e-06,
"loss": 1.1285853385925293,
"step": 292
},
{
"epoch": 0.8032786885245902,
"grad_norm": 0.5193243026733398,
"learning_rate": 7.565886234836767e-06,
"loss": 1.146580696105957,
"step": 294
},
{
"epoch": 0.8087431693989071,
"grad_norm": 2.4626994132995605,
"learning_rate": 7.3724559161665876e-06,
"loss": 1.4981356859207153,
"step": 296
},
{
"epoch": 0.8142076502732241,
"grad_norm": 1.1054961681365967,
"learning_rate": 7.182417407888703e-06,
"loss": 0.845654308795929,
"step": 298
},
{
"epoch": 0.819672131147541,
"grad_norm": 1.4546505212783813,
"learning_rate": 6.995841328625321e-06,
"loss": 1.565701961517334,
"step": 300
},
{
"epoch": 0.825136612021858,
"grad_norm": 0.6991487741470337,
"learning_rate": 6.812797010354325e-06,
"loss": 1.186747431755066,
"step": 302
},
{
"epoch": 0.8306010928961749,
"grad_norm": 0.5305312871932983,
"learning_rate": 6.63335247264542e-06,
"loss": 0.7601019144058228,
"step": 304
},
{
"epoch": 0.8360655737704918,
"grad_norm": 1.119845986366272,
"learning_rate": 6.457574397383919e-06,
"loss": 1.5136618614196777,
"step": 306
},
{
"epoch": 0.8415300546448088,
"grad_norm": 0.4489692747592926,
"learning_rate": 6.285528103991665e-06,
"loss": 0.8288950324058533,
"step": 308
},
{
"epoch": 0.8469945355191257,
"grad_norm": 0.5143682360649109,
"learning_rate": 6.117277525154225e-06,
"loss": 1.1981096267700195,
"step": 310
},
{
"epoch": 0.8524590163934426,
"grad_norm": 0.5933207273483276,
"learning_rate": 5.952885183063397e-06,
"loss": 1.180372714996338,
"step": 312
},
{
"epoch": 0.8579234972677595,
"grad_norm": 0.4110560417175293,
"learning_rate": 5.792412166183841e-06,
"loss": 1.1811316013336182,
"step": 314
},
{
"epoch": 0.8633879781420765,
"grad_norm": 1.3905489444732666,
"learning_rate": 5.635918106552546e-06,
"loss": 1.187584400177002,
"step": 316
},
{
"epoch": 0.8688524590163934,
"grad_norm": 0.47751036286354065,
"learning_rate": 5.483461157619428e-06,
"loss": 1.1209925413131714,
"step": 318
},
{
"epoch": 0.8743169398907104,
"grad_norm": 0.33796951174736023,
"learning_rate": 5.335097972637441e-06,
"loss": 1.3076804876327515,
"step": 320
},
{
"epoch": 0.8797814207650273,
"grad_norm": 0.3710154891014099,
"learning_rate": 5.1908836836101135e-06,
"loss": 1.1038309335708618,
"step": 322
},
{
"epoch": 0.8852459016393442,
"grad_norm": 0.7831116914749146,
"learning_rate": 5.050871880804414e-06,
"loss": 0.5361250638961792,
"step": 324
},
{
"epoch": 0.8907103825136612,
"grad_norm": 0.4909685254096985,
"learning_rate": 4.915114592836521e-06,
"loss": 1.1557132005691528,
"step": 326
},
{
"epoch": 0.8961748633879781,
"grad_norm": 1.0579615831375122,
"learning_rate": 4.783662267337909e-06,
"loss": 1.0232505798339844,
"step": 328
},
{
"epoch": 0.9016393442622951,
"grad_norm": 0.5031599402427673,
"learning_rate": 4.656563752208907e-06,
"loss": 1.1777091026306152,
"step": 330
},
{
"epoch": 0.907103825136612,
"grad_norm": 0.8965206742286682,
"learning_rate": 4.533866277466767e-06,
"loss": 1.2399661540985107,
"step": 332
},
{
"epoch": 0.912568306010929,
"grad_norm": 1.0849262475967407,
"learning_rate": 4.415615437694876e-06,
"loss": 1.0352680683135986,
"step": 334
},
{
"epoch": 0.9180327868852459,
"grad_norm": 1.7745161056518555,
"learning_rate": 4.3018551750997694e-06,
"loss": 1.1363130807876587,
"step": 336
},
{
"epoch": 0.9234972677595629,
"grad_norm": 0.6566275358200073,
"learning_rate": 4.192627763182111e-06,
"loss": 1.1563678979873657,
"step": 338
},
{
"epoch": 0.9289617486338798,
"grad_norm": 0.44895005226135254,
"learning_rate": 4.087973791027797e-06,
"loss": 1.153563141822815,
"step": 340
},
{
"epoch": 0.9344262295081968,
"grad_norm": 0.4270648658275604,
"learning_rate": 3.987932148224993e-06,
"loss": 1.1889362335205078,
"step": 342
},
{
"epoch": 0.9398907103825137,
"grad_norm": 1.5882360935211182,
"learning_rate": 3.8925400104126834e-06,
"loss": 1.1308345794677734,
"step": 344
},
{
"epoch": 0.9453551912568307,
"grad_norm": 0.4681449830532074,
"learning_rate": 3.8018328254661618e-06,
"loss": 1.5146846771240234,
"step": 346
},
{
"epoch": 0.9508196721311475,
"grad_norm": 0.9261354207992554,
"learning_rate": 3.715844300324527e-06,
"loss": 0.8396592736244202,
"step": 348
},
{
"epoch": 0.9562841530054644,
"grad_norm": 0.41446763277053833,
"learning_rate": 3.6346063884651327e-06,
"loss": 0.7924999594688416,
"step": 350
},
{
"epoch": 0.9617486338797814,
"grad_norm": 0.5155854821205139,
"learning_rate": 3.558149278029624e-06,
"loss": 1.1450321674346924,
"step": 352
},
{
"epoch": 0.9672131147540983,
"grad_norm": 0.42487382888793945,
"learning_rate": 3.4865013806059817e-06,
"loss": 0.8312717080116272,
"step": 354
},
{
"epoch": 0.9726775956284153,
"grad_norm": 0.36651793122291565,
"learning_rate": 3.419689320670712e-06,
"loss": 1.0396209955215454,
"step": 356
},
{
"epoch": 0.9781420765027322,
"grad_norm": 0.5737856030464172,
"learning_rate": 3.35773792569517e-06,
"loss": 0.7241445183753967,
"step": 358
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.423976868391037,
"learning_rate": 3.300670216919602e-06,
"loss": 1.1488103866577148,
"step": 360
},
{
"epoch": 0.9890710382513661,
"grad_norm": 4.771485805511475,
"learning_rate": 3.2485074007984468e-06,
"loss": 0.9309545159339905,
"step": 362
},
{
"epoch": 0.994535519125683,
"grad_norm": 0.43005508184432983,
"learning_rate": 3.2012688611199566e-06,
"loss": 1.1338376998901367,
"step": 364
},
{
"epoch": 1.0,
"grad_norm": 0.508982241153717,
"learning_rate": 3.158972151803165e-06,
"loss": 1.1149035692214966,
"step": 366
},
{
"epoch": 1.005464480874317,
"grad_norm": 0.3700520098209381,
"learning_rate": 3.1216329903748095e-06,
"loss": 0.9985525012016296,
"step": 368
},
{
"epoch": 1.010928961748634,
"grad_norm": 0.7943176031112671,
"learning_rate": 3.089265252128686e-06,
"loss": 1.105968952178955,
"step": 370
},
{
"epoch": 1.0163934426229508,
"grad_norm": 1.9119828939437866,
"learning_rate": 3.061880964969555e-06,
"loss": 1.0278416872024536,
"step": 372
},
{
"epoch": 1.0218579234972678,
"grad_norm": 0.6963576078414917,
"learning_rate": 3.039490304943562e-06,
"loss": 1.0062611103057861,
"step": 374
},
{
"epoch": 1.0273224043715847,
"grad_norm": 0.7515604496002197,
"learning_rate": 3.022101592456795e-06,
"loss": 0.8461394906044006,
"step": 376
},
{
"epoch": 1.0327868852459017,
"grad_norm": 0.6455757021903992,
"learning_rate": 3.0097212891834095e-06,
"loss": 0.9858485460281372,
"step": 378
},
{
"epoch": 1.0382513661202186,
"grad_norm": 0.49385198950767517,
"learning_rate": 3.0023539956644634e-06,
"loss": 0.6368663907051086,
"step": 380
},
{
"epoch": 1.0437158469945356,
"grad_norm": 0.5183748006820679,
"learning_rate": 3.0000024495983428e-06,
"loss": 0.9209074378013611,
"step": 382
},
{
"epoch": 1.0491803278688525,
"grad_norm": 0.45442020893096924,
"learning_rate": 3.002667524823434e-06,
"loss": 1.0152575969696045,
"step": 384
},
{
"epoch": 1.0546448087431695,
"grad_norm": 0.5821099281311035,
"learning_rate": 3.010348230993402e-06,
"loss": 0.589934766292572,
"step": 386
},
{
"epoch": 1.0601092896174864,
"grad_norm": 0.497645765542984,
"learning_rate": 3.0230417139451987e-06,
"loss": 0.5414527654647827,
"step": 388
},
{
"epoch": 1.0655737704918034,
"grad_norm": 0.64039146900177,
"learning_rate": 3.0407432567596883e-06,
"loss": 1.07064688205719,
"step": 390
},
{
"epoch": 1.0710382513661203,
"grad_norm": 0.5888509750366211,
"learning_rate": 3.0634462815144474e-06,
"loss": 1.0028784275054932,
"step": 392
},
{
"epoch": 1.0765027322404372,
"grad_norm": 0.6519098281860352,
"learning_rate": 3.0911423517281404e-06,
"loss": 0.54591304063797,
"step": 394
},
{
"epoch": 1.0819672131147542,
"grad_norm": 0.5070369839668274,
"learning_rate": 3.1238211754955294e-06,
"loss": 0.9594630599021912,
"step": 396
},
{
"epoch": 1.0874316939890711,
"grad_norm": 0.42379260063171387,
"learning_rate": 3.161470609311961e-06,
"loss": 0.913062334060669,
"step": 398
},
{
"epoch": 1.092896174863388,
"grad_norm": 0.7389850616455078,
"learning_rate": 3.2040766625859115e-06,
"loss": 1.0678118467330933,
"step": 400
},
{
"epoch": 1.098360655737705,
"grad_norm": 1.0723117589950562,
"learning_rate": 3.2516235028379157e-06,
"loss": 0.9257262349128723,
"step": 402
},
{
"epoch": 1.1038251366120218,
"grad_norm": 0.5175579190254211,
"learning_rate": 3.304093461583944e-06,
"loss": 0.9655621647834778,
"step": 404
},
{
"epoch": 1.1092896174863387,
"grad_norm": 1.053481936454773,
"learning_rate": 3.3614670409010353e-06,
"loss": 1.0419620275497437,
"step": 406
},
{
"epoch": 1.1147540983606556,
"grad_norm": 0.9978030920028687,
"learning_rate": 3.4237229206727602e-06,
"loss": 0.6634020209312439,
"step": 408
},
{
"epoch": 1.1202185792349726,
"grad_norm": 0.5816113948822021,
"learning_rate": 3.490837966511817e-06,
"loss": 1.1328160762786865,
"step": 410
},
{
"epoch": 1.1256830601092895,
"grad_norm": 1.0110743045806885,
"learning_rate": 3.5627872383567937e-06,
"loss": 0.6518710851669312,
"step": 412
},
{
"epoch": 1.1311475409836065,
"grad_norm": 0.43525078892707825,
"learning_rate": 3.6395439997399494e-06,
"loss": 0.6946133375167847,
"step": 414
},
{
"epoch": 1.1366120218579234,
"grad_norm": 1.5146125555038452,
"learning_rate": 3.721079727722522e-06,
"loss": 0.6212134957313538,
"step": 416
},
{
"epoch": 1.1420765027322404,
"grad_norm": 0.5821154713630676,
"learning_rate": 3.8073641234939055e-06,
"loss": 0.9967427849769592,
"step": 418
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.48840880393981934,
"learning_rate": 3.898365123630732e-06,
"loss": 0.8896088600158691,
"step": 420
},
{
"epoch": 1.1530054644808743,
"grad_norm": 0.509731650352478,
"learning_rate": 3.994048912011692e-06,
"loss": 0.9078909754753113,
"step": 422
},
{
"epoch": 1.1584699453551912,
"grad_norm": 0.754647433757782,
"learning_rate": 4.094379932383666e-06,
"loss": 0.702852725982666,
"step": 424
},
{
"epoch": 1.1639344262295082,
"grad_norm": 0.5465433597564697,
"learning_rate": 4.199320901574489e-06,
"loss": 0.6045834422111511,
"step": 426
},
{
"epoch": 1.169398907103825,
"grad_norm": 0.5529078245162964,
"learning_rate": 4.3088328233474185e-06,
"loss": 1.0903806686401367,
"step": 428
},
{
"epoch": 1.174863387978142,
"grad_norm": 0.4957817494869232,
"learning_rate": 4.422875002892234e-06,
"loss": 0.9784116744995117,
"step": 430
},
{
"epoch": 1.180327868852459,
"grad_norm": 0.5805404782295227,
"learning_rate": 4.54140506194747e-06,
"loss": 0.9702048897743225,
"step": 432
},
{
"epoch": 1.185792349726776,
"grad_norm": 0.6367265582084656,
"learning_rate": 4.664378954548241e-06,
"loss": 0.7683918476104736,
"step": 434
},
{
"epoch": 1.1912568306010929,
"grad_norm": 0.4415365159511566,
"learning_rate": 4.791750983393832e-06,
"loss": 0.966478168964386,
"step": 436
},
{
"epoch": 1.1967213114754098,
"grad_norm": 0.3349447548389435,
"learning_rate": 4.9234738168288466e-06,
"loss": 0.9200465083122253,
"step": 438
},
{
"epoch": 1.2021857923497268,
"grad_norm": 0.4064251184463501,
"learning_rate": 5.059498506431758e-06,
"loss": 0.8399279713630676,
"step": 440
},
{
"epoch": 1.2076502732240437,
"grad_norm": 0.40499797463417053,
"learning_rate": 5.199774505204206e-06,
"loss": 0.9804560542106628,
"step": 442
},
{
"epoch": 1.2131147540983607,
"grad_norm": 0.42770370841026306,
"learning_rate": 5.344249686354357e-06,
"loss": 0.9823156595230103,
"step": 444
},
{
"epoch": 1.2185792349726776,
"grad_norm": 0.5131499767303467,
"learning_rate": 5.492870362667299e-06,
"loss": 0.5317606925964355,
"step": 446
},
{
"epoch": 1.2240437158469946,
"grad_norm": 0.3947273790836334,
"learning_rate": 5.645581306455302e-06,
"loss": 0.8337725400924683,
"step": 448
},
{
"epoch": 1.2295081967213115,
"grad_norm": 0.5377382040023804,
"learning_rate": 5.802325770080506e-06,
"loss": 0.9497036337852478,
"step": 450
},
{
"epoch": 1.2349726775956285,
"grad_norm": 0.7948735952377319,
"learning_rate": 5.96304550704246e-06,
"loss": 0.9841055274009705,
"step": 452
},
{
"epoch": 1.2404371584699454,
"grad_norm": 1.0466973781585693,
"learning_rate": 6.127680793622588e-06,
"loss": 0.38940808176994324,
"step": 454
},
{
"epoch": 1.2459016393442623,
"grad_norm": 1.6290979385375977,
"learning_rate": 6.296170451077657e-06,
"loss": 0.8338759541511536,
"step": 456
},
{
"epoch": 1.2513661202185793,
"grad_norm": 0.42051175236701965,
"learning_rate": 6.468451868373856e-06,
"loss": 0.6538374423980713,
"step": 458
},
{
"epoch": 1.2568306010928962,
"grad_norm": 0.40463048219680786,
"learning_rate": 6.6444610254532e-06,
"loss": 0.979476273059845,
"step": 460
},
{
"epoch": 1.2622950819672132,
"grad_norm": 1.2305693626403809,
"learning_rate": 6.824132517023449e-06,
"loss": 0.6208479404449463,
"step": 462
},
{
"epoch": 1.2677595628415301,
"grad_norm": 0.5815154314041138,
"learning_rate": 7.007399576862872e-06,
"loss": 0.9388564825057983,
"step": 464
},
{
"epoch": 1.273224043715847,
"grad_norm": 0.46247026324272156,
"learning_rate": 7.1941941026306275e-06,
"loss": 0.9793145060539246,
"step": 466
},
{
"epoch": 1.278688524590164,
"grad_norm": 0.36669930815696716,
"learning_rate": 7.3844466811737555e-06,
"loss": 0.8605738878250122,
"step": 468
},
{
"epoch": 1.2841530054644807,
"grad_norm": 0.5819079279899597,
"learning_rate": 7.578086614321175e-06,
"loss": 0.9266294836997986,
"step": 470
},
{
"epoch": 1.289617486338798,
"grad_norm": 0.3731381893157959,
"learning_rate": 7.775041945155295e-06,
"loss": 1.2316067218780518,
"step": 472
},
{
"epoch": 1.2950819672131146,
"grad_norm": 0.4378661513328552,
"learning_rate": 7.975239484751258e-06,
"loss": 0.8832852840423584,
"step": 474
},
{
"epoch": 1.3005464480874318,
"grad_norm": 1.3814300298690796,
"learning_rate": 8.178604839374125e-06,
"loss": 0.5840453505516052,
"step": 476
},
{
"epoch": 1.3060109289617485,
"grad_norm": 0.5126817226409912,
"learning_rate": 8.385062438123673e-06,
"loss": 1.0483534336090088,
"step": 478
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.3563435971736908,
"learning_rate": 8.594535561016661e-06,
"loss": 0.9596335291862488,
"step": 480
},
{
"epoch": 1.3169398907103824,
"grad_norm": 0.6490895748138428,
"learning_rate": 8.806946367496155e-06,
"loss": 0.9330964684486389,
"step": 482
},
{
"epoch": 1.3224043715846996,
"grad_norm": 0.7669457197189331,
"learning_rate": 9.02221592535712e-06,
"loss": 0.6217712163925171,
"step": 484
},
{
"epoch": 1.3278688524590163,
"grad_norm": 0.6613099575042725,
"learning_rate": 9.240264240077859e-06,
"loss": 0.9006657004356384,
"step": 486
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.9986913204193115,
"learning_rate": 9.461010284546016e-06,
"loss": 1.0438649654388428,
"step": 488
},
{
"epoch": 1.3387978142076502,
"grad_norm": 0.746275007724762,
"learning_rate": 9.684372029168438e-06,
"loss": 0.6099355220794678,
"step": 490
},
{
"epoch": 1.3442622950819672,
"grad_norm": 0.6011969447135925,
"learning_rate": 9.91026647235348e-06,
"loss": 1.0746405124664307,
"step": 492
},
{
"epoch": 1.349726775956284,
"grad_norm": 0.22186622023582458,
"learning_rate": 1.0138609671354586e-05,
"loss": 0.8016988039016724,
"step": 494
},
{
"epoch": 1.355191256830601,
"grad_norm": 0.49686399102211,
"learning_rate": 1.0369316773463458e-05,
"loss": 1.0189622640609741,
"step": 496
},
{
"epoch": 1.360655737704918,
"grad_norm": 1.1691715717315674,
"learning_rate": 1.0602302047541566e-05,
"loss": 0.943423330783844,
"step": 498
},
{
"epoch": 1.366120218579235,
"grad_norm": 0.6693958640098572,
"learning_rate": 1.083747891587788e-05,
"loss": 1.1473667621612549,
"step": 500
},
{
"epoch": 1.3715846994535519,
"grad_norm": 0.3323294222354889,
"learning_rate": 1.1074759986361392e-05,
"loss": 0.9242939352989197,
"step": 502
},
{
"epoch": 1.3770491803278688,
"grad_norm": 5.01376485824585,
"learning_rate": 1.1314057084956073e-05,
"loss": 0.5509817600250244,
"step": 504
},
{
"epoch": 1.3825136612021858,
"grad_norm": 0.35369741916656494,
"learning_rate": 1.1555281288466553e-05,
"loss": 0.7877460718154907,
"step": 506
},
{
"epoch": 1.3879781420765027,
"grad_norm": 0.8414467573165894,
"learning_rate": 1.1798342957582084e-05,
"loss": 0.9529051184654236,
"step": 508
},
{
"epoch": 1.3934426229508197,
"grad_norm": 0.41061052680015564,
"learning_rate": 1.2043151770186725e-05,
"loss": 0.9338226318359375,
"step": 510
},
{
"epoch": 1.3989071038251366,
"grad_norm": 0.8062289953231812,
"learning_rate": 1.2289616754923078e-05,
"loss": 0.9093165397644043,
"step": 512
},
{
"epoch": 1.4043715846994536,
"grad_norm": 0.296541303396225,
"learning_rate": 1.253764632499752e-05,
"loss": 0.7254652380943298,
"step": 514
},
{
"epoch": 1.4098360655737705,
"grad_norm": 0.7755069732666016,
"learning_rate": 1.2787148312213901e-05,
"loss": 0.7038006782531738,
"step": 516
},
{
"epoch": 1.4153005464480874,
"grad_norm": 0.47872284054756165,
"learning_rate": 1.3038030001223439e-05,
"loss": 0.9423065781593323,
"step": 518
},
{
"epoch": 1.4207650273224044,
"grad_norm": 1.5793722867965698,
"learning_rate": 1.3290198163977933e-05,
"loss": 0.6078845262527466,
"step": 520
},
{
"epoch": 1.4262295081967213,
"grad_norm": 0.576677143573761,
"learning_rate": 1.3543559094373372e-05,
"loss": 1.0813558101654053,
"step": 522
},
{
"epoch": 1.4316939890710383,
"grad_norm": 0.40600138902664185,
"learning_rate": 1.3798018643071386e-05,
"loss": 0.9510488510131836,
"step": 524
},
{
"epoch": 1.4371584699453552,
"grad_norm": 0.4948050081729889,
"learning_rate": 1.4053482252485178e-05,
"loss": 0.9433645009994507,
"step": 526
},
{
"epoch": 1.4426229508196722,
"grad_norm": 0.39013397693634033,
"learning_rate": 1.4309854991917388e-05,
"loss": 0.5906414985656738,
"step": 528
},
{
"epoch": 1.4480874316939891,
"grad_norm": 1.0018880367279053,
"learning_rate": 1.4567041592836413e-05,
"loss": 0.7334268689155579,
"step": 530
},
{
"epoch": 1.453551912568306,
"grad_norm": 0.9915137887001038,
"learning_rate": 1.48249464842784e-05,
"loss": 0.7494279742240906,
"step": 532
},
{
"epoch": 1.459016393442623,
"grad_norm": 0.4450792968273163,
"learning_rate": 1.508347382836153e-05,
"loss": 1.036052942276001,
"step": 534
},
{
"epoch": 1.46448087431694,
"grad_norm": 0.4731960892677307,
"learning_rate": 1.534252755589961e-05,
"loss": 0.9620227217674255,
"step": 536
},
{
"epoch": 1.469945355191257,
"grad_norm": 0.4848044812679291,
"learning_rate": 1.5602011402101432e-05,
"loss": 1.0610109567642212,
"step": 538
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.5348864197731018,
"learning_rate": 1.5861828942343037e-05,
"loss": 1.0170478820800781,
"step": 540
},
{
"epoch": 1.4808743169398908,
"grad_norm": 0.9850108623504639,
"learning_rate": 1.612188362799917e-05,
"loss": 1.0071587562561035,
"step": 542
},
{
"epoch": 1.4863387978142075,
"grad_norm": 1.0645705461502075,
"learning_rate": 1.6382078822320964e-05,
"loss": 0.8466319441795349,
"step": 544
},
{
"epoch": 1.4918032786885247,
"grad_norm": 0.9430765509605408,
"learning_rate": 1.6642317836346324e-05,
"loss": 1.1048694849014282,
"step": 546
},
{
"epoch": 1.4972677595628414,
"grad_norm": 2.00789213180542,
"learning_rate": 1.6902503964829644e-05,
"loss": 0.5449416637420654,
"step": 548
},
{
"epoch": 1.5027322404371586,
"grad_norm": 0.3312459886074066,
"learning_rate": 1.7162540522177685e-05,
"loss": 0.9255615472793579,
"step": 550
},
{
"epoch": 1.5081967213114753,
"grad_norm": 1.1483796834945679,
"learning_rate": 1.7422330878378113e-05,
"loss": 1.0408291816711426,
"step": 552
},
{
"epoch": 1.5136612021857925,
"grad_norm": 0.2985171377658844,
"learning_rate": 1.7681778494907298e-05,
"loss": 0.932111918926239,
"step": 554
},
{
"epoch": 1.5191256830601092,
"grad_norm": 0.4195594787597656,
"learning_rate": 1.794078696060429e-05,
"loss": 1.1309460401535034,
"step": 556
},
{
"epoch": 1.5245901639344264,
"grad_norm": 0.48297399282455444,
"learning_rate": 1.819926002749727e-05,
"loss": 0.7807631492614746,
"step": 558
},
{
"epoch": 1.530054644808743,
"grad_norm": 0.5782639980316162,
"learning_rate": 1.84571016465695e-05,
"loss": 0.5854570269584656,
"step": 560
},
{
"epoch": 1.5355191256830603,
"grad_norm": 1.3704901933670044,
"learning_rate": 1.8714216003451295e-05,
"loss": 0.9114719033241272,
"step": 562
},
{
"epoch": 1.540983606557377,
"grad_norm": 0.5284550786018372,
"learning_rate": 1.8970507554024827e-05,
"loss": 0.7601478695869446,
"step": 564
},
{
"epoch": 1.5464480874316942,
"grad_norm": 1.413692831993103,
"learning_rate": 1.922588105992838e-05,
"loss": 0.9909778833389282,
"step": 566
},
{
"epoch": 1.5519125683060109,
"grad_norm": 0.63189297914505,
"learning_rate": 1.9480241623947206e-05,
"loss": 0.822301983833313,
"step": 568
},
{
"epoch": 1.5573770491803278,
"grad_norm": 0.5345529913902283,
"learning_rate": 1.9733494725277413e-05,
"loss": 1.1544920206069946,
"step": 570
},
{
"epoch": 1.5628415300546448,
"grad_norm": 1.3997973203659058,
"learning_rate": 1.998554625465005e-05,
"loss": 0.9297661185264587,
"step": 572
},
{
"epoch": 1.5683060109289617,
"grad_norm": 0.3266404867172241,
"learning_rate": 2.0236302549302293e-05,
"loss": 1.020856499671936,
"step": 574
},
{
"epoch": 1.5737704918032787,
"grad_norm": 0.5123386979103088,
"learning_rate": 2.0485670427782644e-05,
"loss": 0.8248804211616516,
"step": 576
},
{
"epoch": 1.5792349726775956,
"grad_norm": 0.6250765919685364,
"learning_rate": 2.073355722457739e-05,
"loss": 1.0233927965164185,
"step": 578
},
{
"epoch": 1.5846994535519126,
"grad_norm": 0.635255753993988,
"learning_rate": 2.0979870824545165e-05,
"loss": 0.6386371850967407,
"step": 580
},
{
"epoch": 1.5901639344262295,
"grad_norm": 0.46638670563697815,
"learning_rate": 2.1224519697147145e-05,
"loss": 0.6583716869354248,
"step": 582
},
{
"epoch": 1.5956284153005464,
"grad_norm": 0.28147849440574646,
"learning_rate": 2.1467412930459936e-05,
"loss": 0.6712157726287842,
"step": 584
},
{
"epoch": 1.6010928961748634,
"grad_norm": 0.6860049366950989,
"learning_rate": 2.1708460264958595e-05,
"loss": 0.9819990992546082,
"step": 586
},
{
"epoch": 1.6065573770491803,
"grad_norm": 0.5746862888336182,
"learning_rate": 2.194757212705718e-05,
"loss": 0.989525318145752,
"step": 588
},
{
"epoch": 1.6120218579234973,
"grad_norm": 0.824454128742218,
"learning_rate": 2.2184659662394522e-05,
"loss": 0.9113327264785767,
"step": 590
},
{
"epoch": 1.6174863387978142,
"grad_norm": 0.7105326056480408,
"learning_rate": 2.24196347688526e-05,
"loss": 1.093321681022644,
"step": 592
},
{
"epoch": 1.6229508196721312,
"grad_norm": 1.0919660329818726,
"learning_rate": 2.265241012929541e-05,
"loss": 0.88104248046875,
"step": 594
},
{
"epoch": 1.6284153005464481,
"grad_norm": 0.46225038170814514,
"learning_rate": 2.28828992440162e-05,
"loss": 0.44228631258010864,
"step": 596
},
{
"epoch": 1.633879781420765,
"grad_norm": 0.7734431028366089,
"learning_rate": 2.3111016462880873e-05,
"loss": 1.205407977104187,
"step": 598
},
{
"epoch": 1.639344262295082,
"grad_norm": 1.4312113523483276,
"learning_rate": 2.333667701715578e-05,
"loss": 1.1285673379898071,
"step": 600
},
{
"epoch": 1.644808743169399,
"grad_norm": 0.5179247856140137,
"learning_rate": 2.3559797051007815e-05,
"loss": 1.0147968530654907,
"step": 602
},
{
"epoch": 1.650273224043716,
"grad_norm": 0.48647308349609375,
"learning_rate": 2.3780293652665477e-05,
"loss": 1.050143837928772,
"step": 604
},
{
"epoch": 1.6557377049180326,
"grad_norm": 0.676051139831543,
"learning_rate": 2.399808488522895e-05,
"loss": 1.150329351425171,
"step": 606
},
{
"epoch": 1.6612021857923498,
"grad_norm": 0.7187015414237976,
"learning_rate": 2.4213089817118078e-05,
"loss": 0.7985825538635254,
"step": 608
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.5014326572418213,
"learning_rate": 2.4425228552146573e-05,
"loss": 1.124566912651062,
"step": 610
},
{
"epoch": 1.6721311475409837,
"grad_norm": 0.8358767628669739,
"learning_rate": 2.4634422259211614e-05,
"loss": 0.6776012182235718,
"step": 612
},
{
"epoch": 1.6775956284153004,
"grad_norm": 0.4701133370399475,
"learning_rate": 2.4840593201587626e-05,
"loss": 0.9594138264656067,
"step": 614
},
{
"epoch": 1.6830601092896176,
"grad_norm": 0.8161628246307373,
"learning_rate": 2.5043664765813377e-05,
"loss": 1.0532571077346802,
"step": 616
},
{
"epoch": 1.6885245901639343,
"grad_norm": 0.2768792510032654,
"learning_rate": 2.524356149016163e-05,
"loss": 0.6472287178039551,
"step": 618
},
{
"epoch": 1.6939890710382515,
"grad_norm": 1.2430589199066162,
"learning_rate": 2.544020909268085e-05,
"loss": 1.0938588380813599,
"step": 620
},
{
"epoch": 1.6994535519125682,
"grad_norm": 5.1389594078063965,
"learning_rate": 2.5633534498798598e-05,
"loss": 1.2041553258895874,
"step": 622
},
{
"epoch": 1.7049180327868854,
"grad_norm": 0.6408423781394958,
"learning_rate": 2.5823465868475985e-05,
"loss": 0.8244649767875671,
"step": 624
},
{
"epoch": 1.710382513661202,
"grad_norm": 0.418550968170166,
"learning_rate": 2.60099326229037e-05,
"loss": 0.741182804107666,
"step": 626
},
{
"epoch": 1.7158469945355193,
"grad_norm": 0.4585968554019928,
"learning_rate": 2.619286547072914e-05,
"loss": 1.0170096158981323,
"step": 628
},
{
"epoch": 1.721311475409836,
"grad_norm": 0.7861810326576233,
"learning_rate": 2.6372196433805214e-05,
"loss": 0.7566246390342712,
"step": 630
},
{
"epoch": 1.7267759562841531,
"grad_norm": 0.4659828245639801,
"learning_rate": 2.654785887245112e-05,
"loss": 1.0305655002593994,
"step": 632
},
{
"epoch": 1.7322404371584699,
"grad_norm": 0.5718828439712524,
"learning_rate": 2.671978751021577e-05,
"loss": 1.1177818775177002,
"step": 634
},
{
"epoch": 1.737704918032787,
"grad_norm": 0.45309358835220337,
"learning_rate": 2.6887918458134622e-05,
"loss": 0.9918133616447449,
"step": 636
},
{
"epoch": 1.7431693989071038,
"grad_norm": 0.2656165361404419,
"learning_rate": 2.705218923847093e-05,
"loss": 0.994335412979126,
"step": 638
},
{
"epoch": 1.748633879781421,
"grad_norm": 5.145861625671387,
"learning_rate": 2.7212538807932576e-05,
"loss": 0.9535347819328308,
"step": 640
},
{
"epoch": 1.7540983606557377,
"grad_norm": 0.23799912631511688,
"learning_rate": 2.7368907580355843e-05,
"loss": 1.0429202318191528,
"step": 642
},
{
"epoch": 1.7595628415300546,
"grad_norm": 0.2962387800216675,
"learning_rate": 2.7521237448847734e-05,
"loss": 1.000279426574707,
"step": 644
},
{
"epoch": 1.7650273224043715,
"grad_norm": 0.4010259807109833,
"learning_rate": 2.766947180737861e-05,
"loss": 0.8354232311248779,
"step": 646
},
{
"epoch": 1.7704918032786885,
"grad_norm": 0.28394171595573425,
"learning_rate": 2.781355557181706e-05,
"loss": 1.0121784210205078,
"step": 648
},
{
"epoch": 1.7759562841530054,
"grad_norm": 0.5224506855010986,
"learning_rate": 2.7953435200399262e-05,
"loss": 1.0634323358535767,
"step": 650
},
{
"epoch": 1.7814207650273224,
"grad_norm": 0.20725345611572266,
"learning_rate": 2.8089058713625194e-05,
"loss": 1.1551871299743652,
"step": 652
},
{
"epoch": 1.7868852459016393,
"grad_norm": 0.36834990978240967,
"learning_rate": 2.8220375713574307e-05,
"loss": 1.040554404258728,
"step": 654
},
{
"epoch": 1.7923497267759563,
"grad_norm": 2.1827328205108643,
"learning_rate": 2.8347337402633456e-05,
"loss": 1.0153180360794067,
"step": 656
},
{
"epoch": 1.7978142076502732,
"grad_norm": 0.3932366371154785,
"learning_rate": 2.846989660163019e-05,
"loss": 1.0517958402633667,
"step": 658
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.8681307435035706,
"learning_rate": 2.858800776736461e-05,
"loss": 1.2040562629699707,
"step": 660
},
{
"epoch": 1.8087431693989071,
"grad_norm": 0.2651101052761078,
"learning_rate": 2.87016270095333e-05,
"loss": 1.1040290594100952,
"step": 662
},
{
"epoch": 1.814207650273224,
"grad_norm": 0.4739510715007782,
"learning_rate": 2.8810712107039e-05,
"loss": 1.107933521270752,
"step": 664
},
{
"epoch": 1.819672131147541,
"grad_norm": 0.6362955570220947,
"learning_rate": 2.8915222523680082e-05,
"loss": 1.386487364768982,
"step": 666
},
{
"epoch": 1.825136612021858,
"grad_norm": 0.366566926240921,
"learning_rate": 2.9015119423213857e-05,
"loss": 1.1438971757888794,
"step": 668
},
{
"epoch": 1.830601092896175,
"grad_norm": 0.2171451449394226,
"learning_rate": 2.9110365683788173e-05,
"loss": 0.7628768086433411,
"step": 670
},
{
"epoch": 1.8360655737704918,
"grad_norm": 1.5162757635116577,
"learning_rate": 2.9200925911735956e-05,
"loss": 1.3212112188339233,
"step": 672
},
{
"epoch": 1.8415300546448088,
"grad_norm": 0.645240843296051,
"learning_rate": 2.9286766454727563e-05,
"loss": 1.1983546018600464,
"step": 674
},
{
"epoch": 1.8469945355191257,
"grad_norm": 0.2693498730659485,
"learning_rate": 2.9367855414276073e-05,
"loss": 1.2009769678115845,
"step": 676
},
{
"epoch": 1.8524590163934427,
"grad_norm": 0.1954379677772522,
"learning_rate": 2.9444162657590747e-05,
"loss": 0.3892729878425598,
"step": 678
},
{
"epoch": 1.8579234972677594,
"grad_norm": 0.35599058866500854,
"learning_rate": 2.951565982877447e-05,
"loss": 1.048251986503601,
"step": 680
},
{
"epoch": 1.8633879781420766,
"grad_norm": 0.3966503441333771,
"learning_rate": 2.9582320359360864e-05,
"loss": 1.061416506767273,
"step": 682
},
{
"epoch": 1.8688524590163933,
"grad_norm": 0.40303412079811096,
"learning_rate": 2.9644119478187126e-05,
"loss": 1.1344728469848633,
"step": 684
},
{
"epoch": 1.8743169398907105,
"grad_norm": 1.145259141921997,
"learning_rate": 2.9701034220599074e-05,
"loss": 0.9383465051651001,
"step": 686
},
{
"epoch": 1.8797814207650272,
"grad_norm": 0.3722691833972931,
"learning_rate": 2.975304343698483e-05,
"loss": 1.1722391843795776,
"step": 688
},
{
"epoch": 1.8852459016393444,
"grad_norm": 0.3040332496166229,
"learning_rate": 2.980012780063404e-05,
"loss": 1.110710620880127,
"step": 690
},
{
"epoch": 1.890710382513661,
"grad_norm": 0.4321642220020294,
"learning_rate": 2.9842269814919755e-05,
"loss": 0.8456806540489197,
"step": 692
},
{
"epoch": 1.8961748633879782,
"grad_norm": 0.7092167735099792,
"learning_rate": 2.9879453819800156e-05,
"loss": 1.1707490682601929,
"step": 694
},
{
"epoch": 1.901639344262295,
"grad_norm": 0.32058265805244446,
"learning_rate": 2.991166599763788e-05,
"loss": 1.1301288604736328,
"step": 696
},
{
"epoch": 1.9071038251366121,
"grad_norm": 0.7582931518554688,
"learning_rate": 2.993889437833466e-05,
"loss": 1.1136192083358765,
"step": 698
},
{
"epoch": 1.9125683060109289,
"grad_norm": 0.3699612617492676,
"learning_rate": 2.9961128843779457e-05,
"loss": 1.12708580493927,
"step": 700
},
{
"epoch": 1.918032786885246,
"grad_norm": 0.6102017164230347,
"learning_rate": 2.9978361131608348e-05,
"loss": 1.0776515007019043,
"step": 702
},
{
"epoch": 1.9234972677595628,
"grad_norm": 0.37337955832481384,
"learning_rate": 2.999058483827483e-05,
"loss": 1.2161076068878174,
"step": 704
},
{
"epoch": 1.92896174863388,
"grad_norm": 0.3820495307445526,
"learning_rate": 2.9997795421429404e-05,
"loss": 1.1798738241195679,
"step": 706
},
{
"epoch": 1.9344262295081966,
"grad_norm": 0.3551599085330963,
"learning_rate": 2.9999990201607516e-05,
"loss": 1.21042001247406,
"step": 708
},
{
"epoch": 1.9398907103825138,
"grad_norm": 0.8819627165794373,
"learning_rate": 2.999716836322524e-05,
"loss": 1.0406569242477417,
"step": 710
},
{
"epoch": 1.9453551912568305,
"grad_norm": 0.22963757812976837,
"learning_rate": 2.9989330954882366e-05,
"loss": 0.7115339040756226,
"step": 712
},
{
"epoch": 1.9508196721311475,
"grad_norm": 0.3712817132472992,
"learning_rate": 2.9976480888972708e-05,
"loss": 1.1397637128829956,
"step": 714
},
{
"epoch": 1.9562841530054644,
"grad_norm": 0.542335033416748,
"learning_rate": 2.9958622940601907e-05,
"loss": 1.0498206615447998,
"step": 716
},
{
"epoch": 1.9617486338797814,
"grad_norm": 0.3853362500667572,
"learning_rate": 2.9935763745812935e-05,
"loss": 1.1299047470092773,
"step": 718
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.6234093308448792,
"learning_rate": 2.990791179912017e-05,
"loss": 1.1040066480636597,
"step": 720
},
{
"epoch": 1.9726775956284153,
"grad_norm": 0.6169328689575195,
"learning_rate": 2.9875077450352817e-05,
"loss": 1.1971713304519653,
"step": 722
},
{
"epoch": 1.9781420765027322,
"grad_norm": 0.25733399391174316,
"learning_rate": 2.9837272900808863e-05,
"loss": 1.0600217580795288,
"step": 724
},
{
"epoch": 1.9836065573770492,
"grad_norm": 1.5349887609481812,
"learning_rate": 2.9794512198721092e-05,
"loss": 1.1235666275024414,
"step": 726
},
{
"epoch": 1.989071038251366,
"grad_norm": 0.33249858021736145,
"learning_rate": 2.9746811234036736e-05,
"loss": 1.119554042816162,
"step": 728
},
{
"epoch": 1.994535519125683,
"grad_norm": 2.475351333618164,
"learning_rate": 2.9694187732512702e-05,
"loss": 1.0746409893035889,
"step": 730
},
{
"epoch": 2.0,
"grad_norm": 0.4339929521083832,
"learning_rate": 2.96366612491287e-05,
"loss": 1.1968926191329956,
"step": 732
},
{
"epoch": 2.0054644808743167,
"grad_norm": 0.2686910331249237,
"learning_rate": 2.9574253160820573e-05,
"loss": 0.5184199213981628,
"step": 734
},
{
"epoch": 2.010928961748634,
"grad_norm": 0.8379266262054443,
"learning_rate": 2.9506986658536562e-05,
"loss": 0.8861691355705261,
"step": 736
},
{
"epoch": 2.0163934426229506,
"grad_norm": 0.34444382786750793,
"learning_rate": 2.9434886738619537e-05,
"loss": 0.9877526164054871,
"step": 738
},
{
"epoch": 2.021857923497268,
"grad_norm": 0.9714856147766113,
"learning_rate": 2.9357980193518312e-05,
"loss": 0.9919371604919434,
"step": 740
},
{
"epoch": 2.0273224043715845,
"grad_norm": 0.8462095856666565,
"learning_rate": 2.927629560183153e-05,
"loss": 0.8833386898040771,
"step": 742
},
{
"epoch": 2.0327868852459017,
"grad_norm": 3.7039377689361572,
"learning_rate": 2.91898633176878e-05,
"loss": 0.999144971370697,
"step": 744
},
{
"epoch": 2.0382513661202184,
"grad_norm": 0.3893999457359314,
"learning_rate": 2.909871545946603e-05,
"loss": 0.8870663046836853,
"step": 746
},
{
"epoch": 2.0437158469945356,
"grad_norm": 0.6937627196311951,
"learning_rate": 2.9002885897860252e-05,
"loss": 1.0602282285690308,
"step": 748
},
{
"epoch": 2.0491803278688523,
"grad_norm": 2.6931281089782715,
"learning_rate": 2.8902410243293152e-05,
"loss": 0.7386649250984192,
"step": 750
},
{
"epoch": 2.0546448087431695,
"grad_norm": 1.6844873428344727,
"learning_rate": 2.8797325832683208e-05,
"loss": 0.6009291410446167,
"step": 752
},
{
"epoch": 2.060109289617486,
"grad_norm": 2.9810950756073,
"learning_rate": 2.868767171557021e-05,
"loss": 0.69991534948349,
"step": 754
},
{
"epoch": 2.0655737704918034,
"grad_norm": 0.9267653822898865,
"learning_rate": 2.8573488639604418e-05,
"loss": 1.114338994026184,
"step": 756
},
{
"epoch": 2.07103825136612,
"grad_norm": 0.4357033669948578,
"learning_rate": 2.845481903540464e-05,
"loss": 1.0973328351974487,
"step": 758
},
{
"epoch": 2.0765027322404372,
"grad_norm": 0.2919345498085022,
"learning_rate": 2.8331707000790954e-05,
"loss": 0.9262146949768066,
"step": 760
},
{
"epoch": 2.081967213114754,
"grad_norm": 0.3780726194381714,
"learning_rate": 2.820419828439788e-05,
"loss": 1.0779647827148438,
"step": 762
},
{
"epoch": 2.087431693989071,
"grad_norm": 0.4172421097755432,
"learning_rate": 2.8072340268674133e-05,
"loss": 0.9276398420333862,
"step": 764
},
{
"epoch": 2.092896174863388,
"grad_norm": 0.4299885630607605,
"learning_rate": 2.793618195227521e-05,
"loss": 1.0986346006393433,
"step": 766
},
{
"epoch": 2.098360655737705,
"grad_norm": 0.7097730040550232,
"learning_rate": 2.779577393185539e-05,
"loss": 0.8381191492080688,
"step": 768
},
{
"epoch": 2.1038251366120218,
"grad_norm": 0.4378579258918762,
"learning_rate": 2.765116838326597e-05,
"loss": 0.9266961812973022,
"step": 770
},
{
"epoch": 2.109289617486339,
"grad_norm": 4.270036697387695,
"learning_rate": 2.750241904216663e-05,
"loss": 0.9932444095611572,
"step": 772
},
{
"epoch": 2.1147540983606556,
"grad_norm": 0.3399311304092407,
"learning_rate": 2.7349581184057144e-05,
"loss": 0.8846595287322998,
"step": 774
},
{
"epoch": 2.120218579234973,
"grad_norm": 0.2650469243526459,
"learning_rate": 2.719271160373693e-05,
"loss": 0.8565717339515686,
"step": 776
},
{
"epoch": 2.1256830601092895,
"grad_norm": 2.0133235454559326,
"learning_rate": 2.703186859420002e-05,
"loss": 0.6195998787879944,
"step": 778
},
{
"epoch": 2.1311475409836067,
"grad_norm": 0.8005315065383911,
"learning_rate": 2.6867111924973283e-05,
"loss": 0.8420849442481995,
"step": 780
},
{
"epoch": 2.1366120218579234,
"grad_norm": 0.27498698234558105,
"learning_rate": 2.6698502819905935e-05,
"loss": 0.8232638239860535,
"step": 782
},
{
"epoch": 2.1420765027322406,
"grad_norm": 0.45607131719589233,
"learning_rate": 2.652610393441872e-05,
"loss": 1.0407649278640747,
"step": 784
},
{
"epoch": 2.1475409836065573,
"grad_norm": 0.6649620532989502,
"learning_rate": 2.6349979332220992e-05,
"loss": 0.684894859790802,
"step": 786
},
{
"epoch": 2.1530054644808745,
"grad_norm": 4.227207660675049,
"learning_rate": 2.6170194461504586e-05,
"loss": 0.7987200617790222,
"step": 788
},
{
"epoch": 2.158469945355191,
"grad_norm": 0.4490732252597809,
"learning_rate": 2.5986816130623133e-05,
"loss": 1.048277735710144,
"step": 790
},
{
"epoch": 2.1639344262295084,
"grad_norm": 0.3105827271938324,
"learning_rate": 2.579991248326594e-05,
"loss": 0.5875641703605652,
"step": 792
},
{
"epoch": 2.169398907103825,
"grad_norm": 1.113189697265625,
"learning_rate": 2.560955297313575e-05,
"loss": 0.7966644167900085,
"step": 794
},
{
"epoch": 2.1748633879781423,
"grad_norm": 0.3387541174888611,
"learning_rate": 2.5415808338139595e-05,
"loss": 0.8600819706916809,
"step": 796
},
{
"epoch": 2.180327868852459,
"grad_norm": 0.3505089581012726,
"learning_rate": 2.5218750574102465e-05,
"loss": 0.831095278263092,
"step": 798
},
{
"epoch": 2.185792349726776,
"grad_norm": 1.6662020683288574,
"learning_rate": 2.5018452908013522e-05,
"loss": 0.9187523126602173,
"step": 800
},
{
"epoch": 2.191256830601093,
"grad_norm": 0.45580047369003296,
"learning_rate": 2.48149897708149e-05,
"loss": 0.8529684543609619,
"step": 802
},
{
"epoch": 2.19672131147541,
"grad_norm": 0.31793418526649475,
"learning_rate": 2.4608436769743e-05,
"loss": 0.8847852945327759,
"step": 804
},
{
"epoch": 2.202185792349727,
"grad_norm": 0.38347429037094116,
"learning_rate": 2.4398870660232684e-05,
"loss": 0.9523177742958069,
"step": 806
},
{
"epoch": 2.2076502732240435,
"grad_norm": 0.5963994860649109,
"learning_rate": 2.418636931739491e-05,
"loss": 0.5150175094604492,
"step": 808
},
{
"epoch": 2.2131147540983607,
"grad_norm": 0.5923159122467041,
"learning_rate": 2.3971011707078125e-05,
"loss": 0.6686972975730896,
"step": 810
},
{
"epoch": 2.2185792349726774,
"grad_norm": 0.3102145195007324,
"learning_rate": 2.3752877856524532e-05,
"loss": 0.8697759509086609,
"step": 812
},
{
"epoch": 2.2240437158469946,
"grad_norm": 0.38423094153404236,
"learning_rate": 2.353204882463168e-05,
"loss": 0.7210009694099426,
"step": 814
},
{
"epoch": 2.2295081967213113,
"grad_norm": 0.5957372784614563,
"learning_rate": 2.330860667183101e-05,
"loss": 0.6225443482398987,
"step": 816
},
{
"epoch": 2.2349726775956285,
"grad_norm": 0.32627472281455994,
"learning_rate": 2.308263442959396e-05,
"loss": 0.8318431377410889,
"step": 818
},
{
"epoch": 2.240437158469945,
"grad_norm": 0.5187286734580994,
"learning_rate": 2.2854216069577376e-05,
"loss": 1.0246249437332153,
"step": 820
},
{
"epoch": 2.2459016393442623,
"grad_norm": 1.3565365076065063,
"learning_rate": 2.2623436472419476e-05,
"loss": 1.301908254623413,
"step": 822
},
{
"epoch": 2.251366120218579,
"grad_norm": 0.4754975438117981,
"learning_rate": 2.2390381396198102e-05,
"loss": 0.8299983143806458,
"step": 824
},
{
"epoch": 2.2568306010928962,
"grad_norm": 7.579851150512695,
"learning_rate": 2.2155137444562842e-05,
"loss": 0.4918154776096344,
"step": 826
},
{
"epoch": 2.262295081967213,
"grad_norm": 1.118971347808838,
"learning_rate": 2.191779203455302e-05,
"loss": 0.7644491791725159,
"step": 828
},
{
"epoch": 2.26775956284153,
"grad_norm": 0.2197006791830063,
"learning_rate": 2.1678433364113297e-05,
"loss": 0.7685542106628418,
"step": 830
},
{
"epoch": 2.273224043715847,
"grad_norm": 0.7653494477272034,
"learning_rate": 2.1437150379319245e-05,
"loss": 0.6390159726142883,
"step": 832
},
{
"epoch": 2.278688524590164,
"grad_norm": 0.3541068434715271,
"learning_rate": 2.1194032741324823e-05,
"loss": 0.6241085529327393,
"step": 834
},
{
"epoch": 2.2841530054644807,
"grad_norm": 2.009389877319336,
"learning_rate": 2.0949170793044142e-05,
"loss": 1.0873188972473145,
"step": 836
},
{
"epoch": 2.289617486338798,
"grad_norm": 1.058263897895813,
"learning_rate": 2.070265552557985e-05,
"loss": 0.988737940788269,
"step": 838
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.36253905296325684,
"learning_rate": 2.0454578544410758e-05,
"loss": 0.7163146138191223,
"step": 840
},
{
"epoch": 2.300546448087432,
"grad_norm": 0.4018394351005554,
"learning_rate": 2.0205032035351043e-05,
"loss": 0.883522093296051,
"step": 842
},
{
"epoch": 2.3060109289617485,
"grad_norm": 0.40066567063331604,
"learning_rate": 1.9954108730293875e-05,
"loss": 1.0170973539352417,
"step": 844
},
{
"epoch": 2.3114754098360657,
"grad_norm": 0.38333478569984436,
"learning_rate": 1.9701901872752047e-05,
"loss": 0.5225908756256104,
"step": 846
},
{
"epoch": 2.3169398907103824,
"grad_norm": 0.34811270236968994,
"learning_rate": 1.9448505183208607e-05,
"loss": 1.0079129934310913,
"step": 848
},
{
"epoch": 2.3224043715846996,
"grad_norm": 0.2737729549407959,
"learning_rate": 1.919401282429013e-05,
"loss": 0.8766093850135803,
"step": 850
},
{
"epoch": 2.3278688524590163,
"grad_norm": 1.6548091173171997,
"learning_rate": 1.893851936577567e-05,
"loss": 0.8315792679786682,
"step": 852
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.6581529974937439,
"learning_rate": 1.868211974945461e-05,
"loss": 0.9848966598510742,
"step": 854
},
{
"epoch": 2.33879781420765,
"grad_norm": 0.27750951051712036,
"learning_rate": 1.842490925384604e-05,
"loss": 0.9963698983192444,
"step": 856
},
{
"epoch": 2.3442622950819674,
"grad_norm": 0.2942568361759186,
"learning_rate": 1.816698345879313e-05,
"loss": 0.607078492641449,
"step": 858
},
{
"epoch": 2.349726775956284,
"grad_norm": 0.28276923298835754,
"learning_rate": 1.790843820994548e-05,
"loss": 0.6770678162574768,
"step": 860
},
{
"epoch": 2.3551912568306013,
"grad_norm": 0.31237614154815674,
"learning_rate": 1.7649369583142763e-05,
"loss": 0.9150133728981018,
"step": 862
},
{
"epoch": 2.360655737704918,
"grad_norm": 0.3164447844028473,
"learning_rate": 1.738987384871274e-05,
"loss": 0.5802142024040222,
"step": 864
},
{
"epoch": 2.366120218579235,
"grad_norm": 0.8007859587669373,
"learning_rate": 1.7130047435697118e-05,
"loss": 1.2320353984832764,
"step": 866
},
{
"epoch": 2.371584699453552,
"grad_norm": 0.28759169578552246,
"learning_rate": 1.6869986896018226e-05,
"loss": 0.8972910046577454,
"step": 868
},
{
"epoch": 2.3770491803278686,
"grad_norm": 0.30926313996315,
"learning_rate": 1.66097888686003e-05,
"loss": 0.6624117493629456,
"step": 870
},
{
"epoch": 2.3825136612021858,
"grad_norm": 0.42763233184814453,
"learning_rate": 1.6349550043458252e-05,
"loss": 0.8042442202568054,
"step": 872
},
{
"epoch": 2.387978142076503,
"grad_norm": 0.45229074358940125,
"learning_rate": 1.608936712576749e-05,
"loss": 0.9678982496261597,
"step": 874
},
{
"epoch": 2.3934426229508197,
"grad_norm": 1.1206239461898804,
"learning_rate": 1.582933679992809e-05,
"loss": 0.8419029712677002,
"step": 876
},
{
"epoch": 2.3989071038251364,
"grad_norm": 0.3222406208515167,
"learning_rate": 1.556955569363678e-05,
"loss": 0.8094390630722046,
"step": 878
},
{
"epoch": 2.4043715846994536,
"grad_norm": 0.4028852880001068,
"learning_rate": 1.531012034197988e-05,
"loss": 1.0756434202194214,
"step": 880
},
{
"epoch": 2.4098360655737707,
"grad_norm": 0.49627208709716797,
"learning_rate": 1.5051127151560745e-05,
"loss": 0.44026413559913635,
"step": 882
},
{
"epoch": 2.4153005464480874,
"grad_norm": 0.32852479815483093,
"learning_rate": 1.4792672364674816e-05,
"loss": 0.6913017630577087,
"step": 884
},
{
"epoch": 2.420765027322404,
"grad_norm": 0.290580689907074,
"learning_rate": 1.4534852023545968e-05,
"loss": 0.7411317825317383,
"step": 886
},
{
"epoch": 2.4262295081967213,
"grad_norm": 0.36047178506851196,
"learning_rate": 1.4277761934636963e-05,
"loss": 0.6693094372749329,
"step": 888
},
{
"epoch": 2.431693989071038,
"grad_norm": 0.3268221318721771,
"learning_rate": 1.4021497633047664e-05,
"loss": 0.5395329594612122,
"step": 890
},
{
"epoch": 2.4371584699453552,
"grad_norm": 0.3652787208557129,
"learning_rate": 1.3766154347013933e-05,
"loss": 0.8786762356758118,
"step": 892
},
{
"epoch": 2.442622950819672,
"grad_norm": 0.265882283449173,
"learning_rate": 1.3511826962520809e-05,
"loss": 0.434492290019989,
"step": 894
},
{
"epoch": 2.448087431693989,
"grad_norm": 0.30209118127822876,
"learning_rate": 1.3258609988042627e-05,
"loss": 0.6374402046203613,
"step": 896
},
{
"epoch": 2.453551912568306,
"grad_norm": 0.3441826105117798,
"learning_rate": 1.300659751942353e-05,
"loss": 0.7946062088012695,
"step": 898
},
{
"epoch": 2.459016393442623,
"grad_norm": 0.306071013212204,
"learning_rate": 1.2755883204911305e-05,
"loss": 1.020022988319397,
"step": 900
},
{
"epoch": 2.4644808743169397,
"grad_norm": 0.20878276228904724,
"learning_rate": 1.2506560210357541e-05,
"loss": 0.7610766887664795,
"step": 902
},
{
"epoch": 2.469945355191257,
"grad_norm": 0.3049929141998291,
"learning_rate": 1.225872118459706e-05,
"loss": 0.7081919312477112,
"step": 904
},
{
"epoch": 2.4754098360655736,
"grad_norm": 0.9971242547035217,
"learning_rate": 1.2012458225019375e-05,
"loss": 0.8515815734863281,
"step": 906
},
{
"epoch": 2.480874316939891,
"grad_norm": 0.19427776336669922,
"learning_rate": 1.176786284334528e-05,
"loss": 0.3863685429096222,
"step": 908
},
{
"epoch": 2.4863387978142075,
"grad_norm": 0.3503221273422241,
"learning_rate": 1.1525025931620855e-05,
"loss": 0.7190455794334412,
"step": 910
},
{
"epoch": 2.4918032786885247,
"grad_norm": 0.31161224842071533,
"learning_rate": 1.1284037728441877e-05,
"loss": 0.5138267278671265,
"step": 912
},
{
"epoch": 2.4972677595628414,
"grad_norm": 0.3345715403556824,
"learning_rate": 1.1044987785420924e-05,
"loss": 0.9073923826217651,
"step": 914
},
{
"epoch": 2.5027322404371586,
"grad_norm": 0.48776909708976746,
"learning_rate": 1.0807964933909975e-05,
"loss": 0.811479389667511,
"step": 916
},
{
"epoch": 2.5081967213114753,
"grad_norm": 0.30571448802948,
"learning_rate": 1.0573057251990443e-05,
"loss": 0.7072067856788635,
"step": 918
},
{
"epoch": 2.5136612021857925,
"grad_norm": 0.31730931997299194,
"learning_rate": 1.0340352031743256e-05,
"loss": 0.6728021502494812,
"step": 920
},
{
"epoch": 2.519125683060109,
"grad_norm": 0.31097251176834106,
"learning_rate": 1.010993574681095e-05,
"loss": 0.8272064328193665,
"step": 922
},
{
"epoch": 2.5245901639344264,
"grad_norm": 0.5129551887512207,
"learning_rate": 9.881894020263938e-06,
"loss": 0.5476985573768616,
"step": 924
},
{
"epoch": 2.530054644808743,
"grad_norm": 0.32754331827163696,
"learning_rate": 9.656311592782831e-06,
"loss": 0.5990601778030396,
"step": 926
},
{
"epoch": 2.5355191256830603,
"grad_norm": 1.367723822593689,
"learning_rate": 9.433272291168689e-06,
"loss": 0.6560569405555725,
"step": 928
},
{
"epoch": 2.540983606557377,
"grad_norm": 0.38759374618530273,
"learning_rate": 9.212858997192744e-06,
"loss": 0.979541003704071,
"step": 930
},
{
"epoch": 2.546448087431694,
"grad_norm": 1.5108798742294312,
"learning_rate": 8.995153616797544e-06,
"loss": 0.7299838066101074,
"step": 932
},
{
"epoch": 2.551912568306011,
"grad_norm": 0.25322091579437256,
"learning_rate": 8.78023704966047e-06,
"loss": 0.8324632048606873,
"step": 934
},
{
"epoch": 2.557377049180328,
"grad_norm": 1.5061708688735962,
"learning_rate": 8.568189159131336e-06,
"loss": 0.7508612275123596,
"step": 936
},
{
"epoch": 2.5628415300546448,
"grad_norm": 1.1870174407958984,
"learning_rate": 8.359088742554941e-06,
"loss": 0.5936163067817688,
"step": 938
},
{
"epoch": 2.5683060109289615,
"grad_norm": 0.5020766854286194,
"learning_rate": 8.15301350198999e-06,
"loss": 0.9320186972618103,
"step": 940
},
{
"epoch": 2.5737704918032787,
"grad_norm": 3.2037689685821533,
"learning_rate": 7.950040015334789e-06,
"loss": 0.547394335269928,
"step": 942
},
{
"epoch": 2.579234972677596,
"grad_norm": 0.3436882197856903,
"learning_rate": 7.750243707870748e-06,
"loss": 0.8800086975097656,
"step": 944
},
{
"epoch": 2.5846994535519126,
"grad_norm": 0.45297709107398987,
"learning_rate": 7.553698824234314e-06,
"loss": 0.32935142517089844,
"step": 946
},
{
"epoch": 2.5901639344262293,
"grad_norm": 0.4006066620349884,
"learning_rate": 7.360478400827475e-06,
"loss": 0.6979743838310242,
"step": 948
},
{
"epoch": 2.5956284153005464,
"grad_norm": 0.3217792809009552,
"learning_rate": 7.170654238677331e-06,
"loss": 0.6965923309326172,
"step": 950
},
{
"epoch": 2.6010928961748636,
"grad_norm": 0.3064567446708679,
"learning_rate": 6.984296876754711e-06,
"loss": 1.0260727405548096,
"step": 952
},
{
"epoch": 2.6065573770491803,
"grad_norm": 0.3507385551929474,
"learning_rate": 6.801475565761783e-06,
"loss": 0.4919123649597168,
"step": 954
},
{
"epoch": 2.612021857923497,
"grad_norm": 0.29441311955451965,
"learning_rate": 6.622258242398371e-06,
"loss": 0.29813262820243835,
"step": 956
},
{
"epoch": 2.6174863387978142,
"grad_norm": 0.3191528916358948,
"learning_rate": 6.4467115041165855e-06,
"loss": 0.5578616261482239,
"step": 958
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.34111329913139343,
"learning_rate": 6.2749005843730336e-06,
"loss": 0.6605692505836487,
"step": 960
},
{
"epoch": 2.628415300546448,
"grad_norm": 1.6721758842468262,
"learning_rate": 6.106889328388064e-06,
"loss": 0.6230573654174805,
"step": 962
},
{
"epoch": 2.633879781420765,
"grad_norm": 0.40547674894332886,
"learning_rate": 5.942740169420701e-06,
"loss": 0.9198755025863647,
"step": 964
},
{
"epoch": 2.639344262295082,
"grad_norm": 0.2505720853805542,
"learning_rate": 5.7825141055683895e-06,
"loss": 0.7033195495605469,
"step": 966
},
{
"epoch": 2.644808743169399,
"grad_norm": 0.42646244168281555,
"learning_rate": 5.62627067709992e-06,
"loss": 0.7230016589164734,
"step": 968
},
{
"epoch": 2.650273224043716,
"grad_norm": 0.8739278316497803,
"learning_rate": 5.474067944330285e-06,
"loss": 0.7599860429763794,
"step": 970
},
{
"epoch": 2.6557377049180326,
"grad_norm": 0.3435499966144562,
"learning_rate": 5.325962466045282e-06,
"loss": 1.0251665115356445,
"step": 972
},
{
"epoch": 2.66120218579235,
"grad_norm": 0.30040714144706726,
"learning_rate": 5.18200927848421e-06,
"loss": 0.6986700892448425,
"step": 974
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.30992403626441956,
"learning_rate": 5.042261874888308e-06,
"loss": 0.6806256771087646,
"step": 976
},
{
"epoch": 2.6721311475409837,
"grad_norm": 0.15724875032901764,
"learning_rate": 4.906772185622572e-06,
"loss": 0.7747625708580017,
"step": 978
},
{
"epoch": 2.6775956284153004,
"grad_norm": 0.24404537677764893,
"learning_rate": 4.775590558878368e-06,
"loss": 0.7235421538352966,
"step": 980
},
{
"epoch": 2.6830601092896176,
"grad_norm": 0.45935794711112976,
"learning_rate": 4.648765741963903e-06,
"loss": 0.8421517610549927,
"step": 982
},
{
"epoch": 2.6885245901639343,
"grad_norm": 0.8475012183189392,
"learning_rate": 4.526344863189724e-06,
"loss": 0.6789471507072449,
"step": 984
},
{
"epoch": 2.6939890710382515,
"grad_norm": 0.36752596497535706,
"learning_rate": 4.408373414355714e-06,
"loss": 0.9892035126686096,
"step": 986
},
{
"epoch": 2.699453551912568,
"grad_norm": 0.2924305498600006,
"learning_rate": 4.29489523384628e-06,
"loss": 0.8234527707099915,
"step": 988
},
{
"epoch": 2.7049180327868854,
"grad_norm": 0.29964494705200195,
"learning_rate": 4.185952490339899e-06,
"loss": 0.7644519805908203,
"step": 990
},
{
"epoch": 2.710382513661202,
"grad_norm": 0.25154849886894226,
"learning_rate": 4.081585667139231e-06,
"loss": 0.4852161407470703,
"step": 992
},
{
"epoch": 2.7158469945355193,
"grad_norm": 0.2300635278224945,
"learning_rate": 3.981833547127413e-06,
"loss": 0.8437511324882507,
"step": 994
},
{
"epoch": 2.721311475409836,
"grad_norm": 0.27773451805114746,
"learning_rate": 3.886733198356298e-06,
"loss": 0.7463192939758301,
"step": 996
},
{
"epoch": 2.726775956284153,
"grad_norm": 0.18970778584480286,
"learning_rate": 3.7963199602718717e-06,
"loss": 0.6530200242996216,
"step": 998
},
{
"epoch": 2.73224043715847,
"grad_norm": 0.6071864366531372,
"learning_rate": 3.7106274305821034e-06,
"loss": 1.0014379024505615,
"step": 1000
},
{
"epoch": 2.737704918032787,
"grad_norm": 0.32319918274879456,
"learning_rate": 3.6296874527719515e-06,
"loss": 0.8679925203323364,
"step": 1002
},
{
"epoch": 2.7431693989071038,
"grad_norm": 0.2652120888233185,
"learning_rate": 3.553530104270281e-06,
"loss": 0.6595214605331421,
"step": 1004
},
{
"epoch": 2.748633879781421,
"grad_norm": 1.4743956327438354,
"learning_rate": 3.4821836852730384e-06,
"loss": 0.3518790304660797,
"step": 1006
},
{
"epoch": 2.7540983606557377,
"grad_norm": 0.27878043055534363,
"learning_rate": 3.41567470822686e-06,
"loss": 1.0067280530929565,
"step": 1008
},
{
"epoch": 2.7595628415300544,
"grad_norm": 0.37674176692962646,
"learning_rate": 3.354027887976989e-06,
"loss": 0.988980233669281,
"step": 1010
},
{
"epoch": 2.7650273224043715,
"grad_norm": 0.38307106494903564,
"learning_rate": 3.297266132583221e-06,
"loss": 0.7983114123344421,
"step": 1012
},
{
"epoch": 2.7704918032786887,
"grad_norm": 0.2897000312805176,
"learning_rate": 3.245410534807195e-06,
"loss": 0.9022005796432495,
"step": 1014
},
{
"epoch": 2.7759562841530054,
"grad_norm": 0.5147584676742554,
"learning_rate": 3.1984803642743314e-06,
"loss": 0.7322267293930054,
"step": 1016
},
{
"epoch": 2.781420765027322,
"grad_norm": 0.2588910460472107,
"learning_rate": 3.1564930603131777e-06,
"loss": 0.8495315313339233,
"step": 1018
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.33818700909614563,
"learning_rate": 3.1194642254749395e-06,
"loss": 0.6339123845100403,
"step": 1020
},
{
"epoch": 2.7923497267759565,
"grad_norm": 0.31622907519340515,
"learning_rate": 3.0874076197355317e-06,
"loss": 0.8431963920593262,
"step": 1022
},
{
"epoch": 2.797814207650273,
"grad_norm": 0.24222347140312195,
"learning_rate": 3.0603351553823717e-06,
"loss": 0.37352508306503296,
"step": 1024
},
{
"epoch": 2.80327868852459,
"grad_norm": 0.31230342388153076,
"learning_rate": 3.038256892587734e-06,
"loss": 0.9501305222511292,
"step": 1026
},
{
"epoch": 2.808743169398907,
"grad_norm": 1.9719264507293701,
"learning_rate": 3.0211810356703803e-06,
"loss": 0.6887733936309814,
"step": 1028
},
{
"epoch": 2.8142076502732243,
"grad_norm": 0.3655056655406952,
"learning_rate": 3.0091139300468266e-06,
"loss": 0.5987123250961304,
"step": 1030
},
{
"epoch": 2.819672131147541,
"grad_norm": 0.23417578637599945,
"learning_rate": 3.0020600598733656e-06,
"loss": 0.6534858345985413,
"step": 1032
},
{
"epoch": 2.8251366120218577,
"grad_norm": 1.2176401615142822,
"learning_rate": 3.000022046379753e-06,
"loss": 0.7378473877906799,
"step": 1034
},
{
"epoch": 2.830601092896175,
"grad_norm": 0.7160838842391968,
"learning_rate": 3.0030006468951557e-06,
"loss": 0.9491780996322632,
"step": 1036
},
{
"epoch": 2.836065573770492,
"grad_norm": 0.2178313285112381,
"learning_rate": 3.0109947545667246e-06,
"loss": 0.5883079767227173,
"step": 1038
},
{
"epoch": 2.841530054644809,
"grad_norm": 0.3265758454799652,
"learning_rate": 3.024001398770901e-06,
"loss": 0.823525607585907,
"step": 1040
},
{
"epoch": 2.8469945355191255,
"grad_norm": 0.32923081517219543,
"learning_rate": 3.042015746217308e-06,
"loss": 0.8023366928100586,
"step": 1042
},
{
"epoch": 2.8524590163934427,
"grad_norm": 0.26465603709220886,
"learning_rate": 3.0650311027448116e-06,
"loss": 0.7749249339103699,
"step": 1044
},
{
"epoch": 2.8579234972677594,
"grad_norm": 0.34655559062957764,
"learning_rate": 3.0930389158090754e-06,
"loss": 1.0069957971572876,
"step": 1046
},
{
"epoch": 2.8633879781420766,
"grad_norm": 0.18657130002975464,
"learning_rate": 3.1260287776607025e-06,
"loss": 0.43821409344673157,
"step": 1048
},
{
"epoch": 2.8688524590163933,
"grad_norm": 0.37593531608581543,
"learning_rate": 3.163988429212773e-06,
"loss": 0.7646658420562744,
"step": 1050
},
{
"epoch": 2.8743169398907105,
"grad_norm": 0.3105663061141968,
"learning_rate": 3.206903764596349e-06,
"loss": 0.6222292184829712,
"step": 1052
},
{
"epoch": 2.879781420765027,
"grad_norm": 0.33960670232772827,
"learning_rate": 3.254758836402225e-06,
"loss": 0.917137861251831,
"step": 1054
},
{
"epoch": 2.8852459016393444,
"grad_norm": 1.0527145862579346,
"learning_rate": 3.3075358616070144e-06,
"loss": 0.715148389339447,
"step": 1056
},
{
"epoch": 2.890710382513661,
"grad_norm": 1.0480117797851562,
"learning_rate": 3.365215228181358e-06,
"loss": 0.6156185865402222,
"step": 1058
},
{
"epoch": 2.8961748633879782,
"grad_norm": 0.330814391374588,
"learning_rate": 3.4277755023777795e-06,
"loss": 0.9344475269317627,
"step": 1060
},
{
"epoch": 2.901639344262295,
"grad_norm": 0.4226922392845154,
"learning_rate": 3.495193436695504e-06,
"loss": 0.8273733258247375,
"step": 1062
},
{
"epoch": 2.907103825136612,
"grad_norm": 0.2566080689430237,
"learning_rate": 3.567443978519267e-06,
"loss": 0.871826708316803,
"step": 1064
},
{
"epoch": 2.912568306010929,
"grad_norm": 0.18654237687587738,
"learning_rate": 3.6445002794288992e-06,
"loss": 0.579515278339386,
"step": 1066
},
{
"epoch": 2.918032786885246,
"grad_norm": 0.2575370669364929,
"learning_rate": 3.7263337051762718e-06,
"loss": 0.5843703150749207,
"step": 1068
},
{
"epoch": 2.9234972677595628,
"grad_norm": 0.9355533123016357,
"learning_rate": 3.8129138463257943e-06,
"loss": 0.8395287990570068,
"step": 1070
},
{
"epoch": 2.92896174863388,
"grad_norm": 0.2116459608078003,
"learning_rate": 3.904208529554625e-06,
"loss": 0.9328120350837708,
"step": 1072
},
{
"epoch": 2.9344262295081966,
"grad_norm": 0.33248427510261536,
"learning_rate": 4.000183829608332e-06,
"loss": 0.5601940751075745,
"step": 1074
},
{
"epoch": 2.939890710382514,
"grad_norm": 3.3151776790618896,
"learning_rate": 4.100804081907595e-06,
"loss": 0.6075171828269958,
"step": 1076
},
{
"epoch": 2.9453551912568305,
"grad_norm": 1.3758751153945923,
"learning_rate": 4.206031895801176e-06,
"loss": 0.5575841665267944,
"step": 1078
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.7501595616340637,
"learning_rate": 4.315828168460367e-06,
"loss": 0.9044106006622314,
"step": 1080
},
{
"epoch": 2.9562841530054644,
"grad_norm": 0.30890920758247375,
"learning_rate": 4.430152099409704e-06,
"loss": 0.6737725734710693,
"step": 1082
},
{
"epoch": 2.9617486338797816,
"grad_norm": 0.3175513446331024,
"learning_rate": 4.548961205688424e-06,
"loss": 0.860198438167572,
"step": 1084
},
{
"epoch": 2.9672131147540983,
"grad_norm": 0.2916860580444336,
"learning_rate": 4.672211337637246e-06,
"loss": 0.572981059551239,
"step": 1086
},
{
"epoch": 2.972677595628415,
"grad_norm": 0.2856867015361786,
"learning_rate": 4.7998566953044445e-06,
"loss": 1.1048438549041748,
"step": 1088
},
{
"epoch": 2.978142076502732,
"grad_norm": 0.3044918179512024,
"learning_rate": 4.931849845465193e-06,
"loss": 0.6697894334793091,
"step": 1090
},
{
"epoch": 2.9836065573770494,
"grad_norm": 0.3794187903404236,
"learning_rate": 5.06814173924782e-06,
"loss": 0.8674835562705994,
"step": 1092
},
{
"epoch": 2.989071038251366,
"grad_norm": 0.3471493124961853,
"learning_rate": 5.208681730360458e-06,
"loss": 0.9353753328323364,
"step": 1094
},
{
"epoch": 2.994535519125683,
"grad_norm": 0.9036471843719482,
"learning_rate": 5.3534175939112694e-06,
"loss": 0.5917394757270813,
"step": 1096
},
{
"epoch": 3.0,
"grad_norm": 0.34582749009132385,
"learning_rate": 5.50229554581536e-06,
"loss": 0.6814420819282532,
"step": 1098
},
{
"epoch": 3.0,
"step": 1098,
"total_flos": 4.957143256761631e+18,
"train_loss": 0.9743868487671642,
"train_runtime": 11662.4045,
"train_samples_per_second": 5.649,
"train_steps_per_second": 0.094
}
],
"logging_steps": 2,
"max_steps": 1098,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.957143256761631e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}