google-links / trainer_state.json
dejanseo's picture
Upload 22 files
f29b6e6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 15948,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009405568096313018,
"grad_norm": 4.406911849975586,
"learning_rate": 1.9938550288437425e-05,
"loss": 0.6174,
"step": 50
},
{
"epoch": 0.018811136192626036,
"grad_norm": 6.521998405456543,
"learning_rate": 1.987584650112867e-05,
"loss": 0.398,
"step": 100
},
{
"epoch": 0.028216704288939052,
"grad_norm": 4.9869771003723145,
"learning_rate": 1.9813142713819916e-05,
"loss": 0.3699,
"step": 150
},
{
"epoch": 0.03762227238525207,
"grad_norm": 4.50770378112793,
"learning_rate": 1.975043892651116e-05,
"loss": 0.3522,
"step": 200
},
{
"epoch": 0.04702784048156509,
"grad_norm": 4.725740909576416,
"learning_rate": 1.9687735139202408e-05,
"loss": 0.3176,
"step": 250
},
{
"epoch": 0.056433408577878104,
"grad_norm": 2.476680040359497,
"learning_rate": 1.9625031351893655e-05,
"loss": 0.3091,
"step": 300
},
{
"epoch": 0.06583897667419113,
"grad_norm": 5.042876720428467,
"learning_rate": 1.9562327564584903e-05,
"loss": 0.3506,
"step": 350
},
{
"epoch": 0.07524454477050414,
"grad_norm": 7.000358581542969,
"learning_rate": 1.949962377727615e-05,
"loss": 0.3493,
"step": 400
},
{
"epoch": 0.08465011286681716,
"grad_norm": 9.695847511291504,
"learning_rate": 1.9436919989967394e-05,
"loss": 0.2922,
"step": 450
},
{
"epoch": 0.09405568096313018,
"grad_norm": 5.6148552894592285,
"learning_rate": 1.9374216202658642e-05,
"loss": 0.3225,
"step": 500
},
{
"epoch": 0.10346124905944319,
"grad_norm": 3.7483432292938232,
"learning_rate": 1.931151241534989e-05,
"loss": 0.3329,
"step": 550
},
{
"epoch": 0.11286681715575621,
"grad_norm": 3.2282767295837402,
"learning_rate": 1.9248808628041137e-05,
"loss": 0.3107,
"step": 600
},
{
"epoch": 0.12227238525206922,
"grad_norm": 9.40439224243164,
"learning_rate": 1.918610484073238e-05,
"loss": 0.3059,
"step": 650
},
{
"epoch": 0.13167795334838225,
"grad_norm": 2.8919079303741455,
"learning_rate": 1.912340105342363e-05,
"loss": 0.2929,
"step": 700
},
{
"epoch": 0.14108352144469527,
"grad_norm": 3.744126558303833,
"learning_rate": 1.9060697266114876e-05,
"loss": 0.3426,
"step": 750
},
{
"epoch": 0.1504890895410083,
"grad_norm": 2.0327718257904053,
"learning_rate": 1.899799347880612e-05,
"loss": 0.3259,
"step": 800
},
{
"epoch": 0.1598946576373213,
"grad_norm": 3.749131679534912,
"learning_rate": 1.8935289691497367e-05,
"loss": 0.2781,
"step": 850
},
{
"epoch": 0.16930022573363432,
"grad_norm": 2.2363057136535645,
"learning_rate": 1.8872585904188615e-05,
"loss": 0.3797,
"step": 900
},
{
"epoch": 0.17870579382994733,
"grad_norm": 8.247345924377441,
"learning_rate": 1.8809882116879862e-05,
"loss": 0.3156,
"step": 950
},
{
"epoch": 0.18811136192626035,
"grad_norm": 4.1785454750061035,
"learning_rate": 1.8747178329571106e-05,
"loss": 0.2728,
"step": 1000
},
{
"epoch": 0.19751693002257337,
"grad_norm": 2.009939670562744,
"learning_rate": 1.8684474542262354e-05,
"loss": 0.2674,
"step": 1050
},
{
"epoch": 0.20692249811888638,
"grad_norm": 14.008905410766602,
"learning_rate": 1.86217707549536e-05,
"loss": 0.3228,
"step": 1100
},
{
"epoch": 0.2163280662151994,
"grad_norm": 7.390902042388916,
"learning_rate": 1.855906696764485e-05,
"loss": 0.2759,
"step": 1150
},
{
"epoch": 0.22573363431151242,
"grad_norm": 5.746609210968018,
"learning_rate": 1.8496363180336093e-05,
"loss": 0.257,
"step": 1200
},
{
"epoch": 0.23513920240782543,
"grad_norm": 5.413491725921631,
"learning_rate": 1.843365939302734e-05,
"loss": 0.2821,
"step": 1250
},
{
"epoch": 0.24454477050413845,
"grad_norm": 17.94203758239746,
"learning_rate": 1.8370955605718588e-05,
"loss": 0.2749,
"step": 1300
},
{
"epoch": 0.25395033860045146,
"grad_norm": 4.912784099578857,
"learning_rate": 1.8308251818409832e-05,
"loss": 0.2733,
"step": 1350
},
{
"epoch": 0.2633559066967645,
"grad_norm": 3.2884740829467773,
"learning_rate": 1.824554803110108e-05,
"loss": 0.2873,
"step": 1400
},
{
"epoch": 0.2727614747930775,
"grad_norm": 3.9251766204833984,
"learning_rate": 1.8182844243792327e-05,
"loss": 0.2802,
"step": 1450
},
{
"epoch": 0.28216704288939054,
"grad_norm": 1.8012003898620605,
"learning_rate": 1.8120140456483574e-05,
"loss": 0.2765,
"step": 1500
},
{
"epoch": 0.29157261098570353,
"grad_norm": 3.162705183029175,
"learning_rate": 1.805743666917482e-05,
"loss": 0.2761,
"step": 1550
},
{
"epoch": 0.3009781790820166,
"grad_norm": 2.2068610191345215,
"learning_rate": 1.7994732881866066e-05,
"loss": 0.263,
"step": 1600
},
{
"epoch": 0.31038374717832956,
"grad_norm": 2.723480224609375,
"learning_rate": 1.7932029094557313e-05,
"loss": 0.2496,
"step": 1650
},
{
"epoch": 0.3197893152746426,
"grad_norm": 2.9920785427093506,
"learning_rate": 1.786932530724856e-05,
"loss": 0.2955,
"step": 1700
},
{
"epoch": 0.3291948833709556,
"grad_norm": 4.665702819824219,
"learning_rate": 1.7806621519939805e-05,
"loss": 0.306,
"step": 1750
},
{
"epoch": 0.33860045146726864,
"grad_norm": 1.996135950088501,
"learning_rate": 1.7743917732631052e-05,
"loss": 0.329,
"step": 1800
},
{
"epoch": 0.3480060195635816,
"grad_norm": 2.1975622177124023,
"learning_rate": 1.76812139453223e-05,
"loss": 0.2896,
"step": 1850
},
{
"epoch": 0.35741158765989467,
"grad_norm": 23.333040237426758,
"learning_rate": 1.7618510158013547e-05,
"loss": 0.2878,
"step": 1900
},
{
"epoch": 0.36681715575620766,
"grad_norm": 1.7410361766815186,
"learning_rate": 1.755580637070479e-05,
"loss": 0.2731,
"step": 1950
},
{
"epoch": 0.3762227238525207,
"grad_norm": 5.49874210357666,
"learning_rate": 1.749310258339604e-05,
"loss": 0.3071,
"step": 2000
},
{
"epoch": 0.3856282919488337,
"grad_norm": 2.9172000885009766,
"learning_rate": 1.7430398796087283e-05,
"loss": 0.2777,
"step": 2050
},
{
"epoch": 0.39503386004514673,
"grad_norm": 2.531278371810913,
"learning_rate": 1.736769500877853e-05,
"loss": 0.2683,
"step": 2100
},
{
"epoch": 0.4044394281414597,
"grad_norm": 3.2860658168792725,
"learning_rate": 1.7304991221469778e-05,
"loss": 0.2599,
"step": 2150
},
{
"epoch": 0.41384499623777277,
"grad_norm": 1.781692624092102,
"learning_rate": 1.7242287434161025e-05,
"loss": 0.2867,
"step": 2200
},
{
"epoch": 0.42325056433408575,
"grad_norm": 2.29233717918396,
"learning_rate": 1.7179583646852273e-05,
"loss": 0.2744,
"step": 2250
},
{
"epoch": 0.4326561324303988,
"grad_norm": 2.741166591644287,
"learning_rate": 1.7116879859543517e-05,
"loss": 0.2595,
"step": 2300
},
{
"epoch": 0.4420617005267118,
"grad_norm": 5.684919834136963,
"learning_rate": 1.7054176072234764e-05,
"loss": 0.3033,
"step": 2350
},
{
"epoch": 0.45146726862302483,
"grad_norm": 2.437774181365967,
"learning_rate": 1.699147228492601e-05,
"loss": 0.2934,
"step": 2400
},
{
"epoch": 0.4608728367193379,
"grad_norm": 6.011141300201416,
"learning_rate": 1.692876849761726e-05,
"loss": 0.2793,
"step": 2450
},
{
"epoch": 0.47027840481565086,
"grad_norm": 24.469600677490234,
"learning_rate": 1.6866064710308507e-05,
"loss": 0.2492,
"step": 2500
},
{
"epoch": 0.4796839729119639,
"grad_norm": 4.883657455444336,
"learning_rate": 1.680336092299975e-05,
"loss": 0.2216,
"step": 2550
},
{
"epoch": 0.4890895410082769,
"grad_norm": 2.0113911628723145,
"learning_rate": 1.6740657135690995e-05,
"loss": 0.2999,
"step": 2600
},
{
"epoch": 0.49849510910458994,
"grad_norm": 2.1354928016662598,
"learning_rate": 1.6677953348382242e-05,
"loss": 0.2635,
"step": 2650
},
{
"epoch": 0.5079006772009029,
"grad_norm": 3.987088918685913,
"learning_rate": 1.661524956107349e-05,
"loss": 0.2405,
"step": 2700
},
{
"epoch": 0.5173062452972159,
"grad_norm": 4.9606709480285645,
"learning_rate": 1.6552545773764737e-05,
"loss": 0.239,
"step": 2750
},
{
"epoch": 0.526711813393529,
"grad_norm": 1.6401499509811401,
"learning_rate": 1.6489841986455985e-05,
"loss": 0.3246,
"step": 2800
},
{
"epoch": 0.536117381489842,
"grad_norm": 5.161315441131592,
"learning_rate": 1.642713819914723e-05,
"loss": 0.2625,
"step": 2850
},
{
"epoch": 0.545522949586155,
"grad_norm": 1.054700255393982,
"learning_rate": 1.6364434411838476e-05,
"loss": 0.2741,
"step": 2900
},
{
"epoch": 0.554928517682468,
"grad_norm": 2.2569172382354736,
"learning_rate": 1.6301730624529724e-05,
"loss": 0.2622,
"step": 2950
},
{
"epoch": 0.5643340857787811,
"grad_norm": 11.479528427124023,
"learning_rate": 1.623902683722097e-05,
"loss": 0.2235,
"step": 3000
},
{
"epoch": 0.5737396538750941,
"grad_norm": 2.314810276031494,
"learning_rate": 1.617632304991222e-05,
"loss": 0.284,
"step": 3050
},
{
"epoch": 0.5831452219714071,
"grad_norm": 2.623328924179077,
"learning_rate": 1.6113619262603463e-05,
"loss": 0.2951,
"step": 3100
},
{
"epoch": 0.59255079006772,
"grad_norm": 6.059717655181885,
"learning_rate": 1.6050915475294707e-05,
"loss": 0.2613,
"step": 3150
},
{
"epoch": 0.6019563581640331,
"grad_norm": 1.6962251663208008,
"learning_rate": 1.5988211687985954e-05,
"loss": 0.2676,
"step": 3200
},
{
"epoch": 0.6113619262603461,
"grad_norm": 6.8796586990356445,
"learning_rate": 1.59255079006772e-05,
"loss": 0.2314,
"step": 3250
},
{
"epoch": 0.6207674943566591,
"grad_norm": 5.26965856552124,
"learning_rate": 1.586280411336845e-05,
"loss": 0.2655,
"step": 3300
},
{
"epoch": 0.6301730624529721,
"grad_norm": 2.5264058113098145,
"learning_rate": 1.5800100326059697e-05,
"loss": 0.2399,
"step": 3350
},
{
"epoch": 0.6395786305492852,
"grad_norm": 7.36959171295166,
"learning_rate": 1.573739653875094e-05,
"loss": 0.2645,
"step": 3400
},
{
"epoch": 0.6489841986455982,
"grad_norm": 6.5851874351501465,
"learning_rate": 1.5674692751442188e-05,
"loss": 0.2544,
"step": 3450
},
{
"epoch": 0.6583897667419112,
"grad_norm": 1.4328551292419434,
"learning_rate": 1.5611988964133436e-05,
"loss": 0.2385,
"step": 3500
},
{
"epoch": 0.6677953348382242,
"grad_norm": 1.747718095779419,
"learning_rate": 1.5549285176824683e-05,
"loss": 0.2371,
"step": 3550
},
{
"epoch": 0.6772009029345373,
"grad_norm": 5.544091701507568,
"learning_rate": 1.548658138951593e-05,
"loss": 0.2347,
"step": 3600
},
{
"epoch": 0.6866064710308503,
"grad_norm": 2.0724775791168213,
"learning_rate": 1.5423877602207175e-05,
"loss": 0.2817,
"step": 3650
},
{
"epoch": 0.6960120391271633,
"grad_norm": 3.6699838638305664,
"learning_rate": 1.536117381489842e-05,
"loss": 0.282,
"step": 3700
},
{
"epoch": 0.7054176072234764,
"grad_norm": 2.081963539123535,
"learning_rate": 1.5298470027589666e-05,
"loss": 0.3104,
"step": 3750
},
{
"epoch": 0.7148231753197893,
"grad_norm": 2.8969521522521973,
"learning_rate": 1.5235766240280914e-05,
"loss": 0.2473,
"step": 3800
},
{
"epoch": 0.7242287434161023,
"grad_norm": 2.79297137260437,
"learning_rate": 1.5173062452972161e-05,
"loss": 0.2199,
"step": 3850
},
{
"epoch": 0.7336343115124153,
"grad_norm": 1.689758062362671,
"learning_rate": 1.5110358665663407e-05,
"loss": 0.2487,
"step": 3900
},
{
"epoch": 0.7430398796087284,
"grad_norm": 2.0919642448425293,
"learning_rate": 1.5047654878354654e-05,
"loss": 0.2558,
"step": 3950
},
{
"epoch": 0.7524454477050414,
"grad_norm": 8.588711738586426,
"learning_rate": 1.49849510910459e-05,
"loss": 0.2798,
"step": 4000
},
{
"epoch": 0.7618510158013544,
"grad_norm": 2.519028425216675,
"learning_rate": 1.4922247303737148e-05,
"loss": 0.2862,
"step": 4050
},
{
"epoch": 0.7712565838976674,
"grad_norm": 5.890926837921143,
"learning_rate": 1.4859543516428393e-05,
"loss": 0.2243,
"step": 4100
},
{
"epoch": 0.7806621519939805,
"grad_norm": 2.727581262588501,
"learning_rate": 1.479683972911964e-05,
"loss": 0.2578,
"step": 4150
},
{
"epoch": 0.7900677200902935,
"grad_norm": 3.8163578510284424,
"learning_rate": 1.4734135941810888e-05,
"loss": 0.2844,
"step": 4200
},
{
"epoch": 0.7994732881866065,
"grad_norm": 3.923978567123413,
"learning_rate": 1.4671432154502132e-05,
"loss": 0.2344,
"step": 4250
},
{
"epoch": 0.8088788562829194,
"grad_norm": 1.962684154510498,
"learning_rate": 1.4608728367193378e-05,
"loss": 0.2366,
"step": 4300
},
{
"epoch": 0.8182844243792325,
"grad_norm": 4.875962734222412,
"learning_rate": 1.4546024579884626e-05,
"loss": 0.266,
"step": 4350
},
{
"epoch": 0.8276899924755455,
"grad_norm": 3.953382730484009,
"learning_rate": 1.4483320792575873e-05,
"loss": 0.2949,
"step": 4400
},
{
"epoch": 0.8370955605718585,
"grad_norm": 4.6789870262146,
"learning_rate": 1.4420617005267119e-05,
"loss": 0.2429,
"step": 4450
},
{
"epoch": 0.8465011286681715,
"grad_norm": 1.2689917087554932,
"learning_rate": 1.4357913217958366e-05,
"loss": 0.2278,
"step": 4500
},
{
"epoch": 0.8559066967644846,
"grad_norm": 1.1619006395339966,
"learning_rate": 1.4295209430649612e-05,
"loss": 0.2125,
"step": 4550
},
{
"epoch": 0.8653122648607976,
"grad_norm": 2.665306329727173,
"learning_rate": 1.423250564334086e-05,
"loss": 0.2627,
"step": 4600
},
{
"epoch": 0.8747178329571106,
"grad_norm": 2.67232084274292,
"learning_rate": 1.4169801856032105e-05,
"loss": 0.2747,
"step": 4650
},
{
"epoch": 0.8841234010534236,
"grad_norm": 1.467073678970337,
"learning_rate": 1.4107098068723353e-05,
"loss": 0.2261,
"step": 4700
},
{
"epoch": 0.8935289691497367,
"grad_norm": 7.690640449523926,
"learning_rate": 1.40443942814146e-05,
"loss": 0.2209,
"step": 4750
},
{
"epoch": 0.9029345372460497,
"grad_norm": 2.4795353412628174,
"learning_rate": 1.3981690494105846e-05,
"loss": 0.247,
"step": 4800
},
{
"epoch": 0.9123401053423627,
"grad_norm": 2.3668243885040283,
"learning_rate": 1.391898670679709e-05,
"loss": 0.2378,
"step": 4850
},
{
"epoch": 0.9217456734386757,
"grad_norm": 2.6995575428009033,
"learning_rate": 1.3856282919488338e-05,
"loss": 0.2156,
"step": 4900
},
{
"epoch": 0.9311512415349887,
"grad_norm": 3.877608299255371,
"learning_rate": 1.3793579132179585e-05,
"loss": 0.2434,
"step": 4950
},
{
"epoch": 0.9405568096313017,
"grad_norm": 5.548897743225098,
"learning_rate": 1.373087534487083e-05,
"loss": 0.2571,
"step": 5000
},
{
"epoch": 0.9499623777276147,
"grad_norm": 1.6183255910873413,
"learning_rate": 1.3668171557562078e-05,
"loss": 0.2118,
"step": 5050
},
{
"epoch": 0.9593679458239278,
"grad_norm": 8.709449768066406,
"learning_rate": 1.3605467770253324e-05,
"loss": 0.2548,
"step": 5100
},
{
"epoch": 0.9687735139202408,
"grad_norm": 1.0707285404205322,
"learning_rate": 1.3542763982944572e-05,
"loss": 0.1974,
"step": 5150
},
{
"epoch": 0.9781790820165538,
"grad_norm": 2.0646872520446777,
"learning_rate": 1.3480060195635817e-05,
"loss": 0.2524,
"step": 5200
},
{
"epoch": 0.9875846501128668,
"grad_norm": 2.3454248905181885,
"learning_rate": 1.3417356408327065e-05,
"loss": 0.2698,
"step": 5250
},
{
"epoch": 0.9969902182091799,
"grad_norm": 3.7309887409210205,
"learning_rate": 1.3354652621018312e-05,
"loss": 0.2508,
"step": 5300
},
{
"epoch": 1.0063957863054929,
"grad_norm": 10.15404987335205,
"learning_rate": 1.3291948833709558e-05,
"loss": 0.2409,
"step": 5350
},
{
"epoch": 1.0158013544018059,
"grad_norm": 2.018286943435669,
"learning_rate": 1.3229245046400802e-05,
"loss": 0.2634,
"step": 5400
},
{
"epoch": 1.0252069224981188,
"grad_norm": 1.0378094911575317,
"learning_rate": 1.316654125909205e-05,
"loss": 0.2317,
"step": 5450
},
{
"epoch": 1.0346124905944318,
"grad_norm": 2.633552074432373,
"learning_rate": 1.3103837471783295e-05,
"loss": 0.2087,
"step": 5500
},
{
"epoch": 1.0440180586907448,
"grad_norm": 2.9494006633758545,
"learning_rate": 1.3041133684474543e-05,
"loss": 0.2294,
"step": 5550
},
{
"epoch": 1.053423626787058,
"grad_norm": 1.539960265159607,
"learning_rate": 1.297842989716579e-05,
"loss": 0.1995,
"step": 5600
},
{
"epoch": 1.062829194883371,
"grad_norm": 1.6446877717971802,
"learning_rate": 1.2915726109857036e-05,
"loss": 0.2609,
"step": 5650
},
{
"epoch": 1.072234762979684,
"grad_norm": 0.5871282815933228,
"learning_rate": 1.2853022322548283e-05,
"loss": 0.2095,
"step": 5700
},
{
"epoch": 1.081640331075997,
"grad_norm": 3.3624796867370605,
"learning_rate": 1.279031853523953e-05,
"loss": 0.2342,
"step": 5750
},
{
"epoch": 1.09104589917231,
"grad_norm": 6.334434509277344,
"learning_rate": 1.2727614747930777e-05,
"loss": 0.2143,
"step": 5800
},
{
"epoch": 1.100451467268623,
"grad_norm": 3.2644360065460205,
"learning_rate": 1.2664910960622022e-05,
"loss": 0.2094,
"step": 5850
},
{
"epoch": 1.109857035364936,
"grad_norm": 2.850273847579956,
"learning_rate": 1.260220717331327e-05,
"loss": 0.2012,
"step": 5900
},
{
"epoch": 1.119262603461249,
"grad_norm": 6.344181537628174,
"learning_rate": 1.2539503386004517e-05,
"loss": 0.1993,
"step": 5950
},
{
"epoch": 1.1286681715575622,
"grad_norm": 3.1634130477905273,
"learning_rate": 1.2476799598695761e-05,
"loss": 0.2035,
"step": 6000
},
{
"epoch": 1.1380737396538751,
"grad_norm": 1.3129241466522217,
"learning_rate": 1.2414095811387007e-05,
"loss": 0.2632,
"step": 6050
},
{
"epoch": 1.1474793077501881,
"grad_norm": 1.7623401880264282,
"learning_rate": 1.2351392024078255e-05,
"loss": 0.1882,
"step": 6100
},
{
"epoch": 1.1568848758465011,
"grad_norm": 1.544403076171875,
"learning_rate": 1.2288688236769502e-05,
"loss": 0.2814,
"step": 6150
},
{
"epoch": 1.1662904439428141,
"grad_norm": 2.739286184310913,
"learning_rate": 1.2225984449460748e-05,
"loss": 0.1824,
"step": 6200
},
{
"epoch": 1.175696012039127,
"grad_norm": 6.419041633605957,
"learning_rate": 1.2163280662151995e-05,
"loss": 0.2174,
"step": 6250
},
{
"epoch": 1.18510158013544,
"grad_norm": 2.975383996963501,
"learning_rate": 1.2100576874843241e-05,
"loss": 0.2511,
"step": 6300
},
{
"epoch": 1.1945071482317533,
"grad_norm": 2.4400739669799805,
"learning_rate": 1.2037873087534489e-05,
"loss": 0.2021,
"step": 6350
},
{
"epoch": 1.2039127163280663,
"grad_norm": 3.1182546615600586,
"learning_rate": 1.1975169300225734e-05,
"loss": 0.2323,
"step": 6400
},
{
"epoch": 1.2133182844243793,
"grad_norm": 1.4824222326278687,
"learning_rate": 1.1912465512916982e-05,
"loss": 0.2289,
"step": 6450
},
{
"epoch": 1.2227238525206923,
"grad_norm": 5.336580753326416,
"learning_rate": 1.184976172560823e-05,
"loss": 0.1726,
"step": 6500
},
{
"epoch": 1.2321294206170053,
"grad_norm": 14.752867698669434,
"learning_rate": 1.1787057938299473e-05,
"loss": 0.253,
"step": 6550
},
{
"epoch": 1.2415349887133182,
"grad_norm": 1.371951699256897,
"learning_rate": 1.172435415099072e-05,
"loss": 0.1525,
"step": 6600
},
{
"epoch": 1.2509405568096312,
"grad_norm": 2.216179847717285,
"learning_rate": 1.1661650363681967e-05,
"loss": 0.2001,
"step": 6650
},
{
"epoch": 1.2603461249059444,
"grad_norm": 6.2752299308776855,
"learning_rate": 1.1598946576373214e-05,
"loss": 0.2261,
"step": 6700
},
{
"epoch": 1.2697516930022572,
"grad_norm": 3.347257137298584,
"learning_rate": 1.153624278906446e-05,
"loss": 0.2291,
"step": 6750
},
{
"epoch": 1.2791572610985704,
"grad_norm": 8.093568801879883,
"learning_rate": 1.1473539001755707e-05,
"loss": 0.1976,
"step": 6800
},
{
"epoch": 1.2885628291948834,
"grad_norm": 1.470790147781372,
"learning_rate": 1.1410835214446953e-05,
"loss": 0.1928,
"step": 6850
},
{
"epoch": 1.2979683972911964,
"grad_norm": 3.1566500663757324,
"learning_rate": 1.13481314271382e-05,
"loss": 0.2028,
"step": 6900
},
{
"epoch": 1.3073739653875094,
"grad_norm": 9.452258110046387,
"learning_rate": 1.1285427639829446e-05,
"loss": 0.213,
"step": 6950
},
{
"epoch": 1.3167795334838224,
"grad_norm": 7.935844898223877,
"learning_rate": 1.1222723852520694e-05,
"loss": 0.193,
"step": 7000
},
{
"epoch": 1.3261851015801354,
"grad_norm": 1.4266091585159302,
"learning_rate": 1.1160020065211941e-05,
"loss": 0.1707,
"step": 7050
},
{
"epoch": 1.3355906696764483,
"grad_norm": 11.033124923706055,
"learning_rate": 1.1097316277903187e-05,
"loss": 0.1936,
"step": 7100
},
{
"epoch": 1.3449962377727616,
"grad_norm": 1.1958593130111694,
"learning_rate": 1.1034612490594431e-05,
"loss": 0.196,
"step": 7150
},
{
"epoch": 1.3544018058690745,
"grad_norm": 1.222621202468872,
"learning_rate": 1.0971908703285679e-05,
"loss": 0.2003,
"step": 7200
},
{
"epoch": 1.3638073739653875,
"grad_norm": 2.297128200531006,
"learning_rate": 1.0909204915976926e-05,
"loss": 0.1994,
"step": 7250
},
{
"epoch": 1.3732129420617005,
"grad_norm": 4.549992561340332,
"learning_rate": 1.0846501128668172e-05,
"loss": 0.1951,
"step": 7300
},
{
"epoch": 1.3826185101580135,
"grad_norm": 2.43581223487854,
"learning_rate": 1.078379734135942e-05,
"loss": 0.2665,
"step": 7350
},
{
"epoch": 1.3920240782543265,
"grad_norm": 2.75065016746521,
"learning_rate": 1.0721093554050665e-05,
"loss": 0.2465,
"step": 7400
},
{
"epoch": 1.4014296463506395,
"grad_norm": 5.422140121459961,
"learning_rate": 1.0658389766741913e-05,
"loss": 0.1747,
"step": 7450
},
{
"epoch": 1.4108352144469527,
"grad_norm": 0.8706988096237183,
"learning_rate": 1.0595685979433158e-05,
"loss": 0.2454,
"step": 7500
},
{
"epoch": 1.4202407825432657,
"grad_norm": 1.9640963077545166,
"learning_rate": 1.0532982192124406e-05,
"loss": 0.2293,
"step": 7550
},
{
"epoch": 1.4296463506395787,
"grad_norm": 2.4464077949523926,
"learning_rate": 1.0470278404815653e-05,
"loss": 0.2183,
"step": 7600
},
{
"epoch": 1.4390519187358917,
"grad_norm": 1.8322765827178955,
"learning_rate": 1.0407574617506899e-05,
"loss": 0.1933,
"step": 7650
},
{
"epoch": 1.4484574868322047,
"grad_norm": 1.6448564529418945,
"learning_rate": 1.0344870830198143e-05,
"loss": 0.2198,
"step": 7700
},
{
"epoch": 1.4578630549285176,
"grad_norm": 1.1031991243362427,
"learning_rate": 1.028216704288939e-05,
"loss": 0.2223,
"step": 7750
},
{
"epoch": 1.4672686230248306,
"grad_norm": 2.653724193572998,
"learning_rate": 1.0219463255580638e-05,
"loss": 0.1873,
"step": 7800
},
{
"epoch": 1.4766741911211438,
"grad_norm": 9.545223236083984,
"learning_rate": 1.0156759468271884e-05,
"loss": 0.2573,
"step": 7850
},
{
"epoch": 1.4860797592174566,
"grad_norm": 0.947347104549408,
"learning_rate": 1.0094055680963131e-05,
"loss": 0.1485,
"step": 7900
},
{
"epoch": 1.4954853273137698,
"grad_norm": 1.778729796409607,
"learning_rate": 1.0031351893654377e-05,
"loss": 0.2725,
"step": 7950
},
{
"epoch": 1.5048908954100828,
"grad_norm": 4.2415995597839355,
"learning_rate": 9.968648106345625e-06,
"loss": 0.1629,
"step": 8000
},
{
"epoch": 1.5142964635063958,
"grad_norm": 2.495288133621216,
"learning_rate": 9.90594431903687e-06,
"loss": 0.1961,
"step": 8050
},
{
"epoch": 1.5237020316027088,
"grad_norm": 15.494341850280762,
"learning_rate": 9.843240531728118e-06,
"loss": 0.2032,
"step": 8100
},
{
"epoch": 1.5331075996990218,
"grad_norm": 0.8584136962890625,
"learning_rate": 9.780536744419364e-06,
"loss": 0.2194,
"step": 8150
},
{
"epoch": 1.542513167795335,
"grad_norm": 1.7161898612976074,
"learning_rate": 9.71783295711061e-06,
"loss": 0.1985,
"step": 8200
},
{
"epoch": 1.5519187358916477,
"grad_norm": 13.85793399810791,
"learning_rate": 9.655129169801857e-06,
"loss": 0.1902,
"step": 8250
},
{
"epoch": 1.561324303987961,
"grad_norm": 1.6476123332977295,
"learning_rate": 9.592425382493104e-06,
"loss": 0.2398,
"step": 8300
},
{
"epoch": 1.5707298720842737,
"grad_norm": 2.9998719692230225,
"learning_rate": 9.52972159518435e-06,
"loss": 0.1788,
"step": 8350
},
{
"epoch": 1.580135440180587,
"grad_norm": 7.067188262939453,
"learning_rate": 9.467017807875598e-06,
"loss": 0.211,
"step": 8400
},
{
"epoch": 1.5895410082769,
"grad_norm": 4.7561936378479,
"learning_rate": 9.404314020566843e-06,
"loss": 0.1814,
"step": 8450
},
{
"epoch": 1.598946576373213,
"grad_norm": 7.8336873054504395,
"learning_rate": 9.341610233258089e-06,
"loss": 0.2003,
"step": 8500
},
{
"epoch": 1.6083521444695261,
"grad_norm": 3.6782350540161133,
"learning_rate": 9.278906445949337e-06,
"loss": 0.2555,
"step": 8550
},
{
"epoch": 1.617757712565839,
"grad_norm": 1.2770379781723022,
"learning_rate": 9.216202658640582e-06,
"loss": 0.1839,
"step": 8600
},
{
"epoch": 1.627163280662152,
"grad_norm": 2.8836193084716797,
"learning_rate": 9.15349887133183e-06,
"loss": 0.2034,
"step": 8650
},
{
"epoch": 1.6365688487584649,
"grad_norm": 3.362605094909668,
"learning_rate": 9.090795084023076e-06,
"loss": 0.1872,
"step": 8700
},
{
"epoch": 1.645974416854778,
"grad_norm": 3.509291172027588,
"learning_rate": 9.028091296714321e-06,
"loss": 0.2519,
"step": 8750
},
{
"epoch": 1.655379984951091,
"grad_norm": 12.957924842834473,
"learning_rate": 8.965387509405569e-06,
"loss": 0.2648,
"step": 8800
},
{
"epoch": 1.664785553047404,
"grad_norm": 3.217221975326538,
"learning_rate": 8.902683722096816e-06,
"loss": 0.1699,
"step": 8850
},
{
"epoch": 1.674191121143717,
"grad_norm": 2.8752570152282715,
"learning_rate": 8.839979934788062e-06,
"loss": 0.1924,
"step": 8900
},
{
"epoch": 1.68359668924003,
"grad_norm": 3.4973011016845703,
"learning_rate": 8.77727614747931e-06,
"loss": 0.183,
"step": 8950
},
{
"epoch": 1.6930022573363432,
"grad_norm": 1.2514209747314453,
"learning_rate": 8.714572360170555e-06,
"loss": 0.1837,
"step": 9000
},
{
"epoch": 1.702407825432656,
"grad_norm": 6.367992877960205,
"learning_rate": 8.651868572861801e-06,
"loss": 0.1828,
"step": 9050
},
{
"epoch": 1.7118133935289692,
"grad_norm": 1.3052902221679688,
"learning_rate": 8.589164785553048e-06,
"loss": 0.1863,
"step": 9100
},
{
"epoch": 1.7212189616252822,
"grad_norm": 1.235916256904602,
"learning_rate": 8.526460998244294e-06,
"loss": 0.2599,
"step": 9150
},
{
"epoch": 1.7306245297215952,
"grad_norm": 1.0772079229354858,
"learning_rate": 8.463757210935542e-06,
"loss": 0.1842,
"step": 9200
},
{
"epoch": 1.7400300978179082,
"grad_norm": 8.388031959533691,
"learning_rate": 8.401053423626787e-06,
"loss": 0.1866,
"step": 9250
},
{
"epoch": 1.7494356659142212,
"grad_norm": 4.407077789306641,
"learning_rate": 8.338349636318033e-06,
"loss": 0.2991,
"step": 9300
},
{
"epoch": 1.7588412340105344,
"grad_norm": 5.840625762939453,
"learning_rate": 8.27564584900928e-06,
"loss": 0.19,
"step": 9350
},
{
"epoch": 1.7682468021068471,
"grad_norm": 2.0648770332336426,
"learning_rate": 8.212942061700526e-06,
"loss": 0.1934,
"step": 9400
},
{
"epoch": 1.7776523702031604,
"grad_norm": 4.299741744995117,
"learning_rate": 8.150238274391774e-06,
"loss": 0.155,
"step": 9450
},
{
"epoch": 1.7870579382994731,
"grad_norm": 1.6990511417388916,
"learning_rate": 8.087534487083021e-06,
"loss": 0.237,
"step": 9500
},
{
"epoch": 1.7964635063957863,
"grad_norm": 2.469029664993286,
"learning_rate": 8.024830699774267e-06,
"loss": 0.2569,
"step": 9550
},
{
"epoch": 1.8058690744920993,
"grad_norm": 0.9023020267486572,
"learning_rate": 7.962126912465513e-06,
"loss": 0.1873,
"step": 9600
},
{
"epoch": 1.8152746425884123,
"grad_norm": 3.4308788776397705,
"learning_rate": 7.89942312515676e-06,
"loss": 0.2105,
"step": 9650
},
{
"epoch": 1.8246802106847255,
"grad_norm": 2.518071174621582,
"learning_rate": 7.836719337848006e-06,
"loss": 0.2872,
"step": 9700
},
{
"epoch": 1.8340857787810383,
"grad_norm": 1.2336055040359497,
"learning_rate": 7.774015550539254e-06,
"loss": 0.1982,
"step": 9750
},
{
"epoch": 1.8434913468773515,
"grad_norm": 4.147019863128662,
"learning_rate": 7.7113117632305e-06,
"loss": 0.1778,
"step": 9800
},
{
"epoch": 1.8528969149736643,
"grad_norm": 2.8657143115997314,
"learning_rate": 7.648607975921745e-06,
"loss": 0.2742,
"step": 9850
},
{
"epoch": 1.8623024830699775,
"grad_norm": 4.490947246551514,
"learning_rate": 7.585904188612993e-06,
"loss": 0.2063,
"step": 9900
},
{
"epoch": 1.8717080511662905,
"grad_norm": 13.179983139038086,
"learning_rate": 7.523200401304239e-06,
"loss": 0.2251,
"step": 9950
},
{
"epoch": 1.8811136192626035,
"grad_norm": 2.9998207092285156,
"learning_rate": 7.460496613995486e-06,
"loss": 0.1819,
"step": 10000
},
{
"epoch": 1.8905191873589164,
"grad_norm": 3.122727394104004,
"learning_rate": 7.3977928266867325e-06,
"loss": 0.1712,
"step": 10050
},
{
"epoch": 1.8999247554552294,
"grad_norm": 12.002041816711426,
"learning_rate": 7.335089039377979e-06,
"loss": 0.1908,
"step": 10100
},
{
"epoch": 1.9093303235515426,
"grad_norm": 3.0402774810791016,
"learning_rate": 7.272385252069225e-06,
"loss": 0.2027,
"step": 10150
},
{
"epoch": 1.9187358916478554,
"grad_norm": 2.971097707748413,
"learning_rate": 7.2096814647604716e-06,
"loss": 0.1789,
"step": 10200
},
{
"epoch": 1.9281414597441686,
"grad_norm": 1.380375862121582,
"learning_rate": 7.146977677451718e-06,
"loss": 0.2104,
"step": 10250
},
{
"epoch": 1.9375470278404816,
"grad_norm": 1.4936792850494385,
"learning_rate": 7.084273890142966e-06,
"loss": 0.171,
"step": 10300
},
{
"epoch": 1.9469525959367946,
"grad_norm": 1.463129997253418,
"learning_rate": 7.021570102834212e-06,
"loss": 0.1944,
"step": 10350
},
{
"epoch": 1.9563581640331076,
"grad_norm": 2.7813374996185303,
"learning_rate": 6.958866315525459e-06,
"loss": 0.1901,
"step": 10400
},
{
"epoch": 1.9657637321294206,
"grad_norm": 5.770429611206055,
"learning_rate": 6.896162528216705e-06,
"loss": 0.1671,
"step": 10450
},
{
"epoch": 1.9751693002257338,
"grad_norm": 12.642657279968262,
"learning_rate": 6.833458740907951e-06,
"loss": 0.1977,
"step": 10500
},
{
"epoch": 1.9845748683220465,
"grad_norm": 5.965068817138672,
"learning_rate": 6.770754953599198e-06,
"loss": 0.1535,
"step": 10550
},
{
"epoch": 1.9939804364183598,
"grad_norm": 1.6920294761657715,
"learning_rate": 6.7080511662904445e-06,
"loss": 0.1837,
"step": 10600
},
{
"epoch": 2.0033860045146725,
"grad_norm": 1.7706254720687866,
"learning_rate": 6.645347378981691e-06,
"loss": 0.1791,
"step": 10650
},
{
"epoch": 2.0127915726109857,
"grad_norm": 8.003987312316895,
"learning_rate": 6.582643591672938e-06,
"loss": 0.2644,
"step": 10700
},
{
"epoch": 2.0221971407072985,
"grad_norm": 1.5629470348358154,
"learning_rate": 6.5199398043641835e-06,
"loss": 0.1612,
"step": 10750
},
{
"epoch": 2.0316027088036117,
"grad_norm": 2.4208626747131348,
"learning_rate": 6.45723601705543e-06,
"loss": 0.2278,
"step": 10800
},
{
"epoch": 2.041008276899925,
"grad_norm": 1.0424669981002808,
"learning_rate": 6.394532229746678e-06,
"loss": 0.1516,
"step": 10850
},
{
"epoch": 2.0504138449962377,
"grad_norm": 2.8615269660949707,
"learning_rate": 6.331828442437924e-06,
"loss": 0.2025,
"step": 10900
},
{
"epoch": 2.059819413092551,
"grad_norm": 13.714409828186035,
"learning_rate": 6.269124655129171e-06,
"loss": 0.1939,
"step": 10950
},
{
"epoch": 2.0692249811888637,
"grad_norm": 0.9942559003829956,
"learning_rate": 6.206420867820417e-06,
"loss": 0.1828,
"step": 11000
},
{
"epoch": 2.078630549285177,
"grad_norm": 2.9414799213409424,
"learning_rate": 6.143717080511663e-06,
"loss": 0.1681,
"step": 11050
},
{
"epoch": 2.0880361173814896,
"grad_norm": 3.001040458679199,
"learning_rate": 6.08101329320291e-06,
"loss": 0.165,
"step": 11100
},
{
"epoch": 2.097441685477803,
"grad_norm": 4.616268634796143,
"learning_rate": 6.0183095058941565e-06,
"loss": 0.1328,
"step": 11150
},
{
"epoch": 2.106847253574116,
"grad_norm": 16.67197608947754,
"learning_rate": 5.955605718585403e-06,
"loss": 0.1496,
"step": 11200
},
{
"epoch": 2.116252821670429,
"grad_norm": 3.5193891525268555,
"learning_rate": 5.89290193127665e-06,
"loss": 0.1478,
"step": 11250
},
{
"epoch": 2.125658389766742,
"grad_norm": 4.846385478973389,
"learning_rate": 5.8301981439678955e-06,
"loss": 0.1585,
"step": 11300
},
{
"epoch": 2.135063957863055,
"grad_norm": 2.8305087089538574,
"learning_rate": 5.767494356659142e-06,
"loss": 0.1743,
"step": 11350
},
{
"epoch": 2.144469525959368,
"grad_norm": 8.07402229309082,
"learning_rate": 5.704790569350389e-06,
"loss": 0.1978,
"step": 11400
},
{
"epoch": 2.153875094055681,
"grad_norm": 5.102453231811523,
"learning_rate": 5.642086782041636e-06,
"loss": 0.23,
"step": 11450
},
{
"epoch": 2.163280662151994,
"grad_norm": 5.685837268829346,
"learning_rate": 5.579382994732883e-06,
"loss": 0.151,
"step": 11500
},
{
"epoch": 2.172686230248307,
"grad_norm": 1.282105803489685,
"learning_rate": 5.516679207424129e-06,
"loss": 0.2055,
"step": 11550
},
{
"epoch": 2.18209179834462,
"grad_norm": 1.42794930934906,
"learning_rate": 5.453975420115375e-06,
"loss": 0.2316,
"step": 11600
},
{
"epoch": 2.191497366440933,
"grad_norm": 0.9819298982620239,
"learning_rate": 5.391271632806622e-06,
"loss": 0.1529,
"step": 11650
},
{
"epoch": 2.200902934537246,
"grad_norm": 6.298890113830566,
"learning_rate": 5.3285678454978684e-06,
"loss": 0.1741,
"step": 11700
},
{
"epoch": 2.210308502633559,
"grad_norm": 2.593261480331421,
"learning_rate": 5.265864058189115e-06,
"loss": 0.2045,
"step": 11750
},
{
"epoch": 2.219714070729872,
"grad_norm": 1.2109441757202148,
"learning_rate": 5.203160270880362e-06,
"loss": 0.1566,
"step": 11800
},
{
"epoch": 2.229119638826185,
"grad_norm": 2.688478469848633,
"learning_rate": 5.140456483571608e-06,
"loss": 0.1741,
"step": 11850
},
{
"epoch": 2.238525206922498,
"grad_norm": 1.0694407224655151,
"learning_rate": 5.077752696262854e-06,
"loss": 0.1442,
"step": 11900
},
{
"epoch": 2.247930775018811,
"grad_norm": 3.074937343597412,
"learning_rate": 5.015048908954101e-06,
"loss": 0.1743,
"step": 11950
},
{
"epoch": 2.2573363431151243,
"grad_norm": 1.3289566040039062,
"learning_rate": 4.952345121645348e-06,
"loss": 0.1526,
"step": 12000
},
{
"epoch": 2.266741911211437,
"grad_norm": 1.629441261291504,
"learning_rate": 4.889641334336595e-06,
"loss": 0.2462,
"step": 12050
},
{
"epoch": 2.2761474793077503,
"grad_norm": 5.780508518218994,
"learning_rate": 4.8269375470278405e-06,
"loss": 0.1472,
"step": 12100
},
{
"epoch": 2.285553047404063,
"grad_norm": 10.06039810180664,
"learning_rate": 4.764233759719087e-06,
"loss": 0.1585,
"step": 12150
},
{
"epoch": 2.2949586155003763,
"grad_norm": 11.558574676513672,
"learning_rate": 4.701529972410334e-06,
"loss": 0.1261,
"step": 12200
},
{
"epoch": 2.3043641835966895,
"grad_norm": 1.2220011949539185,
"learning_rate": 4.63882618510158e-06,
"loss": 0.1866,
"step": 12250
},
{
"epoch": 2.3137697516930023,
"grad_norm": 4.4303460121154785,
"learning_rate": 4.576122397792827e-06,
"loss": 0.1733,
"step": 12300
},
{
"epoch": 2.3231753197893155,
"grad_norm": 2.550745964050293,
"learning_rate": 4.513418610484074e-06,
"loss": 0.2603,
"step": 12350
},
{
"epoch": 2.3325808878856282,
"grad_norm": 3.003775119781494,
"learning_rate": 4.45071482317532e-06,
"loss": 0.141,
"step": 12400
},
{
"epoch": 2.3419864559819414,
"grad_norm": 1.6381702423095703,
"learning_rate": 4.388011035866567e-06,
"loss": 0.1607,
"step": 12450
},
{
"epoch": 2.351392024078254,
"grad_norm": 6.195992469787598,
"learning_rate": 4.3253072485578135e-06,
"loss": 0.1694,
"step": 12500
},
{
"epoch": 2.3607975921745674,
"grad_norm": 1.3570517301559448,
"learning_rate": 4.262603461249059e-06,
"loss": 0.1823,
"step": 12550
},
{
"epoch": 2.37020316027088,
"grad_norm": 1.8480778932571411,
"learning_rate": 4.199899673940307e-06,
"loss": 0.1476,
"step": 12600
},
{
"epoch": 2.3796087283671934,
"grad_norm": 1.602105736732483,
"learning_rate": 4.137195886631553e-06,
"loss": 0.145,
"step": 12650
},
{
"epoch": 2.3890142964635066,
"grad_norm": 2.0385992527008057,
"learning_rate": 4.074492099322799e-06,
"loss": 0.1685,
"step": 12700
},
{
"epoch": 2.3984198645598194,
"grad_norm": 1.7218743562698364,
"learning_rate": 4.011788312014046e-06,
"loss": 0.1782,
"step": 12750
},
{
"epoch": 2.4078254326561326,
"grad_norm": 5.287595272064209,
"learning_rate": 3.949084524705293e-06,
"loss": 0.173,
"step": 12800
},
{
"epoch": 2.4172310007524453,
"grad_norm": 7.334980010986328,
"learning_rate": 3.886380737396539e-06,
"loss": 0.2384,
"step": 12850
},
{
"epoch": 2.4266365688487586,
"grad_norm": 5.126362323760986,
"learning_rate": 3.823676950087786e-06,
"loss": 0.14,
"step": 12900
},
{
"epoch": 2.4360421369450713,
"grad_norm": 1.547075867652893,
"learning_rate": 3.760973162779032e-06,
"loss": 0.1706,
"step": 12950
},
{
"epoch": 2.4454477050413845,
"grad_norm": 4.503610134124756,
"learning_rate": 3.698269375470279e-06,
"loss": 0.2401,
"step": 13000
},
{
"epoch": 2.4548532731376973,
"grad_norm": 2.71994686126709,
"learning_rate": 3.6355655881615255e-06,
"loss": 0.1496,
"step": 13050
},
{
"epoch": 2.4642588412340105,
"grad_norm": 8.66321849822998,
"learning_rate": 3.5728618008527716e-06,
"loss": 0.1299,
"step": 13100
},
{
"epoch": 2.4736644093303237,
"grad_norm": 11.310059547424316,
"learning_rate": 3.5101580135440183e-06,
"loss": 0.1402,
"step": 13150
},
{
"epoch": 2.4830699774266365,
"grad_norm": 5.4790873527526855,
"learning_rate": 3.447454226235265e-06,
"loss": 0.1768,
"step": 13200
},
{
"epoch": 2.4924755455229497,
"grad_norm": 3.9280734062194824,
"learning_rate": 3.384750438926511e-06,
"loss": 0.1844,
"step": 13250
},
{
"epoch": 2.5018811136192625,
"grad_norm": 2.2618765830993652,
"learning_rate": 3.322046651617758e-06,
"loss": 0.1641,
"step": 13300
},
{
"epoch": 2.5112866817155757,
"grad_norm": 1.4370334148406982,
"learning_rate": 3.2593428643090047e-06,
"loss": 0.1846,
"step": 13350
},
{
"epoch": 2.520692249811889,
"grad_norm": 0.5764915347099304,
"learning_rate": 3.196639077000251e-06,
"loss": 0.1174,
"step": 13400
},
{
"epoch": 2.5300978179082017,
"grad_norm": 13.661349296569824,
"learning_rate": 3.1339352896914976e-06,
"loss": 0.1453,
"step": 13450
},
{
"epoch": 2.5395033860045144,
"grad_norm": 1.970798373222351,
"learning_rate": 3.071231502382744e-06,
"loss": 0.1712,
"step": 13500
},
{
"epoch": 2.5489089541008276,
"grad_norm": 5.565087795257568,
"learning_rate": 3.0085277150739904e-06,
"loss": 0.1335,
"step": 13550
},
{
"epoch": 2.558314522197141,
"grad_norm": 1.0521272420883179,
"learning_rate": 2.9458239277652374e-06,
"loss": 0.142,
"step": 13600
},
{
"epoch": 2.5677200902934536,
"grad_norm": 2.4545013904571533,
"learning_rate": 2.883120140456484e-06,
"loss": 0.1364,
"step": 13650
},
{
"epoch": 2.577125658389767,
"grad_norm": 2.9941937923431396,
"learning_rate": 2.8204163531477302e-06,
"loss": 0.1781,
"step": 13700
},
{
"epoch": 2.5865312264860796,
"grad_norm": 1.3698679208755493,
"learning_rate": 2.757712565838977e-06,
"loss": 0.223,
"step": 13750
},
{
"epoch": 2.595936794582393,
"grad_norm": 1.4991743564605713,
"learning_rate": 2.695008778530224e-06,
"loss": 0.1749,
"step": 13800
},
{
"epoch": 2.605342362678706,
"grad_norm": 1.5825111865997314,
"learning_rate": 2.63230499122147e-06,
"loss": 0.2057,
"step": 13850
},
{
"epoch": 2.6147479307750188,
"grad_norm": 9.330909729003906,
"learning_rate": 2.5696012039127167e-06,
"loss": 0.1412,
"step": 13900
},
{
"epoch": 2.624153498871332,
"grad_norm": 4.595930576324463,
"learning_rate": 2.5068974166039633e-06,
"loss": 0.1576,
"step": 13950
},
{
"epoch": 2.6335590669676447,
"grad_norm": 2.5993683338165283,
"learning_rate": 2.4441936292952095e-06,
"loss": 0.1585,
"step": 14000
},
{
"epoch": 2.642964635063958,
"grad_norm": 1.360910177230835,
"learning_rate": 2.381489841986456e-06,
"loss": 0.1438,
"step": 14050
},
{
"epoch": 2.6523702031602707,
"grad_norm": 7.876944541931152,
"learning_rate": 2.3187860546777028e-06,
"loss": 0.2808,
"step": 14100
},
{
"epoch": 2.661775771256584,
"grad_norm": 7.476833820343018,
"learning_rate": 2.2560822673689494e-06,
"loss": 0.1367,
"step": 14150
},
{
"epoch": 2.6711813393528967,
"grad_norm": 12.081886291503906,
"learning_rate": 2.193378480060196e-06,
"loss": 0.1393,
"step": 14200
},
{
"epoch": 2.68058690744921,
"grad_norm": 2.116596221923828,
"learning_rate": 2.130674692751442e-06,
"loss": 0.1329,
"step": 14250
},
{
"epoch": 2.689992475545523,
"grad_norm": 9.647782325744629,
"learning_rate": 2.067970905442689e-06,
"loss": 0.1475,
"step": 14300
},
{
"epoch": 2.699398043641836,
"grad_norm": 5.909144878387451,
"learning_rate": 2.0052671181339354e-06,
"loss": 0.2392,
"step": 14350
},
{
"epoch": 2.708803611738149,
"grad_norm": 3.9596285820007324,
"learning_rate": 1.942563330825182e-06,
"loss": 0.225,
"step": 14400
},
{
"epoch": 2.718209179834462,
"grad_norm": 0.5444441437721252,
"learning_rate": 1.8798595435164285e-06,
"loss": 0.1488,
"step": 14450
},
{
"epoch": 2.727614747930775,
"grad_norm": 1.5310776233673096,
"learning_rate": 1.817155756207675e-06,
"loss": 0.1905,
"step": 14500
},
{
"epoch": 2.7370203160270883,
"grad_norm": 9.348706245422363,
"learning_rate": 1.7544519688989217e-06,
"loss": 0.1475,
"step": 14550
},
{
"epoch": 2.746425884123401,
"grad_norm": 3.9080257415771484,
"learning_rate": 1.691748181590168e-06,
"loss": 0.1715,
"step": 14600
},
{
"epoch": 2.755831452219714,
"grad_norm": 1.6289005279541016,
"learning_rate": 1.6290443942814147e-06,
"loss": 0.1468,
"step": 14650
},
{
"epoch": 2.765237020316027,
"grad_norm": 8.900264739990234,
"learning_rate": 1.5663406069726613e-06,
"loss": 0.175,
"step": 14700
},
{
"epoch": 2.7746425884123402,
"grad_norm": 7.342721462249756,
"learning_rate": 1.5036368196639077e-06,
"loss": 0.1888,
"step": 14750
},
{
"epoch": 2.784048156508653,
"grad_norm": 9.584351539611816,
"learning_rate": 1.4409330323551544e-06,
"loss": 0.1692,
"step": 14800
},
{
"epoch": 2.793453724604966,
"grad_norm": 1.2614892721176147,
"learning_rate": 1.378229245046401e-06,
"loss": 0.1522,
"step": 14850
},
{
"epoch": 2.802859292701279,
"grad_norm": 4.842260360717773,
"learning_rate": 1.3155254577376476e-06,
"loss": 0.1783,
"step": 14900
},
{
"epoch": 2.812264860797592,
"grad_norm": 1.9376300573349,
"learning_rate": 1.252821670428894e-06,
"loss": 0.1997,
"step": 14950
},
{
"epoch": 2.8216704288939054,
"grad_norm": 1.777771234512329,
"learning_rate": 1.1901178831201406e-06,
"loss": 0.1452,
"step": 15000
},
{
"epoch": 2.831075996990218,
"grad_norm": 8.967829704284668,
"learning_rate": 1.1274140958113872e-06,
"loss": 0.1644,
"step": 15050
},
{
"epoch": 2.8404815650865314,
"grad_norm": 7.596806526184082,
"learning_rate": 1.0647103085026337e-06,
"loss": 0.2017,
"step": 15100
},
{
"epoch": 2.849887133182844,
"grad_norm": 7.003665447235107,
"learning_rate": 1.0020065211938803e-06,
"loss": 0.1263,
"step": 15150
},
{
"epoch": 2.8592927012791574,
"grad_norm": 2.5569920539855957,
"learning_rate": 9.393027338851267e-07,
"loss": 0.2034,
"step": 15200
},
{
"epoch": 2.86869826937547,
"grad_norm": 4.9097490310668945,
"learning_rate": 8.765989465763733e-07,
"loss": 0.1402,
"step": 15250
},
{
"epoch": 2.8781038374717833,
"grad_norm": 2.4663660526275635,
"learning_rate": 8.138951592676199e-07,
"loss": 0.1558,
"step": 15300
},
{
"epoch": 2.887509405568096,
"grad_norm": 2.791815996170044,
"learning_rate": 7.511913719588663e-07,
"loss": 0.129,
"step": 15350
},
{
"epoch": 2.8969149736644093,
"grad_norm": 2.119055986404419,
"learning_rate": 6.884875846501129e-07,
"loss": 0.2119,
"step": 15400
},
{
"epoch": 2.9063205417607225,
"grad_norm": 12.687973976135254,
"learning_rate": 6.257837973413595e-07,
"loss": 0.1316,
"step": 15450
},
{
"epoch": 2.9157261098570353,
"grad_norm": 1.892902135848999,
"learning_rate": 5.63080010032606e-07,
"loss": 0.1586,
"step": 15500
},
{
"epoch": 2.9251316779533485,
"grad_norm": 1.8958168029785156,
"learning_rate": 5.003762227238526e-07,
"loss": 0.1394,
"step": 15550
},
{
"epoch": 2.9345372460496613,
"grad_norm": 3.460698366165161,
"learning_rate": 4.3767243541509916e-07,
"loss": 0.1432,
"step": 15600
},
{
"epoch": 2.9439428141459745,
"grad_norm": 8.283778190612793,
"learning_rate": 3.7496864810634567e-07,
"loss": 0.1961,
"step": 15650
},
{
"epoch": 2.9533483822422877,
"grad_norm": 4.339179515838623,
"learning_rate": 3.122648607975922e-07,
"loss": 0.1535,
"step": 15700
},
{
"epoch": 2.9627539503386005,
"grad_norm": 4.080807685852051,
"learning_rate": 2.4956107348883875e-07,
"loss": 0.1448,
"step": 15750
},
{
"epoch": 2.972159518434913,
"grad_norm": 1.3516851663589478,
"learning_rate": 1.868572861800853e-07,
"loss": 0.1516,
"step": 15800
},
{
"epoch": 2.9815650865312264,
"grad_norm": 6.547214031219482,
"learning_rate": 1.2415349887133183e-07,
"loss": 0.1415,
"step": 15850
},
{
"epoch": 2.9909706546275396,
"grad_norm": 2.244124412536621,
"learning_rate": 6.144971156257838e-08,
"loss": 0.1463,
"step": 15900
},
{
"epoch": 3.0,
"step": 15948,
"total_flos": 1.6575120153138816e+16,
"train_loss": 0.21908686365553812,
"train_runtime": 7613.9058,
"train_samples_per_second": 8.378,
"train_steps_per_second": 2.095
},
{
"epoch": 3.0,
"eval_accuracy": 0.9558278538012962,
"eval_f1": 0.5528281962901715,
"eval_loss": 0.3851251006126404,
"eval_pos_rate_pred": 0.06444756279730685,
"eval_pos_rate_true": 0.034333589843004275,
"eval_precision": 0.42367006657301653,
"eval_recall": 0.7952708512467755,
"eval_runtime": 37.8976,
"eval_samples_per_second": 63.381,
"eval_steps_per_second": 7.942,
"step": 15948
}
],
"logging_steps": 50,
"max_steps": 15948,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6575120153138816e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}