{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.077741172308577,
"eval_steps": 500,
"global_step": 34800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017688548875671613,
"grad_norm": 4.9453349113464355,
"learning_rate": 1.768972227136034e-06,
"loss": 6.7403,
"step": 200
},
{
"epoch": 0.03537709775134323,
"grad_norm": 8.81293773651123,
"learning_rate": 3.537944454272068e-06,
"loss": 4.3531,
"step": 400
},
{
"epoch": 0.05306564662701484,
"grad_norm": 5.486693382263184,
"learning_rate": 5.306916681408102e-06,
"loss": 2.2861,
"step": 600
},
{
"epoch": 0.07075419550268645,
"grad_norm": 2.244082450866699,
"learning_rate": 7.075888908544136e-06,
"loss": 1.162,
"step": 800
},
{
"epoch": 0.08844274437835806,
"grad_norm": 0.6459134221076965,
"learning_rate": 8.84486113568017e-06,
"loss": 0.3574,
"step": 1000
},
{
"epoch": 0.10613129325402967,
"grad_norm": 0.42250216007232666,
"learning_rate": 1.0613833362816204e-05,
"loss": 0.22,
"step": 1200
},
{
"epoch": 0.12381984212970129,
"grad_norm": 0.4047611951828003,
"learning_rate": 1.2382805589952239e-05,
"loss": 0.1967,
"step": 1400
},
{
"epoch": 0.1415083910053729,
"grad_norm": 0.34184110164642334,
"learning_rate": 1.4151777817088272e-05,
"loss": 0.1853,
"step": 1600
},
{
"epoch": 0.1591969398810445,
"grad_norm": 0.32166364789009094,
"learning_rate": 1.5920750044224307e-05,
"loss": 0.1795,
"step": 1800
},
{
"epoch": 0.1768854887567161,
"grad_norm": 0.30309006571769714,
"learning_rate": 1.768972227136034e-05,
"loss": 0.1777,
"step": 2000
},
{
"epoch": 0.19457403763238773,
"grad_norm": 0.28535833954811096,
"learning_rate": 1.9458694498496373e-05,
"loss": 0.1728,
"step": 2200
},
{
"epoch": 0.21226258650805935,
"grad_norm": 0.2785949409008026,
"learning_rate": 2.1227666725632408e-05,
"loss": 0.1707,
"step": 2400
},
{
"epoch": 0.22995113538373096,
"grad_norm": 0.28039732575416565,
"learning_rate": 2.2996638952768443e-05,
"loss": 0.1682,
"step": 2600
},
{
"epoch": 0.24763968425940258,
"grad_norm": 0.2688646614551544,
"learning_rate": 2.4765611179904478e-05,
"loss": 0.1663,
"step": 2800
},
{
"epoch": 0.2653282331350742,
"grad_norm": 0.2585384249687195,
"learning_rate": 2.653458340704051e-05,
"loss": 0.1672,
"step": 3000
},
{
"epoch": 0.2830167820107458,
"grad_norm": 0.2583986222743988,
"learning_rate": 2.8303555634176544e-05,
"loss": 0.1646,
"step": 3200
},
{
"epoch": 0.30070533088641743,
"grad_norm": 0.23997661471366882,
"learning_rate": 3.007252786131258e-05,
"loss": 0.1647,
"step": 3400
},
{
"epoch": 0.318393879762089,
"grad_norm": 0.2527500092983246,
"learning_rate": 3.1841500088448614e-05,
"loss": 0.1629,
"step": 3600
},
{
"epoch": 0.3360824286377606,
"grad_norm": 0.23295536637306213,
"learning_rate": 3.3610472315584645e-05,
"loss": 0.1615,
"step": 3800
},
{
"epoch": 0.3537709775134322,
"grad_norm": 0.24165655672550201,
"learning_rate": 3.537944454272068e-05,
"loss": 0.1626,
"step": 4000
},
{
"epoch": 0.37145952638910384,
"grad_norm": 0.22286227345466614,
"learning_rate": 3.7148416769856715e-05,
"loss": 0.1611,
"step": 4200
},
{
"epoch": 0.38914807526477546,
"grad_norm": 0.23912444710731506,
"learning_rate": 3.8917388996992746e-05,
"loss": 0.1639,
"step": 4400
},
{
"epoch": 0.4068366241404471,
"grad_norm": 0.22330383956432343,
"learning_rate": 4.0686361224128784e-05,
"loss": 0.1613,
"step": 4600
},
{
"epoch": 0.4245251730161187,
"grad_norm": 0.2139132171869278,
"learning_rate": 4.2455333451264816e-05,
"loss": 0.1601,
"step": 4800
},
{
"epoch": 0.4422137218917903,
"grad_norm": 0.19250735640525818,
"learning_rate": 4.4224305678400854e-05,
"loss": 0.1595,
"step": 5000
},
{
"epoch": 0.45990227076746193,
"grad_norm": 0.19502046704292297,
"learning_rate": 4.5993277905536885e-05,
"loss": 0.159,
"step": 5200
},
{
"epoch": 0.47759081964313355,
"grad_norm": 0.19040720164775848,
"learning_rate": 4.776225013267292e-05,
"loss": 0.1578,
"step": 5400
},
{
"epoch": 0.49527936851880516,
"grad_norm": 0.20574906468391418,
"learning_rate": 4.9531222359808955e-05,
"loss": 0.1603,
"step": 5600
},
{
"epoch": 0.5129679173944768,
"grad_norm": 0.19954811036586761,
"learning_rate": 4.999976891136569e-05,
"loss": 0.1582,
"step": 5800
},
{
"epoch": 0.5306564662701484,
"grad_norm": 0.18808187544345856,
"learning_rate": 4.999871234414489e-05,
"loss": 0.1589,
"step": 6000
},
{
"epoch": 0.54834501514582,
"grad_norm": 0.17781756818294525,
"learning_rate": 4.999680029474971e-05,
"loss": 0.158,
"step": 6200
},
{
"epoch": 0.5660335640214916,
"grad_norm": 0.18824966251850128,
"learning_rate": 4.999403282861248e-05,
"loss": 0.1572,
"step": 6400
},
{
"epoch": 0.5837221128971632,
"grad_norm": 0.20633359253406525,
"learning_rate": 4.999041004043882e-05,
"loss": 0.1572,
"step": 6600
},
{
"epoch": 0.6014106617728349,
"grad_norm": 0.17536261677742004,
"learning_rate": 4.998593205420432e-05,
"loss": 0.1565,
"step": 6800
},
{
"epoch": 0.6190992106485064,
"grad_norm": 0.1713794469833374,
"learning_rate": 4.998059902315038e-05,
"loss": 0.1554,
"step": 7000
},
{
"epoch": 0.636787759524178,
"grad_norm": 0.16785737872123718,
"learning_rate": 4.997441112977891e-05,
"loss": 0.1526,
"step": 7200
},
{
"epoch": 0.6544763083998496,
"grad_norm": 0.18514026701450348,
"learning_rate": 4.996736858584613e-05,
"loss": 0.153,
"step": 7400
},
{
"epoch": 0.6721648572755212,
"grad_norm": 0.17102603614330292,
"learning_rate": 4.995947163235527e-05,
"loss": 0.1517,
"step": 7600
},
{
"epoch": 0.6898534061511928,
"grad_norm": 0.16847601532936096,
"learning_rate": 4.9950720539548384e-05,
"loss": 0.1519,
"step": 7800
},
{
"epoch": 0.7075419550268645,
"grad_norm": 0.16419926285743713,
"learning_rate": 4.9941115606897036e-05,
"loss": 0.1507,
"step": 8000
},
{
"epoch": 0.7252305039025361,
"grad_norm": 0.17619839310646057,
"learning_rate": 4.9930657163092123e-05,
"loss": 0.15,
"step": 8200
},
{
"epoch": 0.7429190527782077,
"grad_norm": 0.16663217544555664,
"learning_rate": 4.991934556603254e-05,
"loss": 0.1502,
"step": 8400
},
{
"epoch": 0.7606076016538793,
"grad_norm": 0.18116818368434906,
"learning_rate": 4.990718120281304e-05,
"loss": 0.1511,
"step": 8600
},
{
"epoch": 0.7782961505295509,
"grad_norm": 0.1595318764448166,
"learning_rate": 4.989416448971088e-05,
"loss": 0.1502,
"step": 8800
},
{
"epoch": 0.7959846994052225,
"grad_norm": 0.15985798835754395,
"learning_rate": 4.988029587217165e-05,
"loss": 0.1513,
"step": 9000
},
{
"epoch": 0.8136732482808942,
"grad_norm": 0.17328430712223053,
"learning_rate": 4.9865575824794e-05,
"loss": 0.1491,
"step": 9200
},
{
"epoch": 0.8313617971565658,
"grad_norm": 0.1536557823419571,
"learning_rate": 4.98500048513134e-05,
"loss": 0.1474,
"step": 9400
},
{
"epoch": 0.8490503460322374,
"grad_norm": 0.15362966060638428,
"learning_rate": 4.983358348458491e-05,
"loss": 0.1458,
"step": 9600
},
{
"epoch": 0.866738894907909,
"grad_norm": 0.16235561668872833,
"learning_rate": 4.9816312286564926e-05,
"loss": 0.1454,
"step": 9800
},
{
"epoch": 0.8844274437835806,
"grad_norm": 0.6078733801841736,
"learning_rate": 4.979819184829197e-05,
"loss": 0.148,
"step": 10000
},
{
"epoch": 0.9021159926592522,
"grad_norm": 0.15071770548820496,
"learning_rate": 4.9779222789866476e-05,
"loss": 0.1461,
"step": 10200
},
{
"epoch": 0.9198045415349239,
"grad_norm": 0.15512414276599884,
"learning_rate": 4.9759405760429524e-05,
"loss": 0.1455,
"step": 10400
},
{
"epoch": 0.9374930904105955,
"grad_norm": 0.15411154925823212,
"learning_rate": 4.9738741438140644e-05,
"loss": 0.1437,
"step": 10600
},
{
"epoch": 0.9551816392862671,
"grad_norm": 0.15488554537296295,
"learning_rate": 4.9717230530154657e-05,
"loss": 0.1445,
"step": 10800
},
{
"epoch": 0.9728701881619387,
"grad_norm": 0.14975008368492126,
"learning_rate": 4.9694873772597396e-05,
"loss": 0.1437,
"step": 11000
},
{
"epoch": 0.9905587370376103,
"grad_norm": 0.15676584839820862,
"learning_rate": 4.967167193054058e-05,
"loss": 0.1432,
"step": 11200
},
{
"epoch": 1.0082251752271874,
"grad_norm": 0.15392336249351501,
"learning_rate": 4.964762579797558e-05,
"loss": 0.1355,
"step": 11400
},
{
"epoch": 1.0259137241028589,
"grad_norm": 0.1635395586490631,
"learning_rate": 4.962273619778632e-05,
"loss": 0.1268,
"step": 11600
},
{
"epoch": 1.0436022729785306,
"grad_norm": 0.15909574925899506,
"learning_rate": 4.959700398172101e-05,
"loss": 0.1263,
"step": 11800
},
{
"epoch": 1.061290821854202,
"grad_norm": 0.16021914780139923,
"learning_rate": 4.957043003036311e-05,
"loss": 0.1264,
"step": 12000
},
{
"epoch": 1.0789793707298738,
"grad_norm": 0.163675919175148,
"learning_rate": 4.954301525310113e-05,
"loss": 0.1262,
"step": 12200
},
{
"epoch": 1.0966679196055453,
"grad_norm": 0.15520550310611725,
"learning_rate": 4.951476058809751e-05,
"loss": 0.1275,
"step": 12400
},
{
"epoch": 1.114356468481217,
"grad_norm": 0.1602706015110016,
"learning_rate": 4.948566700225654e-05,
"loss": 0.1268,
"step": 12600
},
{
"epoch": 1.1320450173568886,
"grad_norm": 0.15892288088798523,
"learning_rate": 4.945573549119128e-05,
"loss": 0.1269,
"step": 12800
},
{
"epoch": 1.1497335662325603,
"grad_norm": 0.16024959087371826,
"learning_rate": 4.9424967079189434e-05,
"loss": 0.1265,
"step": 13000
},
{
"epoch": 1.1674221151082318,
"grad_norm": 0.15324904024600983,
"learning_rate": 4.939336281917837e-05,
"loss": 0.1265,
"step": 13200
},
{
"epoch": 1.1851106639839033,
"grad_norm": 0.15968984365463257,
"learning_rate": 4.936092379268902e-05,
"loss": 0.1269,
"step": 13400
},
{
"epoch": 1.202799212859575,
"grad_norm": 0.1491909772157669,
"learning_rate": 4.932765110981894e-05,
"loss": 0.1261,
"step": 13600
},
{
"epoch": 1.2204877617352468,
"grad_norm": 0.15451796352863312,
"learning_rate": 4.929354590919424e-05,
"loss": 0.1273,
"step": 13800
},
{
"epoch": 1.2381763106109183,
"grad_norm": 0.1513613909482956,
"learning_rate": 4.9258609357930686e-05,
"loss": 0.1264,
"step": 14000
},
{
"epoch": 1.2558648594865898,
"grad_norm": 0.1397712677717209,
"learning_rate": 4.9222842651593736e-05,
"loss": 0.1268,
"step": 14200
},
{
"epoch": 1.2735534083622615,
"grad_norm": 0.13348053395748138,
"learning_rate": 4.918624701415763e-05,
"loss": 0.1267,
"step": 14400
},
{
"epoch": 1.2912419572379332,
"grad_norm": 0.15651994943618774,
"learning_rate": 4.9148823697963465e-05,
"loss": 0.1258,
"step": 14600
},
{
"epoch": 1.3089305061136047,
"grad_norm": 0.16271665692329407,
"learning_rate": 4.9110573983676414e-05,
"loss": 0.1258,
"step": 14800
},
{
"epoch": 1.3266190549892762,
"grad_norm": 0.15141454339027405,
"learning_rate": 4.907149918024185e-05,
"loss": 0.1252,
"step": 15000
},
{
"epoch": 1.344307603864948,
"grad_norm": 0.14476826786994934,
"learning_rate": 4.903160062484056e-05,
"loss": 0.1263,
"step": 15200
},
{
"epoch": 1.3619961527406195,
"grad_norm": 0.14604584872722626,
"learning_rate": 4.8990879682842964e-05,
"loss": 0.1267,
"step": 15400
},
{
"epoch": 1.3796847016162912,
"grad_norm": 0.1551065295934677,
"learning_rate": 4.8949337747762465e-05,
"loss": 0.1268,
"step": 15600
},
{
"epoch": 1.3973732504919627,
"grad_norm": 0.15430127084255219,
"learning_rate": 4.890697624120767e-05,
"loss": 0.1258,
"step": 15800
},
{
"epoch": 1.4150617993676344,
"grad_norm": 0.1534918248653412,
"learning_rate": 4.886379661283379e-05,
"loss": 0.1245,
"step": 16000
},
{
"epoch": 1.432750348243306,
"grad_norm": 0.15945963561534882,
"learning_rate": 4.881980034029303e-05,
"loss": 0.1251,
"step": 16200
},
{
"epoch": 1.4504388971189777,
"grad_norm": 0.15190242230892181,
"learning_rate": 4.877498892918403e-05,
"loss": 0.1246,
"step": 16400
},
{
"epoch": 1.4681274459946492,
"grad_norm": 0.14410296082496643,
"learning_rate": 4.872936391300029e-05,
"loss": 0.1251,
"step": 16600
},
{
"epoch": 1.485815994870321,
"grad_norm": 0.13909801840782166,
"learning_rate": 4.868292685307776e-05,
"loss": 0.1255,
"step": 16800
},
{
"epoch": 1.5035045437459924,
"grad_norm": 0.14646555483341217,
"learning_rate": 4.8635679338541364e-05,
"loss": 0.1243,
"step": 17000
},
{
"epoch": 1.521193092621664,
"grad_norm": 0.14244422316551208,
"learning_rate": 4.858762298625065e-05,
"loss": 0.1248,
"step": 17200
},
{
"epoch": 1.5388816414973356,
"grad_norm": 0.15146219730377197,
"learning_rate": 4.853875944074442e-05,
"loss": 0.1235,
"step": 17400
},
{
"epoch": 1.5565701903730074,
"grad_norm": 0.14190447330474854,
"learning_rate": 4.848909037418449e-05,
"loss": 0.1242,
"step": 17600
},
{
"epoch": 1.5742587392486789,
"grad_norm": 0.13961580395698547,
"learning_rate": 4.8438617486298455e-05,
"loss": 0.1235,
"step": 17800
},
{
"epoch": 1.5919472881243504,
"grad_norm": 0.13622143864631653,
"learning_rate": 4.838734250432152e-05,
"loss": 0.1231,
"step": 18000
},
{
"epoch": 1.609635837000022,
"grad_norm": 0.140974760055542,
"learning_rate": 4.833526718293736e-05,
"loss": 0.1229,
"step": 18200
},
{
"epoch": 1.6273243858756938,
"grad_norm": 0.1440068930387497,
"learning_rate": 4.828239330421815e-05,
"loss": 0.1233,
"step": 18400
},
{
"epoch": 1.6450129347513653,
"grad_norm": 0.13277290761470795,
"learning_rate": 4.822872267756351e-05,
"loss": 0.122,
"step": 18600
},
{
"epoch": 1.6627014836270368,
"grad_norm": 0.1427055150270462,
"learning_rate": 4.817425713963861e-05,
"loss": 0.123,
"step": 18800
},
{
"epoch": 1.6803900325027086,
"grad_norm": 0.14716538786888123,
"learning_rate": 4.8118998554311336e-05,
"loss": 0.1235,
"step": 19000
},
{
"epoch": 1.6980785813783803,
"grad_norm": 0.14058035612106323,
"learning_rate": 4.806294881258846e-05,
"loss": 0.1219,
"step": 19200
},
{
"epoch": 1.7157671302540518,
"grad_norm": 0.1426689326763153,
"learning_rate": 4.800610983255098e-05,
"loss": 0.1223,
"step": 19400
},
{
"epoch": 1.7334556791297233,
"grad_norm": 0.142822727560997,
"learning_rate": 4.7948483559288445e-05,
"loss": 0.1217,
"step": 19600
},
{
"epoch": 1.751144228005395,
"grad_norm": 0.14739733934402466,
"learning_rate": 4.7890071964832426e-05,
"loss": 0.122,
"step": 19800
},
{
"epoch": 1.7688327768810668,
"grad_norm": 0.1422308385372162,
"learning_rate": 4.7830877048088974e-05,
"loss": 0.1209,
"step": 20000
},
{
"epoch": 1.7865213257567383,
"grad_norm": 0.13312865793704987,
"learning_rate": 4.777090083477027e-05,
"loss": 0.1209,
"step": 20200
},
{
"epoch": 1.8042098746324098,
"grad_norm": 0.1410498321056366,
"learning_rate": 4.771014537732529e-05,
"loss": 0.1202,
"step": 20400
},
{
"epoch": 1.8218984235080815,
"grad_norm": 0.14671219885349274,
"learning_rate": 4.764861275486956e-05,
"loss": 0.1195,
"step": 20600
},
{
"epoch": 1.839586972383753,
"grad_norm": 0.13390739262104034,
"learning_rate": 4.758630507311399e-05,
"loss": 0.1204,
"step": 20800
},
{
"epoch": 1.8572755212594245,
"grad_norm": 0.1377139389514923,
"learning_rate": 4.7523224464292855e-05,
"loss": 0.1194,
"step": 21000
},
{
"epoch": 1.8749640701350963,
"grad_norm": 0.13953597843647003,
"learning_rate": 4.745937308709079e-05,
"loss": 0.1196,
"step": 21200
},
{
"epoch": 1.892652619010768,
"grad_norm": 0.14722049236297607,
"learning_rate": 4.739475312656895e-05,
"loss": 0.1189,
"step": 21400
},
{
"epoch": 1.9103411678864395,
"grad_norm": 0.129732146859169,
"learning_rate": 4.7329366794090205e-05,
"loss": 0.1195,
"step": 21600
},
{
"epoch": 1.928029716762111,
"grad_norm": 0.13127955794334412,
"learning_rate": 4.726321632724346e-05,
"loss": 0.1188,
"step": 21800
},
{
"epoch": 1.9457182656377827,
"grad_norm": 0.1353120505809784,
"learning_rate": 4.719630398976714e-05,
"loss": 0.1184,
"step": 22000
},
{
"epoch": 1.9634068145134544,
"grad_norm": 0.14046898484230042,
"learning_rate": 4.7128632071471667e-05,
"loss": 0.1185,
"step": 22200
},
{
"epoch": 1.981095363389126,
"grad_norm": 0.13207530975341797,
"learning_rate": 4.7060202888161106e-05,
"loss": 0.1174,
"step": 22400
},
{
"epoch": 1.9987839122647975,
"grad_norm": 0.1329745352268219,
"learning_rate": 4.6991018781553926e-05,
"loss": 0.1181,
"step": 22600
},
{
"epoch": 2.0164503504543747,
"grad_norm": 0.13534873723983765,
"learning_rate": 4.692108211920287e-05,
"loss": 0.097,
"step": 22800
},
{
"epoch": 2.0341388993300464,
"grad_norm": 0.13999226689338684,
"learning_rate": 4.685039529441393e-05,
"loss": 0.096,
"step": 23000
},
{
"epoch": 2.0518274482057177,
"grad_norm": 0.14331400394439697,
"learning_rate": 4.677896072616444e-05,
"loss": 0.0956,
"step": 23200
},
{
"epoch": 2.0695159970813894,
"grad_norm": 0.1391351968050003,
"learning_rate": 4.67067808590203e-05,
"loss": 0.0955,
"step": 23400
},
{
"epoch": 2.087204545957061,
"grad_norm": 0.15080305933952332,
"learning_rate": 4.6633858163052324e-05,
"loss": 0.0966,
"step": 23600
},
{
"epoch": 2.1048930948327325,
"grad_norm": 0.14853306114673615,
"learning_rate": 4.656019513375171e-05,
"loss": 0.0955,
"step": 23800
},
{
"epoch": 2.122581643708404,
"grad_norm": 0.14308440685272217,
"learning_rate": 4.648579429194463e-05,
"loss": 0.0959,
"step": 24000
},
{
"epoch": 2.140270192584076,
"grad_norm": 0.14518137276172638,
"learning_rate": 4.641065818370597e-05,
"loss": 0.0964,
"step": 24200
},
{
"epoch": 2.1579587414597476,
"grad_norm": 0.14385883510112762,
"learning_rate": 4.6334789380272235e-05,
"loss": 0.0966,
"step": 24400
},
{
"epoch": 2.175647290335419,
"grad_norm": 0.14189130067825317,
"learning_rate": 4.625819047795349e-05,
"loss": 0.0969,
"step": 24600
},
{
"epoch": 2.1933358392110907,
"grad_norm": 0.14711739122867584,
"learning_rate": 4.6180864098044584e-05,
"loss": 0.0967,
"step": 24800
},
{
"epoch": 2.2110243880867624,
"grad_norm": 0.14622507989406586,
"learning_rate": 4.610281288673539e-05,
"loss": 0.0967,
"step": 25000
},
{
"epoch": 2.228712936962434,
"grad_norm": 0.1425817757844925,
"learning_rate": 4.6024039515020276e-05,
"loss": 0.0981,
"step": 25200
},
{
"epoch": 2.2464014858381054,
"grad_norm": 0.1472136527299881,
"learning_rate": 4.5944546678606706e-05,
"loss": 0.0993,
"step": 25400
},
{
"epoch": 2.264090034713777,
"grad_norm": 0.14232853055000305,
"learning_rate": 4.586433709782296e-05,
"loss": 0.0985,
"step": 25600
},
{
"epoch": 2.281778583589449,
"grad_norm": 0.1511968970298767,
"learning_rate": 4.578341351752511e-05,
"loss": 0.098,
"step": 25800
},
{
"epoch": 2.2994671324651206,
"grad_norm": 0.14743036031723022,
"learning_rate": 4.570177870700298e-05,
"loss": 0.0974,
"step": 26000
},
{
"epoch": 2.317155681340792,
"grad_norm": 0.1461828500032425,
"learning_rate": 4.561943545988548e-05,
"loss": 0.0969,
"step": 26200
},
{
"epoch": 2.3348442302164636,
"grad_norm": 0.1614687293767929,
"learning_rate": 4.5536386594044956e-05,
"loss": 0.0988,
"step": 26400
},
{
"epoch": 2.3525327790921353,
"grad_norm": 0.1400858461856842,
"learning_rate": 4.5452634951500745e-05,
"loss": 0.0969,
"step": 26600
},
{
"epoch": 2.3702213279678066,
"grad_norm": 0.14338116347789764,
"learning_rate": 4.536818339832197e-05,
"loss": 0.0961,
"step": 26800
},
{
"epoch": 2.3879098768434783,
"grad_norm": 0.1501627415418625,
"learning_rate": 4.528303482452943e-05,
"loss": 0.0969,
"step": 27000
},
{
"epoch": 2.40559842571915,
"grad_norm": 0.14346472918987274,
"learning_rate": 4.519719214399667e-05,
"loss": 0.0958,
"step": 27200
},
{
"epoch": 2.423286974594822,
"grad_norm": 0.1539745032787323,
"learning_rate": 4.5110658294350326e-05,
"loss": 0.0966,
"step": 27400
},
{
"epoch": 2.4409755234704935,
"grad_norm": 0.1377008557319641,
"learning_rate": 4.502343623686956e-05,
"loss": 0.0971,
"step": 27600
},
{
"epoch": 2.458664072346165,
"grad_norm": 0.1407323032617569,
"learning_rate": 4.493552895638472e-05,
"loss": 0.0974,
"step": 27800
},
{
"epoch": 2.4763526212218365,
"grad_norm": 0.1503271609544754,
"learning_rate": 4.48469394611752e-05,
"loss": 0.0975,
"step": 28000
},
{
"epoch": 2.4940411700975083,
"grad_norm": 0.14528031647205353,
"learning_rate": 4.475767078286652e-05,
"loss": 0.0974,
"step": 28200
},
{
"epoch": 2.5117297189731795,
"grad_norm": 0.14073877036571503,
"learning_rate": 4.466772597632654e-05,
"loss": 0.0963,
"step": 28400
},
{
"epoch": 2.5294182678488513,
"grad_norm": 0.14889408648014069,
"learning_rate": 4.457710811956094e-05,
"loss": 0.0963,
"step": 28600
},
{
"epoch": 2.547106816724523,
"grad_norm": 0.14946310222148895,
"learning_rate": 4.4485820313607906e-05,
"loss": 0.0967,
"step": 28800
},
{
"epoch": 2.5647953656001947,
"grad_norm": 0.1508057564496994,
"learning_rate": 4.4393865682431955e-05,
"loss": 0.0973,
"step": 29000
},
{
"epoch": 2.5824839144758664,
"grad_norm": 0.1469687819480896,
"learning_rate": 4.4301247372817077e-05,
"loss": 0.0971,
"step": 29200
},
{
"epoch": 2.6001724633515377,
"grad_norm": 0.14045564830303192,
"learning_rate": 4.420796855425905e-05,
"loss": 0.0959,
"step": 29400
},
{
"epoch": 2.6178610122272095,
"grad_norm": 0.14008161425590515,
"learning_rate": 4.411403241885693e-05,
"loss": 0.0966,
"step": 29600
},
{
"epoch": 2.635549561102881,
"grad_norm": 0.15664730966091156,
"learning_rate": 4.4019442181203884e-05,
"loss": 0.0969,
"step": 29800
},
{
"epoch": 2.6532381099785525,
"grad_norm": 0.146159827709198,
"learning_rate": 4.3924201078277105e-05,
"loss": 0.0969,
"step": 30000
},
{
"epoch": 2.670926658854224,
"grad_norm": 0.14852313697338104,
"learning_rate": 4.382831236932711e-05,
"loss": 0.0955,
"step": 30200
},
{
"epoch": 2.688615207729896,
"grad_norm": 0.15001444518566132,
"learning_rate": 4.3731779335766154e-05,
"loss": 0.0959,
"step": 30400
},
{
"epoch": 2.7063037566055677,
"grad_norm": 0.14691965281963348,
"learning_rate": 4.363460528105597e-05,
"loss": 0.0957,
"step": 30600
},
{
"epoch": 2.723992305481239,
"grad_norm": 0.14318469166755676,
"learning_rate": 4.35367935305947e-05,
"loss": 0.0967,
"step": 30800
},
{
"epoch": 2.7416808543569107,
"grad_norm": 0.14508825540542603,
"learning_rate": 4.34383474316031e-05,
"loss": 0.0951,
"step": 31000
},
{
"epoch": 2.7593694032325824,
"grad_norm": 0.15151719748973846,
"learning_rate": 4.333927035301001e-05,
"loss": 0.0958,
"step": 31200
},
{
"epoch": 2.7770579521082537,
"grad_norm": 0.1437998265028,
"learning_rate": 4.3239565685337044e-05,
"loss": 0.0955,
"step": 31400
},
{
"epoch": 2.7947465009839254,
"grad_norm": 0.14122720062732697,
"learning_rate": 4.3139236840582575e-05,
"loss": 0.0951,
"step": 31600
},
{
"epoch": 2.812435049859597,
"grad_norm": 0.1338931769132614,
"learning_rate": 4.303828725210498e-05,
"loss": 0.0959,
"step": 31800
},
{
"epoch": 2.830123598735269,
"grad_norm": 0.144461989402771,
"learning_rate": 4.293672037450512e-05,
"loss": 0.0951,
"step": 32000
},
{
"epoch": 2.8478121476109406,
"grad_norm": 0.13668565452098846,
"learning_rate": 4.2834539683508166e-05,
"loss": 0.0959,
"step": 32200
},
{
"epoch": 2.865500696486612,
"grad_norm": 0.15980184078216553,
"learning_rate": 4.27317486758446e-05,
"loss": 0.0951,
"step": 32400
},
{
"epoch": 2.8831892453622836,
"grad_norm": 0.14222297072410583,
"learning_rate": 4.262835086913058e-05,
"loss": 0.0948,
"step": 32600
},
{
"epoch": 2.9008777942379553,
"grad_norm": 0.1509125828742981,
"learning_rate": 4.25243498017476e-05,
"loss": 0.0949,
"step": 32800
},
{
"epoch": 2.9185663431136266,
"grad_norm": 0.14582620561122894,
"learning_rate": 4.241974903272132e-05,
"loss": 0.0943,
"step": 33000
},
{
"epoch": 2.9362548919892983,
"grad_norm": 0.130904421210289,
"learning_rate": 4.231455214159985e-05,
"loss": 0.0945,
"step": 33200
},
{
"epoch": 2.95394344086497,
"grad_norm": 0.1373138129711151,
"learning_rate": 4.220876272833121e-05,
"loss": 0.0936,
"step": 33400
},
{
"epoch": 2.971631989740642,
"grad_norm": 0.14548689126968384,
"learning_rate": 4.210238441314017e-05,
"loss": 0.0934,
"step": 33600
},
{
"epoch": 2.9893205386163135,
"grad_norm": 0.13954471051692963,
"learning_rate": 4.199542083640432e-05,
"loss": 0.0937,
"step": 33800
},
{
"epoch": 3.0069869768058903,
"grad_norm": 0.14478568732738495,
"learning_rate": 4.188787565852952e-05,
"loss": 0.0847,
"step": 34000
},
{
"epoch": 3.024675525681562,
"grad_norm": 0.15503637492656708,
"learning_rate": 4.177975255982463e-05,
"loss": 0.0718,
"step": 34200
},
{
"epoch": 3.0423640745572333,
"grad_norm": 0.16748104989528656,
"learning_rate": 4.1671055240375575e-05,
"loss": 0.0714,
"step": 34400
},
{
"epoch": 3.060052623432905,
"grad_norm": 0.15309254825115204,
"learning_rate": 4.156178741991872e-05,
"loss": 0.0724,
"step": 34600
},
{
"epoch": 3.077741172308577,
"grad_norm": 0.15351246297359467,
"learning_rate": 4.1451952837713556e-05,
"loss": 0.0722,
"step": 34800
}
],
"logging_steps": 200,
"max_steps": 113060,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0180563807244386e+20,
"train_batch_size": 9,
"trial_name": null,
"trial_params": null
}