{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.077741172308577,
  "eval_steps": 500,
  "global_step": 34800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017688548875671613,
      "grad_norm": 4.9453349113464355,
      "learning_rate": 1.768972227136034e-06,
      "loss": 6.7403,
      "step": 200
    },
    {
      "epoch": 0.03537709775134323,
      "grad_norm": 8.81293773651123,
      "learning_rate": 3.537944454272068e-06,
      "loss": 4.3531,
      "step": 400
    },
    {
      "epoch": 0.05306564662701484,
      "grad_norm": 5.486693382263184,
      "learning_rate": 5.306916681408102e-06,
      "loss": 2.2861,
      "step": 600
    },
    {
      "epoch": 0.07075419550268645,
      "grad_norm": 2.244082450866699,
      "learning_rate": 7.075888908544136e-06,
      "loss": 1.162,
      "step": 800
    },
    {
      "epoch": 0.08844274437835806,
      "grad_norm": 0.6459134221076965,
      "learning_rate": 8.84486113568017e-06,
      "loss": 0.3574,
      "step": 1000
    },
    {
      "epoch": 0.10613129325402967,
      "grad_norm": 0.42250216007232666,
      "learning_rate": 1.0613833362816204e-05,
      "loss": 0.22,
      "step": 1200
    },
    {
      "epoch": 0.12381984212970129,
      "grad_norm": 0.4047611951828003,
      "learning_rate": 1.2382805589952239e-05,
      "loss": 0.1967,
      "step": 1400
    },
    {
      "epoch": 0.1415083910053729,
      "grad_norm": 0.34184110164642334,
      "learning_rate": 1.4151777817088272e-05,
      "loss": 0.1853,
      "step": 1600
    },
    {
      "epoch": 0.1591969398810445,
      "grad_norm": 0.32166364789009094,
      "learning_rate": 1.5920750044224307e-05,
      "loss": 0.1795,
      "step": 1800
    },
    {
      "epoch": 0.1768854887567161,
      "grad_norm": 0.30309006571769714,
      "learning_rate": 1.768972227136034e-05,
      "loss": 0.1777,
      "step": 2000
    },
    {
      "epoch": 0.19457403763238773,
      "grad_norm": 0.28535833954811096,
      "learning_rate": 1.9458694498496373e-05,
      "loss": 0.1728,
      "step": 2200
    },
    {
      "epoch": 0.21226258650805935,
      "grad_norm": 0.2785949409008026,
      "learning_rate": 2.1227666725632408e-05,
      "loss": 0.1707,
      "step": 2400
    },
    {
      "epoch": 0.22995113538373096,
      "grad_norm": 0.28039732575416565,
      "learning_rate": 2.2996638952768443e-05,
      "loss": 0.1682,
      "step": 2600
    },
    {
      "epoch": 0.24763968425940258,
      "grad_norm": 0.2688646614551544,
      "learning_rate": 2.4765611179904478e-05,
      "loss": 0.1663,
      "step": 2800
    },
    {
      "epoch": 0.2653282331350742,
      "grad_norm": 0.2585384249687195,
      "learning_rate": 2.653458340704051e-05,
      "loss": 0.1672,
      "step": 3000
    },
    {
      "epoch": 0.2830167820107458,
      "grad_norm": 0.2583986222743988,
      "learning_rate": 2.8303555634176544e-05,
      "loss": 0.1646,
      "step": 3200
    },
    {
      "epoch": 0.30070533088641743,
      "grad_norm": 0.23997661471366882,
      "learning_rate": 3.007252786131258e-05,
      "loss": 0.1647,
      "step": 3400
    },
    {
      "epoch": 0.318393879762089,
      "grad_norm": 0.2527500092983246,
      "learning_rate": 3.1841500088448614e-05,
      "loss": 0.1629,
      "step": 3600
    },
    {
      "epoch": 0.3360824286377606,
      "grad_norm": 0.23295536637306213,
      "learning_rate": 3.3610472315584645e-05,
      "loss": 0.1615,
      "step": 3800
    },
    {
      "epoch": 0.3537709775134322,
      "grad_norm": 0.24165655672550201,
      "learning_rate": 3.537944454272068e-05,
      "loss": 0.1626,
      "step": 4000
    },
    {
      "epoch": 0.37145952638910384,
      "grad_norm": 0.22286227345466614,
      "learning_rate": 3.7148416769856715e-05,
      "loss": 0.1611,
      "step": 4200
    },
    {
      "epoch": 0.38914807526477546,
      "grad_norm": 0.23912444710731506,
      "learning_rate": 3.8917388996992746e-05,
      "loss": 0.1639,
      "step": 4400
    },
    {
      "epoch": 0.4068366241404471,
      "grad_norm": 0.22330383956432343,
      "learning_rate": 4.0686361224128784e-05,
      "loss": 0.1613,
      "step": 4600
    },
    {
      "epoch": 0.4245251730161187,
      "grad_norm": 0.2139132171869278,
      "learning_rate": 4.2455333451264816e-05,
      "loss": 0.1601,
      "step": 4800
    },
    {
      "epoch": 0.4422137218917903,
      "grad_norm": 0.19250735640525818,
      "learning_rate": 4.4224305678400854e-05,
      "loss": 0.1595,
      "step": 5000
    },
    {
      "epoch": 0.45990227076746193,
      "grad_norm": 0.19502046704292297,
      "learning_rate": 4.5993277905536885e-05,
      "loss": 0.159,
      "step": 5200
    },
    {
      "epoch": 0.47759081964313355,
      "grad_norm": 0.19040720164775848,
      "learning_rate": 4.776225013267292e-05,
      "loss": 0.1578,
      "step": 5400
    },
    {
      "epoch": 0.49527936851880516,
      "grad_norm": 0.20574906468391418,
      "learning_rate": 4.9531222359808955e-05,
      "loss": 0.1603,
      "step": 5600
    },
    {
      "epoch": 0.5129679173944768,
      "grad_norm": 0.19954811036586761,
      "learning_rate": 4.999976891136569e-05,
      "loss": 0.1582,
      "step": 5800
    },
    {
      "epoch": 0.5306564662701484,
      "grad_norm": 0.18808187544345856,
      "learning_rate": 4.999871234414489e-05,
      "loss": 0.1589,
      "step": 6000
    },
    {
      "epoch": 0.54834501514582,
      "grad_norm": 0.17781756818294525,
      "learning_rate": 4.999680029474971e-05,
      "loss": 0.158,
      "step": 6200
    },
    {
      "epoch": 0.5660335640214916,
      "grad_norm": 0.18824966251850128,
      "learning_rate": 4.999403282861248e-05,
      "loss": 0.1572,
      "step": 6400
    },
    {
      "epoch": 0.5837221128971632,
      "grad_norm": 0.20633359253406525,
      "learning_rate": 4.999041004043882e-05,
      "loss": 0.1572,
      "step": 6600
    },
    {
      "epoch": 0.6014106617728349,
      "grad_norm": 0.17536261677742004,
      "learning_rate": 4.998593205420432e-05,
      "loss": 0.1565,
      "step": 6800
    },
    {
      "epoch": 0.6190992106485064,
      "grad_norm": 0.1713794469833374,
      "learning_rate": 4.998059902315038e-05,
      "loss": 0.1554,
      "step": 7000
    },
    {
      "epoch": 0.636787759524178,
      "grad_norm": 0.16785737872123718,
      "learning_rate": 4.997441112977891e-05,
      "loss": 0.1526,
      "step": 7200
    },
    {
      "epoch": 0.6544763083998496,
      "grad_norm": 0.18514026701450348,
      "learning_rate": 4.996736858584613e-05,
      "loss": 0.153,
      "step": 7400
    },
    {
      "epoch": 0.6721648572755212,
      "grad_norm": 0.17102603614330292,
      "learning_rate": 4.995947163235527e-05,
      "loss": 0.1517,
      "step": 7600
    },
    {
      "epoch": 0.6898534061511928,
      "grad_norm": 0.16847601532936096,
      "learning_rate": 4.9950720539548384e-05,
      "loss": 0.1519,
      "step": 7800
    },
    {
      "epoch": 0.7075419550268645,
      "grad_norm": 0.16419926285743713,
      "learning_rate": 4.9941115606897036e-05,
      "loss": 0.1507,
      "step": 8000
    },
    {
      "epoch": 0.7252305039025361,
      "grad_norm": 0.17619839310646057,
      "learning_rate": 4.9930657163092123e-05,
      "loss": 0.15,
      "step": 8200
    },
    {
      "epoch": 0.7429190527782077,
      "grad_norm": 0.16663217544555664,
      "learning_rate": 4.991934556603254e-05,
      "loss": 0.1502,
      "step": 8400
    },
    {
      "epoch": 0.7606076016538793,
      "grad_norm": 0.18116818368434906,
      "learning_rate": 4.990718120281304e-05,
      "loss": 0.1511,
      "step": 8600
    },
    {
      "epoch": 0.7782961505295509,
      "grad_norm": 0.1595318764448166,
      "learning_rate": 4.989416448971088e-05,
      "loss": 0.1502,
      "step": 8800
    },
    {
      "epoch": 0.7959846994052225,
      "grad_norm": 0.15985798835754395,
      "learning_rate": 4.988029587217165e-05,
      "loss": 0.1513,
      "step": 9000
    },
    {
      "epoch": 0.8136732482808942,
      "grad_norm": 0.17328430712223053,
      "learning_rate": 4.9865575824794e-05,
      "loss": 0.1491,
      "step": 9200
    },
    {
      "epoch": 0.8313617971565658,
      "grad_norm": 0.1536557823419571,
      "learning_rate": 4.98500048513134e-05,
      "loss": 0.1474,
      "step": 9400
    },
    {
      "epoch": 0.8490503460322374,
      "grad_norm": 0.15362966060638428,
      "learning_rate": 4.983358348458491e-05,
      "loss": 0.1458,
      "step": 9600
    },
    {
      "epoch": 0.866738894907909,
      "grad_norm": 0.16235561668872833,
      "learning_rate": 4.9816312286564926e-05,
      "loss": 0.1454,
      "step": 9800
    },
    {
      "epoch": 0.8844274437835806,
      "grad_norm": 0.6078733801841736,
      "learning_rate": 4.979819184829197e-05,
      "loss": 0.148,
      "step": 10000
    },
    {
      "epoch": 0.9021159926592522,
      "grad_norm": 0.15071770548820496,
      "learning_rate": 4.9779222789866476e-05,
      "loss": 0.1461,
      "step": 10200
    },
    {
      "epoch": 0.9198045415349239,
      "grad_norm": 0.15512414276599884,
      "learning_rate": 4.9759405760429524e-05,
      "loss": 0.1455,
      "step": 10400
    },
    {
      "epoch": 0.9374930904105955,
      "grad_norm": 0.15411154925823212,
      "learning_rate": 4.9738741438140644e-05,
      "loss": 0.1437,
      "step": 10600
    },
    {
      "epoch": 0.9551816392862671,
      "grad_norm": 0.15488554537296295,
      "learning_rate": 4.9717230530154657e-05,
      "loss": 0.1445,
      "step": 10800
    },
    {
      "epoch": 0.9728701881619387,
      "grad_norm": 0.14975008368492126,
      "learning_rate": 4.9694873772597396e-05,
      "loss": 0.1437,
      "step": 11000
    },
    {
      "epoch": 0.9905587370376103,
      "grad_norm": 0.15676584839820862,
      "learning_rate": 4.967167193054058e-05,
      "loss": 0.1432,
      "step": 11200
    },
    {
      "epoch": 1.0082251752271874,
      "grad_norm": 0.15392336249351501,
      "learning_rate": 4.964762579797558e-05,
      "loss": 0.1355,
      "step": 11400
    },
    {
      "epoch": 1.0259137241028589,
      "grad_norm": 0.1635395586490631,
      "learning_rate": 4.962273619778632e-05,
      "loss": 0.1268,
      "step": 11600
    },
    {
      "epoch": 1.0436022729785306,
      "grad_norm": 0.15909574925899506,
      "learning_rate": 4.959700398172101e-05,
      "loss": 0.1263,
      "step": 11800
    },
    {
      "epoch": 1.061290821854202,
      "grad_norm": 0.16021914780139923,
      "learning_rate": 4.957043003036311e-05,
      "loss": 0.1264,
      "step": 12000
    },
    {
      "epoch": 1.0789793707298738,
      "grad_norm": 0.163675919175148,
      "learning_rate": 4.954301525310113e-05,
      "loss": 0.1262,
      "step": 12200
    },
    {
      "epoch": 1.0966679196055453,
      "grad_norm": 0.15520550310611725,
      "learning_rate": 4.951476058809751e-05,
      "loss": 0.1275,
      "step": 12400
    },
    {
      "epoch": 1.114356468481217,
      "grad_norm": 0.1602706015110016,
      "learning_rate": 4.948566700225654e-05,
      "loss": 0.1268,
      "step": 12600
    },
    {
      "epoch": 1.1320450173568886,
      "grad_norm": 0.15892288088798523,
      "learning_rate": 4.945573549119128e-05,
      "loss": 0.1269,
      "step": 12800
    },
    {
      "epoch": 1.1497335662325603,
      "grad_norm": 0.16024959087371826,
      "learning_rate": 4.9424967079189434e-05,
      "loss": 0.1265,
      "step": 13000
    },
    {
      "epoch": 1.1674221151082318,
      "grad_norm": 0.15324904024600983,
      "learning_rate": 4.939336281917837e-05,
      "loss": 0.1265,
      "step": 13200
    },
    {
      "epoch": 1.1851106639839033,
      "grad_norm": 0.15968984365463257,
      "learning_rate": 4.936092379268902e-05,
      "loss": 0.1269,
      "step": 13400
    },
    {
      "epoch": 1.202799212859575,
      "grad_norm": 0.1491909772157669,
      "learning_rate": 4.932765110981894e-05,
      "loss": 0.1261,
      "step": 13600
    },
    {
      "epoch": 1.2204877617352468,
      "grad_norm": 0.15451796352863312,
      "learning_rate": 4.929354590919424e-05,
      "loss": 0.1273,
      "step": 13800
    },
    {
      "epoch": 1.2381763106109183,
      "grad_norm": 0.1513613909482956,
      "learning_rate": 4.9258609357930686e-05,
      "loss": 0.1264,
      "step": 14000
    },
    {
      "epoch": 1.2558648594865898,
      "grad_norm": 0.1397712677717209,
      "learning_rate": 4.9222842651593736e-05,
      "loss": 0.1268,
      "step": 14200
    },
    {
      "epoch": 1.2735534083622615,
      "grad_norm": 0.13348053395748138,
      "learning_rate": 4.918624701415763e-05,
      "loss": 0.1267,
      "step": 14400
    },
    {
      "epoch": 1.2912419572379332,
      "grad_norm": 0.15651994943618774,
      "learning_rate": 4.9148823697963465e-05,
      "loss": 0.1258,
      "step": 14600
    },
    {
      "epoch": 1.3089305061136047,
      "grad_norm": 0.16271665692329407,
      "learning_rate": 4.9110573983676414e-05,
      "loss": 0.1258,
      "step": 14800
    },
    {
      "epoch": 1.3266190549892762,
      "grad_norm": 0.15141454339027405,
      "learning_rate": 4.907149918024185e-05,
      "loss": 0.1252,
      "step": 15000
    },
    {
      "epoch": 1.344307603864948,
      "grad_norm": 0.14476826786994934,
      "learning_rate": 4.903160062484056e-05,
      "loss": 0.1263,
      "step": 15200
    },
    {
      "epoch": 1.3619961527406195,
      "grad_norm": 0.14604584872722626,
      "learning_rate": 4.8990879682842964e-05,
      "loss": 0.1267,
      "step": 15400
    },
    {
      "epoch": 1.3796847016162912,
      "grad_norm": 0.1551065295934677,
      "learning_rate": 4.8949337747762465e-05,
      "loss": 0.1268,
      "step": 15600
    },
    {
      "epoch": 1.3973732504919627,
      "grad_norm": 0.15430127084255219,
      "learning_rate": 4.890697624120767e-05,
      "loss": 0.1258,
      "step": 15800
    },
    {
      "epoch": 1.4150617993676344,
      "grad_norm": 0.1534918248653412,
      "learning_rate": 4.886379661283379e-05,
      "loss": 0.1245,
      "step": 16000
    },
    {
      "epoch": 1.432750348243306,
      "grad_norm": 0.15945963561534882,
      "learning_rate": 4.881980034029303e-05,
      "loss": 0.1251,
      "step": 16200
    },
    {
      "epoch": 1.4504388971189777,
      "grad_norm": 0.15190242230892181,
      "learning_rate": 4.877498892918403e-05,
      "loss": 0.1246,
      "step": 16400
    },
    {
      "epoch": 1.4681274459946492,
      "grad_norm": 0.14410296082496643,
      "learning_rate": 4.872936391300029e-05,
      "loss": 0.1251,
      "step": 16600
    },
    {
      "epoch": 1.485815994870321,
      "grad_norm": 0.13909801840782166,
      "learning_rate": 4.868292685307776e-05,
      "loss": 0.1255,
      "step": 16800
    },
    {
      "epoch": 1.5035045437459924,
      "grad_norm": 0.14646555483341217,
      "learning_rate": 4.8635679338541364e-05,
      "loss": 0.1243,
      "step": 17000
    },
    {
      "epoch": 1.521193092621664,
      "grad_norm": 0.14244422316551208,
      "learning_rate": 4.858762298625065e-05,
      "loss": 0.1248,
      "step": 17200
    },
    {
      "epoch": 1.5388816414973356,
      "grad_norm": 0.15146219730377197,
      "learning_rate": 4.853875944074442e-05,
      "loss": 0.1235,
      "step": 17400
    },
    {
      "epoch": 1.5565701903730074,
      "grad_norm": 0.14190447330474854,
      "learning_rate": 4.848909037418449e-05,
      "loss": 0.1242,
      "step": 17600
    },
    {
      "epoch": 1.5742587392486789,
      "grad_norm": 0.13961580395698547,
      "learning_rate": 4.8438617486298455e-05,
      "loss": 0.1235,
      "step": 17800
    },
    {
      "epoch": 1.5919472881243504,
      "grad_norm": 0.13622143864631653,
      "learning_rate": 4.838734250432152e-05,
      "loss": 0.1231,
      "step": 18000
    },
    {
      "epoch": 1.609635837000022,
      "grad_norm": 0.140974760055542,
      "learning_rate": 4.833526718293736e-05,
      "loss": 0.1229,
      "step": 18200
    },
    {
      "epoch": 1.6273243858756938,
      "grad_norm": 0.1440068930387497,
      "learning_rate": 4.828239330421815e-05,
      "loss": 0.1233,
      "step": 18400
    },
    {
      "epoch": 1.6450129347513653,
      "grad_norm": 0.13277290761470795,
      "learning_rate": 4.822872267756351e-05,
      "loss": 0.122,
      "step": 18600
    },
    {
      "epoch": 1.6627014836270368,
      "grad_norm": 0.1427055150270462,
      "learning_rate": 4.817425713963861e-05,
      "loss": 0.123,
      "step": 18800
    },
    {
      "epoch": 1.6803900325027086,
      "grad_norm": 0.14716538786888123,
      "learning_rate": 4.8118998554311336e-05,
      "loss": 0.1235,
      "step": 19000
    },
    {
      "epoch": 1.6980785813783803,
      "grad_norm": 0.14058035612106323,
      "learning_rate": 4.806294881258846e-05,
      "loss": 0.1219,
      "step": 19200
    },
    {
      "epoch": 1.7157671302540518,
      "grad_norm": 0.1426689326763153,
      "learning_rate": 4.800610983255098e-05,
      "loss": 0.1223,
      "step": 19400
    },
    {
      "epoch": 1.7334556791297233,
      "grad_norm": 0.142822727560997,
      "learning_rate": 4.7948483559288445e-05,
      "loss": 0.1217,
      "step": 19600
    },
    {
      "epoch": 1.751144228005395,
      "grad_norm": 0.14739733934402466,
      "learning_rate": 4.7890071964832426e-05,
      "loss": 0.122,
      "step": 19800
    },
    {
      "epoch": 1.7688327768810668,
      "grad_norm": 0.1422308385372162,
      "learning_rate": 4.7830877048088974e-05,
      "loss": 0.1209,
      "step": 20000
    },
    {
      "epoch": 1.7865213257567383,
      "grad_norm": 0.13312865793704987,
      "learning_rate": 4.777090083477027e-05,
      "loss": 0.1209,
      "step": 20200
    },
    {
      "epoch": 1.8042098746324098,
      "grad_norm": 0.1410498321056366,
      "learning_rate": 4.771014537732529e-05,
      "loss": 0.1202,
      "step": 20400
    },
    {
      "epoch": 1.8218984235080815,
      "grad_norm": 0.14671219885349274,
      "learning_rate": 4.764861275486956e-05,
      "loss": 0.1195,
      "step": 20600
    },
    {
      "epoch": 1.839586972383753,
      "grad_norm": 0.13390739262104034,
      "learning_rate": 4.758630507311399e-05,
      "loss": 0.1204,
      "step": 20800
    },
    {
      "epoch": 1.8572755212594245,
      "grad_norm": 0.1377139389514923,
      "learning_rate": 4.7523224464292855e-05,
      "loss": 0.1194,
      "step": 21000
    },
    {
      "epoch": 1.8749640701350963,
      "grad_norm": 0.13953597843647003,
      "learning_rate": 4.745937308709079e-05,
      "loss": 0.1196,
      "step": 21200
    },
    {
      "epoch": 1.892652619010768,
      "grad_norm": 0.14722049236297607,
      "learning_rate": 4.739475312656895e-05,
      "loss": 0.1189,
      "step": 21400
    },
    {
      "epoch": 1.9103411678864395,
      "grad_norm": 0.129732146859169,
      "learning_rate": 4.7329366794090205e-05,
      "loss": 0.1195,
      "step": 21600
    },
    {
      "epoch": 1.928029716762111,
      "grad_norm": 0.13127955794334412,
      "learning_rate": 4.726321632724346e-05,
      "loss": 0.1188,
      "step": 21800
    },
    {
      "epoch": 1.9457182656377827,
      "grad_norm": 0.1353120505809784,
      "learning_rate": 4.719630398976714e-05,
      "loss": 0.1184,
      "step": 22000
    },
    {
      "epoch": 1.9634068145134544,
      "grad_norm": 0.14046898484230042,
      "learning_rate": 4.7128632071471667e-05,
      "loss": 0.1185,
      "step": 22200
    },
    {
      "epoch": 1.981095363389126,
      "grad_norm": 0.13207530975341797,
      "learning_rate": 4.7060202888161106e-05,
      "loss": 0.1174,
      "step": 22400
    },
    {
      "epoch": 1.9987839122647975,
      "grad_norm": 0.1329745352268219,
      "learning_rate": 4.6991018781553926e-05,
      "loss": 0.1181,
      "step": 22600
    },
    {
      "epoch": 2.0164503504543747,
      "grad_norm": 0.13534873723983765,
      "learning_rate": 4.692108211920287e-05,
      "loss": 0.097,
      "step": 22800
    },
    {
      "epoch": 2.0341388993300464,
      "grad_norm": 0.13999226689338684,
      "learning_rate": 4.685039529441393e-05,
      "loss": 0.096,
      "step": 23000
    },
    {
      "epoch": 2.0518274482057177,
      "grad_norm": 0.14331400394439697,
      "learning_rate": 4.677896072616444e-05,
      "loss": 0.0956,
      "step": 23200
    },
    {
      "epoch": 2.0695159970813894,
      "grad_norm": 0.1391351968050003,
      "learning_rate": 4.67067808590203e-05,
      "loss": 0.0955,
      "step": 23400
    },
    {
      "epoch": 2.087204545957061,
      "grad_norm": 0.15080305933952332,
      "learning_rate": 4.6633858163052324e-05,
      "loss": 0.0966,
      "step": 23600
    },
    {
      "epoch": 2.1048930948327325,
      "grad_norm": 0.14853306114673615,
      "learning_rate": 4.656019513375171e-05,
      "loss": 0.0955,
      "step": 23800
    },
    {
      "epoch": 2.122581643708404,
      "grad_norm": 0.14308440685272217,
      "learning_rate": 4.648579429194463e-05,
      "loss": 0.0959,
      "step": 24000
    },
    {
      "epoch": 2.140270192584076,
      "grad_norm": 0.14518137276172638,
      "learning_rate": 4.641065818370597e-05,
      "loss": 0.0964,
      "step": 24200
    },
    {
      "epoch": 2.1579587414597476,
      "grad_norm": 0.14385883510112762,
      "learning_rate": 4.6334789380272235e-05,
      "loss": 0.0966,
      "step": 24400
    },
    {
      "epoch": 2.175647290335419,
      "grad_norm": 0.14189130067825317,
      "learning_rate": 4.625819047795349e-05,
      "loss": 0.0969,
      "step": 24600
    },
    {
      "epoch": 2.1933358392110907,
      "grad_norm": 0.14711739122867584,
      "learning_rate": 4.6180864098044584e-05,
      "loss": 0.0967,
      "step": 24800
    },
    {
      "epoch": 2.2110243880867624,
      "grad_norm": 0.14622507989406586,
      "learning_rate": 4.610281288673539e-05,
      "loss": 0.0967,
      "step": 25000
    },
    {
      "epoch": 2.228712936962434,
      "grad_norm": 0.1425817757844925,
      "learning_rate": 4.6024039515020276e-05,
      "loss": 0.0981,
      "step": 25200
    },
    {
      "epoch": 2.2464014858381054,
      "grad_norm": 0.1472136527299881,
      "learning_rate": 4.5944546678606706e-05,
      "loss": 0.0993,
      "step": 25400
    },
    {
      "epoch": 2.264090034713777,
      "grad_norm": 0.14232853055000305,
      "learning_rate": 4.586433709782296e-05,
      "loss": 0.0985,
      "step": 25600
    },
    {
      "epoch": 2.281778583589449,
      "grad_norm": 0.1511968970298767,
      "learning_rate": 4.578341351752511e-05,
      "loss": 0.098,
      "step": 25800
    },
    {
      "epoch": 2.2994671324651206,
      "grad_norm": 0.14743036031723022,
      "learning_rate": 4.570177870700298e-05,
      "loss": 0.0974,
      "step": 26000
    },
    {
      "epoch": 2.317155681340792,
      "grad_norm": 0.1461828500032425,
      "learning_rate": 4.561943545988548e-05,
      "loss": 0.0969,
      "step": 26200
    },
    {
      "epoch": 2.3348442302164636,
      "grad_norm": 0.1614687293767929,
      "learning_rate": 4.5536386594044956e-05,
      "loss": 0.0988,
      "step": 26400
    },
    {
      "epoch": 2.3525327790921353,
      "grad_norm": 0.1400858461856842,
      "learning_rate": 4.5452634951500745e-05,
      "loss": 0.0969,
      "step": 26600
    },
    {
      "epoch": 2.3702213279678066,
      "grad_norm": 0.14338116347789764,
      "learning_rate": 4.536818339832197e-05,
      "loss": 0.0961,
      "step": 26800
    },
    {
      "epoch": 2.3879098768434783,
      "grad_norm": 0.1501627415418625,
      "learning_rate": 4.528303482452943e-05,
      "loss": 0.0969,
      "step": 27000
    },
    {
      "epoch": 2.40559842571915,
      "grad_norm": 0.14346472918987274,
      "learning_rate": 4.519719214399667e-05,
      "loss": 0.0958,
      "step": 27200
    },
    {
      "epoch": 2.423286974594822,
      "grad_norm": 0.1539745032787323,
      "learning_rate": 4.5110658294350326e-05,
      "loss": 0.0966,
      "step": 27400
    },
    {
      "epoch": 2.4409755234704935,
      "grad_norm": 0.1377008557319641,
      "learning_rate": 4.502343623686956e-05,
      "loss": 0.0971,
      "step": 27600
    },
    {
      "epoch": 2.458664072346165,
      "grad_norm": 0.1407323032617569,
      "learning_rate": 4.493552895638472e-05,
      "loss": 0.0974,
      "step": 27800
    },
    {
      "epoch": 2.4763526212218365,
      "grad_norm": 0.1503271609544754,
      "learning_rate": 4.48469394611752e-05,
      "loss": 0.0975,
      "step": 28000
    },
    {
      "epoch": 2.4940411700975083,
      "grad_norm": 0.14528031647205353,
      "learning_rate": 4.475767078286652e-05,
      "loss": 0.0974,
      "step": 28200
    },
    {
      "epoch": 2.5117297189731795,
      "grad_norm": 0.14073877036571503,
      "learning_rate": 4.466772597632654e-05,
      "loss": 0.0963,
      "step": 28400
    },
    {
      "epoch": 2.5294182678488513,
      "grad_norm": 0.14889408648014069,
      "learning_rate": 4.457710811956094e-05,
      "loss": 0.0963,
      "step": 28600
    },
    {
      "epoch": 2.547106816724523,
      "grad_norm": 0.14946310222148895,
      "learning_rate": 4.4485820313607906e-05,
      "loss": 0.0967,
      "step": 28800
    },
    {
      "epoch": 2.5647953656001947,
      "grad_norm": 0.1508057564496994,
      "learning_rate": 4.4393865682431955e-05,
      "loss": 0.0973,
      "step": 29000
    },
    {
      "epoch": 2.5824839144758664,
      "grad_norm": 0.1469687819480896,
      "learning_rate": 4.4301247372817077e-05,
      "loss": 0.0971,
      "step": 29200
    },
    {
      "epoch": 2.6001724633515377,
      "grad_norm": 0.14045564830303192,
      "learning_rate": 4.420796855425905e-05,
      "loss": 0.0959,
      "step": 29400
    },
    {
      "epoch": 2.6178610122272095,
      "grad_norm": 0.14008161425590515,
      "learning_rate": 4.411403241885693e-05,
      "loss": 0.0966,
      "step": 29600
    },
    {
      "epoch": 2.635549561102881,
      "grad_norm": 0.15664730966091156,
      "learning_rate": 4.4019442181203884e-05,
      "loss": 0.0969,
      "step": 29800
    },
    {
      "epoch": 2.6532381099785525,
      "grad_norm": 0.146159827709198,
      "learning_rate": 4.3924201078277105e-05,
      "loss": 0.0969,
      "step": 30000
    },
    {
      "epoch": 2.670926658854224,
      "grad_norm": 0.14852313697338104,
      "learning_rate": 4.382831236932711e-05,
      "loss": 0.0955,
      "step": 30200
    },
    {
      "epoch": 2.688615207729896,
      "grad_norm": 0.15001444518566132,
      "learning_rate": 4.3731779335766154e-05,
      "loss": 0.0959,
      "step": 30400
    },
    {
      "epoch": 2.7063037566055677,
      "grad_norm": 0.14691965281963348,
      "learning_rate": 4.363460528105597e-05,
      "loss": 0.0957,
      "step": 30600
    },
    {
      "epoch": 2.723992305481239,
      "grad_norm": 0.14318469166755676,
      "learning_rate": 4.35367935305947e-05,
      "loss": 0.0967,
      "step": 30800
    },
    {
      "epoch": 2.7416808543569107,
      "grad_norm": 0.14508825540542603,
      "learning_rate": 4.34383474316031e-05,
      "loss": 0.0951,
      "step": 31000
    },
    {
      "epoch": 2.7593694032325824,
      "grad_norm": 0.15151719748973846,
      "learning_rate": 4.333927035301001e-05,
      "loss": 0.0958,
      "step": 31200
    },
    {
      "epoch": 2.7770579521082537,
      "grad_norm": 0.1437998265028,
      "learning_rate": 4.3239565685337044e-05,
      "loss": 0.0955,
      "step": 31400
    },
    {
      "epoch": 2.7947465009839254,
      "grad_norm": 0.14122720062732697,
      "learning_rate": 4.3139236840582575e-05,
      "loss": 0.0951,
      "step": 31600
    },
    {
      "epoch": 2.812435049859597,
      "grad_norm": 0.1338931769132614,
      "learning_rate": 4.303828725210498e-05,
      "loss": 0.0959,
      "step": 31800
    },
    {
      "epoch": 2.830123598735269,
      "grad_norm": 0.144461989402771,
      "learning_rate": 4.293672037450512e-05,
      "loss": 0.0951,
      "step": 32000
    },
    {
      "epoch": 2.8478121476109406,
      "grad_norm": 0.13668565452098846,
      "learning_rate": 4.2834539683508166e-05,
      "loss": 0.0959,
      "step": 32200
    },
    {
      "epoch": 2.865500696486612,
      "grad_norm": 0.15980184078216553,
      "learning_rate": 4.27317486758446e-05,
      "loss": 0.0951,
      "step": 32400
    },
    {
      "epoch": 2.8831892453622836,
      "grad_norm": 0.14222297072410583,
      "learning_rate": 4.262835086913058e-05,
      "loss": 0.0948,
      "step": 32600
    },
    {
      "epoch": 2.9008777942379553,
      "grad_norm": 0.1509125828742981,
      "learning_rate": 4.25243498017476e-05,
      "loss": 0.0949,
      "step": 32800
    },
    {
      "epoch": 2.9185663431136266,
      "grad_norm": 0.14582620561122894,
      "learning_rate": 4.241974903272132e-05,
      "loss": 0.0943,
      "step": 33000
    },
    {
      "epoch": 2.9362548919892983,
      "grad_norm": 0.130904421210289,
      "learning_rate": 4.231455214159985e-05,
      "loss": 0.0945,
      "step": 33200
    },
    {
      "epoch": 2.95394344086497,
      "grad_norm": 0.1373138129711151,
      "learning_rate": 4.220876272833121e-05,
      "loss": 0.0936,
      "step": 33400
    },
    {
      "epoch": 2.971631989740642,
      "grad_norm": 0.14548689126968384,
      "learning_rate": 4.210238441314017e-05,
      "loss": 0.0934,
      "step": 33600
    },
    {
      "epoch": 2.9893205386163135,
      "grad_norm": 0.13954471051692963,
      "learning_rate": 4.199542083640432e-05,
      "loss": 0.0937,
      "step": 33800
    },
    {
      "epoch": 3.0069869768058903,
      "grad_norm": 0.14478568732738495,
      "learning_rate": 4.188787565852952e-05,
      "loss": 0.0847,
      "step": 34000
    },
    {
      "epoch": 3.024675525681562,
      "grad_norm": 0.15503637492656708,
      "learning_rate": 4.177975255982463e-05,
      "loss": 0.0718,
      "step": 34200
    },
    {
      "epoch": 3.0423640745572333,
      "grad_norm": 0.16748104989528656,
      "learning_rate": 4.1671055240375575e-05,
      "loss": 0.0714,
      "step": 34400
    },
    {
      "epoch": 3.060052623432905,
      "grad_norm": 0.15309254825115204,
      "learning_rate": 4.156178741991872e-05,
      "loss": 0.0724,
      "step": 34600
    },
    {
      "epoch": 3.077741172308577,
      "grad_norm": 0.15351246297359467,
      "learning_rate": 4.1451952837713556e-05,
      "loss": 0.0722,
      "step": 34800
    }
  ],
  "logging_steps": 200,
  "max_steps": 113060,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0180563807244386e+20,
  "train_batch_size": 9,
  "trial_name": null,
  "trial_params": null
}