{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 710,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014104372355430184,
      "grad_norm": 1.0829510688781738,
      "learning_rate": 1.348314606741573e-06,
      "loss": 1.3063,
      "step": 5
    },
    {
      "epoch": 0.028208744710860368,
      "grad_norm": 0.91203373670578,
      "learning_rate": 3.033707865168539e-06,
      "loss": 1.2577,
      "step": 10
    },
    {
      "epoch": 0.04231311706629055,
      "grad_norm": 0.6704057455062866,
      "learning_rate": 4.719101123595506e-06,
      "loss": 1.334,
      "step": 15
    },
    {
      "epoch": 0.056417489421720736,
      "grad_norm": 0.5428482294082642,
      "learning_rate": 6.404494382022472e-06,
      "loss": 1.2117,
      "step": 20
    },
    {
      "epoch": 0.07052186177715092,
      "grad_norm": 0.4570103585720062,
      "learning_rate": 8.089887640449438e-06,
      "loss": 1.1806,
      "step": 25
    },
    {
      "epoch": 0.0846262341325811,
      "grad_norm": 0.5431040525436401,
      "learning_rate": 9.775280898876405e-06,
      "loss": 1.1791,
      "step": 30
    },
    {
      "epoch": 0.09873060648801128,
      "grad_norm": 0.6045286059379578,
      "learning_rate": 1.146067415730337e-05,
      "loss": 1.1561,
      "step": 35
    },
    {
      "epoch": 0.11283497884344147,
      "grad_norm": 0.816319465637207,
      "learning_rate": 1.3146067415730338e-05,
      "loss": 1.1815,
      "step": 40
    },
    {
      "epoch": 0.12693935119887165,
      "grad_norm": 0.41476839780807495,
      "learning_rate": 1.4831460674157303e-05,
      "loss": 1.1569,
      "step": 45
    },
    {
      "epoch": 0.14104372355430184,
      "grad_norm": 0.45171546936035156,
      "learning_rate": 1.651685393258427e-05,
      "loss": 1.1442,
      "step": 50
    },
    {
      "epoch": 0.15514809590973203,
      "grad_norm": 0.5023919343948364,
      "learning_rate": 1.8202247191011237e-05,
      "loss": 1.0983,
      "step": 55
    },
    {
      "epoch": 0.1692524682651622,
      "grad_norm": 0.54413241147995,
      "learning_rate": 1.98876404494382e-05,
      "loss": 1.1343,
      "step": 60
    },
    {
      "epoch": 0.18335684062059238,
      "grad_norm": 0.40992623567581177,
      "learning_rate": 2.1573033707865168e-05,
      "loss": 1.1189,
      "step": 65
    },
    {
      "epoch": 0.19746121297602257,
      "grad_norm": 0.5576924085617065,
      "learning_rate": 2.3258426966292135e-05,
      "loss": 1.1297,
      "step": 70
    },
    {
      "epoch": 0.21156558533145275,
      "grad_norm": 0.439005047082901,
      "learning_rate": 2.4943820224719103e-05,
      "loss": 1.1228,
      "step": 75
    },
    {
      "epoch": 0.22566995768688294,
      "grad_norm": 0.45460307598114014,
      "learning_rate": 2.6629213483146066e-05,
      "loss": 1.105,
      "step": 80
    },
    {
      "epoch": 0.2397743300423131,
      "grad_norm": 0.5392889976501465,
      "learning_rate": 2.8314606741573034e-05,
      "loss": 1.0453,
      "step": 85
    },
    {
      "epoch": 0.2538787023977433,
      "grad_norm": 0.49128258228302,
      "learning_rate": 3e-05,
      "loss": 1.0606,
      "step": 90
    },
    {
      "epoch": 0.2679830747531735,
      "grad_norm": 0.574536144733429,
      "learning_rate": 2.9999348997381465e-05,
      "loss": 1.0693,
      "step": 95
    },
    {
      "epoch": 0.2820874471086037,
      "grad_norm": 0.5417886972427368,
      "learning_rate": 2.999739604603311e-05,
      "loss": 1.0346,
      "step": 100
    },
    {
      "epoch": 0.29619181946403383,
      "grad_norm": 0.6563856601715088,
      "learning_rate": 2.9994141315471794e-05,
      "loss": 0.9918,
      "step": 105
    },
    {
      "epoch": 0.31029619181946405,
      "grad_norm": 0.5459888577461243,
      "learning_rate": 2.998958508820927e-05,
      "loss": 1.0295,
      "step": 110
    },
    {
      "epoch": 0.3244005641748942,
      "grad_norm": 0.5069293975830078,
      "learning_rate": 2.998372775972765e-05,
      "loss": 1.0381,
      "step": 115
    },
    {
      "epoch": 0.3385049365303244,
      "grad_norm": 0.5660542249679565,
      "learning_rate": 2.9976569838445096e-05,
      "loss": 0.9919,
      "step": 120
    },
    {
      "epoch": 0.3526093088857546,
      "grad_norm": 0.5492711663246155,
      "learning_rate": 2.9968111945671674e-05,
      "loss": 1.0029,
      "step": 125
    },
    {
      "epoch": 0.36671368124118475,
      "grad_norm": 0.6453714370727539,
      "learning_rate": 2.9958354815555426e-05,
      "loss": 0.9913,
      "step": 130
    },
    {
      "epoch": 0.38081805359661497,
      "grad_norm": 0.534812867641449,
      "learning_rate": 2.9947299295018656e-05,
      "loss": 0.9343,
      "step": 135
    },
    {
      "epoch": 0.39492242595204513,
      "grad_norm": 0.5858612060546875,
      "learning_rate": 2.9934946343684404e-05,
      "loss": 0.9805,
      "step": 140
    },
    {
      "epoch": 0.4090267983074753,
      "grad_norm": 0.5869054198265076,
      "learning_rate": 2.9921297033793158e-05,
      "loss": 1.0044,
      "step": 145
    },
    {
      "epoch": 0.4231311706629055,
      "grad_norm": 0.7152486443519592,
      "learning_rate": 2.9906352550109787e-05,
      "loss": 0.9804,
      "step": 150
    },
    {
      "epoch": 0.43723554301833567,
      "grad_norm": 0.6668853163719177,
      "learning_rate": 2.989011418982069e-05,
      "loss": 0.8677,
      "step": 155
    },
    {
      "epoch": 0.4513399153737659,
      "grad_norm": 0.7731665968894958,
      "learning_rate": 2.9872583362421203e-05,
      "loss": 0.9194,
      "step": 160
    },
    {
      "epoch": 0.46544428772919605,
      "grad_norm": 0.7152626514434814,
      "learning_rate": 2.985376158959328e-05,
      "loss": 0.8777,
      "step": 165
    },
    {
      "epoch": 0.4795486600846262,
      "grad_norm": 0.6192904114723206,
      "learning_rate": 2.983365050507336e-05,
      "loss": 0.9021,
      "step": 170
    },
    {
      "epoch": 0.4936530324400564,
      "grad_norm": 0.6604310274124146,
      "learning_rate": 2.9812251854510603e-05,
      "loss": 0.9059,
      "step": 175
    },
    {
      "epoch": 0.5077574047954866,
      "grad_norm": 0.7105734348297119,
      "learning_rate": 2.9789567495315357e-05,
      "loss": 0.9273,
      "step": 180
    },
    {
      "epoch": 0.5218617771509168,
      "grad_norm": 0.7766591906547546,
      "learning_rate": 2.976559939649791e-05,
      "loss": 0.8934,
      "step": 185
    },
    {
      "epoch": 0.535966149506347,
      "grad_norm": 0.9151214361190796,
      "learning_rate": 2.9740349638497614e-05,
      "loss": 0.9166,
      "step": 190
    },
    {
      "epoch": 0.5500705218617772,
      "grad_norm": 0.8262761831283569,
      "learning_rate": 2.971382041300228e-05,
      "loss": 0.8532,
      "step": 195
    },
    {
      "epoch": 0.5641748942172073,
      "grad_norm": 0.6876681447029114,
      "learning_rate": 2.9686014022757937e-05,
      "loss": 0.8347,
      "step": 200
    },
    {
      "epoch": 0.5782792665726375,
      "grad_norm": 0.7912429571151733,
      "learning_rate": 2.965693288136897e-05,
      "loss": 0.8992,
      "step": 205
    },
    {
      "epoch": 0.5923836389280677,
      "grad_norm": 0.7762657403945923,
      "learning_rate": 2.9626579513088606e-05,
      "loss": 0.85,
      "step": 210
    },
    {
      "epoch": 0.6064880112834978,
      "grad_norm": 0.8784447312355042,
      "learning_rate": 2.959495655259981e-05,
      "loss": 0.7934,
      "step": 215
    },
    {
      "epoch": 0.6205923836389281,
      "grad_norm": 0.8992597460746765,
      "learning_rate": 2.9562066744786587e-05,
      "loss": 0.8254,
      "step": 220
    },
    {
      "epoch": 0.6346967559943583,
      "grad_norm": 0.7901438474655151,
      "learning_rate": 2.9527912944495748e-05,
      "loss": 0.8078,
      "step": 225
    },
    {
      "epoch": 0.6488011283497884,
      "grad_norm": 0.8354968428611755,
      "learning_rate": 2.9492498116289072e-05,
      "loss": 0.856,
      "step": 230
    },
    {
      "epoch": 0.6629055007052186,
      "grad_norm": 1.1308631896972656,
      "learning_rate": 2.9455825334186023e-05,
      "loss": 0.7691,
      "step": 235
    },
    {
      "epoch": 0.6770098730606487,
      "grad_norm": 0.8235520124435425,
      "learning_rate": 2.9417897781396883e-05,
      "loss": 0.7801,
      "step": 240
    },
    {
      "epoch": 0.691114245416079,
      "grad_norm": 0.7662566900253296,
      "learning_rate": 2.937871875004648e-05,
      "loss": 0.7438,
      "step": 245
    },
    {
      "epoch": 0.7052186177715092,
      "grad_norm": 0.8254992961883545,
      "learning_rate": 2.9338291640888413e-05,
      "loss": 0.8526,
      "step": 250
    },
    {
      "epoch": 0.7193229901269393,
      "grad_norm": 0.783316433429718,
      "learning_rate": 2.9296619963009866e-05,
      "loss": 0.782,
      "step": 255
    },
    {
      "epoch": 0.7334273624823695,
      "grad_norm": 0.910808801651001,
      "learning_rate": 2.925370733352704e-05,
      "loss": 0.7768,
      "step": 260
    },
    {
      "epoch": 0.7475317348377997,
      "grad_norm": 0.8336752653121948,
      "learning_rate": 2.920955747727115e-05,
      "loss": 0.7485,
      "step": 265
    },
    {
      "epoch": 0.7616361071932299,
      "grad_norm": 0.9764995574951172,
      "learning_rate": 2.9164174226465134e-05,
      "loss": 0.7457,
      "step": 270
    },
    {
      "epoch": 0.7757404795486601,
      "grad_norm": 0.9447649717330933,
      "learning_rate": 2.9117561520391002e-05,
      "loss": 0.7196,
      "step": 275
    },
    {
      "epoch": 0.7898448519040903,
      "grad_norm": 0.8971619606018066,
      "learning_rate": 2.9069723405047923e-05,
      "loss": 0.7608,
      "step": 280
    },
    {
      "epoch": 0.8039492242595204,
      "grad_norm": 0.9162759184837341,
      "learning_rate": 2.902066403280101e-05,
      "loss": 0.7038,
      "step": 285
    },
    {
      "epoch": 0.8180535966149506,
      "grad_norm": 1.049428105354309,
      "learning_rate": 2.8970387662020898e-05,
      "loss": 0.705,
      "step": 290
    },
    {
      "epoch": 0.8321579689703809,
      "grad_norm": 0.8928109407424927,
      "learning_rate": 2.8918898656714127e-05,
      "loss": 0.7331,
      "step": 295
    },
    {
      "epoch": 0.846262341325811,
      "grad_norm": 0.928637683391571,
      "learning_rate": 2.8866201486144336e-05,
      "loss": 0.7034,
      "step": 300
    },
    {
      "epoch": 0.8603667136812412,
      "grad_norm": 0.9384335279464722,
      "learning_rate": 2.881230072444432e-05,
      "loss": 0.735,
      "step": 305
    },
    {
      "epoch": 0.8744710860366713,
      "grad_norm": 0.879400908946991,
      "learning_rate": 2.8757201050219027e-05,
      "loss": 0.7082,
      "step": 310
    },
    {
      "epoch": 0.8885754583921015,
      "grad_norm": 0.9233940839767456,
      "learning_rate": 2.8700907246139413e-05,
      "loss": 0.6922,
      "step": 315
    },
    {
      "epoch": 0.9026798307475318,
      "grad_norm": 1.0604772567749023,
      "learning_rate": 2.8643424198527314e-05,
      "loss": 0.7058,
      "step": 320
    },
    {
      "epoch": 0.9167842031029619,
      "grad_norm": 0.9149603843688965,
      "learning_rate": 2.858475689693135e-05,
      "loss": 0.6531,
      "step": 325
    },
    {
      "epoch": 0.9308885754583921,
      "grad_norm": 0.9202451109886169,
      "learning_rate": 2.852491043369377e-05,
      "loss": 0.6501,
      "step": 330
    },
    {
      "epoch": 0.9449929478138223,
      "grad_norm": 1.123238205909729,
      "learning_rate": 2.8463890003508488e-05,
      "loss": 0.6438,
      "step": 335
    },
    {
      "epoch": 0.9590973201692524,
      "grad_norm": 0.9209488034248352,
      "learning_rate": 2.840170090297014e-05,
      "loss": 0.6568,
      "step": 340
    },
    {
      "epoch": 0.9732016925246827,
      "grad_norm": 1.0271188020706177,
      "learning_rate": 2.833834853011437e-05,
      "loss": 0.6731,
      "step": 345
    },
    {
      "epoch": 0.9873060648801129,
      "grad_norm": 1.0789600610733032,
      "learning_rate": 2.827383838394926e-05,
      "loss": 0.6798,
      "step": 350
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.23028564453125,
      "learning_rate": 2.8208176063978018e-05,
      "loss": 0.6558,
      "step": 355
    },
    {
      "epoch": 1.0141043723554302,
      "grad_norm": 0.9986002445220947,
      "learning_rate": 2.814136726971294e-05,
      "loss": 0.5642,
      "step": 360
    },
    {
      "epoch": 1.0282087447108603,
      "grad_norm": 1.0259571075439453,
      "learning_rate": 2.8073417800180707e-05,
      "loss": 0.564,
      "step": 365
    },
    {
      "epoch": 1.0423131170662905,
      "grad_norm": 0.9642548561096191,
      "learning_rate": 2.800433355341898e-05,
      "loss": 0.5423,
      "step": 370
    },
    {
      "epoch": 1.0564174894217206,
      "grad_norm": 0.9903756380081177,
      "learning_rate": 2.793412052596451e-05,
      "loss": 0.5246,
      "step": 375
    },
    {
      "epoch": 1.0705218617771508,
      "grad_norm": 0.9968107342720032,
      "learning_rate": 2.7862784812332592e-05,
      "loss": 0.5371,
      "step": 380
    },
    {
      "epoch": 1.0846262341325812,
      "grad_norm": 1.2498723268508911,
      "learning_rate": 2.779033260448807e-05,
      "loss": 0.5538,
      "step": 385
    },
    {
      "epoch": 1.0987306064880114,
      "grad_norm": 1.061000943183899,
      "learning_rate": 2.7716770191307887e-05,
      "loss": 0.5467,
      "step": 390
    },
    {
      "epoch": 1.1128349788434415,
      "grad_norm": 0.9731029272079468,
      "learning_rate": 2.7642103958035188e-05,
      "loss": 0.4912,
      "step": 395
    },
    {
      "epoch": 1.1269393511988717,
      "grad_norm": 0.9743561744689941,
      "learning_rate": 2.756634038572509e-05,
      "loss": 0.5527,
      "step": 400
    },
    {
      "epoch": 1.1410437235543018,
      "grad_norm": 1.0373893976211548,
      "learning_rate": 2.748948605068212e-05,
      "loss": 0.5109,
      "step": 405
    },
    {
      "epoch": 1.155148095909732,
      "grad_norm": 1.0031832456588745,
      "learning_rate": 2.7411547623889397e-05,
      "loss": 0.5417,
      "step": 410
    },
    {
      "epoch": 1.1692524682651622,
      "grad_norm": 1.2156072854995728,
      "learning_rate": 2.7332531870429574e-05,
      "loss": 0.4838,
      "step": 415
    },
    {
      "epoch": 1.1833568406205923,
      "grad_norm": 0.9628976583480835,
      "learning_rate": 2.7252445648897643e-05,
      "loss": 0.4965,
      "step": 420
    },
    {
      "epoch": 1.1974612129760225,
      "grad_norm": 1.100713849067688,
      "learning_rate": 2.7171295910805585e-05,
      "loss": 0.4919,
      "step": 425
    },
    {
      "epoch": 1.2115655853314529,
      "grad_norm": 1.0750036239624023,
      "learning_rate": 2.708908969997901e-05,
      "loss": 0.5031,
      "step": 430
    },
    {
      "epoch": 1.225669957686883,
      "grad_norm": 1.0613516569137573,
      "learning_rate": 2.7005834151945708e-05,
      "loss": 0.5087,
      "step": 435
    },
    {
      "epoch": 1.2397743300423132,
      "grad_norm": 1.2760698795318604,
      "learning_rate": 2.6921536493316327e-05,
      "loss": 0.5021,
      "step": 440
    },
    {
      "epoch": 1.2538787023977433,
      "grad_norm": 0.9473477005958557,
      "learning_rate": 2.683620404115706e-05,
      "loss": 0.5002,
      "step": 445
    },
    {
      "epoch": 1.2679830747531735,
      "grad_norm": 1.1720986366271973,
      "learning_rate": 2.674984420235455e-05,
      "loss": 0.5148,
      "step": 450
    },
    {
      "epoch": 1.2820874471086037,
      "grad_norm": 1.0552853345870972,
      "learning_rate": 2.6662464472972958e-05,
      "loss": 0.5056,
      "step": 455
    },
    {
      "epoch": 1.2961918194640338,
      "grad_norm": 1.0545684099197388,
      "learning_rate": 2.65740724376033e-05,
      "loss": 0.4976,
      "step": 460
    },
    {
      "epoch": 1.310296191819464,
      "grad_norm": 0.8876795172691345,
      "learning_rate": 2.6484675768705102e-05,
      "loss": 0.4761,
      "step": 465
    },
    {
      "epoch": 1.3244005641748942,
      "grad_norm": 0.9161580204963684,
      "learning_rate": 2.6394282225940445e-05,
      "loss": 0.4589,
      "step": 470
    },
    {
      "epoch": 1.3385049365303243,
      "grad_norm": 1.0101513862609863,
      "learning_rate": 2.63028996555004e-05,
      "loss": 0.4852,
      "step": 475
    },
    {
      "epoch": 1.3526093088857545,
      "grad_norm": 1.0795925855636597,
      "learning_rate": 2.6210535989423978e-05,
      "loss": 0.4633,
      "step": 480
    },
    {
      "epoch": 1.3667136812411846,
      "grad_norm": 1.032175064086914,
      "learning_rate": 2.6117199244909655e-05,
      "loss": 0.4791,
      "step": 485
    },
    {
      "epoch": 1.380818053596615,
      "grad_norm": 0.9439494013786316,
      "learning_rate": 2.6022897523619423e-05,
      "loss": 0.4717,
      "step": 490
    },
    {
      "epoch": 1.3949224259520452,
      "grad_norm": 1.0860165357589722,
      "learning_rate": 2.592763901097564e-05,
      "loss": 0.4829,
      "step": 495
    },
    {
      "epoch": 1.4090267983074753,
      "grad_norm": 0.9606081247329712,
      "learning_rate": 2.583143197545044e-05,
      "loss": 0.4861,
      "step": 500
    },
    {
      "epoch": 1.4231311706629055,
      "grad_norm": 1.1352707147598267,
      "learning_rate": 2.5734284767848108e-05,
      "loss": 0.4769,
      "step": 505
    },
    {
      "epoch": 1.4372355430183357,
      "grad_norm": 1.0786997079849243,
      "learning_rate": 2.5636205820580173e-05,
      "loss": 0.4504,
      "step": 510
    },
    {
      "epoch": 1.4513399153737658,
      "grad_norm": 1.1144368648529053,
      "learning_rate": 2.553720364693351e-05,
      "loss": 0.4326,
      "step": 515
    },
    {
      "epoch": 1.465444287729196,
      "grad_norm": 1.0776797533035278,
      "learning_rate": 2.543728684033135e-05,
      "loss": 0.4121,
      "step": 520
    },
    {
      "epoch": 1.4795486600846262,
      "grad_norm": 1.1324986219406128,
      "learning_rate": 2.5336464073587395e-05,
      "loss": 0.3999,
      "step": 525
    },
    {
      "epoch": 1.4936530324400565,
      "grad_norm": 0.951062023639679,
      "learning_rate": 2.5234744098153e-05,
      "loss": 0.4736,
      "step": 530
    },
    {
      "epoch": 1.5077574047954867,
      "grad_norm": 1.1760550737380981,
      "learning_rate": 2.5132135743357546e-05,
      "loss": 0.4411,
      "step": 535
    },
    {
      "epoch": 1.5218617771509169,
      "grad_norm": 1.1214802265167236,
      "learning_rate": 2.502864791564205e-05,
      "loss": 0.43,
      "step": 540
    },
    {
      "epoch": 1.535966149506347,
      "grad_norm": 1.0796761512756348,
      "learning_rate": 2.492428959778609e-05,
      "loss": 0.455,
      "step": 545
    },
    {
      "epoch": 1.5500705218617772,
      "grad_norm": 1.148000717163086,
      "learning_rate": 2.48190698481281e-05,
      "loss": 0.4235,
      "step": 550
    },
    {
      "epoch": 1.5641748942172073,
      "grad_norm": 1.0369551181793213,
      "learning_rate": 2.4712997799779077e-05,
      "loss": 0.4066,
      "step": 555
    },
    {
      "epoch": 1.5782792665726375,
      "grad_norm": 1.153380036354065,
      "learning_rate": 2.4606082659829852e-05,
      "loss": 0.4079,
      "step": 560
    },
    {
      "epoch": 1.5923836389280677,
      "grad_norm": 1.2209539413452148,
      "learning_rate": 2.4498333708551906e-05,
      "loss": 0.4335,
      "step": 565
    },
    {
      "epoch": 1.6064880112834978,
      "grad_norm": 1.097364068031311,
      "learning_rate": 2.4389760298591825e-05,
      "loss": 0.4187,
      "step": 570
    },
    {
      "epoch": 1.620592383638928,
      "grad_norm": 1.0410133600234985,
      "learning_rate": 2.4280371854159502e-05,
      "loss": 0.4305,
      "step": 575
    },
    {
      "epoch": 1.6346967559943582,
      "grad_norm": 1.2560479640960693,
      "learning_rate": 2.417017787021011e-05,
      "loss": 0.3681,
      "step": 580
    },
    {
      "epoch": 1.6488011283497883,
      "grad_norm": 1.1253535747528076,
      "learning_rate": 2.405918791161992e-05,
      "loss": 0.416,
      "step": 585
    },
    {
      "epoch": 1.6629055007052185,
      "grad_norm": 1.1615822315216064,
      "learning_rate": 2.3947411612356093e-05,
      "loss": 0.4539,
      "step": 590
    },
    {
      "epoch": 1.6770098730606486,
      "grad_norm": 1.0008732080459595,
      "learning_rate": 2.3834858674640434e-05,
      "loss": 0.4258,
      "step": 595
    },
    {
      "epoch": 1.691114245416079,
      "grad_norm": 0.9982715845108032,
      "learning_rate": 2.3721538868107226e-05,
      "loss": 0.4154,
      "step": 600
    },
    {
      "epoch": 1.7052186177715092,
      "grad_norm": 1.1458238363265991,
      "learning_rate": 2.3607462028955245e-05,
      "loss": 0.3803,
      "step": 605
    },
    {
      "epoch": 1.7193229901269393,
      "grad_norm": 1.0373518466949463,
      "learning_rate": 2.3492638059093958e-05,
      "loss": 0.3826,
      "step": 610
    },
    {
      "epoch": 1.7334273624823695,
      "grad_norm": 1.3240524530410767,
      "learning_rate": 2.3377076925284037e-05,
      "loss": 0.4086,
      "step": 615
    },
    {
      "epoch": 1.7475317348377997,
      "grad_norm": 1.0962055921554565,
      "learning_rate": 2.3260788658272244e-05,
      "loss": 0.3553,
      "step": 620
    },
    {
      "epoch": 1.76163610719323,
      "grad_norm": 1.075670838356018,
      "learning_rate": 2.3143783351920753e-05,
      "loss": 0.3749,
      "step": 625
    },
    {
      "epoch": 1.7757404795486602,
      "grad_norm": 0.9956583976745605,
      "learning_rate": 2.3026071162331012e-05,
      "loss": 0.376,
      "step": 630
    },
    {
      "epoch": 1.7898448519040904,
      "grad_norm": 1.0385911464691162,
      "learning_rate": 2.2907662306962176e-05,
      "loss": 0.3529,
      "step": 635
    },
    {
      "epoch": 1.8039492242595205,
      "grad_norm": 1.147087574005127,
      "learning_rate": 2.278856706374422e-05,
      "loss": 0.3979,
      "step": 640
    },
    {
      "epoch": 1.8180535966149507,
      "grad_norm": 1.094008445739746,
      "learning_rate": 2.266879577018585e-05,
      "loss": 0.3979,
      "step": 645
    },
    {
      "epoch": 1.8321579689703809,
      "grad_norm": 1.116995930671692,
      "learning_rate": 2.2548358822477158e-05,
      "loss": 0.3869,
      "step": 650
    },
    {
      "epoch": 1.846262341325811,
      "grad_norm": 0.991368293762207,
      "learning_rate": 2.242726667458726e-05,
      "loss": 0.343,
      "step": 655
    },
    {
      "epoch": 1.8603667136812412,
      "grad_norm": 1.0833584070205688,
      "learning_rate": 2.2305529837356857e-05,
      "loss": 0.3563,
      "step": 660
    },
    {
      "epoch": 1.8744710860366713,
      "grad_norm": 1.0229146480560303,
      "learning_rate": 2.2183158877585937e-05,
      "loss": 0.3327,
      "step": 665
    },
    {
      "epoch": 1.8885754583921015,
      "grad_norm": 0.9877346754074097,
      "learning_rate": 2.206016441711652e-05,
      "loss": 0.3832,
      "step": 670
    },
    {
      "epoch": 1.9026798307475317,
      "grad_norm": 1.1531109809875488,
      "learning_rate": 2.1936557131910733e-05,
      "loss": 0.3904,
      "step": 675
    },
    {
      "epoch": 1.9167842031029618,
      "grad_norm": 1.0331673622131348,
      "learning_rate": 2.1812347751124074e-05,
      "loss": 0.331,
      "step": 680
    },
    {
      "epoch": 1.930888575458392,
      "grad_norm": 1.2358472347259521,
      "learning_rate": 2.1687547056174172e-05,
      "loss": 0.3152,
      "step": 685
    },
    {
      "epoch": 1.9449929478138221,
      "grad_norm": 1.2115956544876099,
      "learning_rate": 2.156216587980491e-05,
      "loss": 0.3641,
      "step": 690
    },
    {
      "epoch": 1.9590973201692523,
      "grad_norm": 1.191973328590393,
      "learning_rate": 2.1436215105146178e-05,
      "loss": 0.3296,
      "step": 695
    },
    {
      "epoch": 1.9732016925246827,
      "grad_norm": 1.093503475189209,
      "learning_rate": 2.1309705664769198e-05,
      "loss": 0.3374,
      "step": 700
    },
    {
      "epoch": 1.9873060648801129,
      "grad_norm": 1.0156272649765015,
      "learning_rate": 2.1182648539737547e-05,
      "loss": 0.3241,
      "step": 705
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.5677207708358765,
      "learning_rate": 2.1055054758654053e-05,
      "loss": 0.3256,
      "step": 710
    }
  ],
  "logging_steps": 5,
  "max_steps": 1775,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1044507413821522e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}