{ "best_global_step": 1044, "best_metric": 0.5790691375732422, "best_model_checkpoint": "/workspace/scripts/soutputs/8ca8a9ea-9ae3-4938-9713-015819984d61/checkpoint-1044", "epoch": 2.9914040114613183, "eval_steps": 500, "global_step": 1044, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014326647564469915, "grad_norm": 1.6428219079971313, "learning_rate": 1.4026021586989397e-06, "loss": 0.9723, "step": 5 }, { "epoch": 0.02865329512893983, "grad_norm": 1.3679360151290894, "learning_rate": 3.155854857072614e-06, "loss": 0.9416, "step": 10 }, { "epoch": 0.04297994269340974, "grad_norm": 1.0384185314178467, "learning_rate": 4.9091075554462895e-06, "loss": 0.8955, "step": 15 }, { "epoch": 0.05730659025787966, "grad_norm": 0.6389966607093811, "learning_rate": 6.662360253819964e-06, "loss": 0.8219, "step": 20 }, { "epoch": 0.07163323782234957, "grad_norm": 0.46849510073661804, "learning_rate": 8.415612952193638e-06, "loss": 0.7583, "step": 25 }, { "epoch": 0.08595988538681948, "grad_norm": 0.5466313362121582, "learning_rate": 1.0168865650567315e-05, "loss": 0.7283, "step": 30 }, { "epoch": 0.10028653295128939, "grad_norm": 0.46641281247138977, "learning_rate": 1.1922118348940989e-05, "loss": 0.708, "step": 35 }, { "epoch": 0.11461318051575932, "grad_norm": 0.5155534744262695, "learning_rate": 1.2272343115538091e-05, "loss": 0.7074, "step": 40 }, { "epoch": 0.12893982808022922, "grad_norm": 0.45078691840171814, "learning_rate": 1.2270613524924088e-05, "loss": 0.6699, "step": 45 }, { "epoch": 0.14326647564469913, "grad_norm": 0.4526143968105316, "learning_rate": 1.2267553922326047e-05, "loss": 0.6663, "step": 50 }, { "epoch": 0.15759312320916904, "grad_norm": 0.44320717453956604, "learning_rate": 1.2263165044858593e-05, "loss": 0.6612, "step": 55 }, { "epoch": 0.17191977077363896, "grad_norm": 0.4532703757286072, "learning_rate": 1.2257447949883163e-05, "loss": 0.6705, "step": 60 }, { "epoch": 0.18624641833810887, "grad_norm": 0.3655495345592499, "learning_rate": 1.2250404014753254e-05, "loss": 0.6574, "step": 65 }, { "epoch": 0.20057306590257878, "grad_norm": 0.3733099102973938, "learning_rate": 1.2242034936482603e-05, "loss": 0.6834, "step": 70 }, { "epoch": 0.2148997134670487, "grad_norm": 0.355129599571228, "learning_rate": 1.2232342731336339e-05, "loss": 0.6645, "step": 75 }, { "epoch": 0.22922636103151864, "grad_norm": 0.358656108379364, "learning_rate": 1.222132973434523e-05, "loss": 0.653, "step": 80 }, { "epoch": 0.24355300859598855, "grad_norm": 0.29975712299346924, "learning_rate": 1.2208998598743134e-05, "loss": 0.6719, "step": 85 }, { "epoch": 0.25787965616045844, "grad_norm": 0.32437002658843994, "learning_rate": 1.2195352295327777e-05, "loss": 0.6661, "step": 90 }, { "epoch": 0.2722063037249284, "grad_norm": 0.28565841913223267, "learning_rate": 1.2180394111745045e-05, "loss": 0.6515, "step": 95 }, { "epoch": 0.28653295128939826, "grad_norm": 0.28558802604675293, "learning_rate": 1.2164127651696922e-05, "loss": 0.6448, "step": 100 }, { "epoch": 0.28653295128939826, "eval_loss": 0.6602650880813599, "eval_runtime": 2.8798, "eval_samples_per_second": 14.584, "eval_steps_per_second": 14.584, "step": 100 }, { "epoch": 0.3008595988538682, "grad_norm": 0.3361125886440277, "learning_rate": 1.214655683407329e-05, "loss": 0.6516, "step": 105 }, { "epoch": 0.3151862464183381, "grad_norm": 0.2776224613189697, "learning_rate": 1.2127685892007806e-05, "loss": 0.6592, "step": 110 }, { "epoch": 0.32951289398280803, "grad_norm": 0.26801374554634094, "learning_rate": 1.2107519371858048e-05, "loss": 0.6565, "step": 115 }, { "epoch": 0.3438395415472779, "grad_norm": 0.297080934047699, "learning_rate": 1.2086062132110227e-05, "loss": 0.642, "step": 120 }, { "epoch": 0.35816618911174786, "grad_norm": 0.28340891003608704, "learning_rate": 1.2063319342208684e-05, "loss": 0.6478, "step": 125 }, { "epoch": 0.37249283667621774, "grad_norm": 0.2782769203186035, "learning_rate": 1.2039296481310471e-05, "loss": 0.6368, "step": 130 }, { "epoch": 0.3868194842406877, "grad_norm": 0.292530357837677, "learning_rate": 1.2013999336965322e-05, "loss": 0.6153, "step": 135 }, { "epoch": 0.40114613180515757, "grad_norm": 0.24663622677326202, "learning_rate": 1.1987434003721335e-05, "loss": 0.6424, "step": 140 }, { "epoch": 0.4154727793696275, "grad_norm": 0.2681853473186493, "learning_rate": 1.195960688165667e-05, "loss": 0.6348, "step": 145 }, { "epoch": 0.4297994269340974, "grad_norm": 0.2627250850200653, "learning_rate": 1.1930524674837664e-05, "loss": 0.6249, "step": 150 }, { "epoch": 0.44412607449856734, "grad_norm": 0.24072442948818207, "learning_rate": 1.1900194389703684e-05, "loss": 0.6391, "step": 155 }, { "epoch": 0.4584527220630373, "grad_norm": 0.25336554646492004, "learning_rate": 1.1868623333379166e-05, "loss": 0.6298, "step": 160 }, { "epoch": 0.47277936962750716, "grad_norm": 0.2672167718410492, "learning_rate": 1.1835819111913174e-05, "loss": 0.6368, "step": 165 }, { "epoch": 0.4871060171919771, "grad_norm": 0.2560673952102661, "learning_rate": 1.1801789628446977e-05, "loss": 0.6318, "step": 170 }, { "epoch": 0.501432664756447, "grad_norm": 0.27951574325561523, "learning_rate": 1.1766543081310029e-05, "loss": 0.6109, "step": 175 }, { "epoch": 0.5157593123209169, "grad_norm": 0.25252604484558105, "learning_rate": 1.1730087962044844e-05, "loss": 0.6273, "step": 180 }, { "epoch": 0.5300859598853869, "grad_norm": 0.25956350564956665, "learning_rate": 1.1692433053361224e-05, "loss": 0.6133, "step": 185 }, { "epoch": 0.5444126074498568, "grad_norm": 0.2530823349952698, "learning_rate": 1.165358742702035e-05, "loss": 0.6214, "step": 190 }, { "epoch": 0.5587392550143266, "grad_norm": 0.2583998143672943, "learning_rate": 1.1613560441649214e-05, "loss": 0.6105, "step": 195 }, { "epoch": 0.5730659025787965, "grad_norm": 0.27742502093315125, "learning_rate": 1.1572361740485967e-05, "loss": 0.6349, "step": 200 }, { "epoch": 0.5730659025787965, "eval_loss": 0.6322649717330933, "eval_runtime": 2.88, "eval_samples_per_second": 14.583, "eval_steps_per_second": 14.583, "step": 200 }, { "epoch": 0.5873925501432665, "grad_norm": 0.2662568688392639, "learning_rate": 1.1530001249056676e-05, "loss": 0.6299, "step": 205 }, { "epoch": 0.6017191977077364, "grad_norm": 0.2614499032497406, "learning_rate": 1.148648917278409e-05, "loss": 0.6005, "step": 210 }, { "epoch": 0.6160458452722063, "grad_norm": 0.26987332105636597, "learning_rate": 1.1441835994528954e-05, "loss": 0.6214, "step": 215 }, { "epoch": 0.6303724928366762, "grad_norm": 0.24090726673603058, "learning_rate": 1.1396052472064512e-05, "loss": 0.6245, "step": 220 }, { "epoch": 0.6446991404011462, "grad_norm": 0.2746104300022125, "learning_rate": 1.1349149635484741e-05, "loss": 0.6222, "step": 225 }, { "epoch": 0.6590257879656161, "grad_norm": 0.26875993609428406, "learning_rate": 1.1301138784547013e-05, "loss": 0.6092, "step": 230 }, { "epoch": 0.673352435530086, "grad_norm": 0.2399819940328598, "learning_rate": 1.1252031485949773e-05, "loss": 0.6177, "step": 235 }, { "epoch": 0.6876790830945558, "grad_norm": 0.27088305354118347, "learning_rate": 1.1201839570545898e-05, "loss": 0.6024, "step": 240 }, { "epoch": 0.7020057306590258, "grad_norm": 0.2598998248577118, "learning_rate": 1.1150575130492442e-05, "loss": 0.6068, "step": 245 }, { "epoch": 0.7163323782234957, "grad_norm": 0.26509082317352295, "learning_rate": 1.1098250516337403e-05, "loss": 0.6128, "step": 250 }, { "epoch": 0.7306590257879656, "grad_norm": 0.23148998618125916, "learning_rate": 1.1044878334044251e-05, "loss": 0.6225, "step": 255 }, { "epoch": 0.7449856733524355, "grad_norm": 0.23298867046833038, "learning_rate": 1.0990471441954915e-05, "loss": 0.6176, "step": 260 }, { "epoch": 0.7593123209169055, "grad_norm": 0.25643882155418396, "learning_rate": 1.093504294769198e-05, "loss": 0.6132, "step": 265 }, { "epoch": 0.7736389684813754, "grad_norm": 0.2456223964691162, "learning_rate": 1.087860620500081e-05, "loss": 0.6083, "step": 270 }, { "epoch": 0.7879656160458453, "grad_norm": 0.24799339473247528, "learning_rate": 1.0821174810532391e-05, "loss": 0.6064, "step": 275 }, { "epoch": 0.8022922636103151, "grad_norm": 0.24989920854568481, "learning_rate": 1.076276260056765e-05, "loss": 0.6063, "step": 280 }, { "epoch": 0.8166189111747851, "grad_norm": 0.253239244222641, "learning_rate": 1.0703383647684028e-05, "loss": 0.6071, "step": 285 }, { "epoch": 0.830945558739255, "grad_norm": 0.24544061720371246, "learning_rate": 1.064305225736515e-05, "loss": 0.611, "step": 290 }, { "epoch": 0.8452722063037249, "grad_norm": 0.24104644358158112, "learning_rate": 1.0581782964554359e-05, "loss": 0.5985, "step": 295 }, { "epoch": 0.8595988538681948, "grad_norm": 0.23256933689117432, "learning_rate": 1.0519590530152995e-05, "loss": 0.5887, "step": 300 }, { "epoch": 0.8595988538681948, "eval_loss": 0.6149212718009949, "eval_runtime": 2.8878, "eval_samples_per_second": 14.544, "eval_steps_per_second": 14.544, "step": 300 }, { "epoch": 0.8739255014326648, "grad_norm": 0.26569247245788574, "learning_rate": 1.0456489937464206e-05, "loss": 0.5988, "step": 305 }, { "epoch": 0.8882521489971347, "grad_norm": 0.2356170415878296, "learning_rate": 1.0392496388583203e-05, "loss": 0.6133, "step": 310 }, { "epoch": 0.9025787965616046, "grad_norm": 0.25165677070617676, "learning_rate": 1.0327625300734795e-05, "loss": 0.6022, "step": 315 }, { "epoch": 0.9169054441260746, "grad_norm": 0.2422744333744049, "learning_rate": 1.0261892302559097e-05, "loss": 0.6209, "step": 320 }, { "epoch": 0.9312320916905444, "grad_norm": 0.2504790723323822, "learning_rate": 1.019531323034629e-05, "loss": 0.5836, "step": 325 }, { "epoch": 0.9455587392550143, "grad_norm": 0.23083172738552094, "learning_rate": 1.0127904124221387e-05, "loss": 0.6036, "step": 330 }, { "epoch": 0.9598853868194842, "grad_norm": 0.23841316998004913, "learning_rate": 1.0059681224279856e-05, "loss": 0.6028, "step": 335 }, { "epoch": 0.9742120343839542, "grad_norm": 0.2634727656841278, "learning_rate": 9.990660966675092e-06, "loss": 0.6074, "step": 340 }, { "epoch": 0.9885386819484241, "grad_norm": 0.22332459688186646, "learning_rate": 9.920859979658633e-06, "loss": 0.6061, "step": 345 }, { "epoch": 0.997134670487106, "eval_loss": 0.6086744070053101, "eval_runtime": 2.8877, "eval_samples_per_second": 14.544, "eval_steps_per_second": 14.544, "step": 348 }, { "epoch": 1.0, "eval_loss": 0.6092488169670105, "eval_runtime": 2.8916, "eval_samples_per_second": 14.525, "eval_steps_per_second": 14.525, "step": 349 }, { "epoch": 1.002865329512894, "grad_norm": 0.23956461250782013, "learning_rate": 9.85029507957412e-06, "loss": 0.5824, "step": 350 }, { "epoch": 1.0171919770773639, "grad_norm": 0.2437165081501007, "learning_rate": 9.77898326680592e-06, "loss": 0.5803, "step": 355 }, { "epoch": 1.0315186246418337, "grad_norm": 0.2500912845134735, "learning_rate": 9.706941721683432e-06, "loss": 0.5957, "step": 360 }, { "epoch": 1.0458452722063036, "grad_norm": 0.2493949979543686, "learning_rate": 9.634187800342016e-06, "loss": 0.5911, "step": 365 }, { "epoch": 1.0601719197707737, "grad_norm": 0.23148047924041748, "learning_rate": 9.56073903054159e-06, "loss": 0.5688, "step": 370 }, { "epoch": 1.0744985673352436, "grad_norm": 0.23534221947193146, "learning_rate": 9.486613107443863e-06, "loss": 0.5938, "step": 375 }, { "epoch": 1.0888252148997135, "grad_norm": 0.23032759130001068, "learning_rate": 9.411827889349254e-06, "loss": 0.5675, "step": 380 }, { "epoch": 1.1031518624641834, "grad_norm": 0.23191657662391663, "learning_rate": 9.336401393394483e-06, "loss": 0.5899, "step": 385 }, { "epoch": 1.1174785100286533, "grad_norm": 0.2217395305633545, "learning_rate": 9.260351791211929e-06, "loss": 0.5726, "step": 390 }, { "epoch": 1.1318051575931232, "grad_norm": 0.2425890415906906, "learning_rate": 9.183697404551733e-06, "loss": 0.5762, "step": 395 }, { "epoch": 1.146131805157593, "grad_norm": 0.2324853092432022, "learning_rate": 9.106456700867764e-06, "loss": 0.596, "step": 400 }, { "epoch": 1.146131805157593, "eval_loss": 0.6035182476043701, "eval_runtime": 2.8972, "eval_samples_per_second": 14.497, "eval_steps_per_second": 14.497, "step": 400 }, { "epoch": 1.1604584527220632, "grad_norm": 0.23952153325080872, "learning_rate": 9.028648288868459e-06, "loss": 0.5904, "step": 405 }, { "epoch": 1.174785100286533, "grad_norm": 0.23701021075248718, "learning_rate": 8.950290914033645e-06, "loss": 0.5785, "step": 410 }, { "epoch": 1.189111747851003, "grad_norm": 0.2227863371372223, "learning_rate": 8.871403454098416e-06, "loss": 0.5724, "step": 415 }, { "epoch": 1.2034383954154728, "grad_norm": 0.2232217639684677, "learning_rate": 8.792004914505126e-06, "loss": 0.5727, "step": 420 }, { "epoch": 1.2177650429799427, "grad_norm": 0.24012598395347595, "learning_rate": 8.712114423824633e-06, "loss": 0.589, "step": 425 }, { "epoch": 1.2320916905444126, "grad_norm": 0.2352171540260315, "learning_rate": 8.631751229147881e-06, "loss": 0.5667, "step": 430 }, { "epoch": 1.2464183381088825, "grad_norm": 0.23246026039123535, "learning_rate": 8.550934691448907e-06, "loss": 0.5927, "step": 435 }, { "epoch": 1.2607449856733524, "grad_norm": 0.24500536918640137, "learning_rate": 8.469684280920438e-06, "loss": 0.5831, "step": 440 }, { "epoch": 1.2750716332378222, "grad_norm": 0.22870078682899475, "learning_rate": 8.388019572283156e-06, "loss": 0.5851, "step": 445 }, { "epoch": 1.2893982808022924, "grad_norm": 0.22906720638275146, "learning_rate": 8.305960240069795e-06, "loss": 0.586, "step": 450 }, { "epoch": 1.3037249283667622, "grad_norm": 0.22709061205387115, "learning_rate": 8.223526053885171e-06, "loss": 0.5719, "step": 455 }, { "epoch": 1.3180515759312321, "grad_norm": 0.2257590889930725, "learning_rate": 8.140736873643331e-06, "loss": 0.5718, "step": 460 }, { "epoch": 1.332378223495702, "grad_norm": 0.22583012282848358, "learning_rate": 8.05761264478293e-06, "loss": 0.5754, "step": 465 }, { "epoch": 1.346704871060172, "grad_norm": 0.22651982307434082, "learning_rate": 7.974173393462007e-06, "loss": 0.5651, "step": 470 }, { "epoch": 1.3610315186246418, "grad_norm": 0.24124553799629211, "learning_rate": 7.890439221733317e-06, "loss": 0.5826, "step": 475 }, { "epoch": 1.3753581661891117, "grad_norm": 0.22888998687267303, "learning_rate": 7.806430302701367e-06, "loss": 0.5705, "step": 480 }, { "epoch": 1.3896848137535818, "grad_norm": 0.21681609749794006, "learning_rate": 7.722166875662358e-06, "loss": 0.5814, "step": 485 }, { "epoch": 1.4040114613180517, "grad_norm": 0.2206772118806839, "learning_rate": 7.63766924122816e-06, "loss": 0.5844, "step": 490 }, { "epoch": 1.4183381088825215, "grad_norm": 0.22052349150180817, "learning_rate": 7.552957756435512e-06, "loss": 0.5674, "step": 495 }, { "epoch": 1.4326647564469914, "grad_norm": 0.24319517612457275, "learning_rate": 7.468052829841645e-06, "loss": 0.5813, "step": 500 }, { "epoch": 1.4326647564469914, "eval_loss": 0.5956406593322754, "eval_runtime": 2.8806, "eval_samples_per_second": 14.581, "eval_steps_per_second": 14.581, "step": 500 }, { "epoch": 1.4469914040114613, "grad_norm": 0.2275008261203766, "learning_rate": 7.382974916607492e-06, "loss": 0.5853, "step": 505 }, { "epoch": 1.4613180515759312, "grad_norm": 0.23689113557338715, "learning_rate": 7.297744513569644e-06, "loss": 0.5796, "step": 510 }, { "epoch": 1.475644699140401, "grad_norm": 0.23207077383995056, "learning_rate": 7.2123821543023e-06, "loss": 0.5832, "step": 515 }, { "epoch": 1.4899713467048712, "grad_norm": 0.237880676984787, "learning_rate": 7.126908404170343e-06, "loss": 0.5783, "step": 520 }, { "epoch": 1.5042979942693409, "grad_norm": 0.22841981053352356, "learning_rate": 7.041343855374771e-06, "loss": 0.5623, "step": 525 }, { "epoch": 1.518624641833811, "grad_norm": 0.223537415266037, "learning_rate": 6.955709121991649e-06, "loss": 0.574, "step": 530 }, { "epoch": 1.5329512893982808, "grad_norm": 0.22695119678974152, "learning_rate": 6.870024835005807e-06, "loss": 0.5592, "step": 535 }, { "epoch": 1.5472779369627507, "grad_norm": 0.22849540412425995, "learning_rate": 6.784311637340442e-06, "loss": 0.5613, "step": 540 }, { "epoch": 1.5616045845272206, "grad_norm": 0.2229369729757309, "learning_rate": 6.6985901788838775e-06, "loss": 0.566, "step": 545 }, { "epoch": 1.5759312320916905, "grad_norm": 0.21880346536636353, "learning_rate": 6.612881111514604e-06, "loss": 0.5767, "step": 550 }, { "epoch": 1.5902578796561606, "grad_norm": 0.21992699801921844, "learning_rate": 6.527205084125875e-06, "loss": 0.5711, "step": 555 }, { "epoch": 1.6045845272206303, "grad_norm": 0.23056058585643768, "learning_rate": 6.441582737651007e-06, "loss": 0.5607, "step": 560 }, { "epoch": 1.6189111747851004, "grad_norm": 0.22267192602157593, "learning_rate": 6.356034700090591e-06, "loss": 0.5549, "step": 565 }, { "epoch": 1.63323782234957, "grad_norm": 0.22011469304561615, "learning_rate": 6.270581581542831e-06, "loss": 0.5821, "step": 570 }, { "epoch": 1.6475644699140402, "grad_norm": 0.22847089171409607, "learning_rate": 6.185243969238195e-06, "loss": 0.5821, "step": 575 }, { "epoch": 1.66189111747851, "grad_norm": 0.22488202154636383, "learning_rate": 6.10004242257957e-06, "loss": 0.5585, "step": 580 }, { "epoch": 1.67621776504298, "grad_norm": 0.22973030805587769, "learning_rate": 6.01499746818912e-06, "loss": 0.5715, "step": 585 }, { "epoch": 1.6905444126074498, "grad_norm": 0.22791410982608795, "learning_rate": 5.930129594963047e-06, "loss": 0.5709, "step": 590 }, { "epoch": 1.7048710601719197, "grad_norm": 0.2369392067193985, "learning_rate": 5.845459249135437e-06, "loss": 0.5712, "step": 595 }, { "epoch": 1.7191977077363898, "grad_norm": 0.22787928581237793, "learning_rate": 5.7610068293523925e-06, "loss": 0.5806, "step": 600 }, { "epoch": 1.7191977077363898, "eval_loss": 0.589396595954895, "eval_runtime": 2.8838, "eval_samples_per_second": 14.564, "eval_steps_per_second": 14.564, "step": 600 }, { "epoch": 1.7335243553008595, "grad_norm": 0.2262052297592163, "learning_rate": 5.676792681757612e-06, "loss": 0.5653, "step": 605 }, { "epoch": 1.7478510028653296, "grad_norm": 0.2277483344078064, "learning_rate": 5.5928370950906355e-06, "loss": 0.5634, "step": 610 }, { "epoch": 1.7621776504297995, "grad_norm": 0.2228267937898636, "learning_rate": 5.5091602957989115e-06, "loss": 0.5472, "step": 615 }, { "epoch": 1.7765042979942693, "grad_norm": 0.22168482840061188, "learning_rate": 5.425782443164878e-06, "loss": 0.5565, "step": 620 }, { "epoch": 1.7908309455587392, "grad_norm": 0.22628583014011383, "learning_rate": 5.342723624449211e-06, "loss": 0.558, "step": 625 }, { "epoch": 1.8051575931232091, "grad_norm": 0.22420856356620789, "learning_rate": 5.260003850051442e-06, "loss": 0.5721, "step": 630 }, { "epoch": 1.8194842406876792, "grad_norm": 0.22148585319519043, "learning_rate": 5.177643048689078e-06, "loss": 0.5688, "step": 635 }, { "epoch": 1.8338108882521489, "grad_norm": 0.21723760664463043, "learning_rate": 5.095661062596411e-06, "loss": 0.5719, "step": 640 }, { "epoch": 1.848137535816619, "grad_norm": 0.22150275111198425, "learning_rate": 5.014077642744153e-06, "loss": 0.5486, "step": 645 }, { "epoch": 1.8624641833810889, "grad_norm": 0.21508848667144775, "learning_rate": 4.932912444081069e-06, "loss": 0.555, "step": 650 }, { "epoch": 1.8767908309455588, "grad_norm": 0.2276742309331894, "learning_rate": 4.852185020798736e-06, "loss": 0.5527, "step": 655 }, { "epoch": 1.8911174785100286, "grad_norm": 0.22282367944717407, "learning_rate": 4.771914821620574e-06, "loss": 0.5513, "step": 660 }, { "epoch": 1.9054441260744985, "grad_norm": 0.22503264248371124, "learning_rate": 4.6921211851162955e-06, "loss": 0.5656, "step": 665 }, { "epoch": 1.9197707736389686, "grad_norm": 0.22671757638454437, "learning_rate": 4.612823335042883e-06, "loss": 0.5746, "step": 670 }, { "epoch": 1.9340974212034383, "grad_norm": 0.2195613831281662, "learning_rate": 4.534040375713239e-06, "loss": 0.5481, "step": 675 }, { "epoch": 1.9484240687679084, "grad_norm": 0.2245696634054184, "learning_rate": 4.455791287393597e-06, "loss": 0.558, "step": 680 }, { "epoch": 1.962750716332378, "grad_norm": 0.21683502197265625, "learning_rate": 4.37809492173083e-06, "loss": 0.5523, "step": 685 }, { "epoch": 1.9770773638968482, "grad_norm": 0.2247258424758911, "learning_rate": 4.300969997210741e-06, "loss": 0.5735, "step": 690 }, { "epoch": 1.991404011461318, "grad_norm": 0.22837325930595398, "learning_rate": 4.224435094648434e-06, "loss": 0.5669, "step": 695 }, { "epoch": 1.994269340974212, "eval_loss": 0.5852823853492737, "eval_runtime": 2.8671, "eval_samples_per_second": 14.649, "eval_steps_per_second": 14.649, "step": 696 }, { "epoch": 2.0, "eval_loss": 0.5849316716194153, "eval_runtime": 2.8768, "eval_samples_per_second": 14.6, "eval_steps_per_second": 14.6, "step": 698 }, { "epoch": 2.005730659025788, "grad_norm": 0.21968944370746613, "learning_rate": 4.148508652711858e-06, "loss": 0.5577, "step": 700 }, { "epoch": 2.005730659025788, "eval_loss": 0.5852600932121277, "eval_runtime": 2.8671, "eval_samples_per_second": 14.649, "eval_steps_per_second": 14.649, "step": 700 }, { "epoch": 2.020057306590258, "grad_norm": 0.22937500476837158, "learning_rate": 4.073208963479584e-06, "loss": 0.5605, "step": 705 }, { "epoch": 2.0343839541547277, "grad_norm": 0.23057711124420166, "learning_rate": 3.998554168033906e-06, "loss": 0.5525, "step": 710 }, { "epoch": 2.048710601719198, "grad_norm": 0.2270784080028534, "learning_rate": 3.924562252090337e-06, "loss": 0.5562, "step": 715 }, { "epoch": 2.0630372492836675, "grad_norm": 0.2220994234085083, "learning_rate": 3.8512510416644995e-06, "loss": 0.5447, "step": 720 }, { "epoch": 2.0773638968481376, "grad_norm": 0.23204341530799866, "learning_rate": 3.778638198777512e-06, "loss": 0.549, "step": 725 }, { "epoch": 2.0916905444126073, "grad_norm": 0.22262004017829895, "learning_rate": 3.706741217200896e-06, "loss": 0.5499, "step": 730 }, { "epoch": 2.1060171919770774, "grad_norm": 0.22019214928150177, "learning_rate": 3.6355774182419905e-06, "loss": 0.55, "step": 735 }, { "epoch": 2.1203438395415475, "grad_norm": 0.22234179079532623, "learning_rate": 3.5651639465709426e-06, "loss": 0.5524, "step": 740 }, { "epoch": 2.134670487106017, "grad_norm": 0.22449831664562225, "learning_rate": 3.495517766090224e-06, "loss": 0.5459, "step": 745 }, { "epoch": 2.1489971346704873, "grad_norm": 0.23554570972919464, "learning_rate": 3.426655655847724e-06, "loss": 0.5617, "step": 750 }, { "epoch": 2.163323782234957, "grad_norm": 0.23134228587150574, "learning_rate": 3.3585942059943785e-06, "loss": 0.5523, "step": 755 }, { "epoch": 2.177650429799427, "grad_norm": 0.2272178828716278, "learning_rate": 3.291349813787276e-06, "loss": 0.5506, "step": 760 }, { "epoch": 2.1919770773638967, "grad_norm": 0.22482511401176453, "learning_rate": 3.2249386796392656e-06, "loss": 0.5451, "step": 765 }, { "epoch": 2.206303724928367, "grad_norm": 0.2274748831987381, "learning_rate": 3.159376803215985e-06, "loss": 0.5531, "step": 770 }, { "epoch": 2.2206303724928365, "grad_norm": 0.2227988839149475, "learning_rate": 3.0946799795812396e-06, "loss": 0.5489, "step": 775 }, { "epoch": 2.2349570200573066, "grad_norm": 0.22400720417499542, "learning_rate": 3.030863795391684e-06, "loss": 0.5456, "step": 780 }, { "epoch": 2.2492836676217767, "grad_norm": 0.2268913835287094, "learning_rate": 2.9679436251417016e-06, "loss": 0.5394, "step": 785 }, { "epoch": 2.2636103151862463, "grad_norm": 0.22335706651210785, "learning_rate": 2.9059346274594124e-06, "loss": 0.5377, "step": 790 }, { "epoch": 2.2779369627507164, "grad_norm": 0.22807373106479645, "learning_rate": 2.8448517414546884e-06, "loss": 0.5484, "step": 795 }, { "epoch": 2.292263610315186, "grad_norm": 0.22118327021598816, "learning_rate": 2.7847096831200282e-06, "loss": 0.5419, "step": 800 }, { "epoch": 2.292263610315186, "eval_loss": 0.5827357769012451, "eval_runtime": 2.9066, "eval_samples_per_second": 14.45, "eval_steps_per_second": 14.45, "step": 800 }, { "epoch": 2.306590257879656, "grad_norm": 0.22792136669158936, "learning_rate": 2.7255229417852123e-06, "loss": 0.5496, "step": 805 }, { "epoch": 2.3209169054441263, "grad_norm": 0.22095544636249542, "learning_rate": 2.667305776626566e-06, "loss": 0.554, "step": 810 }, { "epoch": 2.335243553008596, "grad_norm": 0.22290435433387756, "learning_rate": 2.6100722132316454e-06, "loss": 0.5492, "step": 815 }, { "epoch": 2.349570200573066, "grad_norm": 0.23009058833122253, "learning_rate": 2.553836040220221e-06, "loss": 0.5473, "step": 820 }, { "epoch": 2.3638968481375358, "grad_norm": 0.22500832378864288, "learning_rate": 2.49861080592235e-06, "loss": 0.5586, "step": 825 }, { "epoch": 2.378223495702006, "grad_norm": 0.22200486063957214, "learning_rate": 2.4444098151143295e-06, "loss": 0.5358, "step": 830 }, { "epoch": 2.3925501432664755, "grad_norm": 0.22904905676841736, "learning_rate": 2.391246125813331e-06, "loss": 0.5524, "step": 835 }, { "epoch": 2.4068767908309456, "grad_norm": 0.23062781989574432, "learning_rate": 2.339132546131483e-06, "loss": 0.5404, "step": 840 }, { "epoch": 2.4212034383954153, "grad_norm": 0.22324807941913605, "learning_rate": 2.288081631190158e-06, "loss": 0.5377, "step": 845 }, { "epoch": 2.4355300859598854, "grad_norm": 0.22595882415771484, "learning_rate": 2.2381056800952273e-06, "loss": 0.5465, "step": 850 }, { "epoch": 2.4498567335243555, "grad_norm": 0.23639383912086487, "learning_rate": 2.189216732973958e-06, "loss": 0.5518, "step": 855 }, { "epoch": 2.464183381088825, "grad_norm": 0.23035073280334473, "learning_rate": 2.1414265680743383e-06, "loss": 0.5444, "step": 860 }, { "epoch": 2.4785100286532953, "grad_norm": 0.22556614875793457, "learning_rate": 2.0947466989274793e-06, "loss": 0.5519, "step": 865 }, { "epoch": 2.492836676217765, "grad_norm": 0.22614265978336334, "learning_rate": 2.0491883715737904e-06, "loss": 0.5526, "step": 870 }, { "epoch": 2.507163323782235, "grad_norm": 0.22689661383628845, "learning_rate": 2.0047625618536037e-06, "loss": 0.5489, "step": 875 }, { "epoch": 2.5214899713467047, "grad_norm": 0.22763052582740784, "learning_rate": 1.961479972762888e-06, "loss": 0.5397, "step": 880 }, { "epoch": 2.535816618911175, "grad_norm": 0.22761483490467072, "learning_rate": 1.919351031874699e-06, "loss": 0.5452, "step": 885 }, { "epoch": 2.5501432664756445, "grad_norm": 0.22768139839172363, "learning_rate": 1.8783858888269978e-06, "loss": 0.5522, "step": 890 }, { "epoch": 2.5644699140401146, "grad_norm": 0.23226258158683777, "learning_rate": 1.8385944128773981e-06, "loss": 0.521, "step": 895 }, { "epoch": 2.5787965616045847, "grad_norm": 0.2272603064775467, "learning_rate": 1.7999861905254893e-06, "loss": 0.5526, "step": 900 }, { "epoch": 2.5787965616045847, "eval_loss": 0.5810644030570984, "eval_runtime": 2.9211, "eval_samples_per_second": 14.378, "eval_steps_per_second": 14.378, "step": 900 }, { "epoch": 2.5931232091690544, "grad_norm": 0.22808772325515747, "learning_rate": 1.7625705232032741e-06, "loss": 0.5573, "step": 905 }, { "epoch": 2.6074498567335245, "grad_norm": 0.22595611214637756, "learning_rate": 1.726356425034279e-06, "loss": 0.5378, "step": 910 }, { "epoch": 2.621776504297994, "grad_norm": 0.22707025706768036, "learning_rate": 1.6913526206618854e-06, "loss": 0.5243, "step": 915 }, { "epoch": 2.6361031518624642, "grad_norm": 0.2284831553697586, "learning_rate": 1.6575675431474023e-06, "loss": 0.5411, "step": 920 }, { "epoch": 2.6504297994269344, "grad_norm": 0.22921448945999146, "learning_rate": 1.6250093319383871e-06, "loss": 0.5411, "step": 925 }, { "epoch": 2.664756446991404, "grad_norm": 0.2303130179643631, "learning_rate": 1.5936858309077084e-06, "loss": 0.546, "step": 930 }, { "epoch": 2.6790830945558737, "grad_norm": 0.2226521223783493, "learning_rate": 1.5636045864637997e-06, "loss": 0.5378, "step": 935 }, { "epoch": 2.693409742120344, "grad_norm": 0.22775433957576752, "learning_rate": 1.5347728457326013e-06, "loss": 0.5341, "step": 940 }, { "epoch": 2.707736389684814, "grad_norm": 0.23151849210262299, "learning_rate": 1.507197554811592e-06, "loss": 0.5411, "step": 945 }, { "epoch": 2.7220630372492836, "grad_norm": 0.22131632268428802, "learning_rate": 1.480885357096343e-06, "loss": 0.5322, "step": 950 }, { "epoch": 2.7363896848137537, "grad_norm": 0.22514161467552185, "learning_rate": 1.4558425916800066e-06, "loss": 0.5287, "step": 955 }, { "epoch": 2.7507163323782233, "grad_norm": 0.22741974890232086, "learning_rate": 1.4320752918261058e-06, "loss": 0.5467, "step": 960 }, { "epoch": 2.7650429799426934, "grad_norm": 0.22180503606796265, "learning_rate": 1.4095891835150126e-06, "loss": 0.5398, "step": 965 }, { "epoch": 2.7793696275071635, "grad_norm": 0.2328280508518219, "learning_rate": 1.3883896840644583e-06, "loss": 0.5347, "step": 970 }, { "epoch": 2.793696275071633, "grad_norm": 0.22877122461795807, "learning_rate": 1.3684819008243952e-06, "loss": 0.5453, "step": 975 }, { "epoch": 2.8080229226361033, "grad_norm": 0.22728435695171356, "learning_rate": 1.3498706299465446e-06, "loss": 0.5356, "step": 980 }, { "epoch": 2.822349570200573, "grad_norm": 0.22559645771980286, "learning_rate": 1.3325603552289166e-06, "loss": 0.5432, "step": 985 }, { "epoch": 2.836676217765043, "grad_norm": 0.2304041087627411, "learning_rate": 1.3165552470355781e-06, "loss": 0.5441, "step": 990 }, { "epoch": 2.8510028653295127, "grad_norm": 0.22864393889904022, "learning_rate": 1.301859161291938e-06, "loss": 0.5417, "step": 995 }, { "epoch": 2.865329512893983, "grad_norm": 0.22412388026714325, "learning_rate": 1.2884756385557813e-06, "loss": 0.5374, "step": 1000 }, { "epoch": 2.865329512893983, "eval_loss": 0.5795248746871948, "eval_runtime": 2.889, "eval_samples_per_second": 14.538, "eval_steps_per_second": 14.538, "step": 1000 }, { "epoch": 2.8796561604584525, "grad_norm": 0.22551295161247253, "learning_rate": 1.2764079031642852e-06, "loss": 0.5425, "step": 1005 }, { "epoch": 2.8939828080229226, "grad_norm": 0.22314225137233734, "learning_rate": 1.265658862457217e-06, "loss": 0.5405, "step": 1010 }, { "epoch": 2.9083094555873927, "grad_norm": 0.22527816891670227, "learning_rate": 1.2562311060765001e-06, "loss": 0.5436, "step": 1015 }, { "epoch": 2.9226361031518624, "grad_norm": 0.22648297250270844, "learning_rate": 1.248126905342324e-06, "loss": 0.5497, "step": 1020 }, { "epoch": 2.9369627507163325, "grad_norm": 0.2278534471988678, "learning_rate": 1.2413482127059402e-06, "loss": 0.5391, "step": 1025 }, { "epoch": 2.951289398280802, "grad_norm": 0.2279985249042511, "learning_rate": 1.2358966612792807e-06, "loss": 0.5398, "step": 1030 }, { "epoch": 2.9656160458452723, "grad_norm": 0.23118627071380615, "learning_rate": 1.2317735644415136e-06, "loss": 0.5517, "step": 1035 }, { "epoch": 2.9799426934097424, "grad_norm": 0.22241578996181488, "learning_rate": 1.228979915522621e-06, "loss": 0.5407, "step": 1040 }, { "epoch": 2.9914040114613183, "eval_loss": 0.5790691375732422, "eval_runtime": 2.8699, "eval_samples_per_second": 14.635, "eval_steps_per_second": 14.635, "step": 1044 } ], "logging_steps": 5, "max_steps": 1047, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1950747837551084e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }