| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9982003599280143, | |
| "eval_steps": 500, | |
| "global_step": 832, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01199760047990402, | |
| "grad_norm": 8.655085086138964, | |
| "learning_rate": 2.9761904761904763e-06, | |
| "loss": 1.9665, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02399520095980804, | |
| "grad_norm": 4.415158437946272, | |
| "learning_rate": 5.9523809523809525e-06, | |
| "loss": 1.6691, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.035992801439712056, | |
| "grad_norm": 3.057605663016171, | |
| "learning_rate": 8.92857142857143e-06, | |
| "loss": 1.441, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.04799040191961608, | |
| "grad_norm": 3.042012061427229, | |
| "learning_rate": 1.1904761904761905e-05, | |
| "loss": 1.3171, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.059988002399520096, | |
| "grad_norm": 2.1436915691999014, | |
| "learning_rate": 1.4880952380952381e-05, | |
| "loss": 1.2588, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.07198560287942411, | |
| "grad_norm": 2.03272405774148, | |
| "learning_rate": 1.785714285714286e-05, | |
| "loss": 1.2021, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.08398320335932813, | |
| "grad_norm": 1.9317364452662562, | |
| "learning_rate": 2.0833333333333336e-05, | |
| "loss": 1.1645, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.09598080383923216, | |
| "grad_norm": 1.9450167856079439, | |
| "learning_rate": 2.380952380952381e-05, | |
| "loss": 1.1652, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.10797840431913618, | |
| "grad_norm": 2.2175764125145943, | |
| "learning_rate": 2.6785714285714288e-05, | |
| "loss": 1.1492, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.11997600479904019, | |
| "grad_norm": 1.9900888357541513, | |
| "learning_rate": 2.9761904761904762e-05, | |
| "loss": 1.146, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.13197360527894422, | |
| "grad_norm": 2.1707343241074715, | |
| "learning_rate": 3.273809523809524e-05, | |
| "loss": 1.1312, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.14397120575884823, | |
| "grad_norm": 2.1008421926969874, | |
| "learning_rate": 3.571428571428572e-05, | |
| "loss": 1.1158, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.15596880623875226, | |
| "grad_norm": 1.9739168298796461, | |
| "learning_rate": 3.8690476190476195e-05, | |
| "loss": 1.112, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.16796640671865626, | |
| "grad_norm": 1.8793192108758627, | |
| "learning_rate": 4.166666666666667e-05, | |
| "loss": 1.1216, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1799640071985603, | |
| "grad_norm": 2.1680515905272393, | |
| "learning_rate": 4.464285714285715e-05, | |
| "loss": 1.1154, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.19196160767846432, | |
| "grad_norm": 2.088302206757574, | |
| "learning_rate": 4.761904761904762e-05, | |
| "loss": 1.115, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.20395920815836832, | |
| "grad_norm": 2.0040693696661096, | |
| "learning_rate": 4.9999779501355384e-05, | |
| "loss": 1.1166, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.21595680863827235, | |
| "grad_norm": 1.8604233679285211, | |
| "learning_rate": 4.9992062457191e-05, | |
| "loss": 1.126, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.22795440911817635, | |
| "grad_norm": 1.8293195388484245, | |
| "learning_rate": 4.997332437005931e-05, | |
| "loss": 1.1183, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.23995200959808038, | |
| "grad_norm": 1.9227439225768022, | |
| "learning_rate": 4.99435735031144e-05, | |
| "loss": 1.1341, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2519496100779844, | |
| "grad_norm": 1.7353567528325675, | |
| "learning_rate": 4.990282297594509e-05, | |
| "loss": 1.1111, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.26394721055788845, | |
| "grad_norm": 1.6246909472281814, | |
| "learning_rate": 4.98510907587894e-05, | |
| "loss": 1.1085, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.27594481103779245, | |
| "grad_norm": 1.5550505481063543, | |
| "learning_rate": 4.9788399664609985e-05, | |
| "loss": 1.1081, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.28794241151769645, | |
| "grad_norm": 1.6242317436366518, | |
| "learning_rate": 4.97147773390341e-05, | |
| "loss": 1.1187, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.29994001199760045, | |
| "grad_norm": 1.6719313503244617, | |
| "learning_rate": 4.963025624816232e-05, | |
| "loss": 1.0934, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.3119376124775045, | |
| "grad_norm": 1.7557488929934109, | |
| "learning_rate": 4.953487366425163e-05, | |
| "loss": 1.0939, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3239352129574085, | |
| "grad_norm": 1.846886767486188, | |
| "learning_rate": 4.942867164927899e-05, | |
| "loss": 1.1016, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.3359328134373125, | |
| "grad_norm": 1.564323242123012, | |
| "learning_rate": 4.931169703639282e-05, | |
| "loss": 1.0755, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3479304139172166, | |
| "grad_norm": 1.5659567424784107, | |
| "learning_rate": 4.918400140926042e-05, | |
| "loss": 1.0772, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.3599280143971206, | |
| "grad_norm": 1.6772397525925744, | |
| "learning_rate": 4.9045641079320484e-05, | |
| "loss": 1.0975, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3719256148770246, | |
| "grad_norm": 1.4057626611756922, | |
| "learning_rate": 4.889667706095084e-05, | |
| "loss": 1.0894, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.38392321535692864, | |
| "grad_norm": 1.3508523989809198, | |
| "learning_rate": 4.873717504456219e-05, | |
| "loss": 1.0899, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.39592081583683264, | |
| "grad_norm": 1.4599727770410549, | |
| "learning_rate": 4.8567205367629835e-05, | |
| "loss": 1.0954, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.40791841631673664, | |
| "grad_norm": 1.557943385459695, | |
| "learning_rate": 4.8386842983676164e-05, | |
| "loss": 1.1022, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.41991601679664065, | |
| "grad_norm": 1.4392521405523333, | |
| "learning_rate": 4.8196167429217474e-05, | |
| "loss": 1.0858, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.4319136172765447, | |
| "grad_norm": 1.4039232690942924, | |
| "learning_rate": 4.799526278868987e-05, | |
| "loss": 1.0776, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4439112177564487, | |
| "grad_norm": 1.3958637024927205, | |
| "learning_rate": 4.778421765736951e-05, | |
| "loss": 1.0656, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.4559088182363527, | |
| "grad_norm": 1.3165094843488907, | |
| "learning_rate": 4.7563125102303766e-05, | |
| "loss": 1.0701, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.46790641871625677, | |
| "grad_norm": 1.4462822477245483, | |
| "learning_rate": 4.7332082621270326e-05, | |
| "loss": 1.078, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.47990401919616077, | |
| "grad_norm": 1.4177688615580946, | |
| "learning_rate": 4.709119209978242e-05, | |
| "loss": 1.0748, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.49190161967606477, | |
| "grad_norm": 1.2487244087937184, | |
| "learning_rate": 4.684055976615924e-05, | |
| "loss": 1.0728, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.5038992201559688, | |
| "grad_norm": 1.2843589104522166, | |
| "learning_rate": 4.6580296144681157e-05, | |
| "loss": 1.0741, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5158968206358728, | |
| "grad_norm": 1.3423526627782842, | |
| "learning_rate": 4.631051600685051e-05, | |
| "loss": 1.0545, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.5278944211157769, | |
| "grad_norm": 1.3987872492900897, | |
| "learning_rate": 4.6031338320779534e-05, | |
| "loss": 1.0466, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5398920215956808, | |
| "grad_norm": 1.1228691632454497, | |
| "learning_rate": 4.57428861987275e-05, | |
| "loss": 1.0757, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.5518896220755849, | |
| "grad_norm": 1.4386106052561902, | |
| "learning_rate": 4.544528684281056e-05, | |
| "loss": 1.0659, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.563887222555489, | |
| "grad_norm": 1.54532036456569, | |
| "learning_rate": 4.513867148890788e-05, | |
| "loss": 1.0731, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.5758848230353929, | |
| "grad_norm": 1.1946309045723265, | |
| "learning_rate": 4.482317534878901e-05, | |
| "loss": 1.0724, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.587882423515297, | |
| "grad_norm": 1.2995921290959758, | |
| "learning_rate": 4.449893755048799e-05, | |
| "loss": 1.0538, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.5998800239952009, | |
| "grad_norm": 1.1629232540517822, | |
| "learning_rate": 4.416610107695042e-05, | |
| "loss": 1.0653, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.611877624475105, | |
| "grad_norm": 1.141699716294497, | |
| "learning_rate": 4.3824812702980595e-05, | |
| "loss": 1.0593, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.623875224955009, | |
| "grad_norm": 1.1625584379097018, | |
| "learning_rate": 4.347522293051648e-05, | |
| "loss": 1.0728, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.635872825434913, | |
| "grad_norm": 1.2139933398328908, | |
| "learning_rate": 4.3117485922261136e-05, | |
| "loss": 1.0683, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.647870425914817, | |
| "grad_norm": 1.1676365588651592, | |
| "learning_rate": 4.275175943369975e-05, | |
| "loss": 1.0493, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6598680263947211, | |
| "grad_norm": 1.2056917804998961, | |
| "learning_rate": 4.2378204743532377e-05, | |
| "loss": 1.0779, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.671865626874625, | |
| "grad_norm": 1.356550764377353, | |
| "learning_rate": 4.199698658255298e-05, | |
| "loss": 1.0507, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6838632273545291, | |
| "grad_norm": 1.1721573272728156, | |
| "learning_rate": 4.160827306100611e-05, | |
| "loss": 1.0541, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.6958608278344331, | |
| "grad_norm": 1.1802890937435413, | |
| "learning_rate": 4.121223559445343e-05, | |
| "loss": 1.0507, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.7078584283143371, | |
| "grad_norm": 1.3591231013602916, | |
| "learning_rate": 4.0809048828182534e-05, | |
| "loss": 1.0529, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.7198560287942412, | |
| "grad_norm": 1.230075489545995, | |
| "learning_rate": 4.039889056019159e-05, | |
| "loss": 1.0611, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7318536292741452, | |
| "grad_norm": 1.3962403495748514, | |
| "learning_rate": 3.9981941662783674e-05, | |
| "loss": 1.0477, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.7438512297540492, | |
| "grad_norm": 1.2104058914840572, | |
| "learning_rate": 3.955838600280535e-05, | |
| "loss": 1.0425, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7558488302339532, | |
| "grad_norm": 1.243572445349006, | |
| "learning_rate": 3.91284103605648e-05, | |
| "loss": 1.053, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.7678464307138573, | |
| "grad_norm": 1.2315818992929564, | |
| "learning_rate": 3.869220434746509e-05, | |
| "loss": 1.0469, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7798440311937612, | |
| "grad_norm": 1.282062911918751, | |
| "learning_rate": 3.8249960322389e-05, | |
| "loss": 1.0507, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.7918416316736653, | |
| "grad_norm": 1.2926792700193703, | |
| "learning_rate": 3.780187330687231e-05, | |
| "loss": 1.0286, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8038392321535693, | |
| "grad_norm": 1.1789915377278588, | |
| "learning_rate": 3.734814089910283e-05, | |
| "loss": 1.0275, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.8158368326334733, | |
| "grad_norm": 1.2214772745932698, | |
| "learning_rate": 3.6888963186783224e-05, | |
| "loss": 1.0152, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8278344331133773, | |
| "grad_norm": 1.0572719303807887, | |
| "learning_rate": 3.6424542658895944e-05, | |
| "loss": 1.0329, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.8398320335932813, | |
| "grad_norm": 1.057795278450336, | |
| "learning_rate": 3.5955084116409385e-05, | |
| "loss": 1.0358, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8518296340731853, | |
| "grad_norm": 1.1933118891898369, | |
| "learning_rate": 3.5480794581964304e-05, | |
| "loss": 1.0257, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.8638272345530894, | |
| "grad_norm": 1.1930511530685082, | |
| "learning_rate": 3.5001883208580665e-05, | |
| "loss": 1.0462, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8758248350329934, | |
| "grad_norm": 1.206184339739026, | |
| "learning_rate": 3.451856118742498e-05, | |
| "loss": 1.0457, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.8878224355128974, | |
| "grad_norm": 1.2488722797945375, | |
| "learning_rate": 3.403104165467883e-05, | |
| "loss": 1.0273, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.8998200359928015, | |
| "grad_norm": 1.1879602242889826, | |
| "learning_rate": 3.353953959754973e-05, | |
| "loss": 1.0063, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.9118176364727054, | |
| "grad_norm": 1.1859307890368762, | |
| "learning_rate": 3.30442717594657e-05, | |
| "loss": 1.0155, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9238152369526095, | |
| "grad_norm": 1.1938213692010637, | |
| "learning_rate": 3.2545456544495365e-05, | |
| "loss": 1.025, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.9358128374325135, | |
| "grad_norm": 1.1843748330164985, | |
| "learning_rate": 3.2043313921035743e-05, | |
| "loss": 1.0328, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9478104379124175, | |
| "grad_norm": 1.117527742694121, | |
| "learning_rate": 3.1538065324810206e-05, | |
| "loss": 1.0248, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.9598080383923215, | |
| "grad_norm": 1.1115869963103877, | |
| "learning_rate": 3.1029933561219375e-05, | |
| "loss": 1.0027, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.9718056388722256, | |
| "grad_norm": 1.0498821054649277, | |
| "learning_rate": 3.0519142707088026e-05, | |
| "loss": 1.0116, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.9838032393521295, | |
| "grad_norm": 1.118470935254374, | |
| "learning_rate": 3.000591801185124e-05, | |
| "loss": 1.0147, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9958008398320336, | |
| "grad_norm": 1.1283471897635804, | |
| "learning_rate": 2.9490485798223623e-05, | |
| "loss": 1.0312, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.0095980803839233, | |
| "grad_norm": 1.0933704505218012, | |
| "learning_rate": 2.8973073362394998e-05, | |
| "loss": 1.0189, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.0215956808638271, | |
| "grad_norm": 1.2757088787270183, | |
| "learning_rate": 2.8453908873797058e-05, | |
| "loss": 0.8115, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.0335932813437312, | |
| "grad_norm": 1.2014338555644504, | |
| "learning_rate": 2.7933221274484723e-05, | |
| "loss": 0.8041, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.0455908818236352, | |
| "grad_norm": 1.136417187679828, | |
| "learning_rate": 2.7411240178176927e-05, | |
| "loss": 0.8078, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.0575884823035393, | |
| "grad_norm": 1.1276372766792626, | |
| "learning_rate": 2.6888195769001146e-05, | |
| "loss": 0.8179, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.0695860827834434, | |
| "grad_norm": 1.0072604306741504, | |
| "learning_rate": 2.63643186999864e-05, | |
| "loss": 0.8072, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.0815836832633474, | |
| "grad_norm": 1.089321595722244, | |
| "learning_rate": 2.5839839991349506e-05, | |
| "loss": 0.8099, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.0935812837432513, | |
| "grad_norm": 1.0056285329535712, | |
| "learning_rate": 2.5314990928619337e-05, | |
| "loss": 0.7985, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.1055788842231553, | |
| "grad_norm": 0.9752484724026733, | |
| "learning_rate": 2.479000296064417e-05, | |
| "loss": 0.8224, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.1175764847030594, | |
| "grad_norm": 1.0643013774775827, | |
| "learning_rate": 2.4265107597526946e-05, | |
| "loss": 0.8156, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.1295740851829634, | |
| "grad_norm": 1.1300899339981707, | |
| "learning_rate": 2.374053630853358e-05, | |
| "loss": 0.8113, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.1415716856628675, | |
| "grad_norm": 1.107379832173596, | |
| "learning_rate": 2.3216520420019195e-05, | |
| "loss": 0.8078, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.1535692861427713, | |
| "grad_norm": 1.1381056597190673, | |
| "learning_rate": 2.2693291013417453e-05, | |
| "loss": 0.7979, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.1655668866226754, | |
| "grad_norm": 1.0344901370146822, | |
| "learning_rate": 2.2171078823337863e-05, | |
| "loss": 0.8114, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.1775644871025794, | |
| "grad_norm": 1.1433623054658555, | |
| "learning_rate": 2.165011413581605e-05, | |
| "loss": 0.8068, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.1895620875824835, | |
| "grad_norm": 1.0574705191297809, | |
| "learning_rate": 2.1130626686761762e-05, | |
| "loss": 0.7931, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.2015596880623876, | |
| "grad_norm": 1.0955960697193892, | |
| "learning_rate": 2.0612845560649603e-05, | |
| "loss": 0.7961, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.2135572885422916, | |
| "grad_norm": 1.1142604155745692, | |
| "learning_rate": 2.0096999089496913e-05, | |
| "loss": 0.7949, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.2255548890221957, | |
| "grad_norm": 1.081207113394286, | |
| "learning_rate": 1.958331475217357e-05, | |
| "loss": 0.8081, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.2375524895020995, | |
| "grad_norm": 1.087424227244276, | |
| "learning_rate": 1.9072019074087876e-05, | |
| "loss": 0.7944, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.2495500899820036, | |
| "grad_norm": 1.1746075874674693, | |
| "learning_rate": 1.856333752729311e-05, | |
| "loss": 0.7982, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.2615476904619076, | |
| "grad_norm": 1.1261850868143939, | |
| "learning_rate": 1.8057494431058365e-05, | |
| "loss": 0.8013, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.2735452909418117, | |
| "grad_norm": 1.0858958449694596, | |
| "learning_rate": 1.7554712852947913e-05, | |
| "loss": 0.8031, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.2855428914217157, | |
| "grad_norm": 1.1039599626344443, | |
| "learning_rate": 1.705521451045246e-05, | |
| "loss": 0.7924, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.2975404919016196, | |
| "grad_norm": 1.0379577155650672, | |
| "learning_rate": 1.6559219673215784e-05, | |
| "loss": 0.784, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.3095380923815236, | |
| "grad_norm": 1.0180533259182083, | |
| "learning_rate": 1.6066947065899847e-05, | |
| "loss": 0.795, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.3215356928614277, | |
| "grad_norm": 1.1174277968751472, | |
| "learning_rate": 1.5578613771731213e-05, | |
| "loss": 0.7903, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.3335332933413317, | |
| "grad_norm": 1.113625762435688, | |
| "learning_rate": 1.509443513677134e-05, | |
| "loss": 0.7841, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.3455308938212358, | |
| "grad_norm": 1.046198809224864, | |
| "learning_rate": 1.4614624674952842e-05, | |
| "loss": 0.7932, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.3575284943011399, | |
| "grad_norm": 1.0511943871992961, | |
| "learning_rate": 1.4139393973923798e-05, | |
| "loss": 0.7815, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.369526094781044, | |
| "grad_norm": 1.012108314446409, | |
| "learning_rate": 1.3668952601741441e-05, | |
| "loss": 0.7899, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.3815236952609478, | |
| "grad_norm": 1.0590168334777237, | |
| "learning_rate": 1.320350801445649e-05, | |
| "loss": 0.7745, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.3935212957408518, | |
| "grad_norm": 1.0208126844360186, | |
| "learning_rate": 1.2743265464628786e-05, | |
| "loss": 0.7761, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.4055188962207559, | |
| "grad_norm": 1.1048330533675996, | |
| "learning_rate": 1.2288427910814699e-05, | |
| "loss": 0.7549, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.41751649670066, | |
| "grad_norm": 1.087557213168186, | |
| "learning_rate": 1.1839195928066102e-05, | |
| "loss": 0.7887, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.4295140971805638, | |
| "grad_norm": 1.066790026980483, | |
| "learning_rate": 1.1395767619480451e-05, | |
| "loss": 0.7891, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.4415116976604678, | |
| "grad_norm": 1.0511422556482182, | |
| "learning_rate": 1.0958338528840893e-05, | |
| "loss": 0.7828, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.4535092981403719, | |
| "grad_norm": 1.0811572565881435, | |
| "learning_rate": 1.052710155438506e-05, | |
| "loss": 0.7985, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.465506898620276, | |
| "grad_norm": 0.9838694729298545, | |
| "learning_rate": 1.0102246863740496e-05, | |
| "loss": 0.7712, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.47750449910018, | |
| "grad_norm": 1.039397195061213, | |
| "learning_rate": 9.683961810064176e-06, | |
| "loss": 0.7813, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.489502099580084, | |
| "grad_norm": 1.0652079719056124, | |
| "learning_rate": 9.272430849423174e-06, | |
| "loss": 0.774, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.5014997000599881, | |
| "grad_norm": 1.107963886239366, | |
| "learning_rate": 8.867835459452925e-06, | |
| "loss": 0.7756, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.5134973005398922, | |
| "grad_norm": 1.0470978361528913, | |
| "learning_rate": 8.470354059328919e-06, | |
| "loss": 0.7683, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.525494901019796, | |
| "grad_norm": 1.2007634054581608, | |
| "learning_rate": 8.080161931087094e-06, | |
| "loss": 0.7827, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.5374925014997, | |
| "grad_norm": 1.1547346563119187, | |
| "learning_rate": 7.697431142327632e-06, | |
| "loss": 0.7787, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.5494901019796041, | |
| "grad_norm": 1.17217513079497, | |
| "learning_rate": 7.3223304703363135e-06, | |
| "loss": 0.7858, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.561487702459508, | |
| "grad_norm": 1.0566429470152394, | |
| "learning_rate": 6.955025327656839e-06, | |
| "loss": 0.766, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.573485302939412, | |
| "grad_norm": 1.022660107263399, | |
| "learning_rate": 6.5956776891468925e-06, | |
| "loss": 0.7694, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.585482903419316, | |
| "grad_norm": 1.0475499247182583, | |
| "learning_rate": 6.244446020550182e-06, | |
| "loss": 0.7627, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.5974805038992201, | |
| "grad_norm": 1.0898061259359026, | |
| "learning_rate": 5.901485208615948e-06, | |
| "loss": 0.7672, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.6094781043791242, | |
| "grad_norm": 1.0594055484172262, | |
| "learning_rate": 5.5669464927967655e-06, | |
| "loss": 0.7783, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.6214757048590283, | |
| "grad_norm": 1.000104590500381, | |
| "learning_rate": 5.240977398554673e-06, | |
| "loss": 0.7578, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.6334733053389323, | |
| "grad_norm": 1.010755641973307, | |
| "learning_rate": 4.9237216723051485e-06, | |
| "loss": 0.7681, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.6454709058188364, | |
| "grad_norm": 1.0438538534480322, | |
| "learning_rate": 4.615319218027561e-06, | |
| "loss": 0.773, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.6574685062987402, | |
| "grad_norm": 1.00200099090404, | |
| "learning_rate": 4.315906035570094e-06, | |
| "loss": 0.7541, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.6694661067786443, | |
| "grad_norm": 0.9967511007903198, | |
| "learning_rate": 4.0256141606762836e-06, | |
| "loss": 0.7744, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.6814637072585483, | |
| "grad_norm": 0.954046235240287, | |
| "learning_rate": 3.7445716067596503e-06, | |
| "loss": 0.7472, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.6934613077384522, | |
| "grad_norm": 1.013820429314171, | |
| "learning_rate": 3.4729023084521417e-06, | |
| "loss": 0.7499, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.7054589082183562, | |
| "grad_norm": 1.0142864719521136, | |
| "learning_rate": 3.2107260669512336e-06, | |
| "loss": 0.7539, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.7174565086982603, | |
| "grad_norm": 1.036622595232617, | |
| "learning_rate": 2.9581584971897697e-06, | |
| "loss": 0.7731, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.7294541091781643, | |
| "grad_norm": 1.02617492688862, | |
| "learning_rate": 2.7153109768518925e-06, | |
| "loss": 0.7495, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.7414517096580684, | |
| "grad_norm": 0.9783666612285059, | |
| "learning_rate": 2.4822905972575167e-06, | |
| "loss": 0.7649, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.7534493101379725, | |
| "grad_norm": 0.9915953729109124, | |
| "learning_rate": 2.2592001161370392e-06, | |
| "loss": 0.7662, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.7654469106178765, | |
| "grad_norm": 0.9904152103445725, | |
| "learning_rate": 2.0461379123170284e-06, | |
| "loss": 0.7681, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.7774445110977806, | |
| "grad_norm": 1.0176103785435524, | |
| "learning_rate": 1.8431979423369604e-06, | |
| "loss": 0.7606, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.7894421115776846, | |
| "grad_norm": 1.0288460382306641, | |
| "learning_rate": 1.650469699016116e-06, | |
| "loss": 0.7704, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.8014397120575885, | |
| "grad_norm": 0.9904963634021251, | |
| "learning_rate": 1.4680381719888807e-06, | |
| "loss": 0.7502, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.8134373125374925, | |
| "grad_norm": 1.0256818925204132, | |
| "learning_rate": 1.2959838102258536e-06, | |
| "loss": 0.7713, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.8254349130173964, | |
| "grad_norm": 0.9849452100623455, | |
| "learning_rate": 1.134382486557342e-06, | |
| "loss": 0.758, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.8374325134973004, | |
| "grad_norm": 0.9397595503622964, | |
| "learning_rate": 9.833054642148066e-07, | |
| "loss": 0.7521, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.8494301139772045, | |
| "grad_norm": 1.0890895891545742, | |
| "learning_rate": 8.428193654051036e-07, | |
| "loss": 0.7509, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.8614277144571085, | |
| "grad_norm": 0.9884112088685092, | |
| "learning_rate": 7.129861419312822e-07, | |
| "loss": 0.7655, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.8734253149370126, | |
| "grad_norm": 0.9719952905492264, | |
| "learning_rate": 5.938630478729917e-07, | |
| "loss": 0.7498, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.8854229154169166, | |
| "grad_norm": 1.0012032904437846, | |
| "learning_rate": 4.855026143384733e-07, | |
| "loss": 0.7536, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 1.8974205158968207, | |
| "grad_norm": 0.9926612758167743, | |
| "learning_rate": 3.8795262629928996e-07, | |
| "loss": 0.7668, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.9094181163767248, | |
| "grad_norm": 0.9848391831057468, | |
| "learning_rate": 3.0125610151804374e-07, | |
| "loss": 0.7638, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 1.9214157168566288, | |
| "grad_norm": 1.0112841443251817, | |
| "learning_rate": 2.2545127157831413e-07, | |
| "loss": 0.755, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.9334133173365327, | |
| "grad_norm": 0.9951541506451825, | |
| "learning_rate": 1.605715650252415e-07, | |
| "loss": 0.7509, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 1.9454109178164367, | |
| "grad_norm": 1.0148344278049142, | |
| "learning_rate": 1.0664559262413831e-07, | |
| "loss": 0.7576, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.9574085182963408, | |
| "grad_norm": 1.0628673000773126, | |
| "learning_rate": 6.369713474366212e-08, | |
| "loss": 0.7668, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 1.9694061187762446, | |
| "grad_norm": 0.9218706861308771, | |
| "learning_rate": 3.1745130869123566e-08, | |
| "loss": 0.7593, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.9814037192561487, | |
| "grad_norm": 1.0123478252373648, | |
| "learning_rate": 1.08036712505033e-08, | |
| "loss": 0.7563, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 1.9934013197360527, | |
| "grad_norm": 1.0032109470714232, | |
| "learning_rate": 8.819906889168117e-10, | |
| "loss": 0.7589, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.9982003599280143, | |
| "step": 832, | |
| "total_flos": 619672101191680.0, | |
| "train_loss": 0.9412782498850272, | |
| "train_runtime": 43950.8497, | |
| "train_samples_per_second": 1.214, | |
| "train_steps_per_second": 0.019 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 832, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 619672101191680.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |