{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9953379953379953, "eval_steps": 500, "global_step": 856, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011655011655011656, "grad_norm": 5.380261825460372, "learning_rate": 9.237540571428572e-06, "loss": 4.2109, "step": 5 }, { "epoch": 0.023310023310023312, "grad_norm": 1.4502882732797857, "learning_rate": 2.0784466285714287e-05, "loss": 3.9981, "step": 10 }, { "epoch": 0.03496503496503497, "grad_norm": 0.9652384253156429, "learning_rate": 3.2331392000000005e-05, "loss": 3.2711, "step": 15 }, { "epoch": 0.046620046620046623, "grad_norm": 1.1395250787625393, "learning_rate": 4.3878317714285716e-05, "loss": 2.6312, "step": 20 }, { "epoch": 0.05827505827505827, "grad_norm": 3.4845160401616764, "learning_rate": 5.542524342857144e-05, "loss": 2.1284, "step": 25 }, { "epoch": 0.06993006993006994, "grad_norm": 1.9051485411146671, "learning_rate": 6.697216914285716e-05, "loss": 1.8294, "step": 30 }, { "epoch": 0.08158508158508158, "grad_norm": 0.33187533324621077, "learning_rate": 7.851909485714286e-05, "loss": 1.7266, "step": 35 }, { "epoch": 0.09324009324009325, "grad_norm": 0.44248932276377023, "learning_rate": 8.082695323179129e-05, "loss": 1.7283, "step": 40 }, { "epoch": 0.1048951048951049, "grad_norm": 1.2160485145535014, "learning_rate": 8.082075099954982e-05, "loss": 1.7117, "step": 45 }, { "epoch": 0.11655011655011654, "grad_norm": 65.29684577693374, "learning_rate": 8.080977885578941e-05, "loss": 1.8952, "step": 50 }, { "epoch": 0.1282051282051282, "grad_norm": 1.15546734315769, "learning_rate": 8.079403852760764e-05, "loss": 3.1909, "step": 55 }, { "epoch": 0.13986013986013987, "grad_norm": 2.634452844296507, "learning_rate": 8.077353249265015e-05, "loss": 1.6638, "step": 60 }, { "epoch": 0.15151515151515152, "grad_norm": 0.32202059804233973, "learning_rate": 8.07482639787204e-05, "loss": 1.6088, "step": 65 }, { "epoch": 0.16317016317016317, "grad_norm": 0.3642353360782208, "learning_rate": 8.071823696327185e-05, "loss": 1.5268, "step": 70 }, { "epoch": 0.17482517482517482, "grad_norm": 0.34692958418989034, "learning_rate": 8.068345617278169e-05, "loss": 1.5875, "step": 75 }, { "epoch": 0.1864801864801865, "grad_norm": 38.12513649812565, "learning_rate": 8.06439270820069e-05, "loss": 1.7234, "step": 80 }, { "epoch": 0.19813519813519814, "grad_norm": 0.3217004898392202, "learning_rate": 8.059965591312254e-05, "loss": 1.4259, "step": 85 }, { "epoch": 0.2097902097902098, "grad_norm": 0.24719946255426536, "learning_rate": 8.055064963474229e-05, "loss": 1.4095, "step": 90 }, { "epoch": 0.22144522144522144, "grad_norm": 0.5042383844039631, "learning_rate": 8.049691596082148e-05, "loss": 1.6829, "step": 95 }, { "epoch": 0.2331002331002331, "grad_norm": 0.2641927692422699, "learning_rate": 8.043846334944299e-05, "loss": 1.6542, "step": 100 }, { "epoch": 0.24475524475524477, "grad_norm": 0.2525842272998614, "learning_rate": 8.03753010014858e-05, "loss": 1.5055, "step": 105 }, { "epoch": 0.2564102564102564, "grad_norm": 0.2848401355775407, "learning_rate": 8.030743885917666e-05, "loss": 1.5441, "step": 110 }, { "epoch": 0.2680652680652681, "grad_norm": 0.2578389420241452, "learning_rate": 8.023488760452522e-05, "loss": 1.5164, "step": 115 }, { "epoch": 0.27972027972027974, "grad_norm": 0.384485780217046, "learning_rate": 8.01576586576425e-05, "loss": 1.4309, "step": 120 }, { "epoch": 0.2913752913752914, "grad_norm": 0.3005538892717842, "learning_rate": 8.007576417494336e-05, "loss": 1.4464, "step": 125 }, { "epoch": 0.30303030303030304, "grad_norm": 0.3038129052309793, "learning_rate": 7.998921704723294e-05, "loss": 1.308, "step": 130 }, { "epoch": 0.3146853146853147, "grad_norm": 0.27857195778115285, "learning_rate": 7.989803089767754e-05, "loss": 1.3887, "step": 135 }, { "epoch": 0.32634032634032634, "grad_norm": 0.2852295677864667, "learning_rate": 7.980222007966029e-05, "loss": 1.4992, "step": 140 }, { "epoch": 0.337995337995338, "grad_norm": 0.2744064066208899, "learning_rate": 7.970179967452175e-05, "loss": 1.4618, "step": 145 }, { "epoch": 0.34965034965034963, "grad_norm": 0.24802775085025972, "learning_rate": 7.959678548918605e-05, "loss": 1.3894, "step": 150 }, { "epoch": 0.3613053613053613, "grad_norm": 0.25341971271489, "learning_rate": 7.948719405367275e-05, "loss": 1.402, "step": 155 }, { "epoch": 0.372960372960373, "grad_norm": 0.24564392308719407, "learning_rate": 7.937304261849485e-05, "loss": 1.3474, "step": 160 }, { "epoch": 0.38461538461538464, "grad_norm": 0.28202938683817624, "learning_rate": 7.925434915194349e-05, "loss": 1.4338, "step": 165 }, { "epoch": 0.3962703962703963, "grad_norm": 0.310609971889877, "learning_rate": 7.913113233725954e-05, "loss": 1.4238, "step": 170 }, { "epoch": 0.40792540792540793, "grad_norm": 0.27009263717875037, "learning_rate": 7.90034115696928e-05, "loss": 1.3496, "step": 175 }, { "epoch": 0.4195804195804196, "grad_norm": 0.2710853328115736, "learning_rate": 7.887120695344898e-05, "loss": 1.4584, "step": 180 }, { "epoch": 0.43123543123543123, "grad_norm": 0.25152522039701575, "learning_rate": 7.873453929852514e-05, "loss": 1.3774, "step": 185 }, { "epoch": 0.4428904428904429, "grad_norm": 0.2835745821601218, "learning_rate": 7.85934301174341e-05, "loss": 1.4099, "step": 190 }, { "epoch": 0.45454545454545453, "grad_norm": 1.0519805093542818, "learning_rate": 7.844790162181818e-05, "loss": 1.437, "step": 195 }, { "epoch": 0.4662004662004662, "grad_norm": 0.8265825248892074, "learning_rate": 7.829797671895288e-05, "loss": 1.4522, "step": 200 }, { "epoch": 0.47785547785547783, "grad_norm": 0.7862211345050619, "learning_rate": 7.814367900814116e-05, "loss": 1.4808, "step": 205 }, { "epoch": 0.48951048951048953, "grad_norm": 2.729527753162334, "learning_rate": 7.79850327769987e-05, "loss": 1.3966, "step": 210 }, { "epoch": 0.5011655011655012, "grad_norm": 0.5270917240760644, "learning_rate": 7.78220629976309e-05, "loss": 1.4515, "step": 215 }, { "epoch": 0.5128205128205128, "grad_norm": 0.2370434474197243, "learning_rate": 7.765479532270198e-05, "loss": 1.354, "step": 220 }, { "epoch": 0.5244755244755245, "grad_norm": 0.302576280537011, "learning_rate": 7.748325608139717e-05, "loss": 1.4122, "step": 225 }, { "epoch": 0.5361305361305362, "grad_norm": 0.26006318976574305, "learning_rate": 7.730747227527824e-05, "loss": 1.4065, "step": 230 }, { "epoch": 0.5477855477855478, "grad_norm": 0.25033459622615245, "learning_rate": 7.712747157403322e-05, "loss": 1.3791, "step": 235 }, { "epoch": 0.5594405594405595, "grad_norm": 0.22820286614092533, "learning_rate": 7.694328231112112e-05, "loss": 1.339, "step": 240 }, { "epoch": 0.5710955710955711, "grad_norm": 0.2808234577155991, "learning_rate": 7.675493347931184e-05, "loss": 1.391, "step": 245 }, { "epoch": 0.5827505827505828, "grad_norm": 0.248396781368916, "learning_rate": 7.656245472612264e-05, "loss": 1.3796, "step": 250 }, { "epoch": 0.5944055944055944, "grad_norm": 0.24239687758201922, "learning_rate": 7.636587634915133e-05, "loss": 1.2704, "step": 255 }, { "epoch": 0.6060606060606061, "grad_norm": 0.2889579270489845, "learning_rate": 7.616522929130724e-05, "loss": 1.3738, "step": 260 }, { "epoch": 0.6177156177156177, "grad_norm": 0.37773683176723527, "learning_rate": 7.596054513594051e-05, "loss": 1.2168, "step": 265 }, { "epoch": 0.6293706293706294, "grad_norm": 0.2507614413849896, "learning_rate": 7.575185610187072e-05, "loss": 1.2799, "step": 270 }, { "epoch": 0.6410256410256411, "grad_norm": 0.2519925650582622, "learning_rate": 7.553919503831533e-05, "loss": 1.3737, "step": 275 }, { "epoch": 0.6526806526806527, "grad_norm": 0.25693488058335956, "learning_rate": 7.532259541971902e-05, "loss": 1.3132, "step": 280 }, { "epoch": 0.6643356643356644, "grad_norm": 0.2654099769032578, "learning_rate": 7.510209134048455e-05, "loss": 1.336, "step": 285 }, { "epoch": 0.675990675990676, "grad_norm": 0.2991450312535211, "learning_rate": 7.4877717509606e-05, "loss": 1.3136, "step": 290 }, { "epoch": 0.6876456876456877, "grad_norm": 0.7936383565363224, "learning_rate": 7.46495092452054e-05, "loss": 1.3218, "step": 295 }, { "epoch": 0.6993006993006993, "grad_norm": 0.2833344928086715, "learning_rate": 7.441750246897328e-05, "loss": 1.3775, "step": 300 }, { "epoch": 0.710955710955711, "grad_norm": 0.23315264052796472, "learning_rate": 7.418173370051446e-05, "loss": 1.3401, "step": 305 }, { "epoch": 0.7226107226107226, "grad_norm": 0.27055343448065056, "learning_rate": 7.394224005159947e-05, "loss": 1.3282, "step": 310 }, { "epoch": 0.7342657342657343, "grad_norm": 0.2480849022059861, "learning_rate": 7.369905922032295e-05, "loss": 1.2888, "step": 315 }, { "epoch": 0.745920745920746, "grad_norm": 0.31945412606809426, "learning_rate": 7.345222948516969e-05, "loss": 1.4228, "step": 320 }, { "epoch": 0.7575757575757576, "grad_norm": 0.2576143196799571, "learning_rate": 7.320178969898926e-05, "loss": 1.3058, "step": 325 }, { "epoch": 0.7692307692307693, "grad_norm": 0.43520117132833047, "learning_rate": 7.294777928288031e-05, "loss": 1.3131, "step": 330 }, { "epoch": 0.7808857808857809, "grad_norm": 0.2882495350884511, "learning_rate": 7.26902382199854e-05, "loss": 1.32, "step": 335 }, { "epoch": 0.7925407925407926, "grad_norm": 0.33796460872196393, "learning_rate": 7.242920704919733e-05, "loss": 1.3031, "step": 340 }, { "epoch": 0.8041958041958042, "grad_norm": 0.2549144290343736, "learning_rate": 7.216472685877808e-05, "loss": 1.2656, "step": 345 }, { "epoch": 0.8158508158508159, "grad_norm": 0.6420737440392777, "learning_rate": 7.189683927989109e-05, "loss": 1.3621, "step": 350 }, { "epoch": 0.8275058275058275, "grad_norm": 0.25149854473667893, "learning_rate": 7.162558648004833e-05, "loss": 1.3055, "step": 355 }, { "epoch": 0.8391608391608392, "grad_norm": 0.8756690609514122, "learning_rate": 7.13510111564727e-05, "loss": 1.306, "step": 360 }, { "epoch": 0.8508158508158508, "grad_norm": 0.27473178820064537, "learning_rate": 7.107315652937733e-05, "loss": 1.3125, "step": 365 }, { "epoch": 0.8624708624708625, "grad_norm": 5.526242806524436, "learning_rate": 7.079206633516216e-05, "loss": 1.2443, "step": 370 }, { "epoch": 0.8741258741258742, "grad_norm": 0.2859887823856341, "learning_rate": 7.050778481952977e-05, "loss": 1.3051, "step": 375 }, { "epoch": 0.8857808857808858, "grad_norm": 0.26065360686340344, "learning_rate": 7.022035673052052e-05, "loss": 1.2533, "step": 380 }, { "epoch": 0.8974358974358975, "grad_norm": 0.28515419529118324, "learning_rate": 6.992982731146909e-05, "loss": 1.292, "step": 385 }, { "epoch": 0.9090909090909091, "grad_norm": 0.2521847950964824, "learning_rate": 6.963624229388268e-05, "loss": 1.3149, "step": 390 }, { "epoch": 0.9207459207459208, "grad_norm": 0.24378682997123743, "learning_rate": 6.933964789024263e-05, "loss": 1.299, "step": 395 }, { "epoch": 0.9324009324009324, "grad_norm": 0.22312967942400147, "learning_rate": 6.904009078673016e-05, "loss": 1.2422, "step": 400 }, { "epoch": 0.9440559440559441, "grad_norm": 0.22503476475625694, "learning_rate": 6.873761813587769e-05, "loss": 1.2913, "step": 405 }, { "epoch": 0.9557109557109557, "grad_norm": 0.22697946460375637, "learning_rate": 6.843227754914657e-05, "loss": 1.29, "step": 410 }, { "epoch": 0.9673659673659674, "grad_norm": 0.3144384632248903, "learning_rate": 6.812411708943284e-05, "loss": 1.2395, "step": 415 }, { "epoch": 0.9790209790209791, "grad_norm": 0.24364089502734793, "learning_rate": 6.781318526350156e-05, "loss": 1.4002, "step": 420 }, { "epoch": 0.9906759906759907, "grad_norm": 0.23043000137266378, "learning_rate": 6.749953101435168e-05, "loss": 1.2886, "step": 425 }, { "epoch": 0.9976689976689976, "eval_loss": 1.346985936164856, "eval_runtime": 41.4179, "eval_samples_per_second": 1.69, "eval_steps_per_second": 0.217, "step": 428 }, { "epoch": 1.0023310023310024, "grad_norm": 0.6370362893316296, "learning_rate": 6.718320371351193e-05, "loss": 1.2846, "step": 430 }, { "epoch": 1.013986013986014, "grad_norm": 0.26814660795194867, "learning_rate": 6.686425315326941e-05, "loss": 1.0865, "step": 435 }, { "epoch": 1.0256410256410255, "grad_norm": 0.23807462749178174, "learning_rate": 6.654272953883189e-05, "loss": 1.1612, "step": 440 }, { "epoch": 1.0372960372960374, "grad_norm": 0.23960120741081306, "learning_rate": 6.621868348042517e-05, "loss": 1.1393, "step": 445 }, { "epoch": 1.048951048951049, "grad_norm": 0.2601814951485897, "learning_rate": 6.58921659853266e-05, "loss": 1.2539, "step": 450 }, { "epoch": 1.0606060606060606, "grad_norm": 0.2489367328385621, "learning_rate": 6.55632284498362e-05, "loss": 1.1934, "step": 455 }, { "epoch": 1.0722610722610724, "grad_norm": 0.26068816228316616, "learning_rate": 6.523192265118652e-05, "loss": 1.252, "step": 460 }, { "epoch": 1.083916083916084, "grad_norm": 0.2681555770705724, "learning_rate": 6.489830073939237e-05, "loss": 1.2231, "step": 465 }, { "epoch": 1.0955710955710956, "grad_norm": 0.24692924998707827, "learning_rate": 6.456241522904223e-05, "loss": 1.1952, "step": 470 }, { "epoch": 1.1072261072261071, "grad_norm": 0.24530619885017255, "learning_rate": 6.422431899103189e-05, "loss": 1.1665, "step": 475 }, { "epoch": 1.118881118881119, "grad_norm": 0.2677189175766508, "learning_rate": 6.388406524424222e-05, "loss": 1.176, "step": 480 }, { "epoch": 1.1305361305361306, "grad_norm": 0.2411887554782912, "learning_rate": 6.35417075471622e-05, "loss": 1.1704, "step": 485 }, { "epoch": 1.1421911421911422, "grad_norm": 0.2541256840409212, "learning_rate": 6.319729978945832e-05, "loss": 1.1782, "step": 490 }, { "epoch": 1.1538461538461537, "grad_norm": 0.24850101976269176, "learning_rate": 6.285089618349196e-05, "loss": 1.1847, "step": 495 }, { "epoch": 1.1655011655011656, "grad_norm": 0.2672134029476264, "learning_rate": 6.250255125578597e-05, "loss": 1.0407, "step": 500 }, { "epoch": 1.1655011655011656, "eval_loss": 1.335543155670166, "eval_runtime": 41.117, "eval_samples_per_second": 1.702, "eval_steps_per_second": 0.219, "step": 500 }, { "epoch": 1.1771561771561772, "grad_norm": 0.24056303633515866, "learning_rate": 6.21523198384418e-05, "loss": 1.223, "step": 505 }, { "epoch": 1.1888111888111887, "grad_norm": 0.25656987276961235, "learning_rate": 6.18002570605085e-05, "loss": 1.1589, "step": 510 }, { "epoch": 1.2004662004662006, "grad_norm": 0.25884269499583334, "learning_rate": 6.144641833930498e-05, "loss": 1.1185, "step": 515 }, { "epoch": 1.2121212121212122, "grad_norm": 0.24120488400409848, "learning_rate": 6.109085937169695e-05, "loss": 1.2347, "step": 520 }, { "epoch": 1.2237762237762237, "grad_norm": 0.24337953823169015, "learning_rate": 6.0733636125329776e-05, "loss": 1.0777, "step": 525 }, { "epoch": 1.2354312354312353, "grad_norm": 0.2553344373947113, "learning_rate": 6.0374804829818786e-05, "loss": 1.1297, "step": 530 }, { "epoch": 1.2470862470862472, "grad_norm": 0.26618655203985797, "learning_rate": 6.001442196789827e-05, "loss": 1.1129, "step": 535 }, { "epoch": 1.2587412587412588, "grad_norm": 0.2631162852747018, "learning_rate": 5.965254426653072e-05, "loss": 1.2239, "step": 540 }, { "epoch": 1.2703962703962703, "grad_norm": 0.23274413800511448, "learning_rate": 5.928922868797752e-05, "loss": 1.103, "step": 545 }, { "epoch": 1.282051282051282, "grad_norm": 0.23157161304838858, "learning_rate": 5.892453242083273e-05, "loss": 1.0679, "step": 550 }, { "epoch": 1.2937062937062938, "grad_norm": 0.28740045332560504, "learning_rate": 5.855851287102113e-05, "loss": 1.1891, "step": 555 }, { "epoch": 1.3053613053613053, "grad_norm": 0.2970846582151576, "learning_rate": 5.81912276527621e-05, "loss": 1.1354, "step": 560 }, { "epoch": 1.317016317016317, "grad_norm": 0.2630829061530002, "learning_rate": 5.7822734579500705e-05, "loss": 1.1793, "step": 565 }, { "epoch": 1.3286713286713288, "grad_norm": 0.24952577498784723, "learning_rate": 5.745309165480747e-05, "loss": 1.0412, "step": 570 }, { "epoch": 1.3403263403263403, "grad_norm": 0.29261484653650605, "learning_rate": 5.7082357063248116e-05, "loss": 1.1266, "step": 575 }, { "epoch": 1.351981351981352, "grad_norm": 0.27423549595827834, "learning_rate": 5.671058916122493e-05, "loss": 1.1526, "step": 580 }, { "epoch": 1.3636363636363638, "grad_norm": 0.26193784576015794, "learning_rate": 5.6337846467790995e-05, "loss": 1.1818, "step": 585 }, { "epoch": 1.3752913752913754, "grad_norm": 0.2536711363621732, "learning_rate": 5.596418765543887e-05, "loss": 1.1099, "step": 590 }, { "epoch": 1.386946386946387, "grad_norm": 0.25809295106016567, "learning_rate": 5.55896715408651e-05, "loss": 1.1281, "step": 595 }, { "epoch": 1.3986013986013985, "grad_norm": 0.2964280224538696, "learning_rate": 5.521435707571199e-05, "loss": 1.1209, "step": 600 }, { "epoch": 1.4102564102564101, "grad_norm": 0.2535619637821886, "learning_rate": 5.483830333728829e-05, "loss": 1.0736, "step": 605 }, { "epoch": 1.421911421911422, "grad_norm": 0.2834714269145168, "learning_rate": 5.4461569519269803e-05, "loss": 1.0904, "step": 610 }, { "epoch": 1.4335664335664335, "grad_norm": 0.26457948671956855, "learning_rate": 5.4084214922382015e-05, "loss": 1.1718, "step": 615 }, { "epoch": 1.4452214452214451, "grad_norm": 0.30016622504388313, "learning_rate": 5.370629894506561e-05, "loss": 1.1222, "step": 620 }, { "epoch": 1.456876456876457, "grad_norm": 0.24560811509210745, "learning_rate": 5.332788107412684e-05, "loss": 1.1179, "step": 625 }, { "epoch": 1.4685314685314685, "grad_norm": 0.27129656920581763, "learning_rate": 5.294902087537369e-05, "loss": 1.1379, "step": 630 }, { "epoch": 1.4801864801864801, "grad_norm": 0.26155557293622045, "learning_rate": 5.256977798423988e-05, "loss": 1.1354, "step": 635 }, { "epoch": 1.491841491841492, "grad_norm": 0.2542515790861205, "learning_rate": 5.2190212096397825e-05, "loss": 1.0926, "step": 640 }, { "epoch": 1.5034965034965035, "grad_norm": 0.2846310062143532, "learning_rate": 5.181038295836196e-05, "loss": 1.206, "step": 645 }, { "epoch": 1.5151515151515151, "grad_norm": 0.2578892076658671, "learning_rate": 5.143035035808435e-05, "loss": 1.1348, "step": 650 }, { "epoch": 1.526806526806527, "grad_norm": 0.2677065667604314, "learning_rate": 5.1050174115543476e-05, "loss": 1.1376, "step": 655 }, { "epoch": 1.5384615384615383, "grad_norm": 0.2623622010580359, "learning_rate": 5.066991407332825e-05, "loss": 1.147, "step": 660 }, { "epoch": 1.5501165501165501, "grad_norm": 0.2924192984321144, "learning_rate": 5.028963008721822e-05, "loss": 1.2343, "step": 665 }, { "epoch": 1.5617715617715617, "grad_norm": 0.25879469972966473, "learning_rate": 4.990938201676194e-05, "loss": 1.1271, "step": 670 }, { "epoch": 1.5734265734265733, "grad_norm": 0.2738138873868371, "learning_rate": 4.952922971585451e-05, "loss": 1.0558, "step": 675 }, { "epoch": 1.5850815850815851, "grad_norm": 0.24373153596626246, "learning_rate": 4.914923302331625e-05, "loss": 1.1583, "step": 680 }, { "epoch": 1.5967365967365967, "grad_norm": 0.26700970389535894, "learning_rate": 4.87694517534735e-05, "loss": 1.0228, "step": 685 }, { "epoch": 1.6083916083916083, "grad_norm": 0.29201876514809744, "learning_rate": 4.838994568674351e-05, "loss": 1.1897, "step": 690 }, { "epoch": 1.6200466200466201, "grad_norm": 0.2731368722942658, "learning_rate": 4.801077456022443e-05, "loss": 1.2347, "step": 695 }, { "epoch": 1.6317016317016317, "grad_norm": 0.24941385151182885, "learning_rate": 4.763199805829236e-05, "loss": 1.0522, "step": 700 }, { "epoch": 1.6433566433566433, "grad_norm": 0.2601913077602386, "learning_rate": 4.7253675803206544e-05, "loss": 1.1047, "step": 705 }, { "epoch": 1.6550116550116551, "grad_norm": 0.2627466378538171, "learning_rate": 4.687586734572431e-05, "loss": 1.2039, "step": 710 }, { "epoch": 1.6666666666666665, "grad_norm": 0.33292973388136243, "learning_rate": 4.649863215572747e-05, "loss": 1.0949, "step": 715 }, { "epoch": 1.6783216783216783, "grad_norm": 0.28930444872767674, "learning_rate": 4.612202961286117e-05, "loss": 1.1411, "step": 720 }, { "epoch": 1.68997668997669, "grad_norm": 0.2811673269560653, "learning_rate": 4.574611899718721e-05, "loss": 1.1188, "step": 725 }, { "epoch": 1.7016317016317015, "grad_norm": 0.2624778144533447, "learning_rate": 4.537095947985282e-05, "loss": 1.2099, "step": 730 }, { "epoch": 1.7132867132867133, "grad_norm": 0.31909240386960064, "learning_rate": 4.499661011377677e-05, "loss": 1.1953, "step": 735 }, { "epoch": 1.724941724941725, "grad_norm": 0.45177444374511333, "learning_rate": 4.46231298243539e-05, "loss": 1.1496, "step": 740 }, { "epoch": 1.7365967365967365, "grad_norm": 0.3304135322652554, "learning_rate": 4.425057740017993e-05, "loss": 1.1407, "step": 745 }, { "epoch": 1.7482517482517483, "grad_norm": 0.26814860174828314, "learning_rate": 4.38790114837976e-05, "loss": 1.0907, "step": 750 }, { "epoch": 1.75990675990676, "grad_norm": 0.29715139148195335, "learning_rate": 4.350849056246595e-05, "loss": 1.1766, "step": 755 }, { "epoch": 1.7715617715617715, "grad_norm": 0.274851580699654, "learning_rate": 4.313907295895397e-05, "loss": 1.1497, "step": 760 }, { "epoch": 1.7832167832167833, "grad_norm": 0.30500141557968397, "learning_rate": 4.277081682236013e-05, "loss": 1.0404, "step": 765 }, { "epoch": 1.7948717948717947, "grad_norm": 2.9176458727968746, "learning_rate": 4.240378011895935e-05, "loss": 1.0777, "step": 770 }, { "epoch": 1.8065268065268065, "grad_norm": 0.32004127534889776, "learning_rate": 4.2038020623078596e-05, "loss": 1.1521, "step": 775 }, { "epoch": 1.8181818181818183, "grad_norm": 0.38996610677191895, "learning_rate": 4.1673595908002826e-05, "loss": 1.0999, "step": 780 }, { "epoch": 1.8298368298368297, "grad_norm": 0.39155093719025963, "learning_rate": 4.131056333691247e-05, "loss": 1.1836, "step": 785 }, { "epoch": 1.8414918414918415, "grad_norm": 0.2765149318590016, "learning_rate": 4.094898005385408e-05, "loss": 1.1018, "step": 790 }, { "epoch": 1.8531468531468531, "grad_norm": 0.22839762225047938, "learning_rate": 4.058890297474543e-05, "loss": 1.1704, "step": 795 }, { "epoch": 1.8648018648018647, "grad_norm": 0.269742896374976, "learning_rate": 4.023038877841649e-05, "loss": 1.0703, "step": 800 }, { "epoch": 1.8764568764568765, "grad_norm": 0.29602473720606215, "learning_rate": 3.987349389768777e-05, "loss": 1.1029, "step": 805 }, { "epoch": 1.8881118881118881, "grad_norm": 0.2609207540513949, "learning_rate": 3.951827451048737e-05, "loss": 1.2367, "step": 810 }, { "epoch": 1.8997668997668997, "grad_norm": 0.2508364860523673, "learning_rate": 3.916478653100816e-05, "loss": 1.0849, "step": 815 }, { "epoch": 1.9114219114219115, "grad_norm": 1.2856984958855535, "learning_rate": 3.881308560090648e-05, "loss": 1.1273, "step": 820 }, { "epoch": 1.9230769230769231, "grad_norm": 0.24270883363297913, "learning_rate": 3.846322708054368e-05, "loss": 1.1128, "step": 825 }, { "epoch": 1.9347319347319347, "grad_norm": 0.26881059124314843, "learning_rate": 3.811526604027204e-05, "loss": 1.2133, "step": 830 }, { "epoch": 1.9463869463869465, "grad_norm": 0.2878909720257614, "learning_rate": 3.7769257251766225e-05, "loss": 1.1629, "step": 835 }, { "epoch": 1.958041958041958, "grad_norm": 0.31812523517669944, "learning_rate": 3.742525517940187e-05, "loss": 1.149, "step": 840 }, { "epoch": 1.9696969696969697, "grad_norm": 0.27576941685401857, "learning_rate": 3.708331397168247e-05, "loss": 1.1484, "step": 845 }, { "epoch": 1.9813519813519813, "grad_norm": 0.27263251333241223, "learning_rate": 3.674348745271595e-05, "loss": 1.1931, "step": 850 }, { "epoch": 1.993006993006993, "grad_norm": 0.263716889490106, "learning_rate": 3.6405829113742405e-05, "loss": 1.1308, "step": 855 }, { "epoch": 1.9953379953379953, "eval_loss": 1.2667760848999023, "eval_runtime": 41.1775, "eval_samples_per_second": 1.7, "eval_steps_per_second": 0.219, "step": 856 } ], "logging_steps": 5, "max_steps": 1287, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 887872819298304.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }