{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3932878867330886, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00026219192448872575, "grad_norm": 22.20619010925293, "learning_rate": 0.0, "loss": 10.5131, "step": 1 }, { "epoch": 0.0026219192448872575, "grad_norm": 22.429588317871094, "learning_rate": 4.4999999999999996e-05, "loss": 10.4662, "step": 10 }, { "epoch": 0.005243838489774515, "grad_norm": 22.83245086669922, "learning_rate": 9.5e-05, "loss": 10.1612, "step": 20 }, { "epoch": 0.007865757734661772, "grad_norm": 23.247602462768555, "learning_rate": 0.000145, "loss": 9.5256, "step": 30 }, { "epoch": 0.01048767697954903, "grad_norm": 23.51291275024414, "learning_rate": 0.00019500000000000002, "loss": 8.5708, "step": 40 }, { "epoch": 0.013109596224436287, "grad_norm": 22.496492385864258, "learning_rate": 0.000245, "loss": 7.3388, "step": 50 }, { "epoch": 0.015731515469323543, "grad_norm": 16.345460891723633, "learning_rate": 0.000295, "loss": 5.9703, "step": 60 }, { "epoch": 0.018353434714210803, "grad_norm": 3.921259880065918, "learning_rate": 0.000345, "loss": 4.9478, "step": 70 }, { "epoch": 0.02097535395909806, "grad_norm": 7.0385589599609375, "learning_rate": 0.000395, "loss": 4.6803, "step": 80 }, { "epoch": 0.023597273203985317, "grad_norm": 2.6207873821258545, "learning_rate": 0.00044500000000000003, "loss": 4.4974, "step": 90 }, { "epoch": 0.026219192448872573, "grad_norm": 1.9961260557174683, "learning_rate": 0.000495, "loss": 4.3314, "step": 100 }, { "epoch": 0.028841111693759833, "grad_norm": 1.6183704137802124, "learning_rate": 0.000545, "loss": 4.1959, "step": 110 }, { "epoch": 0.03146303093864709, "grad_norm": 1.331021785736084, "learning_rate": 0.0005949999999999999, "loss": 4.0158, "step": 120 }, { "epoch": 0.03408495018353435, "grad_norm": 1.14554762840271, "learning_rate": 0.0006450000000000001, "loss": 3.9321, "step": 130 }, { "epoch": 0.03670686942842161, "grad_norm": 0.9175837635993958, "learning_rate": 0.000695, "loss": 3.802, "step": 140 }, { "epoch": 0.03932878867330886, "grad_norm": 0.7335033416748047, "learning_rate": 0.000745, "loss": 3.6618, "step": 150 }, { "epoch": 0.04195070791819612, "grad_norm": 0.5916274785995483, "learning_rate": 0.000795, "loss": 3.5341, "step": 160 }, { "epoch": 0.04457262716308338, "grad_norm": 0.4947799742221832, "learning_rate": 0.0008449999999999999, "loss": 3.5311, "step": 170 }, { "epoch": 0.04719454640797063, "grad_norm": 0.40263015031814575, "learning_rate": 0.0008950000000000001, "loss": 3.4709, "step": 180 }, { "epoch": 0.04981646565285789, "grad_norm": 0.32677406072616577, "learning_rate": 0.000945, "loss": 3.2973, "step": 190 }, { "epoch": 0.05243838489774515, "grad_norm": 0.3071628212928772, "learning_rate": 0.000995, "loss": 3.28, "step": 200 }, { "epoch": 0.05506030414263241, "grad_norm": 0.3233015835285187, "learning_rate": 0.001045, "loss": 3.2038, "step": 210 }, { "epoch": 0.05768222338751967, "grad_norm": 0.39402100443840027, "learning_rate": 0.001095, "loss": 3.1627, "step": 220 }, { "epoch": 0.060304142632406924, "grad_norm": 0.5528343915939331, "learning_rate": 0.001145, "loss": 3.1341, "step": 230 }, { "epoch": 0.06292606187729417, "grad_norm": 0.4888489842414856, "learning_rate": 0.001195, "loss": 3.0192, "step": 240 }, { "epoch": 0.06554798112218144, "grad_norm": 0.5662292838096619, "learning_rate": 0.0012450000000000002, "loss": 2.991, "step": 250 }, { "epoch": 0.0681699003670687, "grad_norm": 0.5800466537475586, "learning_rate": 0.001295, "loss": 2.992, "step": 260 }, { "epoch": 0.07079181961195595, "grad_norm": 0.5511091947555542, "learning_rate": 0.001345, "loss": 2.9246, "step": 270 }, { "epoch": 0.07341373885684321, "grad_norm": 0.7486537098884583, "learning_rate": 0.001395, "loss": 2.8996, "step": 280 }, { "epoch": 0.07603565810173046, "grad_norm": 0.6995801329612732, "learning_rate": 0.001445, "loss": 2.7945, "step": 290 }, { "epoch": 0.07865757734661773, "grad_norm": 0.7938666939735413, "learning_rate": 0.0014950000000000002, "loss": 2.7632, "step": 300 }, { "epoch": 0.08127949659150498, "grad_norm": 0.7555065155029297, "learning_rate": 0.001545, "loss": 2.7513, "step": 310 }, { "epoch": 0.08390141583639224, "grad_norm": 0.7714865803718567, "learning_rate": 0.001595, "loss": 2.6165, "step": 320 }, { "epoch": 0.08652333508127949, "grad_norm": 0.7604843974113464, "learning_rate": 0.001645, "loss": 2.6391, "step": 330 }, { "epoch": 0.08914525432616675, "grad_norm": 0.7840315699577332, "learning_rate": 0.0016950000000000001, "loss": 2.5818, "step": 340 }, { "epoch": 0.09176717357105402, "grad_norm": 1.0126832723617554, "learning_rate": 0.0017450000000000002, "loss": 2.5417, "step": 350 }, { "epoch": 0.09438909281594127, "grad_norm": 1.0092129707336426, "learning_rate": 0.001795, "loss": 2.4844, "step": 360 }, { "epoch": 0.09701101206082853, "grad_norm": 1.1585489511489868, "learning_rate": 0.001845, "loss": 2.4645, "step": 370 }, { "epoch": 0.09963293130571578, "grad_norm": 1.0778034925460815, "learning_rate": 0.001895, "loss": 2.4003, "step": 380 }, { "epoch": 0.10225485055060304, "grad_norm": 1.146636962890625, "learning_rate": 0.0019450000000000001, "loss": 2.3466, "step": 390 }, { "epoch": 0.1048767697954903, "grad_norm": 0.9742526412010193, "learning_rate": 0.0019950000000000002, "loss": 2.3088, "step": 400 }, { "epoch": 0.10749868904037756, "grad_norm": 1.3035728931427002, "learning_rate": 0.0019999657054386192, "loss": 2.2834, "step": 410 }, { "epoch": 0.11012060828526482, "grad_norm": 1.0689384937286377, "learning_rate": 0.0019998471593574603, "loss": 2.2473, "step": 420 }, { "epoch": 0.11274252753015207, "grad_norm": 1.1519441604614258, "learning_rate": 0.001999643948402709, "loss": 2.1925, "step": 430 }, { "epoch": 0.11536444677503933, "grad_norm": 0.9427940249443054, "learning_rate": 0.0019993560897818255, "loss": 2.1774, "step": 440 }, { "epoch": 0.11798636601992658, "grad_norm": 0.9017934203147888, "learning_rate": 0.0019989836078700496, "loss": 2.152, "step": 450 }, { "epoch": 0.12060828526481385, "grad_norm": 1.018966555595398, "learning_rate": 0.001998526534208335, "loss": 2.0825, "step": 460 }, { "epoch": 0.1232302045097011, "grad_norm": 1.0533466339111328, "learning_rate": 0.0019979849075006813, "loss": 2.1358, "step": 470 }, { "epoch": 0.12585212375458835, "grad_norm": 0.941605806350708, "learning_rate": 0.001997358773610856, "loss": 2.0524, "step": 480 }, { "epoch": 0.12847404299947562, "grad_norm": 0.8877449035644531, "learning_rate": 0.0019966481855585075, "loss": 2.0308, "step": 490 }, { "epoch": 0.13109596224436287, "grad_norm": 0.8652307391166687, "learning_rate": 0.001995853203514682, "loss": 2.012, "step": 500 }, { "epoch": 0.13371788148925012, "grad_norm": 0.8943641781806946, "learning_rate": 0.0019949738947967217, "loss": 1.9729, "step": 510 }, { "epoch": 0.1363398007341374, "grad_norm": 0.9359736442565918, "learning_rate": 0.001994010333862568, "loss": 1.9997, "step": 520 }, { "epoch": 0.13896171997902465, "grad_norm": 1.0085017681121826, "learning_rate": 0.001992962602304456, "loss": 1.937, "step": 530 }, { "epoch": 0.1415836392239119, "grad_norm": 0.7549618482589722, "learning_rate": 0.0019918307888420065, "loss": 1.9268, "step": 540 }, { "epoch": 0.14420555846879915, "grad_norm": 0.8932085037231445, "learning_rate": 0.0019906149893147104, "loss": 1.9014, "step": 550 }, { "epoch": 0.14682747771368643, "grad_norm": 0.8130724430084229, "learning_rate": 0.001989315306673817, "loss": 1.8577, "step": 560 }, { "epoch": 0.14944939695857368, "grad_norm": 0.8497139811515808, "learning_rate": 0.0019879318509736137, "loss": 1.8185, "step": 570 }, { "epoch": 0.15207131620346093, "grad_norm": 0.6299962997436523, "learning_rate": 0.001986464739362106, "loss": 1.811, "step": 580 }, { "epoch": 0.1546932354483482, "grad_norm": 0.7180768251419067, "learning_rate": 0.0019849140960711024, "loss": 1.7944, "step": 590 }, { "epoch": 0.15731515469323545, "grad_norm": 0.8082334399223328, "learning_rate": 0.0019832800524056888, "loss": 1.8333, "step": 600 }, { "epoch": 0.1599370739381227, "grad_norm": 0.8284159302711487, "learning_rate": 0.0019815627467331142, "loss": 1.811, "step": 610 }, { "epoch": 0.16255899318300995, "grad_norm": 0.7332941293716431, "learning_rate": 0.0019797623244710715, "loss": 1.7704, "step": 620 }, { "epoch": 0.16518091242789723, "grad_norm": 0.7234723567962646, "learning_rate": 0.0019778789380753862, "loss": 1.7558, "step": 630 }, { "epoch": 0.16780283167278448, "grad_norm": 0.693242073059082, "learning_rate": 0.001975912747027104, "loss": 1.742, "step": 640 }, { "epoch": 0.17042475091767173, "grad_norm": 0.8523733019828796, "learning_rate": 0.0019738639178189885, "loss": 1.7438, "step": 650 }, { "epoch": 0.17304667016255898, "grad_norm": 0.7505561709403992, "learning_rate": 0.001971732623941422, "loss": 1.7251, "step": 660 }, { "epoch": 0.17566858940744626, "grad_norm": 0.7338821887969971, "learning_rate": 0.0019695190458677144, "loss": 1.7281, "step": 670 }, { "epoch": 0.1782905086523335, "grad_norm": 0.8278585076332092, "learning_rate": 0.001967223371038823, "loss": 1.6983, "step": 680 }, { "epoch": 0.18091242789722076, "grad_norm": 0.6785498261451721, "learning_rate": 0.0019648457938474776, "loss": 1.7018, "step": 690 }, { "epoch": 0.18353434714210803, "grad_norm": 0.7954968810081482, "learning_rate": 0.0019623865156217215, "loss": 1.6978, "step": 700 }, { "epoch": 0.18615626638699528, "grad_norm": 0.6877925992012024, "learning_rate": 0.001959845744607864, "loss": 1.6693, "step": 710 }, { "epoch": 0.18877818563188253, "grad_norm": 0.6183112859725952, "learning_rate": 0.001957223695952844, "loss": 1.656, "step": 720 }, { "epoch": 0.19140010487676978, "grad_norm": 0.6864896416664124, "learning_rate": 0.0019545205916860152, "loss": 1.6188, "step": 730 }, { "epoch": 0.19402202412165706, "grad_norm": 0.6678555011749268, "learning_rate": 0.0019517366607003429, "loss": 1.6195, "step": 740 }, { "epoch": 0.1966439433665443, "grad_norm": 0.724320113658905, "learning_rate": 0.0019488721387330222, "loss": 1.6067, "step": 750 }, { "epoch": 0.19926586261143156, "grad_norm": 0.6665757298469543, "learning_rate": 0.0019459272683455162, "loss": 1.5781, "step": 760 }, { "epoch": 0.20188778185631884, "grad_norm": 0.7139772772789001, "learning_rate": 0.0019429022989030176, "loss": 1.5647, "step": 770 }, { "epoch": 0.2045097011012061, "grad_norm": 0.6505457758903503, "learning_rate": 0.0019397974865533315, "loss": 1.5869, "step": 780 }, { "epoch": 0.20713162034609334, "grad_norm": 0.6815754175186157, "learning_rate": 0.001936613094205186, "loss": 1.5848, "step": 790 }, { "epoch": 0.2097535395909806, "grad_norm": 0.6977171897888184, "learning_rate": 0.00193334939150597, "loss": 1.5284, "step": 800 }, { "epoch": 0.21237545883586786, "grad_norm": 0.5965753197669983, "learning_rate": 0.0019300066548188998, "loss": 1.5468, "step": 810 }, { "epoch": 0.2149973780807551, "grad_norm": 0.596052885055542, "learning_rate": 0.001926585167199616, "loss": 1.5579, "step": 820 }, { "epoch": 0.21761929732564236, "grad_norm": 0.6821017861366272, "learning_rate": 0.001923085218372218, "loss": 1.4984, "step": 830 }, { "epoch": 0.22024121657052964, "grad_norm": 0.6523297429084778, "learning_rate": 0.0019195071047047277, "loss": 1.537, "step": 840 }, { "epoch": 0.2228631358154169, "grad_norm": 0.648935079574585, "learning_rate": 0.0019158511291839945, "loss": 1.5192, "step": 850 }, { "epoch": 0.22548505506030414, "grad_norm": 0.6102792620658875, "learning_rate": 0.0019121176013900407, "loss": 1.5209, "step": 860 }, { "epoch": 0.2281069743051914, "grad_norm": 0.6573307514190674, "learning_rate": 0.0019083068374698448, "loss": 1.49, "step": 870 }, { "epoch": 0.23072889355007867, "grad_norm": 0.6355723738670349, "learning_rate": 0.0019044191601105727, "loss": 1.4929, "step": 880 }, { "epoch": 0.23335081279496592, "grad_norm": 0.5931225419044495, "learning_rate": 0.0019004548985122511, "loss": 1.4813, "step": 890 }, { "epoch": 0.23597273203985317, "grad_norm": 0.6640650629997253, "learning_rate": 0.0018964143883598936, "loss": 1.4808, "step": 900 }, { "epoch": 0.23859465128474042, "grad_norm": 0.6377866268157959, "learning_rate": 0.0018922979717950748, "loss": 1.4901, "step": 910 }, { "epoch": 0.2412165705296277, "grad_norm": 0.6502982378005981, "learning_rate": 0.0018881059973869581, "loss": 1.4501, "step": 920 }, { "epoch": 0.24383848977451494, "grad_norm": 0.602969765663147, "learning_rate": 0.0018838388201027805, "loss": 1.4661, "step": 930 }, { "epoch": 0.2464604090194022, "grad_norm": 0.6061879396438599, "learning_rate": 0.001879496801277794, "loss": 1.4408, "step": 940 }, { "epoch": 0.24908232826428947, "grad_norm": 0.8049127459526062, "learning_rate": 0.001875080308584669, "loss": 1.4466, "step": 950 }, { "epoch": 0.2517042475091767, "grad_norm": 0.46771517395973206, "learning_rate": 0.00187058971600236, "loss": 1.4382, "step": 960 }, { "epoch": 0.254326166754064, "grad_norm": 0.6081333756446838, "learning_rate": 0.001866025403784439, "loss": 1.4518, "step": 970 }, { "epoch": 0.25694808599895125, "grad_norm": 0.6247040033340454, "learning_rate": 0.0018613877584268944, "loss": 1.4639, "step": 980 }, { "epoch": 0.2595700052438385, "grad_norm": 0.5699506998062134, "learning_rate": 0.0018566771726354063, "loss": 1.4218, "step": 990 }, { "epoch": 0.26219192448872575, "grad_norm": 0.5360729694366455, "learning_rate": 0.0018518940452920906, "loss": 1.4189, "step": 1000 }, { "epoch": 0.264813843733613, "grad_norm": 0.5921474695205688, "learning_rate": 0.0018470387814217232, "loss": 1.424, "step": 1010 }, { "epoch": 0.26743576297850025, "grad_norm": 0.6162559986114502, "learning_rate": 0.0018421117921574438, "loss": 1.4307, "step": 1020 }, { "epoch": 0.2700576822233875, "grad_norm": 0.5530286431312561, "learning_rate": 0.001837113494705942, "loss": 1.4158, "step": 1030 }, { "epoch": 0.2726796014682748, "grad_norm": 0.5585499405860901, "learning_rate": 0.0018320443123121283, "loss": 1.3861, "step": 1040 }, { "epoch": 0.27530152071316205, "grad_norm": 0.6225973963737488, "learning_rate": 0.0018269046742232966, "loss": 1.3942, "step": 1050 }, { "epoch": 0.2779234399580493, "grad_norm": 0.49642321467399597, "learning_rate": 0.0018216950156527737, "loss": 1.3912, "step": 1060 }, { "epoch": 0.28054535920293655, "grad_norm": 0.6089576482772827, "learning_rate": 0.0018164157777430681, "loss": 1.3732, "step": 1070 }, { "epoch": 0.2831672784478238, "grad_norm": 0.5753847360610962, "learning_rate": 0.0018110674075285157, "loss": 1.398, "step": 1080 }, { "epoch": 0.28578919769271105, "grad_norm": 0.5357734560966492, "learning_rate": 0.0018056503578974242, "loss": 1.3851, "step": 1090 }, { "epoch": 0.2884111169375983, "grad_norm": 0.5319791436195374, "learning_rate": 0.001800165087553724, "loss": 1.3804, "step": 1100 }, { "epoch": 0.2910330361824856, "grad_norm": 0.5765709280967712, "learning_rate": 0.0017946120609781276, "loss": 1.3534, "step": 1110 }, { "epoch": 0.29365495542737285, "grad_norm": 0.48765453696250916, "learning_rate": 0.001788991748388796, "loss": 1.3693, "step": 1120 }, { "epoch": 0.2962768746722601, "grad_norm": 0.5916075110435486, "learning_rate": 0.001783304625701524, "loss": 1.3697, "step": 1130 }, { "epoch": 0.29889879391714735, "grad_norm": 0.411699503660202, "learning_rate": 0.0017775511744894384, "loss": 1.3588, "step": 1140 }, { "epoch": 0.3015207131620346, "grad_norm": 0.5155631899833679, "learning_rate": 0.0017717318819422214, "loss": 1.3697, "step": 1150 }, { "epoch": 0.30414263240692185, "grad_norm": 0.5687488913536072, "learning_rate": 0.0017658472408248551, "loss": 1.3558, "step": 1160 }, { "epoch": 0.3067645516518091, "grad_norm": 0.5609891414642334, "learning_rate": 0.0017598977494358967, "loss": 1.3376, "step": 1170 }, { "epoch": 0.3093864708966964, "grad_norm": 0.5137512683868408, "learning_rate": 0.0017538839115652817, "loss": 1.3534, "step": 1180 }, { "epoch": 0.31200839014158366, "grad_norm": 0.5840641260147095, "learning_rate": 0.001747806236451666, "loss": 1.3394, "step": 1190 }, { "epoch": 0.3146303093864709, "grad_norm": 0.5758949518203735, "learning_rate": 0.0017416652387393027, "loss": 1.3417, "step": 1200 }, { "epoch": 0.31725222863135816, "grad_norm": 0.5121742486953735, "learning_rate": 0.0017354614384344658, "loss": 1.341, "step": 1210 }, { "epoch": 0.3198741478762454, "grad_norm": 0.5056650638580322, "learning_rate": 0.001729195360861414, "loss": 1.316, "step": 1220 }, { "epoch": 0.32249606712113266, "grad_norm": 0.4782615602016449, "learning_rate": 0.0017228675366179106, "loss": 1.3226, "step": 1230 }, { "epoch": 0.3251179863660199, "grad_norm": 0.49403342604637146, "learning_rate": 0.0017164785015302906, "loss": 1.37, "step": 1240 }, { "epoch": 0.3277399056109072, "grad_norm": 0.4836321175098419, "learning_rate": 0.0017100287966080906, "loss": 1.3272, "step": 1250 }, { "epoch": 0.33036182485579446, "grad_norm": 0.48174890875816345, "learning_rate": 0.001703518967998236, "loss": 1.3148, "step": 1260 }, { "epoch": 0.3329837441006817, "grad_norm": 0.4627121090888977, "learning_rate": 0.001696949566938795, "loss": 1.3161, "step": 1270 }, { "epoch": 0.33560566334556896, "grad_norm": 0.470414936542511, "learning_rate": 0.0016903211497123003, "loss": 1.3313, "step": 1280 }, { "epoch": 0.3382275825904562, "grad_norm": 0.4437310993671417, "learning_rate": 0.0016836342775986446, "loss": 1.3073, "step": 1290 }, { "epoch": 0.34084950183534346, "grad_norm": 0.47688329219818115, "learning_rate": 0.0016768895168275534, "loss": 1.3128, "step": 1300 }, { "epoch": 0.3434714210802307, "grad_norm": 0.5143507122993469, "learning_rate": 0.0016700874385306363, "loss": 1.3357, "step": 1310 }, { "epoch": 0.34609334032511796, "grad_norm": 0.4100657105445862, "learning_rate": 0.0016632286186930275, "loss": 1.3061, "step": 1320 }, { "epoch": 0.34871525957000526, "grad_norm": 0.4421868920326233, "learning_rate": 0.0016563136381046088, "loss": 1.3158, "step": 1330 }, { "epoch": 0.3513371788148925, "grad_norm": 0.4668099582195282, "learning_rate": 0.0016493430823108332, "loss": 1.3088, "step": 1340 }, { "epoch": 0.35395909805977976, "grad_norm": 0.5451709032058716, "learning_rate": 0.0016423175415631404, "loss": 1.3344, "step": 1350 }, { "epoch": 0.356581017304667, "grad_norm": 0.45294106006622314, "learning_rate": 0.0016352376107689754, "loss": 1.2778, "step": 1360 }, { "epoch": 0.35920293654955426, "grad_norm": 0.4404051601886749, "learning_rate": 0.0016281038894414143, "loss": 1.2871, "step": 1370 }, { "epoch": 0.3618248557944415, "grad_norm": 0.45863279700279236, "learning_rate": 0.0016209169816483971, "loss": 1.3286, "step": 1380 }, { "epoch": 0.36444677503932876, "grad_norm": 0.45011425018310547, "learning_rate": 0.0016136774959615784, "loss": 1.2979, "step": 1390 }, { "epoch": 0.36706869428421607, "grad_norm": 0.5113876461982727, "learning_rate": 0.0016063860454047943, "loss": 1.3088, "step": 1400 }, { "epoch": 0.3696906135291033, "grad_norm": 0.40740302205085754, "learning_rate": 0.001599043247402151, "loss": 1.2703, "step": 1410 }, { "epoch": 0.37231253277399057, "grad_norm": 0.4261358976364136, "learning_rate": 0.0015916497237257455, "loss": 1.2681, "step": 1420 }, { "epoch": 0.3749344520188778, "grad_norm": 0.4349290132522583, "learning_rate": 0.0015842061004430145, "loss": 1.317, "step": 1430 }, { "epoch": 0.37755637126376507, "grad_norm": 0.4363626539707184, "learning_rate": 0.0015767130078637183, "loss": 1.2707, "step": 1440 }, { "epoch": 0.3801782905086523, "grad_norm": 0.41238006949424744, "learning_rate": 0.0015691710804865706, "loss": 1.2763, "step": 1450 }, { "epoch": 0.38280020975353957, "grad_norm": 0.476226270198822, "learning_rate": 0.0015615809569455089, "loss": 1.3037, "step": 1460 }, { "epoch": 0.38542212899842687, "grad_norm": 0.45900896191596985, "learning_rate": 0.0015539432799556159, "loss": 1.287, "step": 1470 }, { "epoch": 0.3880440482433141, "grad_norm": 0.3873949348926544, "learning_rate": 0.0015462586962586972, "loss": 1.2793, "step": 1480 }, { "epoch": 0.39066596748820137, "grad_norm": 0.4380306601524353, "learning_rate": 0.001538527856568515, "loss": 1.2916, "step": 1490 }, { "epoch": 0.3932878867330886, "grad_norm": 0.39479300379753113, "learning_rate": 0.0015307514155156895, "loss": 1.272, "step": 1500 } ], "logging_steps": 10, "max_steps": 3814, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0095396499845284e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }