{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9470512268618166, "eval_steps": 300, "global_step": 3300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0028698522026115655, "grad_norm": 168.79563903808594, "learning_rate": 1.739130434782609e-05, "loss": 12.0169, "step": 10 }, { "epoch": 0.005739704405223131, "grad_norm": 20.983991622924805, "learning_rate": 4.63768115942029e-05, "loss": 7.8235, "step": 20 }, { "epoch": 0.008609556607834697, "grad_norm": 21.168655395507812, "learning_rate": 7.536231884057971e-05, "loss": 6.834, "step": 30 }, { "epoch": 0.011479408810446262, "grad_norm": 21.57039451599121, "learning_rate": 0.00010434782608695653, "loss": 3.9023, "step": 40 }, { "epoch": 0.014349261013057828, "grad_norm": 48.81906509399414, "learning_rate": 0.00013333333333333334, "loss": 2.9802, "step": 50 }, { "epoch": 0.017219113215669393, "grad_norm": 7.396921157836914, "learning_rate": 0.00016231884057971017, "loss": 2.6257, "step": 60 }, { "epoch": 0.02008896541828096, "grad_norm": 62.19234848022461, "learning_rate": 0.00019130434782608697, "loss": 2.3201, "step": 70 }, { "epoch": 0.022958817620892524, "grad_norm": 8.402580261230469, "learning_rate": 0.00019999792781461744, "loss": 2.1749, "step": 80 }, { "epoch": 0.02582866982350409, "grad_norm": 7.064925670623779, "learning_rate": 0.0001999877785419313, "loss": 1.8889, "step": 90 }, { "epoch": 0.028698522026115655, "grad_norm": 7.678985118865967, "learning_rate": 0.0001999691724338023, "loss": 1.8161, "step": 100 }, { "epoch": 0.03156837422872722, "grad_norm": 9.882554054260254, "learning_rate": 0.0001999421110639107, "loss": 1.9209, "step": 110 }, { "epoch": 0.034438226431338786, "grad_norm": 8.960328102111816, "learning_rate": 0.00019990659672107177, "loss": 1.8535, "step": 120 }, { "epoch": 0.03730807863395035, "grad_norm": 6.723909378051758, "learning_rate": 0.00019986263240904216, "loss": 1.7978, "step": 130 }, { "epoch": 0.04017793083656192, "grad_norm": 14.159058570861816, "learning_rate": 0.00019981022184626578, "loss": 1.686, "step": 140 }, { "epoch": 0.04304778303917348, "grad_norm": 12.402606010437012, "learning_rate": 0.00019974936946555948, "loss": 1.6932, "step": 150 }, { "epoch": 0.04591763524178505, "grad_norm": 7.793806076049805, "learning_rate": 0.000199680080413738, "loss": 1.5665, "step": 160 }, { "epoch": 0.048787487444396614, "grad_norm": 9.647517204284668, "learning_rate": 0.0001996023605511786, "loss": 1.5892, "step": 170 }, { "epoch": 0.05165733964700818, "grad_norm": 7.5883564949035645, "learning_rate": 0.00019951621645132556, "loss": 1.5003, "step": 180 }, { "epoch": 0.054527191849619745, "grad_norm": 9.5863676071167, "learning_rate": 0.00019942165540013412, "loss": 1.4324, "step": 190 }, { "epoch": 0.05739704405223131, "grad_norm": 10.761382102966309, "learning_rate": 0.00019931868539545416, "loss": 1.2652, "step": 200 }, { "epoch": 0.060266896254842876, "grad_norm": 23.32731056213379, "learning_rate": 0.00019920731514635396, "loss": 1.2868, "step": 210 }, { "epoch": 0.06313674845745444, "grad_norm": 15.128023147583008, "learning_rate": 0.00019908755407238343, "loss": 1.2272, "step": 220 }, { "epoch": 0.066006600660066, "grad_norm": 12.924105644226074, "learning_rate": 0.00019895941230277744, "loss": 1.307, "step": 230 }, { "epoch": 0.06887645286267757, "grad_norm": 9.334559440612793, "learning_rate": 0.00019882290067559915, "loss": 1.1858, "step": 240 }, { "epoch": 0.07174630506528913, "grad_norm": 12.918402671813965, "learning_rate": 0.0001986780307368233, "loss": 1.1668, "step": 250 }, { "epoch": 0.0746161572679007, "grad_norm": 8.966814994812012, "learning_rate": 0.00019852481473935974, "loss": 1.04, "step": 260 }, { "epoch": 0.07748600947051226, "grad_norm": 10.825933456420898, "learning_rate": 0.000198363265642017, "loss": 1.0674, "step": 270 }, { "epoch": 0.08035586167312384, "grad_norm": 20.35280418395996, "learning_rate": 0.00019819339710840626, "loss": 1.1564, "step": 280 }, { "epoch": 0.0832257138757354, "grad_norm": 24.500883102416992, "learning_rate": 0.00019801522350578577, "loss": 1.0751, "step": 290 }, { "epoch": 0.08609556607834697, "grad_norm": 8.19206428527832, "learning_rate": 0.00019782875990384568, "loss": 1.0476, "step": 300 }, { "epoch": 0.08896541828095852, "grad_norm": 8.840872764587402, "learning_rate": 0.00019763402207343338, "loss": 1.0478, "step": 310 }, { "epoch": 0.0918352704835701, "grad_norm": 11.326393127441406, "learning_rate": 0.00019743102648521967, "loss": 1.0235, "step": 320 }, { "epoch": 0.09470512268618166, "grad_norm": 15.35113525390625, "learning_rate": 0.00019721979030830572, "loss": 0.9794, "step": 330 }, { "epoch": 0.09757497488879323, "grad_norm": 11.8535795211792, "learning_rate": 0.0001970003314087709, "loss": 1.0072, "step": 340 }, { "epoch": 0.10044482709140479, "grad_norm": 24.779190063476562, "learning_rate": 0.0001967726683481617, "loss": 1.0056, "step": 350 }, { "epoch": 0.10331467929401636, "grad_norm": 20.744426727294922, "learning_rate": 0.00019653682038192188, "loss": 1.0066, "step": 360 }, { "epoch": 0.10618453149662792, "grad_norm": 21.19144630432129, "learning_rate": 0.00019629280745776364, "loss": 0.9673, "step": 370 }, { "epoch": 0.10905438369923949, "grad_norm": 18.140127182006836, "learning_rate": 0.0001960406502139808, "loss": 0.9903, "step": 380 }, { "epoch": 0.11192423590185105, "grad_norm": 19.997053146362305, "learning_rate": 0.00019578036997770296, "loss": 0.9715, "step": 390 }, { "epoch": 0.11479408810446262, "grad_norm": 15.790470123291016, "learning_rate": 0.0001955119887630919, "loss": 0.9508, "step": 400 }, { "epoch": 0.11766394030707418, "grad_norm": 18.330507278442383, "learning_rate": 0.0001952355292694795, "loss": 0.9867, "step": 410 }, { "epoch": 0.12053379250968575, "grad_norm": 13.211642265319824, "learning_rate": 0.0001949510148794478, "loss": 1.0481, "step": 420 }, { "epoch": 0.12340364471229731, "grad_norm": 9.442767143249512, "learning_rate": 0.00019465846965685158, "loss": 0.9686, "step": 430 }, { "epoch": 0.12627349691490888, "grad_norm": 15.597809791564941, "learning_rate": 0.00019435791834478293, "loss": 1.0821, "step": 440 }, { "epoch": 0.12914334911752046, "grad_norm": 13.517879486083984, "learning_rate": 0.0001940493863634784, "loss": 0.9397, "step": 450 }, { "epoch": 0.132013201320132, "grad_norm": 13.031438827514648, "learning_rate": 0.00019373289980816917, "loss": 1.0009, "step": 460 }, { "epoch": 0.13488305352274357, "grad_norm": 14.64666748046875, "learning_rate": 0.00019340848544687386, "loss": 0.9571, "step": 470 }, { "epoch": 0.13775290572535515, "grad_norm": 10.706031799316406, "learning_rate": 0.00019307617071813454, "loss": 1.0283, "step": 480 }, { "epoch": 0.14062275792796672, "grad_norm": 9.723997116088867, "learning_rate": 0.00019273598372869603, "loss": 0.9815, "step": 490 }, { "epoch": 0.14349261013057826, "grad_norm": 9.667860984802246, "learning_rate": 0.0001923879532511287, "loss": 0.9424, "step": 500 }, { "epoch": 0.14636246233318984, "grad_norm": 6.956273078918457, "learning_rate": 0.00019203210872139476, "loss": 0.9793, "step": 510 }, { "epoch": 0.1492323145358014, "grad_norm": 15.395605087280273, "learning_rate": 0.00019166848023635883, "loss": 1.0637, "step": 520 }, { "epoch": 0.15210216673841298, "grad_norm": 23.60310173034668, "learning_rate": 0.0001912970985512422, "loss": 0.9625, "step": 530 }, { "epoch": 0.15497201894102453, "grad_norm": 20.658727645874023, "learning_rate": 0.00019091799507702181, "loss": 0.9393, "step": 540 }, { "epoch": 0.1578418711436361, "grad_norm": 18.22756576538086, "learning_rate": 0.0001905312018777733, "loss": 0.9354, "step": 550 }, { "epoch": 0.16071172334624767, "grad_norm": 11.863499641418457, "learning_rate": 0.00019013675166795922, "loss": 0.933, "step": 560 }, { "epoch": 0.16358157554885924, "grad_norm": 11.65882682800293, "learning_rate": 0.00018973467780966202, "loss": 0.9119, "step": 570 }, { "epoch": 0.1664514277514708, "grad_norm": 11.474069595336914, "learning_rate": 0.00018932501430976242, "loss": 0.9511, "step": 580 }, { "epoch": 0.16932127995408236, "grad_norm": 8.225656509399414, "learning_rate": 0.00018890779581706303, "loss": 0.9474, "step": 590 }, { "epoch": 0.17219113215669393, "grad_norm": 15.780831336975098, "learning_rate": 0.00018848305761935797, "loss": 0.9528, "step": 600 }, { "epoch": 0.1750609843593055, "grad_norm": 9.415815353393555, "learning_rate": 0.00018805083564044802, "loss": 0.8619, "step": 610 }, { "epoch": 0.17793083656191705, "grad_norm": 9.250490188598633, "learning_rate": 0.0001876111664371025, "loss": 0.9168, "step": 620 }, { "epoch": 0.18080068876452862, "grad_norm": 15.730814933776855, "learning_rate": 0.0001871640871959672, "loss": 0.94, "step": 630 }, { "epoch": 0.1836705409671402, "grad_norm": 9.073026657104492, "learning_rate": 0.0001867096357304191, "loss": 0.9471, "step": 640 }, { "epoch": 0.18654039316975177, "grad_norm": 8.982126235961914, "learning_rate": 0.00018624785047736842, "loss": 0.9177, "step": 650 }, { "epoch": 0.1894102453723633, "grad_norm": 10.682122230529785, "learning_rate": 0.00018577877049400746, "loss": 0.9402, "step": 660 }, { "epoch": 0.19228009757497488, "grad_norm": 8.706944465637207, "learning_rate": 0.0001853024354545073, "loss": 0.8867, "step": 670 }, { "epoch": 0.19514994977758646, "grad_norm": 5.8472371101379395, "learning_rate": 0.00018481888564666208, "loss": 0.9135, "step": 680 }, { "epoch": 0.19801980198019803, "grad_norm": 5.432713508605957, "learning_rate": 0.00018432816196848172, "loss": 0.8525, "step": 690 }, { "epoch": 0.20088965418280957, "grad_norm": 28.993038177490234, "learning_rate": 0.00018383030592473266, "loss": 0.8779, "step": 700 }, { "epoch": 0.20375950638542115, "grad_norm": 5.313049793243408, "learning_rate": 0.0001833253596234274, "loss": 0.9551, "step": 710 }, { "epoch": 0.20662935858803272, "grad_norm": 18.639175415039062, "learning_rate": 0.00018281336577226327, "loss": 0.8694, "step": 720 }, { "epoch": 0.2094992107906443, "grad_norm": 15.578129768371582, "learning_rate": 0.00018229436767501012, "loss": 0.9017, "step": 730 }, { "epoch": 0.21236906299325584, "grad_norm": 18.0419864654541, "learning_rate": 0.0001817684092278477, "loss": 0.8616, "step": 740 }, { "epoch": 0.2152389151958674, "grad_norm": 8.34323787689209, "learning_rate": 0.00018123553491565308, "loss": 0.8902, "step": 750 }, { "epoch": 0.21810876739847898, "grad_norm": 8.49802017211914, "learning_rate": 0.00018069578980823816, "loss": 0.8781, "step": 760 }, { "epoch": 0.22097861960109055, "grad_norm": 6.250750541687012, "learning_rate": 0.00018014921955653772, "loss": 0.8405, "step": 770 }, { "epoch": 0.2238484718037021, "grad_norm": 25.283082962036133, "learning_rate": 0.00017959587038874822, "loss": 0.93, "step": 780 }, { "epoch": 0.22671832400631367, "grad_norm": 18.443071365356445, "learning_rate": 0.00017903578910641814, "loss": 0.9202, "step": 790 }, { "epoch": 0.22958817620892524, "grad_norm": 18.457555770874023, "learning_rate": 0.0001784690230804892, "loss": 0.9446, "step": 800 }, { "epoch": 0.23245802841153682, "grad_norm": 7.786270618438721, "learning_rate": 0.00017789562024729012, "loss": 0.899, "step": 810 }, { "epoch": 0.23532788061414836, "grad_norm": 6.527904033660889, "learning_rate": 0.00017731562910448202, "loss": 0.8866, "step": 820 }, { "epoch": 0.23819773281675993, "grad_norm": 8.394437789916992, "learning_rate": 0.00017672909870695665, "loss": 0.8749, "step": 830 }, { "epoch": 0.2410675850193715, "grad_norm": 6.815917491912842, "learning_rate": 0.00017613607866268742, "loss": 0.8542, "step": 840 }, { "epoch": 0.24393743722198308, "grad_norm": 16.42218780517578, "learning_rate": 0.00017553661912853347, "loss": 0.8658, "step": 850 }, { "epoch": 0.24680728942459462, "grad_norm": 14.373140335083008, "learning_rate": 0.00017493077080599768, "loss": 0.8756, "step": 860 }, { "epoch": 0.2496771416272062, "grad_norm": 17.368059158325195, "learning_rate": 0.0001743185849369381, "loss": 0.9572, "step": 870 }, { "epoch": 0.25254699382981777, "grad_norm": 8.744333267211914, "learning_rate": 0.0001737001132992344, "loss": 0.8743, "step": 880 }, { "epoch": 0.2554168460324293, "grad_norm": 9.240042686462402, "learning_rate": 0.0001730754082024082, "loss": 0.8666, "step": 890 }, { "epoch": 0.2582866982350409, "grad_norm": 8.81686782836914, "learning_rate": 0.00017244452248319896, "loss": 0.8771, "step": 900 }, { "epoch": 0.26115655043765246, "grad_norm": 46.30351638793945, "learning_rate": 0.00017180750950109504, "loss": 0.788, "step": 910 }, { "epoch": 0.264026402640264, "grad_norm": 6.262620449066162, "learning_rate": 0.0001711644231338208, "loss": 0.916, "step": 920 }, { "epoch": 0.2668962548428756, "grad_norm": 7.936816215515137, "learning_rate": 0.00017051531777277952, "loss": 0.8425, "step": 930 }, { "epoch": 0.26976610704548715, "grad_norm": 10.233474731445312, "learning_rate": 0.00016986024831845296, "loss": 0.9159, "step": 940 }, { "epoch": 0.27263595924809875, "grad_norm": 13.751338958740234, "learning_rate": 0.00016919927017575832, "loss": 0.8484, "step": 950 }, { "epoch": 0.2755058114507103, "grad_norm": 18.70934295654297, "learning_rate": 0.00016853243924936173, "loss": 0.8387, "step": 960 }, { "epoch": 0.27837566365332184, "grad_norm": 6.2156853675842285, "learning_rate": 0.0001678598119389502, "loss": 0.9127, "step": 970 }, { "epoch": 0.28124551585593344, "grad_norm": 10.486414909362793, "learning_rate": 0.00016718144513446127, "loss": 0.861, "step": 980 }, { "epoch": 0.284115368058545, "grad_norm": 7.782724380493164, "learning_rate": 0.00016649739621127146, "loss": 0.8739, "step": 990 }, { "epoch": 0.2869852202611565, "grad_norm": 30.388168334960938, "learning_rate": 0.00016580772302534337, "loss": 0.9009, "step": 1000 }, { "epoch": 0.2898550724637681, "grad_norm": 7.943617343902588, "learning_rate": 0.0001651124839083324, "loss": 0.8113, "step": 1010 }, { "epoch": 0.29272492466637967, "grad_norm": 8.402076721191406, "learning_rate": 0.00016441173766265315, "loss": 0.8076, "step": 1020 }, { "epoch": 0.29559477686899127, "grad_norm": 7.3927764892578125, "learning_rate": 0.00016370554355650584, "loss": 0.8263, "step": 1030 }, { "epoch": 0.2984646290716028, "grad_norm": 8.749371528625488, "learning_rate": 0.0001629939613188638, "loss": 0.8673, "step": 1040 }, { "epoch": 0.30133448127421436, "grad_norm": 4.924167156219482, "learning_rate": 0.0001622770511344213, "loss": 0.869, "step": 1050 }, { "epoch": 0.30420433347682596, "grad_norm": 34.14529037475586, "learning_rate": 0.00016155487363850342, "loss": 0.9202, "step": 1060 }, { "epoch": 0.3070741856794375, "grad_norm": 13.217582702636719, "learning_rate": 0.00016082748991193757, "loss": 0.8409, "step": 1070 }, { "epoch": 0.30994403788204905, "grad_norm": 19.251298904418945, "learning_rate": 0.00016009496147588735, "loss": 0.8624, "step": 1080 }, { "epoch": 0.31281389008466065, "grad_norm": 52.710453033447266, "learning_rate": 0.00015935735028664908, "loss": 0.8695, "step": 1090 }, { "epoch": 0.3156837422872722, "grad_norm": 15.96419906616211, "learning_rate": 0.00015861471873041184, "loss": 0.8773, "step": 1100 }, { "epoch": 0.3185535944898838, "grad_norm": 7.947400093078613, "learning_rate": 0.0001578671296179806, "loss": 0.8387, "step": 1110 }, { "epoch": 0.32142344669249534, "grad_norm": 13.167436599731445, "learning_rate": 0.00015711464617946402, "loss": 0.8582, "step": 1120 }, { "epoch": 0.3242932988951069, "grad_norm": 11.579595565795898, "learning_rate": 0.00015635733205892653, "loss": 0.8615, "step": 1130 }, { "epoch": 0.3271631510977185, "grad_norm": 4.840546131134033, "learning_rate": 0.00015559525130900523, "loss": 0.822, "step": 1140 }, { "epoch": 0.33003300330033003, "grad_norm": 8.159014701843262, "learning_rate": 0.0001548284683854925, "loss": 0.8512, "step": 1150 }, { "epoch": 0.3329028555029416, "grad_norm": 33.13652038574219, "learning_rate": 0.00015405704814188442, "loss": 0.8686, "step": 1160 }, { "epoch": 0.3357727077055532, "grad_norm": 5.398830890655518, "learning_rate": 0.00015328105582389557, "loss": 0.8685, "step": 1170 }, { "epoch": 0.3386425599081647, "grad_norm": 23.8563289642334, "learning_rate": 0.00015250055706394057, "loss": 0.8617, "step": 1180 }, { "epoch": 0.3415124121107763, "grad_norm": 5.886293411254883, "learning_rate": 0.00015171561787558297, "loss": 0.8559, "step": 1190 }, { "epoch": 0.34438226431338786, "grad_norm": 7.887658596038818, "learning_rate": 0.000150926304647952, "loss": 0.8811, "step": 1200 }, { "epoch": 0.3472521165159994, "grad_norm": 6.111181259155273, "learning_rate": 0.00015013268414012742, "loss": 0.8297, "step": 1210 }, { "epoch": 0.350121968718611, "grad_norm": 6.417325496673584, "learning_rate": 0.00014933482347549303, "loss": 0.8296, "step": 1220 }, { "epoch": 0.35299182092122255, "grad_norm": 48.331573486328125, "learning_rate": 0.00014853279013605957, "loss": 0.7966, "step": 1230 }, { "epoch": 0.3558616731238341, "grad_norm": 8.638408660888672, "learning_rate": 0.00014772665195675718, "loss": 0.8522, "step": 1240 }, { "epoch": 0.3587315253264457, "grad_norm": 6.308197498321533, "learning_rate": 0.00014691647711969803, "loss": 0.8228, "step": 1250 }, { "epoch": 0.36160137752905724, "grad_norm": 6.23061990737915, "learning_rate": 0.0001461023341484094, "loss": 0.7915, "step": 1260 }, { "epoch": 0.36447122973166884, "grad_norm": 6.377804756164551, "learning_rate": 0.00014528429190203824, "loss": 0.8486, "step": 1270 }, { "epoch": 0.3673410819342804, "grad_norm": 6.146363258361816, "learning_rate": 0.00014446241956952714, "loss": 0.8927, "step": 1280 }, { "epoch": 0.37021093413689193, "grad_norm": 3.900587320327759, "learning_rate": 0.0001436367866637622, "loss": 0.8167, "step": 1290 }, { "epoch": 0.37308078633950353, "grad_norm": 8.58018684387207, "learning_rate": 0.00014280746301569407, "loss": 0.8128, "step": 1300 }, { "epoch": 0.3759506385421151, "grad_norm": 5.754461288452148, "learning_rate": 0.00014197451876843138, "loss": 0.8441, "step": 1310 }, { "epoch": 0.3788204907447266, "grad_norm": 7.290277004241943, "learning_rate": 0.00014113802437130845, "loss": 0.8555, "step": 1320 }, { "epoch": 0.3816903429473382, "grad_norm": 43.14801788330078, "learning_rate": 0.00014029805057392655, "loss": 0.8299, "step": 1330 }, { "epoch": 0.38456019514994977, "grad_norm": 5.909049034118652, "learning_rate": 0.0001394546684201701, "loss": 0.8448, "step": 1340 }, { "epoch": 0.38743004735256137, "grad_norm": 4.810829162597656, "learning_rate": 0.00013860794924219782, "loss": 0.8592, "step": 1350 }, { "epoch": 0.3902998995551729, "grad_norm": 6.602210998535156, "learning_rate": 0.00013775796465440956, "loss": 0.8351, "step": 1360 }, { "epoch": 0.39316975175778446, "grad_norm": 7.952111721038818, "learning_rate": 0.0001369047865473893, "loss": 0.8243, "step": 1370 }, { "epoch": 0.39603960396039606, "grad_norm": 8.271283149719238, "learning_rate": 0.00013604848708182466, "loss": 0.8239, "step": 1380 }, { "epoch": 0.3989094561630076, "grad_norm": 12.694669723510742, "learning_rate": 0.00013518913868240372, "loss": 0.8381, "step": 1390 }, { "epoch": 0.40177930836561915, "grad_norm": 22.169252395629883, "learning_rate": 0.00013432681403168932, "loss": 0.8227, "step": 1400 }, { "epoch": 0.40464916056823075, "grad_norm": 127.96073913574219, "learning_rate": 0.00013346158606397182, "loss": 0.8376, "step": 1410 }, { "epoch": 0.4075190127708423, "grad_norm": 12.16250991821289, "learning_rate": 0.0001325935279591003, "loss": 0.8253, "step": 1420 }, { "epoch": 0.4103888649734539, "grad_norm": 11.346808433532715, "learning_rate": 0.00013172271313629315, "loss": 0.8554, "step": 1430 }, { "epoch": 0.41325871717606544, "grad_norm": 18.371610641479492, "learning_rate": 0.0001308492152479283, "loss": 0.7743, "step": 1440 }, { "epoch": 0.416128569378677, "grad_norm": 17.174100875854492, "learning_rate": 0.00012997310817331392, "loss": 0.8342, "step": 1450 }, { "epoch": 0.4189984215812886, "grad_norm": 15.853143692016602, "learning_rate": 0.00012909446601243972, "loss": 0.8514, "step": 1460 }, { "epoch": 0.4218682737839001, "grad_norm": 6.734909534454346, "learning_rate": 0.00012821336307970965, "loss": 0.7947, "step": 1470 }, { "epoch": 0.42473812598651167, "grad_norm": 7.687751770019531, "learning_rate": 0.00012732987389765658, "loss": 0.8249, "step": 1480 }, { "epoch": 0.4276079781891233, "grad_norm": 4.791903972625732, "learning_rate": 0.00012644407319063918, "loss": 0.7755, "step": 1490 }, { "epoch": 0.4304778303917348, "grad_norm": 3.5958361625671387, "learning_rate": 0.0001255560358785219, "loss": 0.7828, "step": 1500 }, { "epoch": 0.4333476825943464, "grad_norm": 5.9140400886535645, "learning_rate": 0.00012466583707033832, "loss": 0.8044, "step": 1510 }, { "epoch": 0.43621753479695796, "grad_norm": 5.575759410858154, "learning_rate": 0.00012377355205793854, "loss": 0.7996, "step": 1520 }, { "epoch": 0.4390873869995695, "grad_norm": 6.771875381469727, "learning_rate": 0.00012287925630962107, "loss": 0.8261, "step": 1530 }, { "epoch": 0.4419572392021811, "grad_norm": 18.849271774291992, "learning_rate": 0.00012198302546374978, "loss": 0.8224, "step": 1540 }, { "epoch": 0.44482709140479265, "grad_norm": 5.645337104797363, "learning_rate": 0.00012108493532235666, "loss": 0.8185, "step": 1550 }, { "epoch": 0.4476969436074042, "grad_norm": 4.3476481437683105, "learning_rate": 0.00012018506184473038, "loss": 0.7985, "step": 1560 }, { "epoch": 0.4505667958100158, "grad_norm": 8.391561508178711, "learning_rate": 0.00011928348114099195, "loss": 0.7965, "step": 1570 }, { "epoch": 0.45343664801262734, "grad_norm": 11.707796096801758, "learning_rate": 0.00011838026946565723, "loss": 0.8174, "step": 1580 }, { "epoch": 0.45630650021523894, "grad_norm": 9.046381950378418, "learning_rate": 0.00011747550321118763, "loss": 0.8, "step": 1590 }, { "epoch": 0.4591763524178505, "grad_norm": 8.26490306854248, "learning_rate": 0.00011656925890152877, "loss": 0.8229, "step": 1600 }, { "epoch": 0.46204620462046203, "grad_norm": 6.398012638092041, "learning_rate": 0.00011566161318563821, "loss": 0.8027, "step": 1610 }, { "epoch": 0.46491605682307363, "grad_norm": 5.92479133605957, "learning_rate": 0.0001147526428310027, "loss": 0.8094, "step": 1620 }, { "epoch": 0.4677859090256852, "grad_norm": 7.79962158203125, "learning_rate": 0.00011384242471714512, "loss": 0.8049, "step": 1630 }, { "epoch": 0.4706557612282967, "grad_norm": 4.564454078674316, "learning_rate": 0.00011293103582912221, "loss": 0.8382, "step": 1640 }, { "epoch": 0.4735256134309083, "grad_norm": 20.43712043762207, "learning_rate": 0.00011201855325101332, "loss": 0.829, "step": 1650 }, { "epoch": 0.47639546563351987, "grad_norm": 5.778446674346924, "learning_rate": 0.0001111050541594006, "loss": 0.8333, "step": 1660 }, { "epoch": 0.47926531783613147, "grad_norm": 5.030070781707764, "learning_rate": 0.00011019061581684165, "loss": 0.769, "step": 1670 }, { "epoch": 0.482135170038743, "grad_norm": 5.967840671539307, "learning_rate": 0.00010927531556533456, "loss": 0.8041, "step": 1680 }, { "epoch": 0.48500502224135456, "grad_norm": 4.707633972167969, "learning_rate": 0.00010835923081977673, "loss": 0.8105, "step": 1690 }, { "epoch": 0.48787487444396616, "grad_norm": 6.354760646820068, "learning_rate": 0.0001074424390614169, "loss": 0.8031, "step": 1700 }, { "epoch": 0.4907447266465777, "grad_norm": 6.2033915519714355, "learning_rate": 0.00010652501783130208, "loss": 0.7559, "step": 1710 }, { "epoch": 0.49361457884918924, "grad_norm": 3.7331125736236572, "learning_rate": 0.00010560704472371919, "loss": 0.8233, "step": 1720 }, { "epoch": 0.49648443105180085, "grad_norm": 9.511772155761719, "learning_rate": 0.00010468859737963217, "loss": 0.7945, "step": 1730 }, { "epoch": 0.4993542832544124, "grad_norm": 12.07361125946045, "learning_rate": 0.00010376975348011533, "loss": 0.8368, "step": 1740 }, { "epoch": 0.5022241354570239, "grad_norm": 4.957511901855469, "learning_rate": 0.00010285059073978312, "loss": 0.8241, "step": 1750 }, { "epoch": 0.5050939876596355, "grad_norm": 4.124336242675781, "learning_rate": 0.00010193118690021699, "loss": 0.807, "step": 1760 }, { "epoch": 0.5079638398622471, "grad_norm": 4.789161205291748, "learning_rate": 0.00010101161972339046, "loss": 0.8143, "step": 1770 }, { "epoch": 0.5108336920648586, "grad_norm": 5.026962757110596, "learning_rate": 0.00010009196698509173, "loss": 0.7765, "step": 1780 }, { "epoch": 0.5137035442674702, "grad_norm": 8.285078048706055, "learning_rate": 9.91723064683458e-05, "loss": 0.8053, "step": 1790 }, { "epoch": 0.5165733964700818, "grad_norm": 4.77803897857666, "learning_rate": 9.825271595683548e-05, "loss": 0.8072, "step": 1800 }, { "epoch": 0.5194432486726933, "grad_norm": 4.466314315795898, "learning_rate": 9.73332732283226e-05, "loss": 0.7936, "step": 1810 }, { "epoch": 0.5223131008753049, "grad_norm": 6.21898078918457, "learning_rate": 9.641405604806983e-05, "loss": 0.8018, "step": 1820 }, { "epoch": 0.5251829530779165, "grad_norm": 3.505802869796753, "learning_rate": 9.549514216226311e-05, "loss": 0.823, "step": 1830 }, { "epoch": 0.528052805280528, "grad_norm": 4.254824161529541, "learning_rate": 9.45766092914363e-05, "loss": 0.824, "step": 1840 }, { "epoch": 0.5309226574831396, "grad_norm": 10.659527778625488, "learning_rate": 9.365853512389735e-05, "loss": 0.8169, "step": 1850 }, { "epoch": 0.5337925096857512, "grad_norm": 5.28292989730835, "learning_rate": 9.274099730915778e-05, "loss": 0.8076, "step": 1860 }, { "epoch": 0.5366623618883628, "grad_norm": 5.907596588134766, "learning_rate": 9.182407345136506e-05, "loss": 0.7863, "step": 1870 }, { "epoch": 0.5395322140909743, "grad_norm": 4.142882347106934, "learning_rate": 9.090784110273896e-05, "loss": 0.8133, "step": 1880 }, { "epoch": 0.5424020662935859, "grad_norm": 4.616401195526123, "learning_rate": 8.99923777570124e-05, "loss": 0.7853, "step": 1890 }, { "epoch": 0.5452719184961975, "grad_norm": 7.957604885101318, "learning_rate": 8.907776084287693e-05, "loss": 0.8275, "step": 1900 }, { "epoch": 0.548141770698809, "grad_norm": 3.326878070831299, "learning_rate": 8.816406771743412e-05, "loss": 0.7724, "step": 1910 }, { "epoch": 0.5510116229014206, "grad_norm": 4.447857856750488, "learning_rate": 8.725137565965262e-05, "loss": 0.8049, "step": 1920 }, { "epoch": 0.5538814751040322, "grad_norm": 5.452672004699707, "learning_rate": 8.633976186383217e-05, "loss": 0.8034, "step": 1930 }, { "epoch": 0.5567513273066437, "grad_norm": 5.054596900939941, "learning_rate": 8.542930343307444e-05, "loss": 0.7745, "step": 1940 }, { "epoch": 0.5596211795092553, "grad_norm": 25.82883071899414, "learning_rate": 8.452007737276191e-05, "loss": 0.7756, "step": 1950 }, { "epoch": 0.5624910317118669, "grad_norm": 4.046459197998047, "learning_rate": 8.361216058404468e-05, "loss": 0.7597, "step": 1960 }, { "epoch": 0.5653608839144784, "grad_norm": 18.29205894470215, "learning_rate": 8.270562985733652e-05, "loss": 0.7863, "step": 1970 }, { "epoch": 0.56823073611709, "grad_norm": 7.219738006591797, "learning_rate": 8.180056186581976e-05, "loss": 0.7651, "step": 1980 }, { "epoch": 0.5711005883197016, "grad_norm": 4.146981716156006, "learning_rate": 8.089703315896058e-05, "loss": 0.7578, "step": 1990 }, { "epoch": 0.573970440522313, "grad_norm": 4.7924675941467285, "learning_rate": 7.999512015603438e-05, "loss": 0.7974, "step": 2000 }, { "epoch": 0.5768402927249247, "grad_norm": 5.102847576141357, "learning_rate": 7.909489913966261e-05, "loss": 0.805, "step": 2010 }, { "epoch": 0.5797101449275363, "grad_norm": 5.353450298309326, "learning_rate": 7.819644624936051e-05, "loss": 0.7895, "step": 2020 }, { "epoch": 0.5825799971301477, "grad_norm": 5.74714469909668, "learning_rate": 7.72998374750977e-05, "loss": 0.8029, "step": 2030 }, { "epoch": 0.5854498493327593, "grad_norm": 4.67111873626709, "learning_rate": 7.640514865087077e-05, "loss": 0.7763, "step": 2040 }, { "epoch": 0.5883197015353709, "grad_norm": 4.226963996887207, "learning_rate": 7.551245544828944e-05, "loss": 0.7935, "step": 2050 }, { "epoch": 0.5911895537379825, "grad_norm": 6.067037105560303, "learning_rate": 7.46218333701765e-05, "loss": 0.7835, "step": 2060 }, { "epoch": 0.594059405940594, "grad_norm": 6.7161736488342285, "learning_rate": 7.373335774418158e-05, "loss": 0.7793, "step": 2070 }, { "epoch": 0.5969292581432056, "grad_norm": 4.633667945861816, "learning_rate": 7.28471037164103e-05, "loss": 0.793, "step": 2080 }, { "epoch": 0.5997991103458172, "grad_norm": 5.508072376251221, "learning_rate": 7.196314624506834e-05, "loss": 0.7589, "step": 2090 }, { "epoch": 0.6026689625484287, "grad_norm": 4.465757369995117, "learning_rate": 7.108156009412176e-05, "loss": 0.7569, "step": 2100 }, { "epoch": 0.6055388147510403, "grad_norm": 3.5824501514434814, "learning_rate": 7.02024198269733e-05, "loss": 0.7963, "step": 2110 }, { "epoch": 0.6084086669536519, "grad_norm": 8.07539176940918, "learning_rate": 6.932579980015618e-05, "loss": 0.8183, "step": 2120 }, { "epoch": 0.6112785191562634, "grad_norm": 5.9698615074157715, "learning_rate": 6.845177415704484e-05, "loss": 0.749, "step": 2130 }, { "epoch": 0.614148371358875, "grad_norm": 4.034762859344482, "learning_rate": 6.758041682158431e-05, "loss": 0.7853, "step": 2140 }, { "epoch": 0.6170182235614866, "grad_norm": 8.13531494140625, "learning_rate": 6.671180149203751e-05, "loss": 0.7871, "step": 2150 }, { "epoch": 0.6198880757640981, "grad_norm": 5.809640884399414, "learning_rate": 6.584600163475222e-05, "loss": 0.8037, "step": 2160 }, { "epoch": 0.6227579279667097, "grad_norm": 5.849427223205566, "learning_rate": 6.498309047794713e-05, "loss": 0.8076, "step": 2170 }, { "epoch": 0.6256277801693213, "grad_norm": 4.466967582702637, "learning_rate": 6.412314100551854e-05, "loss": 0.7863, "step": 2180 }, { "epoch": 0.6284976323719328, "grad_norm": 4.934723377227783, "learning_rate": 6.326622595086722e-05, "loss": 0.7747, "step": 2190 }, { "epoch": 0.6313674845745444, "grad_norm": 4.067635536193848, "learning_rate": 6.241241779074705e-05, "loss": 0.7804, "step": 2200 }, { "epoch": 0.634237336777156, "grad_norm": 4.629720687866211, "learning_rate": 6.156178873913468e-05, "loss": 0.7672, "step": 2210 }, { "epoch": 0.6371071889797676, "grad_norm": 3.9992971420288086, "learning_rate": 6.071441074112194e-05, "loss": 0.7856, "step": 2220 }, { "epoch": 0.6399770411823791, "grad_norm": 6.1507062911987305, "learning_rate": 5.9870355466830885e-05, "loss": 0.752, "step": 2230 }, { "epoch": 0.6428468933849907, "grad_norm": 4.305118083953857, "learning_rate": 5.902969430535186e-05, "loss": 0.7506, "step": 2240 }, { "epoch": 0.6457167455876023, "grad_norm": 3.7307469844818115, "learning_rate": 5.819249835870566e-05, "loss": 0.7744, "step": 2250 }, { "epoch": 0.6485865977902138, "grad_norm": 5.391602516174316, "learning_rate": 5.7358838435829664e-05, "loss": 0.8067, "step": 2260 }, { "epoch": 0.6514564499928254, "grad_norm": 4.221368789672852, "learning_rate": 5.6528785046589115e-05, "loss": 0.8257, "step": 2270 }, { "epoch": 0.654326302195437, "grad_norm": 5.274345397949219, "learning_rate": 5.570240839581323e-05, "loss": 0.7638, "step": 2280 }, { "epoch": 0.6571961543980485, "grad_norm": 4.528804779052734, "learning_rate": 5.487977837735756e-05, "loss": 0.7805, "step": 2290 }, { "epoch": 0.6600660066006601, "grad_norm": 4.387100696563721, "learning_rate": 5.406096456819234e-05, "loss": 0.7811, "step": 2300 }, { "epoch": 0.6629358588032717, "grad_norm": 5.64663028717041, "learning_rate": 5.324603622251797e-05, "loss": 0.771, "step": 2310 }, { "epoch": 0.6658057110058831, "grad_norm": 4.328652381896973, "learning_rate": 5.243506226590722e-05, "loss": 0.7711, "step": 2320 }, { "epoch": 0.6686755632084947, "grad_norm": 4.763848781585693, "learning_rate": 5.162811128947602e-05, "loss": 0.7849, "step": 2330 }, { "epoch": 0.6715454154111064, "grad_norm": 6.142160892486572, "learning_rate": 5.082525154408173e-05, "loss": 0.7587, "step": 2340 }, { "epoch": 0.6744152676137178, "grad_norm": 6.3459553718566895, "learning_rate": 5.002655093455086e-05, "loss": 0.7762, "step": 2350 }, { "epoch": 0.6772851198163294, "grad_norm": 5.520603656768799, "learning_rate": 4.9232077013935606e-05, "loss": 0.7854, "step": 2360 }, { "epoch": 0.680154972018941, "grad_norm": 3.9489786624908447, "learning_rate": 4.844189697780033e-05, "loss": 0.7599, "step": 2370 }, { "epoch": 0.6830248242215526, "grad_norm": 5.653624057769775, "learning_rate": 4.765607765853828e-05, "loss": 0.7875, "step": 2380 }, { "epoch": 0.6858946764241641, "grad_norm": 4.3883957862854, "learning_rate": 4.6874685519718945e-05, "loss": 0.7825, "step": 2390 }, { "epoch": 0.6887645286267757, "grad_norm": 3.743744134902954, "learning_rate": 4.60977866504668e-05, "loss": 0.7796, "step": 2400 }, { "epoch": 0.6916343808293873, "grad_norm": 5.168239593505859, "learning_rate": 4.5325446759871316e-05, "loss": 0.7764, "step": 2410 }, { "epoch": 0.6945042330319988, "grad_norm": 3.202075958251953, "learning_rate": 4.455773117142965e-05, "loss": 0.7483, "step": 2420 }, { "epoch": 0.6973740852346104, "grad_norm": 4.126010417938232, "learning_rate": 4.379470481752139e-05, "loss": 0.7702, "step": 2430 }, { "epoch": 0.700243937437222, "grad_norm": 5.2914509773254395, "learning_rate": 4.303643223391698e-05, "loss": 0.7663, "step": 2440 }, { "epoch": 0.7031137896398335, "grad_norm": 5.010975360870361, "learning_rate": 4.2282977554319034e-05, "loss": 0.7911, "step": 2450 }, { "epoch": 0.7059836418424451, "grad_norm": 3.504735231399536, "learning_rate": 4.153440450493823e-05, "loss": 0.7452, "step": 2460 }, { "epoch": 0.7088534940450567, "grad_norm": 5.5859880447387695, "learning_rate": 4.0790776399103294e-05, "loss": 0.758, "step": 2470 }, { "epoch": 0.7117233462476682, "grad_norm": 6.027501583099365, "learning_rate": 4.0052156131906214e-05, "loss": 0.7945, "step": 2480 }, { "epoch": 0.7145931984502798, "grad_norm": 5.546058654785156, "learning_rate": 3.93186061748824e-05, "loss": 0.7676, "step": 2490 }, { "epoch": 0.7174630506528914, "grad_norm": 4.879994869232178, "learning_rate": 3.859018857072719e-05, "loss": 0.7926, "step": 2500 }, { "epoch": 0.7203329028555029, "grad_norm": 4.717655181884766, "learning_rate": 3.786696492804812e-05, "loss": 0.7451, "step": 2510 }, { "epoch": 0.7232027550581145, "grad_norm": 6.432432174682617, "learning_rate": 3.714899641615438e-05, "loss": 0.7938, "step": 2520 }, { "epoch": 0.7260726072607261, "grad_norm": 5.008986473083496, "learning_rate": 3.6436343759882926e-05, "loss": 0.765, "step": 2530 }, { "epoch": 0.7289424594633377, "grad_norm": 7.00074577331543, "learning_rate": 3.5729067234462785e-05, "loss": 0.7794, "step": 2540 }, { "epoch": 0.7318123116659492, "grad_norm": 6.525863170623779, "learning_rate": 3.5027226660416736e-05, "loss": 0.7979, "step": 2550 }, { "epoch": 0.7346821638685608, "grad_norm": 5.4863786697387695, "learning_rate": 3.433088139850193e-05, "loss": 0.7625, "step": 2560 }, { "epoch": 0.7375520160711724, "grad_norm": 3.975086212158203, "learning_rate": 3.364009034468926e-05, "loss": 0.7471, "step": 2570 }, { "epoch": 0.7404218682737839, "grad_norm": 3.787874460220337, "learning_rate": 3.2954911925181876e-05, "loss": 0.7662, "step": 2580 }, { "epoch": 0.7432917204763955, "grad_norm": 4.633001804351807, "learning_rate": 3.2275404091473795e-05, "loss": 0.774, "step": 2590 }, { "epoch": 0.7461615726790071, "grad_norm": 4.832580089569092, "learning_rate": 3.1601624315448166e-05, "loss": 0.7749, "step": 2600 }, { "epoch": 0.7490314248816186, "grad_norm": 4.763906955718994, "learning_rate": 3.0933629584516665e-05, "loss": 0.7438, "step": 2610 }, { "epoch": 0.7519012770842302, "grad_norm": 4.065663814544678, "learning_rate": 3.027147639679928e-05, "loss": 0.7546, "step": 2620 }, { "epoch": 0.7547711292868418, "grad_norm": 4.496669769287109, "learning_rate": 2.961522075634604e-05, "loss": 0.7878, "step": 2630 }, { "epoch": 0.7576409814894532, "grad_norm": 3.8822827339172363, "learning_rate": 2.896491816840008e-05, "loss": 0.7884, "step": 2640 }, { "epoch": 0.7605108336920648, "grad_norm": 4.25615119934082, "learning_rate": 2.8320623634703147e-05, "loss": 0.7418, "step": 2650 }, { "epoch": 0.7633806858946764, "grad_norm": 4.472879886627197, "learning_rate": 2.76823916488436e-05, "loss": 0.7944, "step": 2660 }, { "epoch": 0.7662505380972879, "grad_norm": 6.644125938415527, "learning_rate": 2.705027619164754e-05, "loss": 0.7525, "step": 2670 }, { "epoch": 0.7691203902998995, "grad_norm": 3.8960325717926025, "learning_rate": 2.6424330726612946e-05, "loss": 0.748, "step": 2680 }, { "epoch": 0.7719902425025111, "grad_norm": 3.907740354537964, "learning_rate": 2.5804608195388057e-05, "loss": 0.7686, "step": 2690 }, { "epoch": 0.7748600947051227, "grad_norm": 4.432440757751465, "learning_rate": 2.5191161013293396e-05, "loss": 0.7671, "step": 2700 }, { "epoch": 0.7777299469077342, "grad_norm": 4.681542873382568, "learning_rate": 2.4584041064888798e-05, "loss": 0.765, "step": 2710 }, { "epoch": 0.7805997991103458, "grad_norm": 4.8185343742370605, "learning_rate": 2.398329969958486e-05, "loss": 0.772, "step": 2720 }, { "epoch": 0.7834696513129574, "grad_norm": 4.85504150390625, "learning_rate": 2.3388987727299982e-05, "loss": 0.7655, "step": 2730 }, { "epoch": 0.7863395035155689, "grad_norm": 4.443562030792236, "learning_rate": 2.2801155414162934e-05, "loss": 0.7885, "step": 2740 }, { "epoch": 0.7892093557181805, "grad_norm": 4.084039211273193, "learning_rate": 2.221985247826138e-05, "loss": 0.7679, "step": 2750 }, { "epoch": 0.7920792079207921, "grad_norm": 5.327516555786133, "learning_rate": 2.164512808543686e-05, "loss": 0.7704, "step": 2760 }, { "epoch": 0.7949490601234036, "grad_norm": 5.7689313888549805, "learning_rate": 2.1077030845126256e-05, "loss": 0.7572, "step": 2770 }, { "epoch": 0.7978189123260152, "grad_norm": 5.112376689910889, "learning_rate": 2.0515608806250665e-05, "loss": 0.7633, "step": 2780 }, { "epoch": 0.8006887645286268, "grad_norm": 4.748579502105713, "learning_rate": 1.996090945315128e-05, "loss": 0.7757, "step": 2790 }, { "epoch": 0.8035586167312383, "grad_norm": 4.38164758682251, "learning_rate": 1.941297970157344e-05, "loss": 0.7517, "step": 2800 }, { "epoch": 0.8064284689338499, "grad_norm": 4.2106523513793945, "learning_rate": 1.8871865894698336e-05, "loss": 0.7783, "step": 2810 }, { "epoch": 0.8092983211364615, "grad_norm": 6.83260440826416, "learning_rate": 1.8337613799223586e-05, "loss": 0.758, "step": 2820 }, { "epoch": 0.812168173339073, "grad_norm": 4.018373012542725, "learning_rate": 1.7810268601492164e-05, "loss": 0.7464, "step": 2830 }, { "epoch": 0.8150380255416846, "grad_norm": 5.183018207550049, "learning_rate": 1.7289874903670677e-05, "loss": 0.75, "step": 2840 }, { "epoch": 0.8179078777442962, "grad_norm": 3.9134421348571777, "learning_rate": 1.6776476719976974e-05, "loss": 0.7991, "step": 2850 }, { "epoch": 0.8207777299469078, "grad_norm": 5.056222915649414, "learning_rate": 1.6270117472957534e-05, "loss": 0.7419, "step": 2860 }, { "epoch": 0.8236475821495193, "grad_norm": 4.9499311447143555, "learning_rate": 1.5770839989814677e-05, "loss": 0.7927, "step": 2870 }, { "epoch": 0.8265174343521309, "grad_norm": 4.165496826171875, "learning_rate": 1.527868649878451e-05, "loss": 0.7502, "step": 2880 }, { "epoch": 0.8293872865547425, "grad_norm": 5.458337306976318, "learning_rate": 1.4793698625565122e-05, "loss": 0.7699, "step": 2890 }, { "epoch": 0.832257138757354, "grad_norm": 4.831928253173828, "learning_rate": 1.4315917389796119e-05, "loss": 0.7577, "step": 2900 }, { "epoch": 0.8351269909599656, "grad_norm": 5.4457221031188965, "learning_rate": 1.3845383201589057e-05, "loss": 0.76, "step": 2910 }, { "epoch": 0.8379968431625772, "grad_norm": 4.1194586753845215, "learning_rate": 1.3382135858109735e-05, "loss": 0.7865, "step": 2920 }, { "epoch": 0.8408666953651887, "grad_norm": 4.45517110824585, "learning_rate": 1.2926214540212155e-05, "loss": 0.7414, "step": 2930 }, { "epoch": 0.8437365475678003, "grad_norm": 4.03952169418335, "learning_rate": 1.2477657809124631e-05, "loss": 0.78, "step": 2940 }, { "epoch": 0.8466063997704119, "grad_norm": 4.787744998931885, "learning_rate": 1.2036503603188464e-05, "loss": 0.7862, "step": 2950 }, { "epoch": 0.8494762519730233, "grad_norm": 6.612007141113281, "learning_rate": 1.1602789234648948e-05, "loss": 0.7356, "step": 2960 }, { "epoch": 0.8523461041756349, "grad_norm": 4.051847457885742, "learning_rate": 1.1176551386499757e-05, "loss": 0.7261, "step": 2970 }, { "epoch": 0.8552159563782465, "grad_norm": 6.460504055023193, "learning_rate": 1.0757826109380165e-05, "loss": 0.7701, "step": 2980 }, { "epoch": 0.858085808580858, "grad_norm": 7.030419826507568, "learning_rate": 1.034664881852614e-05, "loss": 0.7938, "step": 2990 }, { "epoch": 0.8609556607834696, "grad_norm": 6.365281581878662, "learning_rate": 9.943054290774756e-06, "loss": 0.7574, "step": 3000 }, { "epoch": 0.8638255129860812, "grad_norm": 5.900289535522461, "learning_rate": 9.547076661622922e-06, "loss": 0.7758, "step": 3010 }, { "epoch": 0.8666953651886928, "grad_norm": 5.241759777069092, "learning_rate": 9.15874942234024e-06, "loss": 0.7805, "step": 3020 }, { "epoch": 0.8695652173913043, "grad_norm": 4.609664440155029, "learning_rate": 8.778105417136395e-06, "loss": 0.7642, "step": 3030 }, { "epoch": 0.8724350695939159, "grad_norm": 6.470444202423096, "learning_rate": 8.405176840383122e-06, "loss": 0.7928, "step": 3040 }, { "epoch": 0.8753049217965275, "grad_norm": 3.531794786453247, "learning_rate": 8.039995233891362e-06, "loss": 0.7503, "step": 3050 }, { "epoch": 0.878174773999139, "grad_norm": 5.537559986114502, "learning_rate": 7.682591484243417e-06, "loss": 0.7343, "step": 3060 }, { "epoch": 0.8810446262017506, "grad_norm": 3.7967238426208496, "learning_rate": 7.332995820180677e-06, "loss": 0.7345, "step": 3070 }, { "epoch": 0.8839144784043622, "grad_norm": 4.1268839836120605, "learning_rate": 6.991237810046847e-06, "loss": 0.7557, "step": 3080 }, { "epoch": 0.8867843306069737, "grad_norm": 7.182312965393066, "learning_rate": 6.6573463592871085e-06, "loss": 0.7635, "step": 3090 }, { "epoch": 0.8896541828095853, "grad_norm": 3.4768388271331787, "learning_rate": 6.331349708003365e-06, "loss": 0.7325, "step": 3100 }, { "epoch": 0.8925240350121969, "grad_norm": 5.252262115478516, "learning_rate": 6.013275428565712e-06, "loss": 0.7513, "step": 3110 }, { "epoch": 0.8953938872148084, "grad_norm": 4.213047027587891, "learning_rate": 5.703150423280401e-06, "loss": 0.7685, "step": 3120 }, { "epoch": 0.89826373941742, "grad_norm": 4.207084655761719, "learning_rate": 5.401000922114485e-06, "loss": 0.7313, "step": 3130 }, { "epoch": 0.9011335916200316, "grad_norm": 6.862100124359131, "learning_rate": 5.10685248047732e-06, "loss": 0.7626, "step": 3140 }, { "epoch": 0.9040034438226431, "grad_norm": 3.541048049926758, "learning_rate": 4.82072997705908e-06, "loss": 0.7748, "step": 3150 }, { "epoch": 0.9068732960252547, "grad_norm": 4.149963855743408, "learning_rate": 4.542657611726664e-06, "loss": 0.7651, "step": 3160 }, { "epoch": 0.9097431482278663, "grad_norm": 6.455443859100342, "learning_rate": 4.272658903476745e-06, "loss": 0.7769, "step": 3170 }, { "epoch": 0.9126130004304779, "grad_norm": 5.111416339874268, "learning_rate": 4.010756688446726e-06, "loss": 0.779, "step": 3180 }, { "epoch": 0.9154828526330894, "grad_norm": 5.0384440422058105, "learning_rate": 3.7569731179831537e-06, "loss": 0.7353, "step": 3190 }, { "epoch": 0.918352704835701, "grad_norm": 4.619420528411865, "learning_rate": 3.5113296567682476e-06, "loss": 0.7686, "step": 3200 }, { "epoch": 0.9212225570383126, "grad_norm": 5.13969612121582, "learning_rate": 3.2738470810044553e-06, "loss": 0.7475, "step": 3210 }, { "epoch": 0.9240924092409241, "grad_norm": 4.138948917388916, "learning_rate": 3.0445454766572235e-06, "loss": 0.743, "step": 3220 }, { "epoch": 0.9269622614435357, "grad_norm": 3.4994235038757324, "learning_rate": 2.8234442377561232e-06, "loss": 0.7491, "step": 3230 }, { "epoch": 0.9298321136461473, "grad_norm": 3.714160442352295, "learning_rate": 2.6105620647545734e-06, "loss": 0.7516, "step": 3240 }, { "epoch": 0.9327019658487588, "grad_norm": 3.1646008491516113, "learning_rate": 2.4059169629481403e-06, "loss": 0.751, "step": 3250 }, { "epoch": 0.9355718180513704, "grad_norm": 4.828333377838135, "learning_rate": 2.209526240951665e-06, "loss": 0.741, "step": 3260 }, { "epoch": 0.938441670253982, "grad_norm": 3.3315179347991943, "learning_rate": 2.021406509235402e-06, "loss": 0.7554, "step": 3270 }, { "epoch": 0.9413115224565934, "grad_norm": 6.141576766967773, "learning_rate": 1.8415736787200433e-06, "loss": 0.7465, "step": 3280 }, { "epoch": 0.944181374659205, "grad_norm": 4.839749336242676, "learning_rate": 1.6700429594310063e-06, "loss": 0.761, "step": 3290 }, { "epoch": 0.9470512268618166, "grad_norm": 4.683228969573975, "learning_rate": 1.5068288592120283e-06, "loss": 0.751, "step": 3300 } ], "logging_steps": 10, "max_steps": 3485, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.35032131289088e+20, "train_batch_size": 6, "trial_name": null, "trial_params": null }