{ "best_global_step": 1359, "best_metric": 0.05789753, "best_model_checkpoint": "/home/kiol/runs-v2-full-clean-r1/qwen35-qlora-forest/v0-20260424-170332/checkpoint-1359", "epoch": 3.0, "eval_steps": 200, "global_step": 1359, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002210708117443869, "grad_norm": 2.3902769088745117, "learning_rate": 2.173913043478261e-06, "loss": 0.5411382913589478, "step": 1, "token_acc": 0.8289156626506025 }, { "epoch": 0.011053540587219343, "grad_norm": 2.6440868377685547, "learning_rate": 1.0869565217391305e-05, "loss": 0.5509852766990662, "step": 5, "token_acc": 0.8427947598253275 }, { "epoch": 0.022107081174438686, "grad_norm": 2.8517158031463623, "learning_rate": 2.173913043478261e-05, "loss": 0.5423528671264648, "step": 10, "token_acc": 0.8413485374318295 }, { "epoch": 0.03316062176165803, "grad_norm": 3.078794479370117, "learning_rate": 3.260869565217392e-05, "loss": 0.4979440689086914, "step": 15, "token_acc": 0.8463063956370848 }, { "epoch": 0.04421416234887737, "grad_norm": 3.6568212509155273, "learning_rate": 4.347826086956522e-05, "loss": 0.4140318393707275, "step": 20, "token_acc": 0.8631159780985564 }, { "epoch": 0.055267702936096716, "grad_norm": 2.4268248081207275, "learning_rate": 4.9997331144187255e-05, "loss": 0.292206335067749, "step": 25, "token_acc": 0.9111776447105788 }, { "epoch": 0.06632124352331606, "grad_norm": 3.583557605743408, "learning_rate": 4.996731305997416e-05, "loss": 0.2570572137832642, "step": 30, "token_acc": 0.9214214214214215 }, { "epoch": 0.0773747841105354, "grad_norm": 3.932553768157959, "learning_rate": 4.990398100856367e-05, "loss": 0.20562427043914794, "step": 35, "token_acc": 0.9317507418397626 }, { "epoch": 0.08842832469775475, "grad_norm": 2.240527391433716, "learning_rate": 4.980741949411839e-05, "loss": 0.21206333637237548, "step": 40, "token_acc": 0.932 }, { "epoch": 0.09948186528497409, "grad_norm": 2.624908447265625, "learning_rate": 4.967775735898179e-05, "loss": 0.21093401908874512, "step": 45, "token_acc": 0.9330024813895782 }, { "epoch": 0.11053540587219343, "grad_norm": 1.4830845594406128, "learning_rate": 4.9515167611763434e-05, "loss": 0.1953817367553711, "step": 50, "token_acc": 0.9338308457711443 }, { "epoch": 0.12158894645941278, "grad_norm": 2.913989305496216, "learning_rate": 4.931986719649299e-05, "loss": 0.16938740015029907, "step": 55, "token_acc": 0.9406234537357744 }, { "epoch": 0.13264248704663212, "grad_norm": 2.571464776992798, "learning_rate": 4.909211670315114e-05, "loss": 0.18647342920303345, "step": 60, "token_acc": 0.9361914257228315 }, { "epoch": 0.14369602763385148, "grad_norm": 2.3652737140655518, "learning_rate": 4.8832220019963514e-05, "loss": 0.16005475521087648, "step": 65, "token_acc": 0.9426719840478565 }, { "epoch": 0.1547495682210708, "grad_norm": 2.1498990058898926, "learning_rate": 4.8540523927921616e-05, "loss": 0.17245489358901978, "step": 70, "token_acc": 0.9344672336168084 }, { "epoch": 0.16580310880829016, "grad_norm": 2.2657954692840576, "learning_rate": 4.821741763807186e-05, "loss": 0.1709640145301819, "step": 75, "token_acc": 0.9368473396320238 }, { "epoch": 0.1768566493955095, "grad_norm": 2.3392601013183594, "learning_rate": 4.786333227218995e-05, "loss": 0.16215839385986328, "step": 80, "token_acc": 0.9388667992047713 }, { "epoch": 0.18791018998272885, "grad_norm": 3.259490489959717, "learning_rate": 4.747874028753375e-05, "loss": 0.16813175678253173, "step": 85, "token_acc": 0.9411177644710579 }, { "epoch": 0.19896373056994818, "grad_norm": 2.805907726287842, "learning_rate": 4.706415484644195e-05, "loss": 0.16006848812103272, "step": 90, "token_acc": 0.944666001994018 }, { "epoch": 0.21001727115716753, "grad_norm": 2.46066951751709, "learning_rate": 4.662012913161997e-05, "loss": 0.15026205778121948, "step": 95, "token_acc": 0.9417910447761194 }, { "epoch": 0.22107081174438686, "grad_norm": 2.5938236713409424, "learning_rate": 4.6147255608026394e-05, "loss": 0.15484832525253295, "step": 100, "token_acc": 0.9400099157164105 }, { "epoch": 0.23212435233160622, "grad_norm": 2.2375521659851074, "learning_rate": 4.564616523234511e-05, "loss": 0.16213221549987794, "step": 105, "token_acc": 0.9409704852426213 }, { "epoch": 0.24317789291882555, "grad_norm": 2.779315233230591, "learning_rate": 4.511752661109768e-05, "loss": 0.15692013502120972, "step": 110, "token_acc": 0.9458519622454049 }, { "epoch": 0.2542314335060449, "grad_norm": 2.144108295440674, "learning_rate": 4.4562045108519565e-05, "loss": 0.1405424118041992, "step": 115, "token_acc": 0.9512437810945273 }, { "epoch": 0.26528497409326424, "grad_norm": 2.1729953289031982, "learning_rate": 4.398046190539025e-05, "loss": 0.13323441743850709, "step": 120, "token_acc": 0.9441953163926258 }, { "epoch": 0.2763385146804836, "grad_norm": 2.5072147846221924, "learning_rate": 4.3373553010073355e-05, "loss": 0.1478448271751404, "step": 125, "token_acc": 0.9433681073025335 }, { "epoch": 0.28739205526770295, "grad_norm": 3.7264955043792725, "learning_rate": 4.274212822308612e-05, "loss": 0.13619284629821776, "step": 130, "token_acc": 0.9491778774289985 }, { "epoch": 0.29844559585492225, "grad_norm": 3.988476276397705, "learning_rate": 4.208703005657999e-05, "loss": 0.1341521382331848, "step": 135, "token_acc": 0.9512922465208747 }, { "epoch": 0.3094991364421416, "grad_norm": 3.7429981231689453, "learning_rate": 4.140913261017382e-05, "loss": 0.13466038703918456, "step": 140, "token_acc": 0.9512074913750616 }, { "epoch": 0.32055267702936097, "grad_norm": 2.5042636394500732, "learning_rate": 4.070934040463998e-05, "loss": 0.13430129289627074, "step": 145, "token_acc": 0.9491525423728814 }, { "epoch": 0.3316062176165803, "grad_norm": 3.5047497749328613, "learning_rate": 3.998858717499931e-05, "loss": 0.1267208695411682, "step": 150, "token_acc": 0.9575636545182227 }, { "epoch": 0.3426597582037997, "grad_norm": 3.953479528427124, "learning_rate": 3.924783462463541e-05, "loss": 0.11603262424468994, "step": 155, "token_acc": 0.9582297364495276 }, { "epoch": 0.353713298791019, "grad_norm": 4.1243791580200195, "learning_rate": 3.848807114209074e-05, "loss": 0.1214677095413208, "step": 160, "token_acc": 0.9600591715976331 }, { "epoch": 0.36476683937823834, "grad_norm": 4.218201160430908, "learning_rate": 3.7710310482256526e-05, "loss": 0.12736471891403198, "step": 165, "token_acc": 0.9536390827517448 }, { "epoch": 0.3758203799654577, "grad_norm": 5.623844623565674, "learning_rate": 3.691559041371631e-05, "loss": 0.11791330575942993, "step": 170, "token_acc": 0.9556109725685785 }, { "epoch": 0.38687392055267705, "grad_norm": 2.831035852432251, "learning_rate": 3.6104971334047956e-05, "loss": 0.10760444402694702, "step": 175, "token_acc": 0.9595461272816971 }, { "epoch": 0.39792746113989635, "grad_norm": 3.888122797012329, "learning_rate": 3.527953485493168e-05, "loss": 0.11927105188369751, "step": 180, "token_acc": 0.956737941322725 }, { "epoch": 0.4089810017271157, "grad_norm": 3.259575128555298, "learning_rate": 3.444038235895212e-05, "loss": 0.12609773874282837, "step": 185, "token_acc": 0.955 }, { "epoch": 0.42003454231433507, "grad_norm": 4.910781383514404, "learning_rate": 3.358863353001987e-05, "loss": 0.12669839859008789, "step": 190, "token_acc": 0.9573412698412699 }, { "epoch": 0.4310880829015544, "grad_norm": 2.9438295364379883, "learning_rate": 3.272542485937369e-05, "loss": 0.11617603302001953, "step": 195, "token_acc": 0.958958958958959 }, { "epoch": 0.4421416234887737, "grad_norm": 5.1633195877075195, "learning_rate": 3.185190812915646e-05, "loss": 0.10707591772079468, "step": 200, "token_acc": 0.959479739869935 }, { "epoch": 0.4421416234887737, "eval_loss": 0.11586810648441315, "eval_runtime": 1992.0002, "eval_samples_per_second": 1.178, "eval_steps_per_second": 1.178, "eval_token_acc": 0.9600378980137381, "step": 200 }, { "epoch": 0.4531951640759931, "grad_norm": 4.371393203735352, "learning_rate": 3.096924887558855e-05, "loss": 0.11527643203735352, "step": 205, "token_acc": 0.9623202776400595 }, { "epoch": 0.46424870466321244, "grad_norm": 5.050312519073486, "learning_rate": 3.007862483378906e-05, "loss": 0.12530215978622436, "step": 210, "token_acc": 0.9602583209140586 }, { "epoch": 0.4753022452504318, "grad_norm": 2.541715145111084, "learning_rate": 2.9181224366319947e-05, "loss": 0.10488080978393555, "step": 215, "token_acc": 0.9631474103585658 }, { "epoch": 0.4863557858376511, "grad_norm": 5.020773410797119, "learning_rate": 2.827824487755007e-05, "loss": 0.13031703233718872, "step": 220, "token_acc": 0.9523809523809523 }, { "epoch": 0.49740932642487046, "grad_norm": 5.226169586181641, "learning_rate": 2.7370891215954568e-05, "loss": 0.13065972328186035, "step": 225, "token_acc": 0.9522150323544052 }, { "epoch": 0.5084628670120898, "grad_norm": 3.837796211242676, "learning_rate": 2.646037406648165e-05, "loss": 0.11477760076522828, "step": 230, "token_acc": 0.9575848303393214 }, { "epoch": 0.5195164075993092, "grad_norm": 3.0600061416625977, "learning_rate": 2.5547908335131704e-05, "loss": 0.1025763750076294, "step": 235, "token_acc": 0.9629446640316206 }, { "epoch": 0.5305699481865285, "grad_norm": 4.423151016235352, "learning_rate": 2.4634711527904272e-05, "loss": 0.11243470907211303, "step": 240, "token_acc": 0.9575212393803099 }, { "epoch": 0.5416234887737479, "grad_norm": 3.2377634048461914, "learning_rate": 2.3722002126275824e-05, "loss": 0.11171900033950806, "step": 245, "token_acc": 0.9635182408795602 }, { "epoch": 0.5526770293609672, "grad_norm": 3.354611396789551, "learning_rate": 2.281099796137594e-05, "loss": 0.10156643390655518, "step": 250, "token_acc": 0.965965965965966 }, { "epoch": 0.5637305699481865, "grad_norm": 4.366595268249512, "learning_rate": 2.19029145890313e-05, "loss": 0.09984329342842102, "step": 255, "token_acc": 0.9646061814556331 }, { "epoch": 0.5747841105354059, "grad_norm": 4.832893371582031, "learning_rate": 2.0998963667845535e-05, "loss": 0.08920307159423828, "step": 260, "token_acc": 0.9660847880299251 }, { "epoch": 0.5858376511226252, "grad_norm": 10.402474403381348, "learning_rate": 2.0100351342479216e-05, "loss": 0.10913920402526855, "step": 265, "token_acc": 0.9623389494549058 }, { "epoch": 0.5968911917098445, "grad_norm": 5.610108852386475, "learning_rate": 1.9208276634287143e-05, "loss": 0.11966934204101562, "step": 270, "token_acc": 0.9542060726729716 }, { "epoch": 0.6079447322970639, "grad_norm": 5.160745143890381, "learning_rate": 1.832392984146018e-05, "loss": 0.12343072891235352, "step": 275, "token_acc": 0.9561752988047809 }, { "epoch": 0.6189982728842832, "grad_norm": 4.599337100982666, "learning_rate": 1.7448490950806552e-05, "loss": 0.09566409587860107, "step": 280, "token_acc": 0.9677579365079365 }, { "epoch": 0.6300518134715026, "grad_norm": 4.375885486602783, "learning_rate": 1.6583128063291576e-05, "loss": 0.12313523292541503, "step": 285, "token_acc": 0.9544328875681031 }, { "epoch": 0.6411053540587219, "grad_norm": 6.371300220489502, "learning_rate": 1.572899583543671e-05, "loss": 0.10101137161254883, "step": 290, "token_acc": 0.96250616674889 }, { "epoch": 0.6521588946459412, "grad_norm": 4.738733291625977, "learning_rate": 1.488723393865766e-05, "loss": 0.10386931896209717, "step": 295, "token_acc": 0.963681592039801 }, { "epoch": 0.6632124352331606, "grad_norm": 6.841185569763184, "learning_rate": 1.4058965538597033e-05, "loss": 0.1303364634513855, "step": 300, "token_acc": 0.9556772908366534 }, { "epoch": 0.67426597582038, "grad_norm": 14.118032455444336, "learning_rate": 1.3245295796480789e-05, "loss": 0.12113407850265503, "step": 305, "token_acc": 0.9616342800199302 }, { "epoch": 0.6853195164075994, "grad_norm": 3.676441192626953, "learning_rate": 1.2447310394498019e-05, "loss": 0.09884743690490723, "step": 310, "token_acc": 0.9665518937530743 }, { "epoch": 0.6963730569948187, "grad_norm": 5.252943992614746, "learning_rate": 1.1666074087171627e-05, "loss": 0.1010090470314026, "step": 315, "token_acc": 0.9651741293532339 }, { "epoch": 0.707426597582038, "grad_norm": 4.592480182647705, "learning_rate": 1.0902629280652931e-05, "loss": 0.10196793079376221, "step": 320, "token_acc": 0.9611166500498505 }, { "epoch": 0.7184801381692574, "grad_norm": 6.885665416717529, "learning_rate": 1.0157994641835736e-05, "loss": 0.11955760717391968, "step": 325, "token_acc": 0.95773247140726 }, { "epoch": 0.7295336787564767, "grad_norm": 5.868223667144775, "learning_rate": 9.433163739145773e-06, "loss": 0.09359505772590637, "step": 330, "token_acc": 0.9641434262948207 }, { "epoch": 0.740587219343696, "grad_norm": 6.838636875152588, "learning_rate": 8.729103716819112e-06, "loss": 0.09510601162910462, "step": 335, "token_acc": 0.9697870232788509 }, { "epoch": 0.7516407599309154, "grad_norm": 5.597217559814453, "learning_rate": 8.046754004438429e-06, "loss": 0.0947553813457489, "step": 340, "token_acc": 0.9647992067426872 }, { "epoch": 0.7626943005181347, "grad_norm": 4.449385643005371, "learning_rate": 7.387025063449082e-06, "loss": 0.11492215394973755, "step": 345, "token_acc": 0.9548834903321765 }, { "epoch": 0.7737478411053541, "grad_norm": 5.535495758056641, "learning_rate": 6.750797172327442e-06, "loss": 0.10709909200668336, "step": 350, "token_acc": 0.9646590343454455 }, { "epoch": 0.7848013816925734, "grad_norm": 3.9693377017974854, "learning_rate": 6.138919252022435e-06, "loss": 0.09377566576004029, "step": 355, "token_acc": 0.963220675944334 }, { "epoch": 0.7958549222797927, "grad_norm": 4.245842456817627, "learning_rate": 5.5522077332375436e-06, "loss": 0.11761529445648193, "step": 360, "token_acc": 0.9603371343579573 }, { "epoch": 0.8069084628670121, "grad_norm": 5.86805534362793, "learning_rate": 4.99144546706469e-06, "loss": 0.11018631458282471, "step": 365, "token_acc": 0.9619000494804553 }, { "epoch": 0.8179620034542314, "grad_norm": 3.946427583694458, "learning_rate": 4.457380680423434e-06, "loss": 0.09712615013122558, "step": 370, "token_acc": 0.9691695673794132 }, { "epoch": 0.8290155440414507, "grad_norm": 3.8146259784698486, "learning_rate": 3.950725977699396e-06, "loss": 0.10309855937957764, "step": 375, "token_acc": 0.9674837418709354 }, { "epoch": 0.8400690846286701, "grad_norm": 6.905686855316162, "learning_rate": 3.4721573899138743e-06, "loss": 0.12204140424728394, "step": 380, "token_acc": 0.9587064676616915 }, { "epoch": 0.8511226252158894, "grad_norm": 5.100115776062012, "learning_rate": 3.0223134726934472e-06, "loss": 0.10104206800460816, "step": 385, "token_acc": 0.9645885286783042 }, { "epoch": 0.8621761658031089, "grad_norm": 6.091329574584961, "learning_rate": 2.6017944542431393e-06, "loss": 0.09742544889450074, "step": 390, "token_acc": 0.9626307922272048 }, { "epoch": 0.8732297063903282, "grad_norm": 5.460122585296631, "learning_rate": 2.2111614344599683e-06, "loss": 0.10580202341079711, "step": 395, "token_acc": 0.9581673306772909 }, { "epoch": 0.8842832469775475, "grad_norm": 3.7578561305999756, "learning_rate": 1.8509356362554963e-06, "loss": 0.09627346396446228, "step": 400, "token_acc": 0.9677898909811695 }, { "epoch": 0.8842832469775475, "eval_loss": 0.10459936410188675, "eval_runtime": 2136.0319, "eval_samples_per_second": 1.099, "eval_steps_per_second": 1.099, "eval_token_acc": 0.9643014245592664, "step": 400 }, { "epoch": 0.8953367875647669, "grad_norm": 4.357903480529785, "learning_rate": 1.5215977100864392e-06, "loss": 0.11347759962081909, "step": 405, "token_acc": 0.9596814335490294 }, { "epoch": 0.9063903281519862, "grad_norm": 2.9337828159332275, "learning_rate": 1.2235870926211619e-06, "loss": 0.11455904245376587, "step": 410, "token_acc": 0.9613095238095238 }, { "epoch": 0.9174438687392056, "grad_norm": 2.6067817211151123, "learning_rate": 9.573014203979242e-07, "loss": 0.0897371768951416, "step": 415, "token_acc": 0.9702970297029703 }, { "epoch": 0.9284974093264249, "grad_norm": 4.809932708740234, "learning_rate": 7.230959992571368e-07, "loss": 0.10621033906936646, "step": 420, "token_acc": 0.9666001994017946 }, { "epoch": 0.9395509499136442, "grad_norm": 6.890130996704102, "learning_rate": 5.212833302556258e-07, "loss": 0.10602649450302123, "step": 425, "token_acc": 0.9638076351016361 }, { "epoch": 0.9506044905008636, "grad_norm": 4.578585147857666, "learning_rate": 3.521326926954532e-07, "loss": 0.08783534765243531, "step": 430, "token_acc": 0.9670822942643391 }, { "epoch": 0.9616580310880829, "grad_norm": 4.462143898010254, "learning_rate": 2.158697848236607e-07, "loss": 0.08878173232078553, "step": 435, "token_acc": 0.9683950617283951 }, { "epoch": 0.9727115716753022, "grad_norm": 3.5091371536254883, "learning_rate": 1.1267642268238121e-07, "loss": 0.11377729177474975, "step": 440, "token_acc": 0.9592647789369101 }, { "epoch": 0.9837651122625216, "grad_norm": 5.187692165374756, "learning_rate": 4.26902975110749e-08, "loss": 0.08824495673179626, "step": 445, "token_acc": 0.9681750372948782 }, { "epoch": 0.9948186528497409, "grad_norm": 4.813872337341309, "learning_rate": 6.004792024680295e-09, "loss": 0.12380951642990112, "step": 450, "token_acc": 0.9557213930348258 }, { "epoch": 1.0, "eval_loss": 0.10443862527608871, "eval_runtime": 2140.8222, "eval_samples_per_second": 1.096, "eval_steps_per_second": 1.096, "eval_token_acc": 0.9642337495347342, "step": 453 }, { "epoch": 1.0044214162348877, "grad_norm": 2.683554172515869, "learning_rate": 2.9411764705882355e-06, "loss": 0.08228109031915665, "step": 455, "token_acc": 0.9726708074534162 }, { "epoch": 1.015474956822107, "grad_norm": 3.1228041648864746, "learning_rate": 1.0294117647058824e-05, "loss": 0.08381627202033996, "step": 460, "token_acc": 0.9706905116741182 }, { "epoch": 1.0265284974093265, "grad_norm": 4.306940078735352, "learning_rate": 1.7647058823529414e-05, "loss": 0.09802506566047668, "step": 465, "token_acc": 0.9675810473815462 }, { "epoch": 1.0375820379965457, "grad_norm": 3.2758865356445312, "learning_rate": 2.5e-05, "loss": 0.08836065530776978, "step": 470, "token_acc": 0.9721531576330183 }, { "epoch": 1.048635578583765, "grad_norm": 5.335510730743408, "learning_rate": 3.235294117647059e-05, "loss": 0.09924425482749939, "step": 475, "token_acc": 0.963220675944334 }, { "epoch": 1.0596891191709845, "grad_norm": 6.677077770233154, "learning_rate": 3.970588235294117e-05, "loss": 0.09869781732559205, "step": 480, "token_acc": 0.9626307922272048 }, { "epoch": 1.0707426597582037, "grad_norm": 7.235245227813721, "learning_rate": 4.705882352941177e-05, "loss": 0.09256232976913452, "step": 485, "token_acc": 0.967869500741473 }, { "epoch": 1.0817962003454231, "grad_norm": 2.617497682571411, "learning_rate": 5.441176470588235e-05, "loss": 0.11165302991867065, "step": 490, "token_acc": 0.9609804902451226 }, { "epoch": 1.0928497409326425, "grad_norm": 4.969885349273682, "learning_rate": 6.176470588235295e-05, "loss": 0.10181330442428589, "step": 495, "token_acc": 0.9626865671641791 }, { "epoch": 1.103903281519862, "grad_norm": 3.402235746383667, "learning_rate": 6.911764705882354e-05, "loss": 0.09162335991859435, "step": 500, "token_acc": 0.9699062654168722 }, { "epoch": 1.1149568221070811, "grad_norm": 3.815495729446411, "learning_rate": 7.647058823529411e-05, "loss": 0.11382390260696411, "step": 505, "token_acc": 0.954228855721393 }, { "epoch": 1.1260103626943005, "grad_norm": 4.899413108825684, "learning_rate": 8.382352941176471e-05, "loss": 0.1054335355758667, "step": 510, "token_acc": 0.9592850049652433 }, { "epoch": 1.1370639032815197, "grad_norm": 4.382949352264404, "learning_rate": 9.11764705882353e-05, "loss": 0.0767556071281433, "step": 515, "token_acc": 0.9732540861812778 }, { "epoch": 1.1481174438687392, "grad_norm": 4.010786533355713, "learning_rate": 9.852941176470589e-05, "loss": 0.0905315101146698, "step": 520, "token_acc": 0.9681274900398407 }, { "epoch": 1.1591709844559586, "grad_norm": 8.248323440551758, "learning_rate": 9.99976313340166e-05, "loss": 0.14047706127166748, "step": 525, "token_acc": 0.9527127924340468 }, { "epoch": 1.170224525043178, "grad_norm": 6.752266883850098, "learning_rate": 9.998800901308916e-05, "loss": 0.1028751015663147, "step": 530, "token_acc": 0.9620947630922694 }, { "epoch": 1.1812780656303972, "grad_norm": 4.091604709625244, "learning_rate": 9.997098641899562e-05, "loss": 0.1167901635169983, "step": 535, "token_acc": 0.9626679940268791 }, { "epoch": 1.1923316062176166, "grad_norm": 3.13606858253479, "learning_rate": 9.994656607177722e-05, "loss": 0.11653541326522827, "step": 540, "token_acc": 0.955050505050505 }, { "epoch": 1.203385146804836, "grad_norm": 4.286221027374268, "learning_rate": 9.991475158664578e-05, "loss": 0.09516905546188355, "step": 545, "token_acc": 0.968031968031968 }, { "epoch": 1.2144386873920552, "grad_norm": 5.616212368011475, "learning_rate": 9.987554767344845e-05, "loss": 0.12145495414733887, "step": 550, "token_acc": 0.9621890547263682 }, { "epoch": 1.2254922279792746, "grad_norm": 5.72205114364624, "learning_rate": 9.982896013597038e-05, "loss": 0.12007032632827759, "step": 555, "token_acc": 0.9582089552238806 }, { "epoch": 1.236545768566494, "grad_norm": 3.8363678455352783, "learning_rate": 9.977499587107569e-05, "loss": 0.08600590825080871, "step": 560, "token_acc": 0.967196819085487 }, { "epoch": 1.2475993091537134, "grad_norm": 2.206713914871216, "learning_rate": 9.971366286768629e-05, "loss": 0.11412311792373657, "step": 565, "token_acc": 0.961824491819534 }, { "epoch": 1.2586528497409326, "grad_norm": 2.48056697845459, "learning_rate": 9.964497020559926e-05, "loss": 0.09548119902610779, "step": 570, "token_acc": 0.968222442899702 }, { "epoch": 1.269706390328152, "grad_norm": 4.197746276855469, "learning_rate": 9.956892805414272e-05, "loss": 0.11575849056243896, "step": 575, "token_acc": 0.9597215315763302 }, { "epoch": 1.2807599309153712, "grad_norm": 3.9247732162475586, "learning_rate": 9.948554767067025e-05, "loss": 0.09247745275497436, "step": 580, "token_acc": 0.9692001987083955 }, { "epoch": 1.2918134715025906, "grad_norm": 3.7010436058044434, "learning_rate": 9.93948413988944e-05, "loss": 0.11627188920974732, "step": 585, "token_acc": 0.9602780536246276 }, { "epoch": 1.30286701208981, "grad_norm": 6.0411858558654785, "learning_rate": 9.92968226670593e-05, "loss": 0.09203023314476014, "step": 590, "token_acc": 0.9705882352941176 }, { "epoch": 1.3139205526770295, "grad_norm": 4.776832103729248, "learning_rate": 9.919150598595276e-05, "loss": 0.07992117404937744, "step": 595, "token_acc": 0.9711155378486056 }, { "epoch": 1.3249740932642486, "grad_norm": 2.1442465782165527, "learning_rate": 9.907890694675803e-05, "loss": 0.08411768078804016, "step": 600, "token_acc": 0.971301335972291 }, { "epoch": 1.3249740932642486, "eval_loss": 0.10072976350784302, "eval_runtime": 2790.9554, "eval_samples_per_second": 0.841, "eval_steps_per_second": 0.841, "eval_token_acc": 0.9663655128074984, "step": 600 }, { "epoch": 1.336027633851468, "grad_norm": 3.444389820098877, "learning_rate": 9.89590422187457e-05, "loss": 0.0943886399269104, "step": 605, "token_acc": 0.969261279127417 }, { "epoch": 1.3470811744386875, "grad_norm": 3.3243448734283447, "learning_rate": 9.883192954680593e-05, "loss": 0.07429519295692444, "step": 610, "token_acc": 0.9723046488625123 }, { "epoch": 1.3581347150259067, "grad_norm": 3.897686243057251, "learning_rate": 9.869758774882154e-05, "loss": 0.10087257623672485, "step": 615, "token_acc": 0.9645 }, { "epoch": 1.369188255613126, "grad_norm": 3.0886733531951904, "learning_rate": 9.855603671288215e-05, "loss": 0.0957147240638733, "step": 620, "token_acc": 0.9651394422310757 }, { "epoch": 1.3802417962003455, "grad_norm": 5.4413604736328125, "learning_rate": 9.840729739433992e-05, "loss": 0.0882586419582367, "step": 625, "token_acc": 0.9705441837244134 }, { "epoch": 1.391295336787565, "grad_norm": 3.051844358444214, "learning_rate": 9.82513918127073e-05, "loss": 0.09511439800262451, "step": 630, "token_acc": 0.9663532904502722 }, { "epoch": 1.402348877374784, "grad_norm": 2.9811363220214844, "learning_rate": 9.808834304839729e-05, "loss": 0.10007621049880981, "step": 635, "token_acc": 0.9672943508424182 }, { "epoch": 1.4134024179620035, "grad_norm": 3.030879497528076, "learning_rate": 9.791817523930653e-05, "loss": 0.08152814507484436, "step": 640, "token_acc": 0.9720837487537388 }, { "epoch": 1.4244559585492227, "grad_norm": 2.544180154800415, "learning_rate": 9.774091357724196e-05, "loss": 0.07389838099479676, "step": 645, "token_acc": 0.9755854509217738 }, { "epoch": 1.435509499136442, "grad_norm": 5.591005802154541, "learning_rate": 9.755658430419132e-05, "loss": 0.09485760927200318, "step": 650, "token_acc": 0.9637357178340785 }, { "epoch": 1.4465630397236615, "grad_norm": 4.964202880859375, "learning_rate": 9.736521470843838e-05, "loss": 0.08160382509231567, "step": 655, "token_acc": 0.9705441837244134 }, { "epoch": 1.457616580310881, "grad_norm": 4.984673500061035, "learning_rate": 9.7166833120523e-05, "loss": 0.08802146315574647, "step": 660, "token_acc": 0.9641076769690927 }, { "epoch": 1.4686701208981001, "grad_norm": 2.584303140640259, "learning_rate": 9.696146890904722e-05, "loss": 0.09760611653327941, "step": 665, "token_acc": 0.9701343952215032 }, { "epoch": 1.4797236614853195, "grad_norm": 2.9796664714813232, "learning_rate": 9.674915247632739e-05, "loss": 0.09098277688026428, "step": 670, "token_acc": 0.9660678642714571 }, { "epoch": 1.490777202072539, "grad_norm": 10.275652885437012, "learning_rate": 9.652991525389337e-05, "loss": 0.08257744312286378, "step": 675, "token_acc": 0.9722084367245658 }, { "epoch": 1.5018307426597581, "grad_norm": 4.8155131340026855, "learning_rate": 9.630378969783547e-05, "loss": 0.07055800557136535, "step": 680, "token_acc": 0.974090682610862 }, { "epoch": 1.5128842832469775, "grad_norm": 3.6014211177825928, "learning_rate": 9.607080928399958e-05, "loss": 0.09370391964912414, "step": 685, "token_acc": 0.9658584858980702 }, { "epoch": 1.523937823834197, "grad_norm": 2.675119400024414, "learning_rate": 9.58310085030313e-05, "loss": 0.09670426845550537, "step": 690, "token_acc": 0.9641969169567379 }, { "epoch": 1.5349913644214164, "grad_norm": 3.324349880218506, "learning_rate": 9.558442285527e-05, "loss": 0.08441510200500488, "step": 695, "token_acc": 0.9696819085487077 }, { "epoch": 1.5460449050086356, "grad_norm": 3.4498190879821777, "learning_rate": 9.533108884549333e-05, "loss": 0.06717776656150817, "step": 700, "token_acc": 0.975597609561753 }, { "epoch": 1.557098445595855, "grad_norm": 2.959386110305786, "learning_rate": 9.50710439775129e-05, "loss": 0.08139981031417846, "step": 705, "token_acc": 0.9747023809523809 }, { "epoch": 1.5681519861830742, "grad_norm": 2.3681604862213135, "learning_rate": 9.480432674862232e-05, "loss": 0.07764554619789124, "step": 710, "token_acc": 0.9675810473815462 }, { "epoch": 1.5792055267702936, "grad_norm": 2.840590715408325, "learning_rate": 9.453097664389789e-05, "loss": 0.08232161402702332, "step": 715, "token_acc": 0.9701789264413518 }, { "epoch": 1.590259067357513, "grad_norm": 2.655217409133911, "learning_rate": 9.425103413035335e-05, "loss": 0.0968110740184784, "step": 720, "token_acc": 0.9689534301452178 }, { "epoch": 1.6013126079447324, "grad_norm": 2.4610400199890137, "learning_rate": 9.396454065094891e-05, "loss": 0.09739276766777039, "step": 725, "token_acc": 0.964729259811227 }, { "epoch": 1.6123661485319516, "grad_norm": 4.132114887237549, "learning_rate": 9.367153861845617e-05, "loss": 0.08105069994926453, "step": 730, "token_acc": 0.9716981132075472 }, { "epoch": 1.623419689119171, "grad_norm": 3.3622491359710693, "learning_rate": 9.337207140917919e-05, "loss": 0.09018557667732238, "step": 735, "token_acc": 0.9642324888226528 }, { "epoch": 1.6344732297063902, "grad_norm": 2.985978364944458, "learning_rate": 9.306618335653307e-05, "loss": 0.08649082779884339, "step": 740, "token_acc": 0.9683011391778108 }, { "epoch": 1.6455267702936096, "grad_norm": 3.509003162384033, "learning_rate": 9.275391974448076e-05, "loss": 0.0744367241859436, "step": 745, "token_acc": 0.9770687936191426 }, { "epoch": 1.656580310880829, "grad_norm": 2.937560796737671, "learning_rate": 9.243532680082915e-05, "loss": 0.07034647464752197, "step": 750, "token_acc": 0.9767211490837048 }, { "epoch": 1.6676338514680484, "grad_norm": 3.8314759731292725, "learning_rate": 9.211045169038554e-05, "loss": 0.07900274395942689, "step": 755, "token_acc": 0.9711729622266402 }, { "epoch": 1.6786873920552678, "grad_norm": 5.002687931060791, "learning_rate": 9.17793425079753e-05, "loss": 0.07675303220748901, "step": 760, "token_acc": 0.9689349112426036 }, { "epoch": 1.689740932642487, "grad_norm": 1.7825733423233032, "learning_rate": 9.144204827132175e-05, "loss": 0.08085300326347351, "step": 765, "token_acc": 0.9716981132075472 }, { "epoch": 1.7007944732297062, "grad_norm": 1.8953182697296143, "learning_rate": 9.10986189137897e-05, "loss": 0.07259147167205811, "step": 770, "token_acc": 0.9760479041916168 }, { "epoch": 1.7118480138169256, "grad_norm": 3.7877821922302246, "learning_rate": 9.074910527699313e-05, "loss": 0.08823164105415345, "step": 775, "token_acc": 0.972139303482587 }, { "epoch": 1.722901554404145, "grad_norm": 3.614501714706421, "learning_rate": 9.039355910326863e-05, "loss": 0.10905979871749878, "step": 780, "token_acc": 0.9652087475149106 }, { "epoch": 1.7339550949913645, "grad_norm": 3.0243847370147705, "learning_rate": 9.00320330280154e-05, "loss": 0.07965280413627625, "step": 785, "token_acc": 0.9723593287265548 }, { "epoch": 1.7450086355785839, "grad_norm": 4.025770664215088, "learning_rate": 8.966458057190301e-05, "loss": 0.07108275294303894, "step": 790, "token_acc": 0.9760956175298805 }, { "epoch": 1.756062176165803, "grad_norm": 2.9816761016845703, "learning_rate": 8.92912561329482e-05, "loss": 0.0776334285736084, "step": 795, "token_acc": 0.9706467661691542 }, { "epoch": 1.7671157167530225, "grad_norm": 3.0242762565612793, "learning_rate": 8.891211497846171e-05, "loss": 0.07837628722190856, "step": 800, "token_acc": 0.974155069582505 }, { "epoch": 1.7671157167530225, "eval_loss": 0.08094792068004608, "eval_runtime": 2286.4128, "eval_samples_per_second": 1.026, "eval_steps_per_second": 1.026, "eval_token_acc": 0.9727608026257909, "step": 800 }, { "epoch": 1.7781692573402417, "grad_norm": 3.771763324737549, "learning_rate": 8.852721323686648e-05, "loss": 0.08302398324012757, "step": 805, "token_acc": 0.9725411882176734 }, { "epoch": 1.789222797927461, "grad_norm": 3.6321651935577393, "learning_rate": 8.813660788938833e-05, "loss": 0.06937822699546814, "step": 810, "token_acc": 0.9772839506172839 }, { "epoch": 1.8002763385146805, "grad_norm": 6.7241644859313965, "learning_rate": 8.774035676162043e-05, "loss": 0.05159105062484741, "step": 815, "token_acc": 0.9829488465396189 }, { "epoch": 1.8113298791019, "grad_norm": 4.365586757659912, "learning_rate": 8.733851851496268e-05, "loss": 0.08399490118026734, "step": 820, "token_acc": 0.9721254355400697 }, { "epoch": 1.8223834196891193, "grad_norm": 3.0722031593322754, "learning_rate": 8.693115263793747e-05, "loss": 0.07215502858161926, "step": 825, "token_acc": 0.9740648379052369 }, { "epoch": 1.8334369602763385, "grad_norm": 4.519292831420898, "learning_rate": 8.651831943738296e-05, "loss": 0.06996339559555054, "step": 830, "token_acc": 0.9770459081836327 }, { "epoch": 1.8444905008635577, "grad_norm": 4.166755199432373, "learning_rate": 8.610008002952513e-05, "loss": 0.07142719030380248, "step": 835, "token_acc": 0.9767211490837048 }, { "epoch": 1.8555440414507771, "grad_norm": 4.686975002288818, "learning_rate": 8.567649633093016e-05, "loss": 0.06802060008049012, "step": 840, "token_acc": 0.9775 }, { "epoch": 1.8665975820379965, "grad_norm": 3.0628044605255127, "learning_rate": 8.524763104933816e-05, "loss": 0.06818159222602845, "step": 845, "token_acc": 0.973644952759821 }, { "epoch": 1.877651122625216, "grad_norm": 3.939176321029663, "learning_rate": 8.481354767437988e-05, "loss": 0.07347306013107299, "step": 850, "token_acc": 0.972568578553616 }, { "epoch": 1.8887046632124354, "grad_norm": 2.8834567070007324, "learning_rate": 8.437431046817769e-05, "loss": 0.06994418501853943, "step": 855, "token_acc": 0.971712158808933 }, { "epoch": 1.8997582037996545, "grad_norm": 3.4866409301757812, "learning_rate": 8.392998445583212e-05, "loss": 0.07565975189208984, "step": 860, "token_acc": 0.9760956175298805 }, { "epoch": 1.910811744386874, "grad_norm": 3.4372429847717285, "learning_rate": 8.348063541579545e-05, "loss": 0.07984944581985473, "step": 865, "token_acc": 0.9727452923686819 }, { "epoch": 1.9218652849740931, "grad_norm": 4.127252578735352, "learning_rate": 8.302632987013388e-05, "loss": 0.07774015665054321, "step": 870, "token_acc": 0.9744872436218109 }, { "epoch": 1.9329188255613126, "grad_norm": 3.960955858230591, "learning_rate": 8.256713507467941e-05, "loss": 0.08486457467079163, "step": 875, "token_acc": 0.9731743666169895 }, { "epoch": 1.943972366148532, "grad_norm": 3.087674617767334, "learning_rate": 8.210311900907339e-05, "loss": 0.07507517337799072, "step": 880, "token_acc": 0.9787023278850916 }, { "epoch": 1.9550259067357514, "grad_norm": 2.5197293758392334, "learning_rate": 8.163435036670261e-05, "loss": 0.08100587725639344, "step": 885, "token_acc": 0.9724724724724725 }, { "epoch": 1.9660794473229708, "grad_norm": 1.6736986637115479, "learning_rate": 8.116089854452995e-05, "loss": 0.07375568151473999, "step": 890, "token_acc": 0.9772727272727273 }, { "epoch": 1.97713298791019, "grad_norm": 3.322634220123291, "learning_rate": 8.068283363282074e-05, "loss": 0.07798144817352295, "step": 895, "token_acc": 0.9738400789733465 }, { "epoch": 1.9881865284974092, "grad_norm": 1.9556145668029785, "learning_rate": 8.020022640476654e-05, "loss": 0.06203848123550415, "step": 900, "token_acc": 0.9791666666666666 }, { "epoch": 1.9992400690846286, "grad_norm": 3.805701494216919, "learning_rate": 7.971314830600783e-05, "loss": 0.06745657324790955, "step": 905, "token_acc": 0.9751491053677932 }, { "epoch": 2.0088428324697754, "grad_norm": 2.904778003692627, "learning_rate": 7.922167144405706e-05, "loss": 0.06268702149391174, "step": 910, "token_acc": 0.9777777777777777 }, { "epoch": 2.0198963730569948, "grad_norm": 3.050584554672241, "learning_rate": 7.87258685776239e-05, "loss": 0.07978938817977906, "step": 915, "token_acc": 0.9681116093672147 }, { "epoch": 2.030949913644214, "grad_norm": 3.2201292514801025, "learning_rate": 7.822581310584388e-05, "loss": 0.07445316910743713, "step": 920, "token_acc": 0.9754509018036072 }, { "epoch": 2.0420034542314336, "grad_norm": 4.6090545654296875, "learning_rate": 7.772157905741231e-05, "loss": 0.06728174090385437, "step": 925, "token_acc": 0.977205153617443 }, { "epoch": 2.053056994818653, "grad_norm": 3.3497800827026367, "learning_rate": 7.721324107962506e-05, "loss": 0.06557589173316955, "step": 930, "token_acc": 0.9775784753363229 }, { "epoch": 2.064110535405872, "grad_norm": 2.1937358379364014, "learning_rate": 7.670087442732763e-05, "loss": 0.05688057541847229, "step": 935, "token_acc": 0.981028457314029 }, { "epoch": 2.0751640759930914, "grad_norm": 5.671027660369873, "learning_rate": 7.618455495177445e-05, "loss": 0.08629457950592041, "step": 940, "token_acc": 0.9695304695304695 }, { "epoch": 2.086217616580311, "grad_norm": 3.635037899017334, "learning_rate": 7.566435908939967e-05, "loss": 0.0566463053226471, "step": 945, "token_acc": 0.9820717131474104 }, { "epoch": 2.09727115716753, "grad_norm": 3.0799365043640137, "learning_rate": 7.514036385050147e-05, "loss": 0.06808796525001526, "step": 950, "token_acc": 0.9766052762568442 }, { "epoch": 2.1083246977547496, "grad_norm": 5.38563871383667, "learning_rate": 7.461264680784151e-05, "loss": 0.07369622588157654, "step": 955, "token_acc": 0.9737363726461843 }, { "epoch": 2.119378238341969, "grad_norm": 3.824101448059082, "learning_rate": 7.408128608516077e-05, "loss": 0.06465582847595215, "step": 960, "token_acc": 0.9786917740336968 }, { "epoch": 2.1304317789291884, "grad_norm": 3.410547971725464, "learning_rate": 7.354636034561418e-05, "loss": 0.051229971647262576, "step": 965, "token_acc": 0.9821428571428571 }, { "epoch": 2.1414853195164074, "grad_norm": 3.721679210662842, "learning_rate": 7.30079487801252e-05, "loss": 0.06965258717536926, "step": 970, "token_acc": 0.9755854509217738 }, { "epoch": 2.152538860103627, "grad_norm": 2.798208236694336, "learning_rate": 7.246613109566238e-05, "loss": 0.07870134711265564, "step": 975, "token_acc": 0.9751243781094527 }, { "epoch": 2.1635924006908462, "grad_norm": 1.8357700109481812, "learning_rate": 7.192098750343935e-05, "loss": 0.0715235412120819, "step": 980, "token_acc": 0.97675568743818 }, { "epoch": 2.1746459412780657, "grad_norm": 3.544813632965088, "learning_rate": 7.137259870704036e-05, "loss": 0.055529987812042235, "step": 985, "token_acc": 0.9841112214498511 }, { "epoch": 2.185699481865285, "grad_norm": 2.2707366943359375, "learning_rate": 7.082104589047285e-05, "loss": 0.05665128231048584, "step": 990, "token_acc": 0.9795102448775612 }, { "epoch": 2.1967530224525045, "grad_norm": 4.324965476989746, "learning_rate": 7.026641070614884e-05, "loss": 0.06112373471260071, "step": 995, "token_acc": 0.9775112443778111 }, { "epoch": 2.207806563039724, "grad_norm": 3.2108352184295654, "learning_rate": 6.970877526279702e-05, "loss": 0.061422485113143924, "step": 1000, "token_acc": 0.9781854238968766 }, { "epoch": 2.207806563039724, "eval_loss": 0.07163181900978088, "eval_runtime": 2173.3435, "eval_samples_per_second": 1.08, "eval_steps_per_second": 1.08, "eval_token_acc": 0.9747910533617569, "step": 1000 }, { "epoch": 2.218860103626943, "grad_norm": 4.137533187866211, "learning_rate": 6.914822211330742e-05, "loss": 0.06986818313598633, "step": 1005, "token_acc": 0.9751367478866235 }, { "epoch": 2.2299136442141623, "grad_norm": 1.7135125398635864, "learning_rate": 6.858483424251001e-05, "loss": 0.0670811414718628, "step": 1010, "token_acc": 0.9776341948310139 }, { "epoch": 2.2409671848013817, "grad_norm": 2.10587739944458, "learning_rate": 6.801869505488969e-05, "loss": 0.06850314140319824, "step": 1015, "token_acc": 0.9761312779711586 }, { "epoch": 2.252020725388601, "grad_norm": 3.5003225803375244, "learning_rate": 6.744988836223893e-05, "loss": 0.06779593229293823, "step": 1020, "token_acc": 0.9794692038057086 }, { "epoch": 2.2630742659758205, "grad_norm": 2.2275619506835938, "learning_rate": 6.687849837125027e-05, "loss": 0.05072577595710755, "step": 1025, "token_acc": 0.9836309523809523 }, { "epoch": 2.2741278065630395, "grad_norm": 2.6360952854156494, "learning_rate": 6.630460967105018e-05, "loss": 0.05415867567062378, "step": 1030, "token_acc": 0.9781746031746031 }, { "epoch": 2.285181347150259, "grad_norm": 1.4451478719711304, "learning_rate": 6.572830722067653e-05, "loss": 0.055239105224609376, "step": 1035, "token_acc": 0.981094527363184 }, { "epoch": 2.2962348877374783, "grad_norm": 1.9783443212509155, "learning_rate": 6.5149676336501e-05, "loss": 0.05858151912689209, "step": 1040, "token_acc": 0.9850224663005491 }, { "epoch": 2.3072884283246977, "grad_norm": 2.043297529220581, "learning_rate": 6.456880267959894e-05, "loss": 0.05577117800712585, "step": 1045, "token_acc": 0.9800697558545092 }, { "epoch": 2.318341968911917, "grad_norm": 4.235545635223389, "learning_rate": 6.39857722430679e-05, "loss": 0.05815597772598267, "step": 1050, "token_acc": 0.9801587301587301 }, { "epoch": 2.3293955094991365, "grad_norm": 5.21077823638916, "learning_rate": 6.340067133929719e-05, "loss": 0.054069459438323975, "step": 1055, "token_acc": 0.9801291604570294 }, { "epoch": 2.340449050086356, "grad_norm": 2.011702537536621, "learning_rate": 6.281358658719011e-05, "loss": 0.07169802188873291, "step": 1060, "token_acc": 0.9752107089737233 }, { "epoch": 2.351502590673575, "grad_norm": 2.3686680793762207, "learning_rate": 6.22246048993407e-05, "loss": 0.05615015029907226, "step": 1065, "token_acc": 0.9800299550673989 }, { "epoch": 2.3625561312607943, "grad_norm": 7.002840995788574, "learning_rate": 6.163381346916732e-05, "loss": 0.06114639043807983, "step": 1070, "token_acc": 0.9760239760239761 }, { "epoch": 2.3736096718480137, "grad_norm": 4.902541160583496, "learning_rate": 6.104129975800427e-05, "loss": 0.07762741446495056, "step": 1075, "token_acc": 0.9730807577268196 }, { "epoch": 2.384663212435233, "grad_norm": 4.661471843719482, "learning_rate": 6.0447151482153955e-05, "loss": 0.06509206891059875, "step": 1080, "token_acc": 0.9771144278606965 }, { "epoch": 2.3957167530224526, "grad_norm": 1.999237298965454, "learning_rate": 5.985145659990138e-05, "loss": 0.06380823254585266, "step": 1085, "token_acc": 0.9796626984126984 }, { "epoch": 2.406770293609672, "grad_norm": 2.382763385772705, "learning_rate": 5.925430329849264e-05, "loss": 0.05442737936973572, "step": 1090, "token_acc": 0.9821073558648111 }, { "epoch": 2.4178238341968914, "grad_norm": 1.8456308841705322, "learning_rate": 5.865577998107961e-05, "loss": 0.04835757613182068, "step": 1095, "token_acc": 0.9831013916500994 }, { "epoch": 2.4288773747841104, "grad_norm": 4.092327117919922, "learning_rate": 5.805597525363263e-05, "loss": 0.07175707817077637, "step": 1100, "token_acc": 0.977589641434263 }, { "epoch": 2.43993091537133, "grad_norm": 2.878070831298828, "learning_rate": 5.745497791182325e-05, "loss": 0.054905033111572264, "step": 1105, "token_acc": 0.9791459781529295 }, { "epoch": 2.450984455958549, "grad_norm": 3.2568767070770264, "learning_rate": 5.685287692787883e-05, "loss": 0.060244417190551756, "step": 1110, "token_acc": 0.9787549407114624 }, { "epoch": 2.4620379965457686, "grad_norm": 3.9391286373138428, "learning_rate": 5.6249761437410895e-05, "loss": 0.07208690047264099, "step": 1115, "token_acc": 0.9765234765234765 }, { "epoch": 2.473091537132988, "grad_norm": 1.7733020782470703, "learning_rate": 5.5645720726219584e-05, "loss": 0.05974746346473694, "step": 1120, "token_acc": 0.9806066633515664 }, { "epoch": 2.4841450777202074, "grad_norm": 4.430456161499023, "learning_rate": 5.504084421707555e-05, "loss": 0.0642861008644104, "step": 1125, "token_acc": 0.9781529294935452 }, { "epoch": 2.495198618307427, "grad_norm": 2.7552433013916016, "learning_rate": 5.443522145648181e-05, "loss": 0.06047917008399963, "step": 1130, "token_acc": 0.9830677290836654 }, { "epoch": 2.506252158894646, "grad_norm": 1.864843487739563, "learning_rate": 5.3828942101417136e-05, "loss": 0.044628658890724184, "step": 1135, "token_acc": 0.9856719367588933 }, { "epoch": 2.5173056994818652, "grad_norm": 4.15530252456665, "learning_rate": 5.322209590606323e-05, "loss": 0.0662376880645752, "step": 1140, "token_acc": 0.9765234765234765 }, { "epoch": 2.5283592400690846, "grad_norm": 3.1273319721221924, "learning_rate": 5.2614772708517324e-05, "loss": 0.06211344003677368, "step": 1145, "token_acc": 0.9807217004448838 }, { "epoch": 2.539412780656304, "grad_norm": 2.2132909297943115, "learning_rate": 5.200706241749257e-05, "loss": 0.05160966515541077, "step": 1150, "token_acc": 0.9830423940149626 }, { "epoch": 2.5504663212435235, "grad_norm": 2.4676833152770996, "learning_rate": 5.1399054999007756e-05, "loss": 0.05153646469116211, "step": 1155, "token_acc": 0.9795816733067729 }, { "epoch": 2.5615198618307424, "grad_norm": 2.5266482830047607, "learning_rate": 5.079084046306877e-05, "loss": 0.05694507360458374, "step": 1160, "token_acc": 0.9826474962816063 }, { "epoch": 2.572573402417962, "grad_norm": 3.3381104469299316, "learning_rate": 5.018250885034328e-05, "loss": 0.056800955533981325, "step": 1165, "token_acc": 0.9816377171215881 }, { "epoch": 2.5836269430051813, "grad_norm": 3.0276429653167725, "learning_rate": 4.957415021883121e-05, "loss": 0.061768895387649535, "step": 1170, "token_acc": 0.9804413239719157 }, { "epoch": 2.5946804835924007, "grad_norm": 1.4948302507400513, "learning_rate": 4.89658546305323e-05, "loss": 0.048909342288970946, "step": 1175, "token_acc": 0.9846306395637084 }, { "epoch": 2.60573402417962, "grad_norm": 2.0935652256011963, "learning_rate": 4.835771213811336e-05, "loss": 0.05250586867332459, "step": 1180, "token_acc": 0.9846534653465346 }, { "epoch": 2.6167875647668395, "grad_norm": 2.5400516986846924, "learning_rate": 4.774981277157673e-05, "loss": 0.05397605299949646, "step": 1185, "token_acc": 0.9816740960871718 }, { "epoch": 2.627841105354059, "grad_norm": 1.8448779582977295, "learning_rate": 4.714224652493212e-05, "loss": 0.0550678551197052, "step": 1190, "token_acc": 0.9800299550673989 }, { "epoch": 2.638894645941278, "grad_norm": 2.3702774047851562, "learning_rate": 4.6535103342873885e-05, "loss": 0.055988776683807376, "step": 1195, "token_acc": 0.9795511221945137 }, { "epoch": 2.6499481865284973, "grad_norm": 3.8877556324005127, "learning_rate": 4.592847310746549e-05, "loss": 0.054580336809158324, "step": 1200, "token_acc": 0.9802078179119248 }, { "epoch": 2.6499481865284973, "eval_loss": 0.06643825024366379, "eval_runtime": 2174.0597, "eval_samples_per_second": 1.08, "eval_steps_per_second": 1.08, "eval_token_acc": 0.9772273542449159, "step": 1200 }, { "epoch": 2.6610017271157167, "grad_norm": 4.951501846313477, "learning_rate": 4.5322445624833255e-05, "loss": 0.05614232420921326, "step": 1205, "token_acc": 0.979571499750872 }, { "epoch": 2.672055267702936, "grad_norm": 1.8844807147979736, "learning_rate": 4.471711061187144e-05, "loss": 0.05867302417755127, "step": 1210, "token_acc": 0.9790523690773068 }, { "epoch": 2.6831088082901555, "grad_norm": 2.765587329864502, "learning_rate": 4.411255768296038e-05, "loss": 0.05516909956932068, "step": 1215, "token_acc": 0.9800697558545092 }, { "epoch": 2.694162348877375, "grad_norm": 3.7630977630615234, "learning_rate": 4.3508876336699974e-05, "loss": 0.05011783838272095, "step": 1220, "token_acc": 0.981555333998006 }, { "epoch": 2.7052158894645943, "grad_norm": 3.2775371074676514, "learning_rate": 4.290615594266013e-05, "loss": 0.04247501492500305, "step": 1225, "token_acc": 0.9855 }, { "epoch": 2.7162694300518133, "grad_norm": 2.6794540882110596, "learning_rate": 4.230448572815053e-05, "loss": 0.04988014101982117, "step": 1230, "token_acc": 0.9826130153999006 }, { "epoch": 2.7273229706390327, "grad_norm": 2.045193910598755, "learning_rate": 4.170395476501119e-05, "loss": 0.04342162907123566, "step": 1235, "token_acc": 0.9841269841269841 }, { "epoch": 2.738376511226252, "grad_norm": 2.58183217048645, "learning_rate": 4.1104651956426296e-05, "loss": 0.04766501486301422, "step": 1240, "token_acc": 0.9831097863884749 }, { "epoch": 2.7494300518134716, "grad_norm": 3.195136785507202, "learning_rate": 4.050666602376287e-05, "loss": 0.05665205121040344, "step": 1245, "token_acc": 0.9766749379652605 }, { "epoch": 2.760483592400691, "grad_norm": 3.2019379138946533, "learning_rate": 3.991008549343626e-05, "loss": 0.07038918733596802, "step": 1250, "token_acc": 0.9775449101796407 }, { "epoch": 2.77153713298791, "grad_norm": 4.3877854347229, "learning_rate": 3.931499868380482e-05, "loss": 0.06642740964889526, "step": 1255, "token_acc": 0.9761904761904762 }, { "epoch": 2.78259067357513, "grad_norm": 1.3834000825881958, "learning_rate": 3.872149369209491e-05, "loss": 0.0616798460483551, "step": 1260, "token_acc": 0.979571499750872 }, { "epoch": 2.7936442141623488, "grad_norm": 4.06312370300293, "learning_rate": 3.8129658381359156e-05, "loss": 0.07107862830162048, "step": 1265, "token_acc": 0.9755244755244755 }, { "epoch": 2.804697754749568, "grad_norm": 3.0957045555114746, "learning_rate": 3.753958036746894e-05, "loss": 0.0476302444934845, "step": 1270, "token_acc": 0.9815645241654211 }, { "epoch": 2.8157512953367876, "grad_norm": 4.347245693206787, "learning_rate": 3.695134700614372e-05, "loss": 0.06514678001403809, "step": 1275, "token_acc": 0.9790836653386454 }, { "epoch": 2.826804835924007, "grad_norm": 3.311340808868408, "learning_rate": 3.636504538001882e-05, "loss": 0.0548922598361969, "step": 1280, "token_acc": 0.9831432821021319 }, { "epoch": 2.8378583765112264, "grad_norm": 1.910610556602478, "learning_rate": 3.5780762285753616e-05, "loss": 0.04039471745491028, "step": 1285, "token_acc": 0.9861454725383474 }, { "epoch": 2.8489119170984454, "grad_norm": 3.0203163623809814, "learning_rate": 3.519858422118206e-05, "loss": 0.06901986002922059, "step": 1290, "token_acc": 0.9781312127236581 }, { "epoch": 2.859965457685665, "grad_norm": 7.763772487640381, "learning_rate": 3.461859737250752e-05, "loss": 0.042749127745628356, "step": 1295, "token_acc": 0.9840637450199203 }, { "epoch": 2.871018998272884, "grad_norm": 4.355055332183838, "learning_rate": 3.4040887601543574e-05, "loss": 0.06063474416732788, "step": 1300, "token_acc": 0.9815277084373439 }, { "epoch": 2.8820725388601036, "grad_norm": 3.254500389099121, "learning_rate": 3.346554043300308e-05, "loss": 0.058100783824920656, "step": 1305, "token_acc": 0.9791976225854383 }, { "epoch": 2.893126079447323, "grad_norm": 4.576897621154785, "learning_rate": 3.289264104183691e-05, "loss": 0.05097652673721313, "step": 1310, "token_acc": 0.983201581027668 }, { "epoch": 2.9041796200345424, "grad_norm": 4.716442584991455, "learning_rate": 3.232227424062464e-05, "loss": 0.05045266747474671, "step": 1315, "token_acc": 0.9829059829059829 }, { "epoch": 2.915233160621762, "grad_norm": 3.8253066539764404, "learning_rate": 3.175452446701873e-05, "loss": 0.05482856035232544, "step": 1320, "token_acc": 0.9791356184798807 }, { "epoch": 2.926286701208981, "grad_norm": 4.78739595413208, "learning_rate": 3.118947577124439e-05, "loss": 0.056392842531204225, "step": 1325, "token_acc": 0.9797130133597229 }, { "epoch": 2.9373402417962002, "grad_norm": 2.2553937435150146, "learning_rate": 3.062721180365669e-05, "loss": 0.05316250324249268, "step": 1330, "token_acc": 0.9816831683168317 }, { "epoch": 2.9483937823834196, "grad_norm": 3.2007408142089844, "learning_rate": 3.0067815802356714e-05, "loss": 0.055870598554611205, "step": 1335, "token_acc": 0.9796526054590571 }, { "epoch": 2.959447322970639, "grad_norm": 1.5691827535629272, "learning_rate": 2.9511370580869213e-05, "loss": 0.04847137331962585, "step": 1340, "token_acc": 0.980635551142006 }, { "epoch": 2.9705008635578585, "grad_norm": 4.0064826011657715, "learning_rate": 2.895795851588252e-05, "loss": 0.061286211013793945, "step": 1345, "token_acc": 0.9805583250249252 }, { "epoch": 2.981554404145078, "grad_norm": 3.6499500274658203, "learning_rate": 2.8407661535053588e-05, "loss": 0.0678468644618988, "step": 1350, "token_acc": 0.9766401590457257 }, { "epoch": 2.9926079447322973, "grad_norm": 3.093632936477661, "learning_rate": 2.7860561104879357e-05, "loss": 0.04808221161365509, "step": 1355, "token_acc": 0.9815920398009951 }, { "epoch": 3.0, "eval_loss": 0.057897526770830154, "eval_runtime": 2238.3251, "eval_samples_per_second": 1.049, "eval_steps_per_second": 1.049, "eval_token_acc": 0.9798328426894055, "step": 1359 } ], "logging_steps": 5, "max_steps": 1359, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.83259479920255e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }