| { | |
| "best_global_step": 45850, | |
| "best_metric": 0.05883299, | |
| "best_model_checkpoint": "/mnt/bn/ocr-generation-lf/zhuhanshen/wuxc/ms-swift/IT-SFT-MERGE-1128/v2-20251128-180713/checkpoint-45850", | |
| "epoch": 2.0, | |
| "eval_steps": 1000, | |
| "global_step": 45850, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 4.362050163576881e-05, | |
| "grad_norm": 36.4930419921875, | |
| "learning_rate": 2.1805494984736157e-09, | |
| "loss": 1.106152057647705, | |
| "step": 1, | |
| "token_acc": 0.768033946251768 | |
| }, | |
| { | |
| "epoch": 0.004362050163576881, | |
| "grad_norm": 12.102117538452148, | |
| "learning_rate": 2.1805494984736155e-07, | |
| "loss": 0.8755137125651041, | |
| "step": 100, | |
| "token_acc": 0.8027415733279217 | |
| }, | |
| { | |
| "epoch": 0.008724100327153763, | |
| "grad_norm": 6.443205833435059, | |
| "learning_rate": 4.361098996947231e-07, | |
| "loss": 0.4035614776611328, | |
| "step": 200, | |
| "token_acc": 0.8832118196196428 | |
| }, | |
| { | |
| "epoch": 0.013086150490730643, | |
| "grad_norm": 9.192578315734863, | |
| "learning_rate": 6.541648495420847e-07, | |
| "loss": 0.319589729309082, | |
| "step": 300, | |
| "token_acc": 0.9065540095345141 | |
| }, | |
| { | |
| "epoch": 0.017448200654307525, | |
| "grad_norm": 5.373167037963867, | |
| "learning_rate": 8.722197993894462e-07, | |
| "loss": 0.27497081756591796, | |
| "step": 400, | |
| "token_acc": 0.9191371328484643 | |
| }, | |
| { | |
| "epoch": 0.021810250817884406, | |
| "grad_norm": 10.11259651184082, | |
| "learning_rate": 1.0902747492368077e-06, | |
| "loss": 0.25123407363891603, | |
| "step": 500, | |
| "token_acc": 0.925168467355569 | |
| }, | |
| { | |
| "epoch": 0.026172300981461286, | |
| "grad_norm": 10.016520500183105, | |
| "learning_rate": 1.3083296990841693e-06, | |
| "loss": 0.23664604187011717, | |
| "step": 600, | |
| "token_acc": 0.9300838425465398 | |
| }, | |
| { | |
| "epoch": 0.030534351145038167, | |
| "grad_norm": 6.1170501708984375, | |
| "learning_rate": 1.526384648931531e-06, | |
| "loss": 0.22281688690185547, | |
| "step": 700, | |
| "token_acc": 0.9335510798908284 | |
| }, | |
| { | |
| "epoch": 0.03489640130861505, | |
| "grad_norm": 5.652917385101318, | |
| "learning_rate": 1.7444395987788924e-06, | |
| "loss": 0.20703411102294922, | |
| "step": 800, | |
| "token_acc": 0.938460532766792 | |
| }, | |
| { | |
| "epoch": 0.03925845147219193, | |
| "grad_norm": 4.53310489654541, | |
| "learning_rate": 1.9624945486262538e-06, | |
| "loss": 0.19653003692626952, | |
| "step": 900, | |
| "token_acc": 0.9402567018683997 | |
| }, | |
| { | |
| "epoch": 0.04362050163576881, | |
| "grad_norm": 5.1060662269592285, | |
| "learning_rate": 2.1805494984736154e-06, | |
| "loss": 0.1954585075378418, | |
| "step": 1000, | |
| "token_acc": 0.9415839373883811 | |
| }, | |
| { | |
| "epoch": 0.04362050163576881, | |
| "eval_loss": 0.16520032286643982, | |
| "eval_runtime": 250.1581, | |
| "eval_samples_per_second": 59.243, | |
| "eval_steps_per_second": 0.927, | |
| "eval_token_acc": 0.9401546990968435, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.04798255179934569, | |
| "grad_norm": 6.413122653961182, | |
| "learning_rate": 2.398604448320977e-06, | |
| "loss": 0.18896270751953126, | |
| "step": 1100, | |
| "token_acc": 0.9428930293702289 | |
| }, | |
| { | |
| "epoch": 0.05234460196292257, | |
| "grad_norm": 5.541325569152832, | |
| "learning_rate": 2.6166593981683387e-06, | |
| "loss": 0.18908491134643554, | |
| "step": 1200, | |
| "token_acc": 0.9417954190899537 | |
| }, | |
| { | |
| "epoch": 0.05670665212649945, | |
| "grad_norm": 7.229689121246338, | |
| "learning_rate": 2.8347143480157003e-06, | |
| "loss": 0.1891629409790039, | |
| "step": 1300, | |
| "token_acc": 0.9430973872957417 | |
| }, | |
| { | |
| "epoch": 0.061068702290076333, | |
| "grad_norm": 8.033756256103516, | |
| "learning_rate": 3.052769297863062e-06, | |
| "loss": 0.1889020347595215, | |
| "step": 1400, | |
| "token_acc": 0.9424450611947768 | |
| }, | |
| { | |
| "epoch": 0.06543075245365322, | |
| "grad_norm": 4.568666458129883, | |
| "learning_rate": 3.2708242477104235e-06, | |
| "loss": 0.18495147705078124, | |
| "step": 1500, | |
| "token_acc": 0.9448793426926312 | |
| }, | |
| { | |
| "epoch": 0.0697928026172301, | |
| "grad_norm": 5.559028148651123, | |
| "learning_rate": 3.4888791975577847e-06, | |
| "loss": 0.1811268424987793, | |
| "step": 1600, | |
| "token_acc": 0.9450138440087491 | |
| }, | |
| { | |
| "epoch": 0.07415485278080698, | |
| "grad_norm": 6.583218574523926, | |
| "learning_rate": 3.7069341474051464e-06, | |
| "loss": 0.18984716415405273, | |
| "step": 1700, | |
| "token_acc": 0.9421092101583751 | |
| }, | |
| { | |
| "epoch": 0.07851690294438386, | |
| "grad_norm": 9.186613082885742, | |
| "learning_rate": 3.9249890972525076e-06, | |
| "loss": 0.17964080810546876, | |
| "step": 1800, | |
| "token_acc": 0.9452867846742794 | |
| }, | |
| { | |
| "epoch": 0.08287895310796074, | |
| "grad_norm": 5.135467052459717, | |
| "learning_rate": 4.14304404709987e-06, | |
| "loss": 0.1825981330871582, | |
| "step": 1900, | |
| "token_acc": 0.944733164609133 | |
| }, | |
| { | |
| "epoch": 0.08724100327153762, | |
| "grad_norm": 10.182524681091309, | |
| "learning_rate": 4.361098996947231e-06, | |
| "loss": 0.177908935546875, | |
| "step": 2000, | |
| "token_acc": 0.9461056495645801 | |
| }, | |
| { | |
| "epoch": 0.08724100327153762, | |
| "eval_loss": 0.1509065330028534, | |
| "eval_runtime": 255.4608, | |
| "eval_samples_per_second": 58.013, | |
| "eval_steps_per_second": 0.908, | |
| "eval_token_acc": 0.9452293660019673, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.0916030534351145, | |
| "grad_norm": 5.487904071807861, | |
| "learning_rate": 4.579153946794593e-06, | |
| "loss": 0.17657140731811524, | |
| "step": 2100, | |
| "token_acc": 0.9470306750061767 | |
| }, | |
| { | |
| "epoch": 0.09596510359869138, | |
| "grad_norm": 6.613377571105957, | |
| "learning_rate": 4.797208896641954e-06, | |
| "loss": 0.18289306640625, | |
| "step": 2200, | |
| "token_acc": 0.9451059860400784 | |
| }, | |
| { | |
| "epoch": 0.10032715376226826, | |
| "grad_norm": 11.444597244262695, | |
| "learning_rate": 4.999999681367619e-06, | |
| "loss": 0.18539167404174806, | |
| "step": 2300, | |
| "token_acc": 0.9451543586613773 | |
| }, | |
| { | |
| "epoch": 0.10468920392584515, | |
| "grad_norm": 6.6478681564331055, | |
| "learning_rate": 4.999925550936637e-06, | |
| "loss": 0.17520687103271484, | |
| "step": 2400, | |
| "token_acc": 0.9466563107809991 | |
| }, | |
| { | |
| "epoch": 0.10905125408942203, | |
| "grad_norm": 5.2834882736206055, | |
| "learning_rate": 4.999721370906455e-06, | |
| "loss": 0.18468694686889647, | |
| "step": 2500, | |
| "token_acc": 0.944860093749592 | |
| }, | |
| { | |
| "epoch": 0.1134133042529989, | |
| "grad_norm": 5.122148036956787, | |
| "learning_rate": 4.9993871518988e-06, | |
| "loss": 0.17573898315429687, | |
| "step": 2600, | |
| "token_acc": 0.9476367310518581 | |
| }, | |
| { | |
| "epoch": 0.11777535441657579, | |
| "grad_norm": 5.123068809509277, | |
| "learning_rate": 4.99892291130021e-06, | |
| "loss": 0.1807489013671875, | |
| "step": 2700, | |
| "token_acc": 0.9463905801777397 | |
| }, | |
| { | |
| "epoch": 0.12213740458015267, | |
| "grad_norm": 5.782346248626709, | |
| "learning_rate": 4.998328673261126e-06, | |
| "loss": 0.17619085311889648, | |
| "step": 2800, | |
| "token_acc": 0.9469496873533967 | |
| }, | |
| { | |
| "epoch": 0.12649945474372956, | |
| "grad_norm": 3.942211151123047, | |
| "learning_rate": 4.9976044686946355e-06, | |
| "loss": 0.17367206573486327, | |
| "step": 2900, | |
| "token_acc": 0.9483415648386821 | |
| }, | |
| { | |
| "epoch": 0.13086150490730644, | |
| "grad_norm": 4.226406097412109, | |
| "learning_rate": 4.996750335274866e-06, | |
| "loss": 0.17105751037597655, | |
| "step": 3000, | |
| "token_acc": 0.948902939369314 | |
| }, | |
| { | |
| "epoch": 0.13086150490730644, | |
| "eval_loss": 0.1425495445728302, | |
| "eval_runtime": 252.4401, | |
| "eval_samples_per_second": 58.707, | |
| "eval_steps_per_second": 0.919, | |
| "eval_token_acc": 0.9480601135652329, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.13522355507088332, | |
| "grad_norm": 5.636692047119141, | |
| "learning_rate": 4.9957663174350245e-06, | |
| "loss": 0.17473587036132812, | |
| "step": 3100, | |
| "token_acc": 0.9479665881897636 | |
| }, | |
| { | |
| "epoch": 0.1395856052344602, | |
| "grad_norm": 6.189432144165039, | |
| "learning_rate": 4.9946524663650856e-06, | |
| "loss": 0.17271400451660157, | |
| "step": 3200, | |
| "token_acc": 0.948410186178116 | |
| }, | |
| { | |
| "epoch": 0.14394765539803708, | |
| "grad_norm": 13.567493438720703, | |
| "learning_rate": 4.993408840009129e-06, | |
| "loss": 0.16095369338989257, | |
| "step": 3300, | |
| "token_acc": 0.951935940949682 | |
| }, | |
| { | |
| "epoch": 0.14830970556161396, | |
| "grad_norm": 7.645431041717529, | |
| "learning_rate": 4.992035503062324e-06, | |
| "loss": 0.17597726821899415, | |
| "step": 3400, | |
| "token_acc": 0.9478462417576773 | |
| }, | |
| { | |
| "epoch": 0.15267175572519084, | |
| "grad_norm": 6.895127296447754, | |
| "learning_rate": 4.990532526967568e-06, | |
| "loss": 0.16777828216552734, | |
| "step": 3500, | |
| "token_acc": 0.9506341819509079 | |
| }, | |
| { | |
| "epoch": 0.15703380588876772, | |
| "grad_norm": 6.079124927520752, | |
| "learning_rate": 4.988899989911762e-06, | |
| "loss": 0.16899423599243163, | |
| "step": 3600, | |
| "token_acc": 0.9485503365232598 | |
| }, | |
| { | |
| "epoch": 0.1613958560523446, | |
| "grad_norm": 4.00687837600708, | |
| "learning_rate": 4.987137976821754e-06, | |
| "loss": 0.15569159507751465, | |
| "step": 3700, | |
| "token_acc": 0.9528974769328038 | |
| }, | |
| { | |
| "epoch": 0.16575790621592149, | |
| "grad_norm": 12.302860260009766, | |
| "learning_rate": 4.98524657935991e-06, | |
| "loss": 0.17022018432617186, | |
| "step": 3800, | |
| "token_acc": 0.9489793956484122 | |
| }, | |
| { | |
| "epoch": 0.17011995637949837, | |
| "grad_norm": 3.2564656734466553, | |
| "learning_rate": 4.983225895919354e-06, | |
| "loss": 0.16914573669433594, | |
| "step": 3900, | |
| "token_acc": 0.9495082261518536 | |
| }, | |
| { | |
| "epoch": 0.17448200654307525, | |
| "grad_norm": 6.390711784362793, | |
| "learning_rate": 4.981076031618844e-06, | |
| "loss": 0.16581764221191406, | |
| "step": 4000, | |
| "token_acc": 0.9505939681455173 | |
| }, | |
| { | |
| "epoch": 0.17448200654307525, | |
| "eval_loss": 0.13437031209468842, | |
| "eval_runtime": 249.7749, | |
| "eval_samples_per_second": 59.333, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9513994455870518, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.17884405670665213, | |
| "grad_norm": 5.330230712890625, | |
| "learning_rate": 4.9787970982973085e-06, | |
| "loss": 0.16594120025634765, | |
| "step": 4100, | |
| "token_acc": 0.9504188464871457 | |
| }, | |
| { | |
| "epoch": 0.183206106870229, | |
| "grad_norm": 4.424008369445801, | |
| "learning_rate": 4.976389214508022e-06, | |
| "loss": 0.16885875701904296, | |
| "step": 4200, | |
| "token_acc": 0.9493527907248485 | |
| }, | |
| { | |
| "epoch": 0.1875681570338059, | |
| "grad_norm": 7.039103031158447, | |
| "learning_rate": 4.973852505512445e-06, | |
| "loss": 0.16290786743164062, | |
| "step": 4300, | |
| "token_acc": 0.9510095676938338 | |
| }, | |
| { | |
| "epoch": 0.19193020719738277, | |
| "grad_norm": 4.440979480743408, | |
| "learning_rate": 4.971187103273701e-06, | |
| "loss": 0.1564283561706543, | |
| "step": 4400, | |
| "token_acc": 0.9533942774165869 | |
| }, | |
| { | |
| "epoch": 0.19629225736095965, | |
| "grad_norm": 3.2784507274627686, | |
| "learning_rate": 4.968393146449718e-06, | |
| "loss": 0.16346294403076173, | |
| "step": 4500, | |
| "token_acc": 0.9512642397065876 | |
| }, | |
| { | |
| "epoch": 0.20065430752453653, | |
| "grad_norm": 4.944409370422363, | |
| "learning_rate": 4.9654707803860095e-06, | |
| "loss": 0.15641886711120606, | |
| "step": 4600, | |
| "token_acc": 0.9534258320991503 | |
| }, | |
| { | |
| "epoch": 0.2050163576881134, | |
| "grad_norm": 4.601531505584717, | |
| "learning_rate": 4.9624201571081164e-06, | |
| "loss": 0.16025123596191407, | |
| "step": 4700, | |
| "token_acc": 0.9523697102048043 | |
| }, | |
| { | |
| "epoch": 0.2093784078516903, | |
| "grad_norm": 6.826204299926758, | |
| "learning_rate": 4.9592414353137e-06, | |
| "loss": 0.15847814559936524, | |
| "step": 4800, | |
| "token_acc": 0.9529489381993481 | |
| }, | |
| { | |
| "epoch": 0.21374045801526717, | |
| "grad_norm": 4.184462070465088, | |
| "learning_rate": 4.955934780364281e-06, | |
| "loss": 0.16379379272460937, | |
| "step": 4900, | |
| "token_acc": 0.9510652183746131 | |
| }, | |
| { | |
| "epoch": 0.21810250817884405, | |
| "grad_norm": 6.352680206298828, | |
| "learning_rate": 4.952500364276644e-06, | |
| "loss": 0.16284820556640625, | |
| "step": 5000, | |
| "token_acc": 0.9512094566302702 | |
| }, | |
| { | |
| "epoch": 0.21810250817884405, | |
| "eval_loss": 0.12947490811347961, | |
| "eval_runtime": 249.5303, | |
| "eval_samples_per_second": 59.392, | |
| "eval_steps_per_second": 0.93, | |
| "eval_token_acc": 0.9529140212823035, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.22246455834242093, | |
| "grad_norm": 6.26838493347168, | |
| "learning_rate": 4.948938365713883e-06, | |
| "loss": 0.16305082321166992, | |
| "step": 5100, | |
| "token_acc": 0.9515196791570008 | |
| }, | |
| { | |
| "epoch": 0.2268266085059978, | |
| "grad_norm": 5.207289218902588, | |
| "learning_rate": 4.9452489699761095e-06, | |
| "loss": 0.15841794967651368, | |
| "step": 5200, | |
| "token_acc": 0.9528224991360029 | |
| }, | |
| { | |
| "epoch": 0.2311886586695747, | |
| "grad_norm": 5.445582389831543, | |
| "learning_rate": 4.941432368990816e-06, | |
| "loss": 0.1492086696624756, | |
| "step": 5300, | |
| "token_acc": 0.9551668315495998 | |
| }, | |
| { | |
| "epoch": 0.23555070883315157, | |
| "grad_norm": 6.060046672821045, | |
| "learning_rate": 4.9374887613028845e-06, | |
| "loss": 0.16065792083740235, | |
| "step": 5400, | |
| "token_acc": 0.9520704754902909 | |
| }, | |
| { | |
| "epoch": 0.23991275899672845, | |
| "grad_norm": 6.347756862640381, | |
| "learning_rate": 4.933418352064265e-06, | |
| "loss": 0.15583457946777343, | |
| "step": 5500, | |
| "token_acc": 0.9537081698366554 | |
| }, | |
| { | |
| "epoch": 0.24427480916030533, | |
| "grad_norm": 5.751275062561035, | |
| "learning_rate": 4.929221353023299e-06, | |
| "loss": 0.17144599914550782, | |
| "step": 5600, | |
| "token_acc": 0.9501105779801089 | |
| }, | |
| { | |
| "epoch": 0.24863685932388221, | |
| "grad_norm": 6.227544784545898, | |
| "learning_rate": 4.924897982513706e-06, | |
| "loss": 0.16434305191040038, | |
| "step": 5700, | |
| "token_acc": 0.9513792820279583 | |
| }, | |
| { | |
| "epoch": 0.2529989094874591, | |
| "grad_norm": 22.748313903808594, | |
| "learning_rate": 4.920448465443224e-06, | |
| "loss": 0.15164663314819335, | |
| "step": 5800, | |
| "token_acc": 0.9544478258894679 | |
| }, | |
| { | |
| "epoch": 0.257360959651036, | |
| "grad_norm": 6.24751615524292, | |
| "learning_rate": 4.91587303328191e-06, | |
| "loss": 0.15351497650146484, | |
| "step": 5900, | |
| "token_acc": 0.9543150580279145 | |
| }, | |
| { | |
| "epoch": 0.2617230098146129, | |
| "grad_norm": 18.683191299438477, | |
| "learning_rate": 4.911171924050102e-06, | |
| "loss": 0.15083285331726073, | |
| "step": 6000, | |
| "token_acc": 0.9558162853385074 | |
| }, | |
| { | |
| "epoch": 0.2617230098146129, | |
| "eval_loss": 0.12582732737064362, | |
| "eval_runtime": 249.7653, | |
| "eval_samples_per_second": 59.336, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9549986586783511, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.26608505997818976, | |
| "grad_norm": 4.456823348999023, | |
| "learning_rate": 4.906345382306029e-06, | |
| "loss": 0.15498119354248047, | |
| "step": 6100, | |
| "token_acc": 0.9538203027177121 | |
| }, | |
| { | |
| "epoch": 0.27044711014176664, | |
| "grad_norm": 5.5751261711120605, | |
| "learning_rate": 4.901393659133097e-06, | |
| "loss": 0.160562744140625, | |
| "step": 6200, | |
| "token_acc": 0.95239975292447 | |
| }, | |
| { | |
| "epoch": 0.2748091603053435, | |
| "grad_norm": 8.7870512008667, | |
| "learning_rate": 4.896317012126823e-06, | |
| "loss": 0.1592543315887451, | |
| "step": 6300, | |
| "token_acc": 0.9525488439137464 | |
| }, | |
| { | |
| "epoch": 0.2791712104689204, | |
| "grad_norm": 6.550958156585693, | |
| "learning_rate": 4.891115705381435e-06, | |
| "loss": 0.15966859817504883, | |
| "step": 6400, | |
| "token_acc": 0.9526111128964875 | |
| }, | |
| { | |
| "epoch": 0.2835332606324973, | |
| "grad_norm": 4.731610298156738, | |
| "learning_rate": 4.885790009476132e-06, | |
| "loss": 0.15472820281982422, | |
| "step": 6500, | |
| "token_acc": 0.9540492592976276 | |
| }, | |
| { | |
| "epoch": 0.28789531079607417, | |
| "grad_norm": 3.9044625759124756, | |
| "learning_rate": 4.880340201461015e-06, | |
| "loss": 0.15400704383850097, | |
| "step": 6600, | |
| "token_acc": 0.9542899599431451 | |
| }, | |
| { | |
| "epoch": 0.29225736095965105, | |
| "grad_norm": 4.739759922027588, | |
| "learning_rate": 4.874766564842662e-06, | |
| "loss": 0.1476905059814453, | |
| "step": 6700, | |
| "token_acc": 0.9566351143445723 | |
| }, | |
| { | |
| "epoch": 0.2966194111232279, | |
| "grad_norm": 4.103377342224121, | |
| "learning_rate": 4.869069389569394e-06, | |
| "loss": 0.14938350677490234, | |
| "step": 6800, | |
| "token_acc": 0.9559722494927678 | |
| }, | |
| { | |
| "epoch": 0.3009814612868048, | |
| "grad_norm": 5.790430545806885, | |
| "learning_rate": 4.863248972016179e-06, | |
| "loss": 0.15306180953979492, | |
| "step": 6900, | |
| "token_acc": 0.9547221557428477 | |
| }, | |
| { | |
| "epoch": 0.3053435114503817, | |
| "grad_norm": 4.019115924835205, | |
| "learning_rate": 4.857305614969224e-06, | |
| "loss": 0.1495196056365967, | |
| "step": 7000, | |
| "token_acc": 0.9557983880238603 | |
| }, | |
| { | |
| "epoch": 0.3053435114503817, | |
| "eval_loss": 0.12187354266643524, | |
| "eval_runtime": 248.7288, | |
| "eval_samples_per_second": 59.583, | |
| "eval_steps_per_second": 0.933, | |
| "eval_token_acc": 0.9559152284717876, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.30970556161395857, | |
| "grad_norm": 4.442187786102295, | |
| "learning_rate": 4.851239627610216e-06, | |
| "loss": 0.14913738250732422, | |
| "step": 7100, | |
| "token_acc": 0.9559660106753005 | |
| }, | |
| { | |
| "epoch": 0.31406761177753545, | |
| "grad_norm": 5.366152286529541, | |
| "learning_rate": 4.8450513255002415e-06, | |
| "loss": 0.14996286392211913, | |
| "step": 7200, | |
| "token_acc": 0.9556774135003047 | |
| }, | |
| { | |
| "epoch": 0.31842966194111233, | |
| "grad_norm": 4.185603618621826, | |
| "learning_rate": 4.838741030563372e-06, | |
| "loss": 0.15599340438842774, | |
| "step": 7300, | |
| "token_acc": 0.953399045204792 | |
| }, | |
| { | |
| "epoch": 0.3227917121046892, | |
| "grad_norm": 5.234697341918945, | |
| "learning_rate": 4.832309071069914e-06, | |
| "loss": 0.15486830711364746, | |
| "step": 7400, | |
| "token_acc": 0.9538847084824248 | |
| }, | |
| { | |
| "epoch": 0.3271537622682661, | |
| "grad_norm": 11.299803733825684, | |
| "learning_rate": 4.825755781619333e-06, | |
| "loss": 0.15180639266967774, | |
| "step": 7500, | |
| "token_acc": 0.9557206634731319 | |
| }, | |
| { | |
| "epoch": 0.33151581243184297, | |
| "grad_norm": 5.087841510772705, | |
| "learning_rate": 4.819081503122847e-06, | |
| "loss": 0.15074671745300294, | |
| "step": 7600, | |
| "token_acc": 0.9549014645668269 | |
| }, | |
| { | |
| "epoch": 0.33587786259541985, | |
| "grad_norm": 4.772754192352295, | |
| "learning_rate": 4.812286582785697e-06, | |
| "loss": 0.15997478485107422, | |
| "step": 7700, | |
| "token_acc": 0.9523766333888324 | |
| }, | |
| { | |
| "epoch": 0.34023991275899673, | |
| "grad_norm": 5.332608222961426, | |
| "learning_rate": 4.805371374089071e-06, | |
| "loss": 0.15052309036254882, | |
| "step": 7800, | |
| "token_acc": 0.9560667154238967 | |
| }, | |
| { | |
| "epoch": 0.3446019629225736, | |
| "grad_norm": 6.446188926696777, | |
| "learning_rate": 4.798336236771733e-06, | |
| "loss": 0.14415891647338866, | |
| "step": 7900, | |
| "token_acc": 0.957361806745217 | |
| }, | |
| { | |
| "epoch": 0.3489640130861505, | |
| "grad_norm": 5.3451032638549805, | |
| "learning_rate": 4.791181536811295e-06, | |
| "loss": 0.14976163864135741, | |
| "step": 8000, | |
| "token_acc": 0.9560646367214662 | |
| }, | |
| { | |
| "epoch": 0.3489640130861505, | |
| "eval_loss": 0.11995680630207062, | |
| "eval_runtime": 247.9086, | |
| "eval_samples_per_second": 59.78, | |
| "eval_steps_per_second": 0.936, | |
| "eval_token_acc": 0.9568038540642047, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.3533260632497274, | |
| "grad_norm": 5.629948616027832, | |
| "learning_rate": 4.783907646405187e-06, | |
| "loss": 0.1462325096130371, | |
| "step": 8100, | |
| "token_acc": 0.9567767130904812 | |
| }, | |
| { | |
| "epoch": 0.35768811341330425, | |
| "grad_norm": 3.4954240322113037, | |
| "learning_rate": 4.7765149439512904e-06, | |
| "loss": 0.1433693313598633, | |
| "step": 8200, | |
| "token_acc": 0.9574897617811616 | |
| }, | |
| { | |
| "epoch": 0.36205016357688113, | |
| "grad_norm": 8.395429611206055, | |
| "learning_rate": 4.769003814028254e-06, | |
| "loss": 0.14990657806396485, | |
| "step": 8300, | |
| "token_acc": 0.955732380643066 | |
| }, | |
| { | |
| "epoch": 0.366412213740458, | |
| "grad_norm": 5.691331386566162, | |
| "learning_rate": 4.761374647375488e-06, | |
| "loss": 0.14862375259399413, | |
| "step": 8400, | |
| "token_acc": 0.9561096803907705 | |
| }, | |
| { | |
| "epoch": 0.3707742639040349, | |
| "grad_norm": 7.700660705566406, | |
| "learning_rate": 4.753627840872838e-06, | |
| "loss": 0.1447600269317627, | |
| "step": 8500, | |
| "token_acc": 0.9575172594313386 | |
| }, | |
| { | |
| "epoch": 0.3751363140676118, | |
| "grad_norm": 5.046755313873291, | |
| "learning_rate": 4.745763797519937e-06, | |
| "loss": 0.14034244537353516, | |
| "step": 8600, | |
| "token_acc": 0.9585937652755961 | |
| }, | |
| { | |
| "epoch": 0.37949836423118866, | |
| "grad_norm": 6.037604808807373, | |
| "learning_rate": 4.737782926415242e-06, | |
| "loss": 0.14538028717041016, | |
| "step": 8700, | |
| "token_acc": 0.956606432091631 | |
| }, | |
| { | |
| "epoch": 0.38386041439476554, | |
| "grad_norm": 5.605445384979248, | |
| "learning_rate": 4.729685642734753e-06, | |
| "loss": 0.14044106483459473, | |
| "step": 8800, | |
| "token_acc": 0.9584157391967529 | |
| }, | |
| { | |
| "epoch": 0.3882224645583424, | |
| "grad_norm": 3.2041120529174805, | |
| "learning_rate": 4.721472367710412e-06, | |
| "loss": 0.1447988510131836, | |
| "step": 8900, | |
| "token_acc": 0.9571702297990734 | |
| }, | |
| { | |
| "epoch": 0.3925845147219193, | |
| "grad_norm": 4.4470343589782715, | |
| "learning_rate": 4.713143528608194e-06, | |
| "loss": 0.14285174369812012, | |
| "step": 9000, | |
| "token_acc": 0.9576638578941464 | |
| }, | |
| { | |
| "epoch": 0.3925845147219193, | |
| "eval_loss": 0.11620920896530151, | |
| "eval_runtime": 250.0276, | |
| "eval_samples_per_second": 59.273, | |
| "eval_steps_per_second": 0.928, | |
| "eval_token_acc": 0.9578210229813109, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.3969465648854962, | |
| "grad_norm": 5.5821356773376465, | |
| "learning_rate": 4.704699558705876e-06, | |
| "loss": 0.1477054214477539, | |
| "step": 9100, | |
| "token_acc": 0.9566980840543882 | |
| }, | |
| { | |
| "epoch": 0.40130861504907306, | |
| "grad_norm": 9.140554428100586, | |
| "learning_rate": 4.6961408972705e-06, | |
| "loss": 0.15634747505187988, | |
| "step": 9200, | |
| "token_acc": 0.9540431258291597 | |
| }, | |
| { | |
| "epoch": 0.40567066521264994, | |
| "grad_norm": 2.6849653720855713, | |
| "learning_rate": 4.687467989535522e-06, | |
| "loss": 0.13982583045959474, | |
| "step": 9300, | |
| "token_acc": 0.9585451193466928 | |
| }, | |
| { | |
| "epoch": 0.4100327153762268, | |
| "grad_norm": 4.507665634155273, | |
| "learning_rate": 4.678681286677644e-06, | |
| "loss": 0.14462839126586913, | |
| "step": 9400, | |
| "token_acc": 0.9575857813842605 | |
| }, | |
| { | |
| "epoch": 0.4143947655398037, | |
| "grad_norm": 4.900688648223877, | |
| "learning_rate": 4.669781245793356e-06, | |
| "loss": 0.14218985557556152, | |
| "step": 9500, | |
| "token_acc": 0.9583531835447112 | |
| }, | |
| { | |
| "epoch": 0.4187568157033806, | |
| "grad_norm": 3.7620060443878174, | |
| "learning_rate": 4.6607683298751435e-06, | |
| "loss": 0.13646729469299315, | |
| "step": 9600, | |
| "token_acc": 0.9592932889999409 | |
| }, | |
| { | |
| "epoch": 0.42311886586695746, | |
| "grad_norm": 5.526325225830078, | |
| "learning_rate": 4.651643007787412e-06, | |
| "loss": 0.13613853454589844, | |
| "step": 9700, | |
| "token_acc": 0.9601200068126138 | |
| }, | |
| { | |
| "epoch": 0.42748091603053434, | |
| "grad_norm": 6.252800464630127, | |
| "learning_rate": 4.642405754242089e-06, | |
| "loss": 0.14448049545288086, | |
| "step": 9800, | |
| "token_acc": 0.9579820567152068 | |
| }, | |
| { | |
| "epoch": 0.4318429661941112, | |
| "grad_norm": 5.828685283660889, | |
| "learning_rate": 4.633057049773932e-06, | |
| "loss": 0.14130407333374023, | |
| "step": 9900, | |
| "token_acc": 0.9588267632506549 | |
| }, | |
| { | |
| "epoch": 0.4362050163576881, | |
| "grad_norm": 6.362551689147949, | |
| "learning_rate": 4.623597380715534e-06, | |
| "loss": 0.1415507125854492, | |
| "step": 10000, | |
| "token_acc": 0.9588815789473685 | |
| }, | |
| { | |
| "epoch": 0.4362050163576881, | |
| "eval_loss": 0.11561030149459839, | |
| "eval_runtime": 244.6104, | |
| "eval_samples_per_second": 60.586, | |
| "eval_steps_per_second": 0.948, | |
| "eval_token_acc": 0.9586286103907717, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.440567066521265, | |
| "grad_norm": 4.7389912605285645, | |
| "learning_rate": 4.614027239172017e-06, | |
| "loss": 0.13512346267700195, | |
| "step": 10100, | |
| "token_acc": 0.9598309226603262 | |
| }, | |
| { | |
| "epoch": 0.44492911668484186, | |
| "grad_norm": 6.846667289733887, | |
| "learning_rate": 4.604347122995434e-06, | |
| "loss": 0.1509090805053711, | |
| "step": 10200, | |
| "token_acc": 0.9573597748543753 | |
| }, | |
| { | |
| "epoch": 0.44929116684841874, | |
| "grad_norm": 6.515282154083252, | |
| "learning_rate": 4.594557535758874e-06, | |
| "loss": 0.1417671775817871, | |
| "step": 10300, | |
| "token_acc": 0.9583131932015624 | |
| }, | |
| { | |
| "epoch": 0.4536532170119956, | |
| "grad_norm": 4.678475379943848, | |
| "learning_rate": 4.584658986730261e-06, | |
| "loss": 0.13684335708618164, | |
| "step": 10400, | |
| "token_acc": 0.9602577662866768 | |
| }, | |
| { | |
| "epoch": 0.4580152671755725, | |
| "grad_norm": 7.365673542022705, | |
| "learning_rate": 4.574651990845864e-06, | |
| "loss": 0.13511247634887696, | |
| "step": 10500, | |
| "token_acc": 0.9606173846680176 | |
| }, | |
| { | |
| "epoch": 0.4623773173391494, | |
| "grad_norm": 8.739450454711914, | |
| "learning_rate": 4.564537068683507e-06, | |
| "loss": 0.1433129596710205, | |
| "step": 10600, | |
| "token_acc": 0.9589074287510477 | |
| }, | |
| { | |
| "epoch": 0.46673936750272627, | |
| "grad_norm": 5.756815433502197, | |
| "learning_rate": 4.554314746435487e-06, | |
| "loss": 0.13850313186645508, | |
| "step": 10700, | |
| "token_acc": 0.9598506612832259 | |
| }, | |
| { | |
| "epoch": 0.47110141766630315, | |
| "grad_norm": 5.801908493041992, | |
| "learning_rate": 4.543985555881208e-06, | |
| "loss": 0.1446135425567627, | |
| "step": 10800, | |
| "token_acc": 0.9583247382806457 | |
| }, | |
| { | |
| "epoch": 0.47546346782988, | |
| "grad_norm": 5.760519504547119, | |
| "learning_rate": 4.533550034359506e-06, | |
| "loss": 0.14331352233886718, | |
| "step": 10900, | |
| "token_acc": 0.9582483464207748 | |
| }, | |
| { | |
| "epoch": 0.4798255179934569, | |
| "grad_norm": 4.116934776306152, | |
| "learning_rate": 4.523008724740705e-06, | |
| "loss": 0.14439311981201172, | |
| "step": 11000, | |
| "token_acc": 0.9575995368881456 | |
| }, | |
| { | |
| "epoch": 0.4798255179934569, | |
| "eval_loss": 0.11400242149829865, | |
| "eval_runtime": 249.7471, | |
| "eval_samples_per_second": 59.34, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9592769158544219, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.4841875681570338, | |
| "grad_norm": 3.357717990875244, | |
| "learning_rate": 4.512362175398371e-06, | |
| "loss": 0.14085225105285645, | |
| "step": 11100, | |
| "token_acc": 0.9586402774839236 | |
| }, | |
| { | |
| "epoch": 0.48854961832061067, | |
| "grad_norm": 6.080904483795166, | |
| "learning_rate": 4.501610940180789e-06, | |
| "loss": 0.1399165916442871, | |
| "step": 11200, | |
| "token_acc": 0.9586773087105651 | |
| }, | |
| { | |
| "epoch": 0.49291166848418755, | |
| "grad_norm": 5.0486273765563965, | |
| "learning_rate": 4.490755578382145e-06, | |
| "loss": 0.14417312622070313, | |
| "step": 11300, | |
| "token_acc": 0.9579395177290202 | |
| }, | |
| { | |
| "epoch": 0.49727371864776443, | |
| "grad_norm": 7.045108318328857, | |
| "learning_rate": 4.479796654713438e-06, | |
| "loss": 0.1347987174987793, | |
| "step": 11400, | |
| "token_acc": 0.9606777638720471 | |
| }, | |
| { | |
| "epoch": 0.5016357688113413, | |
| "grad_norm": 4.105020523071289, | |
| "learning_rate": 4.468734739273095e-06, | |
| "loss": 0.13610049247741698, | |
| "step": 11500, | |
| "token_acc": 0.9600762304805243 | |
| }, | |
| { | |
| "epoch": 0.5059978189749182, | |
| "grad_norm": 4.2766947746276855, | |
| "learning_rate": 4.457570407517324e-06, | |
| "loss": 0.13926528930664062, | |
| "step": 11600, | |
| "token_acc": 0.9589045543988622 | |
| }, | |
| { | |
| "epoch": 0.5103598691384951, | |
| "grad_norm": 17.53934097290039, | |
| "learning_rate": 4.446304240230167e-06, | |
| "loss": 0.13595869064331054, | |
| "step": 11700, | |
| "token_acc": 0.9598472992528664 | |
| }, | |
| { | |
| "epoch": 0.514721919302072, | |
| "grad_norm": 5.563167095184326, | |
| "learning_rate": 4.434936823493293e-06, | |
| "loss": 0.13069713592529297, | |
| "step": 11800, | |
| "token_acc": 0.9616947223112704 | |
| }, | |
| { | |
| "epoch": 0.5190839694656488, | |
| "grad_norm": 8.064498901367188, | |
| "learning_rate": 4.4234687486555084e-06, | |
| "loss": 0.13252490997314453, | |
| "step": 11900, | |
| "token_acc": 0.961364667811125 | |
| }, | |
| { | |
| "epoch": 0.5234460196292258, | |
| "grad_norm": 4.162057399749756, | |
| "learning_rate": 4.4119006123019945e-06, | |
| "loss": 0.14607376098632813, | |
| "step": 12000, | |
| "token_acc": 0.957758365222321 | |
| }, | |
| { | |
| "epoch": 0.5234460196292258, | |
| "eval_loss": 0.11156473308801651, | |
| "eval_runtime": 250.7613, | |
| "eval_samples_per_second": 59.1, | |
| "eval_steps_per_second": 0.925, | |
| "eval_token_acc": 0.9600286148618439, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.5278080697928026, | |
| "grad_norm": 4.963797092437744, | |
| "learning_rate": 4.400233016223271e-06, | |
| "loss": 0.13640110015869142, | |
| "step": 12100, | |
| "token_acc": 0.9606004612884625 | |
| }, | |
| { | |
| "epoch": 0.5321701199563795, | |
| "grad_norm": 5.048738956451416, | |
| "learning_rate": 4.38846656738389e-06, | |
| "loss": 0.15161012649536132, | |
| "step": 12200, | |
| "token_acc": 0.9557634182813146 | |
| }, | |
| { | |
| "epoch": 0.5365321701199564, | |
| "grad_norm": 7.779881000518799, | |
| "learning_rate": 4.3766018778908615e-06, | |
| "loss": 0.1338326835632324, | |
| "step": 12300, | |
| "token_acc": 0.9601769911504425 | |
| }, | |
| { | |
| "epoch": 0.5408942202835333, | |
| "grad_norm": 4.614696502685547, | |
| "learning_rate": 4.364639564961812e-06, | |
| "loss": 0.12876497268676756, | |
| "step": 12400, | |
| "token_acc": 0.9624142336006742 | |
| }, | |
| { | |
| "epoch": 0.5452562704471101, | |
| "grad_norm": 4.806646823883057, | |
| "learning_rate": 4.352580250892875e-06, | |
| "loss": 0.134474515914917, | |
| "step": 12500, | |
| "token_acc": 0.9602631174026773 | |
| }, | |
| { | |
| "epoch": 0.549618320610687, | |
| "grad_norm": 4.253026008605957, | |
| "learning_rate": 4.340424563026315e-06, | |
| "loss": 0.14678755760192871, | |
| "step": 12600, | |
| "token_acc": 0.957807593859406 | |
| }, | |
| { | |
| "epoch": 0.5539803707742639, | |
| "grad_norm": 6.865671157836914, | |
| "learning_rate": 4.328173133717899e-06, | |
| "loss": 0.13899526596069336, | |
| "step": 12700, | |
| "token_acc": 0.9594970899385832 | |
| }, | |
| { | |
| "epoch": 0.5583424209378408, | |
| "grad_norm": 4.684392929077148, | |
| "learning_rate": 4.315826600303994e-06, | |
| "loss": 0.13189617156982422, | |
| "step": 12800, | |
| "token_acc": 0.9610336745212099 | |
| }, | |
| { | |
| "epoch": 0.5627044711014176, | |
| "grad_norm": 4.648969650268555, | |
| "learning_rate": 4.303385605068417e-06, | |
| "loss": 0.13608951568603517, | |
| "step": 12900, | |
| "token_acc": 0.9604944752442288 | |
| }, | |
| { | |
| "epoch": 0.5670665212649946, | |
| "grad_norm": 5.1123833656311035, | |
| "learning_rate": 4.2908507952090174e-06, | |
| "loss": 0.13563767433166504, | |
| "step": 13000, | |
| "token_acc": 0.960596330573991 | |
| }, | |
| { | |
| "epoch": 0.5670665212649946, | |
| "eval_loss": 0.1084199920296669, | |
| "eval_runtime": 254.881, | |
| "eval_samples_per_second": 58.145, | |
| "eval_steps_per_second": 0.91, | |
| "eval_token_acc": 0.9608250245908969, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 4.262386322021484, | |
| "learning_rate": 4.278222822804015e-06, | |
| "loss": 0.13114137649536134, | |
| "step": 13100, | |
| "token_acc": 0.962146988771691 | |
| }, | |
| { | |
| "epoch": 0.5757906215921483, | |
| "grad_norm": 8.251741409301758, | |
| "learning_rate": 4.265502344778071e-06, | |
| "loss": 0.13535155296325685, | |
| "step": 13200, | |
| "token_acc": 0.9604900042755529 | |
| }, | |
| { | |
| "epoch": 0.5801526717557252, | |
| "grad_norm": 5.114648342132568, | |
| "learning_rate": 4.252690022868119e-06, | |
| "loss": 0.13799442291259767, | |
| "step": 13300, | |
| "token_acc": 0.9601804553738738 | |
| }, | |
| { | |
| "epoch": 0.5845147219193021, | |
| "grad_norm": 4.916436195373535, | |
| "learning_rate": 4.239786523588941e-06, | |
| "loss": 0.13074012756347655, | |
| "step": 13400, | |
| "token_acc": 0.961597333022013 | |
| }, | |
| { | |
| "epoch": 0.5888767720828789, | |
| "grad_norm": 5.750058174133301, | |
| "learning_rate": 4.22679251819849e-06, | |
| "loss": 0.12944320678710938, | |
| "step": 13500, | |
| "token_acc": 0.9623505189348078 | |
| }, | |
| { | |
| "epoch": 0.5932388222464559, | |
| "grad_norm": 4.390042781829834, | |
| "learning_rate": 4.2137086826629735e-06, | |
| "loss": 0.12797122001647948, | |
| "step": 13600, | |
| "token_acc": 0.9626673415487246 | |
| }, | |
| { | |
| "epoch": 0.5976008724100327, | |
| "grad_norm": 5.039231777191162, | |
| "learning_rate": 4.200535697621687e-06, | |
| "loss": 0.1344309616088867, | |
| "step": 13700, | |
| "token_acc": 0.9587961989254508 | |
| }, | |
| { | |
| "epoch": 0.6019629225736096, | |
| "grad_norm": 4.506139278411865, | |
| "learning_rate": 4.187274248351607e-06, | |
| "loss": 0.12951894760131835, | |
| "step": 13800, | |
| "token_acc": 0.9618089667601382 | |
| }, | |
| { | |
| "epoch": 0.6063249727371864, | |
| "grad_norm": 4.769443035125732, | |
| "learning_rate": 4.173925024731744e-06, | |
| "loss": 0.13222533226013183, | |
| "step": 13900, | |
| "token_acc": 0.961495694448036 | |
| }, | |
| { | |
| "epoch": 0.6106870229007634, | |
| "grad_norm": 5.142453193664551, | |
| "learning_rate": 4.1604887212072515e-06, | |
| "loss": 0.1336323833465576, | |
| "step": 14000, | |
| "token_acc": 0.9605494391895384 | |
| }, | |
| { | |
| "epoch": 0.6106870229007634, | |
| "eval_loss": 0.10780877619981766, | |
| "eval_runtime": 254.9746, | |
| "eval_samples_per_second": 58.123, | |
| "eval_steps_per_second": 0.91, | |
| "eval_token_acc": 0.9610849056603774, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.6150490730643402, | |
| "grad_norm": 4.960557460784912, | |
| "learning_rate": 4.146966036753298e-06, | |
| "loss": 0.12569026947021483, | |
| "step": 14100, | |
| "token_acc": 0.9631472014680823 | |
| }, | |
| { | |
| "epoch": 0.6194111232279171, | |
| "grad_norm": 6.882476806640625, | |
| "learning_rate": 4.133357674838711e-06, | |
| "loss": 0.1261822509765625, | |
| "step": 14200, | |
| "token_acc": 0.962310356794416 | |
| }, | |
| { | |
| "epoch": 0.623773173391494, | |
| "grad_norm": 4.472857475280762, | |
| "learning_rate": 4.119664343389379e-06, | |
| "loss": 0.14244023323059082, | |
| "step": 14300, | |
| "token_acc": 0.9589147138040676 | |
| }, | |
| { | |
| "epoch": 0.6281352235550709, | |
| "grad_norm": 4.520143508911133, | |
| "learning_rate": 4.105886754751419e-06, | |
| "loss": 0.14673561096191406, | |
| "step": 14400, | |
| "token_acc": 0.9569507419620775 | |
| }, | |
| { | |
| "epoch": 0.6324972737186477, | |
| "grad_norm": 5.614750385284424, | |
| "learning_rate": 4.092025625654129e-06, | |
| "loss": 0.13182031631469726, | |
| "step": 14500, | |
| "token_acc": 0.9618331926192794 | |
| }, | |
| { | |
| "epoch": 0.6368593238822247, | |
| "grad_norm": 5.235232830047607, | |
| "learning_rate": 4.078081677172695e-06, | |
| "loss": 0.13167083740234375, | |
| "step": 14600, | |
| "token_acc": 0.9610090699262053 | |
| }, | |
| { | |
| "epoch": 0.6412213740458015, | |
| "grad_norm": 6.400154113769531, | |
| "learning_rate": 4.064055634690684e-06, | |
| "loss": 0.1283753204345703, | |
| "step": 14700, | |
| "token_acc": 0.9634285038096678 | |
| }, | |
| { | |
| "epoch": 0.6455834242093784, | |
| "grad_norm": 4.708637237548828, | |
| "learning_rate": 4.049948227862305e-06, | |
| "loss": 0.12938753128051758, | |
| "step": 14800, | |
| "token_acc": 0.9620657866347606 | |
| }, | |
| { | |
| "epoch": 0.6499454743729552, | |
| "grad_norm": 3.1443099975585938, | |
| "learning_rate": 4.0357601905744545e-06, | |
| "loss": 0.1246177864074707, | |
| "step": 14900, | |
| "token_acc": 0.9637490695177302 | |
| }, | |
| { | |
| "epoch": 0.6543075245365322, | |
| "grad_norm": 4.834894180297852, | |
| "learning_rate": 4.021492260908538e-06, | |
| "loss": 0.13603127479553223, | |
| "step": 15000, | |
| "token_acc": 0.9604635702322964 | |
| }, | |
| { | |
| "epoch": 0.6543075245365322, | |
| "eval_loss": 0.10531982779502869, | |
| "eval_runtime": 249.9605, | |
| "eval_samples_per_second": 59.289, | |
| "eval_steps_per_second": 0.928, | |
| "eval_token_acc": 0.9620853080568721, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.658669574700109, | |
| "grad_norm": 6.40881872177124, | |
| "learning_rate": 4.007145181102073e-06, | |
| "loss": 0.13621252059936523, | |
| "step": 15100, | |
| "token_acc": 0.9602403567077255 | |
| }, | |
| { | |
| "epoch": 0.6630316248636859, | |
| "grad_norm": 4.037698268890381, | |
| "learning_rate": 3.992719697510077e-06, | |
| "loss": 0.12186765670776367, | |
| "step": 15200, | |
| "token_acc": 0.9640692471775663 | |
| }, | |
| { | |
| "epoch": 0.6673936750272628, | |
| "grad_norm": 3.7594399452209473, | |
| "learning_rate": 3.97821656056624e-06, | |
| "loss": 0.12971938133239747, | |
| "step": 15300, | |
| "token_acc": 0.9622328323773512 | |
| }, | |
| { | |
| "epoch": 0.6717557251908397, | |
| "grad_norm": 5.0803422927856445, | |
| "learning_rate": 3.963636524743891e-06, | |
| "loss": 0.1299113655090332, | |
| "step": 15400, | |
| "token_acc": 0.9613922835754047 | |
| }, | |
| { | |
| "epoch": 0.6761177753544165, | |
| "grad_norm": 10.828523635864258, | |
| "learning_rate": 3.948980348516742e-06, | |
| "loss": 0.13089241981506347, | |
| "step": 15500, | |
| "token_acc": 0.9614092769880411 | |
| }, | |
| { | |
| "epoch": 0.6804798255179935, | |
| "grad_norm": 4.439146518707275, | |
| "learning_rate": 3.934248794319435e-06, | |
| "loss": 0.12397289276123047, | |
| "step": 15600, | |
| "token_acc": 0.963767595283974 | |
| }, | |
| { | |
| "epoch": 0.6848418756815703, | |
| "grad_norm": 4.211355209350586, | |
| "learning_rate": 3.9194426285078794e-06, | |
| "loss": 0.12579789161682128, | |
| "step": 15700, | |
| "token_acc": 0.9628140306004733 | |
| }, | |
| { | |
| "epoch": 0.6892039258451472, | |
| "grad_norm": 6.285272598266602, | |
| "learning_rate": 3.904562621319385e-06, | |
| "loss": 0.12318273544311524, | |
| "step": 15800, | |
| "token_acc": 0.9640143499860016 | |
| }, | |
| { | |
| "epoch": 0.693565976008724, | |
| "grad_norm": 5.098615646362305, | |
| "learning_rate": 3.889609546832592e-06, | |
| "loss": 0.12975415229797363, | |
| "step": 15900, | |
| "token_acc": 0.9626485457720456 | |
| }, | |
| { | |
| "epoch": 0.697928026172301, | |
| "grad_norm": 4.703002452850342, | |
| "learning_rate": 3.874584182927203e-06, | |
| "loss": 0.1291140651702881, | |
| "step": 16000, | |
| "token_acc": 0.9626422307463804 | |
| }, | |
| { | |
| "epoch": 0.697928026172301, | |
| "eval_loss": 0.1023755595088005, | |
| "eval_runtime": 249.8751, | |
| "eval_samples_per_second": 59.31, | |
| "eval_steps_per_second": 0.928, | |
| "eval_token_acc": 0.9632030760976482, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.7022900763358778, | |
| "grad_norm": 5.157517910003662, | |
| "learning_rate": 3.8594873112435155e-06, | |
| "loss": 0.12747785568237305, | |
| "step": 16100, | |
| "token_acc": 0.9628675375791396 | |
| }, | |
| { | |
| "epoch": 0.7066521264994547, | |
| "grad_norm": 4.166143894195557, | |
| "learning_rate": 3.844319717141764e-06, | |
| "loss": 0.12231754302978516, | |
| "step": 16200, | |
| "token_acc": 0.9642421806110275 | |
| }, | |
| { | |
| "epoch": 0.7110141766630316, | |
| "grad_norm": 5.694368839263916, | |
| "learning_rate": 3.829082189661256e-06, | |
| "loss": 0.12773503303527833, | |
| "step": 16300, | |
| "token_acc": 0.9625522173794959 | |
| }, | |
| { | |
| "epoch": 0.7153762268266085, | |
| "grad_norm": 18.599973678588867, | |
| "learning_rate": 3.8137755214793358e-06, | |
| "loss": 0.12870462417602538, | |
| "step": 16400, | |
| "token_acc": 0.963150284289799 | |
| }, | |
| { | |
| "epoch": 0.7197382769901853, | |
| "grad_norm": 4.071221828460693, | |
| "learning_rate": 3.798400508870142e-06, | |
| "loss": 0.12437307357788085, | |
| "step": 16500, | |
| "token_acc": 0.9635778517683454 | |
| }, | |
| { | |
| "epoch": 0.7241003271537623, | |
| "grad_norm": 3.744788885116577, | |
| "learning_rate": 3.7829579516631824e-06, | |
| "loss": 0.1270986557006836, | |
| "step": 16600, | |
| "token_acc": 0.9625778535370091 | |
| }, | |
| { | |
| "epoch": 0.7284623773173392, | |
| "grad_norm": 4.1468186378479, | |
| "learning_rate": 3.7674486532017314e-06, | |
| "loss": 0.12220178604125977, | |
| "step": 16700, | |
| "token_acc": 0.9635083061616383 | |
| }, | |
| { | |
| "epoch": 0.732824427480916, | |
| "grad_norm": 4.590456008911133, | |
| "learning_rate": 3.7518734203010353e-06, | |
| "loss": 0.12810008049011232, | |
| "step": 16800, | |
| "token_acc": 0.9624704568475184 | |
| }, | |
| { | |
| "epoch": 0.737186477644493, | |
| "grad_norm": 4.199416160583496, | |
| "learning_rate": 3.7362330632063403e-06, | |
| "loss": 0.125726900100708, | |
| "step": 16900, | |
| "token_acc": 0.9630880153604012 | |
| }, | |
| { | |
| "epoch": 0.7415485278080698, | |
| "grad_norm": 7.344795227050781, | |
| "learning_rate": 3.7205283955507455e-06, | |
| "loss": 0.13037595748901368, | |
| "step": 17000, | |
| "token_acc": 0.9619103020492814 | |
| }, | |
| { | |
| "epoch": 0.7415485278080698, | |
| "eval_loss": 0.09986680746078491, | |
| "eval_runtime": 249.7928, | |
| "eval_samples_per_second": 59.329, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9643292273987302, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.7459105779716467, | |
| "grad_norm": 4.345545768737793, | |
| "learning_rate": 3.704760234312874e-06, | |
| "loss": 0.12362366676330566, | |
| "step": 17100, | |
| "token_acc": 0.9646065349863416 | |
| }, | |
| { | |
| "epoch": 0.7502726281352236, | |
| "grad_norm": 4.425832271575928, | |
| "learning_rate": 3.6889293997743723e-06, | |
| "loss": 0.11739083290100098, | |
| "step": 17200, | |
| "token_acc": 0.965255087601945 | |
| }, | |
| { | |
| "epoch": 0.7546346782988005, | |
| "grad_norm": 5.167355537414551, | |
| "learning_rate": 3.6730367154772414e-06, | |
| "loss": 0.12582969665527344, | |
| "step": 17300, | |
| "token_acc": 0.9627481582817909 | |
| }, | |
| { | |
| "epoch": 0.7589967284623773, | |
| "grad_norm": 3.649265766143799, | |
| "learning_rate": 3.65708300818099e-06, | |
| "loss": 0.13049283981323243, | |
| "step": 17400, | |
| "token_acc": 0.9616916513063766 | |
| }, | |
| { | |
| "epoch": 0.7633587786259542, | |
| "grad_norm": 5.919361114501953, | |
| "learning_rate": 3.6410691078196285e-06, | |
| "loss": 0.12486766815185547, | |
| "step": 17500, | |
| "token_acc": 0.9645121544266756 | |
| }, | |
| { | |
| "epoch": 0.7677208287895311, | |
| "grad_norm": 3.0380961894989014, | |
| "learning_rate": 3.6249958474584954e-06, | |
| "loss": 0.12427781105041504, | |
| "step": 17600, | |
| "token_acc": 0.9628813407305854 | |
| }, | |
| { | |
| "epoch": 0.772082878953108, | |
| "grad_norm": 6.409759521484375, | |
| "learning_rate": 3.6088640632509196e-06, | |
| "loss": 0.12428503036499024, | |
| "step": 17700, | |
| "token_acc": 0.9637545993458708 | |
| }, | |
| { | |
| "epoch": 0.7764449291166848, | |
| "grad_norm": 6.19067907333374, | |
| "learning_rate": 3.5926745943947185e-06, | |
| "loss": 0.12289785385131836, | |
| "step": 17800, | |
| "token_acc": 0.963990416541882 | |
| }, | |
| { | |
| "epoch": 0.7808069792802618, | |
| "grad_norm": 3.6801064014434814, | |
| "learning_rate": 3.5764282830885467e-06, | |
| "loss": 0.1264265727996826, | |
| "step": 17900, | |
| "token_acc": 0.9629325823828039 | |
| }, | |
| { | |
| "epoch": 0.7851690294438386, | |
| "grad_norm": 10.158914566040039, | |
| "learning_rate": 3.5601259744880833e-06, | |
| "loss": 0.12125222206115722, | |
| "step": 18000, | |
| "token_acc": 0.9642156023136009 | |
| }, | |
| { | |
| "epoch": 0.7851690294438386, | |
| "eval_loss": 0.0974874198436737, | |
| "eval_runtime": 249.4923, | |
| "eval_samples_per_second": 59.401, | |
| "eval_steps_per_second": 0.93, | |
| "eval_token_acc": 0.9646729410712689, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.7895310796074155, | |
| "grad_norm": 7.951091289520264, | |
| "learning_rate": 3.543768516662063e-06, | |
| "loss": 0.12031344413757324, | |
| "step": 18100, | |
| "token_acc": 0.9641105894627021 | |
| }, | |
| { | |
| "epoch": 0.7938931297709924, | |
| "grad_norm": 4.855859756469727, | |
| "learning_rate": 3.527356760548159e-06, | |
| "loss": 0.11818990707397461, | |
| "step": 18200, | |
| "token_acc": 0.9647326801868844 | |
| }, | |
| { | |
| "epoch": 0.7982551799345693, | |
| "grad_norm": 8.738151550292969, | |
| "learning_rate": 3.51089155990872e-06, | |
| "loss": 0.1165078067779541, | |
| "step": 18300, | |
| "token_acc": 0.9654456415279138 | |
| }, | |
| { | |
| "epoch": 0.8026172300981461, | |
| "grad_norm": 5.64874792098999, | |
| "learning_rate": 3.494373771286349e-06, | |
| "loss": 0.120614013671875, | |
| "step": 18400, | |
| "token_acc": 0.9641536709864776 | |
| }, | |
| { | |
| "epoch": 0.806979280261723, | |
| "grad_norm": 4.42671537399292, | |
| "learning_rate": 3.477804253959352e-06, | |
| "loss": 0.1219012451171875, | |
| "step": 18500, | |
| "token_acc": 0.9641036100346726 | |
| }, | |
| { | |
| "epoch": 0.8113413304252999, | |
| "grad_norm": 4.415754318237305, | |
| "learning_rate": 3.4611838698970335e-06, | |
| "loss": 0.11962775230407714, | |
| "step": 18600, | |
| "token_acc": 0.9655911722473921 | |
| }, | |
| { | |
| "epoch": 0.8157033805888768, | |
| "grad_norm": 6.6899027824401855, | |
| "learning_rate": 3.4445134837148553e-06, | |
| "loss": 0.11862802505493164, | |
| "step": 18700, | |
| "token_acc": 0.9649808710911543 | |
| }, | |
| { | |
| "epoch": 0.8200654307524536, | |
| "grad_norm": 3.721449375152588, | |
| "learning_rate": 3.427793962629459e-06, | |
| "loss": 0.1211440086364746, | |
| "step": 18800, | |
| "token_acc": 0.9638717768052799 | |
| }, | |
| { | |
| "epoch": 0.8244274809160306, | |
| "grad_norm": 4.877586841583252, | |
| "learning_rate": 3.4110261764135525e-06, | |
| "loss": 0.11775184631347656, | |
| "step": 18900, | |
| "token_acc": 0.9653165583354041 | |
| }, | |
| { | |
| "epoch": 0.8287895310796074, | |
| "grad_norm": 4.989749908447266, | |
| "learning_rate": 3.3942109973506636e-06, | |
| "loss": 0.1220939540863037, | |
| "step": 19000, | |
| "token_acc": 0.9634719545856987 | |
| }, | |
| { | |
| "epoch": 0.8287895310796074, | |
| "eval_loss": 0.09567935019731522, | |
| "eval_runtime": 249.6723, | |
| "eval_samples_per_second": 59.358, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9653603684163462, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.8331515812431843, | |
| "grad_norm": 7.156066417694092, | |
| "learning_rate": 3.377349300189761e-06, | |
| "loss": 0.12005048751831054, | |
| "step": 19100, | |
| "token_acc": 0.9646030979629654 | |
| }, | |
| { | |
| "epoch": 0.8375136314067612, | |
| "grad_norm": 5.540706157684326, | |
| "learning_rate": 3.360441962099748e-06, | |
| "loss": 0.11568243980407715, | |
| "step": 19200, | |
| "token_acc": 0.9662971175166297 | |
| }, | |
| { | |
| "epoch": 0.8418756815703381, | |
| "grad_norm": 4.6166510581970215, | |
| "learning_rate": 3.343489862623836e-06, | |
| "loss": 0.11525155067443847, | |
| "step": 19300, | |
| "token_acc": 0.9660013046314416 | |
| }, | |
| { | |
| "epoch": 0.8462377317339149, | |
| "grad_norm": 3.9313056468963623, | |
| "learning_rate": 3.326493883633783e-06, | |
| "loss": 0.11743115425109864, | |
| "step": 19400, | |
| "token_acc": 0.9656000417601921 | |
| }, | |
| { | |
| "epoch": 0.8505997818974919, | |
| "grad_norm": 4.582152366638184, | |
| "learning_rate": 3.3094549092840195e-06, | |
| "loss": 0.12221628189086914, | |
| "step": 19500, | |
| "token_acc": 0.9639358598680389 | |
| }, | |
| { | |
| "epoch": 0.8549618320610687, | |
| "grad_norm": 10.774201393127441, | |
| "learning_rate": 3.2923738259656585e-06, | |
| "loss": 0.11821553230285645, | |
| "step": 19600, | |
| "token_acc": 0.9650054941503458 | |
| }, | |
| { | |
| "epoch": 0.8593238822246456, | |
| "grad_norm": 4.74281644821167, | |
| "learning_rate": 3.2752515222603766e-06, | |
| "loss": 0.11718135833740234, | |
| "step": 19700, | |
| "token_acc": 0.9661495155373397 | |
| }, | |
| { | |
| "epoch": 0.8636859323882224, | |
| "grad_norm": 5.704650402069092, | |
| "learning_rate": 3.2580888888941908e-06, | |
| "loss": 0.11361128807067872, | |
| "step": 19800, | |
| "token_acc": 0.96557928802589 | |
| }, | |
| { | |
| "epoch": 0.8680479825517994, | |
| "grad_norm": 2.6005077362060547, | |
| "learning_rate": 3.2408868186911285e-06, | |
| "loss": 0.11735478401184082, | |
| "step": 19900, | |
| "token_acc": 0.9663002389186196 | |
| }, | |
| { | |
| "epoch": 0.8724100327153762, | |
| "grad_norm": 7.039544582366943, | |
| "learning_rate": 3.2236462065267715e-06, | |
| "loss": 0.1120327091217041, | |
| "step": 20000, | |
| "token_acc": 0.9665029780149579 | |
| }, | |
| { | |
| "epoch": 0.8724100327153762, | |
| "eval_loss": 0.09346429258584976, | |
| "eval_runtime": 254.7297, | |
| "eval_samples_per_second": 58.179, | |
| "eval_steps_per_second": 0.911, | |
| "eval_token_acc": 0.9658605696145935, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.8767720828789531, | |
| "grad_norm": 4.584099769592285, | |
| "learning_rate": 3.206367949281708e-06, | |
| "loss": 0.11660341262817382, | |
| "step": 20100, | |
| "token_acc": 0.965891313165129 | |
| }, | |
| { | |
| "epoch": 0.88113413304253, | |
| "grad_norm": 3.811115026473999, | |
| "learning_rate": 3.1890529457948794e-06, | |
| "loss": 0.11867828369140625, | |
| "step": 20200, | |
| "token_acc": 0.9647862570539326 | |
| }, | |
| { | |
| "epoch": 0.8854961832061069, | |
| "grad_norm": 3.324962854385376, | |
| "learning_rate": 3.171702096816814e-06, | |
| "loss": 0.1159024715423584, | |
| "step": 20300, | |
| "token_acc": 0.9654955434176213 | |
| }, | |
| { | |
| "epoch": 0.8898582333696837, | |
| "grad_norm": 9.917572021484375, | |
| "learning_rate": 3.1543163049627727e-06, | |
| "loss": 0.10901348114013672, | |
| "step": 20400, | |
| "token_acc": 0.9676555921987456 | |
| }, | |
| { | |
| "epoch": 0.8942202835332607, | |
| "grad_norm": 5.332941055297852, | |
| "learning_rate": 3.136896474665796e-06, | |
| "loss": 0.11584772109985352, | |
| "step": 20500, | |
| "token_acc": 0.9658386091204165 | |
| }, | |
| { | |
| "epoch": 0.8985823336968375, | |
| "grad_norm": 4.178663730621338, | |
| "learning_rate": 3.1194435121296477e-06, | |
| "loss": 0.11288185119628906, | |
| "step": 20600, | |
| "token_acc": 0.9666555365689182 | |
| }, | |
| { | |
| "epoch": 0.9029443838604144, | |
| "grad_norm": 5.721475124359131, | |
| "learning_rate": 3.10195832528168e-06, | |
| "loss": 0.11232204437255859, | |
| "step": 20700, | |
| "token_acc": 0.9669557900187714 | |
| }, | |
| { | |
| "epoch": 0.9073064340239912, | |
| "grad_norm": 2.847933053970337, | |
| "learning_rate": 3.0844418237255962e-06, | |
| "loss": 0.11065910339355468, | |
| "step": 20800, | |
| "token_acc": 0.9669379220157056 | |
| }, | |
| { | |
| "epoch": 0.9116684841875682, | |
| "grad_norm": 7.752315521240234, | |
| "learning_rate": 3.0668949186941357e-06, | |
| "loss": 0.11427606582641602, | |
| "step": 20900, | |
| "token_acc": 0.9659426171427099 | |
| }, | |
| { | |
| "epoch": 0.916030534351145, | |
| "grad_norm": 4.635708332061768, | |
| "learning_rate": 3.049318523001669e-06, | |
| "loss": 0.11044464111328126, | |
| "step": 21000, | |
| "token_acc": 0.9672973482754359 | |
| }, | |
| { | |
| "epoch": 0.916030534351145, | |
| "eval_loss": 0.09045585989952087, | |
| "eval_runtime": 254.415, | |
| "eval_samples_per_second": 58.251, | |
| "eval_steps_per_second": 0.912, | |
| "eval_token_acc": 0.966824644549763, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.920392584514722, | |
| "grad_norm": 4.291543483734131, | |
| "learning_rate": 3.0317135509967095e-06, | |
| "loss": 0.11297191619873047, | |
| "step": 21100, | |
| "token_acc": 0.9673089666463106 | |
| }, | |
| { | |
| "epoch": 0.9247546346782988, | |
| "grad_norm": 4.2850189208984375, | |
| "learning_rate": 3.014080918514353e-06, | |
| "loss": 0.11317073822021484, | |
| "step": 21200, | |
| "token_acc": 0.9658741385799385 | |
| }, | |
| { | |
| "epoch": 0.9291166848418757, | |
| "grad_norm": 3.6628763675689697, | |
| "learning_rate": 2.99642154282863e-06, | |
| "loss": 0.10480199813842773, | |
| "step": 21300, | |
| "token_acc": 0.9688065128551662 | |
| }, | |
| { | |
| "epoch": 0.9334787350054525, | |
| "grad_norm": 6.277792930603027, | |
| "learning_rate": 2.97873634260479e-06, | |
| "loss": 0.1182522201538086, | |
| "step": 21400, | |
| "token_acc": 0.9656542191019528 | |
| }, | |
| { | |
| "epoch": 0.9378407851690295, | |
| "grad_norm": 3.970945358276367, | |
| "learning_rate": 2.961026237851511e-06, | |
| "loss": 0.10972850799560546, | |
| "step": 21500, | |
| "token_acc": 0.9674839091324113 | |
| }, | |
| { | |
| "epoch": 0.9422028353326063, | |
| "grad_norm": 5.1818366050720215, | |
| "learning_rate": 2.94329214987304e-06, | |
| "loss": 0.10691708564758301, | |
| "step": 21600, | |
| "token_acc": 0.9680576689964666 | |
| }, | |
| { | |
| "epoch": 0.9465648854961832, | |
| "grad_norm": 6.335598945617676, | |
| "learning_rate": 2.925535001221262e-06, | |
| "loss": 0.1139387321472168, | |
| "step": 21700, | |
| "token_acc": 0.966435651801084 | |
| }, | |
| { | |
| "epoch": 0.95092693565976, | |
| "grad_norm": 4.5589599609375, | |
| "learning_rate": 2.9077557156477124e-06, | |
| "loss": 0.11017286300659179, | |
| "step": 21800, | |
| "token_acc": 0.967085074906488 | |
| }, | |
| { | |
| "epoch": 0.955288985823337, | |
| "grad_norm": 5.797848701477051, | |
| "learning_rate": 2.889955218055521e-06, | |
| "loss": 0.10730340957641601, | |
| "step": 21900, | |
| "token_acc": 0.9681335161263331 | |
| }, | |
| { | |
| "epoch": 0.9596510359869138, | |
| "grad_norm": 4.161041736602783, | |
| "learning_rate": 2.8721344344512934e-06, | |
| "loss": 0.11035999298095703, | |
| "step": 22000, | |
| "token_acc": 0.9674398288808526 | |
| }, | |
| { | |
| "epoch": 0.9596510359869138, | |
| "eval_loss": 0.08762744069099426, | |
| "eval_runtime": 249.5745, | |
| "eval_samples_per_second": 59.381, | |
| "eval_steps_per_second": 0.93, | |
| "eval_token_acc": 0.9679787400518645, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.9640130861504908, | |
| "grad_norm": 3.1247737407684326, | |
| "learning_rate": 2.854294291896944e-06, | |
| "loss": 0.1087168025970459, | |
| "step": 22100, | |
| "token_acc": 0.9678269463050401 | |
| }, | |
| { | |
| "epoch": 0.9683751363140676, | |
| "grad_norm": 6.124391078948975, | |
| "learning_rate": 2.8364357184614668e-06, | |
| "loss": 0.11305822372436523, | |
| "step": 22200, | |
| "token_acc": 0.9667042707493957 | |
| }, | |
| { | |
| "epoch": 0.9727371864776445, | |
| "grad_norm": 8.205916404724121, | |
| "learning_rate": 2.8185596431726527e-06, | |
| "loss": 0.11026342391967774, | |
| "step": 22300, | |
| "token_acc": 0.9671849023275145 | |
| }, | |
| { | |
| "epoch": 0.9770992366412213, | |
| "grad_norm": 3.7859928607940674, | |
| "learning_rate": 2.8006669959687668e-06, | |
| "loss": 0.10871770858764648, | |
| "step": 22400, | |
| "token_acc": 0.9682362974681484 | |
| }, | |
| { | |
| "epoch": 0.9814612868047983, | |
| "grad_norm": 7.330371856689453, | |
| "learning_rate": 2.7827587076501673e-06, | |
| "loss": 0.11897387504577636, | |
| "step": 22500, | |
| "token_acc": 0.9650655582069638 | |
| }, | |
| { | |
| "epoch": 0.9858233369683751, | |
| "grad_norm": 6.279266834259033, | |
| "learning_rate": 2.764835709830884e-06, | |
| "loss": 0.10861226081848145, | |
| "step": 22600, | |
| "token_acc": 0.9683295584335133 | |
| }, | |
| { | |
| "epoch": 0.990185387131952, | |
| "grad_norm": 3.766507148742676, | |
| "learning_rate": 2.7468989348901555e-06, | |
| "loss": 0.10774114608764648, | |
| "step": 22700, | |
| "token_acc": 0.9677088507163268 | |
| }, | |
| { | |
| "epoch": 0.9945474372955289, | |
| "grad_norm": 4.713306903839111, | |
| "learning_rate": 2.728949315923927e-06, | |
| "loss": 0.09772121429443359, | |
| "step": 22800, | |
| "token_acc": 0.970906443111303 | |
| }, | |
| { | |
| "epoch": 0.9989094874591058, | |
| "grad_norm": 3.5282657146453857, | |
| "learning_rate": 2.7109877866963063e-06, | |
| "loss": 0.10501280784606934, | |
| "step": 22900, | |
| "token_acc": 0.968406404054063 | |
| }, | |
| { | |
| "epoch": 1.0032715376226826, | |
| "grad_norm": 6.511656284332275, | |
| "learning_rate": 2.6930152815909894e-06, | |
| "loss": 0.08737098693847656, | |
| "step": 23000, | |
| "token_acc": 0.9734185907901215 | |
| }, | |
| { | |
| "epoch": 1.0032715376226826, | |
| "eval_loss": 0.0876716896891594, | |
| "eval_runtime": 249.8103, | |
| "eval_samples_per_second": 59.325, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9688981042654028, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.0076335877862594, | |
| "grad_norm": 4.062010288238525, | |
| "learning_rate": 2.6750327355626536e-06, | |
| "loss": 0.07467103481292725, | |
| "step": 23100, | |
| "token_acc": 0.9765526602582927 | |
| }, | |
| { | |
| "epoch": 1.0119956379498365, | |
| "grad_norm": 7.563171863555908, | |
| "learning_rate": 2.657041084088318e-06, | |
| "loss": 0.07544662952423095, | |
| "step": 23200, | |
| "token_acc": 0.9759494669313387 | |
| }, | |
| { | |
| "epoch": 1.0163576881134133, | |
| "grad_norm": 4.794340133666992, | |
| "learning_rate": 2.6390412631186802e-06, | |
| "loss": 0.07672032833099365, | |
| "step": 23300, | |
| "token_acc": 0.9753995963996285 | |
| }, | |
| { | |
| "epoch": 1.0207197382769901, | |
| "grad_norm": 2.7201950550079346, | |
| "learning_rate": 2.6210342090294266e-06, | |
| "loss": 0.08015254974365234, | |
| "step": 23400, | |
| "token_acc": 0.9749328386849175 | |
| }, | |
| { | |
| "epoch": 1.025081788440567, | |
| "grad_norm": 2.8014349937438965, | |
| "learning_rate": 2.603020858572521e-06, | |
| "loss": 0.0747481918334961, | |
| "step": 23500, | |
| "token_acc": 0.9764472630518457 | |
| }, | |
| { | |
| "epoch": 1.029443838604144, | |
| "grad_norm": 4.592669486999512, | |
| "learning_rate": 2.5850021488274694e-06, | |
| "loss": 0.07323034763336182, | |
| "step": 23600, | |
| "token_acc": 0.9767670199622862 | |
| }, | |
| { | |
| "epoch": 1.0338058887677208, | |
| "grad_norm": 28.001813888549805, | |
| "learning_rate": 2.566979017152581e-06, | |
| "loss": 0.07620226860046386, | |
| "step": 23700, | |
| "token_acc": 0.9755634028892456 | |
| }, | |
| { | |
| "epoch": 1.0381679389312977, | |
| "grad_norm": 3.804150342941284, | |
| "learning_rate": 2.5489524011361962e-06, | |
| "loss": 0.07047548294067382, | |
| "step": 23800, | |
| "token_acc": 0.9773679250222524 | |
| }, | |
| { | |
| "epoch": 1.0425299890948745, | |
| "grad_norm": 4.701569080352783, | |
| "learning_rate": 2.5309232385479155e-06, | |
| "loss": 0.07270308017730713, | |
| "step": 23900, | |
| "token_acc": 0.977279508101026 | |
| }, | |
| { | |
| "epoch": 1.0468920392584515, | |
| "grad_norm": 3.494173288345337, | |
| "learning_rate": 2.5128924672898183e-06, | |
| "loss": 0.07390243530273438, | |
| "step": 24000, | |
| "token_acc": 0.9771897454554904 | |
| }, | |
| { | |
| "epoch": 1.0468920392584515, | |
| "eval_loss": 0.08581704646348953, | |
| "eval_runtime": 256.0699, | |
| "eval_samples_per_second": 57.875, | |
| "eval_steps_per_second": 0.906, | |
| "eval_token_acc": 0.9695715595099705, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.0512540894220284, | |
| "grad_norm": 8.801218032836914, | |
| "learning_rate": 2.4948610253476664e-06, | |
| "loss": 0.0676074504852295, | |
| "step": 24100, | |
| "token_acc": 0.9789494252124923 | |
| }, | |
| { | |
| "epoch": 1.0556161395856052, | |
| "grad_norm": 19.8681583404541, | |
| "learning_rate": 2.4768298507421133e-06, | |
| "loss": 0.06861214637756348, | |
| "step": 24200, | |
| "token_acc": 0.9784243812299924 | |
| }, | |
| { | |
| "epoch": 1.059978189749182, | |
| "grad_norm": 4.816915512084961, | |
| "learning_rate": 2.458799881479905e-06, | |
| "loss": 0.06901247501373291, | |
| "step": 24300, | |
| "token_acc": 0.9777164475725784 | |
| }, | |
| { | |
| "epoch": 1.064340239912759, | |
| "grad_norm": 4.382708549499512, | |
| "learning_rate": 2.4407720555050827e-06, | |
| "loss": 0.07236534595489502, | |
| "step": 24400, | |
| "token_acc": 0.9769710653737226 | |
| }, | |
| { | |
| "epoch": 1.0687022900763359, | |
| "grad_norm": 4.277945041656494, | |
| "learning_rate": 2.4227473106501922e-06, | |
| "loss": 0.07162453174591064, | |
| "step": 24500, | |
| "token_acc": 0.9774640927444261 | |
| }, | |
| { | |
| "epoch": 1.0730643402399127, | |
| "grad_norm": 2.9444453716278076, | |
| "learning_rate": 2.404726584587495e-06, | |
| "loss": 0.07076330184936523, | |
| "step": 24600, | |
| "token_acc": 0.9775714884256834 | |
| }, | |
| { | |
| "epoch": 1.0774263904034895, | |
| "grad_norm": 4.69867467880249, | |
| "learning_rate": 2.386710814780189e-06, | |
| "loss": 0.07056962966918945, | |
| "step": 24700, | |
| "token_acc": 0.9776056401564706 | |
| }, | |
| { | |
| "epoch": 1.0817884405670666, | |
| "grad_norm": 4.424916744232178, | |
| "learning_rate": 2.3687009384336403e-06, | |
| "loss": 0.0715576934814453, | |
| "step": 24800, | |
| "token_acc": 0.977195460013599 | |
| }, | |
| { | |
| "epoch": 1.0861504907306434, | |
| "grad_norm": 2.6273691654205322, | |
| "learning_rate": 2.3506978924466305e-06, | |
| "loss": 0.0693655014038086, | |
| "step": 24900, | |
| "token_acc": 0.9780883749365871 | |
| }, | |
| { | |
| "epoch": 1.0905125408942202, | |
| "grad_norm": 3.6074914932250977, | |
| "learning_rate": 2.332702613362614e-06, | |
| "loss": 0.0702785873413086, | |
| "step": 25000, | |
| "token_acc": 0.9770827224448572 | |
| }, | |
| { | |
| "epoch": 1.0905125408942202, | |
| "eval_loss": 0.08393809199333191, | |
| "eval_runtime": 250.0137, | |
| "eval_samples_per_second": 59.277, | |
| "eval_steps_per_second": 0.928, | |
| "eval_token_acc": 0.9700745551283197, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.0948745910577973, | |
| "grad_norm": 4.213324546813965, | |
| "learning_rate": 2.3147160373210036e-06, | |
| "loss": 0.07728311538696289, | |
| "step": 25100, | |
| "token_acc": 0.9752774455718015 | |
| }, | |
| { | |
| "epoch": 1.099236641221374, | |
| "grad_norm": 4.420457363128662, | |
| "learning_rate": 2.2967391000084647e-06, | |
| "loss": 0.07622474670410156, | |
| "step": 25200, | |
| "token_acc": 0.9757396866456227 | |
| }, | |
| { | |
| "epoch": 1.103598691384951, | |
| "grad_norm": 4.92677640914917, | |
| "learning_rate": 2.2787727366102435e-06, | |
| "loss": 0.07702836990356446, | |
| "step": 25300, | |
| "token_acc": 0.9761410922020535 | |
| }, | |
| { | |
| "epoch": 1.1079607415485277, | |
| "grad_norm": 5.670831203460693, | |
| "learning_rate": 2.2608178817615165e-06, | |
| "loss": 0.07056490898132324, | |
| "step": 25400, | |
| "token_acc": 0.9777433009634354 | |
| }, | |
| { | |
| "epoch": 1.1123227917121046, | |
| "grad_norm": 7.146980285644531, | |
| "learning_rate": 2.2428754694987715e-06, | |
| "loss": 0.0676431655883789, | |
| "step": 25500, | |
| "token_acc": 0.978816706848244 | |
| }, | |
| { | |
| "epoch": 1.1166848418756816, | |
| "grad_norm": 2.4701433181762695, | |
| "learning_rate": 2.224946433211212e-06, | |
| "loss": 0.07373625755310059, | |
| "step": 25600, | |
| "token_acc": 0.9769120234037206 | |
| }, | |
| { | |
| "epoch": 1.1210468920392584, | |
| "grad_norm": 5.163082122802734, | |
| "learning_rate": 2.207031705592207e-06, | |
| "loss": 0.0715975284576416, | |
| "step": 25700, | |
| "token_acc": 0.9775948493313291 | |
| }, | |
| { | |
| "epoch": 1.1254089422028353, | |
| "grad_norm": 3.2471859455108643, | |
| "learning_rate": 2.189132218590769e-06, | |
| "loss": 0.07088047981262208, | |
| "step": 25800, | |
| "token_acc": 0.9779005524861878 | |
| }, | |
| { | |
| "epoch": 1.1297709923664123, | |
| "grad_norm": 4.81692361831665, | |
| "learning_rate": 2.1712489033630706e-06, | |
| "loss": 0.06931174755096435, | |
| "step": 25900, | |
| "token_acc": 0.9776110790536642 | |
| }, | |
| { | |
| "epoch": 1.1341330425299891, | |
| "grad_norm": 6.65491247177124, | |
| "learning_rate": 2.153382690224007e-06, | |
| "loss": 0.07666608810424805, | |
| "step": 26000, | |
| "token_acc": 0.9760280797392595 | |
| }, | |
| { | |
| "epoch": 1.1341330425299891, | |
| "eval_loss": 0.08234628289937973, | |
| "eval_runtime": 250.0101, | |
| "eval_samples_per_second": 59.278, | |
| "eval_steps_per_second": 0.928, | |
| "eval_token_acc": 0.9705160735044264, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.138495092693566, | |
| "grad_norm": 4.733185768127441, | |
| "learning_rate": 2.1355345085988014e-06, | |
| "loss": 0.0709274673461914, | |
| "step": 26100, | |
| "token_acc": 0.9779928213603264 | |
| }, | |
| { | |
| "epoch": 1.1428571428571428, | |
| "grad_norm": 5.480454444885254, | |
| "learning_rate": 2.117705286974649e-06, | |
| "loss": 0.07390942573547363, | |
| "step": 26200, | |
| "token_acc": 0.9774670075669534 | |
| }, | |
| { | |
| "epoch": 1.1472191930207196, | |
| "grad_norm": 4.35108757019043, | |
| "learning_rate": 2.0998959528524202e-06, | |
| "loss": 0.06665233612060546, | |
| "step": 26300, | |
| "token_acc": 0.9792105141620537 | |
| }, | |
| { | |
| "epoch": 1.1515812431842967, | |
| "grad_norm": 5.042696952819824, | |
| "learning_rate": 2.082107432698413e-06, | |
| "loss": 0.06301040172576905, | |
| "step": 26400, | |
| "token_acc": 0.9799591431893273 | |
| }, | |
| { | |
| "epoch": 1.1559432933478735, | |
| "grad_norm": 2.658958911895752, | |
| "learning_rate": 2.064340651896149e-06, | |
| "loss": 0.07521211624145507, | |
| "step": 26500, | |
| "token_acc": 0.9763495912473549 | |
| }, | |
| { | |
| "epoch": 1.1603053435114503, | |
| "grad_norm": 4.5513153076171875, | |
| "learning_rate": 2.0465965346982427e-06, | |
| "loss": 0.06404022216796874, | |
| "step": 26600, | |
| "token_acc": 0.9796154142384021 | |
| }, | |
| { | |
| "epoch": 1.1646673936750274, | |
| "grad_norm": 6.858781814575195, | |
| "learning_rate": 2.028876004178315e-06, | |
| "loss": 0.06282252311706543, | |
| "step": 26700, | |
| "token_acc": 0.9801569262999783 | |
| }, | |
| { | |
| "epoch": 1.1690294438386042, | |
| "grad_norm": 4.299166679382324, | |
| "learning_rate": 2.011179982182973e-06, | |
| "loss": 0.06631457328796386, | |
| "step": 26800, | |
| "token_acc": 0.9796262194883046 | |
| }, | |
| { | |
| "epoch": 1.173391494002181, | |
| "grad_norm": 3.9742937088012695, | |
| "learning_rate": 1.9935093892838608e-06, | |
| "loss": 0.0690745735168457, | |
| "step": 26900, | |
| "token_acc": 0.978327122747711 | |
| }, | |
| { | |
| "epoch": 1.1777535441657578, | |
| "grad_norm": 2.4530298709869385, | |
| "learning_rate": 1.9758651447297606e-06, | |
| "loss": 0.06880934715270996, | |
| "step": 27000, | |
| "token_acc": 0.978106382841394 | |
| }, | |
| { | |
| "epoch": 1.1777535441657578, | |
| "eval_loss": 0.08033985644578934, | |
| "eval_runtime": 244.5702, | |
| "eval_samples_per_second": 60.596, | |
| "eval_steps_per_second": 0.949, | |
| "eval_token_acc": 0.9716534024859161, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.1821155943293349, | |
| "grad_norm": 3.7700579166412354, | |
| "learning_rate": 1.9582481663987795e-06, | |
| "loss": 0.06770928382873535, | |
| "step": 27100, | |
| "token_acc": 0.9788943939453049 | |
| }, | |
| { | |
| "epoch": 1.1864776444929117, | |
| "grad_norm": 5.555079460144043, | |
| "learning_rate": 1.9406593707506e-06, | |
| "loss": 0.06606301307678222, | |
| "step": 27200, | |
| "token_acc": 0.9796218185617092 | |
| }, | |
| { | |
| "epoch": 1.1908396946564885, | |
| "grad_norm": 3.66662859916687, | |
| "learning_rate": 1.923099672778798e-06, | |
| "loss": 0.06830612182617188, | |
| "step": 27300, | |
| "token_acc": 0.9784953863415075 | |
| }, | |
| { | |
| "epoch": 1.1952017448200654, | |
| "grad_norm": 2.253833532333374, | |
| "learning_rate": 1.9055699859632502e-06, | |
| "loss": 0.06353707313537597, | |
| "step": 27400, | |
| "token_acc": 0.9802849547555753 | |
| }, | |
| { | |
| "epoch": 1.1995637949836424, | |
| "grad_norm": 5.8428449630737305, | |
| "learning_rate": 1.8880712222226127e-06, | |
| "loss": 0.06870734214782714, | |
| "step": 27500, | |
| "token_acc": 0.9787229893127389 | |
| }, | |
| { | |
| "epoch": 1.2039258451472192, | |
| "grad_norm": 2.712040424346924, | |
| "learning_rate": 1.8706042918668755e-06, | |
| "loss": 0.06457172870635987, | |
| "step": 27600, | |
| "token_acc": 0.9792487660543853 | |
| }, | |
| { | |
| "epoch": 1.208287895310796, | |
| "grad_norm": 4.38408899307251, | |
| "learning_rate": 1.8531701035500172e-06, | |
| "loss": 0.07690935134887696, | |
| "step": 27700, | |
| "token_acc": 0.9740492962199486 | |
| }, | |
| { | |
| "epoch": 1.2126499454743729, | |
| "grad_norm": 2.473857879638672, | |
| "learning_rate": 1.835769564222728e-06, | |
| "loss": 0.06370730876922608, | |
| "step": 27800, | |
| "token_acc": 0.979825486251706 | |
| }, | |
| { | |
| "epoch": 1.21701199563795, | |
| "grad_norm": 3.47666072845459, | |
| "learning_rate": 1.8184035790852278e-06, | |
| "loss": 0.07317141532897949, | |
| "step": 27900, | |
| "token_acc": 0.9769554903352452 | |
| }, | |
| { | |
| "epoch": 1.2213740458015268, | |
| "grad_norm": 3.806673765182495, | |
| "learning_rate": 1.801073051540185e-06, | |
| "loss": 0.06659706592559815, | |
| "step": 28000, | |
| "token_acc": 0.9792695661086874 | |
| }, | |
| { | |
| "epoch": 1.2213740458015268, | |
| "eval_loss": 0.07924070209264755, | |
| "eval_runtime": 249.7936, | |
| "eval_samples_per_second": 59.329, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.972083743181615, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.2257360959651036, | |
| "grad_norm": 2.7492287158966064, | |
| "learning_rate": 1.783778883145712e-06, | |
| "loss": 0.06997788429260254, | |
| "step": 28100, | |
| "token_acc": 0.9779032340469014 | |
| }, | |
| { | |
| "epoch": 1.2300981461286804, | |
| "grad_norm": 5.293479919433594, | |
| "learning_rate": 1.7665219735684658e-06, | |
| "loss": 0.07057114601135255, | |
| "step": 28200, | |
| "token_acc": 0.9777492239580683 | |
| }, | |
| { | |
| "epoch": 1.2344601962922575, | |
| "grad_norm": 4.3521599769592285, | |
| "learning_rate": 1.7493032205368513e-06, | |
| "loss": 0.0686982250213623, | |
| "step": 28300, | |
| "token_acc": 0.978551340703205 | |
| }, | |
| { | |
| "epoch": 1.2388222464558343, | |
| "grad_norm": 5.842819690704346, | |
| "learning_rate": 1.7321235197943168e-06, | |
| "loss": 0.0651836109161377, | |
| "step": 28400, | |
| "token_acc": 0.979650824442289 | |
| }, | |
| { | |
| "epoch": 1.243184296619411, | |
| "grad_norm": 3.996281623840332, | |
| "learning_rate": 1.7149837650527523e-06, | |
| "loss": 0.06602362632751464, | |
| "step": 28500, | |
| "token_acc": 0.9792255874942151 | |
| }, | |
| { | |
| "epoch": 1.247546346782988, | |
| "grad_norm": 2.58259654045105, | |
| "learning_rate": 1.697884847946006e-06, | |
| "loss": 0.062000880241394045, | |
| "step": 28600, | |
| "token_acc": 0.9799171574655362 | |
| }, | |
| { | |
| "epoch": 1.2519083969465647, | |
| "grad_norm": 4.274127960205078, | |
| "learning_rate": 1.6808276579834943e-06, | |
| "loss": 0.06488137245178223, | |
| "step": 28700, | |
| "token_acc": 0.9792955370379838 | |
| }, | |
| { | |
| "epoch": 1.2562704471101418, | |
| "grad_norm": 4.309569358825684, | |
| "learning_rate": 1.6638130825039275e-06, | |
| "loss": 0.0708838415145874, | |
| "step": 28800, | |
| "token_acc": 0.9784192156078426 | |
| }, | |
| { | |
| "epoch": 1.2606324972737186, | |
| "grad_norm": 3.1259348392486572, | |
| "learning_rate": 1.6468420066291536e-06, | |
| "loss": 0.061966466903686526, | |
| "step": 28900, | |
| "token_acc": 0.9806353930430375 | |
| }, | |
| { | |
| "epoch": 1.2649945474372954, | |
| "grad_norm": 3.426515817642212, | |
| "learning_rate": 1.6299153132181106e-06, | |
| "loss": 0.0645901918411255, | |
| "step": 29000, | |
| "token_acc": 0.9794154619736015 | |
| }, | |
| { | |
| "epoch": 1.2649945474372954, | |
| "eval_loss": 0.07756964862346649, | |
| "eval_runtime": 254.7344, | |
| "eval_samples_per_second": 58.178, | |
| "eval_steps_per_second": 0.911, | |
| "eval_token_acc": 0.9729472189931145, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.2693565976008725, | |
| "grad_norm": 6.645819187164307, | |
| "learning_rate": 1.6130338828208968e-06, | |
| "loss": 0.06351682662963867, | |
| "step": 29100, | |
| "token_acc": 0.9802265678215497 | |
| }, | |
| { | |
| "epoch": 1.2737186477644493, | |
| "grad_norm": 4.421413421630859, | |
| "learning_rate": 1.5961985936329694e-06, | |
| "loss": 0.06718688488006591, | |
| "step": 29200, | |
| "token_acc": 0.9788990531214893 | |
| }, | |
| { | |
| "epoch": 1.2780806979280261, | |
| "grad_norm": 3.6267154216766357, | |
| "learning_rate": 1.5794103214494528e-06, | |
| "loss": 0.06186320304870605, | |
| "step": 29300, | |
| "token_acc": 0.9807249300449015 | |
| }, | |
| { | |
| "epoch": 1.282442748091603, | |
| "grad_norm": 3.70088791847229, | |
| "learning_rate": 1.5626699396195821e-06, | |
| "loss": 0.06981894493103027, | |
| "step": 29400, | |
| "token_acc": 0.9779153963414634 | |
| }, | |
| { | |
| "epoch": 1.28680479825518, | |
| "grad_norm": 5.71915340423584, | |
| "learning_rate": 1.5459783190012707e-06, | |
| "loss": 0.0649720811843872, | |
| "step": 29500, | |
| "token_acc": 0.9794114456024354 | |
| }, | |
| { | |
| "epoch": 1.2911668484187568, | |
| "grad_norm": 3.4748384952545166, | |
| "learning_rate": 1.5293363279158055e-06, | |
| "loss": 0.06922075271606445, | |
| "step": 29600, | |
| "token_acc": 0.9789335302041239 | |
| }, | |
| { | |
| "epoch": 1.2955288985823337, | |
| "grad_norm": 2.7103006839752197, | |
| "learning_rate": 1.5127448321026756e-06, | |
| "loss": 0.06484994888305665, | |
| "step": 29700, | |
| "token_acc": 0.979845880260818 | |
| }, | |
| { | |
| "epoch": 1.2998909487459107, | |
| "grad_norm": 7.105724334716797, | |
| "learning_rate": 1.4962046946745369e-06, | |
| "loss": 0.06801267147064209, | |
| "step": 29800, | |
| "token_acc": 0.9781458581259378 | |
| }, | |
| { | |
| "epoch": 1.3042529989094875, | |
| "grad_norm": 4.009284019470215, | |
| "learning_rate": 1.4797167760723102e-06, | |
| "loss": 0.06112378120422363, | |
| "step": 29900, | |
| "token_acc": 0.9805348371839185 | |
| }, | |
| { | |
| "epoch": 1.3086150490730644, | |
| "grad_norm": 3.753431558609009, | |
| "learning_rate": 1.4632819340204208e-06, | |
| "loss": 0.06059444427490234, | |
| "step": 30000, | |
| "token_acc": 0.9809932238964539 | |
| }, | |
| { | |
| "epoch": 1.3086150490730644, | |
| "eval_loss": 0.0756845697760582, | |
| "eval_runtime": 254.1828, | |
| "eval_samples_per_second": 58.304, | |
| "eval_steps_per_second": 0.913, | |
| "eval_token_acc": 0.9734082983099347, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.3129770992366412, | |
| "grad_norm": 2.8951034545898438, | |
| "learning_rate": 1.4469010234821765e-06, | |
| "loss": 0.06441830635070801, | |
| "step": 30100, | |
| "token_acc": 0.9801274671845959 | |
| }, | |
| { | |
| "epoch": 1.317339149400218, | |
| "grad_norm": 4.482010364532471, | |
| "learning_rate": 1.4305748966152978e-06, | |
| "loss": 0.06479699134826661, | |
| "step": 30200, | |
| "token_acc": 0.9794524119947848 | |
| }, | |
| { | |
| "epoch": 1.321701199563795, | |
| "grad_norm": 2.2887697219848633, | |
| "learning_rate": 1.414304402727574e-06, | |
| "loss": 0.0626700496673584, | |
| "step": 30300, | |
| "token_acc": 0.9796751852488993 | |
| }, | |
| { | |
| "epoch": 1.3260632497273719, | |
| "grad_norm": 2.8628365993499756, | |
| "learning_rate": 1.3980903882326985e-06, | |
| "loss": 0.06823697090148925, | |
| "step": 30400, | |
| "token_acc": 0.9791249697232385 | |
| }, | |
| { | |
| "epoch": 1.3304252998909487, | |
| "grad_norm": 3.2624971866607666, | |
| "learning_rate": 1.381933696606223e-06, | |
| "loss": 0.06213871479034424, | |
| "step": 30500, | |
| "token_acc": 0.9808548009367681 | |
| }, | |
| { | |
| "epoch": 1.3347873500545258, | |
| "grad_norm": 3.104295492172241, | |
| "learning_rate": 1.3658351683416838e-06, | |
| "loss": 0.05973214626312256, | |
| "step": 30600, | |
| "token_acc": 0.9814683008691223 | |
| }, | |
| { | |
| "epoch": 1.3391494002181026, | |
| "grad_norm": 6.34499454498291, | |
| "learning_rate": 1.3497956409068824e-06, | |
| "loss": 0.06145791053771973, | |
| "step": 30700, | |
| "token_acc": 0.9810582677574679 | |
| }, | |
| { | |
| "epoch": 1.3435114503816794, | |
| "grad_norm": 4.992619514465332, | |
| "learning_rate": 1.333815948700311e-06, | |
| "loss": 0.06483103752136231, | |
| "step": 30800, | |
| "token_acc": 0.9799949403545644 | |
| }, | |
| { | |
| "epoch": 1.3478735005452562, | |
| "grad_norm": 6.377118110656738, | |
| "learning_rate": 1.3178969230077515e-06, | |
| "loss": 0.06162656784057617, | |
| "step": 30900, | |
| "token_acc": 0.9806395873967511 | |
| }, | |
| { | |
| "epoch": 1.352235550708833, | |
| "grad_norm": 3.695734739303589, | |
| "learning_rate": 1.302039391959031e-06, | |
| "loss": 0.06112282276153565, | |
| "step": 31000, | |
| "token_acc": 0.9815428766118042 | |
| }, | |
| { | |
| "epoch": 1.352235550708833, | |
| "eval_loss": 0.07410430908203125, | |
| "eval_runtime": 249.9929, | |
| "eval_samples_per_second": 59.282, | |
| "eval_steps_per_second": 0.928, | |
| "eval_token_acc": 0.97419353035858, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.35659760087241, | |
| "grad_norm": 3.6432061195373535, | |
| "learning_rate": 1.2862441804849376e-06, | |
| "loss": 0.06405759811401367, | |
| "step": 31100, | |
| "token_acc": 0.980182532705931 | |
| }, | |
| { | |
| "epoch": 1.360959651035987, | |
| "grad_norm": 3.5472171306610107, | |
| "learning_rate": 1.270512110274309e-06, | |
| "loss": 0.05868791580200195, | |
| "step": 31200, | |
| "token_acc": 0.9816291636211757 | |
| }, | |
| { | |
| "epoch": 1.3653217011995638, | |
| "grad_norm": 4.953434944152832, | |
| "learning_rate": 1.2548439997312883e-06, | |
| "loss": 0.05954179763793945, | |
| "step": 31300, | |
| "token_acc": 0.9819224968854501 | |
| }, | |
| { | |
| "epoch": 1.3696837513631408, | |
| "grad_norm": 4.584559917449951, | |
| "learning_rate": 1.2392406639327454e-06, | |
| "loss": 0.05943380355834961, | |
| "step": 31400, | |
| "token_acc": 0.9810650276994911 | |
| }, | |
| { | |
| "epoch": 1.3740458015267176, | |
| "grad_norm": 5.9618425369262695, | |
| "learning_rate": 1.223702914585881e-06, | |
| "loss": 0.062167482376098634, | |
| "step": 31500, | |
| "token_acc": 0.9804242388827208 | |
| }, | |
| { | |
| "epoch": 1.3784078516902945, | |
| "grad_norm": 3.6155030727386475, | |
| "learning_rate": 1.2082315599859954e-06, | |
| "loss": 0.06322122097015381, | |
| "step": 31600, | |
| "token_acc": 0.9801517767413096 | |
| }, | |
| { | |
| "epoch": 1.3827699018538713, | |
| "grad_norm": 2.845133066177368, | |
| "learning_rate": 1.1928274049744406e-06, | |
| "loss": 0.05848141193389893, | |
| "step": 31700, | |
| "token_acc": 0.9813240790872739 | |
| }, | |
| { | |
| "epoch": 1.387131952017448, | |
| "grad_norm": 3.7700917720794678, | |
| "learning_rate": 1.177491250896757e-06, | |
| "loss": 0.058195791244506835, | |
| "step": 31800, | |
| "token_acc": 0.9819997313392738 | |
| }, | |
| { | |
| "epoch": 1.3914940021810251, | |
| "grad_norm": 3.283059597015381, | |
| "learning_rate": 1.1622238955609785e-06, | |
| "loss": 0.06348293304443359, | |
| "step": 31900, | |
| "token_acc": 0.9804331695834613 | |
| }, | |
| { | |
| "epoch": 1.395856052344602, | |
| "grad_norm": 4.16409969329834, | |
| "learning_rate": 1.1470261331961324e-06, | |
| "loss": 0.06061270713806152, | |
| "step": 32000, | |
| "token_acc": 0.9811250377820794 | |
| }, | |
| { | |
| "epoch": 1.395856052344602, | |
| "eval_loss": 0.07234535366296768, | |
| "eval_runtime": 249.7003, | |
| "eval_samples_per_second": 59.351, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9748586023428418, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.4002181025081788, | |
| "grad_norm": 5.3219170570373535, | |
| "learning_rate": 1.1318987544109269e-06, | |
| "loss": 0.06023798942565918, | |
| "step": 32100, | |
| "token_acc": 0.9810668274906329 | |
| }, | |
| { | |
| "epoch": 1.4045801526717558, | |
| "grad_norm": 4.018277645111084, | |
| "learning_rate": 1.116842546152615e-06, | |
| "loss": 0.060137758255004885, | |
| "step": 32200, | |
| "token_acc": 0.9817034659235192 | |
| }, | |
| { | |
| "epoch": 1.4089422028353327, | |
| "grad_norm": 3.68279767036438, | |
| "learning_rate": 1.1018582916660605e-06, | |
| "loss": 0.05815181255340576, | |
| "step": 32300, | |
| "token_acc": 0.9821691863093951 | |
| }, | |
| { | |
| "epoch": 1.4133042529989095, | |
| "grad_norm": 3.751368761062622, | |
| "learning_rate": 1.0869467704529942e-06, | |
| "loss": 0.05569493293762207, | |
| "step": 32400, | |
| "token_acc": 0.9825266311370547 | |
| }, | |
| { | |
| "epoch": 1.4176663031624863, | |
| "grad_norm": 4.616822719573975, | |
| "learning_rate": 1.072108758231459e-06, | |
| "loss": 0.05629618167877197, | |
| "step": 32500, | |
| "token_acc": 0.9823124687557555 | |
| }, | |
| { | |
| "epoch": 1.4220283533260631, | |
| "grad_norm": 3.3701086044311523, | |
| "learning_rate": 1.0573450268954577e-06, | |
| "loss": 0.05684140205383301, | |
| "step": 32600, | |
| "token_acc": 0.9825464295056239 | |
| }, | |
| { | |
| "epoch": 1.4263904034896402, | |
| "grad_norm": 2.300793409347534, | |
| "learning_rate": 1.042656344474801e-06, | |
| "loss": 0.057010469436645506, | |
| "step": 32700, | |
| "token_acc": 0.9822899964004058 | |
| }, | |
| { | |
| "epoch": 1.430752453653217, | |
| "grad_norm": 4.348355770111084, | |
| "learning_rate": 1.0280434750951474e-06, | |
| "loss": 0.061251497268676756, | |
| "step": 32800, | |
| "token_acc": 0.9808127634339351 | |
| }, | |
| { | |
| "epoch": 1.4351145038167938, | |
| "grad_norm": 3.6465044021606445, | |
| "learning_rate": 1.0135071789382566e-06, | |
| "loss": 0.05879298210144043, | |
| "step": 32900, | |
| "token_acc": 0.9820062890640164 | |
| }, | |
| { | |
| "epoch": 1.4394765539803709, | |
| "grad_norm": 4.484274864196777, | |
| "learning_rate": 9.990482122024458e-07, | |
| "loss": 0.055512037277221676, | |
| "step": 33000, | |
| "token_acc": 0.9827889984324192 | |
| }, | |
| { | |
| "epoch": 1.4394765539803709, | |
| "eval_loss": 0.07081891596317291, | |
| "eval_runtime": 249.4133, | |
| "eval_samples_per_second": 59.419, | |
| "eval_steps_per_second": 0.93, | |
| "eval_token_acc": 0.9751492220334437, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.4438386041439477, | |
| "grad_norm": 3.2281529903411865, | |
| "learning_rate": 9.84667327063245e-07, | |
| "loss": 0.05788352012634277, | |
| "step": 33100, | |
| "token_acc": 0.9818554954720177 | |
| }, | |
| { | |
| "epoch": 1.4482006543075245, | |
| "grad_norm": 5.004483222961426, | |
| "learning_rate": 9.70365271634271e-07, | |
| "loss": 0.0612343692779541, | |
| "step": 33200, | |
| "token_acc": 0.9810140790866233 | |
| }, | |
| { | |
| "epoch": 1.4525627044711014, | |
| "grad_norm": 3.850135087966919, | |
| "learning_rate": 9.561427899283125e-07, | |
| "loss": 0.05688544273376465, | |
| "step": 33300, | |
| "token_acc": 0.9823226364637407 | |
| }, | |
| { | |
| "epoch": 1.4569247546346782, | |
| "grad_norm": 3.7914345264434814, | |
| "learning_rate": 9.420006218186198e-07, | |
| "loss": 0.057846450805664064, | |
| "step": 33400, | |
| "token_acc": 0.9820036576349888 | |
| }, | |
| { | |
| "epoch": 1.4612868047982552, | |
| "grad_norm": 2.471034526824951, | |
| "learning_rate": 9.27939503000419e-07, | |
| "loss": 0.05780147552490234, | |
| "step": 33500, | |
| "token_acc": 0.9819559772443386 | |
| }, | |
| { | |
| "epoch": 1.465648854961832, | |
| "grad_norm": 4.725645542144775, | |
| "learning_rate": 9.139601649526416e-07, | |
| "loss": 0.059658441543579105, | |
| "step": 33600, | |
| "token_acc": 0.9816973978417965 | |
| }, | |
| { | |
| "epoch": 1.4700109051254089, | |
| "grad_norm": 3.542151927947998, | |
| "learning_rate": 9.000633348998669e-07, | |
| "loss": 0.05712653636932373, | |
| "step": 33700, | |
| "token_acc": 0.9819699628907641 | |
| }, | |
| { | |
| "epoch": 1.474372955288986, | |
| "grad_norm": 4.53084135055542, | |
| "learning_rate": 8.862497357744945e-07, | |
| "loss": 0.05675581455230713, | |
| "step": 33800, | |
| "token_acc": 0.9820234465526059 | |
| }, | |
| { | |
| "epoch": 1.4787350054525628, | |
| "grad_norm": 4.158478736877441, | |
| "learning_rate": 8.725200861791378e-07, | |
| "loss": 0.05835529804229736, | |
| "step": 33900, | |
| "token_acc": 0.9820663243996941 | |
| }, | |
| { | |
| "epoch": 1.4830970556161396, | |
| "grad_norm": 5.382080078125, | |
| "learning_rate": 8.58875100349236e-07, | |
| "loss": 0.05506091117858887, | |
| "step": 34000, | |
| "token_acc": 0.9825372133740343 | |
| }, | |
| { | |
| "epoch": 1.4830970556161396, | |
| "eval_loss": 0.06877533346414566, | |
| "eval_runtime": 254.8617, | |
| "eval_samples_per_second": 58.149, | |
| "eval_steps_per_second": 0.91, | |
| "eval_token_acc": 0.9760518197263703, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.4874591057797164, | |
| "grad_norm": 4.1885528564453125, | |
| "learning_rate": 8.453154881159023e-07, | |
| "loss": 0.06418917655944824, | |
| "step": 34100, | |
| "token_acc": 0.9805802465917681 | |
| }, | |
| { | |
| "epoch": 1.4918211559432932, | |
| "grad_norm": 3.279444932937622, | |
| "learning_rate": 8.318419548689993e-07, | |
| "loss": 0.05651405334472656, | |
| "step": 34200, | |
| "token_acc": 0.9823017340079185 | |
| }, | |
| { | |
| "epoch": 1.4961832061068703, | |
| "grad_norm": 3.4269838333129883, | |
| "learning_rate": 8.184552015204383e-07, | |
| "loss": 0.05106703758239746, | |
| "step": 34300, | |
| "token_acc": 0.9836486664038891 | |
| }, | |
| { | |
| "epoch": 1.500545256270447, | |
| "grad_norm": 3.865947723388672, | |
| "learning_rate": 8.051559244677199e-07, | |
| "loss": 0.0522431755065918, | |
| "step": 34400, | |
| "token_acc": 0.9838395317031918 | |
| }, | |
| { | |
| "epoch": 1.5049073064340242, | |
| "grad_norm": 6.236225605010986, | |
| "learning_rate": 7.919448155577089e-07, | |
| "loss": 0.05624629020690918, | |
| "step": 34500, | |
| "token_acc": 0.9829597809300092 | |
| }, | |
| { | |
| "epoch": 1.509269356597601, | |
| "grad_norm": 6.7572431564331055, | |
| "learning_rate": 7.788225620506384e-07, | |
| "loss": 0.054970383644104004, | |
| "step": 34600, | |
| "token_acc": 0.9824237722392088 | |
| }, | |
| { | |
| "epoch": 1.5136314067611778, | |
| "grad_norm": 4.074737071990967, | |
| "learning_rate": 7.657898465843599e-07, | |
| "loss": 0.054400978088378904, | |
| "step": 34700, | |
| "token_acc": 0.9830726460195386 | |
| }, | |
| { | |
| "epoch": 1.5179934569247546, | |
| "grad_norm": 4.249269485473633, | |
| "learning_rate": 7.528473471388342e-07, | |
| "loss": 0.05802037239074707, | |
| "step": 34800, | |
| "token_acc": 0.9820589628377946 | |
| }, | |
| { | |
| "epoch": 1.5223555070883314, | |
| "grad_norm": 4.001960277557373, | |
| "learning_rate": 7.399957370008568e-07, | |
| "loss": 0.0512929630279541, | |
| "step": 34900, | |
| "token_acc": 0.9846466805286435 | |
| }, | |
| { | |
| "epoch": 1.5267175572519083, | |
| "grad_norm": 4.0334153175354, | |
| "learning_rate": 7.272356847290357e-07, | |
| "loss": 0.054765453338623045, | |
| "step": 35000, | |
| "token_acc": 0.9834754901641374 | |
| }, | |
| { | |
| "epoch": 1.5267175572519083, | |
| "eval_loss": 0.06686375290155411, | |
| "eval_runtime": 249.712, | |
| "eval_samples_per_second": 59.348, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9764514218009479, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.5310796074154853, | |
| "grad_norm": 3.1412196159362793, | |
| "learning_rate": 7.145678541190132e-07, | |
| "loss": 0.05724092483520508, | |
| "step": 35100, | |
| "token_acc": 0.982305148733112 | |
| }, | |
| { | |
| "epoch": 1.5354416575790621, | |
| "grad_norm": 4.317452430725098, | |
| "learning_rate": 7.019929041689308e-07, | |
| "loss": 0.056807117462158205, | |
| "step": 35200, | |
| "token_acc": 0.9833986233324803 | |
| }, | |
| { | |
| "epoch": 1.5398037077426392, | |
| "grad_norm": 2.5507566928863525, | |
| "learning_rate": 6.895114890451487e-07, | |
| "loss": 0.05457932472229004, | |
| "step": 35300, | |
| "token_acc": 0.9831009055880454 | |
| }, | |
| { | |
| "epoch": 1.544165757906216, | |
| "grad_norm": 3.851468563079834, | |
| "learning_rate": 6.771242580482188e-07, | |
| "loss": 0.055332069396972654, | |
| "step": 35400, | |
| "token_acc": 0.9831133615205685 | |
| }, | |
| { | |
| "epoch": 1.5485278080697928, | |
| "grad_norm": 5.143461227416992, | |
| "learning_rate": 6.648318555791e-07, | |
| "loss": 0.05229470252990723, | |
| "step": 35500, | |
| "token_acc": 0.9841282328645885 | |
| }, | |
| { | |
| "epoch": 1.5528898582333697, | |
| "grad_norm": 3.2583236694335938, | |
| "learning_rate": 6.526349211056409e-07, | |
| "loss": 0.05265366554260254, | |
| "step": 35600, | |
| "token_acc": 0.9836261310683998 | |
| }, | |
| { | |
| "epoch": 1.5572519083969465, | |
| "grad_norm": 3.030510187149048, | |
| "learning_rate": 6.405340891293143e-07, | |
| "loss": 0.05120317459106445, | |
| "step": 35700, | |
| "token_acc": 0.9843944043261996 | |
| }, | |
| { | |
| "epoch": 1.5616139585605233, | |
| "grad_norm": 3.0830142498016357, | |
| "learning_rate": 6.285299891522048e-07, | |
| "loss": 0.059448165893554686, | |
| "step": 35800, | |
| "token_acc": 0.9815313972588839 | |
| }, | |
| { | |
| "epoch": 1.5659760087241004, | |
| "grad_norm": 3.755048990249634, | |
| "learning_rate": 6.166232456442645e-07, | |
| "loss": 0.05295629024505615, | |
| "step": 35900, | |
| "token_acc": 0.9834888356480129 | |
| }, | |
| { | |
| "epoch": 1.5703380588876772, | |
| "grad_norm": 4.288339138031006, | |
| "learning_rate": 6.048144780108289e-07, | |
| "loss": 0.05355148792266846, | |
| "step": 36000, | |
| "token_acc": 0.9836604652353047 | |
| }, | |
| { | |
| "epoch": 1.5703380588876772, | |
| "eval_loss": 0.06514008343219757, | |
| "eval_runtime": 249.5354, | |
| "eval_samples_per_second": 59.39, | |
| "eval_steps_per_second": 0.93, | |
| "eval_token_acc": 0.9770801663238845, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.5747001090512542, | |
| "grad_norm": 7.691909313201904, | |
| "learning_rate": 5.931043005603907e-07, | |
| "loss": 0.053992023468017576, | |
| "step": 36100, | |
| "token_acc": 0.9830635714701752 | |
| }, | |
| { | |
| "epoch": 1.579062159214831, | |
| "grad_norm": 3.0412437915802, | |
| "learning_rate": 5.814933224726432e-07, | |
| "loss": 0.052196903228759764, | |
| "step": 36200, | |
| "token_acc": 0.9839295311250639 | |
| }, | |
| { | |
| "epoch": 1.5834242093784079, | |
| "grad_norm": 4.929081916809082, | |
| "learning_rate": 5.699821477667947e-07, | |
| "loss": 0.05482297897338867, | |
| "step": 36300, | |
| "token_acc": 0.9832719951365237 | |
| }, | |
| { | |
| "epoch": 1.5877862595419847, | |
| "grad_norm": 4.209242820739746, | |
| "learning_rate": 5.585713752701394e-07, | |
| "loss": 0.05226144790649414, | |
| "step": 36400, | |
| "token_acc": 0.9841242428312833 | |
| }, | |
| { | |
| "epoch": 1.5921483097055615, | |
| "grad_norm": 2.3831539154052734, | |
| "learning_rate": 5.472615985869104e-07, | |
| "loss": 0.0528890323638916, | |
| "step": 36500, | |
| "token_acc": 0.9840980187695516 | |
| }, | |
| { | |
| "epoch": 1.5965103598691384, | |
| "grad_norm": 3.2346951961517334, | |
| "learning_rate": 5.360534060673994e-07, | |
| "loss": 0.05202345371246338, | |
| "step": 36600, | |
| "token_acc": 0.9840068698790669 | |
| }, | |
| { | |
| "epoch": 1.6008724100327154, | |
| "grad_norm": 2.5527503490448, | |
| "learning_rate": 5.249473807773472e-07, | |
| "loss": 0.05285400390625, | |
| "step": 36700, | |
| "token_acc": 0.9839257650748936 | |
| }, | |
| { | |
| "epoch": 1.6052344601962922, | |
| "grad_norm": 1.54398775100708, | |
| "learning_rate": 5.139441004676138e-07, | |
| "loss": 0.05471937656402588, | |
| "step": 36800, | |
| "token_acc": 0.9833169259040494 | |
| }, | |
| { | |
| "epoch": 1.6095965103598693, | |
| "grad_norm": 3.731952667236328, | |
| "learning_rate": 5.030441375441239e-07, | |
| "loss": 0.05202892303466797, | |
| "step": 36900, | |
| "token_acc": 0.9836451150625598 | |
| }, | |
| { | |
| "epoch": 1.613958560523446, | |
| "grad_norm": 5.551258563995361, | |
| "learning_rate": 4.922480590380865e-07, | |
| "loss": 0.054112329483032226, | |
| "step": 37000, | |
| "token_acc": 0.9834300729468978 | |
| }, | |
| { | |
| "epoch": 1.613958560523446, | |
| "eval_loss": 0.06393582373857498, | |
| "eval_runtime": 249.8278, | |
| "eval_samples_per_second": 59.321, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9774182911562193, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.618320610687023, | |
| "grad_norm": 3.7928690910339355, | |
| "learning_rate": 4.815564265764994e-07, | |
| "loss": 0.05264884948730469, | |
| "step": 37100, | |
| "token_acc": 0.9840745348053698 | |
| }, | |
| { | |
| "epoch": 1.6226826608505998, | |
| "grad_norm": 5.02518367767334, | |
| "learning_rate": 4.709697963529333e-07, | |
| "loss": 0.056769208908081056, | |
| "step": 37200, | |
| "token_acc": 0.9824920285478644 | |
| }, | |
| { | |
| "epoch": 1.6270447110141766, | |
| "grad_norm": 3.828500747680664, | |
| "learning_rate": 4.604887190985957e-07, | |
| "loss": 0.055812931060791014, | |
| "step": 37300, | |
| "token_acc": 0.9835446977359371 | |
| }, | |
| { | |
| "epoch": 1.6314067611777534, | |
| "grad_norm": 2.125394821166992, | |
| "learning_rate": 4.5011374005368147e-07, | |
| "loss": 0.051757588386535644, | |
| "step": 37400, | |
| "token_acc": 0.9842062000939408 | |
| }, | |
| { | |
| "epoch": 1.6357688113413305, | |
| "grad_norm": 2.8596160411834717, | |
| "learning_rate": 4.3984539893900976e-07, | |
| "loss": 0.0530195140838623, | |
| "step": 37500, | |
| "token_acc": 0.9834978568645278 | |
| }, | |
| { | |
| "epoch": 1.6401308615049073, | |
| "grad_norm": 3.184962034225464, | |
| "learning_rate": 4.296842299279469e-07, | |
| "loss": 0.05626538276672363, | |
| "step": 37600, | |
| "token_acc": 0.9829366662234632 | |
| }, | |
| { | |
| "epoch": 1.6444929116684843, | |
| "grad_norm": 2.6302707195281982, | |
| "learning_rate": 4.196307616186185e-07, | |
| "loss": 0.053105955123901365, | |
| "step": 37700, | |
| "token_acc": 0.9834460881105752 | |
| }, | |
| { | |
| "epoch": 1.6488549618320612, | |
| "grad_norm": 3.290652275085449, | |
| "learning_rate": 4.0968551700640654e-07, | |
| "loss": 0.05122368812561035, | |
| "step": 37800, | |
| "token_acc": 0.9839382289698312 | |
| }, | |
| { | |
| "epoch": 1.653217011995638, | |
| "grad_norm": 6.050076484680176, | |
| "learning_rate": 3.998490134567498e-07, | |
| "loss": 0.04652623176574707, | |
| "step": 37900, | |
| "token_acc": 0.9856113210284411 | |
| }, | |
| { | |
| "epoch": 1.6575790621592148, | |
| "grad_norm": 4.384199142456055, | |
| "learning_rate": 3.90121762678225e-07, | |
| "loss": 0.05324809551239014, | |
| "step": 38000, | |
| "token_acc": 0.9837580092385636 | |
| }, | |
| { | |
| "epoch": 1.6575790621592148, | |
| "eval_loss": 0.06297693401575089, | |
| "eval_runtime": 249.685, | |
| "eval_samples_per_second": 59.355, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9780330635786462, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.6619411123227916, | |
| "grad_norm": 4.2886433601379395, | |
| "learning_rate": 3.8050427069592526e-07, | |
| "loss": 0.04979161262512207, | |
| "step": 38100, | |
| "token_acc": 0.9845926138438266 | |
| }, | |
| { | |
| "epoch": 1.6663031624863684, | |
| "grad_norm": 3.2638423442840576, | |
| "learning_rate": 3.7099703782514236e-07, | |
| "loss": 0.05182206630706787, | |
| "step": 38200, | |
| "token_acc": 0.9839714186037497 | |
| }, | |
| { | |
| "epoch": 1.6706652126499455, | |
| "grad_norm": 3.262000799179077, | |
| "learning_rate": 3.6160055864533526e-07, | |
| "loss": 0.05675747871398926, | |
| "step": 38300, | |
| "token_acc": 0.9826982452279391 | |
| }, | |
| { | |
| "epoch": 1.6750272628135223, | |
| "grad_norm": 3.224867820739746, | |
| "learning_rate": 3.5231532197439884e-07, | |
| "loss": 0.0510973072052002, | |
| "step": 38400, | |
| "token_acc": 0.9846604806212441 | |
| }, | |
| { | |
| "epoch": 1.6793893129770994, | |
| "grad_norm": 7.291615009307861, | |
| "learning_rate": 3.4314181084324215e-07, | |
| "loss": 0.04996752738952637, | |
| "step": 38500, | |
| "token_acc": 0.9848244576449638 | |
| }, | |
| { | |
| "epoch": 1.6837513631406762, | |
| "grad_norm": 3.04451060295105, | |
| "learning_rate": 3.340805024706559e-07, | |
| "loss": 0.0533557939529419, | |
| "step": 38600, | |
| "token_acc": 0.9838469152943294 | |
| }, | |
| { | |
| "epoch": 1.688113413304253, | |
| "grad_norm": 3.626009702682495, | |
| "learning_rate": 3.251318682384838e-07, | |
| "loss": 0.05408650398254394, | |
| "step": 38700, | |
| "token_acc": 0.983619685023991 | |
| }, | |
| { | |
| "epoch": 1.6924754634678298, | |
| "grad_norm": 7.859470367431641, | |
| "learning_rate": 3.1629637366710877e-07, | |
| "loss": 0.051053762435913086, | |
| "step": 38800, | |
| "token_acc": 0.9844172007416103 | |
| }, | |
| { | |
| "epoch": 1.6968375136314067, | |
| "grad_norm": 1.7803997993469238, | |
| "learning_rate": 3.075744783912304e-07, | |
| "loss": 0.0526139497756958, | |
| "step": 38900, | |
| "token_acc": 0.9835836512403247 | |
| }, | |
| { | |
| "epoch": 1.7011995637949835, | |
| "grad_norm": 2.8114235401153564, | |
| "learning_rate": 2.989666361359525e-07, | |
| "loss": 0.05366787433624268, | |
| "step": 39000, | |
| "token_acc": 0.9837218445616883 | |
| }, | |
| { | |
| "epoch": 1.7011995637949835, | |
| "eval_loss": 0.061601582914590836, | |
| "eval_runtime": 255.8728, | |
| "eval_samples_per_second": 57.919, | |
| "eval_steps_per_second": 0.907, | |
| "eval_token_acc": 0.9786226862201556, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.7055616139585605, | |
| "grad_norm": 10.10936450958252, | |
| "learning_rate": 2.9047329469318515e-07, | |
| "loss": 0.05378860473632813, | |
| "step": 39100, | |
| "token_acc": 0.9834365733325614 | |
| }, | |
| { | |
| "epoch": 1.7099236641221374, | |
| "grad_norm": 9.03071403503418, | |
| "learning_rate": 2.8209489589834713e-07, | |
| "loss": 0.050548648834228514, | |
| "step": 39200, | |
| "token_acc": 0.9842391860108396 | |
| }, | |
| { | |
| "epoch": 1.7142857142857144, | |
| "grad_norm": 2.1598613262176514, | |
| "learning_rate": 2.7383187560737825e-07, | |
| "loss": 0.05034954071044922, | |
| "step": 39300, | |
| "token_acc": 0.9845266314810692 | |
| }, | |
| { | |
| "epoch": 1.7186477644492912, | |
| "grad_norm": 4.462788105010986, | |
| "learning_rate": 2.656846636740712e-07, | |
| "loss": 0.051555233001708986, | |
| "step": 39400, | |
| "token_acc": 0.9836437894791176 | |
| }, | |
| { | |
| "epoch": 1.723009814612868, | |
| "grad_norm": 3.6997790336608887, | |
| "learning_rate": 2.576536839277069e-07, | |
| "loss": 0.05385732650756836, | |
| "step": 39500, | |
| "token_acc": 0.9834032885363317 | |
| }, | |
| { | |
| "epoch": 1.7273718647764449, | |
| "grad_norm": 4.76052713394165, | |
| "learning_rate": 2.497393541510046e-07, | |
| "loss": 0.05027077674865723, | |
| "step": 39600, | |
| "token_acc": 0.9847386493704693 | |
| }, | |
| { | |
| "epoch": 1.7317339149400217, | |
| "grad_norm": 5.727189064025879, | |
| "learning_rate": 2.4194208605839096e-07, | |
| "loss": 0.04907273292541504, | |
| "step": 39700, | |
| "token_acc": 0.9850905973429936 | |
| }, | |
| { | |
| "epoch": 1.7360959651035985, | |
| "grad_norm": 3.36122989654541, | |
| "learning_rate": 2.342622852745824e-07, | |
| "loss": 0.04743415832519531, | |
| "step": 39800, | |
| "token_acc": 0.9850597151289282 | |
| }, | |
| { | |
| "epoch": 1.7404580152671756, | |
| "grad_norm": 3.0961196422576904, | |
| "learning_rate": 2.2670035131348135e-07, | |
| "loss": 0.04388811111450195, | |
| "step": 39900, | |
| "token_acc": 0.9863217267251401 | |
| }, | |
| { | |
| "epoch": 1.7448200654307524, | |
| "grad_norm": 2.886570692062378, | |
| "learning_rate": 2.1925667755739427e-07, | |
| "loss": 0.050310606956481936, | |
| "step": 40000, | |
| "token_acc": 0.9843371271999157 | |
| }, | |
| { | |
| "epoch": 1.7448200654307524, | |
| "eval_loss": 0.060765430331230164, | |
| "eval_runtime": 250.6299, | |
| "eval_samples_per_second": 59.131, | |
| "eval_steps_per_second": 0.926, | |
| "eval_token_acc": 0.978932866851471, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.7491821155943295, | |
| "grad_norm": 3.1743409633636475, | |
| "learning_rate": 2.119316512365696e-07, | |
| "loss": 0.04878216743469238, | |
| "step": 40100, | |
| "token_acc": 0.9853699903545512 | |
| }, | |
| { | |
| "epoch": 1.7535441657579063, | |
| "grad_norm": 2.7623131275177, | |
| "learning_rate": 2.0472565340904887e-07, | |
| "loss": 0.05069635391235352, | |
| "step": 40200, | |
| "token_acc": 0.9844571383958778 | |
| }, | |
| { | |
| "epoch": 1.757906215921483, | |
| "grad_norm": 4.026466369628906, | |
| "learning_rate": 1.9763905894084723e-07, | |
| "loss": 0.052156386375427244, | |
| "step": 40300, | |
| "token_acc": 0.9838101347017318 | |
| }, | |
| { | |
| "epoch": 1.76226826608506, | |
| "grad_norm": 2.6961493492126465, | |
| "learning_rate": 1.9067223648645188e-07, | |
| "loss": 0.04677755355834961, | |
| "step": 40400, | |
| "token_acc": 0.9858562442207259 | |
| }, | |
| { | |
| "epoch": 1.7666303162486368, | |
| "grad_norm": 3.7131080627441406, | |
| "learning_rate": 1.8382554846964218e-07, | |
| "loss": 0.049273710250854495, | |
| "step": 40500, | |
| "token_acc": 0.985027988651177 | |
| }, | |
| { | |
| "epoch": 1.7709923664122136, | |
| "grad_norm": 1.813375473022461, | |
| "learning_rate": 1.7709935106463792e-07, | |
| "loss": 0.05270167350769043, | |
| "step": 40600, | |
| "token_acc": 0.9837776105069173 | |
| }, | |
| { | |
| "epoch": 1.7753544165757906, | |
| "grad_norm": 4.872157096862793, | |
| "learning_rate": 1.7049399417757018e-07, | |
| "loss": 0.04908137321472168, | |
| "step": 40700, | |
| "token_acc": 0.9845669230221923 | |
| }, | |
| { | |
| "epoch": 1.7797164667393675, | |
| "grad_norm": 3.0371429920196533, | |
| "learning_rate": 1.6400982142827787e-07, | |
| "loss": 0.05342394828796387, | |
| "step": 40800, | |
| "token_acc": 0.9832475559923447 | |
| }, | |
| { | |
| "epoch": 1.7840785169029445, | |
| "grad_norm": 2.185772657394409, | |
| "learning_rate": 1.576471701324331e-07, | |
| "loss": 0.04955248832702637, | |
| "step": 40900, | |
| "token_acc": 0.9850666460511651 | |
| }, | |
| { | |
| "epoch": 1.7884405670665213, | |
| "grad_norm": 3.0958948135375977, | |
| "learning_rate": 1.514063712839925e-07, | |
| "loss": 0.04495782852172851, | |
| "step": 41000, | |
| "token_acc": 0.9864484417348619 | |
| }, | |
| { | |
| "epoch": 1.7884405670665213, | |
| "eval_loss": 0.060102950781583786, | |
| "eval_runtime": 250.0116, | |
| "eval_samples_per_second": 59.277, | |
| "eval_steps_per_second": 0.928, | |
| "eval_token_acc": 0.9792290753822767, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.7928026172300982, | |
| "grad_norm": 2.581108331680298, | |
| "learning_rate": 1.4528774953798092e-07, | |
| "loss": 0.049694914817810056, | |
| "step": 41100, | |
| "token_acc": 0.9852027649171952 | |
| }, | |
| { | |
| "epoch": 1.797164667393675, | |
| "grad_norm": 4.794500827789307, | |
| "learning_rate": 1.3929162319359868e-07, | |
| "loss": 0.052889041900634766, | |
| "step": 41200, | |
| "token_acc": 0.9838971151391371 | |
| }, | |
| { | |
| "epoch": 1.8015267175572518, | |
| "grad_norm": 3.068047523498535, | |
| "learning_rate": 1.3341830417766572e-07, | |
| "loss": 0.04843362331390381, | |
| "step": 41300, | |
| "token_acc": 0.9855183429921717 | |
| }, | |
| { | |
| "epoch": 1.8058887677208286, | |
| "grad_norm": 3.2265470027923584, | |
| "learning_rate": 1.2766809802839536e-07, | |
| "loss": 0.05105172157287598, | |
| "step": 41400, | |
| "token_acc": 0.9843682767213199 | |
| }, | |
| { | |
| "epoch": 1.8102508178844057, | |
| "grad_norm": 2.618612051010132, | |
| "learning_rate": 1.2204130387949735e-07, | |
| "loss": 0.04559971809387207, | |
| "step": 41500, | |
| "token_acc": 0.9859196891191709 | |
| }, | |
| { | |
| "epoch": 1.8146128680479825, | |
| "grad_norm": 2.534839153289795, | |
| "learning_rate": 1.1653821444461755e-07, | |
| "loss": 0.048215513229370115, | |
| "step": 41600, | |
| "token_acc": 0.9848556340531701 | |
| }, | |
| { | |
| "epoch": 1.8189749182115595, | |
| "grad_norm": 2.000978708267212, | |
| "learning_rate": 1.1115911600211194e-07, | |
| "loss": 0.04903024673461914, | |
| "step": 41700, | |
| "token_acc": 0.9847122828227678 | |
| }, | |
| { | |
| "epoch": 1.8233369683751364, | |
| "grad_norm": 3.0504517555236816, | |
| "learning_rate": 1.0590428838015255e-07, | |
| "loss": 0.05070067405700684, | |
| "step": 41800, | |
| "token_acc": 0.9843092181728956 | |
| }, | |
| { | |
| "epoch": 1.8276990185387132, | |
| "grad_norm": 3.5902857780456543, | |
| "learning_rate": 1.007740049421696e-07, | |
| "loss": 0.04867252349853515, | |
| "step": 41900, | |
| "token_acc": 0.985157748180839 | |
| }, | |
| { | |
| "epoch": 1.83206106870229, | |
| "grad_norm": 3.775128126144409, | |
| "learning_rate": 9.576853257263374e-08, | |
| "loss": 0.04645414352416992, | |
| "step": 42000, | |
| "token_acc": 0.9858936035130417 | |
| }, | |
| { | |
| "epoch": 1.83206106870229, | |
| "eval_loss": 0.05955477058887482, | |
| "eval_runtime": 253.7866, | |
| "eval_samples_per_second": 58.396, | |
| "eval_steps_per_second": 0.914, | |
| "eval_token_acc": 0.9794414513100241, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.8364231188658668, | |
| "grad_norm": 4.869281768798828, | |
| "learning_rate": 9.088813166316879e-08, | |
| "loss": 0.04662482261657715, | |
| "step": 42100, | |
| "token_acc": 0.985914570659612 | |
| }, | |
| { | |
| "epoch": 1.840785169029444, | |
| "grad_norm": 2.82537579536438, | |
| "learning_rate": 8.613305609900841e-08, | |
| "loss": 0.04755397796630859, | |
| "step": 42200, | |
| "token_acc": 0.9857374328685857 | |
| }, | |
| { | |
| "epoch": 1.8451472191930207, | |
| "grad_norm": 3.2263779640197754, | |
| "learning_rate": 8.150355324578807e-08, | |
| "loss": 0.04893510818481445, | |
| "step": 42300, | |
| "token_acc": 0.9853583504779954 | |
| }, | |
| { | |
| "epoch": 1.8495092693565978, | |
| "grad_norm": 3.73339581489563, | |
| "learning_rate": 7.699986393667535e-08, | |
| "loss": 0.04965659618377685, | |
| "step": 42400, | |
| "token_acc": 0.9852581619444843 | |
| }, | |
| { | |
| "epoch": 1.8538713195201746, | |
| "grad_norm": 5.45037317276001, | |
| "learning_rate": 7.262222245984329e-08, | |
| "loss": 0.05175324916839599, | |
| "step": 42500, | |
| "token_acc": 0.9840829382972983 | |
| }, | |
| { | |
| "epoch": 1.8582333696837514, | |
| "grad_norm": 2.7033138275146484, | |
| "learning_rate": 6.83708565462815e-08, | |
| "loss": 0.05096693515777588, | |
| "step": 42600, | |
| "token_acc": 0.9843310384171561 | |
| }, | |
| { | |
| "epoch": 1.8625954198473282, | |
| "grad_norm": 2.480644464492798, | |
| "learning_rate": 6.424598735794929e-08, | |
| "loss": 0.05035747051239014, | |
| "step": 42700, | |
| "token_acc": 0.9847126296208089 | |
| }, | |
| { | |
| "epoch": 1.866957470010905, | |
| "grad_norm": 6.001003265380859, | |
| "learning_rate": 6.024782947627039e-08, | |
| "loss": 0.05004034996032715, | |
| "step": 42800, | |
| "token_acc": 0.984943015304461 | |
| }, | |
| { | |
| "epoch": 1.8713195201744819, | |
| "grad_norm": 2.7746217250823975, | |
| "learning_rate": 5.6376590890971085e-08, | |
| "loss": 0.05255108833312988, | |
| "step": 42900, | |
| "token_acc": 0.9837951328137426 | |
| }, | |
| { | |
| "epoch": 1.875681570338059, | |
| "grad_norm": 3.266484022140503, | |
| "learning_rate": 5.2632472989259384e-08, | |
| "loss": 0.0456111478805542, | |
| "step": 43000, | |
| "token_acc": 0.9861970334118388 | |
| }, | |
| { | |
| "epoch": 1.875681570338059, | |
| "eval_loss": 0.05910694971680641, | |
| "eval_runtime": 249.7817, | |
| "eval_samples_per_second": 59.332, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9795839667352231, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.8800436205016358, | |
| "grad_norm": 2.7523422241210938, | |
| "learning_rate": 4.901567054534761e-08, | |
| "loss": 0.046754164695739744, | |
| "step": 43100, | |
| "token_acc": 0.985937347833144 | |
| }, | |
| { | |
| "epoch": 1.8844056706652128, | |
| "grad_norm": 4.6909003257751465, | |
| "learning_rate": 4.552637171032326e-08, | |
| "loss": 0.04618264198303223, | |
| "step": 43200, | |
| "token_acc": 0.9854022284965875 | |
| }, | |
| { | |
| "epoch": 1.8887677208287896, | |
| "grad_norm": 8.2206392288208, | |
| "learning_rate": 4.216475800235736e-08, | |
| "loss": 0.0508054256439209, | |
| "step": 43300, | |
| "token_acc": 0.9843883895493057 | |
| }, | |
| { | |
| "epoch": 1.8931297709923665, | |
| "grad_norm": 3.2640438079833984, | |
| "learning_rate": 3.8931004297263765e-08, | |
| "loss": 0.04786904811859131, | |
| "step": 43400, | |
| "token_acc": 0.985393875230365 | |
| }, | |
| { | |
| "epoch": 1.8974918211559433, | |
| "grad_norm": 1.9893529415130615, | |
| "learning_rate": 3.5825278819401344e-08, | |
| "loss": 0.047113409042358396, | |
| "step": 43500, | |
| "token_acc": 0.9858144453728849 | |
| }, | |
| { | |
| "epoch": 1.90185387131952, | |
| "grad_norm": 3.4239087104797363, | |
| "learning_rate": 3.284774313292327e-08, | |
| "loss": 0.04778111457824707, | |
| "step": 43600, | |
| "token_acc": 0.9853008490693761 | |
| }, | |
| { | |
| "epoch": 1.906215921483097, | |
| "grad_norm": 7.985106945037842, | |
| "learning_rate": 2.999855213337094e-08, | |
| "loss": 0.047200527191162106, | |
| "step": 43700, | |
| "token_acc": 0.9858358995324792 | |
| }, | |
| { | |
| "epoch": 1.910577971646674, | |
| "grad_norm": 2.5651211738586426, | |
| "learning_rate": 2.7277854039617356e-08, | |
| "loss": 0.047189741134643554, | |
| "step": 43800, | |
| "token_acc": 0.9852690961552147 | |
| }, | |
| { | |
| "epoch": 1.9149400218102508, | |
| "grad_norm": 4.144567489624023, | |
| "learning_rate": 2.4685790386156093e-08, | |
| "loss": 0.048973798751831055, | |
| "step": 43900, | |
| "token_acc": 0.9847606160190734 | |
| }, | |
| { | |
| "epoch": 1.9193020719738279, | |
| "grad_norm": 2.7335407733917236, | |
| "learning_rate": 2.222249601573828e-08, | |
| "loss": 0.045456886291503906, | |
| "step": 44000, | |
| "token_acc": 0.986095739252033 | |
| }, | |
| { | |
| "epoch": 1.9193020719738279, | |
| "eval_loss": 0.05891130864620209, | |
| "eval_runtime": 249.6153, | |
| "eval_samples_per_second": 59.371, | |
| "eval_steps_per_second": 0.929, | |
| "eval_token_acc": 0.9796035276759367, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.9236641221374047, | |
| "grad_norm": 2.71073317527771, | |
| "learning_rate": 1.9888099072358502e-08, | |
| "loss": 0.04772358894348144, | |
| "step": 44100, | |
| "token_acc": 0.9852039956985347 | |
| }, | |
| { | |
| "epoch": 1.9280261723009815, | |
| "grad_norm": 5.084836006164551, | |
| "learning_rate": 1.76827209945879e-08, | |
| "loss": 0.049145450592041014, | |
| "step": 44200, | |
| "token_acc": 0.985264099224324 | |
| }, | |
| { | |
| "epoch": 1.9323882224645583, | |
| "grad_norm": 2.6878726482391357, | |
| "learning_rate": 1.560647650925645e-08, | |
| "loss": 0.04851144790649414, | |
| "step": 44300, | |
| "token_acc": 0.9848727188638159 | |
| }, | |
| { | |
| "epoch": 1.9367502726281351, | |
| "grad_norm": 3.3305675983428955, | |
| "learning_rate": 1.3659473625486641e-08, | |
| "loss": 0.05027200222015381, | |
| "step": 44400, | |
| "token_acc": 0.9853740048258687 | |
| }, | |
| { | |
| "epoch": 1.941112322791712, | |
| "grad_norm": 3.962172508239746, | |
| "learning_rate": 1.1841813629072108e-08, | |
| "loss": 0.04904890060424805, | |
| "step": 44500, | |
| "token_acc": 0.984241933059261 | |
| }, | |
| { | |
| "epoch": 1.945474372955289, | |
| "grad_norm": 6.480011463165283, | |
| "learning_rate": 1.0153591077210479e-08, | |
| "loss": 0.044611949920654294, | |
| "step": 44600, | |
| "token_acc": 0.986387756168967 | |
| }, | |
| { | |
| "epoch": 1.9498364231188658, | |
| "grad_norm": 3.123936414718628, | |
| "learning_rate": 8.594893793583692e-09, | |
| "loss": 0.04712478637695312, | |
| "step": 44700, | |
| "token_acc": 0.9857043601373849 | |
| }, | |
| { | |
| "epoch": 1.954198473282443, | |
| "grad_norm": 3.129568099975586, | |
| "learning_rate": 7.165802863789151e-09, | |
| "loss": 0.047671728134155274, | |
| "step": 44800, | |
| "token_acc": 0.9856617238607475 | |
| }, | |
| { | |
| "epoch": 1.9585605234460197, | |
| "grad_norm": 4.54371452331543, | |
| "learning_rate": 5.866392631121709e-09, | |
| "loss": 0.04312769412994385, | |
| "step": 44900, | |
| "token_acc": 0.9869960142783764 | |
| }, | |
| { | |
| "epoch": 1.9629225736095965, | |
| "grad_norm": 6.765581130981445, | |
| "learning_rate": 4.6967306927067634e-09, | |
| "loss": 0.04882259845733643, | |
| "step": 45000, | |
| "token_acc": 0.9849781376518219 | |
| }, | |
| { | |
| "epoch": 1.9629225736095965, | |
| "eval_loss": 0.05884711444377899, | |
| "eval_runtime": 254.1874, | |
| "eval_samples_per_second": 58.303, | |
| "eval_steps_per_second": 0.913, | |
| "eval_token_acc": 0.9796482383975678, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.9672846237731734, | |
| "grad_norm": 3.8652217388153076, | |
| "learning_rate": 3.6568778959825137e-09, | |
| "loss": 0.04858424186706543, | |
| "step": 45100, | |
| "token_acc": 0.9854189208567443 | |
| }, | |
| { | |
| "epoch": 1.9716466739367502, | |
| "grad_norm": 4.594508171081543, | |
| "learning_rate": 2.746888335535547e-09, | |
| "loss": 0.045993332862854, | |
| "step": 45200, | |
| "token_acc": 0.985808285313814 | |
| }, | |
| { | |
| "epoch": 1.976008724100327, | |
| "grad_norm": 6.799201965332031, | |
| "learning_rate": 1.96680935028698e-09, | |
| "loss": 0.04716442108154297, | |
| "step": 45300, | |
| "token_acc": 0.9854687663492728 | |
| }, | |
| { | |
| "epoch": 1.980370774263904, | |
| "grad_norm": 3.836419105529785, | |
| "learning_rate": 1.316681521028873e-09, | |
| "loss": 0.04726198196411133, | |
| "step": 45400, | |
| "token_acc": 0.9852152973770747 | |
| }, | |
| { | |
| "epoch": 1.984732824427481, | |
| "grad_norm": 6.797372341156006, | |
| "learning_rate": 7.965386683139731e-10, | |
| "loss": 0.050753302574157715, | |
| "step": 45500, | |
| "token_acc": 0.984968922406266 | |
| }, | |
| { | |
| "epoch": 1.989094874591058, | |
| "grad_norm": 3.1903374195098877, | |
| "learning_rate": 4.0640785069545606e-10, | |
| "loss": 0.04919458389282227, | |
| "step": 45600, | |
| "token_acc": 0.9844628013569153 | |
| }, | |
| { | |
| "epoch": 1.9934569247546348, | |
| "grad_norm": 2.859959363937378, | |
| "learning_rate": 1.4630936332082902e-10, | |
| "loss": 0.04802582263946533, | |
| "step": 45700, | |
| "token_acc": 0.9855202858400435 | |
| }, | |
| { | |
| "epoch": 1.9978189749182116, | |
| "grad_norm": 5.337630748748779, | |
| "learning_rate": 1.6256736874442624e-11, | |
| "loss": 0.04464107036590576, | |
| "step": 45800, | |
| "token_acc": 0.9865233937765838 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.05883299186825752, | |
| "eval_runtime": 250.142, | |
| "eval_samples_per_second": 59.246, | |
| "eval_steps_per_second": 0.927, | |
| "eval_token_acc": 0.9796258830367522, | |
| "step": 45850 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 45850, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.039735006810985e+23, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
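
For reference, the `log_history` array above can be read back with the standard `json` module to inspect the loss curve or locate the best evaluation step. The snippet below is a minimal sketch, not part of the original training script; the file path `trainer_state.json` is an assumption, and the printed summary is only one way to consume this state file.

```python
import json

# Load the trainer state shown above (path is an assumption; adjust as needed).
with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

# Split log_history into training-loss entries and evaluation entries.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

# The evaluation step with the lowest eval_loss; for this run it should match
# best_global_step (45850) and best_metric (~0.05883) recorded at the top of the file.
best = min(eval_logs, key=lambda e: e["eval_loss"])
print(f"best eval step: {best['step']}, "
      f"eval_loss: {best['eval_loss']:.5f}, "
      f"eval_token_acc: {best['eval_token_acc']:.4f}")

# Last logged training entry before the final evaluation.
last = train_logs[-1]
print(f"final train step: {last['step']}, "
      f"loss: {last['loss']:.5f}, "
      f"token_acc: {last['token_acc']:.4f}")
```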