diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,5198 +3,4807 @@ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, - "eval_steps": 500, - "global_step": 60267, + "eval_steps": 300, + "global_step": 49697, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0016593035073527887, - "grad_norm": 1.5118614435195923, - "learning_rate": 1.6417910447761196e-06, - "loss": 1.4236, + "epoch": 0.002012224262394044, + "grad_norm": 1.5522648096084595, + "learning_rate": 1.9919517102615694e-06, + "loss": 1.2489, "step": 100 }, { - "epoch": 0.0033186070147055774, - "grad_norm": 1.5862103700637817, - "learning_rate": 3.3001658374792705e-06, - "loss": 1.319, + "epoch": 0.004024448524788088, + "grad_norm": 1.59534752368927, + "learning_rate": 4.0040241448692155e-06, + "loss": 1.1249, "step": 200 }, { - "epoch": 0.004977910522058366, - "grad_norm": 1.423666000366211, - "learning_rate": 4.9585406301824215e-06, - "loss": 1.2717, + "epoch": 0.006036672787182132, + "grad_norm": 1.5959556102752686, + "learning_rate": 6.016096579476862e-06, + "loss": 1.0698, "step": 300 }, { - "epoch": 0.006637214029411155, - "grad_norm": 1.4161261320114136, - "learning_rate": 6.6169154228855725e-06, - "loss": 1.25, + "epoch": 0.006036672787182132, + "eval_loss": 0.9532507061958313, + "eval_runtime": 11.1234, + "eval_samples_per_second": 33.982, + "eval_steps_per_second": 1.169, + "step": 300 + }, + { + "epoch": 0.008048897049576176, + "grad_norm": 1.6886017322540283, + "learning_rate": 8.028169014084509e-06, + "loss": 1.0365, "step": 400 }, { - "epoch": 0.008296517536763944, - "grad_norm": 1.3309650421142578, - "learning_rate": 8.275290215588724e-06, - "loss": 1.2317, + "epoch": 0.01006112131197022, + "grad_norm": 1.421373963356018, + "learning_rate": 1.0040241448692154e-05, + "loss": 1.0155, "step": 500 }, { - "epoch": 0.008296517536763944, - "eval_loss": 0.8697910904884338, - "eval_runtime": 11.1786, - "eval_samples_per_second": 33.815, - "eval_steps_per_second": 1.163, - "step": 500 + "epoch": 0.012073345574364264, + "grad_norm": 1.578765630722046, + "learning_rate": 1.20523138832998e-05, + "loss": 1.006, + "step": 600 }, { - "epoch": 0.009955821044116731, - "grad_norm": 1.5689911842346191, - "learning_rate": 9.933665008291875e-06, - "loss": 1.2111, + "epoch": 0.012073345574364264, + "eval_loss": 0.8175720572471619, + "eval_runtime": 11.5611, + "eval_samples_per_second": 32.696, + "eval_steps_per_second": 1.124, "step": 600 }, { - "epoch": 0.01161512455146952, - "grad_norm": 1.638684868812561, - "learning_rate": 1.1592039800995025e-05, - "loss": 1.2117, + "epoch": 0.014085569836758306, + "grad_norm": 1.1900346279144287, + "learning_rate": 1.4064386317907446e-05, + "loss": 0.9972, "step": 700 }, { - "epoch": 0.01327442805882231, - "grad_norm": 1.5162866115570068, - "learning_rate": 1.3250414593698177e-05, - "loss": 1.2043, + "epoch": 0.01609779409915235, + "grad_norm": 1.4590531587600708, + "learning_rate": 1.607645875251509e-05, + "loss": 0.9895, "step": 800 }, { - "epoch": 0.014933731566175099, - "grad_norm": 1.0932368040084839, - "learning_rate": 1.4908789386401328e-05, - "loss": 1.213, + "epoch": 0.018110018361546396, + "grad_norm": 1.8518555164337158, + "learning_rate": 1.8088531187122737e-05, + "loss": 0.9718, "step": 900 }, { - "epoch": 0.016593035073527888, - "grad_norm": 1.2268598079681396, - "learning_rate": 1.6567164179104477e-05, - "loss": 1.1887, - "step": 1000 + "epoch": 0.018110018361546396, + "eval_loss": 0.781577467918396, + "eval_runtime": 11.4278, + "eval_samples_per_second": 33.077, + "eval_steps_per_second": 1.138, + "step": 900 }, { - "epoch": 0.016593035073527888, - "eval_loss": 0.7974580526351929, - "eval_runtime": 15.4385, - "eval_samples_per_second": 24.484, - "eval_steps_per_second": 0.842, + "epoch": 0.02012224262394044, + "grad_norm": 1.351710319519043, + "learning_rate": 1.999999947988626e-05, + "loss": 0.972, "step": 1000 }, { - "epoch": 0.018252338580880677, - "grad_norm": 1.2971336841583252, - "learning_rate": 1.822553897180763e-05, - "loss": 1.2009, + "epoch": 0.022134466886334483, + "grad_norm": 1.2841336727142334, + "learning_rate": 1.9999770630715236e-05, + "loss": 0.9662, "step": 1100 }, { - "epoch": 0.019911642088233462, - "grad_norm": 1.2302576303482056, - "learning_rate": 1.988391376451078e-05, - "loss": 1.1941, + "epoch": 0.024146691148728527, + "grad_norm": 1.2296431064605713, + "learning_rate": 1.9999125701534677e-05, + "loss": 0.9578, + "step": 1200 + }, + { + "epoch": 0.024146691148728527, + "eval_loss": 0.9337042570114136, + "eval_runtime": 11.324, + "eval_samples_per_second": 33.38, + "eval_steps_per_second": 1.148, "step": 1200 }, { - "epoch": 0.02157094559558625, - "grad_norm": 1.3103522062301636, - "learning_rate": 1.999987764177474e-05, - "loss": 1.1896, + "epoch": 0.026158915411122568, + "grad_norm": 1.3778767585754395, + "learning_rate": 1.9998064719179408e-05, + "loss": 0.9614, "step": 1300 }, { - "epoch": 0.02323024910293904, - "grad_norm": 1.4299105405807495, - "learning_rate": 1.999947303840931e-05, - "loss": 1.1872, + "epoch": 0.028171139673516612, + "grad_norm": 1.3921650648117065, + "learning_rate": 1.9996587727795803e-05, + "loss": 0.9541, "step": 1400 }, { - "epoch": 0.02488955261029183, - "grad_norm": 1.1204850673675537, - "learning_rate": 1.999878550814255e-05, - "loss": 1.1855, + "epoch": 0.030183363935910656, + "grad_norm": 1.3527588844299316, + "learning_rate": 1.9994694788839924e-05, + "loss": 0.9488, "step": 1500 }, { - "epoch": 0.02488955261029183, - "eval_loss": 0.7359133958816528, - "eval_runtime": 11.3115, - "eval_samples_per_second": 33.417, - "eval_steps_per_second": 1.149, + "epoch": 0.030183363935910656, + "eval_loss": 0.7029635310173035, + "eval_runtime": 11.506, + "eval_samples_per_second": 32.853, + "eval_steps_per_second": 1.13, "step": 1500 }, { - "epoch": 0.02654885611764462, - "grad_norm": 1.017000436782837, - "learning_rate": 1.9997815070427582e-05, - "loss": 1.1796, + "epoch": 0.0321955881983047, + "grad_norm": 0.8907983303070068, + "learning_rate": 1.9992385981074994e-05, + "loss": 0.9418, "step": 1600 }, { - "epoch": 0.028208159624997408, - "grad_norm": 1.1664379835128784, - "learning_rate": 1.999656175272214e-05, - "loss": 1.1736, + "epoch": 0.03420781246069875, + "grad_norm": 1.136816382408142, + "learning_rate": 1.998966140056808e-05, + "loss": 0.946, "step": 1700 }, { - "epoch": 0.029867463132350197, - "grad_norm": 1.0235024690628052, - "learning_rate": 1.9995025590487822e-05, - "loss": 1.1775, + "epoch": 0.03622003672309279, + "grad_norm": 1.0668370723724365, + "learning_rate": 1.9986521160686134e-05, + "loss": 0.9357, + "step": 1800 + }, + { + "epoch": 0.03622003672309279, + "eval_loss": 0.68252032995224, + "eval_runtime": 11.3032, + "eval_samples_per_second": 33.442, + "eval_steps_per_second": 1.15, "step": 1800 }, { - "epoch": 0.03152676663970298, - "grad_norm": 1.0118759870529175, - "learning_rate": 1.9993206627189083e-05, - "loss": 1.183, + "epoch": 0.038232260985486835, + "grad_norm": 0.8517168760299683, + "learning_rate": 1.9982965392091262e-05, + "loss": 0.936, "step": 1900 }, { - "epoch": 0.033186070147055775, - "grad_norm": 1.0136981010437012, - "learning_rate": 1.9991104914291998e-05, - "loss": 1.157, + "epoch": 0.04024448524788088, + "grad_norm": 1.0746815204620361, + "learning_rate": 1.9978994242735275e-05, + "loss": 0.9384, "step": 2000 }, { - "epoch": 0.033186070147055775, - "eval_loss": 0.7383748888969421, - "eval_runtime": 11.4221, - "eval_samples_per_second": 33.094, - "eval_steps_per_second": 1.138, - "step": 2000 + "epoch": 0.04225670951027492, + "grad_norm": 1.0119695663452148, + "learning_rate": 1.9974607877853555e-05, + "loss": 0.9252, + "step": 2100 }, { - "epoch": 0.03484537365440856, - "grad_norm": 0.9680323004722595, - "learning_rate": 1.998872051126281e-05, - "loss": 1.1549, + "epoch": 0.04225670951027492, + "eval_loss": 0.672024130821228, + "eval_runtime": 11.3298, + "eval_samples_per_second": 33.363, + "eval_steps_per_second": 1.147, "step": 2100 }, { - "epoch": 0.036504677161761354, - "grad_norm": 1.176108717918396, - "learning_rate": 1.9986053485566255e-05, - "loss": 1.1532, + "epoch": 0.04426893377266897, + "grad_norm": 0.7535356283187866, + "learning_rate": 1.9969806479958154e-05, + "loss": 0.9215, "step": 2200 }, { - "epoch": 0.03816398066911414, - "grad_norm": 1.0318026542663574, - "learning_rate": 1.998310391266364e-05, - "loss": 1.1662, + "epoch": 0.04628115803506301, + "grad_norm": 0.837115466594696, + "learning_rate": 1.996459024883023e-05, + "loss": 0.9229, "step": 2300 }, { - "epoch": 0.039823284176466925, - "grad_norm": 0.843786358833313, - "learning_rate": 1.997987187601071e-05, - "loss": 1.1537, + "epoch": 0.048293382297457055, + "grad_norm": 0.9772033095359802, + "learning_rate": 1.995895940151171e-05, + "loss": 0.9155, "step": 2400 }, { - "epoch": 0.04148258768381972, - "grad_norm": 0.9860417246818542, - "learning_rate": 1.9976357467055302e-05, - "loss": 1.1569, - "step": 2500 + "epoch": 0.048293382297457055, + "eval_loss": 0.6609585285186768, + "eval_runtime": 11.8546, + "eval_samples_per_second": 31.886, + "eval_steps_per_second": 1.097, + "step": 2400 }, { - "epoch": 0.04148258768381972, - "eval_loss": 0.6743523478507996, - "eval_runtime": 11.4058, - "eval_samples_per_second": 33.141, - "eval_steps_per_second": 1.14, + "epoch": 0.0503056065598511, + "grad_norm": 0.9059876799583435, + "learning_rate": 1.9952914172296264e-05, + "loss": 0.9104, "step": 2500 }, { - "epoch": 0.0431418911911725, - "grad_norm": 1.3837993144989014, - "learning_rate": 1.9972560785234734e-05, - "loss": 1.1604, + "epoch": 0.052317830822245136, + "grad_norm": 1.090819239616394, + "learning_rate": 1.9946454812719572e-05, + "loss": 0.9056, "step": 2600 }, { - "epoch": 0.044801194698525296, - "grad_norm": 0.9329952597618103, - "learning_rate": 1.9968481937973015e-05, - "loss": 1.1563, + "epoch": 0.05433005508463918, + "grad_norm": 0.8924378156661987, + "learning_rate": 1.9939581591548833e-05, + "loss": 0.9102, "step": 2700 }, { - "epoch": 0.04646049820587808, - "grad_norm": 0.8984239101409912, - "learning_rate": 1.996412104067779e-05, - "loss": 1.147, + "epoch": 0.05433005508463918, + "eval_loss": 0.6568426489830017, + "eval_runtime": 11.3424, + "eval_samples_per_second": 33.326, + "eval_steps_per_second": 1.146, + "step": 2700 + }, + { + "epoch": 0.056342279347033224, + "grad_norm": 0.9142224788665771, + "learning_rate": 1.9932294794771596e-05, + "loss": 0.9101, "step": 2800 }, { - "epoch": 0.048119801713230874, - "grad_norm": 1.0862194299697876, - "learning_rate": 1.9959478216737063e-05, - "loss": 1.1565, + "epoch": 0.05835450360942727, + "grad_norm": 1.060359239578247, + "learning_rate": 1.992459472558387e-05, + "loss": 0.9013, "step": 2900 }, { - "epoch": 0.04977910522058366, - "grad_norm": 0.9737164974212646, - "learning_rate": 1.9954553597515752e-05, - "loss": 1.1457, + "epoch": 0.06036672787182131, + "grad_norm": 0.7167413234710693, + "learning_rate": 1.9916481704377487e-05, + "loss": 0.9002, "step": 3000 }, { - "epoch": 0.04977910522058366, - "eval_loss": 0.665993869304657, - "eval_runtime": 11.3689, - "eval_samples_per_second": 33.249, - "eval_steps_per_second": 1.143, + "epoch": 0.06036672787182131, + "eval_loss": 0.6527668237686157, + "eval_runtime": 11.4, + "eval_samples_per_second": 33.158, + "eval_steps_per_second": 1.14, "step": 3000 }, { - "epoch": 0.05143840872793645, - "grad_norm": 0.8984717130661011, - "learning_rate": 1.9949347322351914e-05, - "loss": 1.1604, + "epoch": 0.062378952134215356, + "grad_norm": 0.783549427986145, + "learning_rate": 1.9907956068726782e-05, + "loss": 0.897, "step": 3100 }, { - "epoch": 0.05309771223528924, - "grad_norm": 0.9714403748512268, - "learning_rate": 1.9943859538552847e-05, - "loss": 1.1484, + "epoch": 0.0643911763966094, + "grad_norm": 0.9683724045753479, + "learning_rate": 1.9899018173374552e-05, + "loss": 0.9294, "step": 3200 }, { - "epoch": 0.05475701574264202, - "grad_norm": 0.8448699116706848, - "learning_rate": 1.9938090401390896e-05, - "loss": 1.1514, + "epoch": 0.06640340065900345, + "grad_norm": 1.1547231674194336, + "learning_rate": 1.9889668390217284e-05, + "loss": 0.901, + "step": 3300 + }, + { + "epoch": 0.06640340065900345, + "eval_loss": 0.6419159173965454, + "eval_runtime": 11.429, + "eval_samples_per_second": 33.074, + "eval_steps_per_second": 1.137, "step": 3300 }, { - "epoch": 0.056416319249994816, - "grad_norm": 0.8744631409645081, - "learning_rate": 1.993204007409908e-05, - "loss": 1.1447, + "epoch": 0.0684156249213975, + "grad_norm": 0.81548011302948, + "learning_rate": 1.9879907108289684e-05, + "loss": 0.9008, "step": 3400 }, { - "epoch": 0.0580756227573476, - "grad_norm": 1.0349507331848145, - "learning_rate": 1.9925708727866447e-05, - "loss": 1.1352, + "epoch": 0.07042784918379154, + "grad_norm": 0.7857891321182251, + "learning_rate": 1.98697347337485e-05, + "loss": 0.8928, "step": 3500 }, { - "epoch": 0.0580756227573476, - "eval_loss": 0.6517411470413208, - "eval_runtime": 11.4538, - "eval_samples_per_second": 33.002, - "eval_steps_per_second": 1.135, - "step": 3500 + "epoch": 0.07244007344618558, + "grad_norm": 0.8332715630531311, + "learning_rate": 1.985915168985561e-05, + "loss": 0.8889, + "step": 3600 }, { - "epoch": 0.059734926264700394, - "grad_norm": 0.8414900302886963, - "learning_rate": 1.9919096541833257e-05, - "loss": 1.1298, + "epoch": 0.07244007344618558, + "eval_loss": 0.6356409192085266, + "eval_runtime": 11.2917, + "eval_samples_per_second": 33.476, + "eval_steps_per_second": 1.151, "step": 3600 }, { - "epoch": 0.06139422977205318, - "grad_norm": 0.9680725336074829, - "learning_rate": 1.99122037030859e-05, - "loss": 1.1302, + "epoch": 0.07445229770857963, + "grad_norm": 0.9201735258102417, + "learning_rate": 1.9848158416960414e-05, + "loss": 0.8869, "step": 3700 }, { - "epoch": 0.06305353327940597, - "grad_norm": 0.9834539294242859, - "learning_rate": 1.9905030406651606e-05, - "loss": 1.1315, + "epoch": 0.07646452197097367, + "grad_norm": 0.7852803468704224, + "learning_rate": 1.9836755372481512e-05, + "loss": 0.8973, "step": 3800 }, { - "epoch": 0.06471283678675875, - "grad_norm": 0.8444752097129822, - "learning_rate": 1.9897576855492918e-05, - "loss": 1.129, + "epoch": 0.07847674623336771, + "grad_norm": 0.7758309841156006, + "learning_rate": 1.982494303088767e-05, + "loss": 0.8925, "step": 3900 }, { - "epoch": 0.06637214029411155, - "grad_norm": 0.8134438991546631, - "learning_rate": 1.9889843260501966e-05, - "loss": 1.1153, - "step": 4000 + "epoch": 0.07847674623336771, + "eval_loss": 0.6345422863960266, + "eval_runtime": 11.3533, + "eval_samples_per_second": 33.294, + "eval_steps_per_second": 1.145, + "step": 3900 }, { - "epoch": 0.06637214029411155, - "eval_loss": 0.6426942944526672, - "eval_runtime": 11.5285, - "eval_samples_per_second": 32.788, - "eval_steps_per_second": 1.128, + "epoch": 0.08048897049576176, + "grad_norm": 0.9436432123184204, + "learning_rate": 1.981272188367809e-05, + "loss": 0.8847, "step": 4000 }, { - "epoch": 0.06803144380146434, - "grad_norm": 0.8562799096107483, - "learning_rate": 1.988182984049449e-05, - "loss": 1.1291, + "epoch": 0.0825011947581558, + "grad_norm": 0.8394960165023804, + "learning_rate": 1.980009243936193e-05, + "loss": 0.8923, "step": 4100 }, { - "epoch": 0.06969074730881712, - "grad_norm": 1.0887460708618164, - "learning_rate": 1.987353682220364e-05, - "loss": 1.1166, + "epoch": 0.08451341902054985, + "grad_norm": 0.8079524636268616, + "learning_rate": 1.9787055223437184e-05, + "loss": 0.8828, + "step": 4200 + }, + { + "epoch": 0.08451341902054985, + "eval_loss": 0.6277508735656738, + "eval_runtime": 11.2988, + "eval_samples_per_second": 33.455, + "eval_steps_per_second": 1.151, "step": 4200 }, { - "epoch": 0.07135005081616991, - "grad_norm": 0.975968599319458, - "learning_rate": 1.9864964440273586e-05, - "loss": 1.1291, + "epoch": 0.08652564328294389, + "grad_norm": 0.8562188744544983, + "learning_rate": 1.977361077836878e-05, + "loss": 0.8801, "step": 4300 }, { - "epoch": 0.07300935432352271, - "grad_norm": 0.8442144393920898, - "learning_rate": 1.985611293725285e-05, - "loss": 1.1293, + "epoch": 0.08853786754533793, + "grad_norm": 0.9642734527587891, + "learning_rate": 1.9759759663566032e-05, + "loss": 0.896, "step": 4400 }, { - "epoch": 0.07466865783087549, - "grad_norm": 0.7740936875343323, - "learning_rate": 1.984698256358746e-05, - "loss": 1.1262, + "epoch": 0.09055009180773198, + "grad_norm": 0.8723398447036743, + "learning_rate": 1.9745502455359367e-05, + "loss": 0.8879, "step": 4500 }, { - "epoch": 0.07466865783087549, - "eval_loss": 0.6394479870796204, - "eval_runtime": 11.3332, - "eval_samples_per_second": 33.353, - "eval_steps_per_second": 1.147, + "epoch": 0.09055009180773198, + "eval_loss": 0.6282201409339905, + "eval_runtime": 11.4757, + "eval_samples_per_second": 32.939, + "eval_steps_per_second": 1.133, "step": 4500 }, { - "epoch": 0.07632796133822828, - "grad_norm": 0.8332346677780151, - "learning_rate": 1.9837573577613868e-05, - "loss": 1.1419, + "epoch": 0.09256231607012602, + "grad_norm": 0.8613621592521667, + "learning_rate": 1.9730839746976314e-05, + "loss": 0.8854, "step": 4600 }, { - "epoch": 0.07798726484558106, - "grad_norm": 1.0351996421813965, - "learning_rate": 1.9827886245551625e-05, - "loss": 1.1129, + "epoch": 0.09457454033252007, + "grad_norm": 0.7336219549179077, + "learning_rate": 1.9715772148516855e-05, + "loss": 0.8806, "step": 4700 }, { - "epoch": 0.07964656835293385, - "grad_norm": 0.8040492534637451, - "learning_rate": 1.9817920841495856e-05, - "loss": 1.1272, + "epoch": 0.09658676459491411, + "grad_norm": 0.7842460870742798, + "learning_rate": 1.970030028692802e-05, + "loss": 0.8798, + "step": 4800 + }, + { + "epoch": 0.09658676459491411, + "eval_loss": 0.6203732490539551, + "eval_runtime": 11.2931, + "eval_samples_per_second": 33.472, + "eval_steps_per_second": 1.151, "step": 4800 }, { - "epoch": 0.08130587186028665, - "grad_norm": 1.120984435081482, - "learning_rate": 1.980767764740951e-05, - "loss": 1.1321, + "epoch": 0.09859898885730815, + "grad_norm": 1.042386770248413, + "learning_rate": 1.968442480597781e-05, + "loss": 0.8786, "step": 4900 }, { - "epoch": 0.08296517536763943, - "grad_norm": 0.8339391350746155, - "learning_rate": 1.979715695311538e-05, - "loss": 1.1137, + "epoch": 0.1006112131197022, + "grad_norm": 0.8358279466629028, + "learning_rate": 1.9668146366228398e-05, + "loss": 0.8834, "step": 5000 }, { - "epoch": 0.08296517536763943, - "eval_loss": 0.6323764324188232, - "eval_runtime": 11.4003, - "eval_samples_per_second": 33.157, - "eval_steps_per_second": 1.14, - "step": 5000 + "epoch": 0.10262343738209624, + "grad_norm": 0.9129268527030945, + "learning_rate": 1.965146564500866e-05, + "loss": 0.8763, + "step": 5100 }, { - "epoch": 0.08462447887499222, - "grad_norm": 1.0801151990890503, - "learning_rate": 1.978635905628789e-05, - "loss": 1.1296, + "epoch": 0.10262343738209624, + "eval_loss": 0.6140510439872742, + "eval_runtime": 11.3122, + "eval_samples_per_second": 33.415, + "eval_steps_per_second": 1.149, "step": 5100 }, { - "epoch": 0.086283782382345, - "grad_norm": 0.7998140454292297, - "learning_rate": 1.977528426244469e-05, - "loss": 1.1255, + "epoch": 0.10463566164449027, + "grad_norm": 0.9329330325126648, + "learning_rate": 1.963438333638598e-05, + "loss": 0.8724, "step": 5200 }, { - "epoch": 0.0879430858896978, - "grad_norm": 0.7596673369407654, - "learning_rate": 1.9763932884938007e-05, - "loss": 1.1193, + "epoch": 0.10664788590688432, + "grad_norm": 0.9156613349914551, + "learning_rate": 1.9616900151137375e-05, + "loss": 0.8798, "step": 5300 }, { - "epoch": 0.08960238939705059, - "grad_norm": 0.837907612323761, - "learning_rate": 1.9752305244945768e-05, - "loss": 1.1197, + "epoch": 0.10866011016927836, + "grad_norm": 1.0988123416900635, + "learning_rate": 1.9599016816719912e-05, + "loss": 0.8864, "step": 5400 }, { - "epoch": 0.09126169290440338, - "grad_norm": 0.8659106492996216, - "learning_rate": 1.9740401671462527e-05, - "loss": 1.1156, - "step": 5500 + "epoch": 0.10866011016927836, + "eval_loss": 0.613735556602478, + "eval_runtime": 11.5595, + "eval_samples_per_second": 32.7, + "eval_steps_per_second": 1.125, + "step": 5400 }, { - "epoch": 0.09126169290440338, - "eval_loss": 0.6233054399490356, - "eval_runtime": 11.3643, - "eval_samples_per_second": 33.262, - "eval_steps_per_second": 1.144, + "epoch": 0.1106723344316724, + "grad_norm": 0.9962302446365356, + "learning_rate": 1.9580734077240467e-05, + "loss": 0.879, "step": 5500 }, { - "epoch": 0.09292099641175616, - "grad_norm": 0.8374246954917908, - "learning_rate": 1.9728222501290143e-05, - "loss": 1.1153, + "epoch": 0.11268455869406645, + "grad_norm": 0.6542097926139832, + "learning_rate": 1.9562052693424724e-05, + "loss": 0.8754, "step": 5600 }, { - "epoch": 0.09458029991910895, - "grad_norm": 0.8681025505065918, - "learning_rate": 1.971576807902827e-05, - "loss": 1.1153, + "epoch": 0.11469678295646049, + "grad_norm": 0.8420646786689758, + "learning_rate": 1.9542973442585542e-05, + "loss": 0.8753, + "step": 5700 + }, + { + "epoch": 0.11469678295646049, + "eval_loss": 0.6112973690032959, + "eval_runtime": 11.3099, + "eval_samples_per_second": 33.422, + "eval_steps_per_second": 1.149, "step": 5700 }, { - "epoch": 0.09623960342646175, - "grad_norm": 0.7995213866233826, - "learning_rate": 1.9703038757064585e-05, - "loss": 1.1192, + "epoch": 0.11670900721885454, + "grad_norm": 1.0234030485153198, + "learning_rate": 1.9523497118590625e-05, + "loss": 0.869, "step": 5800 }, { - "epoch": 0.09789890693381453, - "grad_norm": 0.9082884192466736, - "learning_rate": 1.9690034895564827e-05, - "loss": 1.1125, + "epoch": 0.11872123148124858, + "grad_norm": 0.7687940001487732, + "learning_rate": 1.9503624531829463e-05, + "loss": 0.875, "step": 5900 }, { - "epoch": 0.09955821044116732, - "grad_norm": 0.8288666605949402, - "learning_rate": 1.9676756862462612e-05, - "loss": 1.1131, + "epoch": 0.12073345574364262, + "grad_norm": 0.858860194683075, + "learning_rate": 1.9483356509179633e-05, + "loss": 0.8682, "step": 6000 }, { - "epoch": 0.09955821044116732, - "eval_loss": 0.642926812171936, - "eval_runtime": 11.3143, - "eval_samples_per_second": 33.409, - "eval_steps_per_second": 1.149, + "epoch": 0.12073345574364262, + "eval_loss": 0.6082560420036316, + "eval_runtime": 11.2984, + "eval_samples_per_second": 33.456, + "eval_steps_per_second": 1.151, "step": 6000 }, { - "epoch": 0.1012175139485201, - "grad_norm": 0.8566969037055969, - "learning_rate": 1.966320503344901e-05, - "loss": 1.1099, + "epoch": 0.12274568000603667, + "grad_norm": 0.7500011324882507, + "learning_rate": 1.946269389397239e-05, + "loss": 0.8667, "step": 6100 }, { - "epoch": 0.1028768174558729, - "grad_norm": 0.9833664298057556, - "learning_rate": 1.9649379791961932e-05, - "loss": 1.1123, + "epoch": 0.12475790426843071, + "grad_norm": 0.8498502373695374, + "learning_rate": 1.9441637545957558e-05, + "loss": 0.8717, "step": 6200 }, { - "epoch": 0.10453612096322569, - "grad_norm": 0.9843934774398804, - "learning_rate": 1.9635281529175258e-05, - "loss": 1.106, + "epoch": 0.12677012853082475, + "grad_norm": 0.9230628609657288, + "learning_rate": 1.9420188341267783e-05, + "loss": 0.8689, + "step": 6300 + }, + { + "epoch": 0.12677012853082475, + "eval_loss": 0.6047795414924622, + "eval_runtime": 11.3052, + "eval_samples_per_second": 33.436, + "eval_steps_per_second": 1.15, "step": 6300 }, { - "epoch": 0.10619542447057848, - "grad_norm": 0.8110288381576538, - "learning_rate": 1.962091064398779e-05, - "loss": 1.1135, + "epoch": 0.1287823527932188, + "grad_norm": 0.7312197089195251, + "learning_rate": 1.939834717238207e-05, + "loss": 0.8676, "step": 6400 }, { - "epoch": 0.10785472797793126, - "grad_norm": 0.8397781252861023, - "learning_rate": 1.9606267543011957e-05, - "loss": 1.1051, + "epoch": 0.13079457705561284, + "grad_norm": 0.7080931067466736, + "learning_rate": 1.9376114948088634e-05, + "loss": 0.8632, "step": 6500 }, { - "epoch": 0.10785472797793126, - "eval_loss": 0.6197161078453064, - "eval_runtime": 11.3832, - "eval_samples_per_second": 33.207, - "eval_steps_per_second": 1.142, - "step": 6500 + "epoch": 0.1328068013180069, + "grad_norm": 0.793525755405426, + "learning_rate": 1.9353492593447107e-05, + "loss": 0.8682, + "step": 6600 }, { - "epoch": 0.10951403148528405, - "grad_norm": 0.898611307144165, - "learning_rate": 1.9591352640562316e-05, - "loss": 1.0992, + "epoch": 0.1328068013180069, + "eval_loss": 0.6011930704116821, + "eval_runtime": 11.4543, + "eval_samples_per_second": 33.001, + "eval_steps_per_second": 1.135, "step": 6600 }, { - "epoch": 0.11117333499263685, - "grad_norm": 0.7973142862319946, - "learning_rate": 1.957616635864381e-05, - "loss": 1.1086, + "epoch": 0.13481902558040093, + "grad_norm": 0.7798284292221069, + "learning_rate": 1.9330481049750028e-05, + "loss": 0.8636, "step": 6700 }, { - "epoch": 0.11283263849998963, - "grad_norm": 0.9038134217262268, - "learning_rate": 1.9560709126939853e-05, - "loss": 1.1006, + "epoch": 0.136831249842795, + "grad_norm": 0.9270545840263367, + "learning_rate": 1.9307081274483698e-05, + "loss": 0.8644, "step": 6800 }, { - "epoch": 0.11449194200734242, - "grad_norm": 0.8284549117088318, - "learning_rate": 1.954498138280016e-05, - "loss": 1.1233, + "epoch": 0.13884347410518902, + "grad_norm": 0.7777066826820374, + "learning_rate": 1.9283294241288315e-05, + "loss": 0.8682, "step": 6900 }, { - "epoch": 0.1161512455146952, - "grad_norm": 0.750993013381958, - "learning_rate": 1.952898357122837e-05, - "loss": 1.0933, - "step": 7000 + "epoch": 0.13884347410518902, + "eval_loss": 0.6046885848045349, + "eval_runtime": 11.4509, + "eval_samples_per_second": 33.01, + "eval_steps_per_second": 1.135, + "step": 6900 }, { - "epoch": 0.1161512455146952, - "eval_loss": 0.6124024391174316, - "eval_runtime": 11.3497, - "eval_samples_per_second": 33.305, - "eval_steps_per_second": 1.145, + "epoch": 0.14085569836758308, + "grad_norm": 0.7538514733314514, + "learning_rate": 1.925912093991748e-05, + "loss": 0.8654, "step": 7000 }, { - "epoch": 0.11781054902204799, - "grad_norm": 0.8644236326217651, - "learning_rate": 1.9512716144869465e-05, - "loss": 1.1054, + "epoch": 0.1428679226299771, + "grad_norm": 0.6866621375083923, + "learning_rate": 1.9234562376197015e-05, + "loss": 0.8497, "step": 7100 }, { - "epoch": 0.11946985252940079, - "grad_norm": 0.7980636954307556, - "learning_rate": 1.949617956399695e-05, - "loss": 1.0947, + "epoch": 0.14488014689237116, + "grad_norm": 0.829768717288971, + "learning_rate": 1.92096195719831e-05, + "loss": 0.8575, + "step": 7200 + }, + { + "epoch": 0.14488014689237116, + "eval_loss": 0.6001401543617249, + "eval_runtime": 11.2516, + "eval_samples_per_second": 33.595, + "eval_steps_per_second": 1.155, "step": 7200 }, { - "epoch": 0.12112915603675357, - "grad_norm": 0.8986984491348267, - "learning_rate": 1.9479374296499842e-05, - "loss": 1.1028, + "epoch": 0.1468923711547652, + "grad_norm": 0.8665058016777039, + "learning_rate": 1.9184293565119755e-05, + "loss": 0.8612, "step": 7300 }, { - "epoch": 0.12278845954410636, - "grad_norm": 0.751697301864624, - "learning_rate": 1.9462300817869418e-05, - "loss": 1.1, + "epoch": 0.14890459541715925, + "grad_norm": 0.7740942239761353, + "learning_rate": 1.9158585409395674e-05, + "loss": 0.8596, "step": 7400 }, { - "epoch": 0.12444776305145915, - "grad_norm": 0.799580454826355, - "learning_rate": 1.944495961118578e-05, - "loss": 1.1051, + "epoch": 0.15091681967955328, + "grad_norm": 0.672917902469635, + "learning_rate": 1.9132496174500364e-05, + "loss": 0.854, "step": 7500 }, { - "epoch": 0.12444776305145915, - "eval_loss": 0.6077128648757935, - "eval_runtime": 11.31, - "eval_samples_per_second": 33.422, + "epoch": 0.15091681967955328, + "eval_loss": 0.5939906239509583, + "eval_runtime": 11.3101, + "eval_samples_per_second": 33.421, "eval_steps_per_second": 1.149, "step": 7500 }, { - "epoch": 0.12610706655881193, - "grad_norm": 0.7901577949523926, - "learning_rate": 1.942735116710417e-05, - "loss": 1.1179, + "epoch": 0.15292904394194734, + "grad_norm": 0.719465970993042, + "learning_rate": 1.9106026945979627e-05, + "loss": 0.8615, "step": 7600 }, { - "epoch": 0.12776637006616473, - "grad_norm": 0.8165197968482971, - "learning_rate": 1.9409475983841094e-05, - "loss": 1.0993, + "epoch": 0.15494126820434137, + "grad_norm": 0.7433097958564758, + "learning_rate": 1.9079178825190416e-05, + "loss": 0.8564, "step": 7700 }, { - "epoch": 0.1294256735735175, - "grad_norm": 0.8566355109214783, - "learning_rate": 1.9391334567160222e-05, - "loss": 1.1062, + "epoch": 0.15695349246673543, + "grad_norm": 0.7390840649604797, + "learning_rate": 1.9051952929254983e-05, + "loss": 0.8526, + "step": 7800 + }, + { + "epoch": 0.15695349246673543, + "eval_loss": 0.5941105484962463, + "eval_runtime": 11.2494, + "eval_samples_per_second": 33.602, + "eval_steps_per_second": 1.156, "step": 7800 }, { - "epoch": 0.1310849770808703, - "grad_norm": 0.7045994997024536, - "learning_rate": 1.9372927430358087e-05, - "loss": 1.0963, + "epoch": 0.15896571672912946, + "grad_norm": 0.721076488494873, + "learning_rate": 1.902435039101442e-05, + "loss": 0.8535, "step": 7900 }, { - "epoch": 0.1327442805882231, - "grad_norm": 0.8410146832466125, - "learning_rate": 1.935425509424955e-05, - "loss": 1.0993, + "epoch": 0.16097794099152352, + "grad_norm": 0.7117634415626526, + "learning_rate": 1.899637235898151e-05, + "loss": 0.8548, "step": 8000 }, { - "epoch": 0.1327442805882231, - "eval_loss": 0.6101858615875244, - "eval_runtime": 11.3214, - "eval_samples_per_second": 33.388, - "eval_steps_per_second": 1.148, - "step": 8000 + "epoch": 0.16299016525391755, + "grad_norm": 0.7325859069824219, + "learning_rate": 1.8968019997292937e-05, + "loss": 0.8661, + "step": 8100 }, { - "epoch": 0.13440358409557587, - "grad_norm": 0.9924967288970947, - "learning_rate": 1.9335318087153074e-05, - "loss": 1.1048, + "epoch": 0.16299016525391755, + "eval_loss": 0.5943772196769714, + "eval_runtime": 11.2277, + "eval_samples_per_second": 33.667, + "eval_steps_per_second": 1.158, "step": 8100 }, { - "epoch": 0.13606288760292867, - "grad_norm": 0.858170211315155, - "learning_rate": 1.9316116944875763e-05, - "loss": 1.1054, + "epoch": 0.1650023895163116, + "grad_norm": 0.8927565217018127, + "learning_rate": 1.893929448566085e-05, + "loss": 0.8535, "step": 8200 }, { - "epoch": 0.13772219111028147, - "grad_norm": 0.7710052132606506, - "learning_rate": 1.9296652210698216e-05, - "loss": 1.0991, + "epoch": 0.16701461377870563, + "grad_norm": 0.9083840250968933, + "learning_rate": 1.8910197019323782e-05, + "loss": 0.8581, "step": 8300 }, { - "epoch": 0.13938149461763424, - "grad_norm": 0.8862308859825134, - "learning_rate": 1.9276924435359147e-05, - "loss": 1.0989, + "epoch": 0.1690268380410997, + "grad_norm": 0.7133694291114807, + "learning_rate": 1.8880728808996906e-05, + "loss": 0.8491, "step": 8400 }, { - "epoch": 0.14104079812498704, - "grad_norm": 0.8026833534240723, - "learning_rate": 1.925693417703981e-05, - "loss": 1.0869, - "step": 8500 + "epoch": 0.1690268380410997, + "eval_loss": 0.5923792719841003, + "eval_runtime": 11.2757, + "eval_samples_per_second": 33.523, + "eval_steps_per_second": 1.153, + "step": 8400 }, { - "epoch": 0.14104079812498704, - "eval_loss": 0.6018590927124023, - "eval_runtime": 11.294, - "eval_samples_per_second": 33.469, - "eval_steps_per_second": 1.151, + "epoch": 0.17103906230349372, + "grad_norm": 0.7994174361228943, + "learning_rate": 1.8850891080821673e-05, + "loss": 0.8577, "step": 8500 }, { - "epoch": 0.14270010163233982, - "grad_norm": 0.7632691860198975, - "learning_rate": 1.9236682001348188e-05, - "loss": 1.0997, + "epoch": 0.17305128656588778, + "grad_norm": 1.106224775314331, + "learning_rate": 1.8820685076314782e-05, + "loss": 0.849, "step": 8600 }, { - "epoch": 0.14435940513969261, - "grad_norm": 0.8231498599052429, - "learning_rate": 1.921616848130301e-05, - "loss": 1.1022, + "epoch": 0.1750635108282818, + "grad_norm": 1.0492300987243652, + "learning_rate": 1.8790112052316523e-05, + "loss": 0.8579, "step": 8700 }, { - "epoch": 0.14601870864704541, - "grad_norm": 0.8111631870269775, - "learning_rate": 1.919539419731753e-05, - "loss": 1.0936, + "epoch": 0.1750635108282818, + "eval_loss": 0.6185858845710754, + "eval_runtime": 11.3469, + "eval_samples_per_second": 33.313, + "eval_steps_per_second": 1.146, + "step": 8700 + }, + { + "epoch": 0.17707573509067587, + "grad_norm": 0.7523091435432434, + "learning_rate": 1.875917328093849e-05, + "loss": 0.8548, "step": 8800 }, { - "epoch": 0.14767801215439819, - "grad_norm": 0.8485898375511169, - "learning_rate": 1.91743597371831e-05, - "loss": 1.1046, + "epoch": 0.1790879593530699, + "grad_norm": 0.8177125453948975, + "learning_rate": 1.8727870049510636e-05, + "loss": 0.8512, "step": 8900 }, { - "epoch": 0.14933731566175099, - "grad_norm": 0.8139607906341553, - "learning_rate": 1.9153065696052545e-05, - "loss": 1.0924, + "epoch": 0.18110018361546396, + "grad_norm": 0.7863544821739197, + "learning_rate": 1.869620366052772e-05, + "loss": 0.8474, "step": 9000 }, { - "epoch": 0.14933731566175099, - "eval_loss": 0.5995320081710815, - "eval_runtime": 11.3171, - "eval_samples_per_second": 33.401, - "eval_steps_per_second": 1.149, + "epoch": 0.18110018361546396, + "eval_loss": 0.5867164134979248, + "eval_runtime": 11.2542, + "eval_samples_per_second": 33.588, + "eval_steps_per_second": 1.155, "step": 9000 }, { - "epoch": 0.15099661916910376, - "grad_norm": 0.960515558719635, - "learning_rate": 1.913151267642332e-05, - "loss": 1.0977, + "epoch": 0.18311240787785799, + "grad_norm": 0.7436131834983826, + "learning_rate": 1.8664175431595106e-05, + "loss": 0.8587, "step": 9100 }, { - "epoch": 0.15265592267645656, - "grad_norm": 0.8029621243476868, - "learning_rate": 1.9109701288120466e-05, - "loss": 1.1071, + "epoch": 0.18512463214025204, + "grad_norm": 0.803816020488739, + "learning_rate": 1.8631786695373943e-05, + "loss": 0.8455, "step": 9200 }, { - "epoch": 0.15431522618380936, - "grad_norm": 0.7130656838417053, - "learning_rate": 1.9087632148279366e-05, - "loss": 1.0911, + "epoch": 0.18713685640264607, + "grad_norm": 0.9202460050582886, + "learning_rate": 1.8599038799525712e-05, + "loss": 0.8513, + "step": 9300 + }, + { + "epoch": 0.18713685640264607, + "eval_loss": 0.583454430103302, + "eval_runtime": 11.2388, + "eval_samples_per_second": 33.633, + "eval_steps_per_second": 1.157, "step": 9300 }, { - "epoch": 0.15597452969116213, - "grad_norm": 1.0337978601455688, - "learning_rate": 1.906530588132824e-05, - "loss": 1.0957, + "epoch": 0.18914908066504013, + "grad_norm": 0.8134105801582336, + "learning_rate": 1.856593310665614e-05, + "loss": 0.8499, "step": 9400 }, { - "epoch": 0.15763383319851493, - "grad_norm": 0.8102251887321472, - "learning_rate": 1.9042723118970544e-05, - "loss": 1.0989, + "epoch": 0.19116130492743416, + "grad_norm": 0.7113932967185974, + "learning_rate": 1.8532470994258533e-05, + "loss": 0.849, "step": 9500 }, { - "epoch": 0.15763383319851493, - "eval_loss": 0.5979866981506348, - "eval_runtime": 11.333, - "eval_samples_per_second": 33.354, - "eval_steps_per_second": 1.147, - "step": 9500 + "epoch": 0.19317352918982822, + "grad_norm": 0.8230564594268799, + "learning_rate": 1.8498653854656424e-05, + "loss": 0.8413, + "step": 9600 }, { - "epoch": 0.1592931367058677, - "grad_norm": 0.7073745727539062, - "learning_rate": 1.901988450016704e-05, - "loss": 1.0951, + "epoch": 0.19317352918982822, + "eval_loss": 0.5848163962364197, + "eval_runtime": 11.2801, + "eval_samples_per_second": 33.51, + "eval_steps_per_second": 1.152, "step": 9600 }, { - "epoch": 0.1609524402132205, - "grad_norm": 0.8226743936538696, - "learning_rate": 1.899679067111775e-05, - "loss": 1.0932, + "epoch": 0.19518575345222225, + "grad_norm": 0.6756404638290405, + "learning_rate": 1.8464483094945667e-05, + "loss": 0.8543, "step": 9700 }, { - "epoch": 0.1626117437205733, - "grad_norm": 0.8722303509712219, - "learning_rate": 1.897344228524365e-05, - "loss": 1.0992, + "epoch": 0.1971979777146163, + "grad_norm": 0.7398785352706909, + "learning_rate": 1.8429960136935878e-05, + "loss": 0.8428, "step": 9800 }, { - "epoch": 0.16427104722792607, - "grad_norm": 0.771023690700531, - "learning_rate": 1.89498400031682e-05, - "loss": 1.0908, + "epoch": 0.19921020197701034, + "grad_norm": 0.7419747710227966, + "learning_rate": 1.8395086417091272e-05, + "loss": 0.8516, "step": 9900 }, { - "epoch": 0.16593035073527887, - "grad_norm": 0.738682210445404, - "learning_rate": 1.892598449269865e-05, - "loss": 1.0871, - "step": 10000 + "epoch": 0.19921020197701034, + "eval_loss": 0.5863896608352661, + "eval_runtime": 11.3198, + "eval_samples_per_second": 33.393, + "eval_steps_per_second": 1.148, + "step": 9900 }, { - "epoch": 0.16593035073527887, - "eval_loss": 0.5946006774902344, - "eval_runtime": 12.2855, - "eval_samples_per_second": 30.768, - "eval_steps_per_second": 1.058, + "epoch": 0.2012224262394044, + "grad_norm": 0.8145945072174072, + "learning_rate": 1.8359863386470904e-05, + "loss": 0.8508, "step": 10000 }, { - "epoch": 0.16758965424263164, - "grad_norm": 0.8228720426559448, - "learning_rate": 1.890187642880713e-05, - "loss": 1.0889, + "epoch": 0.20323465050179843, + "grad_norm": 0.7068437933921814, + "learning_rate": 1.8324292510668278e-05, + "loss": 0.8495, "step": 10100 }, { - "epoch": 0.16924895774998444, - "grad_norm": 0.6772521734237671, - "learning_rate": 1.8877516493611564e-05, - "loss": 1.0814, + "epoch": 0.20524687476419248, + "grad_norm": 0.7419267892837524, + "learning_rate": 1.828837526975038e-05, + "loss": 0.8461, + "step": 10200 + }, + { + "epoch": 0.20524687476419248, + "eval_loss": 0.5834963917732239, + "eval_runtime": 11.7842, + "eval_samples_per_second": 32.077, + "eval_steps_per_second": 1.103, "step": 10200 }, { - "epoch": 0.17090826125733724, - "grad_norm": 0.7009317874908447, - "learning_rate": 1.8852905376356373e-05, - "loss": 1.0892, + "epoch": 0.2072590990265865, + "grad_norm": 1.129436731338501, + "learning_rate": 1.8252113158196078e-05, + "loss": 0.8435, "step": 10300 }, { - "epoch": 0.17256756476469, - "grad_norm": 0.7752845883369446, - "learning_rate": 1.8828043773392964e-05, - "loss": 1.0994, + "epoch": 0.20927132328898054, + "grad_norm": 0.6937255859375, + "learning_rate": 1.821550768483396e-05, + "loss": 0.8485, "step": 10400 }, { - "epoch": 0.1742268682720428, - "grad_norm": 0.7827178835868835, - "learning_rate": 1.880293238816004e-05, - "loss": 1.0936, + "epoch": 0.2112835475513746, + "grad_norm": 0.8506975769996643, + "learning_rate": 1.8178560372779525e-05, + "loss": 0.8473, "step": 10500 }, { - "epoch": 0.1742268682720428, - "eval_loss": 0.5933986306190491, - "eval_runtime": 11.3127, - "eval_samples_per_second": 33.414, - "eval_steps_per_second": 1.149, + "epoch": 0.2112835475513746, + "eval_loss": 0.5813661217689514, + "eval_runtime": 11.832, + "eval_samples_per_second": 31.947, + "eval_steps_per_second": 1.099, "step": 10500 }, { - "epoch": 0.1758861717793956, - "grad_norm": 0.7205496430397034, - "learning_rate": 1.8777571931163677e-05, - "loss": 1.0899, + "epoch": 0.21329577181376863, + "grad_norm": 0.733964204788208, + "learning_rate": 1.814127275937183e-05, + "loss": 0.836, "step": 10600 }, { - "epoch": 0.17754547528674838, - "grad_norm": 0.6745385527610779, - "learning_rate": 1.8751963119957245e-05, - "loss": 1.1047, + "epoch": 0.2153079960761627, + "grad_norm": 0.7400948405265808, + "learning_rate": 1.8103646396109523e-05, + "loss": 0.8473, "step": 10700 }, { - "epoch": 0.17920477879410118, - "grad_norm": 0.7485048770904541, - "learning_rate": 1.8726106679121083e-05, - "loss": 1.0804, + "epoch": 0.21732022033855672, + "grad_norm": 0.9023438096046448, + "learning_rate": 1.8065682848586266e-05, + "loss": 0.8468, + "step": 10800 + }, + { + "epoch": 0.21732022033855672, + "eval_loss": 0.5793610215187073, + "eval_runtime": 11.234, + "eval_samples_per_second": 33.648, + "eval_steps_per_second": 1.157, "step": 10800 }, { - "epoch": 0.18086408230145395, - "grad_norm": 0.7836139798164368, - "learning_rate": 1.870000334024201e-05, - "loss": 1.0853, + "epoch": 0.21933244460095078, + "grad_norm": 0.82066810131073, + "learning_rate": 1.8027383696425613e-05, + "loss": 0.8457, "step": 10900 }, { - "epoch": 0.18252338580880675, - "grad_norm": 0.7088773846626282, - "learning_rate": 1.8673653841892628e-05, - "loss": 1.0857, + "epoch": 0.2213446688633448, + "grad_norm": 0.6094478964805603, + "learning_rate": 1.7988750533215276e-05, + "loss": 0.8408, "step": 11000 }, { - "epoch": 0.18252338580880675, - "eval_loss": 0.6022927165031433, - "eval_runtime": 11.331, - "eval_samples_per_second": 33.36, - "eval_steps_per_second": 1.147, - "step": 11000 + "epoch": 0.22335689312573886, + "grad_norm": 0.7535290122032166, + "learning_rate": 1.7949784966440823e-05, + "loss": 0.8403, + "step": 11100 }, { - "epoch": 0.18418268931615955, - "grad_norm": 0.7482512593269348, - "learning_rate": 1.864705892961041e-05, - "loss": 1.0796, + "epoch": 0.22335689312573886, + "eval_loss": 0.578126072883606, + "eval_runtime": 11.202, + "eval_samples_per_second": 33.744, + "eval_steps_per_second": 1.161, "step": 11100 }, { - "epoch": 0.18584199282351233, - "grad_norm": 0.9476237893104553, - "learning_rate": 1.8620219355876627e-05, - "loss": 1.0864, + "epoch": 0.2253691173881329, + "grad_norm": 0.7472143769264221, + "learning_rate": 1.791048861741877e-05, + "loss": 0.8434, "step": 11200 }, { - "epoch": 0.18750129633086512, - "grad_norm": 0.841853678226471, - "learning_rate": 1.8593135880095038e-05, - "loss": 1.0771, + "epoch": 0.22738134165052695, + "grad_norm": 0.8236815333366394, + "learning_rate": 1.7870863121229162e-05, + "loss": 0.8273, "step": 11300 }, { - "epoch": 0.1891605998382179, - "grad_norm": 0.7368480563163757, - "learning_rate": 1.856580926857041e-05, - "loss": 1.081, + "epoch": 0.22939356591292098, + "grad_norm": 0.6772099137306213, + "learning_rate": 1.783091012664749e-05, + "loss": 0.8355, "step": 11400 }, { - "epoch": 0.1908199033455707, - "grad_norm": 0.9033071398735046, - "learning_rate": 1.853824029448684e-05, - "loss": 1.0902, - "step": 11500 + "epoch": 0.22939356591292098, + "eval_loss": 0.5848814249038696, + "eval_runtime": 11.4019, + "eval_samples_per_second": 33.152, + "eval_steps_per_second": 1.14, + "step": 11400 }, { - "epoch": 0.1908199033455707, - "eval_loss": 0.5891455411911011, - "eval_runtime": 14.6575, - "eval_samples_per_second": 25.789, - "eval_steps_per_second": 0.887, + "epoch": 0.23140579017531504, + "grad_norm": 0.7480434775352478, + "learning_rate": 1.779063129607612e-05, + "loss": 0.8437, "step": 11500 }, { - "epoch": 0.1924792068529235, - "grad_norm": 0.7858784794807434, - "learning_rate": 1.851042973788588e-05, - "loss": 1.0756, + "epoch": 0.23341801443770907, + "grad_norm": 0.8341161608695984, + "learning_rate": 1.7750028305475125e-05, + "loss": 0.8384, "step": 11600 }, { - "epoch": 0.19413851036027627, - "grad_norm": 0.9438132643699646, - "learning_rate": 1.8482378385644442e-05, - "loss": 1.0831, + "epoch": 0.23543023870010313, + "grad_norm": 0.9399694800376892, + "learning_rate": 1.7709102844292516e-05, + "loss": 0.8419, + "step": 11700 + }, + { + "epoch": 0.23543023870010313, + "eval_loss": 0.5769637227058411, + "eval_runtime": 11.2547, + "eval_samples_per_second": 33.586, + "eval_steps_per_second": 1.155, "step": 11700 }, { - "epoch": 0.19579781386762907, - "grad_norm": 1.0717864036560059, - "learning_rate": 1.8454087031452584e-05, - "loss": 1.0748, + "epoch": 0.23744246296249716, + "grad_norm": 0.8473734855651855, + "learning_rate": 1.7667856615393987e-05, + "loss": 0.8346, "step": 11800 }, { - "epoch": 0.19745711737498184, - "grad_norm": 0.6684989333152771, - "learning_rate": 1.8425556475790995e-05, - "loss": 1.0839, + "epoch": 0.23945468722489122, + "grad_norm": 0.6887069940567017, + "learning_rate": 1.7626291334992027e-05, + "loss": 0.8381, "step": 11900 }, { - "epoch": 0.19911642088233464, - "grad_norm": 0.7369375228881836, - "learning_rate": 1.8396787525908385e-05, - "loss": 1.0786, + "epoch": 0.24146691148728525, + "grad_norm": 0.6946566700935364, + "learning_rate": 1.758440873257454e-05, + "loss": 0.8345, "step": 12000 }, { - "epoch": 0.19911642088233464, - "eval_loss": 0.5899787545204163, - "eval_runtime": 11.4649, - "eval_samples_per_second": 32.97, - "eval_steps_per_second": 1.134, + "epoch": 0.24146691148728525, + "eval_loss": 0.5747541785240173, + "eval_runtime": 11.4122, + "eval_samples_per_second": 33.122, + "eval_steps_per_second": 1.139, "step": 12000 }, { - "epoch": 0.20077572438968744, - "grad_norm": 0.6706173419952393, - "learning_rate": 1.8367780995798637e-05, - "loss": 1.082, + "epoch": 0.2434791357496793, + "grad_norm": 0.681305468082428, + "learning_rate": 1.7542210550832854e-05, + "loss": 0.841, "step": 12100 }, { - "epoch": 0.2024350278970402, - "grad_norm": 0.767432689666748, - "learning_rate": 1.8338537706177767e-05, - "loss": 1.076, + "epoch": 0.24549136001207333, + "grad_norm": 0.8475384712219238, + "learning_rate": 1.749969854558923e-05, + "loss": 0.8392, "step": 12200 }, { - "epoch": 0.204094331404393, - "grad_norm": 0.7077805399894714, - "learning_rate": 1.8309058484460703e-05, - "loss": 1.0874, + "epoch": 0.2475035842744674, + "grad_norm": 1.1652250289916992, + "learning_rate": 1.745687448572379e-05, + "loss": 0.8388, + "step": 12300 + }, + { + "epoch": 0.2475035842744674, + "eval_loss": 0.5746700763702393, + "eval_runtime": 11.4476, + "eval_samples_per_second": 33.02, + "eval_steps_per_second": 1.136, "step": 12300 }, { - "epoch": 0.2057536349117458, - "grad_norm": 0.6875599026679993, - "learning_rate": 1.8279344164737893e-05, - "loss": 1.073, + "epoch": 0.24951580853686142, + "grad_norm": 0.7575956583023071, + "learning_rate": 1.741374015310094e-05, + "loss": 0.8362, "step": 12400 }, { - "epoch": 0.20741293841909858, - "grad_norm": 0.7826254963874817, - "learning_rate": 1.8249395587751674e-05, - "loss": 1.0792, + "epoch": 0.25152803279925545, + "grad_norm": 0.7489831447601318, + "learning_rate": 1.737029734249519e-05, + "loss": 0.836, "step": 12500 }, { - "epoch": 0.20741293841909858, - "eval_loss": 0.5929903984069824, - "eval_runtime": 11.5407, - "eval_samples_per_second": 32.754, - "eval_steps_per_second": 1.126, - "step": 12500 + "epoch": 0.2535402570616495, + "grad_norm": 0.7467206716537476, + "learning_rate": 1.732654786151651e-05, + "loss": 0.8317, + "step": 12600 }, { - "epoch": 0.20907224192645138, - "grad_norm": 0.8002166748046875, - "learning_rate": 1.821921360087251e-05, - "loss": 1.0816, + "epoch": 0.2535402570616495, + "eval_loss": 0.5750060081481934, + "eval_runtime": 11.2549, + "eval_samples_per_second": 33.585, + "eval_steps_per_second": 1.155, "step": 12600 }, { - "epoch": 0.21073154543380415, - "grad_norm": 0.7406095862388611, - "learning_rate": 1.8188799058075003e-05, - "loss": 1.077, + "epoch": 0.25555248132404357, + "grad_norm": 0.7825116515159607, + "learning_rate": 1.7282493530535095e-05, + "loss": 0.8335, "step": 12700 }, { - "epoch": 0.21239084894115695, - "grad_norm": 0.7558517456054688, - "learning_rate": 1.8158152819913738e-05, - "loss": 1.0921, + "epoch": 0.2575647055864376, + "grad_norm": 0.8054665923118591, + "learning_rate": 1.723813618260564e-05, + "loss": 0.8332, "step": 12800 }, { - "epoch": 0.21405015244850975, - "grad_norm": 0.7371704578399658, - "learning_rate": 1.8127275753498924e-05, - "loss": 1.0908, + "epoch": 0.25957692984883163, + "grad_norm": 0.740932822227478, + "learning_rate": 1.7193477663391055e-05, + "loss": 0.8333, "step": 12900 }, { - "epoch": 0.21570945595586252, - "grad_norm": 0.6803996562957764, - "learning_rate": 1.809616873247188e-05, - "loss": 1.0772, - "step": 13000 + "epoch": 0.25957692984883163, + "eval_loss": 0.574753999710083, + "eval_runtime": 11.3005, + "eval_samples_per_second": 33.45, + "eval_steps_per_second": 1.15, + "step": 12900 }, { - "epoch": 0.21570945595586252, - "eval_loss": 0.5890976190567017, - "eval_runtime": 11.4676, - "eval_samples_per_second": 32.962, - "eval_steps_per_second": 1.134, + "epoch": 0.2615891541112257, + "grad_norm": 0.6655648350715637, + "learning_rate": 1.714851983108567e-05, + "loss": 0.8332, "step": 13000 }, { - "epoch": 0.21736875946321532, - "grad_norm": 0.7379328608512878, - "learning_rate": 1.8064832636980284e-05, - "loss": 1.0661, + "epoch": 0.26360137837361974, + "grad_norm": 0.8892366886138916, + "learning_rate": 1.710326455633792e-05, + "loss": 0.833, "step": 13100 }, { - "epoch": 0.2190280629705681, - "grad_norm": 0.8288509845733643, - "learning_rate": 1.8033268353653303e-05, - "loss": 1.0841, + "epoch": 0.2656136026360138, + "grad_norm": 0.7081986665725708, + "learning_rate": 1.7057713722172505e-05, + "loss": 0.8352, "step": 13200 }, { - "epoch": 0.2206873664779209, - "grad_norm": 0.7796428799629211, - "learning_rate": 1.8001476775576483e-05, - "loss": 1.0841, + "epoch": 0.2656136026360138, + "eval_loss": 0.569306492805481, + "eval_runtime": 11.2208, + "eval_samples_per_second": 33.688, + "eval_steps_per_second": 1.159, + "step": 13200 + }, + { + "epoch": 0.2676258268984078, + "grad_norm": 0.7726171612739563, + "learning_rate": 1.701186922391206e-05, + "loss": 0.8325, "step": 13300 }, { - "epoch": 0.2223466699852737, - "grad_norm": 0.7639874219894409, - "learning_rate": 1.7969458802266493e-05, - "loss": 1.0866, + "epoch": 0.26963805116080186, + "grad_norm": 0.6000068187713623, + "learning_rate": 1.6965732969098262e-05, + "loss": 0.8303, "step": 13400 }, { - "epoch": 0.22400597349262646, - "grad_norm": 0.8602641224861145, - "learning_rate": 1.7937215339645676e-05, - "loss": 1.0826, + "epoch": 0.2716502754231959, + "grad_norm": 0.7751488089561462, + "learning_rate": 1.6919306877412474e-05, + "loss": 0.8311, "step": 13500 }, { - "epoch": 0.22400597349262646, - "eval_loss": 0.5829499959945679, - "eval_runtime": 11.4906, - "eval_samples_per_second": 32.896, - "eval_steps_per_second": 1.131, + "epoch": 0.2716502754231959, + "eval_loss": 0.5708428621292114, + "eval_runtime": 11.2236, + "eval_samples_per_second": 33.679, + "eval_steps_per_second": 1.158, "step": 13500 }, { - "epoch": 0.22566527699997926, - "grad_norm": 0.6520683169364929, - "learning_rate": 1.7904747300016393e-05, - "loss": 1.0767, + "epoch": 0.27366249968559, + "grad_norm": 0.7674184441566467, + "learning_rate": 1.6872592880595872e-05, + "loss": 0.8391, "step": 13600 }, { - "epoch": 0.22732458050733204, - "grad_norm": 0.7716849446296692, - "learning_rate": 1.7872055602035245e-05, - "loss": 1.0761, + "epoch": 0.275674723947984, + "grad_norm": 0.999799370765686, + "learning_rate": 1.6825592922369066e-05, + "loss": 0.8215, "step": 13700 }, { - "epoch": 0.22898388401468484, - "grad_norm": 0.6995418071746826, - "learning_rate": 1.7839141170687055e-05, - "loss": 1.0712, + "epoch": 0.27768694821037804, + "grad_norm": 0.7192254662513733, + "learning_rate": 1.6778308958351213e-05, + "loss": 0.8304, "step": 13800 }, { - "epoch": 0.23064318752203763, - "grad_norm": 0.8901168704032898, - "learning_rate": 1.7806004937258703e-05, - "loss": 1.0693, + "epoch": 0.27768694821037804, + "eval_loss": 0.5696760416030884, + "eval_runtime": 11.2331, + "eval_samples_per_second": 33.65, + "eval_steps_per_second": 1.157, + "step": 13800 + }, + { + "epoch": 0.2796991724727721, + "grad_norm": 1.1758594512939453, + "learning_rate": 1.673074295597867e-05, + "loss": 0.8346, "step": 13900 }, { - "epoch": 0.2323024910293904, - "grad_norm": 0.6333503127098083, - "learning_rate": 1.777264783931278e-05, - "loss": 1.0602, + "epoch": 0.28171139673516615, + "grad_norm": 0.5974677801132202, + "learning_rate": 1.6682896894423094e-05, + "loss": 0.824, "step": 14000 }, { - "epoch": 0.2323024910293904, - "eval_loss": 0.582682192325592, - "eval_runtime": 11.4326, - "eval_samples_per_second": 33.063, - "eval_steps_per_second": 1.137, - "step": 14000 + "epoch": 0.28372362099756016, + "grad_norm": 0.720886766910553, + "learning_rate": 1.6634772764509128e-05, + "loss": 0.8246, + "step": 14100 }, { - "epoch": 0.2339617945367432, - "grad_norm": 0.7268567085266113, - "learning_rate": 1.7739070820661056e-05, - "loss": 1.08, + "epoch": 0.28372362099756016, + "eval_loss": 0.5675772428512573, + "eval_runtime": 11.3956, + "eval_samples_per_second": 33.171, + "eval_steps_per_second": 1.141, "step": 14100 }, { - "epoch": 0.23562109804409598, - "grad_norm": 0.6840221881866455, - "learning_rate": 1.7705274831337783e-05, - "loss": 1.0715, + "epoch": 0.2857358452599542, + "grad_norm": 0.6889091730117798, + "learning_rate": 1.6586372568631545e-05, + "loss": 0.8231, "step": 14200 }, { - "epoch": 0.23728040155144878, - "grad_norm": 0.6487464904785156, - "learning_rate": 1.76712608275728e-05, - "loss": 1.085, + "epoch": 0.28774806952234827, + "grad_norm": 0.6523007154464722, + "learning_rate": 1.6537698320671933e-05, + "loss": 0.8272, "step": 14300 }, { - "epoch": 0.23893970505880158, - "grad_norm": 0.7024466395378113, - "learning_rate": 1.7637029771764495e-05, - "loss": 1.0794, + "epoch": 0.28976029378474233, + "grad_norm": 0.7638033628463745, + "learning_rate": 1.64887520459149e-05, + "loss": 0.8306, "step": 14400 }, { - "epoch": 0.24059900856615435, - "grad_norm": 0.8654534816741943, - "learning_rate": 1.7602582632452553e-05, - "loss": 1.0816, - "step": 14500 + "epoch": 0.28976029378474233, + "eval_loss": 0.569464921951294, + "eval_runtime": 11.248, + "eval_samples_per_second": 33.606, + "eval_steps_per_second": 1.156, + "step": 14400 }, { - "epoch": 0.24059900856615435, - "eval_loss": 0.584790825843811, - "eval_runtime": 11.4412, - "eval_samples_per_second": 33.038, - "eval_steps_per_second": 1.136, + "epoch": 0.29177251804713633, + "grad_norm": 0.6883799433708191, + "learning_rate": 1.6439535780963808e-05, + "loss": 0.8327, "step": 14500 }, { - "epoch": 0.24225831207350715, - "grad_norm": 0.6696279644966125, - "learning_rate": 1.756792038429058e-05, - "loss": 1.0653, + "epoch": 0.2937847423095304, + "grad_norm": 0.8693552017211914, + "learning_rate": 1.6390051573656028e-05, + "loss": 0.8299, "step": 14600 }, { - "epoch": 0.24391761558085995, - "grad_norm": 0.7731645107269287, - "learning_rate": 1.7533044008018505e-05, - "loss": 1.0827, + "epoch": 0.29579696657192445, + "grad_norm": 0.6811352372169495, + "learning_rate": 1.634030148297773e-05, + "loss": 0.8257, + "step": 14700 + }, + { + "epoch": 0.29579696657192445, + "eval_loss": 0.5680450797080994, + "eval_runtime": 11.451, + "eval_samples_per_second": 33.01, + "eval_steps_per_second": 1.135, "step": 14700 }, { - "epoch": 0.24557691908821272, - "grad_norm": 0.7346585988998413, - "learning_rate": 1.7497954490434833e-05, - "loss": 1.0707, + "epoch": 0.2978091908343185, + "grad_norm": 0.7108572721481323, + "learning_rate": 1.629028757897821e-05, + "loss": 0.826, "step": 14800 }, { - "epoch": 0.24723622259556552, - "grad_norm": 0.6957754492759705, - "learning_rate": 1.7462652824368724e-05, - "loss": 1.0734, + "epoch": 0.2998214150967125, + "grad_norm": 0.701524555683136, + "learning_rate": 1.6240011942683774e-05, + "loss": 0.8233, "step": 14900 }, { - "epoch": 0.2488955261029183, - "grad_norm": 0.703118085861206, - "learning_rate": 1.742714000865192e-05, - "loss": 1.0813, + "epoch": 0.30183363935910656, + "grad_norm": 0.6415804028511047, + "learning_rate": 1.6189476666011123e-05, + "loss": 0.8174, "step": 15000 }, { - "epoch": 0.2488955261029183, - "eval_loss": 0.5780225992202759, - "eval_runtime": 11.386, - "eval_samples_per_second": 33.199, - "eval_steps_per_second": 1.142, + "epoch": 0.30183363935910656, + "eval_loss": 0.5662389397621155, + "eval_runtime": 11.3747, + "eval_samples_per_second": 33.232, + "eval_steps_per_second": 1.143, "step": 15000 }, { - "epoch": 0.2505548296102711, - "grad_norm": 0.7246736884117126, - "learning_rate": 1.739141704809046e-05, - "loss": 1.0915, + "epoch": 0.3038458636215006, + "grad_norm": 0.593760073184967, + "learning_rate": 1.6138683851680328e-05, + "loss": 0.8269, "step": 15100 }, { - "epoch": 0.25221413311762386, - "grad_norm": 0.652607798576355, - "learning_rate": 1.7355484953436253e-05, - "loss": 1.0622, + "epoch": 0.3058580878838947, + "grad_norm": 0.6708555221557617, + "learning_rate": 1.608763561312733e-05, + "loss": 0.8277, "step": 15200 }, { - "epoch": 0.2538734366249767, - "grad_norm": 0.6764200925827026, - "learning_rate": 1.73193447413585e-05, - "loss": 1.0598, + "epoch": 0.3078703121462887, + "grad_norm": 0.5819365382194519, + "learning_rate": 1.603633407441601e-05, + "loss": 0.8237, "step": 15300 }, { - "epoch": 0.25553274013232946, - "grad_norm": 0.6260489225387573, - "learning_rate": 1.7282997434414913e-05, - "loss": 1.0665, + "epoch": 0.3078703121462887, + "eval_loss": 0.5628697872161865, + "eval_runtime": 11.3199, + "eval_samples_per_second": 33.393, + "eval_steps_per_second": 1.148, + "step": 15300 + }, + { + "epoch": 0.30988253640868274, + "grad_norm": 0.725537896156311, + "learning_rate": 1.5984781370149798e-05, + "loss": 0.8355, "step": 15400 }, { - "epoch": 0.25719204363968223, - "grad_norm": 0.7212610840797424, - "learning_rate": 1.724644406102277e-05, - "loss": 1.0769, + "epoch": 0.3118947606710768, + "grad_norm": 0.642382800579071, + "learning_rate": 1.5932979645382863e-05, + "loss": 0.8292, "step": 15500 }, { - "epoch": 0.25719204363968223, - "eval_loss": 0.5796706676483154, - "eval_runtime": 11.41, - "eval_samples_per_second": 33.129, - "eval_steps_per_second": 1.139, - "step": 15500 + "epoch": 0.31390698493347086, + "grad_norm": 0.6141934394836426, + "learning_rate": 1.588093105553086e-05, + "loss": 0.8306, + "step": 15600 }, { - "epoch": 0.258851347147035, - "grad_norm": 0.7525830268859863, - "learning_rate": 1.7209685655429837e-05, - "loss": 1.0717, + "epoch": 0.31390698493347086, + "eval_loss": 0.5633600354194641, + "eval_runtime": 11.3793, + "eval_samples_per_second": 33.218, + "eval_steps_per_second": 1.142, "step": 15600 }, { - "epoch": 0.26051065065438783, - "grad_norm": 0.6779541373252869, - "learning_rate": 1.7172723257685104e-05, - "loss": 1.0685, + "epoch": 0.31591920919586486, + "grad_norm": 0.6902384757995605, + "learning_rate": 1.5828637766281238e-05, + "loss": 0.8243, "step": 15700 }, { - "epoch": 0.2621699541617406, - "grad_norm": 0.7670557498931885, - "learning_rate": 1.7135557913609345e-05, - "loss": 1.0745, + "epoch": 0.3179314334582589, + "grad_norm": 0.7464603781700134, + "learning_rate": 1.5776101953503134e-05, + "loss": 0.8296, "step": 15800 }, { - "epoch": 0.2638292576690934, - "grad_norm": 0.7213120460510254, - "learning_rate": 1.7098190674765542e-05, - "loss": 1.0748, + "epoch": 0.319943657720653, + "grad_norm": 0.6735148429870605, + "learning_rate": 1.5723325803156834e-05, + "loss": 0.8168, "step": 15900 }, { - "epoch": 0.2654885611764462, - "grad_norm": 0.6520830988883972, - "learning_rate": 1.7060622598429117e-05, - "loss": 1.0754, - "step": 16000 + "epoch": 0.319943657720653, + "eval_loss": 0.5626727938652039, + "eval_runtime": 11.3991, + "eval_samples_per_second": 33.16, + "eval_steps_per_second": 1.14, + "step": 15900 }, { - "epoch": 0.2654885611764462, - "eval_loss": 0.5761074423789978, - "eval_runtime": 14.6805, - "eval_samples_per_second": 25.748, - "eval_steps_per_second": 0.886, + "epoch": 0.32195588198304703, + "grad_norm": 0.7461301684379578, + "learning_rate": 1.5670311511202823e-05, + "loss": 0.8175, "step": 16000 }, { - "epoch": 0.267147864683799, - "grad_norm": 0.7006266713142395, - "learning_rate": 1.7022854747558018e-05, - "loss": 1.0614, + "epoch": 0.32396810624544103, + "grad_norm": 0.6454249620437622, + "learning_rate": 1.5617061283510404e-05, + "loss": 0.8287, "step": 16100 }, { - "epoch": 0.26880716819115175, - "grad_norm": 0.7130475640296936, - "learning_rate": 1.6984888190762673e-05, - "loss": 1.0643, + "epoch": 0.3259803305078351, + "grad_norm": 0.723892331123352, + "learning_rate": 1.5563577335765925e-05, + "loss": 0.8256, + "step": 16200 + }, + { + "epoch": 0.3259803305078351, + "eval_loss": 0.5635449290275574, + "eval_runtime": 11.3171, + "eval_samples_per_second": 33.401, + "eval_steps_per_second": 1.149, "step": 16200 }, { - "epoch": 0.2704664716985046, - "grad_norm": 0.7075759768486023, - "learning_rate": 1.6946724002275704e-05, - "loss": 1.0593, + "epoch": 0.32799255477022915, + "grad_norm": 0.6277914047241211, + "learning_rate": 1.5509861893380576e-05, + "loss": 0.8274, "step": 16300 }, { - "epoch": 0.27212577520585735, - "grad_norm": 0.6970399618148804, - "learning_rate": 1.6908363261921582e-05, - "loss": 1.0648, + "epoch": 0.3300047790326232, + "grad_norm": 0.6103200316429138, + "learning_rate": 1.5455917191397806e-05, + "loss": 0.8207, "step": 16400 }, { - "epoch": 0.2737850787132101, - "grad_norm": 0.636163055896759, - "learning_rate": 1.6869807055086037e-05, - "loss": 1.0742, + "epoch": 0.3320170032950172, + "grad_norm": 0.6216299533843994, + "learning_rate": 1.5401745474400306e-05, + "loss": 0.8218, "step": 16500 }, { - "epoch": 0.2737850787132101, - "eval_loss": 0.5752170085906982, - "eval_runtime": 11.4927, - "eval_samples_per_second": 32.891, - "eval_steps_per_second": 1.131, + "epoch": 0.3320170032950172, + "eval_loss": 0.5613713264465332, + "eval_runtime": 11.3097, + "eval_samples_per_second": 33.423, + "eval_steps_per_second": 1.149, "step": 16500 }, { - "epoch": 0.27544438222056294, - "grad_norm": 0.7744444608688354, - "learning_rate": 1.683105647268537e-05, - "loss": 1.0656, + "epoch": 0.33402922755741127, + "grad_norm": 0.6130411624908447, + "learning_rate": 1.5347348996416626e-05, + "loss": 0.8193, "step": 16600 }, { - "epoch": 0.2771036857279157, - "grad_norm": 0.6467107534408569, - "learning_rate": 1.6792112611135577e-05, - "loss": 1.0766, + "epoch": 0.3360414518198053, + "grad_norm": 0.7175905704498291, + "learning_rate": 1.5292730020827394e-05, + "loss": 0.8205, "step": 16700 }, { - "epoch": 0.2787629892352685, - "grad_norm": 0.7055858969688416, - "learning_rate": 1.6752976572321333e-05, - "loss": 1.074, + "epoch": 0.3380536760821994, + "grad_norm": 0.5804928541183472, + "learning_rate": 1.5237890820271124e-05, + "loss": 0.8256, + "step": 16800 + }, + { + "epoch": 0.3380536760821994, + "eval_loss": 0.558940589427948, + "eval_runtime": 11.507, + "eval_samples_per_second": 32.849, + "eval_steps_per_second": 1.13, "step": 16800 }, { - "epoch": 0.28042229274262126, - "grad_norm": 0.703170895576477, - "learning_rate": 1.671364946356481e-05, - "loss": 1.0632, + "epoch": 0.3400659003445934, + "grad_norm": 0.7494300007820129, + "learning_rate": 1.518283367654966e-05, + "loss": 0.8225, "step": 16900 }, { - "epoch": 0.2820815962499741, - "grad_norm": 0.6387879848480225, - "learning_rate": 1.667413239759434e-05, - "loss": 1.0571, + "epoch": 0.34207812460698744, + "grad_norm": 0.5440366268157959, + "learning_rate": 1.5127560880533242e-05, + "loss": 0.8272, "step": 17000 }, { - "epoch": 0.2820815962499741, - "eval_loss": 0.5720813274383545, - "eval_runtime": 11.5637, - "eval_samples_per_second": 32.688, - "eval_steps_per_second": 1.124, - "step": 17000 + "epoch": 0.3440903488693815, + "grad_norm": 0.5601567625999451, + "learning_rate": 1.5072074732065165e-05, + "loss": 0.829, + "step": 17100 }, { - "epoch": 0.28374089975732686, - "grad_norm": 0.6398823261260986, - "learning_rate": 1.663442649251295e-05, - "loss": 1.0569, + "epoch": 0.3440903488693815, + "eval_loss": 0.5592995285987854, + "eval_runtime": 11.056, + "eval_samples_per_second": 34.19, + "eval_steps_per_second": 1.176, "step": 17100 }, { - "epoch": 0.28540020326467963, - "grad_norm": 0.7881171703338623, - "learning_rate": 1.6594532871766712e-05, - "loss": 1.0784, + "epoch": 0.34610257313177556, + "grad_norm": 0.6553789377212524, + "learning_rate": 1.5016377539866106e-05, + "loss": 0.824, "step": 17200 }, { - "epoch": 0.28705950677203246, - "grad_norm": 0.7140219807624817, - "learning_rate": 1.6554452664112954e-05, - "loss": 1.0624, + "epoch": 0.34811479739416956, + "grad_norm": 0.7243614792823792, + "learning_rate": 1.4960471621438047e-05, + "loss": 0.8206, "step": 17300 }, { - "epoch": 0.28871881027938523, - "grad_norm": 0.7242003679275513, - "learning_rate": 1.6514187003588333e-05, - "loss": 1.0561, + "epoch": 0.3501270216565636, + "grad_norm": 0.7584229111671448, + "learning_rate": 1.4904359302967848e-05, + "loss": 0.8264, "step": 17400 }, { - "epoch": 0.290378113786738, - "grad_norm": 0.7598658800125122, - "learning_rate": 1.6473737029476735e-05, - "loss": 1.0615, - "step": 17500 + "epoch": 0.3501270216565636, + "eval_loss": 0.5582433342933655, + "eval_runtime": 11.4613, + "eval_samples_per_second": 32.98, + "eval_steps_per_second": 1.134, + "step": 17400 }, { - "epoch": 0.290378113786738, - "eval_loss": 0.5700154900550842, - "eval_runtime": 11.5819, - "eval_samples_per_second": 32.637, - "eval_steps_per_second": 1.122, + "epoch": 0.3521392459189577, + "grad_norm": 0.9413104057312012, + "learning_rate": 1.4848042919230464e-05, + "loss": 0.8082, "step": 17500 }, { - "epoch": 0.29203741729409083, - "grad_norm": 0.690846860408783, - "learning_rate": 1.6433103886277063e-05, - "loss": 1.0588, + "epoch": 0.35415147018135174, + "grad_norm": 0.7952352166175842, + "learning_rate": 1.4791524813491789e-05, + "loss": 0.8138, "step": 17600 }, { - "epoch": 0.2936967208014436, - "grad_norm": 0.7652044892311096, - "learning_rate": 1.6392288723670824e-05, - "loss": 1.0661, + "epoch": 0.35616369444374574, + "grad_norm": 0.6611462235450745, + "learning_rate": 1.4734807337411166e-05, + "loss": 0.817, + "step": 17700 + }, + { + "epoch": 0.35616369444374574, + "eval_loss": 0.5570442080497742, + "eval_runtime": 11.4931, + "eval_samples_per_second": 32.889, + "eval_steps_per_second": 1.131, "step": 17700 }, { - "epoch": 0.29535602430879637, - "grad_norm": 0.7652806043624878, - "learning_rate": 1.6351292696489624e-05, - "loss": 1.0548, + "epoch": 0.3581759187061398, + "grad_norm": 0.8845998644828796, + "learning_rate": 1.4677892850943516e-05, + "loss": 0.8124, "step": 17800 }, { - "epoch": 0.29701532781614914, - "grad_norm": 0.7272448539733887, - "learning_rate": 1.631011696468248e-05, - "loss": 1.0569, + "epoch": 0.36018814296853385, + "grad_norm": 0.6421878337860107, + "learning_rate": 1.462078372224117e-05, + "loss": 0.814, "step": 17900 }, { - "epoch": 0.29867463132350197, - "grad_norm": 0.715149462223053, - "learning_rate": 1.6268762693283008e-05, - "loss": 1.068, + "epoch": 0.3622003672309279, + "grad_norm": 0.6532554030418396, + "learning_rate": 1.456348232755531e-05, + "loss": 0.8081, "step": 18000 }, { - "epoch": 0.29867463132350197, - "eval_loss": 0.5687873959541321, - "eval_runtime": 11.5496, - "eval_samples_per_second": 32.729, - "eval_steps_per_second": 1.126, + "epoch": 0.3622003672309279, + "eval_loss": 0.5557852983474731, + "eval_runtime": 11.4159, + "eval_samples_per_second": 33.112, + "eval_steps_per_second": 1.139, "step": 18000 }, { - "epoch": 0.30033393483085474, - "grad_norm": 0.7110170125961304, - "learning_rate": 1.6227231052376453e-05, - "loss": 1.0608, + "epoch": 0.3642125914933219, + "grad_norm": 0.8483557105064392, + "learning_rate": 1.4505991051137112e-05, + "loss": 0.8137, "step": 18100 }, { - "epoch": 0.3019932383382075, - "grad_norm": 0.6534146666526794, - "learning_rate": 1.6185523217066585e-05, - "loss": 1.0557, + "epoch": 0.36622481575571597, + "grad_norm": 0.7414484620094299, + "learning_rate": 1.4448312285138524e-05, + "loss": 0.8095, "step": 18200 }, { - "epoch": 0.30365254184556034, - "grad_norm": 0.6942999958992004, - "learning_rate": 1.6143640367442447e-05, - "loss": 1.0641, + "epoch": 0.36823704001811003, + "grad_norm": 0.6685389280319214, + "learning_rate": 1.4390448429512747e-05, + "loss": 0.8108, + "step": 18300 + }, + { + "epoch": 0.36823704001811003, + "eval_loss": 0.5559925436973572, + "eval_runtime": 11.4267, + "eval_samples_per_second": 33.081, + "eval_steps_per_second": 1.138, "step": 18300 }, { - "epoch": 0.3053118453529131, - "grad_norm": 0.7738041877746582, - "learning_rate": 1.610158368854498e-05, - "loss": 1.0536, + "epoch": 0.3702492642805041, + "grad_norm": 0.5973154306411743, + "learning_rate": 1.4332401891914365e-05, + "loss": 0.8144, "step": 18400 }, { - "epoch": 0.3069711488602659, - "grad_norm": 0.7291901707649231, - "learning_rate": 1.605935437033347e-05, - "loss": 1.0561, + "epoch": 0.3722614885428981, + "grad_norm": 0.6153602004051208, + "learning_rate": 1.4274175087599166e-05, + "loss": 0.8234, "step": 18500 }, { - "epoch": 0.3069711488602659, - "eval_loss": 0.5693631172180176, - "eval_runtime": 11.5129, - "eval_samples_per_second": 32.833, - "eval_steps_per_second": 1.129, - "step": 18500 + "epoch": 0.37427371280529215, + "grad_norm": 0.6379988789558411, + "learning_rate": 1.4215770439323657e-05, + "loss": 0.8137, + "step": 18600 }, { - "epoch": 0.3086304523676187, - "grad_norm": 0.648060142993927, - "learning_rate": 1.601695360765189e-05, - "loss": 1.0784, + "epoch": 0.37427371280529215, + "eval_loss": 0.5545734763145447, + "eval_runtime": 11.3444, + "eval_samples_per_second": 33.32, + "eval_steps_per_second": 1.146, "step": 18600 }, { - "epoch": 0.3102897558749715, - "grad_norm": 0.6197877526283264, - "learning_rate": 1.597438260019511e-05, - "loss": 1.0529, + "epoch": 0.3762859370676862, + "grad_norm": 0.6836999654769897, + "learning_rate": 1.4157190377244233e-05, + "loss": 0.811, "step": 18700 }, { - "epoch": 0.31194905938232426, - "grad_norm": 0.7363802790641785, - "learning_rate": 1.593164255247492e-05, - "loss": 1.0624, + "epoch": 0.37829816133008026, + "grad_norm": 0.5659916400909424, + "learning_rate": 1.409843733881608e-05, + "loss": 0.8175, "step": 18800 }, { - "epoch": 0.3136083628896771, - "grad_norm": 0.7164952158927917, - "learning_rate": 1.5888734673785967e-05, - "loss": 1.0475, + "epoch": 0.38031038559247426, + "grad_norm": 0.6270354986190796, + "learning_rate": 1.4039513768691753e-05, + "loss": 0.8221, "step": 18900 }, { - "epoch": 0.31526766639702986, - "grad_norm": 0.7213062644004822, - "learning_rate": 1.5845660178171548e-05, - "loss": 1.0688, - "step": 19000 + "epoch": 0.38031038559247426, + "eval_loss": 0.5561990737915039, + "eval_runtime": 11.437, + "eval_samples_per_second": 33.051, + "eval_steps_per_second": 1.137, + "step": 18900 }, { - "epoch": 0.31526766639702986, - "eval_loss": 0.5676007270812988, - "eval_runtime": 14.2901, - "eval_samples_per_second": 26.452, - "eval_steps_per_second": 0.91, + "epoch": 0.3823226098548683, + "grad_norm": 0.6403433680534363, + "learning_rate": 1.3980422118619447e-05, + "loss": 0.8156, "step": 19000 }, { - "epoch": 0.3169269699043826, - "grad_norm": 0.7193216681480408, - "learning_rate": 1.5802420284389246e-05, - "loss": 1.0596, + "epoch": 0.3843348341172624, + "grad_norm": 0.5956655144691467, + "learning_rate": 1.3921164847340996e-05, + "loss": 0.8161, "step": 19100 }, { - "epoch": 0.3185862734117354, - "grad_norm": 0.7098826169967651, - "learning_rate": 1.5759016215876443e-05, - "loss": 1.064, + "epoch": 0.38634705837965644, + "grad_norm": 1.1075905561447144, + "learning_rate": 1.3861744420489547e-05, + "loss": 0.8115, + "step": 19200 + }, + { + "epoch": 0.38634705837965644, + "eval_loss": 0.5551438927650452, + "eval_runtime": 11.6061, + "eval_samples_per_second": 32.569, + "eval_steps_per_second": 1.12, "step": 19200 }, { - "epoch": 0.3202455769190882, - "grad_norm": 0.6447449922561646, - "learning_rate": 1.571544920071572e-05, - "loss": 1.0554, + "epoch": 0.38835928264205044, + "grad_norm": 0.5919958353042603, + "learning_rate": 1.380216331048699e-05, + "loss": 0.8042, "step": 19300 }, { - "epoch": 0.321904880426441, - "grad_norm": 0.6695635318756104, - "learning_rate": 1.5671720471600087e-05, - "loss": 1.0495, + "epoch": 0.3903715069044445, + "grad_norm": 0.599104106426239, + "learning_rate": 1.3742423996441067e-05, + "loss": 0.8107, "step": 19400 }, { - "epoch": 0.32356418393379377, - "grad_norm": 0.6962459087371826, - "learning_rate": 1.562783126579813e-05, - "loss": 1.0597, + "epoch": 0.39238373116683856, + "grad_norm": 0.6891294121742249, + "learning_rate": 1.3682528964042234e-05, + "loss": 0.8082, "step": 19500 }, { - "epoch": 0.32356418393379377, - "eval_loss": 0.5666677355766296, - "eval_runtime": 11.5966, - "eval_samples_per_second": 32.596, - "eval_steps_per_second": 1.121, + "epoch": 0.39238373116683856, + "eval_loss": 0.5554007291793823, + "eval_runtime": 11.5763, + "eval_samples_per_second": 32.653, + "eval_steps_per_second": 1.123, "step": 19500 }, { - "epoch": 0.3252234874411466, - "grad_norm": 0.644284725189209, - "learning_rate": 1.558378282511899e-05, - "loss": 1.0574, + "epoch": 0.3943959554292326, + "grad_norm": 0.6625336408615112, + "learning_rate": 1.3622480705460217e-05, + "loss": 0.8161, "step": 19600 }, { - "epoch": 0.32688279094849937, - "grad_norm": 0.7356826663017273, - "learning_rate": 1.553957639587723e-05, - "loss": 1.0478, + "epoch": 0.3964081796916266, + "grad_norm": 0.6874691843986511, + "learning_rate": 1.3562281719240323e-05, + "loss": 0.808, "step": 19700 }, { - "epoch": 0.32854209445585214, - "grad_norm": 0.7161983847618103, - "learning_rate": 1.549521322885755e-05, - "loss": 1.0637, + "epoch": 0.3984204039540207, + "grad_norm": 0.6335239410400391, + "learning_rate": 1.3501934510199479e-05, + "loss": 0.8172, "step": 19800 }, { - "epoch": 0.33020139796320497, - "grad_norm": 0.6642721891403198, - "learning_rate": 1.5450694579279454e-05, - "loss": 1.0616, + "epoch": 0.3984204039540207, + "eval_loss": 0.5533725023269653, + "eval_runtime": 11.4224, + "eval_samples_per_second": 33.093, + "eval_steps_per_second": 1.138, + "step": 19800 + }, + { + "epoch": 0.40043262821641473, + "grad_norm": 0.6799935102462769, + "learning_rate": 1.3441441589322013e-05, + "loss": 0.8102, "step": 19900 }, { - "epoch": 0.33186070147055774, - "grad_norm": 0.6332038640975952, - "learning_rate": 1.5406021706761657e-05, - "loss": 1.0532, + "epoch": 0.4024448524788088, + "grad_norm": 0.7125223278999329, + "learning_rate": 1.338080547365517e-05, + "loss": 0.8196, "step": 20000 }, { - "epoch": 0.33186070147055774, - "eval_loss": 0.56368488073349, - "eval_runtime": 11.5786, - "eval_samples_per_second": 32.647, - "eval_steps_per_second": 1.123, - "step": 20000 + "epoch": 0.4044570767412028, + "grad_norm": 0.6379702091217041, + "learning_rate": 1.3320028686204378e-05, + "loss": 0.7988, + "step": 20100 }, { - "epoch": 0.3335200049779105, - "grad_norm": 0.7290294766426086, - "learning_rate": 1.5361195875286518e-05, - "loss": 1.0632, + "epoch": 0.4044570767412028, + "eval_loss": 0.5532128214836121, + "eval_runtime": 11.5518, + "eval_samples_per_second": 32.722, + "eval_steps_per_second": 1.125, "step": 20100 }, { - "epoch": 0.3351793084852633, - "grad_norm": 0.6711296439170837, - "learning_rate": 1.5316218353164228e-05, - "loss": 1.0545, + "epoch": 0.40646930100359685, + "grad_norm": 0.6244897842407227, + "learning_rate": 1.325911375582827e-05, + "loss": 0.8078, "step": 20200 }, { - "epoch": 0.3368386119926161, - "grad_norm": 0.6269948482513428, - "learning_rate": 1.5271090412996944e-05, - "loss": 1.0506, + "epoch": 0.4084815252659909, + "grad_norm": 0.6567655801773071, + "learning_rate": 1.319806321713346e-05, + "loss": 0.812, "step": 20300 }, { - "epoch": 0.3384979154999689, - "grad_norm": 0.6367862820625305, - "learning_rate": 1.5225813331642782e-05, - "loss": 1.0626, + "epoch": 0.41049374952838497, + "grad_norm": 0.7605450749397278, + "learning_rate": 1.3136879610369091e-05, + "loss": 0.8078, "step": 20400 }, { - "epoch": 0.34015721900732165, - "grad_norm": 0.7977243065834045, - "learning_rate": 1.518038839017968e-05, - "loss": 1.0511, - "step": 20500 + "epoch": 0.41049374952838497, + "eval_loss": 0.5506391525268555, + "eval_runtime": 11.3697, + "eval_samples_per_second": 33.246, + "eval_steps_per_second": 1.143, + "step": 20400 }, { - "epoch": 0.34015721900732165, - "eval_loss": 0.5636546015739441, - "eval_runtime": 14.8478, - "eval_samples_per_second": 25.458, - "eval_steps_per_second": 0.876, + "epoch": 0.41250597379077897, + "grad_norm": 0.669282853603363, + "learning_rate": 1.3075565481321122e-05, + "loss": 0.8086, "step": 20500 }, { - "epoch": 0.3418165225146745, - "grad_norm": 0.6192759871482849, - "learning_rate": 1.5134816873869157e-05, - "loss": 1.0575, + "epoch": 0.414518198053173, + "grad_norm": 0.6792070269584656, + "learning_rate": 1.301412338120641e-05, + "loss": 0.8075, "step": 20600 }, { - "epoch": 0.34347582602202725, - "grad_norm": 0.6976715922355652, - "learning_rate": 1.5089100072119954e-05, - "loss": 1.0619, + "epoch": 0.4165304223155671, + "grad_norm": 0.5937780737876892, + "learning_rate": 1.2952555866566554e-05, + "loss": 0.8151, + "step": 20700 + }, + { + "epoch": 0.4165304223155671, + "eval_loss": 0.5495349168777466, + "eval_runtime": 11.3633, + "eval_samples_per_second": 33.265, + "eval_steps_per_second": 1.144, "step": 20700 }, { - "epoch": 0.34513512952938, - "grad_norm": 0.7139025330543518, - "learning_rate": 1.504323927845154e-05, - "loss": 1.0476, + "epoch": 0.4185426465779611, + "grad_norm": 0.6547305583953857, + "learning_rate": 1.2890865499161522e-05, + "loss": 0.8022, "step": 20800 }, { - "epoch": 0.34679443303673285, - "grad_norm": 0.7258477807044983, - "learning_rate": 1.4997235790457514e-05, - "loss": 1.0523, + "epoch": 0.42055487084035514, + "grad_norm": 0.5942917466163635, + "learning_rate": 1.2829054845863054e-05, + "loss": 0.8079, "step": 20900 }, { - "epoch": 0.3484537365440856, - "grad_norm": 0.6508980393409729, - "learning_rate": 1.4951090909768904e-05, - "loss": 1.0575, + "epoch": 0.4225670951027492, + "grad_norm": 0.5794849991798401, + "learning_rate": 1.2767126478547865e-05, + "loss": 0.8152, "step": 21000 }, { - "epoch": 0.3484537365440856, - "eval_loss": 0.5632456541061401, - "eval_runtime": 11.6568, - "eval_samples_per_second": 32.427, - "eval_steps_per_second": 1.115, + "epoch": 0.4225670951027492, + "eval_loss": 0.5491987466812134, + "eval_runtime": 11.3343, + "eval_samples_per_second": 33.35, + "eval_steps_per_second": 1.147, "step": 21000 }, { - "epoch": 0.3501130400514384, - "grad_norm": 0.5940484404563904, - "learning_rate": 1.4904805942017315e-05, - "loss": 1.0562, + "epoch": 0.42457931936514326, + "grad_norm": 0.6574000120162964, + "learning_rate": 1.2705082973990623e-05, + "loss": 0.8087, "step": 21100 }, { - "epoch": 0.3517723435587912, - "grad_norm": 0.6419463753700256, - "learning_rate": 1.4858382196798011e-05, - "loss": 1.0524, + "epoch": 0.42659154362753726, + "grad_norm": 0.6523112654685974, + "learning_rate": 1.264292691375674e-05, + "loss": 0.8098, "step": 21200 }, { - "epoch": 0.353431647066144, - "grad_norm": 0.660627007484436, - "learning_rate": 1.4811820987632852e-05, - "loss": 1.0518, + "epoch": 0.4286037678899313, + "grad_norm": 0.6403859853744507, + "learning_rate": 1.2580660884094944e-05, + "loss": 0.8125, + "step": 21300 + }, + { + "epoch": 0.4286037678899313, + "eval_loss": 0.5487639307975769, + "eval_runtime": 11.6017, + "eval_samples_per_second": 32.581, + "eval_steps_per_second": 1.121, "step": 21300 }, { - "epoch": 0.35509095057349677, - "grad_norm": 0.7089664936065674, - "learning_rate": 1.4765123631933118e-05, - "loss": 1.0523, + "epoch": 0.4306159921523254, + "grad_norm": 0.6883541345596313, + "learning_rate": 1.2518287475829687e-05, + "loss": 0.804, "step": 21400 }, { - "epoch": 0.35675025408084954, - "grad_norm": 0.6418955326080322, - "learning_rate": 1.471829145096225e-05, - "loss": 1.0465, + "epoch": 0.43262821641471944, + "grad_norm": 0.6650357246398926, + "learning_rate": 1.2455809284253329e-05, + "loss": 0.8097, "step": 21500 }, { - "epoch": 0.35675025408084954, - "eval_loss": 0.5611768960952759, - "eval_runtime": 11.6869, - "eval_samples_per_second": 32.344, - "eval_steps_per_second": 1.112, - "step": 21500 + "epoch": 0.43464044067711344, + "grad_norm": 0.6048406958580017, + "learning_rate": 1.239322890901815e-05, + "loss": 0.8059, + "step": 21600 }, { - "epoch": 0.35840955758820237, - "grad_norm": 0.6870258450508118, - "learning_rate": 1.4671325769798462e-05, - "loss": 1.0541, + "epoch": 0.43464044067711344, + "eval_loss": 0.5487421751022339, + "eval_runtime": 11.4779, + "eval_samples_per_second": 32.933, + "eval_steps_per_second": 1.133, "step": 21600 }, { - "epoch": 0.36006886109555514, - "grad_norm": 0.6814928650856018, - "learning_rate": 1.462422791729724e-05, - "loss": 1.047, + "epoch": 0.4366526649395075, + "grad_norm": 0.6876850724220276, + "learning_rate": 1.233054895402819e-05, + "loss": 0.8027, "step": 21700 }, { - "epoch": 0.3617281646029079, - "grad_norm": 0.7126842141151428, - "learning_rate": 1.4576999226053758e-05, - "loss": 1.0433, + "epoch": 0.43866488920190155, + "grad_norm": 0.656778872013092, + "learning_rate": 1.2267772027330893e-05, + "loss": 0.8124, "step": 21800 }, { - "epoch": 0.36338746811026074, - "grad_norm": 0.6246159076690674, - "learning_rate": 1.4529641032365155e-05, - "loss": 1.0422, + "epoch": 0.4406771134642956, + "grad_norm": 0.6603732109069824, + "learning_rate": 1.22049007410086e-05, + "loss": 0.8032, "step": 21900 }, { - "epoch": 0.3650467716176135, - "grad_norm": 0.6283680200576782, - "learning_rate": 1.4482154676192744e-05, - "loss": 1.0573, - "step": 22000 + "epoch": 0.4406771134642956, + "eval_loss": 0.547619104385376, + "eval_runtime": 11.4392, + "eval_samples_per_second": 33.044, + "eval_steps_per_second": 1.136, + "step": 21900 }, { - "epoch": 0.3650467716176135, - "eval_loss": 0.5592057108879089, - "eval_runtime": 15.2268, - "eval_samples_per_second": 24.825, - "eval_steps_per_second": 0.854, + "epoch": 0.4426893377266896, + "grad_norm": 0.5987362861633301, + "learning_rate": 1.2141937711069857e-05, + "loss": 0.8075, "step": 22000 }, { - "epoch": 0.3667060751249663, - "grad_norm": 0.6936048865318298, - "learning_rate": 1.4434541501124088e-05, - "loss": 1.0401, + "epoch": 0.44470156198908367, + "grad_norm": 0.6756895780563354, + "learning_rate": 1.2078885557340562e-05, + "loss": 0.8092, "step": 22100 }, { - "epoch": 0.3683653786323191, - "grad_norm": 0.6777641773223877, - "learning_rate": 1.4386802854334988e-05, - "loss": 1.0652, + "epoch": 0.44671378625147773, + "grad_norm": 0.7242164015769958, + "learning_rate": 1.2015746903354968e-05, + "loss": 0.8156, "step": 22200 }, { - "epoch": 0.3700246821396719, - "grad_norm": 0.6012935042381287, - "learning_rate": 1.4338940086551357e-05, - "loss": 1.0563, + "epoch": 0.44671378625147773, + "eval_loss": 0.5490314364433289, + "eval_runtime": 11.6139, + "eval_samples_per_second": 32.547, + "eval_steps_per_second": 1.119, + "step": 22200 + }, + { + "epoch": 0.4487260105138718, + "grad_norm": 0.77918541431427, + "learning_rate": 1.1952524376246504e-05, + "loss": 0.8063, "step": 22300 }, { - "epoch": 0.37168398564702465, - "grad_norm": 0.7001439929008484, - "learning_rate": 1.4290954552011021e-05, - "loss": 1.046, + "epoch": 0.4507382347762658, + "grad_norm": 0.6913318634033203, + "learning_rate": 1.1889220606638476e-05, + "loss": 0.8079, "step": 22400 }, { - "epoch": 0.3733432891543774, - "grad_norm": 0.6927397847175598, - "learning_rate": 1.4242847608425383e-05, - "loss": 1.0473, + "epoch": 0.45275045903865985, + "grad_norm": 0.747986376285553, + "learning_rate": 1.1825838228534607e-05, + "loss": 0.8033, "step": 22500 }, { - "epoch": 0.3733432891543774, - "eval_loss": 0.5581105351448059, - "eval_runtime": 11.8265, - "eval_samples_per_second": 31.962, - "eval_steps_per_second": 1.099, + "epoch": 0.45275045903865985, + "eval_loss": 0.5468713045120239, + "eval_runtime": 11.4, + "eval_samples_per_second": 33.158, + "eval_steps_per_second": 1.14, "step": 22500 }, { - "epoch": 0.37500259266173025, - "grad_norm": 0.7469542622566223, - "learning_rate": 1.4194620616941017e-05, - "loss": 1.0493, + "epoch": 0.4547626833010539, + "grad_norm": 0.6693961024284363, + "learning_rate": 1.1762379879209442e-05, + "loss": 0.8089, "step": 22600 }, { - "epoch": 0.376661896169083, - "grad_norm": 0.7251051068305969, - "learning_rate": 1.4146274942101163e-05, - "loss": 1.051, + "epoch": 0.45677490756344796, + "grad_norm": 0.6168875098228455, + "learning_rate": 1.1698848199098596e-05, + "loss": 0.7998, "step": 22700 }, { - "epoch": 0.3783211996764358, - "grad_norm": 0.8564967513084412, - "learning_rate": 1.4097811951807092e-05, - "loss": 1.0471, + "epoch": 0.45878713182584197, + "grad_norm": 0.6753715872764587, + "learning_rate": 1.1635245831688913e-05, + "loss": 0.8057, + "step": 22800 + }, + { + "epoch": 0.45878713182584197, + "eval_loss": 0.5467536449432373, + "eval_runtime": 11.3082, + "eval_samples_per_second": 33.427, + "eval_steps_per_second": 1.15, "step": 22800 }, { - "epoch": 0.3799805031837886, - "grad_norm": 0.6386659145355225, - "learning_rate": 1.4049233017279436e-05, - "loss": 1.0499, + "epoch": 0.460799356088236, + "grad_norm": 0.6399224996566772, + "learning_rate": 1.1571575423408456e-05, + "loss": 0.7965, "step": 22900 }, { - "epoch": 0.3816398066911414, - "grad_norm": 0.6405443549156189, - "learning_rate": 1.4000539513019365e-05, - "loss": 1.0455, + "epoch": 0.4628115803506301, + "grad_norm": 0.5371870994567871, + "learning_rate": 1.1507839623516401e-05, + "loss": 0.8014, "step": 23000 }, { - "epoch": 0.3816398066911414, - "eval_loss": 0.5561164021492004, - "eval_runtime": 13.2111, - "eval_samples_per_second": 28.612, - "eval_steps_per_second": 0.984, - "step": 23000 + "epoch": 0.46482380461302414, + "grad_norm": 0.711793839931488, + "learning_rate": 1.1444041083992801e-05, + "loss": 0.8081, + "step": 23100 }, { - "epoch": 0.38329911019849416, - "grad_norm": 0.6323944926261902, - "learning_rate": 1.3951732816769707e-05, - "loss": 1.0328, + "epoch": 0.46482380461302414, + "eval_loss": 0.5455725193023682, + "eval_runtime": 11.4796, + "eval_samples_per_second": 32.928, + "eval_steps_per_second": 1.132, "step": 23100 }, { - "epoch": 0.384958413705847, - "grad_norm": 0.7174931168556213, - "learning_rate": 1.3902814309475968e-05, - "loss": 1.0433, + "epoch": 0.46683602887541814, + "grad_norm": 0.566677451133728, + "learning_rate": 1.1380182459428234e-05, + "loss": 0.8027, "step": 23200 }, { - "epoch": 0.38661771721319976, - "grad_norm": 0.6990362405776978, - "learning_rate": 1.3853785375247253e-05, - "loss": 1.0551, + "epoch": 0.4688482531378122, + "grad_norm": 0.7086474895477295, + "learning_rate": 1.1316266406913355e-05, + "loss": 0.8024, "step": 23300 }, { - "epoch": 0.38827702072055253, - "grad_norm": 0.6381626129150391, - "learning_rate": 1.3804647401317106e-05, - "loss": 1.0567, + "epoch": 0.47086047740020626, + "grad_norm": 0.6261083483695984, + "learning_rate": 1.1252295585928343e-05, + "loss": 0.8054, "step": 23400 }, { - "epoch": 0.38993632422790536, - "grad_norm": 0.6417682766914368, - "learning_rate": 1.3755401778004266e-05, - "loss": 1.0429, - "step": 23500 + "epoch": 0.47086047740020626, + "eval_loss": 0.5444592833518982, + "eval_runtime": 11.5945, + "eval_samples_per_second": 32.602, + "eval_steps_per_second": 1.121, + "step": 23400 }, { - "epoch": 0.38993632422790536, - "eval_loss": 0.5571095943450928, - "eval_runtime": 18.5738, - "eval_samples_per_second": 20.351, - "eval_steps_per_second": 0.7, + "epoch": 0.4728727016626003, + "grad_norm": 0.6763809323310852, + "learning_rate": 1.1188272658232228e-05, + "loss": 0.7952, "step": 23500 }, { - "epoch": 0.39159562773525813, - "grad_norm": 0.7145078182220459, - "learning_rate": 1.3706049898673315e-05, - "loss": 1.0379, + "epoch": 0.4748849259249943, + "grad_norm": 0.6690487265586853, + "learning_rate": 1.1124200287752157e-05, + "loss": 0.807, "step": 23600 }, { - "epoch": 0.3932549312426109, - "grad_norm": 0.7557052969932556, - "learning_rate": 1.3656593159695267e-05, - "loss": 1.0513, + "epoch": 0.4768971501873884, + "grad_norm": 0.5711999535560608, + "learning_rate": 1.1060081140472519e-05, + "loss": 0.8052, + "step": 23700 + }, + { + "epoch": 0.4768971501873884, + "eval_loss": 0.5443876385688782, + "eval_runtime": 11.4195, + "eval_samples_per_second": 33.101, + "eval_steps_per_second": 1.138, "step": 23700 }, { - "epoch": 0.3949142347499637, - "grad_norm": 0.5949908494949341, - "learning_rate": 1.3607032960408051e-05, - "loss": 1.0324, + "epoch": 0.47890937444978243, + "grad_norm": 0.6411765217781067, + "learning_rate": 1.0995917884324056e-05, + "loss": 0.7976, "step": 23800 }, { - "epoch": 0.3965735382573165, - "grad_norm": 0.6159701347351074, - "learning_rate": 1.3557370703076924e-05, - "loss": 1.0381, + "epoch": 0.4809215987121765, + "grad_norm": 0.5719566941261292, + "learning_rate": 1.0931713189072827e-05, + "loss": 0.7992, "step": 23900 }, { - "epoch": 0.3982328417646693, - "grad_norm": 0.6602431535720825, - "learning_rate": 1.3507607792854789e-05, - "loss": 1.0429, + "epoch": 0.4829338229745705, + "grad_norm": 0.5175074934959412, + "learning_rate": 1.086746972620913e-05, + "loss": 0.8009, "step": 24000 }, { - "epoch": 0.3982328417646693, - "eval_loss": 0.5576474070549011, - "eval_runtime": 12.4814, - "eval_samples_per_second": 30.285, - "eval_steps_per_second": 1.042, + "epoch": 0.4829338229745705, + "eval_loss": 0.5424737334251404, + "eval_runtime": 11.3763, + "eval_samples_per_second": 33.227, + "eval_steps_per_second": 1.143, "step": 24000 }, { - "epoch": 0.39989214527202205, - "grad_norm": 0.609024703502655, - "learning_rate": 1.3457745637742442e-05, - "loss": 1.06, + "epoch": 0.48494604723696455, + "grad_norm": 0.6476929783821106, + "learning_rate": 1.0803190168836341e-05, + "loss": 0.7984, "step": 24100 }, { - "epoch": 0.4015514487793749, - "grad_norm": 0.6816290616989136, - "learning_rate": 1.3407785648548733e-05, - "loss": 1.0406, + "epoch": 0.4869582714993586, + "grad_norm": 0.6742759943008423, + "learning_rate": 1.0738877191559691e-05, + "loss": 0.7989, "step": 24200 }, { - "epoch": 0.40321075228672765, - "grad_norm": 0.6846993565559387, - "learning_rate": 1.3357729238850638e-05, - "loss": 1.0444, + "epoch": 0.48897049576175267, + "grad_norm": 0.5645999908447266, + "learning_rate": 1.067453347037498e-05, + "loss": 0.7985, "step": 24300 }, { - "epoch": 0.4048700557940804, - "grad_norm": 0.8666418790817261, - "learning_rate": 1.3307577824953288e-05, - "loss": 1.038, - "step": 24400 + "epoch": 0.48897049576175267, + "eval_loss": 0.5427749752998352, + "eval_runtime": 11.4256, + "eval_samples_per_second": 33.084, + "eval_steps_per_second": 1.138, + "step": 24300 }, { - "epoch": 0.40652935930143325, - "grad_norm": 0.6372240781784058, - "learning_rate": 1.325733282584987e-05, - "loss": 1.0425, - "step": 24500 + "epoch": 0.49098272002414667, + "grad_norm": 0.5972943902015686, + "learning_rate": 1.0610161682557225e-05, + "loss": 0.7961, + "step": 24400 }, { - "epoch": 0.40652935930143325, - "eval_loss": 0.5539091229438782, - "eval_runtime": 11.7875, - "eval_samples_per_second": 32.068, - "eval_steps_per_second": 1.103, + "epoch": 0.4929949442865407, + "grad_norm": 0.6340279579162598, + "learning_rate": 1.0545764506549273e-05, + "loss": 0.8033, "step": 24500 }, { - "epoch": 0.408188662808786, - "grad_norm": 0.6796088814735413, - "learning_rate": 1.3206995663181484e-05, - "loss": 1.0512, + "epoch": 0.4950071685489348, + "grad_norm": 0.6096486449241638, + "learning_rate": 1.0481344621850347e-05, + "loss": 0.7955, "step": 24600 }, { - "epoch": 0.4098479663161388, - "grad_norm": 0.6541156768798828, - "learning_rate": 1.3156567761196934e-05, - "loss": 1.0428, + "epoch": 0.4950071685489348, + "eval_loss": 0.5418882369995117, + "eval_runtime": 11.4157, + "eval_samples_per_second": 33.112, + "eval_steps_per_second": 1.139, + "step": 24600 + }, + { + "epoch": 0.49701939281132884, + "grad_norm": 0.5778651833534241, + "learning_rate": 1.041690470890455e-05, + "loss": 0.7954, "step": 24700 }, { - "epoch": 0.4115072698234916, - "grad_norm": 0.7349140048027039, - "learning_rate": 1.3106050546712408e-05, - "loss": 1.0378, + "epoch": 0.49903161707372284, + "grad_norm": 0.5838211178779602, + "learning_rate": 1.0352447448989337e-05, + "loss": 0.7854, "step": 24800 }, { - "epoch": 0.4131665733308444, - "grad_norm": 0.6726204752922058, - "learning_rate": 1.3055445449071124e-05, - "loss": 1.048, + "epoch": 0.5010438413361169, + "grad_norm": 0.5919055342674255, + "learning_rate": 1.0287975524103964e-05, + "loss": 0.7925, "step": 24900 }, { - "epoch": 0.41482587683819716, - "grad_norm": 0.644631028175354, - "learning_rate": 1.3004753900102886e-05, - "loss": 1.0402, - "step": 25000 + "epoch": 0.5010438413361169, + "eval_loss": 0.541851818561554, + "eval_runtime": 11.2979, + "eval_samples_per_second": 33.457, + "eval_steps_per_second": 1.151, + "step": 24900 }, { - "epoch": 0.41482587683819716, - "eval_loss": 0.5533541440963745, - "eval_runtime": 11.6271, - "eval_samples_per_second": 32.51, - "eval_steps_per_second": 1.118, + "epoch": 0.5030560655985109, + "grad_norm": 0.5358749628067017, + "learning_rate": 1.022349161685787e-05, + "loss": 0.7986, "step": 25000 }, { - "epoch": 0.41648518034554993, - "grad_norm": 0.7161290645599365, - "learning_rate": 1.2953977334083554e-05, - "loss": 1.0324, + "epoch": 0.505068289860905, + "grad_norm": 0.6401896476745605, + "learning_rate": 1.0158998410359074e-05, + "loss": 0.7914, "step": 25100 }, { - "epoch": 0.41814448385290276, - "grad_norm": 0.6660270690917969, - "learning_rate": 1.290311718769449e-05, - "loss": 1.0524, + "epoch": 0.507080514123299, + "grad_norm": 0.5817869901657104, + "learning_rate": 1.0094498588102523e-05, + "loss": 0.7956, + "step": 25200 + }, + { + "epoch": 0.507080514123299, + "eval_loss": 0.5417122840881348, + "eval_runtime": 11.503, + "eval_samples_per_second": 32.861, + "eval_steps_per_second": 1.13, "step": 25200 }, { - "epoch": 0.41980378736025553, - "grad_norm": 0.6349546909332275, - "learning_rate": 1.2852174899981884e-05, - "loss": 1.039, + "epoch": 0.5090927383856931, + "grad_norm": 0.5595591068267822, + "learning_rate": 1.0029994833858438e-05, + "loss": 0.7943, "step": 25300 }, { - "epoch": 0.4214630908676083, - "grad_norm": 0.6448563933372498, - "learning_rate": 1.2801151912316053e-05, - "loss": 1.0358, + "epoch": 0.5111049626480871, + "grad_norm": 0.5861169099807739, + "learning_rate": 9.965489831560652e-06, + "loss": 0.8006, "step": 25400 }, { - "epoch": 0.42312239437496113, - "grad_norm": 0.6865149736404419, - "learning_rate": 1.275004966835065e-05, - "loss": 1.0465, + "epoch": 0.5131171869104811, + "grad_norm": 0.5644922852516174, + "learning_rate": 9.900986265194924e-06, + "loss": 0.7868, "step": 25500 }, { - "epoch": 0.42312239437496113, - "eval_loss": 0.5532709956169128, - "eval_runtime": 11.7773, - "eval_samples_per_second": 32.096, - "eval_steps_per_second": 1.104, + "epoch": 0.5131171869104811, + "eval_loss": 0.5409750938415527, + "eval_runtime": 11.3254, + "eval_samples_per_second": 33.376, + "eval_steps_per_second": 1.148, "step": 25500 }, { - "epoch": 0.4247816978823139, - "grad_norm": 0.5589303374290466, - "learning_rate": 1.2698869613981825e-05, - "loss": 1.037, + "epoch": 0.5151294111728753, + "grad_norm": 0.5210478901863098, + "learning_rate": 9.836486818687262e-06, + "loss": 0.7967, "step": 25600 }, { - "epoch": 0.4264410013896667, - "grad_norm": 0.6359687447547913, - "learning_rate": 1.2647613197307305e-05, - "loss": 1.0366, + "epoch": 0.5171416354352693, + "grad_norm": 0.5937855839729309, + "learning_rate": 9.771994175792262e-06, + "loss": 0.7839, "step": 25700 }, { - "epoch": 0.4281003048970195, - "grad_norm": 0.6659601926803589, - "learning_rate": 1.2596281868585428e-05, - "loss": 1.0534, + "epoch": 0.5191538596976633, + "grad_norm": 0.68199622631073, + "learning_rate": 9.707511019981416e-06, + "loss": 0.7929, + "step": 25800 + }, + { + "epoch": 0.5191538596976633, + "eval_loss": 0.53957599401474, + "eval_runtime": 11.2847, + "eval_samples_per_second": 33.497, + "eval_steps_per_second": 1.152, "step": 25800 }, { - "epoch": 0.4297596084043723, - "grad_norm": 0.7066001296043396, - "learning_rate": 1.2544877080194104e-05, - "loss": 1.0431, + "epoch": 0.5211660839600574, + "grad_norm": 0.6363146305084229, + "learning_rate": 9.643040034331475e-06, + "loss": 0.7893, "step": 25900 }, { - "epoch": 0.43141891191172504, - "grad_norm": 0.6346915364265442, - "learning_rate": 1.2493400286589728e-05, - "loss": 1.045, + "epoch": 0.5231783082224514, + "grad_norm": 0.6275014877319336, + "learning_rate": 9.578583901412802e-06, + "loss": 0.7883, "step": 26000 }, { - "epoch": 0.43141891191172504, - "eval_loss": 0.5507645606994629, - "eval_runtime": 11.7491, - "eval_samples_per_second": 32.173, - "eval_steps_per_second": 1.106, - "step": 26000 + "epoch": 0.5251905324848455, + "grad_norm": 0.5840523838996887, + "learning_rate": 9.514145303177751e-06, + "loss": 0.7961, + "step": 26100 }, { - "epoch": 0.4330782154190778, - "grad_norm": 0.6226300597190857, - "learning_rate": 1.2441852944266025e-05, - "loss": 1.0389, + "epoch": 0.5251905324848455, + "eval_loss": 0.5387553572654724, + "eval_runtime": 11.2936, + "eval_samples_per_second": 33.47, + "eval_steps_per_second": 1.151, "step": 26100 }, { - "epoch": 0.43473751892643064, - "grad_norm": 0.6582189798355103, - "learning_rate": 1.239023651171283e-05, - "loss": 1.0337, + "epoch": 0.5272027567472395, + "grad_norm": 0.706901490688324, + "learning_rate": 9.449726920849085e-06, + "loss": 0.795, "step": 26200 }, { - "epoch": 0.4363968224337834, - "grad_norm": 0.6174708008766174, - "learning_rate": 1.2338552449374834e-05, - "loss": 1.0379, + "epoch": 0.5292149810096335, + "grad_norm": 0.5236905813217163, + "learning_rate": 9.385331434808386e-06, + "loss": 0.7919, "step": 26300 }, { - "epoch": 0.4380561259411362, - "grad_norm": 0.6207261681556702, - "learning_rate": 1.2286802219610266e-05, - "loss": 1.0403, + "epoch": 0.5312272052720276, + "grad_norm": 0.6014547348022461, + "learning_rate": 9.320961524484565e-06, + "loss": 0.7917, "step": 26400 }, { - "epoch": 0.439715429448489, - "grad_norm": 0.8077992796897888, - "learning_rate": 1.2234987286649492e-05, - "loss": 1.0401, - "step": 26500 + "epoch": 0.5312272052720276, + "eval_loss": 0.5388390421867371, + "eval_runtime": 11.3827, + "eval_samples_per_second": 33.208, + "eval_steps_per_second": 1.142, + "step": 26400 }, { - "epoch": 0.439715429448489, - "eval_loss": 0.5515198111534119, - "eval_runtime": 11.8447, - "eval_samples_per_second": 31.913, - "eval_steps_per_second": 1.098, + "epoch": 0.5332394295344216, + "grad_norm": 0.5613085031509399, + "learning_rate": 9.256619868242341e-06, + "loss": 0.7957, "step": 26500 }, { - "epoch": 0.4413747329558418, - "grad_norm": 0.6707153916358948, - "learning_rate": 1.2183109116553616e-05, - "loss": 1.027, + "epoch": 0.5352516537968156, + "grad_norm": 0.6822344064712524, + "learning_rate": 9.192309143270818e-06, + "loss": 0.7867, "step": 26600 }, { - "epoch": 0.44303403646319456, - "grad_norm": 0.6405569314956665, - "learning_rate": 1.2131169177172974e-05, - "loss": 1.032, + "epoch": 0.5372638780592097, + "grad_norm": 0.6041319370269775, + "learning_rate": 9.128032025472077e-06, + "loss": 0.7884, "step": 26700 }, { - "epoch": 0.4446933399705474, - "grad_norm": 0.5976676940917969, - "learning_rate": 1.2079168938105625e-05, - "loss": 1.0361, + "epoch": 0.5372638780592097, + "eval_loss": 0.5368719696998596, + "eval_runtime": 11.3484, + "eval_samples_per_second": 33.309, + "eval_steps_per_second": 1.146, + "step": 26700 + }, + { + "epoch": 0.5392761023216037, + "grad_norm": 0.644088089466095, + "learning_rate": 9.063791189349841e-06, + "loss": 0.7867, "step": 26800 }, { - "epoch": 0.44635264347790016, - "grad_norm": 0.6009154319763184, - "learning_rate": 1.2027109870655746e-05, - "loss": 1.036, + "epoch": 0.5412883265839978, + "grad_norm": 0.627928614616394, + "learning_rate": 8.999589307898192e-06, + "loss": 0.7896, "step": 26900 }, { - "epoch": 0.44801194698525293, - "grad_norm": 0.6396293640136719, - "learning_rate": 1.1974993447792025e-05, - "loss": 1.0343, + "epoch": 0.5433005508463918, + "grad_norm": 0.6207029819488525, + "learning_rate": 8.935429052490347e-06, + "loss": 0.7853, "step": 27000 }, { - "epoch": 0.44801194698525293, - "eval_loss": 0.5514323711395264, - "eval_runtime": 12.191, - "eval_samples_per_second": 31.006, - "eval_steps_per_second": 1.066, + "epoch": 0.5433005508463918, + "eval_loss": 0.5371023416519165, + "eval_runtime": 11.3461, + "eval_samples_per_second": 33.316, + "eval_steps_per_second": 1.146, "step": 27000 }, { - "epoch": 0.44967125049260576, - "grad_norm": 0.5939125418663025, - "learning_rate": 1.1922821144105967e-05, - "loss": 1.0295, + "epoch": 0.5453127751087858, + "grad_norm": 0.541533887386322, + "learning_rate": 8.87131309276751e-06, + "loss": 0.7916, "step": 27100 }, { - "epoch": 0.45133055399995853, - "grad_norm": 0.6601003408432007, - "learning_rate": 1.1870594435770184e-05, - "loss": 1.038, + "epoch": 0.54732499937118, + "grad_norm": 0.590813934803009, + "learning_rate": 8.807244096527783e-06, + "loss": 0.7948, "step": 27200 }, { - "epoch": 0.4529898575073113, - "grad_norm": 0.6417076587677002, - "learning_rate": 1.1818314800496622e-05, - "loss": 1.0264, + "epoch": 0.549337223633574, + "grad_norm": 0.584229588508606, + "learning_rate": 8.743224729615168e-06, + "loss": 0.7918, + "step": 27300 + }, + { + "epoch": 0.549337223633574, + "eval_loss": 0.5366615653038025, + "eval_runtime": 11.3157, + "eval_samples_per_second": 33.405, + "eval_steps_per_second": 1.149, "step": 27300 }, { - "epoch": 0.45464916101466407, - "grad_norm": 0.6616116166114807, - "learning_rate": 1.1765983717494747e-05, - "loss": 1.0444, + "epoch": 0.551349447895968, + "grad_norm": 0.6746295094490051, + "learning_rate": 8.679257655808645e-06, + "loss": 0.7911, "step": 27400 }, { - "epoch": 0.4563084645220169, - "grad_norm": 0.6725832223892212, - "learning_rate": 1.1713602667429704e-05, - "loss": 1.0283, + "epoch": 0.5533616721583621, + "grad_norm": 0.6765587329864502, + "learning_rate": 8.615345536711331e-06, + "loss": 0.7906, "step": 27500 }, { - "epoch": 0.4563084645220169, - "eval_loss": 0.5484245419502258, - "eval_runtime": 14.2305, - "eval_samples_per_second": 26.563, - "eval_steps_per_second": 0.914, - "step": 27500 + "epoch": 0.5553738964207561, + "grad_norm": 0.5838325619697571, + "learning_rate": 8.551491031639736e-06, + "loss": 0.7937, + "step": 27600 }, { - "epoch": 0.45796776802936967, - "grad_norm": 0.7447541356086731, - "learning_rate": 1.166117313238041e-05, - "loss": 1.0406, + "epoch": 0.5553738964207561, + "eval_loss": 0.5361348390579224, + "eval_runtime": 11.3123, + "eval_samples_per_second": 33.415, + "eval_steps_per_second": 1.149, "step": 27600 }, { - "epoch": 0.45962707153672244, - "grad_norm": 0.7025007605552673, - "learning_rate": 1.1608696595797624e-05, - "loss": 1.0363, + "epoch": 0.5573861206831502, + "grad_norm": 0.6001378893852234, + "learning_rate": 8.487696797513108e-06, + "loss": 0.7777, "step": 27700 }, { - "epoch": 0.46128637504407527, - "grad_norm": 0.6461624503135681, - "learning_rate": 1.1556174542461982e-05, - "loss": 1.0294, + "epoch": 0.5593983449455442, + "grad_norm": 0.5667701363563538, + "learning_rate": 8.423965488742885e-06, + "loss": 0.7856, "step": 27800 }, { - "epoch": 0.46294567855142804, - "grad_norm": 0.6059824228286743, - "learning_rate": 1.1503608458441968e-05, - "loss": 1.0294, + "epoch": 0.5614105692079382, + "grad_norm": 0.632291316986084, + "learning_rate": 8.360299757122247e-06, + "loss": 0.7792, "step": 27900 }, { - "epoch": 0.4646049820587808, - "grad_norm": 0.7821197509765625, - "learning_rate": 1.1450999831051888e-05, - "loss": 1.0332, - "step": 28000 + "epoch": 0.5614105692079382, + "eval_loss": 0.5353109240531921, + "eval_runtime": 11.3749, + "eval_samples_per_second": 33.231, + "eval_steps_per_second": 1.143, + "step": 27900 }, { - "epoch": 0.4646049820587808, - "eval_loss": 0.5501440763473511, - "eval_runtime": 11.7601, - "eval_samples_per_second": 32.142, - "eval_steps_per_second": 1.105, + "epoch": 0.5634227934703323, + "grad_norm": 0.5472155213356018, + "learning_rate": 8.296702251715778e-06, + "loss": 0.7831, "step": 28000 }, { - "epoch": 0.46626428556613364, - "grad_norm": 0.627096951007843, - "learning_rate": 1.1398350148809773e-05, - "loss": 1.0271, + "epoch": 0.5654350177327263, + "grad_norm": 0.590352475643158, + "learning_rate": 8.233175618749243e-06, + "loss": 0.7833, "step": 28100 }, { - "epoch": 0.4679235890734864, - "grad_norm": 0.5786092281341553, - "learning_rate": 1.134566090139527e-05, - "loss": 1.0302, + "epoch": 0.5674472419951203, + "grad_norm": 0.5392365455627441, + "learning_rate": 8.16972250149947e-06, + "loss": 0.7846, + "step": 28200 + }, + { + "epoch": 0.5674472419951203, + "eval_loss": 0.5345659852027893, + "eval_runtime": 11.3797, + "eval_samples_per_second": 33.217, + "eval_steps_per_second": 1.142, "step": 28200 }, { - "epoch": 0.4695828925808392, - "grad_norm": 0.6343611478805542, - "learning_rate": 1.1292933579607488e-05, - "loss": 1.0284, + "epoch": 0.5694594662575144, + "grad_norm": 0.5367996692657471, + "learning_rate": 8.106345540184382e-06, + "loss": 0.7881, "step": 28300 }, { - "epoch": 0.47124219608819196, - "grad_norm": 0.633843719959259, - "learning_rate": 1.1240169675322816e-05, - "loss": 1.0274, + "epoch": 0.5714716905199084, + "grad_norm": 0.7017585039138794, + "learning_rate": 8.043047371853135e-06, + "loss": 0.7902, "step": 28400 }, { - "epoch": 0.4729014995955448, - "grad_norm": 0.6252873539924622, - "learning_rate": 1.1187370681452728e-05, - "loss": 1.032, + "epoch": 0.5734839147823025, + "grad_norm": 0.6775383353233337, + "learning_rate": 7.979830630276384e-06, + "loss": 0.795, "step": 28500 }, { - "epoch": 0.4729014995955448, - "eval_loss": 0.546782374382019, - "eval_runtime": 11.904, - "eval_samples_per_second": 31.754, - "eval_steps_per_second": 1.092, + "epoch": 0.5734839147823025, + "eval_loss": 0.5349369645118713, + "eval_runtime": 11.3477, + "eval_samples_per_second": 33.311, + "eval_steps_per_second": 1.146, "step": 28500 }, { - "epoch": 0.47456080310289755, - "grad_norm": 0.6776255965232849, - "learning_rate": 1.1134538091901512e-05, - "loss": 1.0299, + "epoch": 0.5754961390446965, + "grad_norm": 0.5782616138458252, + "learning_rate": 7.91669794583671e-06, + "loss": 0.7902, "step": 28600 }, { - "epoch": 0.4762201066102503, - "grad_norm": 0.5924820303916931, - "learning_rate": 1.108167340152403e-05, - "loss": 1.0335, + "epoch": 0.5775083633070905, + "grad_norm": 0.5419892072677612, + "learning_rate": 7.853651945419155e-06, + "loss": 0.7858, "step": 28700 }, { - "epoch": 0.47787941011760315, - "grad_norm": 0.63683021068573, - "learning_rate": 1.1028778106083402e-05, - "loss": 1.0352, + "epoch": 0.5795205875694847, + "grad_norm": 0.6611707210540771, + "learning_rate": 7.790695252301938e-06, + "loss": 0.7894, + "step": 28800 + }, + { + "epoch": 0.5795205875694847, + "eval_loss": 0.5343945026397705, + "eval_runtime": 11.4492, + "eval_samples_per_second": 33.015, + "eval_steps_per_second": 1.135, "step": 28800 }, { - "epoch": 0.4795387136249559, - "grad_norm": 0.5792187452316284, - "learning_rate": 1.0975853702208705e-05, - "loss": 1.0359, + "epoch": 0.5815328118318787, + "grad_norm": 0.5788918137550354, + "learning_rate": 7.727830486047288e-06, + "loss": 0.7868, "step": 28900 }, { - "epoch": 0.4811980171323087, - "grad_norm": 0.5817322731018066, - "learning_rate": 1.0922901687352605e-05, - "loss": 1.0172, + "epoch": 0.5835450360942727, + "grad_norm": 0.5480091571807861, + "learning_rate": 7.665060262392461e-06, + "loss": 0.7858, "step": 29000 }, { - "epoch": 0.4811980171323087, - "eval_loss": 0.5474241375923157, - "eval_runtime": 16.9968, - "eval_samples_per_second": 22.239, - "eval_steps_per_second": 0.765, - "step": 29000 + "epoch": 0.5855572603566668, + "grad_norm": 0.730056881904602, + "learning_rate": 7.602387193140887e-06, + "loss": 0.7884, + "step": 29100 }, { - "epoch": 0.4828573206396615, - "grad_norm": 0.5945357084274292, - "learning_rate": 1.0869923559748998e-05, - "loss": 1.0353, + "epoch": 0.5855572603566668, + "eval_loss": 0.5339014530181885, + "eval_runtime": 11.3802, + "eval_samples_per_second": 33.216, + "eval_steps_per_second": 1.142, "step": 29100 }, { - "epoch": 0.4845166241470143, - "grad_norm": 0.6166268587112427, - "learning_rate": 1.0816920818370626e-05, - "loss": 1.0303, + "epoch": 0.5875694846190608, + "grad_norm": 0.5774337649345398, + "learning_rate": 7.539813886053502e-06, + "loss": 0.7893, "step": 29200 }, { - "epoch": 0.48617592765436707, - "grad_norm": 0.6337193846702576, - "learning_rate": 1.0763894962886657e-05, - "loss": 1.027, + "epoch": 0.5895817088814549, + "grad_norm": 0.615470290184021, + "learning_rate": 7.477342944740249e-06, + "loss": 0.7817, "step": 29300 }, { - "epoch": 0.4878352311617199, - "grad_norm": 0.6805664300918579, - "learning_rate": 1.071084749362024e-05, - "loss": 1.0252, + "epoch": 0.5915939331438489, + "grad_norm": 0.6776989698410034, + "learning_rate": 7.414976968551735e-06, + "loss": 0.7783, "step": 29400 }, { - "epoch": 0.48949453466907267, - "grad_norm": 0.6085631251335144, - "learning_rate": 1.0657779911506089e-05, - "loss": 1.0231, - "step": 29500 + "epoch": 0.5915939331438489, + "eval_loss": 0.533939003944397, + "eval_runtime": 11.3711, + "eval_samples_per_second": 33.242, + "eval_steps_per_second": 1.143, + "step": 29400 }, { - "epoch": 0.48949453466907267, - "eval_loss": 0.5449956059455872, - "eval_runtime": 16.6183, - "eval_samples_per_second": 22.746, - "eval_steps_per_second": 0.782, + "epoch": 0.5936061574062429, + "grad_norm": 0.5885875821113586, + "learning_rate": 7.352718552471077e-06, + "loss": 0.784, "step": 29500 }, { - "epoch": 0.49115383817642544, - "grad_norm": 0.6720906496047974, - "learning_rate": 1.060469371804798e-05, - "loss": 1.0342, + "epoch": 0.595618381668637, + "grad_norm": 0.5772850513458252, + "learning_rate": 7.290570287005931e-06, + "loss": 0.7819, "step": 29600 }, { - "epoch": 0.4928131416837782, - "grad_norm": 0.5992006063461304, - "learning_rate": 1.0551590415276285e-05, - "loss": 1.0359, + "epoch": 0.597630605931031, + "grad_norm": 0.6122897863388062, + "learning_rate": 7.228534758080694e-06, + "loss": 0.7891, "step": 29700 }, { - "epoch": 0.49447244519113104, - "grad_norm": 0.5916235446929932, - "learning_rate": 1.0498471505705475e-05, - "loss": 1.015, + "epoch": 0.597630605931031, + "eval_loss": 0.5327485799789429, + "eval_runtime": 11.3326, + "eval_samples_per_second": 33.355, + "eval_steps_per_second": 1.147, + "step": 29700 + }, + { + "epoch": 0.599642830193425, + "grad_norm": 0.6210538148880005, + "learning_rate": 7.1666145469289226e-06, + "loss": 0.7832, "step": 29800 }, { - "epoch": 0.4961317486984838, - "grad_norm": 0.5814716219902039, - "learning_rate": 1.0445338492291595e-05, - "loss": 1.0234, + "epoch": 0.6016550544558191, + "grad_norm": 0.593087911605835, + "learning_rate": 7.1048122299859145e-06, + "loss": 0.7888, "step": 29900 }, { - "epoch": 0.4977910522058366, - "grad_norm": 0.5856944918632507, - "learning_rate": 1.0392192878389748e-05, - "loss": 1.0352, + "epoch": 0.6036672787182131, + "grad_norm": 0.5805263519287109, + "learning_rate": 7.043130378781516e-06, + "loss": 0.7825, "step": 30000 }, { - "epoch": 0.4977910522058366, - "eval_loss": 0.5452983975410461, - "eval_runtime": 12.5987, - "eval_samples_per_second": 30.003, - "eval_steps_per_second": 1.032, + "epoch": 0.6036672787182131, + "eval_loss": 0.5322030782699585, + "eval_runtime": 11.3763, + "eval_samples_per_second": 33.227, + "eval_steps_per_second": 1.143, "step": 30000 }, { - "epoch": 0.4994503557131894, - "grad_norm": 0.9082821607589722, - "learning_rate": 1.0339036167711567e-05, - "loss": 1.0222, + "epoch": 0.6056795029806072, + "grad_norm": 0.5463854074478149, + "learning_rate": 6.981571559833122e-06, + "loss": 0.7881, "step": 30100 }, { - "epoch": 0.5011096592205422, - "grad_norm": 0.6375318765640259, - "learning_rate": 1.0285869864282646e-05, - "loss": 1.0268, + "epoch": 0.6076917272430012, + "grad_norm": 0.5730445384979248, + "learning_rate": 6.920138334538878e-06, + "loss": 0.7858, "step": 30200 }, { - "epoch": 0.502768962727895, - "grad_norm": 0.6737388968467712, - "learning_rate": 1.0232695472400008e-05, - "loss": 1.0242, + "epoch": 0.6097039515053952, + "grad_norm": 0.5871597528457642, + "learning_rate": 6.858833259071108e-06, + "loss": 0.7777, + "step": 30300 + }, + { + "epoch": 0.6097039515053952, + "eval_loss": 0.5328507423400879, + "eval_runtime": 11.3806, + "eval_samples_per_second": 33.215, + "eval_steps_per_second": 1.142, "step": 30300 }, { - "epoch": 0.5044282662352477, - "grad_norm": 0.5903255939483643, - "learning_rate": 1.0179514496589525e-05, - "loss": 1.0126, + "epoch": 0.6117161757677894, + "grad_norm": 0.6252338290214539, + "learning_rate": 6.797658884269962e-06, + "loss": 0.778, "step": 30400 }, { - "epoch": 0.5060875697426005, - "grad_norm": 0.6914079189300537, - "learning_rate": 1.012632844156336e-05, - "loss": 1.0214, + "epoch": 0.6137284000301834, + "grad_norm": 0.588524580001831, + "learning_rate": 6.736617755537267e-06, + "loss": 0.7772, "step": 30500 }, { - "epoch": 0.5060875697426005, - "eval_loss": 0.5441185832023621, - "eval_runtime": 13.033, - "eval_samples_per_second": 29.003, - "eval_steps_per_second": 0.997, - "step": 30500 + "epoch": 0.6157406242925774, + "grad_norm": 0.621525228023529, + "learning_rate": 6.675712412730625e-06, + "loss": 0.7832, + "step": 30600 }, { - "epoch": 0.5077468732499534, - "grad_norm": 0.7019312381744385, - "learning_rate": 1.007313881217739e-05, - "loss": 1.0313, + "epoch": 0.6157406242925774, + "eval_loss": 0.5325730443000793, + "eval_runtime": 11.3314, + "eval_samples_per_second": 33.359, + "eval_steps_per_second": 1.147, "step": 30600 }, { - "epoch": 0.5094061767573062, - "grad_norm": 0.6039997339248657, - "learning_rate": 1.0019947113388622e-05, - "loss": 1.0274, + "epoch": 0.6177528485549715, + "grad_norm": 0.5612871646881104, + "learning_rate": 6.614945390057723e-06, + "loss": 0.7831, "step": 30700 }, { - "epoch": 0.5110654802646589, - "grad_norm": 0.6162710785865784, - "learning_rate": 9.966754850212612e-06, - "loss": 1.0249, + "epoch": 0.6197650728173655, + "grad_norm": 0.5247837901115417, + "learning_rate": 6.554319215970895e-06, + "loss": 0.7828, "step": 30800 }, { - "epoch": 0.5127247837720117, - "grad_norm": 0.5899170637130737, - "learning_rate": 9.913563527680896e-06, - "loss": 1.0265, + "epoch": 0.6217772970797596, + "grad_norm": 0.5758721232414246, + "learning_rate": 6.493836413061907e-06, + "loss": 0.781, "step": 30900 }, { - "epoch": 0.5143840872793645, - "grad_norm": 0.6007364988327026, - "learning_rate": 9.86037465079838e-06, - "loss": 1.026, - "step": 31000 + "epoch": 0.6217772970797596, + "eval_loss": 0.5314515829086304, + "eval_runtime": 11.3823, + "eval_samples_per_second": 33.21, + "eval_steps_per_second": 1.142, + "step": 30900 }, { - "epoch": 0.5143840872793645, - "eval_loss": 0.5434030294418335, - "eval_runtime": 14.6635, - "eval_samples_per_second": 25.778, - "eval_steps_per_second": 0.887, + "epoch": 0.6237895213421536, + "grad_norm": 0.7134236693382263, + "learning_rate": 6.433499497957006e-06, + "loss": 0.7852, "step": 31000 }, { - "epoch": 0.5160433907867172, - "grad_norm": 0.6537649035453796, - "learning_rate": 9.807189724500785e-06, - "loss": 1.0235, + "epoch": 0.6258017456045476, + "grad_norm": 0.5432785153388977, + "learning_rate": 6.373310981212197e-06, + "loss": 0.7776, "step": 31100 }, { - "epoch": 0.51770269429407, - "grad_norm": 0.6707360148429871, - "learning_rate": 9.754010253612045e-06, - "loss": 1.0207, + "epoch": 0.6278139698669417, + "grad_norm": 0.6110942959785461, + "learning_rate": 6.3132733672087875e-06, + "loss": 0.787, "step": 31200 }, { - "epoch": 0.5193619978014229, - "grad_norm": 0.6710958480834961, - "learning_rate": 9.700837742801752e-06, - "loss": 1.0174, + "epoch": 0.6278139698669417, + "eval_loss": 0.5303037166595459, + "eval_runtime": 11.4219, + "eval_samples_per_second": 33.094, + "eval_steps_per_second": 1.138, + "step": 31200 + }, + { + "epoch": 0.6298261941293357, + "grad_norm": 0.5783369541168213, + "learning_rate": 6.253389154049177e-06, + "loss": 0.7807, "step": 31300 }, { - "epoch": 0.5210213013087757, - "grad_norm": 0.6296388506889343, - "learning_rate": 9.647673696542545e-06, - "loss": 1.022, + "epoch": 0.6318384183917297, + "grad_norm": 0.5356603860855103, + "learning_rate": 6.19366083345291e-06, + "loss": 0.7801, "step": 31400 }, { - "epoch": 0.5226806048161284, - "grad_norm": 0.5719345211982727, - "learning_rate": 9.594519619067583e-06, - "loss": 1.023, + "epoch": 0.6338506426541238, + "grad_norm": 0.5529428124427795, + "learning_rate": 6.134090890653015e-06, + "loss": 0.7774, "step": 31500 }, { - "epoch": 0.5226806048161284, - "eval_loss": 0.5418269038200378, - "eval_runtime": 12.7625, - "eval_samples_per_second": 29.618, - "eval_steps_per_second": 1.019, + "epoch": 0.6338506426541238, + "eval_loss": 0.5301904678344727, + "eval_runtime": 11.4476, + "eval_samples_per_second": 33.02, + "eval_steps_per_second": 1.136, "step": 31500 }, { - "epoch": 0.5243399083234812, - "grad_norm": 0.6241312026977539, - "learning_rate": 9.541377014327967e-06, - "loss": 1.0293, + "epoch": 0.6358628669165178, + "grad_norm": 0.5553627610206604, + "learning_rate": 6.074681804292581e-06, + "loss": 0.7791, "step": 31600 }, { - "epoch": 0.525999211830834, - "grad_norm": 0.568593442440033, - "learning_rate": 9.488247385950173e-06, - "loss": 1.0219, + "epoch": 0.6378750911789118, + "grad_norm": 0.5281953811645508, + "learning_rate": 6.0154360463216325e-06, + "loss": 0.7769, "step": 31700 }, { - "epoch": 0.5276585153381868, - "grad_norm": 0.6001921892166138, - "learning_rate": 9.435132237193531e-06, - "loss": 1.0229, + "epoch": 0.639887315441306, + "grad_norm": 0.6406475305557251, + "learning_rate": 5.956356081894259e-06, + "loss": 0.7799, + "step": 31800 + }, + { + "epoch": 0.639887315441306, + "eval_loss": 0.5294053554534912, + "eval_runtime": 11.3422, + "eval_samples_per_second": 33.327, + "eval_steps_per_second": 1.146, "step": 31800 }, { - "epoch": 0.5293178188455396, - "grad_norm": 0.589522659778595, - "learning_rate": 9.382033070907687e-06, - "loss": 1.0177, + "epoch": 0.6418995397037, + "grad_norm": 0.49855828285217285, + "learning_rate": 5.897444369266066e-06, + "loss": 0.7759, "step": 31900 }, { - "epoch": 0.5309771223528924, - "grad_norm": 0.5934117436408997, - "learning_rate": 9.328951389490063e-06, - "loss": 1.0196, + "epoch": 0.6439117639660941, + "grad_norm": 0.5699638724327087, + "learning_rate": 5.838703359691873e-06, + "loss": 0.7673, "step": 32000 }, { - "epoch": 0.5309771223528924, - "eval_loss": 0.5427911877632141, - "eval_runtime": 11.7254, - "eval_samples_per_second": 32.238, - "eval_steps_per_second": 1.109, - "step": 32000 + "epoch": 0.6459239882284881, + "grad_norm": 0.5306676030158997, + "learning_rate": 5.780135497323724e-06, + "loss": 0.7799, + "step": 32100 }, { - "epoch": 0.5326364258602452, - "grad_norm": 0.5999579429626465, - "learning_rate": 9.275888694843367e-06, - "loss": 1.0043, + "epoch": 0.6459239882284881, + "eval_loss": 0.5290261507034302, + "eval_runtime": 11.3435, + "eval_samples_per_second": 33.323, + "eval_steps_per_second": 1.146, "step": 32100 }, { - "epoch": 0.534295729367598, - "grad_norm": 0.6093688011169434, - "learning_rate": 9.2228464883331e-06, - "loss": 1.0142, + "epoch": 0.6479362124908821, + "grad_norm": 0.5989037752151489, + "learning_rate": 5.721743219109187e-06, + "loss": 0.7757, "step": 32200 }, { - "epoch": 0.5359550328749507, - "grad_norm": 0.6463187336921692, - "learning_rate": 9.169826270745052e-06, - "loss": 1.0105, + "epoch": 0.6499484367532762, + "grad_norm": 0.5595914721488953, + "learning_rate": 5.663528954689958e-06, + "loss": 0.7761, "step": 32300 }, { - "epoch": 0.5376143363823035, - "grad_norm": 0.6449178457260132, - "learning_rate": 9.116829542242868e-06, - "loss": 1.0266, + "epoch": 0.6519606610156702, + "grad_norm": 0.5618345737457275, + "learning_rate": 5.605495126300766e-06, + "loss": 0.779, "step": 32400 }, { - "epoch": 0.5392736398896563, - "grad_norm": 0.603361189365387, - "learning_rate": 9.063857802325581e-06, - "loss": 1.019, - "step": 32500 + "epoch": 0.6519606610156702, + "eval_loss": 0.529247522354126, + "eval_runtime": 11.3716, + "eval_samples_per_second": 33.241, + "eval_steps_per_second": 1.143, + "step": 32400 }, { - "epoch": 0.5392736398896563, - "eval_loss": 0.5403046607971191, - "eval_runtime": 14.8563, - "eval_samples_per_second": 25.444, - "eval_steps_per_second": 0.875, + "epoch": 0.6539728852780642, + "grad_norm": 0.5271475315093994, + "learning_rate": 5.547644148668585e-06, + "loss": 0.7747, "step": 32500 }, { - "epoch": 0.5409329433970091, - "grad_norm": 0.6394901275634766, - "learning_rate": 9.010912549785193e-06, - "loss": 1.0153, + "epoch": 0.6559851095404583, + "grad_norm": 0.5703973770141602, + "learning_rate": 5.489978428912157e-06, + "loss": 0.7801, "step": 32600 }, { - "epoch": 0.5425922469043619, - "grad_norm": 0.5955016016960144, - "learning_rate": 8.957995282664272e-06, - "loss": 1.0251, + "epoch": 0.6579973338028523, + "grad_norm": 0.570797860622406, + "learning_rate": 5.432500366441843e-06, + "loss": 0.7756, + "step": 32700 + }, + { + "epoch": 0.6579973338028523, + "eval_loss": 0.5275307893753052, + "eval_runtime": 11.3412, + "eval_samples_per_second": 33.33, + "eval_steps_per_second": 1.146, "step": 32700 }, { - "epoch": 0.5442515504117147, - "grad_norm": 0.6822714805603027, - "learning_rate": 8.905107498213563e-06, - "loss": 1.0244, + "epoch": 0.6600095580652464, + "grad_norm": 0.564414918422699, + "learning_rate": 5.3752123528597746e-06, + "loss": 0.7688, "step": 32800 }, { - "epoch": 0.5459108539190675, - "grad_norm": 0.7377395629882812, - "learning_rate": 8.852250692849608e-06, - "loss": 1.0169, + "epoch": 0.6620217823276404, + "grad_norm": 0.5405446290969849, + "learning_rate": 5.318116771860351e-06, + "loss": 0.7777, "step": 32900 }, { - "epoch": 0.5475701574264202, - "grad_norm": 0.6069468855857849, - "learning_rate": 8.799426362112438e-06, - "loss": 1.0162, + "epoch": 0.6640340065900344, + "grad_norm": 0.5645068883895874, + "learning_rate": 5.261215999131055e-06, + "loss": 0.7723, "step": 33000 }, { - "epoch": 0.5475701574264202, - "eval_loss": 0.5396648645401001, - "eval_runtime": 12.7508, - "eval_samples_per_second": 29.645, - "eval_steps_per_second": 1.02, + "epoch": 0.6640340065900344, + "eval_loss": 0.5280060172080994, + "eval_runtime": 11.3103, + "eval_samples_per_second": 33.421, + "eval_steps_per_second": 1.149, "step": 33000 }, { - "epoch": 0.549229460933773, - "grad_norm": 0.6045710444450378, - "learning_rate": 8.74663600062324e-06, - "loss": 1.0264, + "epoch": 0.6660462308524285, + "grad_norm": 0.5821409225463867, + "learning_rate": 5.204512402253592e-06, + "loss": 0.7857, "step": 33100 }, { - "epoch": 0.5508887644411259, - "grad_norm": 0.6170222759246826, - "learning_rate": 8.693881102042052e-06, - "loss": 1.0293, + "epoch": 0.6680584551148225, + "grad_norm": 0.5534176230430603, + "learning_rate": 5.148008340605393e-06, + "loss": 0.7726, "step": 33200 }, { - "epoch": 0.5525480679484787, - "grad_norm": 0.6849231123924255, - "learning_rate": 8.641163159025537e-06, - "loss": 1.0159, + "epoch": 0.6700706793772165, + "grad_norm": 0.5734113454818726, + "learning_rate": 5.091706165261438e-06, + "loss": 0.7806, + "step": 33300 + }, + { + "epoch": 0.6700706793772165, + "eval_loss": 0.527226984500885, + "eval_runtime": 11.3532, + "eval_samples_per_second": 33.295, + "eval_steps_per_second": 1.145, "step": 33300 }, { - "epoch": 0.5542073714558314, - "grad_norm": 0.5818921327590942, - "learning_rate": 8.588483663184725e-06, - "loss": 1.0131, + "epoch": 0.6720829036396107, + "grad_norm": 0.5118337273597717, + "learning_rate": 5.035608218896424e-06, + "loss": 0.7794, "step": 33400 }, { - "epoch": 0.5558666749631842, - "grad_norm": 0.6297749876976013, - "learning_rate": 8.535844105042806e-06, - "loss": 1.0096, + "epoch": 0.6740951279020047, + "grad_norm": 0.520524799823761, + "learning_rate": 4.979716835687296e-06, + "loss": 0.7833, "step": 33500 }, { - "epoch": 0.5558666749631842, - "eval_loss": 0.5385372638702393, - "eval_runtime": 12.8234, - "eval_samples_per_second": 29.477, - "eval_steps_per_second": 1.014, - "step": 33500 + "epoch": 0.6761073521643988, + "grad_norm": 0.5260956883430481, + "learning_rate": 4.924034341216123e-06, + "loss": 0.7722, + "step": 33600 }, { - "epoch": 0.557525978470537, - "grad_norm": 0.592827558517456, - "learning_rate": 8.483245973992978e-06, - "loss": 1.0294, + "epoch": 0.6761073521643988, + "eval_loss": 0.5266076326370239, + "eval_runtime": 11.3351, + "eval_samples_per_second": 33.348, + "eval_steps_per_second": 1.147, "step": 33600 }, { - "epoch": 0.5591852819778897, - "grad_norm": 0.6097353100776672, - "learning_rate": 8.430690758256286e-06, - "loss": 1.0246, + "epoch": 0.6781195764267928, + "grad_norm": 0.5933238863945007, + "learning_rate": 4.868563052373329e-06, + "loss": 0.778, "step": 33700 }, { - "epoch": 0.5608445854852425, - "grad_norm": 0.6435708999633789, - "learning_rate": 8.378179944839526e-06, - "loss": 1.0284, + "epoch": 0.6801318006891868, + "grad_norm": 0.5882487297058105, + "learning_rate": 4.813305277261294e-06, + "loss": 0.778, "step": 33800 }, { - "epoch": 0.5625038889925954, - "grad_norm": 0.6222900152206421, - "learning_rate": 8.325715019493159e-06, - "loss": 1.0128, + "epoch": 0.6821440249515809, + "grad_norm": 0.5495398640632629, + "learning_rate": 4.758263315098319e-06, + "loss": 0.7749, "step": 33900 }, { - "epoch": 0.5641631924999482, - "grad_norm": 0.6095114946365356, - "learning_rate": 8.27329746666929e-06, - "loss": 1.0269, - "step": 34000 + "epoch": 0.6821440249515809, + "eval_loss": 0.527021050453186, + "eval_runtime": 11.3019, + "eval_samples_per_second": 33.446, + "eval_steps_per_second": 1.15, + "step": 33900 }, { - "epoch": 0.5641631924999482, - "eval_loss": 0.5385136008262634, - "eval_runtime": 15.4981, - "eval_samples_per_second": 24.39, - "eval_steps_per_second": 0.839, + "epoch": 0.6841562492139749, + "grad_norm": 0.5372888445854187, + "learning_rate": 4.703439456122942e-06, + "loss": 0.7726, "step": 34000 }, { - "epoch": 0.565822496007301, - "grad_norm": 0.6315720081329346, - "learning_rate": 8.220928769479648e-06, - "loss": 1.0218, + "epoch": 0.6861684734763689, + "grad_norm": 0.5453928709030151, + "learning_rate": 4.648835981498665e-06, + "loss": 0.7736, "step": 34100 }, { - "epoch": 0.5674817995146537, - "grad_norm": 0.6922375559806824, - "learning_rate": 8.168610409653642e-06, - "loss": 1.0086, + "epoch": 0.688180697738763, + "grad_norm": 0.534249484539032, + "learning_rate": 4.594455163219025e-06, + "loss": 0.7669, "step": 34200 }, { - "epoch": 0.5691411030220065, - "grad_norm": 0.5871058106422424, - "learning_rate": 8.11634386749642e-06, - "loss": 1.027, + "epoch": 0.688180697738763, + "eval_loss": 0.5258325934410095, + "eval_runtime": 11.3315, + "eval_samples_per_second": 33.358, + "eval_steps_per_second": 1.147, + "step": 34200 + }, + { + "epoch": 0.690192922001157, + "grad_norm": 0.602557897567749, + "learning_rate": 4.5402992640130615e-06, + "loss": 0.7776, "step": 34300 }, { - "epoch": 0.5708004065293593, - "grad_norm": 0.6269960403442383, - "learning_rate": 8.064130621846988e-06, - "loss": 1.0058, + "epoch": 0.6922051462635511, + "grad_norm": 0.6340908408164978, + "learning_rate": 4.486370537251166e-06, + "loss": 0.7724, "step": 34400 }, { - "epoch": 0.5724597100367121, - "grad_norm": 0.6070001125335693, - "learning_rate": 8.01197215003638e-06, - "loss": 1.012, + "epoch": 0.6942173705259451, + "grad_norm": 0.5442144870758057, + "learning_rate": 4.43267122685132e-06, + "loss": 0.7678, "step": 34500 }, { - "epoch": 0.5724597100367121, - "eval_loss": 0.5384827256202698, - "eval_runtime": 12.7257, - "eval_samples_per_second": 29.704, - "eval_steps_per_second": 1.022, + "epoch": 0.6942173705259451, + "eval_loss": 0.52588951587677, + "eval_runtime": 11.3113, + "eval_samples_per_second": 33.418, + "eval_steps_per_second": 1.149, "step": 34500 }, { - "epoch": 0.5741190135440649, - "grad_norm": 0.5841761827468872, - "learning_rate": 7.959869927845847e-06, - "loss": 1.0111, + "epoch": 0.6962295947883391, + "grad_norm": 0.5438702702522278, + "learning_rate": 4.379203567185733e-06, + "loss": 0.7722, "step": 34600 }, { - "epoch": 0.5757783170514177, - "grad_norm": 0.6075329780578613, - "learning_rate": 7.907825429465092e-06, - "loss": 1.021, + "epoch": 0.6982418190507332, + "grad_norm": 0.575579822063446, + "learning_rate": 4.325969782987868e-06, + "loss": 0.7806, "step": 34700 }, { - "epoch": 0.5774376205587705, - "grad_norm": 0.62599778175354, - "learning_rate": 7.855840127450587e-06, - "loss": 1.0187, + "epoch": 0.7002540433131272, + "grad_norm": 0.53037029504776, + "learning_rate": 4.2729720892598725e-06, + "loss": 0.7677, "step": 34800 }, { - "epoch": 0.5790969240661232, - "grad_norm": 0.6155137419700623, - "learning_rate": 7.803915492683879e-06, - "loss": 1.0237, + "epoch": 0.7002540433131272, + "eval_loss": 0.5252464413642883, + "eval_runtime": 11.2976, + "eval_samples_per_second": 33.458, + "eval_steps_per_second": 1.151, + "step": 34800 + }, + { + "epoch": 0.7022662675755212, + "grad_norm": 0.5570893883705139, + "learning_rate": 4.220212691180422e-06, + "loss": 0.7674, "step": 34900 }, { - "epoch": 0.580756227573476, - "grad_norm": 0.5985190868377686, - "learning_rate": 7.752052994329995e-06, - "loss": 1.0079, + "epoch": 0.7042784918379154, + "grad_norm": 0.564457893371582, + "learning_rate": 4.167693784012948e-06, + "loss": 0.7774, "step": 35000 }, { - "epoch": 0.580756227573476, - "eval_loss": 0.5377073884010315, - "eval_runtime": 12.8866, - "eval_samples_per_second": 29.333, - "eval_steps_per_second": 1.009, - "step": 35000 + "epoch": 0.7062907161003094, + "grad_norm": 0.6193362474441528, + "learning_rate": 4.115417553014317e-06, + "loss": 0.7739, + "step": 35100 }, { - "epoch": 0.5824155310808288, - "grad_norm": 0.5882691740989685, - "learning_rate": 7.700254099795847e-06, - "loss": 1.0207, + "epoch": 0.7062907161003094, + "eval_loss": 0.5251539349555969, + "eval_runtime": 11.3037, + "eval_samples_per_second": 33.44, + "eval_steps_per_second": 1.15, "step": 35100 }, { - "epoch": 0.5840748345881817, - "grad_norm": 0.5812239646911621, - "learning_rate": 7.648520274688747e-06, - "loss": 1.0179, + "epoch": 0.7083029403627035, + "grad_norm": 0.5650792121887207, + "learning_rate": 4.063386173343888e-06, + "loss": 0.775, "step": 35200 }, { - "epoch": 0.5857341380955344, - "grad_norm": 0.5958197116851807, - "learning_rate": 7.596852982774912e-06, - "loss": 1.0199, + "epoch": 0.7103151646250975, + "grad_norm": 0.5598296523094177, + "learning_rate": 4.0116018099730155e-06, + "loss": 0.7736, "step": 35300 }, { - "epoch": 0.5873934416028872, - "grad_norm": 0.6720435619354248, - "learning_rate": 7.545253685938054e-06, - "loss": 1.0044, + "epoch": 0.7123273888874915, + "grad_norm": 0.5999264717102051, + "learning_rate": 3.960066617594962e-06, + "loss": 0.7728, "step": 35400 }, { - "epoch": 0.58905274511024, - "grad_norm": 0.6013358235359192, - "learning_rate": 7.493723844138025e-06, - "loss": 1.0078, - "step": 35500 + "epoch": 0.7123273888874915, + "eval_loss": 0.5251903533935547, + "eval_runtime": 11.3608, + "eval_samples_per_second": 33.272, + "eval_steps_per_second": 1.144, + "step": 35400 }, { - "epoch": 0.58905274511024, - "eval_loss": 0.5365841388702393, - "eval_runtime": 14.7976, - "eval_samples_per_second": 25.545, - "eval_steps_per_second": 0.879, + "epoch": 0.7143396131498856, + "grad_norm": 0.5485169291496277, + "learning_rate": 3.908782740535244e-06, + "loss": 0.7663, "step": 35500 }, { - "epoch": 0.5907120486175927, - "grad_norm": 0.5762251615524292, - "learning_rate": 7.442264915369506e-06, - "loss": 1.0117, + "epoch": 0.7163518374122796, + "grad_norm": 0.5973437428474426, + "learning_rate": 3.857752312662413e-06, + "loss": 0.7731, "step": 35600 }, { - "epoch": 0.5923713521249455, - "grad_norm": 0.6333112716674805, - "learning_rate": 7.390878355620747e-06, - "loss": 1.0166, + "epoch": 0.7183640616746736, + "grad_norm": 0.559617280960083, + "learning_rate": 3.8069774572992614e-06, + "loss": 0.7623, + "step": 35700 + }, + { + "epoch": 0.7183640616746736, + "eval_loss": 0.5247710347175598, + "eval_runtime": 11.3529, + "eval_samples_per_second": 33.296, + "eval_steps_per_second": 1.145, "step": 35700 }, { - "epoch": 0.5940306556322983, - "grad_norm": 0.5892146229743958, - "learning_rate": 7.339565618832371e-06, - "loss": 1.0057, + "epoch": 0.7203762859370677, + "grad_norm": 0.5565606355667114, + "learning_rate": 3.756460287134479e-06, + "loss": 0.7773, "step": 35800 }, { - "epoch": 0.5956899591396512, - "grad_norm": 0.7321692109107971, - "learning_rate": 7.288328156856255e-06, - "loss": 1.0159, + "epoch": 0.7223885101994617, + "grad_norm": 0.5371571779251099, + "learning_rate": 3.706202904134747e-06, + "loss": 0.7761, "step": 35900 }, { - "epoch": 0.5973492626470039, - "grad_norm": 0.5949785113334656, - "learning_rate": 7.2371674194144215e-06, - "loss": 1.0148, + "epoch": 0.7244007344618558, + "grad_norm": 0.5425861477851868, + "learning_rate": 3.6562073994572624e-06, + "loss": 0.7775, "step": 36000 }, { - "epoch": 0.5973492626470039, - "eval_loss": 0.5353676080703735, - "eval_runtime": 11.8717, - "eval_samples_per_second": 31.84, - "eval_steps_per_second": 1.095, + "epoch": 0.7244007344618558, + "eval_loss": 0.5243012309074402, + "eval_runtime": 11.3858, + "eval_samples_per_second": 33.199, + "eval_steps_per_second": 1.142, "step": 36000 }, { - "epoch": 0.5990085661543567, - "grad_norm": 0.6124995946884155, - "learning_rate": 7.186084854058046e-06, - "loss": 1.0121, + "epoch": 0.7264129587242498, + "grad_norm": 0.5546737909317017, + "learning_rate": 3.6064758533627496e-06, + "loss": 0.7712, "step": 36100 }, { - "epoch": 0.6006678696617095, - "grad_norm": 0.6093805432319641, - "learning_rate": 7.135081906126491e-06, - "loss": 1.0246, + "epoch": 0.7284251829866438, + "grad_norm": 0.6678885221481323, + "learning_rate": 3.55701033512889e-06, + "loss": 0.769, "step": 36200 }, { - "epoch": 0.6023271731690623, - "grad_norm": 0.6078169345855713, - "learning_rate": 7.084160018706389e-06, - "loss": 1.0142, + "epoch": 0.7304374072490379, + "grad_norm": 0.5747791528701782, + "learning_rate": 3.5078129029642192e-06, + "loss": 0.7671, "step": 36300 }, { - "epoch": 0.603986476676415, - "grad_norm": 0.6098805069923401, - "learning_rate": 7.0333206325908575e-06, - "loss": 1.0201, + "epoch": 0.7304374072490379, + "eval_loss": 0.523876428604126, + "eval_runtime": 11.3643, + "eval_samples_per_second": 33.262, + "eval_steps_per_second": 1.144, + "step": 36300 + }, + { + "epoch": 0.7324496315114319, + "grad_norm": 0.6479108333587646, + "learning_rate": 3.458885603922498e-06, + "loss": 0.7678, "step": 36400 }, { - "epoch": 0.6056457801837679, - "grad_norm": 0.5873403549194336, - "learning_rate": 6.982565186238696e-06, - "loss": 1.0179, + "epoch": 0.734461855773826, + "grad_norm": 0.5260623693466187, + "learning_rate": 3.4102304738175264e-06, + "loss": 0.7686, "step": 36500 }, { - "epoch": 0.6056457801837679, - "eval_loss": 0.5345824956893921, - "eval_runtime": 13.4925, - "eval_samples_per_second": 28.016, - "eval_steps_per_second": 0.964, - "step": 36500 + "epoch": 0.7364740800362201, + "grad_norm": 0.5565561056137085, + "learning_rate": 3.3618495371384384e-06, + "loss": 0.7722, + "step": 36600 }, { - "epoch": 0.6073050836911207, - "grad_norm": 0.6021679639816284, - "learning_rate": 6.931895115733687e-06, - "loss": 1.0004, + "epoch": 0.7364740800362201, + "eval_loss": 0.5241602659225464, + "eval_runtime": 11.2637, + "eval_samples_per_second": 33.559, + "eval_steps_per_second": 1.154, "step": 36600 }, { - "epoch": 0.6089643871984735, - "grad_norm": 0.6355155110359192, - "learning_rate": 6.88131185474399e-06, - "loss": 1.0134, + "epoch": 0.7384863042986141, + "grad_norm": 0.5522435307502747, + "learning_rate": 3.3137448069654687e-06, + "loss": 0.7753, "step": 36700 }, { - "epoch": 0.6106236907058262, - "grad_norm": 0.5957785844802856, - "learning_rate": 6.8308168344815575e-06, - "loss": 1.0117, + "epoch": 0.7404985285610082, + "grad_norm": 0.5111953020095825, + "learning_rate": 3.265918284886186e-06, + "loss": 0.7739, "step": 36800 }, { - "epoch": 0.612282994213179, - "grad_norm": 0.5896579027175903, - "learning_rate": 6.780411483661627e-06, - "loss": 1.0162, + "epoch": 0.7425107528234022, + "grad_norm": 0.5280485153198242, + "learning_rate": 3.2183719609122146e-06, + "loss": 0.7626, "step": 36900 }, { - "epoch": 0.6139422977205318, - "grad_norm": 0.5432151556015015, - "learning_rate": 6.730097228462333e-06, - "loss": 1.0077, - "step": 37000 + "epoch": 0.7425107528234022, + "eval_loss": 0.5227437615394592, + "eval_runtime": 11.3194, + "eval_samples_per_second": 33.394, + "eval_steps_per_second": 1.148, + "step": 36900 }, { - "epoch": 0.6139422977205318, - "eval_loss": 0.5348573923110962, - "eval_runtime": 15.1628, - "eval_samples_per_second": 24.929, - "eval_steps_per_second": 0.857, + "epoch": 0.7445229770857962, + "grad_norm": 0.5183678865432739, + "learning_rate": 3.171107813396418e-06, + "loss": 0.7745, "step": 37000 }, { - "epoch": 0.6156016012278845, - "grad_norm": 0.6325159668922424, - "learning_rate": 6.6798754924843265e-06, - "loss": 0.9919, + "epoch": 0.7465352013481903, + "grad_norm": 0.5712314248085022, + "learning_rate": 3.124127808950602e-06, + "loss": 0.7711, "step": 37100 }, { - "epoch": 0.6172609047352374, - "grad_norm": 0.6163633465766907, - "learning_rate": 6.629747696710496e-06, - "loss": 0.9935, + "epoch": 0.7485474256105843, + "grad_norm": 0.5488412380218506, + "learning_rate": 3.0774339023636756e-06, + "loss": 0.7689, "step": 37200 }, { - "epoch": 0.6189202082425902, - "grad_norm": 0.5845269560813904, - "learning_rate": 6.579715259465777e-06, - "loss": 1.0164, + "epoch": 0.7485474256105843, + "eval_loss": 0.5230608582496643, + "eval_runtime": 11.338, + "eval_samples_per_second": 33.339, + "eval_steps_per_second": 1.147, + "step": 37200 + }, + { + "epoch": 0.7505596498729783, + "grad_norm": 0.5331023335456848, + "learning_rate": 3.0310280365203102e-06, + "loss": 0.7663, "step": 37300 }, { - "epoch": 0.620579511749943, - "grad_norm": 0.5965647101402283, - "learning_rate": 6.5297795963770125e-06, - "loss": 1.0123, + "epoch": 0.7525718741353724, + "grad_norm": 0.5227448344230652, + "learning_rate": 2.9849121423201054e-06, + "loss": 0.7645, "step": 37400 }, { - "epoch": 0.6222388152572957, - "grad_norm": 0.6072404384613037, - "learning_rate": 6.479942120332897e-06, - "loss": 1.0107, + "epoch": 0.7545840983977664, + "grad_norm": 0.5383438467979431, + "learning_rate": 2.9390881385972445e-06, + "loss": 0.7624, "step": 37500 }, { - "epoch": 0.6222388152572957, - "eval_loss": 0.533125638961792, - "eval_runtime": 12.8853, - "eval_samples_per_second": 29.336, - "eval_steps_per_second": 1.009, + "epoch": 0.7545840983977664, + "eval_loss": 0.5230525732040405, + "eval_runtime": 11.3076, + "eval_samples_per_second": 33.429, + "eval_steps_per_second": 1.15, "step": 37500 }, { - "epoch": 0.6238981187646485, - "grad_norm": 0.6402060985565186, - "learning_rate": 6.430204241444005e-06, - "loss": 1.0298, + "epoch": 0.7565963226601605, + "grad_norm": 0.5267183184623718, + "learning_rate": 2.8935579320406504e-06, + "loss": 0.7744, "step": 37600 }, { - "epoch": 0.6255574222720013, - "grad_norm": 0.6250644326210022, - "learning_rate": 6.38056736700289e-06, - "loss": 1.0111, + "epoch": 0.7586085469225545, + "grad_norm": 0.5995730757713318, + "learning_rate": 2.8483234171146544e-06, + "loss": 0.77, "step": 37700 }, { - "epoch": 0.6272167257793542, - "grad_norm": 0.5802329182624817, - "learning_rate": 6.3310329014442695e-06, - "loss": 1.0106, + "epoch": 0.7606207711849485, + "grad_norm": 0.5342182517051697, + "learning_rate": 2.803386475980171e-06, + "loss": 0.772, "step": 37800 }, { - "epoch": 0.6288760292867069, - "grad_norm": 0.6037899255752563, - "learning_rate": 6.281602246305282e-06, - "loss": 1.0082, + "epoch": 0.7606207711849485, + "eval_loss": 0.5222497582435608, + "eval_runtime": 11.6813, + "eval_samples_per_second": 32.36, + "eval_steps_per_second": 1.113, + "step": 37800 + }, + { + "epoch": 0.7626329954473426, + "grad_norm": 0.5149078965187073, + "learning_rate": 2.758748978416369e-06, + "loss": 0.7675, "step": 37900 }, { - "epoch": 0.6305353327940597, - "grad_norm": 0.5910411477088928, - "learning_rate": 6.232276800185842e-06, - "loss": 1.0169, + "epoch": 0.7646452197097366, + "grad_norm": 0.5688450932502747, + "learning_rate": 2.7144127817428965e-06, + "loss": 0.7655, "step": 38000 }, { - "epoch": 0.6305353327940597, - "eval_loss": 0.5327327847480774, - "eval_runtime": 12.9447, - "eval_samples_per_second": 29.201, - "eval_steps_per_second": 1.004, - "step": 38000 + "epoch": 0.7666574439721306, + "grad_norm": 0.5706648826599121, + "learning_rate": 2.6703797307425792e-06, + "loss": 0.7645, + "step": 38100 }, { - "epoch": 0.6321946363014125, - "grad_norm": 0.6423376202583313, - "learning_rate": 6.183057958709049e-06, - "loss": 1.0079, + "epoch": 0.7666574439721306, + "eval_loss": 0.5218858122825623, + "eval_runtime": 11.6659, + "eval_samples_per_second": 32.402, + "eval_steps_per_second": 1.114, "step": 38100 }, { - "epoch": 0.6338539398087653, - "grad_norm": 0.5916347503662109, - "learning_rate": 6.133947114481722e-06, - "loss": 1.0153, + "epoch": 0.7686696682345248, + "grad_norm": 0.5271847248077393, + "learning_rate": 2.626651657584672e-06, + "loss": 0.7699, "step": 38200 }, { - "epoch": 0.635513243316118, - "grad_norm": 0.5857393145561218, - "learning_rate": 6.084945657054983e-06, - "loss": 1.0048, + "epoch": 0.7706818924969188, + "grad_norm": 0.5311073064804077, + "learning_rate": 2.5832303817486137e-06, + "loss": 0.766, "step": 38300 }, { - "epoch": 0.6371725468234708, - "grad_norm": 0.5945661664009094, - "learning_rate": 6.036054972884949e-06, - "loss": 1.0172, + "epoch": 0.7726941167593129, + "grad_norm": 0.5762016177177429, + "learning_rate": 2.540117709948332e-06, + "loss": 0.7612, "step": 38400 }, { - "epoch": 0.6388318503308237, - "grad_norm": 0.5578923225402832, - "learning_rate": 5.987276445293484e-06, - "loss": 1.0233, - "step": 38500 + "epoch": 0.7726941167593129, + "eval_loss": 0.5214508175849915, + "eval_runtime": 11.4525, + "eval_samples_per_second": 33.006, + "eval_steps_per_second": 1.135, + "step": 38400 }, { - "epoch": 0.6388318503308237, - "eval_loss": 0.5333393216133118, - "eval_runtime": 12.8475, - "eval_samples_per_second": 29.422, - "eval_steps_per_second": 1.012, + "epoch": 0.7747063410217069, + "grad_norm": 0.5659816861152649, + "learning_rate": 2.497315436057064e-06, + "loss": 0.7693, "step": 38500 }, { - "epoch": 0.6404911538381765, - "grad_norm": 0.6216848492622375, - "learning_rate": 5.938611454429086e-06, - "loss": 1.0313, + "epoch": 0.7767185652841009, + "grad_norm": 0.530085563659668, + "learning_rate": 2.4548253410327104e-06, + "loss": 0.7598, "step": 38600 }, { - "epoch": 0.6421504573455292, - "grad_norm": 0.6229336261749268, - "learning_rate": 5.890061377227827e-06, - "loss": 1.0072, + "epoch": 0.778730789546495, + "grad_norm": 0.624070405960083, + "learning_rate": 2.412649192843739e-06, + "loss": 0.7722, + "step": 38700 + }, + { + "epoch": 0.778730789546495, + "eval_loss": 0.5214821100234985, + "eval_runtime": 11.3194, + "eval_samples_per_second": 33.394, + "eval_steps_per_second": 1.148, "step": 38700 }, { - "epoch": 0.643809760852882, - "grad_norm": 0.6161938905715942, - "learning_rate": 5.841627587374375e-06, - "loss": 1.0059, + "epoch": 0.780743013808889, + "grad_norm": 0.5348799228668213, + "learning_rate": 2.3707887463956146e-06, + "loss": 0.7615, "step": 38800 }, { - "epoch": 0.6454690643602348, - "grad_norm": 0.6332730650901794, - "learning_rate": 5.793311455263158e-06, - "loss": 1.001, + "epoch": 0.782755238071283, + "grad_norm": 0.5490187406539917, + "learning_rate": 2.3292457434577854e-06, + "loss": 0.7714, "step": 38900 }, { - "epoch": 0.6471283678675875, - "grad_norm": 0.633680522441864, - "learning_rate": 5.745114347959573e-06, - "loss": 1.0078, + "epoch": 0.7847674623336771, + "grad_norm": 0.5568532943725586, + "learning_rate": 2.2880219125912064e-06, + "loss": 0.7604, "step": 39000 }, { - "epoch": 0.6471283678675875, - "eval_loss": 0.5315578579902649, - "eval_runtime": 13.5426, - "eval_samples_per_second": 27.912, - "eval_steps_per_second": 0.96, + "epoch": 0.7847674623336771, + "eval_loss": 0.5214923620223999, + "eval_runtime": 11.3214, + "eval_samples_per_second": 33.388, + "eval_steps_per_second": 1.148, "step": 39000 }, { - "epoch": 0.6487876713749404, - "grad_norm": 0.562219500541687, - "learning_rate": 5.697037629161297e-06, - "loss": 1.0205, + "epoch": 0.7867796865960711, + "grad_norm": 0.5511381030082703, + "learning_rate": 2.2471189690764093e-06, + "loss": 0.7644, "step": 39100 }, { - "epoch": 0.6504469748822932, - "grad_norm": 0.643469512462616, - "learning_rate": 5.649082659159721e-06, - "loss": 1.006, + "epoch": 0.7887919108584652, + "grad_norm": 0.5425460338592529, + "learning_rate": 2.2065386148421486e-06, + "loss": 0.7633, "step": 39200 }, { - "epoch": 0.652106278389646, - "grad_norm": 0.5786009430885315, - "learning_rate": 5.60125079480146e-06, - "loss": 1.0023, + "epoch": 0.7908041351208592, + "grad_norm": 0.4867189824581146, + "learning_rate": 2.1662825383945686e-06, + "loss": 0.7674, "step": 39300 }, { - "epoch": 0.6537655818969987, - "grad_norm": 0.5841802358627319, - "learning_rate": 5.553543389449938e-06, - "loss": 1.0138, + "epoch": 0.7908041351208592, + "eval_loss": 0.5209300518035889, + "eval_runtime": 11.3182, + "eval_samples_per_second": 33.397, + "eval_steps_per_second": 1.149, + "step": 39300 + }, + { + "epoch": 0.7928163593832532, + "grad_norm": 0.5154452919960022, + "learning_rate": 2.1263524147469573e-06, + "loss": 0.7663, "step": 39400 }, { - "epoch": 0.6554248854043515, - "grad_norm": 0.5957034826278687, - "learning_rate": 5.505961792947126e-06, - "loss": 0.9886, + "epoch": 0.7948285836456473, + "grad_norm": 0.5264437198638916, + "learning_rate": 2.0867499053500473e-06, + "loss": 0.7642, "step": 39500 }, { - "epoch": 0.6554248854043515, - "eval_loss": 0.5314927697181702, - "eval_runtime": 12.6689, - "eval_samples_per_second": 29.837, - "eval_steps_per_second": 1.026, - "step": 39500 + "epoch": 0.7968408079080413, + "grad_norm": 0.5303503274917603, + "learning_rate": 2.047476658022881e-06, + "loss": 0.7722, + "step": 39600 }, { - "epoch": 0.6570841889117043, - "grad_norm": 0.580834686756134, - "learning_rate": 5.458507351575337e-06, - "loss": 1.0158, + "epoch": 0.7968408079080413, + "eval_loss": 0.5208966135978699, + "eval_runtime": 11.3632, + "eval_samples_per_second": 33.265, + "eval_steps_per_second": 1.144, "step": 39600 }, { - "epoch": 0.658743492419057, - "grad_norm": 0.5881614685058594, - "learning_rate": 5.411181408019124e-06, - "loss": 1.0034, + "epoch": 0.7988530321704354, + "grad_norm": 0.5367266535758972, + "learning_rate": 2.0085343068842546e-06, + "loss": 0.753, "step": 39700 }, { - "epoch": 0.6604027959264099, - "grad_norm": 0.6529964208602905, - "learning_rate": 5.363985301327311e-06, - "loss": 1.0068, + "epoch": 0.8008652564328295, + "grad_norm": 0.5081086754798889, + "learning_rate": 1.9699244722847143e-06, + "loss": 0.7571, "step": 39800 }, { - "epoch": 0.6620620994337627, - "grad_norm": 0.6376907229423523, - "learning_rate": 5.316920366875091e-06, - "loss": 1.0118, + "epoch": 0.8028774806952235, + "grad_norm": 0.5019336938858032, + "learning_rate": 1.9316487607391465e-06, + "loss": 0.7723, "step": 39900 }, { - "epoch": 0.6637214029411155, - "grad_norm": 0.5695504546165466, - "learning_rate": 5.269987936326239e-06, - "loss": 1.002, - "step": 40000 + "epoch": 0.8028774806952235, + "eval_loss": 0.5206644535064697, + "eval_runtime": 11.3602, + "eval_samples_per_second": 33.274, + "eval_steps_per_second": 1.144, + "step": 39900 }, { - "epoch": 0.6637214029411155, - "eval_loss": 0.5303639769554138, - "eval_runtime": 13.2748, - "eval_samples_per_second": 28.475, - "eval_steps_per_second": 0.979, + "epoch": 0.8048897049576176, + "grad_norm": 0.5184951424598694, + "learning_rate": 1.893708764859924e-06, + "loss": 0.7677, "step": 40000 }, { - "epoch": 0.6653807064484683, - "grad_norm": 0.6005424857139587, - "learning_rate": 5.223189337595446e-06, - "loss": 1.0112, + "epoch": 0.8069019292200116, + "grad_norm": 0.5265465974807739, + "learning_rate": 1.8561060632906369e-06, + "loss": 0.7686, "step": 40100 }, { - "epoch": 0.667040009955821, - "grad_norm": 0.664351761341095, - "learning_rate": 5.176525894810747e-06, - "loss": 1.0088, + "epoch": 0.8089141534824056, + "grad_norm": 0.5161654353141785, + "learning_rate": 1.8188422206404165e-06, + "loss": 0.769, "step": 40200 }, { - "epoch": 0.6686993134631738, - "grad_norm": 0.6507367491722107, - "learning_rate": 5.1299989282760365e-06, - "loss": 1.0067, + "epoch": 0.8089141534824056, + "eval_loss": 0.5201809406280518, + "eval_runtime": 11.369, + "eval_samples_per_second": 33.248, + "eval_steps_per_second": 1.143, + "step": 40200 + }, + { + "epoch": 0.8109263777447997, + "grad_norm": 0.5580165982246399, + "learning_rate": 1.7819187874188293e-06, + "loss": 0.7686, "step": 40300 }, { - "epoch": 0.6703586169705266, - "grad_norm": 0.5738443732261658, - "learning_rate": 5.083609754433736e-06, - "loss": 1.0037, + "epoch": 0.8129386020071937, + "grad_norm": 0.5577532052993774, + "learning_rate": 1.7453372999713557e-06, + "loss": 0.7616, "step": 40400 }, { - "epoch": 0.6720179204778794, - "grad_norm": 0.6231431365013123, - "learning_rate": 5.037359685827536e-06, - "loss": 1.0094, + "epoch": 0.8149508262695877, + "grad_norm": 0.5307947993278503, + "learning_rate": 1.709099280415476e-06, + "loss": 0.7705, "step": 40500 }, { - "epoch": 0.6720179204778794, - "eval_loss": 0.5298513770103455, - "eval_runtime": 12.9027, - "eval_samples_per_second": 29.296, - "eval_steps_per_second": 1.008, + "epoch": 0.8149508262695877, + "eval_loss": 0.5200989842414856, + "eval_runtime": 11.3357, + "eval_samples_per_second": 33.346, + "eval_steps_per_second": 1.147, "step": 40500 }, { - "epoch": 0.6736772239852322, - "grad_norm": 0.5760438442230225, - "learning_rate": 4.99125003106525e-06, - "loss": 1.0166, + "epoch": 0.8169630505319818, + "grad_norm": 0.5261068940162659, + "learning_rate": 1.6732062365773272e-06, + "loss": 0.7674, "step": 40600 }, { - "epoch": 0.675336527492585, - "grad_norm": 0.5474633574485779, - "learning_rate": 4.945282094781802e-06, - "loss": 0.9994, + "epoch": 0.8189752747943758, + "grad_norm": 0.4946574568748474, + "learning_rate": 1.6376596619289653e-06, + "loss": 0.7654, "step": 40700 }, { - "epoch": 0.6769958309999378, - "grad_norm": 0.6012837886810303, - "learning_rate": 4.899457177602312e-06, - "loss": 1.0077, + "epoch": 0.8209874990567699, + "grad_norm": 0.5491064786911011, + "learning_rate": 1.6024610355262282e-06, + "loss": 0.7695, + "step": 40800 + }, + { + "epoch": 0.8209874990567699, + "eval_loss": 0.5198547840118408, + "eval_runtime": 11.316, + "eval_samples_per_second": 33.404, + "eval_steps_per_second": 1.149, "step": 40800 }, { - "epoch": 0.6786551345072905, - "grad_norm": 0.6029711961746216, - "learning_rate": 4.853776576105278e-06, - "loss": 1.0088, + "epoch": 0.8229997233191639, + "grad_norm": 0.5306958556175232, + "learning_rate": 1.5676118219471891e-06, + "loss": 0.7619, "step": 40900 }, { - "epoch": 0.6803144380146433, - "grad_norm": 0.6109251976013184, - "learning_rate": 4.80824158278592e-06, - "loss": 1.0049, + "epoch": 0.8250119475815579, + "grad_norm": 0.5380471348762512, + "learning_rate": 1.5331134712312235e-06, + "loss": 0.767, "step": 41000 }, { - "epoch": 0.6803144380146433, - "eval_loss": 0.5289680361747742, - "eval_runtime": 12.9084, - "eval_samples_per_second": 29.283, - "eval_steps_per_second": 1.007, - "step": 41000 + "epoch": 0.827024171843952, + "grad_norm": 0.5167573094367981, + "learning_rate": 1.4989674188186598e-06, + "loss": 0.7599, + "step": 41100 }, { - "epoch": 0.6819737415219962, - "grad_norm": 0.5774779319763184, - "learning_rate": 4.762853486019587e-06, - "loss": 1.0043, + "epoch": 0.827024171843952, + "eval_loss": 0.5196862816810608, + "eval_runtime": 11.2973, + "eval_samples_per_second": 33.459, + "eval_steps_per_second": 1.151, "step": 41100 }, { - "epoch": 0.683633045029349, - "grad_norm": 0.5759828686714172, - "learning_rate": 4.717613570025304e-06, - "loss": 1.0038, + "epoch": 0.829036396106346, + "grad_norm": 0.5409244894981384, + "learning_rate": 1.4651750854910685e-06, + "loss": 0.7587, "step": 41200 }, { - "epoch": 0.6852923485367017, - "grad_norm": 0.6227918267250061, - "learning_rate": 4.6725231148294514e-06, - "loss": 1.0113, + "epoch": 0.83104862036874, + "grad_norm": 0.5431727170944214, + "learning_rate": 1.4317378773121393e-06, + "loss": 0.7579, "step": 41300 }, { - "epoch": 0.6869516520440545, - "grad_norm": 0.6436517238616943, - "learning_rate": 4.627583396229539e-06, - "loss": 1.0026, + "epoch": 0.8330608446311342, + "grad_norm": 0.53000807762146, + "learning_rate": 1.3986571855691744e-06, + "loss": 0.7688, "step": 41400 }, { - "epoch": 0.6886109555514073, - "grad_norm": 0.5544204115867615, - "learning_rate": 4.5827956857580995e-06, - "loss": 0.993, - "step": 41500 + "epoch": 0.8330608446311342, + "eval_loss": 0.5197826623916626, + "eval_runtime": 11.3928, + "eval_samples_per_second": 33.179, + "eval_steps_per_second": 1.141, + "step": 41400 }, { - "epoch": 0.6886109555514073, - "eval_loss": 0.5285223722457886, - "eval_runtime": 12.1139, - "eval_samples_per_second": 31.204, - "eval_steps_per_second": 1.073, + "epoch": 0.8350730688935282, + "grad_norm": 0.5434339046478271, + "learning_rate": 1.3659343867151975e-06, + "loss": 0.7695, "step": 41500 }, { - "epoch": 0.69027025905876, - "grad_norm": 0.6123907566070557, - "learning_rate": 4.538161250646726e-06, - "loss": 1.0052, + "epoch": 0.8370852931559222, + "grad_norm": 0.5368450284004211, + "learning_rate": 1.3335708423116856e-06, + "loss": 0.7636, "step": 41600 }, { - "epoch": 0.6919295625661128, - "grad_norm": 0.5870750546455383, - "learning_rate": 4.49368135379021e-06, - "loss": 1.0032, + "epoch": 0.8390975174183163, + "grad_norm": 0.5331200361251831, + "learning_rate": 1.3015678989719116e-06, + "loss": 0.7696, "step": 41700 }, { - "epoch": 0.6935888660734657, - "grad_norm": 0.6387689113616943, - "learning_rate": 4.4493572537108165e-06, - "loss": 1.0093, + "epoch": 0.8390975174183163, + "eval_loss": 0.519400954246521, + "eval_runtime": 11.3064, + "eval_samples_per_second": 33.432, + "eval_steps_per_second": 1.15, + "step": 41700 + }, + { + "epoch": 0.8411097416807103, + "grad_norm": 0.5858904123306274, + "learning_rate": 1.2699268883049154e-06, + "loss": 0.7648, "step": 41800 }, { - "epoch": 0.6952481695808185, - "grad_norm": 0.578326940536499, - "learning_rate": 4.405190204522652e-06, - "loss": 0.9977, + "epoch": 0.8431219659431044, + "grad_norm": 0.5302870273590088, + "learning_rate": 1.2386491268600976e-06, + "loss": 0.7553, "step": 41900 }, { - "epoch": 0.6969074730881712, - "grad_norm": 0.5934872627258301, - "learning_rate": 4.361181455896209e-06, - "loss": 1.0033, + "epoch": 0.8451341902054984, + "grad_norm": 0.4971041679382324, + "learning_rate": 1.2077359160724388e-06, + "loss": 0.7655, "step": 42000 }, { - "epoch": 0.6969074730881712, - "eval_loss": 0.528695821762085, - "eval_runtime": 13.3932, - "eval_samples_per_second": 28.223, - "eval_steps_per_second": 0.971, + "epoch": 0.8451341902054984, + "eval_loss": 0.519396960735321, + "eval_runtime": 11.3912, + "eval_samples_per_second": 33.183, + "eval_steps_per_second": 1.141, "step": 42000 }, { - "epoch": 0.698566776595524, - "grad_norm": 0.5795491337776184, - "learning_rate": 4.317332253022994e-06, - "loss": 0.996, + "epoch": 0.8471464144678924, + "grad_norm": 0.5351930856704712, + "learning_rate": 1.1771885422083418e-06, + "loss": 0.7603, "step": 42100 }, { - "epoch": 0.7002260801028768, - "grad_norm": 0.5513399839401245, - "learning_rate": 4.273643836580298e-06, - "loss": 1.0128, + "epoch": 0.8491586387302865, + "grad_norm": 0.4970718026161194, + "learning_rate": 1.1470082763121227e-06, + "loss": 0.7661, "step": 42200 }, { - "epoch": 0.7018853836102296, - "grad_norm": 0.6419488787651062, - "learning_rate": 4.230117442696081e-06, - "loss": 0.9935, + "epoch": 0.8511708629926805, + "grad_norm": 0.5322678089141846, + "learning_rate": 1.1171963741531178e-06, + "loss": 0.7616, + "step": 42300 + }, + { + "epoch": 0.8511708629926805, + "eval_loss": 0.5193082094192505, + "eval_runtime": 11.3559, + "eval_samples_per_second": 33.287, + "eval_steps_per_second": 1.145, "step": 42300 }, { - "epoch": 0.7035446871175824, - "grad_norm": 0.5781517624855042, - "learning_rate": 4.1867543029140205e-06, - "loss": 0.9991, + "epoch": 0.8531830872550745, + "grad_norm": 0.5380090475082397, + "learning_rate": 1.0877540761734317e-06, + "loss": 0.7623, "step": 42400 }, { - "epoch": 0.7052039906249352, - "grad_norm": 0.6141367554664612, - "learning_rate": 4.143555644158647e-06, - "loss": 1.0024, + "epoch": 0.8551953115174686, + "grad_norm": 0.5419859290122986, + "learning_rate": 1.0586826074363277e-06, + "loss": 0.761, "step": 42500 }, { - "epoch": 0.7052039906249352, - "eval_loss": 0.5273035168647766, - "eval_runtime": 13.0603, - "eval_samples_per_second": 28.943, - "eval_steps_per_second": 0.995, - "step": 42500 + "epoch": 0.8572075357798626, + "grad_norm": 0.5447313189506531, + "learning_rate": 1.0299831775752478e-06, + "loss": 0.7635, + "step": 42600 }, { - "epoch": 0.706863294132288, - "grad_norm": 0.6785457730293274, - "learning_rate": 4.100522688700635e-06, - "loss": 0.998, + "epoch": 0.8572075357798626, + "eval_loss": 0.5189518332481384, + "eval_runtime": 11.3146, + "eval_samples_per_second": 33.408, + "eval_steps_per_second": 1.149, "step": 42600 }, { - "epoch": 0.7085225976396408, - "grad_norm": 0.5750902891159058, - "learning_rate": 4.057656654122226e-06, - "loss": 0.9981, + "epoch": 0.8592197600422568, + "grad_norm": 0.5054132342338562, + "learning_rate": 1.0016569807434894e-06, + "loss": 0.7553, "step": 42700 }, { - "epoch": 0.7101819011469935, - "grad_norm": 0.5711331963539124, - "learning_rate": 4.014958753282759e-06, - "loss": 0.9969, + "epoch": 0.8612319843046508, + "grad_norm": 0.5626354217529297, + "learning_rate": 9.737051955645104e-07, + "loss": 0.76, "step": 42800 }, { - "epoch": 0.7118412046543463, - "grad_norm": 0.6364001035690308, - "learning_rate": 3.9724301942843756e-06, - "loss": 1.0018, + "epoch": 0.8632442085670448, + "grad_norm": 0.6139233112335205, + "learning_rate": 9.461289850828936e-07, + "loss": 0.7586, "step": 42900 }, { - "epoch": 0.7135005081616991, - "grad_norm": 0.5494205355644226, - "learning_rate": 3.930072180437834e-06, - "loss": 0.9904, - "step": 43000 + "epoch": 0.8632442085670448, + "eval_loss": 0.5188504457473755, + "eval_runtime": 11.3931, + "eval_samples_per_second": 33.178, + "eval_steps_per_second": 1.141, + "step": 42900 }, { - "epoch": 0.7135005081616991, - "eval_loss": 0.5261385440826416, - "eval_runtime": 13.3755, - "eval_samples_per_second": 28.261, - "eval_steps_per_second": 0.972, + "epoch": 0.8652564328294389, + "grad_norm": 0.5168823003768921, + "learning_rate": 9.189294967159457e-07, + "loss": 0.7569, "step": 43000 }, { - "epoch": 0.715159811669052, - "grad_norm": 0.6434171795845032, - "learning_rate": 3.887885910228439e-06, - "loss": 0.9949, + "epoch": 0.8672686570918329, + "grad_norm": 0.5103846192359924, + "learning_rate": 8.921078622059643e-07, + "loss": 0.7598, "step": 43100 }, { - "epoch": 0.7168191151764047, - "grad_norm": 0.5774708390235901, - "learning_rate": 3.8458725772821685e-06, - "loss": 0.9988, + "epoch": 0.8692808813542269, + "grad_norm": 0.5376741290092468, + "learning_rate": 8.656651975731434e-07, + "loss": 0.7687, + "step": 43200 + }, + { + "epoch": 0.8692808813542269, + "eval_loss": 0.5187187790870667, + "eval_runtime": 11.3132, + "eval_samples_per_second": 33.412, + "eval_steps_per_second": 1.149, "step": 43200 }, { - "epoch": 0.7184784186837575, - "grad_norm": 0.5714669823646545, - "learning_rate": 3.8040333703318756e-06, - "loss": 1.0031, + "epoch": 0.871293105616621, + "grad_norm": 0.5139674544334412, + "learning_rate": 8.396026030691329e-07, + "loss": 0.7543, "step": 43300 }, { - "epoch": 0.7201377221911103, - "grad_norm": 0.5711707472801208, - "learning_rate": 3.7623694731836536e-06, - "loss": 0.9961, + "epoch": 0.873305329879015, + "grad_norm": 0.4912608563899994, + "learning_rate": 8.139211631312638e-07, + "loss": 0.759, "step": 43400 }, { - "epoch": 0.721797025698463, - "grad_norm": 0.5816017985343933, - "learning_rate": 3.7208820646833586e-06, - "loss": 0.9972, + "epoch": 0.8753175541414091, + "grad_norm": 0.5286913514137268, + "learning_rate": 7.886219463374256e-07, + "loss": 0.7579, "step": 43500 }, { - "epoch": 0.721797025698463, - "eval_loss": 0.5259422063827515, - "eval_runtime": 12.805, - "eval_samples_per_second": 29.52, - "eval_steps_per_second": 1.015, + "epoch": 0.8753175541414091, + "eval_loss": 0.5185059905052185, + "eval_runtime": 11.3249, + "eval_samples_per_second": 33.378, + "eval_steps_per_second": 1.148, "step": 43500 }, { - "epoch": 0.7234563292058158, - "grad_norm": 0.6087881326675415, - "learning_rate": 3.6795723186832444e-06, - "loss": 1.0036, + "epoch": 0.8773297784038031, + "grad_norm": 0.4960270822048187, + "learning_rate": 7.637060053615963e-07, + "loss": 0.7582, "step": 43600 }, { - "epoch": 0.7251156327131687, - "grad_norm": 0.5798130035400391, - "learning_rate": 3.63844140400874e-06, - "loss": 0.9946, + "epoch": 0.8793420026661971, + "grad_norm": 0.5134163498878479, + "learning_rate": 7.391743769300541e-07, + "loss": 0.7624, "step": 43700 }, { - "epoch": 0.7267749362205215, - "grad_norm": 0.5766187906265259, - "learning_rate": 3.597490484425398e-06, - "loss": 1.0186, + "epoch": 0.8813542269285912, + "grad_norm": 0.5594838857650757, + "learning_rate": 7.150280817782296e-07, + "loss": 0.7626, "step": 43800 }, { - "epoch": 0.7284342397278742, - "grad_norm": 0.5991424322128296, - "learning_rate": 3.5567207186059582e-06, - "loss": 0.9931, + "epoch": 0.8813542269285912, + "eval_loss": 0.5184139013290405, + "eval_runtime": 11.3303, + "eval_samples_per_second": 33.362, + "eval_steps_per_second": 1.147, + "step": 43800 + }, + { + "epoch": 0.8833664511909852, + "grad_norm": 0.523009717464447, + "learning_rate": 6.912681246082409e-07, + "loss": 0.7554, "step": 43900 }, { - "epoch": 0.730093543235227, - "grad_norm": 0.5690820813179016, - "learning_rate": 3.516133260097553e-06, - "loss": 0.9894, + "epoch": 0.8853786754533792, + "grad_norm": 0.50362229347229, + "learning_rate": 6.678954940470806e-07, + "loss": 0.758, "step": 44000 }, { - "epoch": 0.730093543235227, - "eval_loss": 0.5260379314422607, - "eval_runtime": 13.1379, - "eval_samples_per_second": 28.772, - "eval_steps_per_second": 0.99, - "step": 44000 + "epoch": 0.8873908997157733, + "grad_norm": 0.5441898107528687, + "learning_rate": 6.449111626054927e-07, + "loss": 0.7573, + "step": 44100 }, { - "epoch": 0.7317528467425798, - "grad_norm": 0.6331527829170227, - "learning_rate": 3.4757292572890866e-06, - "loss": 0.9947, + "epoch": 0.8873908997157733, + "eval_loss": 0.5184325575828552, + "eval_runtime": 11.3938, + "eval_samples_per_second": 33.176, + "eval_steps_per_second": 1.141, "step": 44100 }, { - "epoch": 0.7334121502499326, - "grad_norm": 0.6297070384025574, - "learning_rate": 3.4355098533787377e-06, - "loss": 0.9974, + "epoch": 0.8894031239781673, + "grad_norm": 0.520699679851532, + "learning_rate": 6.223160866374967e-07, + "loss": 0.7638, "step": 44200 }, { - "epoch": 0.7350714537572853, - "grad_norm": 0.6061388850212097, - "learning_rate": 3.3954761863415984e-06, - "loss": 1.0056, + "epoch": 0.8914153482405615, + "grad_norm": 0.4745332598686218, + "learning_rate": 6.001112063005998e-07, + "loss": 0.7577, "step": 44300 }, { - "epoch": 0.7367307572646382, - "grad_norm": 0.5723554491996765, - "learning_rate": 3.355629388897498e-06, - "loss": 0.9966, + "epoch": 0.8934275725029555, + "grad_norm": 0.49645400047302246, + "learning_rate": 5.782974455166767e-07, + "loss": 0.7619, "step": 44400 }, { - "epoch": 0.738390060771991, - "grad_norm": 0.6564521789550781, - "learning_rate": 3.3159705884789426e-06, - "loss": 0.9981, - "step": 44500 + "epoch": 0.8934275725029555, + "eval_loss": 0.518170952796936, + "eval_runtime": 11.3133, + "eval_samples_per_second": 33.412, + "eval_steps_per_second": 1.149, + "step": 44400 }, { - "epoch": 0.738390060771991, - "eval_loss": 0.525342583656311, - "eval_runtime": 13.379, - "eval_samples_per_second": 28.253, - "eval_steps_per_second": 0.972, + "epoch": 0.8954397967653495, + "grad_norm": 0.5159271955490112, + "learning_rate": 5.568757119335244e-07, + "loss": 0.7571, "step": 44500 }, { - "epoch": 0.7400493642793438, - "grad_norm": 0.58149653673172, - "learning_rate": 3.2765009071992106e-06, - "loss": 1.0024, + "epoch": 0.8974520210277436, + "grad_norm": 0.5097435712814331, + "learning_rate": 5.358468968871e-07, + "loss": 0.7697, "step": 44600 }, { - "epoch": 0.7417086677866965, - "grad_norm": 0.598773717880249, - "learning_rate": 3.2372214618206156e-06, - "loss": 1.0048, + "epoch": 0.8994642452901376, + "grad_norm": 0.5482389330863953, + "learning_rate": 5.152118753644275e-07, + "loss": 0.7682, "step": 44700 }, { - "epoch": 0.7433679712940493, - "grad_norm": 0.5654010772705078, - "learning_rate": 3.1981333637229006e-06, - "loss": 0.9895, + "epoch": 0.8994642452901376, + "eval_loss": 0.5181338787078857, + "eval_runtime": 11.4656, + "eval_samples_per_second": 32.968, + "eval_steps_per_second": 1.134, + "step": 44700 + }, + { + "epoch": 0.9014764695525316, + "grad_norm": 0.5253916382789612, + "learning_rate": 4.949715059671978e-07, + "loss": 0.7656, "step": 44800 }, { - "epoch": 0.7450272748014021, - "grad_norm": 0.6237831115722656, - "learning_rate": 3.1592377188717904e-06, - "loss": 1.0019, + "epoch": 0.9034886938149257, + "grad_norm": 0.4978592097759247, + "learning_rate": 4.7512663087603826e-07, + "loss": 0.7621, "step": 44900 }, { - "epoch": 0.7466865783087548, - "grad_norm": 0.5665493607521057, - "learning_rate": 3.1205356277877053e-06, - "loss": 1.0134, + "epoch": 0.9055009180773197, + "grad_norm": 0.5216113924980164, + "learning_rate": 4.5567807581546664e-07, + "loss": 0.7595, "step": 45000 }, { - "epoch": 0.7466865783087548, - "eval_loss": 0.5244150161743164, - "eval_runtime": 13.5719, - "eval_samples_per_second": 27.852, - "eval_steps_per_second": 0.958, + "epoch": 0.9055009180773197, + "eval_loss": 0.5181112885475159, + "eval_runtime": 11.5213, + "eval_samples_per_second": 32.809, + "eval_steps_per_second": 1.128, "step": 45000 }, { - "epoch": 0.7483458818161077, - "grad_norm": 0.5779845714569092, - "learning_rate": 3.082028185514623e-06, - "loss": 0.9978, + "epoch": 0.9075131423397138, + "grad_norm": 0.5027504563331604, + "learning_rate": 4.366266500195426e-07, + "loss": 0.7588, "step": 45100 }, { - "epoch": 0.7500051853234605, - "grad_norm": 0.5865043997764587, - "learning_rate": 3.0437164815890917e-06, - "loss": 0.9957, + "epoch": 0.9095253666021078, + "grad_norm": 0.5365561842918396, + "learning_rate": 4.1797314619819285e-07, + "loss": 0.7612, "step": 45200 }, { - "epoch": 0.7516644888308133, - "grad_norm": 0.6308901309967041, - "learning_rate": 3.005601600009395e-06, - "loss": 1.0034, + "epoch": 0.9115375908645018, + "grad_norm": 0.5316836833953857, + "learning_rate": 3.997183405042238e-07, + "loss": 0.7639, + "step": 45300 + }, + { + "epoch": 0.9115375908645018, + "eval_loss": 0.5180224776268005, + "eval_runtime": 11.5144, + "eval_samples_per_second": 32.828, + "eval_steps_per_second": 1.129, "step": 45300 }, { - "epoch": 0.753323792338166, - "grad_norm": 0.5870101451873779, - "learning_rate": 2.9676846192049004e-06, - "loss": 1.0062, + "epoch": 0.9135498151268959, + "grad_norm": 0.5350984930992126, + "learning_rate": 3.8186299250103085e-07, + "loss": 0.7582, "step": 45400 }, { - "epoch": 0.7549830958455188, - "grad_norm": 0.5671665072441101, - "learning_rate": 2.9299666120055315e-06, - "loss": 1.0136, + "epoch": 0.9155620393892899, + "grad_norm": 0.5509154796600342, + "learning_rate": 3.644078451309907e-07, + "loss": 0.7686, "step": 45500 }, { - "epoch": 0.7549830958455188, - "eval_loss": 0.5242615342140198, - "eval_runtime": 12.9774, - "eval_samples_per_second": 29.127, - "eval_steps_per_second": 1.002, - "step": 45500 + "epoch": 0.9175742636516839, + "grad_norm": 0.5419358611106873, + "learning_rate": 3.47353624684551e-07, + "loss": 0.762, + "step": 45600 }, { - "epoch": 0.7566423993528716, - "grad_norm": 0.6181697845458984, - "learning_rate": 2.892448645611412e-06, - "loss": 1.0007, + "epoch": 0.9175742636516839, + "eval_loss": 0.5179212689399719, + "eval_runtime": 11.4423, + "eval_samples_per_second": 33.035, + "eval_steps_per_second": 1.136, "step": 45600 }, { - "epoch": 0.7583017028602245, - "grad_norm": 0.6317583918571472, - "learning_rate": 2.85513178156268e-06, - "loss": 1.0105, + "epoch": 0.919586487914078, + "grad_norm": 0.5258903503417969, + "learning_rate": 3.307010407700084e-07, + "loss": 0.7598, "step": 45700 }, { - "epoch": 0.7599610063675772, - "grad_norm": 0.5862682461738586, - "learning_rate": 2.818017075709446e-06, - "loss": 0.9926, + "epoch": 0.921598712176472, + "grad_norm": 0.519910454750061, + "learning_rate": 3.1445078628398294e-07, + "loss": 0.7589, "step": 45800 }, { - "epoch": 0.76162030987493, - "grad_norm": 0.6446713209152222, - "learning_rate": 2.781105578181924e-06, - "loss": 0.9916, + "epoch": 0.9236109364388662, + "grad_norm": 0.5140842795372009, + "learning_rate": 2.986035373825902e-07, + "loss": 0.762, "step": 45900 }, { - "epoch": 0.7632796133822828, - "grad_norm": 0.6362677812576294, - "learning_rate": 2.744398333360705e-06, - "loss": 0.9981, - "step": 46000 + "epoch": 0.9236109364388662, + "eval_loss": 0.5178348422050476, + "eval_runtime": 11.4694, + "eval_samples_per_second": 32.957, + "eval_steps_per_second": 1.133, + "step": 45900 }, { - "epoch": 0.7632796133822828, - "eval_loss": 0.523980975151062, - "eval_runtime": 13.668, - "eval_samples_per_second": 27.656, - "eval_steps_per_second": 0.951, + "epoch": 0.9256231607012602, + "grad_norm": 0.5274850726127625, + "learning_rate": 2.8315995345329804e-07, + "loss": 0.758, "step": 46000 }, { - "epoch": 0.7649389168896356, - "grad_norm": 0.6043516993522644, - "learning_rate": 2.7078963798472236e-06, - "loss": 0.9926, + "epoch": 0.9276353849636542, + "grad_norm": 0.5443992018699646, + "learning_rate": 2.681206770875022e-07, + "loss": 0.7614, "step": 46100 }, { - "epoch": 0.7665982203969883, - "grad_norm": 0.5952335596084595, - "learning_rate": 2.6716007504343644e-06, - "loss": 1.0023, + "epoch": 0.9296476092260483, + "grad_norm": 0.5250468254089355, + "learning_rate": 2.5348633405378296e-07, + "loss": 0.7666, + "step": 46200 + }, + { + "epoch": 0.9296476092260483, + "eval_loss": 0.5178038477897644, + "eval_runtime": 11.6986, + "eval_samples_per_second": 32.311, + "eval_steps_per_second": 1.111, "step": 46200 }, { - "epoch": 0.7682575239043411, - "grad_norm": 0.5768688321113586, - "learning_rate": 2.6355124720772417e-06, - "loss": 0.9996, + "epoch": 0.9316598334884423, + "grad_norm": 0.5096211433410645, + "learning_rate": 2.392575332718627e-07, + "loss": 0.7697, "step": 46300 }, { - "epoch": 0.769916827411694, - "grad_norm": 0.5763490796089172, - "learning_rate": 2.599632565864142e-06, - "loss": 0.9956, + "epoch": 0.9336720577508363, + "grad_norm": 0.549790620803833, + "learning_rate": 2.2543486678727855e-07, + "loss": 0.7676, "step": 46400 }, { - "epoch": 0.7715761309190468, - "grad_norm": 0.5711187124252319, - "learning_rate": 2.563962046987625e-06, - "loss": 0.9882, + "epoch": 0.9356842820132304, + "grad_norm": 0.524726152420044, + "learning_rate": 2.120189097467451e-07, + "loss": 0.7673, "step": 46500 }, { - "epoch": 0.7715761309190468, - "eval_loss": 0.5233112573623657, - "eval_runtime": 13.0363, - "eval_samples_per_second": 28.996, - "eval_steps_per_second": 0.997, + "epoch": 0.9356842820132304, + "eval_loss": 0.5176617503166199, + "eval_runtime": 11.5673, + "eval_samples_per_second": 32.678, + "eval_steps_per_second": 1.124, "step": 46500 }, { - "epoch": 0.7732354344263995, - "grad_norm": 0.5754934549331665, - "learning_rate": 2.5285019247158138e-06, - "loss": 0.9988, + "epoch": 0.9376965062756244, + "grad_norm": 0.5029181838035583, + "learning_rate": 1.9901022037421723e-07, + "loss": 0.7642, "step": 46600 }, { - "epoch": 0.7748947379337523, - "grad_norm": 0.6119757294654846, - "learning_rate": 2.4932532023638356e-06, - "loss": 1.0014, + "epoch": 0.9397087305380185, + "grad_norm": 0.5207979679107666, + "learning_rate": 1.8640933994767073e-07, + "loss": 0.7592, "step": 46700 }, { - "epoch": 0.7765540414411051, - "grad_norm": 0.5794397592544556, - "learning_rate": 2.458216877265419e-06, - "loss": 0.9866, + "epoch": 0.9417209548004125, + "grad_norm": 0.5468851923942566, + "learning_rate": 1.74216792776577e-07, + "loss": 0.7631, "step": 46800 }, { - "epoch": 0.7782133449484578, - "grad_norm": 0.5890904068946838, - "learning_rate": 2.423393940744695e-06, - "loss": 0.9999, + "epoch": 0.9417209548004125, + "eval_loss": 0.5177092552185059, + "eval_runtime": 11.4559, + "eval_samples_per_second": 32.996, + "eval_steps_per_second": 1.135, + "step": 46800 + }, + { + "epoch": 0.9437331790628065, + "grad_norm": 0.5044853091239929, + "learning_rate": 1.62433086180086e-07, + "loss": 0.7644, "step": 46900 }, { - "epoch": 0.7798726484558107, - "grad_norm": 0.6056330800056458, - "learning_rate": 2.3887853780881397e-06, - "loss": 0.9941, + "epoch": 0.9457454033252006, + "grad_norm": 0.5245229005813599, + "learning_rate": 1.5105871046592e-07, + "loss": 0.7605, "step": 47000 }, { - "epoch": 0.7798726484558107, - "eval_loss": 0.5230756402015686, - "eval_runtime": 13.0978, - "eval_samples_per_second": 28.86, - "eval_steps_per_second": 0.993, - "step": 47000 + "epoch": 0.9477576275875946, + "grad_norm": 0.49839621782302856, + "learning_rate": 1.400941389099697e-07, + "loss": 0.7565, + "step": 47100 }, { - "epoch": 0.7815319519631635, - "grad_norm": 0.5889580249786377, - "learning_rate": 2.354392168516687e-06, - "loss": 0.9974, + "epoch": 0.9477576275875946, + "eval_loss": 0.5176432132720947, + "eval_runtime": 11.5662, + "eval_samples_per_second": 32.681, + "eval_steps_per_second": 1.124, "step": 47100 }, { - "epoch": 0.7831912554705163, - "grad_norm": 0.5722795724868774, - "learning_rate": 2.3202152851580416e-06, - "loss": 0.9879, + "epoch": 0.9497698518499886, + "grad_norm": 0.4973909556865692, + "learning_rate": 1.2953982773660223e-07, + "loss": 0.7656, "step": 47200 }, { - "epoch": 0.784850558977869, - "grad_norm": 0.5718373656272888, - "learning_rate": 2.286255695019135e-06, - "loss": 0.9892, + "epoch": 0.9517820761123827, + "grad_norm": 0.5007102489471436, + "learning_rate": 1.1939621609968088e-07, + "loss": 0.7506, "step": 47300 }, { - "epoch": 0.7865098624852218, - "grad_norm": 0.5903875827789307, - "learning_rate": 2.252514358958755e-06, - "loss": 0.9918, + "epoch": 0.9537943003747767, + "grad_norm": 0.49358874559402466, + "learning_rate": 1.0966372606428855e-07, + "loss": 0.7562, "step": 47400 }, { - "epoch": 0.7881691659925746, - "grad_norm": 0.614603579044342, - "learning_rate": 2.2189922316603796e-06, - "loss": 0.9902, - "step": 47500 + "epoch": 0.9537943003747767, + "eval_loss": 0.5176478624343872, + "eval_runtime": 11.3727, + "eval_samples_per_second": 33.237, + "eval_steps_per_second": 1.143, + "step": 47400 }, { - "epoch": 0.7881691659925746, - "eval_loss": 0.5229316353797913, - "eval_runtime": 13.0166, - "eval_samples_per_second": 29.04, - "eval_steps_per_second": 0.999, + "epoch": 0.9558065246371709, + "grad_norm": 0.5771644115447998, + "learning_rate": 1.0034276258916953e-07, + "loss": 0.766, "step": 47500 }, { - "epoch": 0.7898284694999274, - "grad_norm": 0.584284245967865, - "learning_rate": 2.185690261605151e-06, - "loss": 0.9958, + "epoch": 0.9578187488995649, + "grad_norm": 0.5385919213294983, + "learning_rate": 9.14337135098764e-08, + "loss": 0.7605, "step": 47600 }, { - "epoch": 0.7914877730072802, - "grad_norm": 0.5983254313468933, - "learning_rate": 2.1526093910450387e-06, - "loss": 0.9826, + "epoch": 0.9598309731619589, + "grad_norm": 0.5119192004203796, + "learning_rate": 8.293694952263286e-08, + "loss": 0.757, + "step": 47700 + }, + { + "epoch": 0.9598309731619589, + "eval_loss": 0.5176236033439636, + "eval_runtime": 11.3818, + "eval_samples_per_second": 33.211, + "eval_steps_per_second": 1.142, "step": 47700 }, { - "epoch": 0.793147076514633, - "grad_norm": 0.5463616847991943, - "learning_rate": 2.1197505559761867e-06, - "loss": 0.9911, + "epoch": 0.961843197424353, + "grad_norm": 0.5380053520202637, + "learning_rate": 7.485282416891393e-08, + "loss": 0.7574, "step": 47800 }, { - "epoch": 0.7948063800219858, - "grad_norm": 0.5666741728782654, - "learning_rate": 2.0871146861124292e-06, - "loss": 0.9861, + "epoch": 0.963855421686747, + "grad_norm": 0.5267532467842102, + "learning_rate": 6.718167382072983e-08, + "loss": 0.7668, "step": 47900 }, { - "epoch": 0.7964656835293386, - "grad_norm": 0.5529145002365112, - "learning_rate": 2.054702704858976e-06, - "loss": 0.9919, + "epoch": 0.965867645949141, + "grad_norm": 0.5199303030967712, + "learning_rate": 5.99238176666328e-08, + "loss": 0.756, "step": 48000 }, { - "epoch": 0.7964656835293386, - "eval_loss": 0.5223823189735413, - "eval_runtime": 12.7808, - "eval_samples_per_second": 29.576, - "eval_steps_per_second": 1.017, + "epoch": 0.965867645949141, + "eval_loss": 0.5175907015800476, + "eval_runtime": 11.4752, + "eval_samples_per_second": 32.941, + "eval_steps_per_second": 1.133, "step": 48000 }, { - "epoch": 0.7981249870366913, - "grad_norm": 0.5657969117164612, - "learning_rate": 2.0225155292862963e-06, - "loss": 0.9929, + "epoch": 0.9678798702115351, + "grad_norm": 0.5405638217926025, + "learning_rate": 5.307955769843443e-08, + "loss": 0.7612, "step": 48100 }, { - "epoch": 0.7997842905440441, - "grad_norm": 0.5925653576850891, - "learning_rate": 1.990554070104168e-06, - "loss": 1.0057, + "epoch": 0.9698920944739291, + "grad_norm": 0.47063717246055603, + "learning_rate": 4.664917869864338e-08, + "loss": 0.7667, "step": 48200 }, { - "epoch": 0.801443594051397, - "grad_norm": 0.60003262758255, - "learning_rate": 1.9588192316359013e-06, - "loss": 0.9905, + "epoch": 0.9719043187363232, + "grad_norm": 0.48465442657470703, + "learning_rate": 4.063294822861163e-08, + "loss": 0.7605, + "step": 48300 + }, + { + "epoch": 0.9719043187363232, + "eval_loss": 0.5175836682319641, + "eval_runtime": 11.3838, + "eval_samples_per_second": 33.205, + "eval_steps_per_second": 1.142, "step": 48300 }, { - "epoch": 0.8031028975587498, - "grad_norm": 0.6448950171470642, - "learning_rate": 1.9273119117927676e-06, - "loss": 0.993, + "epoch": 0.9739165429987172, + "grad_norm": 0.48423367738723755, + "learning_rate": 3.5031116617404435e-08, + "loss": 0.7574, "step": 48400 }, { - "epoch": 0.8047622010661025, - "grad_norm": 0.5476670861244202, - "learning_rate": 1.8960330020485828e-06, - "loss": 0.9895, + "epoch": 0.9759287672611112, + "grad_norm": 0.5320655107498169, + "learning_rate": 2.9843916951382e-08, + "loss": 0.767, "step": 48500 }, { - "epoch": 0.8047622010661025, - "eval_loss": 0.5221726894378662, - "eval_runtime": 13.2164, - "eval_samples_per_second": 28.601, - "eval_steps_per_second": 0.984, - "step": 48500 + "epoch": 0.9779409915235053, + "grad_norm": 0.5267395377159119, + "learning_rate": 2.5071565064506143e-08, + "loss": 0.7593, + "step": 48600 }, { - "epoch": 0.8064215045734553, - "grad_norm": 0.5977376103401184, - "learning_rate": 1.8649833874144807e-06, - "loss": 0.9986, + "epoch": 0.9779409915235053, + "eval_loss": 0.5175591707229614, + "eval_runtime": 11.3501, + "eval_samples_per_second": 33.304, + "eval_steps_per_second": 1.145, "step": 48600 }, { - "epoch": 0.8080808080808081, - "grad_norm": 0.5937013626098633, - "learning_rate": 1.8341639464138817e-06, - "loss": 0.9942, + "epoch": 0.9799532157858993, + "grad_norm": 0.514837920665741, + "learning_rate": 2.071425952934969e-08, + "loss": 0.7641, "step": 48700 }, { - "epoch": 0.8097401115881608, - "grad_norm": 0.5677590370178223, - "learning_rate": 1.8035755510576348e-06, - "loss": 1.0045, + "epoch": 0.9819654400482933, + "grad_norm": 0.5345449447631836, + "learning_rate": 1.677218164884753e-08, + "loss": 0.7685, "step": 48800 }, { - "epoch": 0.8113994150955136, - "grad_norm": 0.5833272933959961, - "learning_rate": 1.7732190668193394e-06, - "loss": 0.974, + "epoch": 0.9839776643106874, + "grad_norm": 0.5339971780776978, + "learning_rate": 1.3245495448739321e-08, + "loss": 0.7612, "step": 48900 }, { - "epoch": 0.8130587186028665, - "grad_norm": 0.5606270432472229, - "learning_rate": 1.743095352610854e-06, - "loss": 0.9901, - "step": 49000 + "epoch": 0.9839776643106874, + "eval_loss": 0.5175919532775879, + "eval_runtime": 11.3829, + "eval_samples_per_second": 33.208, + "eval_steps_per_second": 1.142, + "step": 48900 }, { - "epoch": 0.8130587186028665, - "eval_loss": 0.5221291184425354, - "eval_runtime": 12.9691, - "eval_samples_per_second": 29.146, - "eval_steps_per_second": 1.002, + "epoch": 0.9859898885730815, + "grad_norm": 0.49889686703681946, + "learning_rate": 1.013434767075605e-08, + "loss": 0.7692, "step": 49000 }, { - "epoch": 0.8147180221102193, - "grad_norm": 0.593102753162384, - "learning_rate": 1.7132052607580064e-06, - "loss": 0.9949, + "epoch": 0.9880021128354756, + "grad_norm": 0.5119482278823853, + "learning_rate": 7.438867766504931e-09, + "loss": 0.7578, "step": 49100 }, { - "epoch": 0.816377325617572, - "grad_norm": 0.5594152808189392, - "learning_rate": 1.6835496369764737e-06, - "loss": 1.0043, + "epoch": 0.9900143370978696, + "grad_norm": 0.5316244959831238, + "learning_rate": 5.159167892089256e-09, + "loss": 0.7568, + "step": 49200 + }, + { + "epoch": 0.9900143370978696, + "eval_loss": 0.5176030993461609, + "eval_runtime": 11.4046, + "eval_samples_per_second": 33.145, + "eval_steps_per_second": 1.14, "step": 49200 }, { - "epoch": 0.8180366291249248, - "grad_norm": 0.5467879772186279, - "learning_rate": 1.6541293203478402e-06, - "loss": 0.9913, + "epoch": 0.9920265613602636, + "grad_norm": 7.261257648468018, + "learning_rate": 3.2953429034399133e-09, + "loss": 0.7576, "step": 49300 }, { - "epoch": 0.8196959326322776, - "grad_norm": 0.5851374268531799, - "learning_rate": 1.6249451432958774e-06, - "loss": 0.9892, + "epoch": 0.9940387856226577, + "grad_norm": 0.48430758714675903, + "learning_rate": 1.847470352367431e-09, + "loss": 0.7577, "step": 49400 }, { - "epoch": 0.8213552361396304, - "grad_norm": 0.5654325485229492, - "learning_rate": 1.5959979315629748e-06, - "loss": 0.9984, + "epoch": 0.9960510098850517, + "grad_norm": 0.4918181598186493, + "learning_rate": 8.156104833345613e-10, + "loss": 0.7649, "step": 49500 }, { - "epoch": 0.8213552361396304, - "eval_loss": 0.5219221711158752, - "eval_runtime": 12.7283, - "eval_samples_per_second": 29.698, - "eval_steps_per_second": 1.021, + "epoch": 0.9960510098850517, + "eval_loss": 0.5175663232803345, + "eval_runtime": 11.4598, + "eval_samples_per_second": 32.985, + "eval_steps_per_second": 1.134, "step": 49500 }, { - "epoch": 0.8230145396469832, - "grad_norm": 0.5532642006874084, - "learning_rate": 1.567288504186788e-06, - "loss": 0.9847, + "epoch": 0.9980632341474457, + "grad_norm": 0.5409220457077026, + "learning_rate": 1.9980623095494645e-10, + "loss": 0.7531, "step": 49600 - }, - { - "epoch": 0.824673843154336, - "grad_norm": 0.5498866438865662, - "learning_rate": 1.5388176734770498e-06, - "loss": 0.9977, - "step": 49700 - }, - { - "epoch": 0.8263331466616888, - "grad_norm": 0.5780646204948425, - "learning_rate": 1.5105862449926045e-06, - "loss": 0.9836, - "step": 49800 - }, - { - "epoch": 0.8279924501690415, - "grad_norm": 0.5943354964256287, - "learning_rate": 1.482595017518601e-06, - "loss": 0.9966, - "step": 49900 - }, - { - "epoch": 0.8296517536763943, - "grad_norm": 0.5729531049728394, - "learning_rate": 1.4548447830439027e-06, - "loss": 0.9914, - "step": 50000 - }, - { - "epoch": 0.8296517536763943, - "eval_loss": 0.5218319296836853, - "eval_runtime": 12.633, - "eval_samples_per_second": 29.922, - "eval_steps_per_second": 1.029, - "step": 50000 - }, - { - "epoch": 0.8313110571837471, - "grad_norm": 0.6536192893981934, - "learning_rate": 1.4273363267386686e-06, - "loss": 0.9921, - "step": 50100 - }, - { - "epoch": 0.8329703606910999, - "grad_norm": 0.5611860156059265, - "learning_rate": 1.400070426932143e-06, - "loss": 0.996, - "step": 50200 - }, - { - "epoch": 0.8346296641984527, - "grad_norm": 0.5498550534248352, - "learning_rate": 1.3730478550906335e-06, - "loss": 0.9923, - "step": 50300 - }, - { - "epoch": 0.8362889677058055, - "grad_norm": 0.5699523091316223, - "learning_rate": 1.3462693757956847e-06, - "loss": 0.9819, - "step": 50400 - }, - { - "epoch": 0.8379482712131583, - "grad_norm": 0.5774253606796265, - "learning_rate": 1.319735746722437e-06, - "loss": 0.9952, - "step": 50500 - }, - { - "epoch": 0.8379482712131583, - "eval_loss": 0.5215762853622437, - "eval_runtime": 12.4993, - "eval_samples_per_second": 30.242, - "eval_steps_per_second": 1.04, - "step": 50500 - }, - { - "epoch": 0.8396075747205111, - "grad_norm": 0.5578539967536926, - "learning_rate": 1.2934477186181982e-06, - "loss": 0.9856, - "step": 50600 - }, - { - "epoch": 0.8412668782278638, - "grad_norm": 0.5727275609970093, - "learning_rate": 1.2674060352812e-06, - "loss": 1.0059, - "step": 50700 - }, - { - "epoch": 0.8429261817352166, - "grad_norm": 0.5760875344276428, - "learning_rate": 1.2416114335395424e-06, - "loss": 0.9815, - "step": 50800 - }, - { - "epoch": 0.8445854852425694, - "grad_norm": 0.5728462934494019, - "learning_rate": 1.2160646432303625e-06, - "loss": 0.9923, - "step": 50900 - }, - { - "epoch": 0.8462447887499223, - "grad_norm": 0.5854160785675049, - "learning_rate": 1.1907663871791731e-06, - "loss": 1.004, - "step": 51000 - }, - { - "epoch": 0.8462447887499223, - "eval_loss": 0.521274745464325, - "eval_runtime": 12.9379, - "eval_samples_per_second": 29.217, - "eval_steps_per_second": 1.005, - "step": 51000 - }, - { - "epoch": 0.847904092257275, - "grad_norm": 0.5791363716125488, - "learning_rate": 1.1657173811794098e-06, - "loss": 1.0007, - "step": 51100 - }, - { - "epoch": 0.8495633957646278, - "grad_norm": 0.5561227798461914, - "learning_rate": 1.1409183339721874e-06, - "loss": 0.9917, - "step": 51200 - }, - { - "epoch": 0.8512226992719806, - "grad_norm": 0.5726980566978455, - "learning_rate": 1.1163699472262412e-06, - "loss": 0.9838, - "step": 51300 - }, - { - "epoch": 0.8528820027793333, - "grad_norm": 0.5962728261947632, - "learning_rate": 1.0920729155180686e-06, - "loss": 0.9917, - "step": 51400 - }, - { - "epoch": 0.8545413062866861, - "grad_norm": 0.7328312993049622, - "learning_rate": 1.0680279263122873e-06, - "loss": 0.9846, - "step": 51500 - }, - { - "epoch": 0.8545413062866861, - "eval_loss": 0.5210849046707153, - "eval_runtime": 13.54, - "eval_samples_per_second": 27.917, - "eval_steps_per_second": 0.96, - "step": 51500 - }, - { - "epoch": 0.856200609794039, - "grad_norm": 0.570810079574585, - "learning_rate": 1.0442356599421788e-06, - "loss": 0.9872, - "step": 51600 - }, - { - "epoch": 0.8578599133013918, - "grad_norm": 0.6257898211479187, - "learning_rate": 1.020696789590434e-06, - "loss": 0.9941, - "step": 51700 - }, - { - "epoch": 0.8595192168087445, - "grad_norm": 0.5606411695480347, - "learning_rate": 9.974119812701155e-07, - "loss": 0.9941, - "step": 51800 - }, - { - "epoch": 0.8611785203160973, - "grad_norm": 0.5635068416595459, - "learning_rate": 9.743818938058091e-07, - "loss": 0.986, - "step": 51900 - }, - { - "epoch": 0.8628378238234501, - "grad_norm": 0.582958459854126, - "learning_rate": 9.516071788149772e-07, - "loss": 0.9883, - "step": 52000 - }, - { - "epoch": 0.8628378238234501, - "eval_loss": 0.5208600759506226, - "eval_runtime": 13.0679, - "eval_samples_per_second": 28.926, - "eval_steps_per_second": 0.995, - "step": 52000 - }, - { - "epoch": 0.8644971273308029, - "grad_norm": 0.5963780879974365, - "learning_rate": 9.290884806895339e-07, - "loss": 0.9915, - "step": 52100 - }, - { - "epoch": 0.8661564308381556, - "grad_norm": 0.5490589141845703, - "learning_rate": 9.068264365776014e-07, - "loss": 0.9891, - "step": 52200 - }, - { - "epoch": 0.8678157343455085, - "grad_norm": 0.5765030980110168, - "learning_rate": 8.848216763654894e-07, - "loss": 0.9836, - "step": 52300 - }, - { - "epoch": 0.8694750378528613, - "grad_norm": 0.5995026230812073, - "learning_rate": 8.630748226598673e-07, - "loss": 0.994, - "step": 52400 - }, - { - "epoch": 0.8711343413602141, - "grad_norm": 0.5637781620025635, - "learning_rate": 8.415864907701532e-07, - "loss": 0.989, - "step": 52500 - }, - { - "epoch": 0.8711343413602141, - "eval_loss": 0.520824134349823, - "eval_runtime": 12.8484, - "eval_samples_per_second": 29.42, - "eval_steps_per_second": 1.012, - "step": 52500 - }, - { - "epoch": 0.8727936448675668, - "grad_norm": 0.585106611251831, - "learning_rate": 8.203572886911026e-07, - "loss": 0.9835, - "step": 52600 - }, - { - "epoch": 0.8744529483749196, - "grad_norm": 0.54038405418396, - "learning_rate": 7.993878170856028e-07, - "loss": 0.9896, - "step": 52700 - }, - { - "epoch": 0.8761122518822724, - "grad_norm": 0.5863847732543945, - "learning_rate": 7.786786692676829e-07, - "loss": 0.9873, - "step": 52800 - }, - { - "epoch": 0.8777715553896253, - "grad_norm": 0.5839337110519409, - "learning_rate": 7.582304311857225e-07, - "loss": 0.9918, - "step": 52900 - }, - { - "epoch": 0.879430858896978, - "grad_norm": 0.567760705947876, - "learning_rate": 7.380436814058723e-07, - "loss": 0.9868, - "step": 53000 - }, - { - "epoch": 0.879430858896978, - "eval_loss": 0.520458996295929, - "eval_runtime": 13.3163, - "eval_samples_per_second": 28.386, - "eval_steps_per_second": 0.976, - "step": 53000 - }, - { - "epoch": 0.8810901624043308, - "grad_norm": 0.5426855087280273, - "learning_rate": 7.181189910956865e-07, - "loss": 0.9923, - "step": 53100 - }, - { - "epoch": 0.8827494659116836, - "grad_norm": 0.5498548150062561, - "learning_rate": 6.98456924007963e-07, - "loss": 0.9974, - "step": 53200 - }, - { - "epoch": 0.8844087694190363, - "grad_norm": 0.6108880639076233, - "learning_rate": 6.790580364647903e-07, - "loss": 0.9828, - "step": 53300 - }, - { - "epoch": 0.8860680729263891, - "grad_norm": 0.5660552978515625, - "learning_rate": 6.599228773418032e-07, - "loss": 0.9775, - "step": 53400 - }, - { - "epoch": 0.8877273764337419, - "grad_norm": 0.5404841303825378, - "learning_rate": 6.410519880526623e-07, - "loss": 0.9815, - "step": 53500 - }, - { - "epoch": 0.8877273764337419, - "eval_loss": 0.5203107595443726, - "eval_runtime": 12.8343, - "eval_samples_per_second": 29.452, - "eval_steps_per_second": 1.013, - "step": 53500 - }, - { - "epoch": 0.8893866799410948, - "grad_norm": 0.6040447354316711, - "learning_rate": 6.224459025337248e-07, - "loss": 0.9856, - "step": 53600 - }, - { - "epoch": 0.8910459834484475, - "grad_norm": 0.6258560419082642, - "learning_rate": 6.041051472289472e-07, - "loss": 0.9981, - "step": 53700 - }, - { - "epoch": 0.8927052869558003, - "grad_norm": 0.612902820110321, - "learning_rate": 5.860302410749819e-07, - "loss": 0.9862, - "step": 53800 - }, - { - "epoch": 0.8943645904631531, - "grad_norm": 0.5610395073890686, - "learning_rate": 5.68221695486495e-07, - "loss": 1.0003, - "step": 53900 - }, - { - "epoch": 0.8960238939705059, - "grad_norm": 0.5533348321914673, - "learning_rate": 5.506800143417024e-07, - "loss": 0.9921, - "step": 54000 - }, - { - "epoch": 0.8960238939705059, - "eval_loss": 0.5201912522315979, - "eval_runtime": 12.8576, - "eval_samples_per_second": 29.399, - "eval_steps_per_second": 1.011, - "step": 54000 - }, - { - "epoch": 0.8976831974778586, - "grad_norm": 0.5346217751502991, - "learning_rate": 5.334056939681087e-07, - "loss": 1.0011, - "step": 54100 - }, - { - "epoch": 0.8993425009852115, - "grad_norm": 0.5841894745826721, - "learning_rate": 5.163992231284609e-07, - "loss": 0.9898, - "step": 54200 - }, - { - "epoch": 0.9010018044925643, - "grad_norm": 0.5578398704528809, - "learning_rate": 4.996610830069271e-07, - "loss": 0.9895, - "step": 54300 - }, - { - "epoch": 0.9026611079999171, - "grad_norm": 0.5741618275642395, - "learning_rate": 4.831917471954772e-07, - "loss": 0.9907, - "step": 54400 - }, - { - "epoch": 0.9043204115072698, - "grad_norm": 0.5788339972496033, - "learning_rate": 4.669916816804776e-07, - "loss": 0.9963, - "step": 54500 - }, - { - "epoch": 0.9043204115072698, - "eval_loss": 0.5202063322067261, - "eval_runtime": 12.1551, - "eval_samples_per_second": 31.098, - "eval_steps_per_second": 1.07, - "step": 54500 - }, - { - "epoch": 0.9059797150146226, - "grad_norm": 0.5326141119003296, - "learning_rate": 4.510613448295176e-07, - "loss": 0.9914, - "step": 54600 - }, - { - "epoch": 0.9076390185219754, - "grad_norm": 0.5347960591316223, - "learning_rate": 4.3540118737843697e-07, - "loss": 0.988, - "step": 54700 - }, - { - "epoch": 0.9092983220293281, - "grad_norm": 0.5588984489440918, - "learning_rate": 4.200116524185627e-07, - "loss": 0.9863, - "step": 54800 - }, - { - "epoch": 0.910957625536681, - "grad_norm": 0.589580237865448, - "learning_rate": 4.048931753841856e-07, - "loss": 0.9902, - "step": 54900 - }, - { - "epoch": 0.9126169290440338, - "grad_norm": 0.5734642148017883, - "learning_rate": 3.9004618404023476e-07, - "loss": 0.987, - "step": 55000 - }, - { - "epoch": 0.9126169290440338, - "eval_loss": 0.5200493931770325, - "eval_runtime": 12.6665, - "eval_samples_per_second": 29.843, - "eval_steps_per_second": 1.026, - "step": 55000 - }, - { - "epoch": 0.9142762325513866, - "grad_norm": 0.5449884533882141, - "learning_rate": 3.754710984701704e-07, - "loss": 0.9866, - "step": 55100 - }, - { - "epoch": 0.9159355360587393, - "grad_norm": 0.5736817121505737, - "learning_rate": 3.6116833106410454e-07, - "loss": 0.9935, - "step": 55200 - }, - { - "epoch": 0.9175948395660921, - "grad_norm": 0.5701528191566467, - "learning_rate": 3.4713828650713043e-07, - "loss": 0.9834, - "step": 55300 - }, - { - "epoch": 0.9192541430734449, - "grad_norm": 0.5518535375595093, - "learning_rate": 3.333813617678672e-07, - "loss": 0.9871, - "step": 55400 - }, - { - "epoch": 0.9209134465807977, - "grad_norm": 0.5550582408905029, - "learning_rate": 3.1989794608723647e-07, - "loss": 0.9898, - "step": 55500 - }, - { - "epoch": 0.9209134465807977, - "eval_loss": 0.5198869109153748, - "eval_runtime": 12.5873, - "eval_samples_per_second": 30.03, - "eval_steps_per_second": 1.033, - "step": 55500 - }, - { - "epoch": 0.9225727500881505, - "grad_norm": 0.539126455783844, - "learning_rate": 3.0668842096744253e-07, - "loss": 0.9856, - "step": 55600 - }, - { - "epoch": 0.9242320535955033, - "grad_norm": 0.5302634239196777, - "learning_rate": 2.9375316016118425e-07, - "loss": 0.9809, - "step": 55700 - }, - { - "epoch": 0.9258913571028561, - "grad_norm": 0.5867940783500671, - "learning_rate": 2.8109252966107226e-07, - "loss": 0.9983, - "step": 55800 - }, - { - "epoch": 0.9275506606102089, - "grad_norm": 0.5523255467414856, - "learning_rate": 2.687068876892773e-07, - "loss": 0.9909, - "step": 55900 - }, - { - "epoch": 0.9292099641175616, - "grad_norm": 0.5510666966438293, - "learning_rate": 2.565965846874008e-07, - "loss": 0.9889, - "step": 56000 - }, - { - "epoch": 0.9292099641175616, - "eval_loss": 0.5197787284851074, - "eval_runtime": 12.1131, - "eval_samples_per_second": 31.206, - "eval_steps_per_second": 1.073, - "step": 56000 - }, - { - "epoch": 0.9308692676249144, - "grad_norm": 0.5827771425247192, - "learning_rate": 2.447619633065457e-07, - "loss": 0.997, - "step": 56100 - }, - { - "epoch": 0.9325285711322673, - "grad_norm": 0.5223729610443115, - "learning_rate": 2.3320335839763565e-07, - "loss": 0.988, - "step": 56200 - }, - { - "epoch": 0.93418787463962, - "grad_norm": 0.5550760626792908, - "learning_rate": 2.219210970019292e-07, - "loss": 0.9843, - "step": 56300 - }, - { - "epoch": 0.9358471781469728, - "grad_norm": 0.5889800786972046, - "learning_rate": 2.1091549834177495e-07, - "loss": 0.9899, - "step": 56400 - }, - { - "epoch": 0.9375064816543256, - "grad_norm": 0.566161572933197, - "learning_rate": 2.001868738115742e-07, - "loss": 1.0022, - "step": 56500 - }, - { - "epoch": 0.9375064816543256, - "eval_loss": 0.5197795033454895, - "eval_runtime": 15.0929, - "eval_samples_per_second": 25.045, - "eval_steps_per_second": 0.861, - "step": 56500 - }, - { - "epoch": 0.9391657851616784, - "grad_norm": 0.5723180174827576, - "learning_rate": 1.897355269689749e-07, - "loss": 0.983, - "step": 56600 - }, - { - "epoch": 0.9408250886690311, - "grad_norm": 0.5928417444229126, - "learning_rate": 1.795617535262739e-07, - "loss": 0.9949, - "step": 56700 - }, - { - "epoch": 0.9424843921763839, - "grad_norm": 0.5741714835166931, - "learning_rate": 1.6966584134206265e-07, - "loss": 0.9871, - "step": 56800 - }, - { - "epoch": 0.9441436956837368, - "grad_norm": 0.5464352369308472, - "learning_rate": 1.6004807041307136e-07, - "loss": 0.9879, - "step": 56900 - }, - { - "epoch": 0.9458029991910896, - "grad_norm": 0.5840194821357727, - "learning_rate": 1.5070871286625323e-07, - "loss": 0.9841, - "step": 57000 - }, - { - "epoch": 0.9458029991910896, - "eval_loss": 0.5197437405586243, - "eval_runtime": 12.1835, - "eval_samples_per_second": 31.026, - "eval_steps_per_second": 1.067, - "step": 57000 - }, - { - "epoch": 0.9474623026984423, - "grad_norm": 0.5888699293136597, - "learning_rate": 1.4164803295108275e-07, - "loss": 0.9895, - "step": 57100 - }, - { - "epoch": 0.9491216062057951, - "grad_norm": 0.5972567200660706, - "learning_rate": 1.328662870320785e-07, - "loss": 0.9822, - "step": 57200 - }, - { - "epoch": 0.9507809097131479, - "grad_norm": 0.5945199131965637, - "learning_rate": 1.243637235815487e-07, - "loss": 0.9903, - "step": 57300 - }, - { - "epoch": 0.9524402132205007, - "grad_norm": 0.5498097538948059, - "learning_rate": 1.1614058317256593e-07, - "loss": 0.9927, - "step": 57400 - }, - { - "epoch": 0.9540995167278535, - "grad_norm": 0.605226457118988, - "learning_rate": 1.0819709847215253e-07, - "loss": 0.9856, - "step": 57500 - }, - { - "epoch": 0.9540995167278535, - "eval_loss": 0.5196654200553894, - "eval_runtime": 12.2784, - "eval_samples_per_second": 30.786, - "eval_steps_per_second": 1.059, - "step": 57500 - }, - { - "epoch": 0.9557588202352063, - "grad_norm": 0.5570980310440063, - "learning_rate": 1.0053349423470471e-07, - "loss": 0.9933, - "step": 57600 - }, - { - "epoch": 0.9574181237425591, - "grad_norm": 0.5370175242424011, - "learning_rate": 9.314998729562763e-08, - "loss": 0.9866, - "step": 57700 - }, - { - "epoch": 0.9590774272499119, - "grad_norm": 0.5578773617744446, - "learning_rate": 8.604678656520482e-08, - "loss": 0.9896, - "step": 57800 - }, - { - "epoch": 0.9607367307572646, - "grad_norm": 0.6047046780586243, - "learning_rate": 7.922409302268397e-08, - "loss": 0.9844, - "step": 57900 - }, - { - "epoch": 0.9623960342646174, - "grad_norm": 0.5857073664665222, - "learning_rate": 7.268209971059148e-08, - "loss": 0.9901, - "step": 58000 - }, - { - "epoch": 0.9623960342646174, - "eval_loss": 0.5196630954742432, - "eval_runtime": 12.9692, - "eval_samples_per_second": 29.146, - "eval_steps_per_second": 1.002, - "step": 58000 - }, - { - "epoch": 0.9640553377719702, - "grad_norm": 0.5455501675605774, - "learning_rate": 6.642099172927241e-08, - "loss": 0.9874, - "step": 58100 - }, - { - "epoch": 0.965714641279323, - "grad_norm": 0.5487021207809448, - "learning_rate": 6.044094623164798e-08, - "loss": 0.9864, - "step": 58200 - }, - { - "epoch": 0.9673739447866758, - "grad_norm": 0.5846928954124451, - "learning_rate": 5.474213241821069e-08, - "loss": 0.9945, - "step": 58300 - }, - { - "epoch": 0.9690332482940286, - "grad_norm": 0.5437830090522766, - "learning_rate": 4.93247115322304e-08, - "loss": 0.9947, - "step": 58400 - }, - { - "epoch": 0.9706925518013814, - "grad_norm": 0.5384014844894409, - "learning_rate": 4.418883685519348e-08, - "loss": 0.9902, - "step": 58500 - }, - { - "epoch": 0.9706925518013814, - "eval_loss": 0.5197026133537292, - "eval_runtime": 12.3731, - "eval_samples_per_second": 30.55, - "eval_steps_per_second": 1.051, - "step": 58500 - }, - { - "epoch": 0.9723518553087341, - "grad_norm": 0.5737383365631104, - "learning_rate": 3.9334653702469695e-08, - "loss": 0.9876, - "step": 58600 - }, - { - "epoch": 0.9740111588160869, - "grad_norm": 0.5581490993499756, - "learning_rate": 3.476229941919651e-08, - "loss": 0.9882, - "step": 58700 - }, - { - "epoch": 0.9756704623234398, - "grad_norm": 0.5709626078605652, - "learning_rate": 3.0471903376393386e-08, - "loss": 0.9944, - "step": 58800 - }, - { - "epoch": 0.9773297658307926, - "grad_norm": 0.5843920111656189, - "learning_rate": 2.6463586967302446e-08, - "loss": 0.9803, - "step": 58900 - }, - { - "epoch": 0.9789890693381453, - "grad_norm": 0.5755345821380615, - "learning_rate": 2.273746360395568e-08, - "loss": 0.9805, - "step": 59000 - }, - { - "epoch": 0.9789890693381453, - "eval_loss": 0.5196226239204407, - "eval_runtime": 12.5179, - "eval_samples_per_second": 30.197, - "eval_steps_per_second": 1.039, - "step": 59000 - }, - { - "epoch": 0.9806483728454981, - "grad_norm": 0.6106430292129517, - "learning_rate": 1.9293638713961948e-08, - "loss": 0.9805, - "step": 59100 - }, - { - "epoch": 0.9823076763528509, - "grad_norm": 0.5703415274620056, - "learning_rate": 1.613220973752605e-08, - "loss": 0.9878, - "step": 59200 - }, - { - "epoch": 0.9839669798602036, - "grad_norm": 0.5435425043106079, - "learning_rate": 1.3253266124692021e-08, - "loss": 0.9798, - "step": 59300 - }, - { - "epoch": 0.9856262833675564, - "grad_norm": 0.580873429775238, - "learning_rate": 1.0656889332812948e-08, - "loss": 0.99, - "step": 59400 - }, - { - "epoch": 0.9872855868749093, - "grad_norm": 0.5808476805686951, - "learning_rate": 8.343152824242806e-09, - "loss": 0.9884, - "step": 59500 - }, - { - "epoch": 0.9872855868749093, - "eval_loss": 0.5197011232376099, - "eval_runtime": 12.5279, - "eval_samples_per_second": 30.173, - "eval_steps_per_second": 1.038, - "step": 59500 - }, - { - "epoch": 0.9889448903822621, - "grad_norm": 0.5640795826911926, - "learning_rate": 6.312122064262571e-09, - "loss": 1.0066, - "step": 59600 - }, - { - "epoch": 0.9906041938896148, - "grad_norm": 0.571942925453186, - "learning_rate": 4.56385451922281e-09, - "loss": 0.9882, - "step": 59700 - }, - { - "epoch": 0.9922634973969676, - "grad_norm": 0.5753745436668396, - "learning_rate": 3.098399654923867e-09, - "loss": 0.9925, - "step": 59800 - }, - { - "epoch": 0.9939228009043204, - "grad_norm": 0.577826738357544, - "learning_rate": 1.9157989352103225e-09, - "loss": 0.9998, - "step": 59900 - }, - { - "epoch": 0.9955821044116732, - "grad_norm": 0.5551530718803406, - "learning_rate": 1.0160858208008162e-09, - "loss": 0.9794, - "step": 60000 - }, - { - "epoch": 0.9955821044116732, - "eval_loss": 0.5197359919548035, - "eval_runtime": 12.4944, - "eval_samples_per_second": 30.254, - "eval_steps_per_second": 1.04, - "step": 60000 - }, - { - "epoch": 0.9972414079190259, - "grad_norm": 0.5505213141441345, - "learning_rate": 3.992857683421392e-10, - "loss": 0.9859, - "step": 60100 - }, - { - "epoch": 0.9989007114263788, - "grad_norm": 0.5432794094085693, - "learning_rate": 6.541622968536665e-11, - "loss": 0.9818, - "step": 60200 - }, - { - "epoch": 1.0, - "step": 60267, - "total_flos": 1.111556929356531e+20, - "train_loss": 0.04376575569283096, - "train_runtime": 51513.6998, - "train_samples_per_second": 140.389, - "train_steps_per_second": 1.17 } ], "logging_steps": 100, - "max_steps": 60267, + "max_steps": 49697, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, @@ -5210,7 +4819,7 @@ "attributes": {} } }, - "total_flos": 1.111556929356531e+20, + "total_flos": 9.166027593741658e+19, "train_batch_size": 10, "trial_name": null, "trial_params": null