| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 300, | |
| "global_step": 49697, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.002012224262394044, | |
| "grad_norm": 1.5522648096084595, | |
| "learning_rate": 1.9919517102615694e-06, | |
| "loss": 1.2489, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.004024448524788088, | |
| "grad_norm": 1.59534752368927, | |
| "learning_rate": 4.0040241448692155e-06, | |
| "loss": 1.1249, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.006036672787182132, | |
| "grad_norm": 1.5959556102752686, | |
| "learning_rate": 6.016096579476862e-06, | |
| "loss": 1.0698, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.006036672787182132, | |
| "eval_loss": 0.9532507061958313, | |
| "eval_runtime": 11.1234, | |
| "eval_samples_per_second": 33.982, | |
| "eval_steps_per_second": 1.169, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.008048897049576176, | |
| "grad_norm": 1.6886017322540283, | |
| "learning_rate": 8.028169014084509e-06, | |
| "loss": 1.0365, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.01006112131197022, | |
| "grad_norm": 1.421373963356018, | |
| "learning_rate": 1.0040241448692154e-05, | |
| "loss": 1.0155, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.012073345574364264, | |
| "grad_norm": 1.578765630722046, | |
| "learning_rate": 1.20523138832998e-05, | |
| "loss": 1.006, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.012073345574364264, | |
| "eval_loss": 0.8175720572471619, | |
| "eval_runtime": 11.5611, | |
| "eval_samples_per_second": 32.696, | |
| "eval_steps_per_second": 1.124, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.014085569836758306, | |
| "grad_norm": 1.1900346279144287, | |
| "learning_rate": 1.4064386317907446e-05, | |
| "loss": 0.9972, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.01609779409915235, | |
| "grad_norm": 1.4590531587600708, | |
| "learning_rate": 1.607645875251509e-05, | |
| "loss": 0.9895, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.018110018361546396, | |
| "grad_norm": 1.8518555164337158, | |
| "learning_rate": 1.8088531187122737e-05, | |
| "loss": 0.9718, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.018110018361546396, | |
| "eval_loss": 0.781577467918396, | |
| "eval_runtime": 11.4278, | |
| "eval_samples_per_second": 33.077, | |
| "eval_steps_per_second": 1.138, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.02012224262394044, | |
| "grad_norm": 1.351710319519043, | |
| "learning_rate": 1.999999947988626e-05, | |
| "loss": 0.972, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.022134466886334483, | |
| "grad_norm": 1.2841336727142334, | |
| "learning_rate": 1.9999770630715236e-05, | |
| "loss": 0.9662, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.024146691148728527, | |
| "grad_norm": 1.2296431064605713, | |
| "learning_rate": 1.9999125701534677e-05, | |
| "loss": 0.9578, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.024146691148728527, | |
| "eval_loss": 0.9337042570114136, | |
| "eval_runtime": 11.324, | |
| "eval_samples_per_second": 33.38, | |
| "eval_steps_per_second": 1.148, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.026158915411122568, | |
| "grad_norm": 1.3778767585754395, | |
| "learning_rate": 1.9998064719179408e-05, | |
| "loss": 0.9614, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.028171139673516612, | |
| "grad_norm": 1.3921650648117065, | |
| "learning_rate": 1.9996587727795803e-05, | |
| "loss": 0.9541, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.030183363935910656, | |
| "grad_norm": 1.3527588844299316, | |
| "learning_rate": 1.9994694788839924e-05, | |
| "loss": 0.9488, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.030183363935910656, | |
| "eval_loss": 0.7029635310173035, | |
| "eval_runtime": 11.506, | |
| "eval_samples_per_second": 32.853, | |
| "eval_steps_per_second": 1.13, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0321955881983047, | |
| "grad_norm": 0.8907983303070068, | |
| "learning_rate": 1.9992385981074994e-05, | |
| "loss": 0.9418, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.03420781246069875, | |
| "grad_norm": 1.136816382408142, | |
| "learning_rate": 1.998966140056808e-05, | |
| "loss": 0.946, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.03622003672309279, | |
| "grad_norm": 1.0668370723724365, | |
| "learning_rate": 1.9986521160686134e-05, | |
| "loss": 0.9357, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.03622003672309279, | |
| "eval_loss": 0.68252032995224, | |
| "eval_runtime": 11.3032, | |
| "eval_samples_per_second": 33.442, | |
| "eval_steps_per_second": 1.15, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.038232260985486835, | |
| "grad_norm": 0.8517168760299683, | |
| "learning_rate": 1.9982965392091262e-05, | |
| "loss": 0.936, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.04024448524788088, | |
| "grad_norm": 1.0746815204620361, | |
| "learning_rate": 1.9978994242735275e-05, | |
| "loss": 0.9384, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.04225670951027492, | |
| "grad_norm": 1.0119695663452148, | |
| "learning_rate": 1.9974607877853555e-05, | |
| "loss": 0.9252, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.04225670951027492, | |
| "eval_loss": 0.672024130821228, | |
| "eval_runtime": 11.3298, | |
| "eval_samples_per_second": 33.363, | |
| "eval_steps_per_second": 1.147, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.04426893377266897, | |
| "grad_norm": 0.7535356283187866, | |
| "learning_rate": 1.9969806479958154e-05, | |
| "loss": 0.9215, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.04628115803506301, | |
| "grad_norm": 0.837115466594696, | |
| "learning_rate": 1.996459024883023e-05, | |
| "loss": 0.9229, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.048293382297457055, | |
| "grad_norm": 0.9772033095359802, | |
| "learning_rate": 1.995895940151171e-05, | |
| "loss": 0.9155, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.048293382297457055, | |
| "eval_loss": 0.6609585285186768, | |
| "eval_runtime": 11.8546, | |
| "eval_samples_per_second": 31.886, | |
| "eval_steps_per_second": 1.097, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.0503056065598511, | |
| "grad_norm": 0.9059876799583435, | |
| "learning_rate": 1.9952914172296264e-05, | |
| "loss": 0.9104, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.052317830822245136, | |
| "grad_norm": 1.090819239616394, | |
| "learning_rate": 1.9946454812719572e-05, | |
| "loss": 0.9056, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.05433005508463918, | |
| "grad_norm": 0.8924378156661987, | |
| "learning_rate": 1.9939581591548833e-05, | |
| "loss": 0.9102, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.05433005508463918, | |
| "eval_loss": 0.6568426489830017, | |
| "eval_runtime": 11.3424, | |
| "eval_samples_per_second": 33.326, | |
| "eval_steps_per_second": 1.146, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.056342279347033224, | |
| "grad_norm": 0.9142224788665771, | |
| "learning_rate": 1.9932294794771596e-05, | |
| "loss": 0.9101, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.05835450360942727, | |
| "grad_norm": 1.060359239578247, | |
| "learning_rate": 1.992459472558387e-05, | |
| "loss": 0.9013, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.06036672787182131, | |
| "grad_norm": 0.7167413234710693, | |
| "learning_rate": 1.9916481704377487e-05, | |
| "loss": 0.9002, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.06036672787182131, | |
| "eval_loss": 0.6527668237686157, | |
| "eval_runtime": 11.4, | |
| "eval_samples_per_second": 33.158, | |
| "eval_steps_per_second": 1.14, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.062378952134215356, | |
| "grad_norm": 0.783549427986145, | |
| "learning_rate": 1.9907956068726782e-05, | |
| "loss": 0.897, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.0643911763966094, | |
| "grad_norm": 0.9683724045753479, | |
| "learning_rate": 1.9899018173374552e-05, | |
| "loss": 0.9294, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.06640340065900345, | |
| "grad_norm": 1.1547231674194336, | |
| "learning_rate": 1.9889668390217284e-05, | |
| "loss": 0.901, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.06640340065900345, | |
| "eval_loss": 0.6419159173965454, | |
| "eval_runtime": 11.429, | |
| "eval_samples_per_second": 33.074, | |
| "eval_steps_per_second": 1.137, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.0684156249213975, | |
| "grad_norm": 0.81548011302948, | |
| "learning_rate": 1.9879907108289684e-05, | |
| "loss": 0.9008, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.07042784918379154, | |
| "grad_norm": 0.7857891321182251, | |
| "learning_rate": 1.98697347337485e-05, | |
| "loss": 0.8928, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.07244007344618558, | |
| "grad_norm": 0.8332715630531311, | |
| "learning_rate": 1.985915168985561e-05, | |
| "loss": 0.8889, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.07244007344618558, | |
| "eval_loss": 0.6356409192085266, | |
| "eval_runtime": 11.2917, | |
| "eval_samples_per_second": 33.476, | |
| "eval_steps_per_second": 1.151, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.07445229770857963, | |
| "grad_norm": 0.9201735258102417, | |
| "learning_rate": 1.9848158416960414e-05, | |
| "loss": 0.8869, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.07646452197097367, | |
| "grad_norm": 0.7852803468704224, | |
| "learning_rate": 1.9836755372481512e-05, | |
| "loss": 0.8973, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.07847674623336771, | |
| "grad_norm": 0.7758309841156006, | |
| "learning_rate": 1.982494303088767e-05, | |
| "loss": 0.8925, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.07847674623336771, | |
| "eval_loss": 0.6345422863960266, | |
| "eval_runtime": 11.3533, | |
| "eval_samples_per_second": 33.294, | |
| "eval_steps_per_second": 1.145, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.08048897049576176, | |
| "grad_norm": 0.9436432123184204, | |
| "learning_rate": 1.981272188367809e-05, | |
| "loss": 0.8847, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.0825011947581558, | |
| "grad_norm": 0.8394960165023804, | |
| "learning_rate": 1.980009243936193e-05, | |
| "loss": 0.8923, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.08451341902054985, | |
| "grad_norm": 0.8079524636268616, | |
| "learning_rate": 1.9787055223437184e-05, | |
| "loss": 0.8828, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.08451341902054985, | |
| "eval_loss": 0.6277508735656738, | |
| "eval_runtime": 11.2988, | |
| "eval_samples_per_second": 33.455, | |
| "eval_steps_per_second": 1.151, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.08652564328294389, | |
| "grad_norm": 0.8562188744544983, | |
| "learning_rate": 1.977361077836878e-05, | |
| "loss": 0.8801, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.08853786754533793, | |
| "grad_norm": 0.9642734527587891, | |
| "learning_rate": 1.9759759663566032e-05, | |
| "loss": 0.896, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.09055009180773198, | |
| "grad_norm": 0.8723398447036743, | |
| "learning_rate": 1.9745502455359367e-05, | |
| "loss": 0.8879, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.09055009180773198, | |
| "eval_loss": 0.6282201409339905, | |
| "eval_runtime": 11.4757, | |
| "eval_samples_per_second": 32.939, | |
| "eval_steps_per_second": 1.133, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.09256231607012602, | |
| "grad_norm": 0.8613621592521667, | |
| "learning_rate": 1.9730839746976314e-05, | |
| "loss": 0.8854, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.09457454033252007, | |
| "grad_norm": 0.7336219549179077, | |
| "learning_rate": 1.9715772148516855e-05, | |
| "loss": 0.8806, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.09658676459491411, | |
| "grad_norm": 0.7842460870742798, | |
| "learning_rate": 1.970030028692802e-05, | |
| "loss": 0.8798, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.09658676459491411, | |
| "eval_loss": 0.6203732490539551, | |
| "eval_runtime": 11.2931, | |
| "eval_samples_per_second": 33.472, | |
| "eval_steps_per_second": 1.151, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.09859898885730815, | |
| "grad_norm": 1.042386770248413, | |
| "learning_rate": 1.968442480597781e-05, | |
| "loss": 0.8786, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.1006112131197022, | |
| "grad_norm": 0.8358279466629028, | |
| "learning_rate": 1.9668146366228398e-05, | |
| "loss": 0.8834, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.10262343738209624, | |
| "grad_norm": 0.9129268527030945, | |
| "learning_rate": 1.965146564500866e-05, | |
| "loss": 0.8763, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.10262343738209624, | |
| "eval_loss": 0.6140510439872742, | |
| "eval_runtime": 11.3122, | |
| "eval_samples_per_second": 33.415, | |
| "eval_steps_per_second": 1.149, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.10463566164449027, | |
| "grad_norm": 0.9329330325126648, | |
| "learning_rate": 1.963438333638598e-05, | |
| "loss": 0.8724, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.10664788590688432, | |
| "grad_norm": 0.9156613349914551, | |
| "learning_rate": 1.9616900151137375e-05, | |
| "loss": 0.8798, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.10866011016927836, | |
| "grad_norm": 1.0988123416900635, | |
| "learning_rate": 1.9599016816719912e-05, | |
| "loss": 0.8864, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.10866011016927836, | |
| "eval_loss": 0.613735556602478, | |
| "eval_runtime": 11.5595, | |
| "eval_samples_per_second": 32.7, | |
| "eval_steps_per_second": 1.125, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.1106723344316724, | |
| "grad_norm": 0.9962302446365356, | |
| "learning_rate": 1.9580734077240467e-05, | |
| "loss": 0.879, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.11268455869406645, | |
| "grad_norm": 0.6542097926139832, | |
| "learning_rate": 1.9562052693424724e-05, | |
| "loss": 0.8754, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.11469678295646049, | |
| "grad_norm": 0.8420646786689758, | |
| "learning_rate": 1.9542973442585542e-05, | |
| "loss": 0.8753, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.11469678295646049, | |
| "eval_loss": 0.6112973690032959, | |
| "eval_runtime": 11.3099, | |
| "eval_samples_per_second": 33.422, | |
| "eval_steps_per_second": 1.149, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.11670900721885454, | |
| "grad_norm": 1.0234030485153198, | |
| "learning_rate": 1.9523497118590625e-05, | |
| "loss": 0.869, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.11872123148124858, | |
| "grad_norm": 0.7687940001487732, | |
| "learning_rate": 1.9503624531829463e-05, | |
| "loss": 0.875, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.12073345574364262, | |
| "grad_norm": 0.858860194683075, | |
| "learning_rate": 1.9483356509179633e-05, | |
| "loss": 0.8682, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.12073345574364262, | |
| "eval_loss": 0.6082560420036316, | |
| "eval_runtime": 11.2984, | |
| "eval_samples_per_second": 33.456, | |
| "eval_steps_per_second": 1.151, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.12274568000603667, | |
| "grad_norm": 0.7500011324882507, | |
| "learning_rate": 1.946269389397239e-05, | |
| "loss": 0.8667, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.12475790426843071, | |
| "grad_norm": 0.8498502373695374, | |
| "learning_rate": 1.9441637545957558e-05, | |
| "loss": 0.8717, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.12677012853082475, | |
| "grad_norm": 0.9230628609657288, | |
| "learning_rate": 1.9420188341267783e-05, | |
| "loss": 0.8689, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.12677012853082475, | |
| "eval_loss": 0.6047795414924622, | |
| "eval_runtime": 11.3052, | |
| "eval_samples_per_second": 33.436, | |
| "eval_steps_per_second": 1.15, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.1287823527932188, | |
| "grad_norm": 0.7312197089195251, | |
| "learning_rate": 1.939834717238207e-05, | |
| "loss": 0.8676, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.13079457705561284, | |
| "grad_norm": 0.7080931067466736, | |
| "learning_rate": 1.9376114948088634e-05, | |
| "loss": 0.8632, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.1328068013180069, | |
| "grad_norm": 0.793525755405426, | |
| "learning_rate": 1.9353492593447107e-05, | |
| "loss": 0.8682, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.1328068013180069, | |
| "eval_loss": 0.6011930704116821, | |
| "eval_runtime": 11.4543, | |
| "eval_samples_per_second": 33.001, | |
| "eval_steps_per_second": 1.135, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.13481902558040093, | |
| "grad_norm": 0.7798284292221069, | |
| "learning_rate": 1.9330481049750028e-05, | |
| "loss": 0.8636, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.136831249842795, | |
| "grad_norm": 0.9270545840263367, | |
| "learning_rate": 1.9307081274483698e-05, | |
| "loss": 0.8644, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.13884347410518902, | |
| "grad_norm": 0.7777066826820374, | |
| "learning_rate": 1.9283294241288315e-05, | |
| "loss": 0.8682, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.13884347410518902, | |
| "eval_loss": 0.6046885848045349, | |
| "eval_runtime": 11.4509, | |
| "eval_samples_per_second": 33.01, | |
| "eval_steps_per_second": 1.135, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.14085569836758308, | |
| "grad_norm": 0.7538514733314514, | |
| "learning_rate": 1.925912093991748e-05, | |
| "loss": 0.8654, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.1428679226299771, | |
| "grad_norm": 0.6866621375083923, | |
| "learning_rate": 1.9234562376197015e-05, | |
| "loss": 0.8497, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.14488014689237116, | |
| "grad_norm": 0.829768717288971, | |
| "learning_rate": 1.92096195719831e-05, | |
| "loss": 0.8575, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.14488014689237116, | |
| "eval_loss": 0.6001401543617249, | |
| "eval_runtime": 11.2516, | |
| "eval_samples_per_second": 33.595, | |
| "eval_steps_per_second": 1.155, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.1468923711547652, | |
| "grad_norm": 0.8665058016777039, | |
| "learning_rate": 1.9184293565119755e-05, | |
| "loss": 0.8612, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.14890459541715925, | |
| "grad_norm": 0.7740942239761353, | |
| "learning_rate": 1.9158585409395674e-05, | |
| "loss": 0.8596, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.15091681967955328, | |
| "grad_norm": 0.672917902469635, | |
| "learning_rate": 1.9132496174500364e-05, | |
| "loss": 0.854, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.15091681967955328, | |
| "eval_loss": 0.5939906239509583, | |
| "eval_runtime": 11.3101, | |
| "eval_samples_per_second": 33.421, | |
| "eval_steps_per_second": 1.149, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.15292904394194734, | |
| "grad_norm": 0.719465970993042, | |
| "learning_rate": 1.9106026945979627e-05, | |
| "loss": 0.8615, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.15494126820434137, | |
| "grad_norm": 0.7433097958564758, | |
| "learning_rate": 1.9079178825190416e-05, | |
| "loss": 0.8564, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.15695349246673543, | |
| "grad_norm": 0.7390840649604797, | |
| "learning_rate": 1.9051952929254983e-05, | |
| "loss": 0.8526, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.15695349246673543, | |
| "eval_loss": 0.5941105484962463, | |
| "eval_runtime": 11.2494, | |
| "eval_samples_per_second": 33.602, | |
| "eval_steps_per_second": 1.156, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.15896571672912946, | |
| "grad_norm": 0.721076488494873, | |
| "learning_rate": 1.902435039101442e-05, | |
| "loss": 0.8535, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.16097794099152352, | |
| "grad_norm": 0.7117634415626526, | |
| "learning_rate": 1.899637235898151e-05, | |
| "loss": 0.8548, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.16299016525391755, | |
| "grad_norm": 0.7325859069824219, | |
| "learning_rate": 1.8968019997292937e-05, | |
| "loss": 0.8661, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.16299016525391755, | |
| "eval_loss": 0.5943772196769714, | |
| "eval_runtime": 11.2277, | |
| "eval_samples_per_second": 33.667, | |
| "eval_steps_per_second": 1.158, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.1650023895163116, | |
| "grad_norm": 0.8927565217018127, | |
| "learning_rate": 1.893929448566085e-05, | |
| "loss": 0.8535, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.16701461377870563, | |
| "grad_norm": 0.9083840250968933, | |
| "learning_rate": 1.8910197019323782e-05, | |
| "loss": 0.8581, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.1690268380410997, | |
| "grad_norm": 0.7133694291114807, | |
| "learning_rate": 1.8880728808996906e-05, | |
| "loss": 0.8491, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.1690268380410997, | |
| "eval_loss": 0.5923792719841003, | |
| "eval_runtime": 11.2757, | |
| "eval_samples_per_second": 33.523, | |
| "eval_steps_per_second": 1.153, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.17103906230349372, | |
| "grad_norm": 0.7994174361228943, | |
| "learning_rate": 1.8850891080821673e-05, | |
| "loss": 0.8577, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.17305128656588778, | |
| "grad_norm": 1.106224775314331, | |
| "learning_rate": 1.8820685076314782e-05, | |
| "loss": 0.849, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.1750635108282818, | |
| "grad_norm": 1.0492300987243652, | |
| "learning_rate": 1.8790112052316523e-05, | |
| "loss": 0.8579, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.1750635108282818, | |
| "eval_loss": 0.6185858845710754, | |
| "eval_runtime": 11.3469, | |
| "eval_samples_per_second": 33.313, | |
| "eval_steps_per_second": 1.146, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.17707573509067587, | |
| "grad_norm": 0.7523091435432434, | |
| "learning_rate": 1.875917328093849e-05, | |
| "loss": 0.8548, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.1790879593530699, | |
| "grad_norm": 0.8177125453948975, | |
| "learning_rate": 1.8727870049510636e-05, | |
| "loss": 0.8512, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.18110018361546396, | |
| "grad_norm": 0.7863544821739197, | |
| "learning_rate": 1.869620366052772e-05, | |
| "loss": 0.8474, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.18110018361546396, | |
| "eval_loss": 0.5867164134979248, | |
| "eval_runtime": 11.2542, | |
| "eval_samples_per_second": 33.588, | |
| "eval_steps_per_second": 1.155, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.18311240787785799, | |
| "grad_norm": 0.7436131834983826, | |
| "learning_rate": 1.8664175431595106e-05, | |
| "loss": 0.8587, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.18512463214025204, | |
| "grad_norm": 0.803816020488739, | |
| "learning_rate": 1.8631786695373943e-05, | |
| "loss": 0.8455, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.18713685640264607, | |
| "grad_norm": 0.9202460050582886, | |
| "learning_rate": 1.8599038799525712e-05, | |
| "loss": 0.8513, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.18713685640264607, | |
| "eval_loss": 0.583454430103302, | |
| "eval_runtime": 11.2388, | |
| "eval_samples_per_second": 33.633, | |
| "eval_steps_per_second": 1.157, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.18914908066504013, | |
| "grad_norm": 0.8134105801582336, | |
| "learning_rate": 1.856593310665614e-05, | |
| "loss": 0.8499, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.19116130492743416, | |
| "grad_norm": 0.7113932967185974, | |
| "learning_rate": 1.8532470994258533e-05, | |
| "loss": 0.849, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.19317352918982822, | |
| "grad_norm": 0.8230564594268799, | |
| "learning_rate": 1.8498653854656424e-05, | |
| "loss": 0.8413, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.19317352918982822, | |
| "eval_loss": 0.5848163962364197, | |
| "eval_runtime": 11.2801, | |
| "eval_samples_per_second": 33.51, | |
| "eval_steps_per_second": 1.152, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.19518575345222225, | |
| "grad_norm": 0.6756404638290405, | |
| "learning_rate": 1.8464483094945667e-05, | |
| "loss": 0.8543, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.1971979777146163, | |
| "grad_norm": 0.7398785352706909, | |
| "learning_rate": 1.8429960136935878e-05, | |
| "loss": 0.8428, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.19921020197701034, | |
| "grad_norm": 0.7419747710227966, | |
| "learning_rate": 1.8395086417091272e-05, | |
| "loss": 0.8516, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.19921020197701034, | |
| "eval_loss": 0.5863896608352661, | |
| "eval_runtime": 11.3198, | |
| "eval_samples_per_second": 33.393, | |
| "eval_steps_per_second": 1.148, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.2012224262394044, | |
| "grad_norm": 0.8145945072174072, | |
| "learning_rate": 1.8359863386470904e-05, | |
| "loss": 0.8508, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.20323465050179843, | |
| "grad_norm": 0.7068437933921814, | |
| "learning_rate": 1.8324292510668278e-05, | |
| "loss": 0.8495, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.20524687476419248, | |
| "grad_norm": 0.7419267892837524, | |
| "learning_rate": 1.828837526975038e-05, | |
| "loss": 0.8461, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.20524687476419248, | |
| "eval_loss": 0.5834963917732239, | |
| "eval_runtime": 11.7842, | |
| "eval_samples_per_second": 32.077, | |
| "eval_steps_per_second": 1.103, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.2072590990265865, | |
| "grad_norm": 1.129436731338501, | |
| "learning_rate": 1.8252113158196078e-05, | |
| "loss": 0.8435, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.20927132328898054, | |
| "grad_norm": 0.6937255859375, | |
| "learning_rate": 1.821550768483396e-05, | |
| "loss": 0.8485, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.2112835475513746, | |
| "grad_norm": 0.8506975769996643, | |
| "learning_rate": 1.8178560372779525e-05, | |
| "loss": 0.8473, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.2112835475513746, | |
| "eval_loss": 0.5813661217689514, | |
| "eval_runtime": 11.832, | |
| "eval_samples_per_second": 31.947, | |
| "eval_steps_per_second": 1.099, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.21329577181376863, | |
| "grad_norm": 0.733964204788208, | |
| "learning_rate": 1.814127275937183e-05, | |
| "loss": 0.836, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.2153079960761627, | |
| "grad_norm": 0.7400948405265808, | |
| "learning_rate": 1.8103646396109523e-05, | |
| "loss": 0.8473, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.21732022033855672, | |
| "grad_norm": 0.9023438096046448, | |
| "learning_rate": 1.8065682848586266e-05, | |
| "loss": 0.8468, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.21732022033855672, | |
| "eval_loss": 0.5793610215187073, | |
| "eval_runtime": 11.234, | |
| "eval_samples_per_second": 33.648, | |
| "eval_steps_per_second": 1.157, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.21933244460095078, | |
| "grad_norm": 0.82066810131073, | |
| "learning_rate": 1.8027383696425613e-05, | |
| "loss": 0.8457, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.2213446688633448, | |
| "grad_norm": 0.6094478964805603, | |
| "learning_rate": 1.7988750533215276e-05, | |
| "loss": 0.8408, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.22335689312573886, | |
| "grad_norm": 0.7535290122032166, | |
| "learning_rate": 1.7949784966440823e-05, | |
| "loss": 0.8403, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.22335689312573886, | |
| "eval_loss": 0.578126072883606, | |
| "eval_runtime": 11.202, | |
| "eval_samples_per_second": 33.744, | |
| "eval_steps_per_second": 1.161, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.2253691173881329, | |
| "grad_norm": 0.7472143769264221, | |
| "learning_rate": 1.791048861741877e-05, | |
| "loss": 0.8434, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.22738134165052695, | |
| "grad_norm": 0.8236815333366394, | |
| "learning_rate": 1.7870863121229162e-05, | |
| "loss": 0.8273, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.22939356591292098, | |
| "grad_norm": 0.6772099137306213, | |
| "learning_rate": 1.783091012664749e-05, | |
| "loss": 0.8355, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.22939356591292098, | |
| "eval_loss": 0.5848814249038696, | |
| "eval_runtime": 11.4019, | |
| "eval_samples_per_second": 33.152, | |
| "eval_steps_per_second": 1.14, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.23140579017531504, | |
| "grad_norm": 0.7480434775352478, | |
| "learning_rate": 1.779063129607612e-05, | |
| "loss": 0.8437, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.23341801443770907, | |
| "grad_norm": 0.8341161608695984, | |
| "learning_rate": 1.7750028305475125e-05, | |
| "loss": 0.8384, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.23543023870010313, | |
| "grad_norm": 0.9399694800376892, | |
| "learning_rate": 1.7709102844292516e-05, | |
| "loss": 0.8419, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.23543023870010313, | |
| "eval_loss": 0.5769637227058411, | |
| "eval_runtime": 11.2547, | |
| "eval_samples_per_second": 33.586, | |
| "eval_steps_per_second": 1.155, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.23744246296249716, | |
| "grad_norm": 0.8473734855651855, | |
| "learning_rate": 1.7667856615393987e-05, | |
| "loss": 0.8346, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.23945468722489122, | |
| "grad_norm": 0.6887069940567017, | |
| "learning_rate": 1.7626291334992027e-05, | |
| "loss": 0.8381, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.24146691148728525, | |
| "grad_norm": 0.6946566700935364, | |
| "learning_rate": 1.758440873257454e-05, | |
| "loss": 0.8345, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.24146691148728525, | |
| "eval_loss": 0.5747541785240173, | |
| "eval_runtime": 11.4122, | |
| "eval_samples_per_second": 33.122, | |
| "eval_steps_per_second": 1.139, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.2434791357496793, | |
| "grad_norm": 0.681305468082428, | |
| "learning_rate": 1.7542210550832854e-05, | |
| "loss": 0.841, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.24549136001207333, | |
| "grad_norm": 0.8475384712219238, | |
| "learning_rate": 1.749969854558923e-05, | |
| "loss": 0.8392, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.2475035842744674, | |
| "grad_norm": 1.1652250289916992, | |
| "learning_rate": 1.745687448572379e-05, | |
| "loss": 0.8388, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.2475035842744674, | |
| "eval_loss": 0.5746700763702393, | |
| "eval_runtime": 11.4476, | |
| "eval_samples_per_second": 33.02, | |
| "eval_steps_per_second": 1.136, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.24951580853686142, | |
| "grad_norm": 0.7575956583023071, | |
| "learning_rate": 1.741374015310094e-05, | |
| "loss": 0.8362, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.25152803279925545, | |
| "grad_norm": 0.7489831447601318, | |
| "learning_rate": 1.737029734249519e-05, | |
| "loss": 0.836, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.2535402570616495, | |
| "grad_norm": 0.7467206716537476, | |
| "learning_rate": 1.732654786151651e-05, | |
| "loss": 0.8317, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.2535402570616495, | |
| "eval_loss": 0.5750060081481934, | |
| "eval_runtime": 11.2549, | |
| "eval_samples_per_second": 33.585, | |
| "eval_steps_per_second": 1.155, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.25555248132404357, | |
| "grad_norm": 0.7825116515159607, | |
| "learning_rate": 1.7282493530535095e-05, | |
| "loss": 0.8335, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.2575647055864376, | |
| "grad_norm": 0.8054665923118591, | |
| "learning_rate": 1.723813618260564e-05, | |
| "loss": 0.8332, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.25957692984883163, | |
| "grad_norm": 0.740932822227478, | |
| "learning_rate": 1.7193477663391055e-05, | |
| "loss": 0.8333, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.25957692984883163, | |
| "eval_loss": 0.574753999710083, | |
| "eval_runtime": 11.3005, | |
| "eval_samples_per_second": 33.45, | |
| "eval_steps_per_second": 1.15, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.2615891541112257, | |
| "grad_norm": 0.6655648350715637, | |
| "learning_rate": 1.714851983108567e-05, | |
| "loss": 0.8332, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.26360137837361974, | |
| "grad_norm": 0.8892366886138916, | |
| "learning_rate": 1.710326455633792e-05, | |
| "loss": 0.833, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.2656136026360138, | |
| "grad_norm": 0.7081986665725708, | |
| "learning_rate": 1.7057713722172505e-05, | |
| "loss": 0.8352, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.2656136026360138, | |
| "eval_loss": 0.569306492805481, | |
| "eval_runtime": 11.2208, | |
| "eval_samples_per_second": 33.688, | |
| "eval_steps_per_second": 1.159, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.2676258268984078, | |
| "grad_norm": 0.7726171612739563, | |
| "learning_rate": 1.701186922391206e-05, | |
| "loss": 0.8325, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.26963805116080186, | |
| "grad_norm": 0.6000068187713623, | |
| "learning_rate": 1.6965732969098262e-05, | |
| "loss": 0.8303, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.2716502754231959, | |
| "grad_norm": 0.7751488089561462, | |
| "learning_rate": 1.6919306877412474e-05, | |
| "loss": 0.8311, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.2716502754231959, | |
| "eval_loss": 0.5708428621292114, | |
| "eval_runtime": 11.2236, | |
| "eval_samples_per_second": 33.679, | |
| "eval_steps_per_second": 1.158, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.27366249968559, | |
| "grad_norm": 0.7674184441566467, | |
| "learning_rate": 1.6872592880595872e-05, | |
| "loss": 0.8391, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.275674723947984, | |
| "grad_norm": 0.999799370765686, | |
| "learning_rate": 1.6825592922369066e-05, | |
| "loss": 0.8215, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.27768694821037804, | |
| "grad_norm": 0.7192254662513733, | |
| "learning_rate": 1.6778308958351213e-05, | |
| "loss": 0.8304, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.27768694821037804, | |
| "eval_loss": 0.5696760416030884, | |
| "eval_runtime": 11.2331, | |
| "eval_samples_per_second": 33.65, | |
| "eval_steps_per_second": 1.157, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.2796991724727721, | |
| "grad_norm": 1.1758594512939453, | |
| "learning_rate": 1.673074295597867e-05, | |
| "loss": 0.8346, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.28171139673516615, | |
| "grad_norm": 0.5974677801132202, | |
| "learning_rate": 1.6682896894423094e-05, | |
| "loss": 0.824, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.28372362099756016, | |
| "grad_norm": 0.720886766910553, | |
| "learning_rate": 1.6634772764509128e-05, | |
| "loss": 0.8246, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.28372362099756016, | |
| "eval_loss": 0.5675772428512573, | |
| "eval_runtime": 11.3956, | |
| "eval_samples_per_second": 33.171, | |
| "eval_steps_per_second": 1.141, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.2857358452599542, | |
| "grad_norm": 0.6889091730117798, | |
| "learning_rate": 1.6586372568631545e-05, | |
| "loss": 0.8231, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.28774806952234827, | |
| "grad_norm": 0.6523007154464722, | |
| "learning_rate": 1.6537698320671933e-05, | |
| "loss": 0.8272, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.28976029378474233, | |
| "grad_norm": 0.7638033628463745, | |
| "learning_rate": 1.64887520459149e-05, | |
| "loss": 0.8306, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.28976029378474233, | |
| "eval_loss": 0.569464921951294, | |
| "eval_runtime": 11.248, | |
| "eval_samples_per_second": 33.606, | |
| "eval_steps_per_second": 1.156, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.29177251804713633, | |
| "grad_norm": 0.6883799433708191, | |
| "learning_rate": 1.6439535780963808e-05, | |
| "loss": 0.8327, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.2937847423095304, | |
| "grad_norm": 0.8693552017211914, | |
| "learning_rate": 1.6390051573656028e-05, | |
| "loss": 0.8299, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.29579696657192445, | |
| "grad_norm": 0.6811352372169495, | |
| "learning_rate": 1.634030148297773e-05, | |
| "loss": 0.8257, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.29579696657192445, | |
| "eval_loss": 0.5680450797080994, | |
| "eval_runtime": 11.451, | |
| "eval_samples_per_second": 33.01, | |
| "eval_steps_per_second": 1.135, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.2978091908343185, | |
| "grad_norm": 0.7108572721481323, | |
| "learning_rate": 1.629028757897821e-05, | |
| "loss": 0.826, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.2998214150967125, | |
| "grad_norm": 0.701524555683136, | |
| "learning_rate": 1.6240011942683774e-05, | |
| "loss": 0.8233, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.30183363935910656, | |
| "grad_norm": 0.6415804028511047, | |
| "learning_rate": 1.6189476666011123e-05, | |
| "loss": 0.8174, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.30183363935910656, | |
| "eval_loss": 0.5662389397621155, | |
| "eval_runtime": 11.3747, | |
| "eval_samples_per_second": 33.232, | |
| "eval_steps_per_second": 1.143, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.3038458636215006, | |
| "grad_norm": 0.593760073184967, | |
| "learning_rate": 1.6138683851680328e-05, | |
| "loss": 0.8269, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.3058580878838947, | |
| "grad_norm": 0.6708555221557617, | |
| "learning_rate": 1.608763561312733e-05, | |
| "loss": 0.8277, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.3078703121462887, | |
| "grad_norm": 0.5819365382194519, | |
| "learning_rate": 1.603633407441601e-05, | |
| "loss": 0.8237, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.3078703121462887, | |
| "eval_loss": 0.5628697872161865, | |
| "eval_runtime": 11.3199, | |
| "eval_samples_per_second": 33.393, | |
| "eval_steps_per_second": 1.148, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.30988253640868274, | |
| "grad_norm": 0.725537896156311, | |
| "learning_rate": 1.5984781370149798e-05, | |
| "loss": 0.8355, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.3118947606710768, | |
| "grad_norm": 0.642382800579071, | |
| "learning_rate": 1.5932979645382863e-05, | |
| "loss": 0.8292, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.31390698493347086, | |
| "grad_norm": 0.6141934394836426, | |
| "learning_rate": 1.588093105553086e-05, | |
| "loss": 0.8306, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.31390698493347086, | |
| "eval_loss": 0.5633600354194641, | |
| "eval_runtime": 11.3793, | |
| "eval_samples_per_second": 33.218, | |
| "eval_steps_per_second": 1.142, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.31591920919586486, | |
| "grad_norm": 0.6902384757995605, | |
| "learning_rate": 1.5828637766281238e-05, | |
| "loss": 0.8243, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.3179314334582589, | |
| "grad_norm": 0.7464603781700134, | |
| "learning_rate": 1.5776101953503134e-05, | |
| "loss": 0.8296, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.319943657720653, | |
| "grad_norm": 0.6735148429870605, | |
| "learning_rate": 1.5723325803156834e-05, | |
| "loss": 0.8168, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.319943657720653, | |
| "eval_loss": 0.5626727938652039, | |
| "eval_runtime": 11.3991, | |
| "eval_samples_per_second": 33.16, | |
| "eval_steps_per_second": 1.14, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.32195588198304703, | |
| "grad_norm": 0.7461301684379578, | |
| "learning_rate": 1.5670311511202823e-05, | |
| "loss": 0.8175, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.32396810624544103, | |
| "grad_norm": 0.6454249620437622, | |
| "learning_rate": 1.5617061283510404e-05, | |
| "loss": 0.8287, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.3259803305078351, | |
| "grad_norm": 0.723892331123352, | |
| "learning_rate": 1.5563577335765925e-05, | |
| "loss": 0.8256, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.3259803305078351, | |
| "eval_loss": 0.5635449290275574, | |
| "eval_runtime": 11.3171, | |
| "eval_samples_per_second": 33.401, | |
| "eval_steps_per_second": 1.149, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.32799255477022915, | |
| "grad_norm": 0.6277914047241211, | |
| "learning_rate": 1.5509861893380576e-05, | |
| "loss": 0.8274, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.3300047790326232, | |
| "grad_norm": 0.6103200316429138, | |
| "learning_rate": 1.5455917191397806e-05, | |
| "loss": 0.8207, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.3320170032950172, | |
| "grad_norm": 0.6216299533843994, | |
| "learning_rate": 1.5401745474400306e-05, | |
| "loss": 0.8218, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.3320170032950172, | |
| "eval_loss": 0.5613713264465332, | |
| "eval_runtime": 11.3097, | |
| "eval_samples_per_second": 33.423, | |
| "eval_steps_per_second": 1.149, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.33402922755741127, | |
| "grad_norm": 0.6130411624908447, | |
| "learning_rate": 1.5347348996416626e-05, | |
| "loss": 0.8193, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.3360414518198053, | |
| "grad_norm": 0.7175905704498291, | |
| "learning_rate": 1.5292730020827394e-05, | |
| "loss": 0.8205, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.3380536760821994, | |
| "grad_norm": 0.5804928541183472, | |
| "learning_rate": 1.5237890820271124e-05, | |
| "loss": 0.8256, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.3380536760821994, | |
| "eval_loss": 0.558940589427948, | |
| "eval_runtime": 11.507, | |
| "eval_samples_per_second": 32.849, | |
| "eval_steps_per_second": 1.13, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.3400659003445934, | |
| "grad_norm": 0.7494300007820129, | |
| "learning_rate": 1.518283367654966e-05, | |
| "loss": 0.8225, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.34207812460698744, | |
| "grad_norm": 0.5440366268157959, | |
| "learning_rate": 1.5127560880533242e-05, | |
| "loss": 0.8272, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.3440903488693815, | |
| "grad_norm": 0.5601567625999451, | |
| "learning_rate": 1.5072074732065165e-05, | |
| "loss": 0.829, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.3440903488693815, | |
| "eval_loss": 0.5592995285987854, | |
| "eval_runtime": 11.056, | |
| "eval_samples_per_second": 34.19, | |
| "eval_steps_per_second": 1.176, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.34610257313177556, | |
| "grad_norm": 0.6553789377212524, | |
| "learning_rate": 1.5016377539866106e-05, | |
| "loss": 0.824, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.34811479739416956, | |
| "grad_norm": 0.7243614792823792, | |
| "learning_rate": 1.4960471621438047e-05, | |
| "loss": 0.8206, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.3501270216565636, | |
| "grad_norm": 0.7584229111671448, | |
| "learning_rate": 1.4904359302967848e-05, | |
| "loss": 0.8264, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.3501270216565636, | |
| "eval_loss": 0.5582433342933655, | |
| "eval_runtime": 11.4613, | |
| "eval_samples_per_second": 32.98, | |
| "eval_steps_per_second": 1.134, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.3521392459189577, | |
| "grad_norm": 0.9413104057312012, | |
| "learning_rate": 1.4848042919230464e-05, | |
| "loss": 0.8082, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.35415147018135174, | |
| "grad_norm": 0.7952352166175842, | |
| "learning_rate": 1.4791524813491789e-05, | |
| "loss": 0.8138, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.35616369444374574, | |
| "grad_norm": 0.6611462235450745, | |
| "learning_rate": 1.4734807337411166e-05, | |
| "loss": 0.817, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.35616369444374574, | |
| "eval_loss": 0.5570442080497742, | |
| "eval_runtime": 11.4931, | |
| "eval_samples_per_second": 32.889, | |
| "eval_steps_per_second": 1.131, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.3581759187061398, | |
| "grad_norm": 0.8845998644828796, | |
| "learning_rate": 1.4677892850943516e-05, | |
| "loss": 0.8124, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.36018814296853385, | |
| "grad_norm": 0.6421878337860107, | |
| "learning_rate": 1.462078372224117e-05, | |
| "loss": 0.814, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.3622003672309279, | |
| "grad_norm": 0.6532554030418396, | |
| "learning_rate": 1.456348232755531e-05, | |
| "loss": 0.8081, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.3622003672309279, | |
| "eval_loss": 0.5557852983474731, | |
| "eval_runtime": 11.4159, | |
| "eval_samples_per_second": 33.112, | |
| "eval_steps_per_second": 1.139, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.3642125914933219, | |
| "grad_norm": 0.8483557105064392, | |
| "learning_rate": 1.4505991051137112e-05, | |
| "loss": 0.8137, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.36622481575571597, | |
| "grad_norm": 0.7414484620094299, | |
| "learning_rate": 1.4448312285138524e-05, | |
| "loss": 0.8095, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.36823704001811003, | |
| "grad_norm": 0.6685389280319214, | |
| "learning_rate": 1.4390448429512747e-05, | |
| "loss": 0.8108, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.36823704001811003, | |
| "eval_loss": 0.5559925436973572, | |
| "eval_runtime": 11.4267, | |
| "eval_samples_per_second": 33.081, | |
| "eval_steps_per_second": 1.138, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.3702492642805041, | |
| "grad_norm": 0.5973154306411743, | |
| "learning_rate": 1.4332401891914365e-05, | |
| "loss": 0.8144, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.3722614885428981, | |
| "grad_norm": 0.6153602004051208, | |
| "learning_rate": 1.4274175087599166e-05, | |
| "loss": 0.8234, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.37427371280529215, | |
| "grad_norm": 0.6379988789558411, | |
| "learning_rate": 1.4215770439323657e-05, | |
| "loss": 0.8137, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.37427371280529215, | |
| "eval_loss": 0.5545734763145447, | |
| "eval_runtime": 11.3444, | |
| "eval_samples_per_second": 33.32, | |
| "eval_steps_per_second": 1.146, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.3762859370676862, | |
| "grad_norm": 0.6836999654769897, | |
| "learning_rate": 1.4157190377244233e-05, | |
| "loss": 0.811, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.37829816133008026, | |
| "grad_norm": 0.5659916400909424, | |
| "learning_rate": 1.409843733881608e-05, | |
| "loss": 0.8175, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.38031038559247426, | |
| "grad_norm": 0.6270354986190796, | |
| "learning_rate": 1.4039513768691753e-05, | |
| "loss": 0.8221, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.38031038559247426, | |
| "eval_loss": 0.5561990737915039, | |
| "eval_runtime": 11.437, | |
| "eval_samples_per_second": 33.051, | |
| "eval_steps_per_second": 1.137, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.3823226098548683, | |
| "grad_norm": 0.6403433680534363, | |
| "learning_rate": 1.3980422118619447e-05, | |
| "loss": 0.8156, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.3843348341172624, | |
| "grad_norm": 0.5956655144691467, | |
| "learning_rate": 1.3921164847340996e-05, | |
| "loss": 0.8161, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.38634705837965644, | |
| "grad_norm": 1.1075905561447144, | |
| "learning_rate": 1.3861744420489547e-05, | |
| "loss": 0.8115, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.38634705837965644, | |
| "eval_loss": 0.5551438927650452, | |
| "eval_runtime": 11.6061, | |
| "eval_samples_per_second": 32.569, | |
| "eval_steps_per_second": 1.12, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.38835928264205044, | |
| "grad_norm": 0.5919958353042603, | |
| "learning_rate": 1.380216331048699e-05, | |
| "loss": 0.8042, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.3903715069044445, | |
| "grad_norm": 0.599104106426239, | |
| "learning_rate": 1.3742423996441067e-05, | |
| "loss": 0.8107, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.39238373116683856, | |
| "grad_norm": 0.6891294121742249, | |
| "learning_rate": 1.3682528964042234e-05, | |
| "loss": 0.8082, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.39238373116683856, | |
| "eval_loss": 0.5554007291793823, | |
| "eval_runtime": 11.5763, | |
| "eval_samples_per_second": 32.653, | |
| "eval_steps_per_second": 1.123, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.3943959554292326, | |
| "grad_norm": 0.6625336408615112, | |
| "learning_rate": 1.3622480705460217e-05, | |
| "loss": 0.8161, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.3964081796916266, | |
| "grad_norm": 0.6874691843986511, | |
| "learning_rate": 1.3562281719240323e-05, | |
| "loss": 0.808, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.3984204039540207, | |
| "grad_norm": 0.6335239410400391, | |
| "learning_rate": 1.3501934510199479e-05, | |
| "loss": 0.8172, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.3984204039540207, | |
| "eval_loss": 0.5533725023269653, | |
| "eval_runtime": 11.4224, | |
| "eval_samples_per_second": 33.093, | |
| "eval_steps_per_second": 1.138, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.40043262821641473, | |
| "grad_norm": 0.6799935102462769, | |
| "learning_rate": 1.3441441589322013e-05, | |
| "loss": 0.8102, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.4024448524788088, | |
| "grad_norm": 0.7125223278999329, | |
| "learning_rate": 1.338080547365517e-05, | |
| "loss": 0.8196, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.4044570767412028, | |
| "grad_norm": 0.6379702091217041, | |
| "learning_rate": 1.3320028686204378e-05, | |
| "loss": 0.7988, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.4044570767412028, | |
| "eval_loss": 0.5532128214836121, | |
| "eval_runtime": 11.5518, | |
| "eval_samples_per_second": 32.722, | |
| "eval_steps_per_second": 1.125, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.40646930100359685, | |
| "grad_norm": 0.6244897842407227, | |
| "learning_rate": 1.325911375582827e-05, | |
| "loss": 0.8078, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.4084815252659909, | |
| "grad_norm": 0.6567655801773071, | |
| "learning_rate": 1.319806321713346e-05, | |
| "loss": 0.812, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.41049374952838497, | |
| "grad_norm": 0.7605450749397278, | |
| "learning_rate": 1.3136879610369091e-05, | |
| "loss": 0.8078, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.41049374952838497, | |
| "eval_loss": 0.5506391525268555, | |
| "eval_runtime": 11.3697, | |
| "eval_samples_per_second": 33.246, | |
| "eval_steps_per_second": 1.143, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.41250597379077897, | |
| "grad_norm": 0.669282853603363, | |
| "learning_rate": 1.3075565481321122e-05, | |
| "loss": 0.8086, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.414518198053173, | |
| "grad_norm": 0.6792070269584656, | |
| "learning_rate": 1.301412338120641e-05, | |
| "loss": 0.8075, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.4165304223155671, | |
| "grad_norm": 0.5937780737876892, | |
| "learning_rate": 1.2952555866566554e-05, | |
| "loss": 0.8151, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.4165304223155671, | |
| "eval_loss": 0.5495349168777466, | |
| "eval_runtime": 11.3633, | |
| "eval_samples_per_second": 33.265, | |
| "eval_steps_per_second": 1.144, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.4185426465779611, | |
| "grad_norm": 0.6547305583953857, | |
| "learning_rate": 1.2890865499161522e-05, | |
| "loss": 0.8022, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.42055487084035514, | |
| "grad_norm": 0.5942917466163635, | |
| "learning_rate": 1.2829054845863054e-05, | |
| "loss": 0.8079, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.4225670951027492, | |
| "grad_norm": 0.5794849991798401, | |
| "learning_rate": 1.2767126478547865e-05, | |
| "loss": 0.8152, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.4225670951027492, | |
| "eval_loss": 0.5491987466812134, | |
| "eval_runtime": 11.3343, | |
| "eval_samples_per_second": 33.35, | |
| "eval_steps_per_second": 1.147, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.42457931936514326, | |
| "grad_norm": 0.6574000120162964, | |
| "learning_rate": 1.2705082973990623e-05, | |
| "loss": 0.8087, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.42659154362753726, | |
| "grad_norm": 0.6523112654685974, | |
| "learning_rate": 1.264292691375674e-05, | |
| "loss": 0.8098, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.4286037678899313, | |
| "grad_norm": 0.6403859853744507, | |
| "learning_rate": 1.2580660884094944e-05, | |
| "loss": 0.8125, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.4286037678899313, | |
| "eval_loss": 0.5487639307975769, | |
| "eval_runtime": 11.6017, | |
| "eval_samples_per_second": 32.581, | |
| "eval_steps_per_second": 1.121, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.4306159921523254, | |
| "grad_norm": 0.6883541345596313, | |
| "learning_rate": 1.2518287475829687e-05, | |
| "loss": 0.804, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.43262821641471944, | |
| "grad_norm": 0.6650357246398926, | |
| "learning_rate": 1.2455809284253329e-05, | |
| "loss": 0.8097, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.43464044067711344, | |
| "grad_norm": 0.6048406958580017, | |
| "learning_rate": 1.239322890901815e-05, | |
| "loss": 0.8059, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.43464044067711344, | |
| "eval_loss": 0.5487421751022339, | |
| "eval_runtime": 11.4779, | |
| "eval_samples_per_second": 32.933, | |
| "eval_steps_per_second": 1.133, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.4366526649395075, | |
| "grad_norm": 0.6876850724220276, | |
| "learning_rate": 1.233054895402819e-05, | |
| "loss": 0.8027, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.43866488920190155, | |
| "grad_norm": 0.656778872013092, | |
| "learning_rate": 1.2267772027330893e-05, | |
| "loss": 0.8124, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.4406771134642956, | |
| "grad_norm": 0.6603732109069824, | |
| "learning_rate": 1.22049007410086e-05, | |
| "loss": 0.8032, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.4406771134642956, | |
| "eval_loss": 0.547619104385376, | |
| "eval_runtime": 11.4392, | |
| "eval_samples_per_second": 33.044, | |
| "eval_steps_per_second": 1.136, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.4426893377266896, | |
| "grad_norm": 0.5987362861633301, | |
| "learning_rate": 1.2141937711069857e-05, | |
| "loss": 0.8075, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.44470156198908367, | |
| "grad_norm": 0.6756895780563354, | |
| "learning_rate": 1.2078885557340562e-05, | |
| "loss": 0.8092, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 0.44671378625147773, | |
| "grad_norm": 0.7242164015769958, | |
| "learning_rate": 1.2015746903354968e-05, | |
| "loss": 0.8156, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.44671378625147773, | |
| "eval_loss": 0.5490314364433289, | |
| "eval_runtime": 11.6139, | |
| "eval_samples_per_second": 32.547, | |
| "eval_steps_per_second": 1.119, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.4487260105138718, | |
| "grad_norm": 0.77918541431427, | |
| "learning_rate": 1.1952524376246504e-05, | |
| "loss": 0.8063, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 0.4507382347762658, | |
| "grad_norm": 0.6913318634033203, | |
| "learning_rate": 1.1889220606638476e-05, | |
| "loss": 0.8079, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.45275045903865985, | |
| "grad_norm": 0.747986376285553, | |
| "learning_rate": 1.1825838228534607e-05, | |
| "loss": 0.8033, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.45275045903865985, | |
| "eval_loss": 0.5468713045120239, | |
| "eval_runtime": 11.4, | |
| "eval_samples_per_second": 33.158, | |
| "eval_steps_per_second": 1.14, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.4547626833010539, | |
| "grad_norm": 0.6693961024284363, | |
| "learning_rate": 1.1762379879209442e-05, | |
| "loss": 0.8089, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 0.45677490756344796, | |
| "grad_norm": 0.6168875098228455, | |
| "learning_rate": 1.1698848199098596e-05, | |
| "loss": 0.7998, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 0.45878713182584197, | |
| "grad_norm": 0.6753715872764587, | |
| "learning_rate": 1.1635245831688913e-05, | |
| "loss": 0.8057, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.45878713182584197, | |
| "eval_loss": 0.5467536449432373, | |
| "eval_runtime": 11.3082, | |
| "eval_samples_per_second": 33.427, | |
| "eval_steps_per_second": 1.15, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.460799356088236, | |
| "grad_norm": 0.6399224996566772, | |
| "learning_rate": 1.1571575423408456e-05, | |
| "loss": 0.7965, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 0.4628115803506301, | |
| "grad_norm": 0.5371870994567871, | |
| "learning_rate": 1.1507839623516401e-05, | |
| "loss": 0.8014, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.46482380461302414, | |
| "grad_norm": 0.711793839931488, | |
| "learning_rate": 1.1444041083992801e-05, | |
| "loss": 0.8081, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.46482380461302414, | |
| "eval_loss": 0.5455725193023682, | |
| "eval_runtime": 11.4796, | |
| "eval_samples_per_second": 32.928, | |
| "eval_steps_per_second": 1.132, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.46683602887541814, | |
| "grad_norm": 0.566677451133728, | |
| "learning_rate": 1.1380182459428234e-05, | |
| "loss": 0.8027, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.4688482531378122, | |
| "grad_norm": 0.7086474895477295, | |
| "learning_rate": 1.1316266406913355e-05, | |
| "loss": 0.8024, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 0.47086047740020626, | |
| "grad_norm": 0.6261083483695984, | |
| "learning_rate": 1.1252295585928343e-05, | |
| "loss": 0.8054, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 0.47086047740020626, | |
| "eval_loss": 0.5444592833518982, | |
| "eval_runtime": 11.5945, | |
| "eval_samples_per_second": 32.602, | |
| "eval_steps_per_second": 1.121, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 0.4728727016626003, | |
| "grad_norm": 0.6763809323310852, | |
| "learning_rate": 1.1188272658232228e-05, | |
| "loss": 0.7952, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.4748849259249943, | |
| "grad_norm": 0.6690487265586853, | |
| "learning_rate": 1.1124200287752157e-05, | |
| "loss": 0.807, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 0.4768971501873884, | |
| "grad_norm": 0.5711999535560608, | |
| "learning_rate": 1.1060081140472519e-05, | |
| "loss": 0.8052, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 0.4768971501873884, | |
| "eval_loss": 0.5443876385688782, | |
| "eval_runtime": 11.4195, | |
| "eval_samples_per_second": 33.101, | |
| "eval_steps_per_second": 1.138, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 0.47890937444978243, | |
| "grad_norm": 0.6411765217781067, | |
| "learning_rate": 1.0995917884324056e-05, | |
| "loss": 0.7976, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 0.4809215987121765, | |
| "grad_norm": 0.5719566941261292, | |
| "learning_rate": 1.0931713189072827e-05, | |
| "loss": 0.7992, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 0.4829338229745705, | |
| "grad_norm": 0.5175074934959412, | |
| "learning_rate": 1.086746972620913e-05, | |
| "loss": 0.8009, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.4829338229745705, | |
| "eval_loss": 0.5424737334251404, | |
| "eval_runtime": 11.3763, | |
| "eval_samples_per_second": 33.227, | |
| "eval_steps_per_second": 1.143, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.48494604723696455, | |
| "grad_norm": 0.6476929783821106, | |
| "learning_rate": 1.0803190168836341e-05, | |
| "loss": 0.7984, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 0.4869582714993586, | |
| "grad_norm": 0.6742759943008423, | |
| "learning_rate": 1.0738877191559691e-05, | |
| "loss": 0.7989, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 0.48897049576175267, | |
| "grad_norm": 0.5645999908447266, | |
| "learning_rate": 1.067453347037498e-05, | |
| "loss": 0.7985, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 0.48897049576175267, | |
| "eval_loss": 0.5427749752998352, | |
| "eval_runtime": 11.4256, | |
| "eval_samples_per_second": 33.084, | |
| "eval_steps_per_second": 1.138, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 0.49098272002414667, | |
| "grad_norm": 0.5972943902015686, | |
| "learning_rate": 1.0610161682557225e-05, | |
| "loss": 0.7961, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 0.4929949442865407, | |
| "grad_norm": 0.6340279579162598, | |
| "learning_rate": 1.0545764506549273e-05, | |
| "loss": 0.8033, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.4950071685489348, | |
| "grad_norm": 0.6096486449241638, | |
| "learning_rate": 1.0481344621850347e-05, | |
| "loss": 0.7955, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 0.4950071685489348, | |
| "eval_loss": 0.5418882369995117, | |
| "eval_runtime": 11.4157, | |
| "eval_samples_per_second": 33.112, | |
| "eval_steps_per_second": 1.139, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 0.49701939281132884, | |
| "grad_norm": 0.5778651833534241, | |
| "learning_rate": 1.041690470890455e-05, | |
| "loss": 0.7954, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 0.49903161707372284, | |
| "grad_norm": 0.5838211178779602, | |
| "learning_rate": 1.0352447448989337e-05, | |
| "loss": 0.7854, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 0.5010438413361169, | |
| "grad_norm": 0.5919055342674255, | |
| "learning_rate": 1.0287975524103964e-05, | |
| "loss": 0.7925, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 0.5010438413361169, | |
| "eval_loss": 0.541851818561554, | |
| "eval_runtime": 11.2979, | |
| "eval_samples_per_second": 33.457, | |
| "eval_steps_per_second": 1.151, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 0.5030560655985109, | |
| "grad_norm": 0.5358749628067017, | |
| "learning_rate": 1.022349161685787e-05, | |
| "loss": 0.7986, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.505068289860905, | |
| "grad_norm": 0.6401896476745605, | |
| "learning_rate": 1.0158998410359074e-05, | |
| "loss": 0.7914, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 0.507080514123299, | |
| "grad_norm": 0.5817869901657104, | |
| "learning_rate": 1.0094498588102523e-05, | |
| "loss": 0.7956, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.507080514123299, | |
| "eval_loss": 0.5417122840881348, | |
| "eval_runtime": 11.503, | |
| "eval_samples_per_second": 32.861, | |
| "eval_steps_per_second": 1.13, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.5090927383856931, | |
| "grad_norm": 0.5595591068267822, | |
| "learning_rate": 1.0029994833858438e-05, | |
| "loss": 0.7943, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 0.5111049626480871, | |
| "grad_norm": 0.5861169099807739, | |
| "learning_rate": 9.965489831560652e-06, | |
| "loss": 0.8006, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 0.5131171869104811, | |
| "grad_norm": 0.5644922852516174, | |
| "learning_rate": 9.900986265194924e-06, | |
| "loss": 0.7868, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.5131171869104811, | |
| "eval_loss": 0.5409750938415527, | |
| "eval_runtime": 11.3254, | |
| "eval_samples_per_second": 33.376, | |
| "eval_steps_per_second": 1.148, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.5151294111728753, | |
| "grad_norm": 0.5210478901863098, | |
| "learning_rate": 9.836486818687262e-06, | |
| "loss": 0.7967, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 0.5171416354352693, | |
| "grad_norm": 0.5937855839729309, | |
| "learning_rate": 9.771994175792262e-06, | |
| "loss": 0.7839, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 0.5191538596976633, | |
| "grad_norm": 0.68199622631073, | |
| "learning_rate": 9.707511019981416e-06, | |
| "loss": 0.7929, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 0.5191538596976633, | |
| "eval_loss": 0.53957599401474, | |
| "eval_runtime": 11.2847, | |
| "eval_samples_per_second": 33.497, | |
| "eval_steps_per_second": 1.152, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 0.5211660839600574, | |
| "grad_norm": 0.6363146305084229, | |
| "learning_rate": 9.643040034331475e-06, | |
| "loss": 0.7893, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 0.5231783082224514, | |
| "grad_norm": 0.6275014877319336, | |
| "learning_rate": 9.578583901412802e-06, | |
| "loss": 0.7883, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.5251905324848455, | |
| "grad_norm": 0.5840523838996887, | |
| "learning_rate": 9.514145303177751e-06, | |
| "loss": 0.7961, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 0.5251905324848455, | |
| "eval_loss": 0.5387553572654724, | |
| "eval_runtime": 11.2936, | |
| "eval_samples_per_second": 33.47, | |
| "eval_steps_per_second": 1.151, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 0.5272027567472395, | |
| "grad_norm": 0.706901490688324, | |
| "learning_rate": 9.449726920849085e-06, | |
| "loss": 0.795, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 0.5292149810096335, | |
| "grad_norm": 0.5236905813217163, | |
| "learning_rate": 9.385331434808386e-06, | |
| "loss": 0.7919, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 0.5312272052720276, | |
| "grad_norm": 0.6014547348022461, | |
| "learning_rate": 9.320961524484565e-06, | |
| "loss": 0.7917, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.5312272052720276, | |
| "eval_loss": 0.5388390421867371, | |
| "eval_runtime": 11.3827, | |
| "eval_samples_per_second": 33.208, | |
| "eval_steps_per_second": 1.142, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.5332394295344216, | |
| "grad_norm": 0.5613085031509399, | |
| "learning_rate": 9.256619868242341e-06, | |
| "loss": 0.7957, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.5352516537968156, | |
| "grad_norm": 0.6822344064712524, | |
| "learning_rate": 9.192309143270818e-06, | |
| "loss": 0.7867, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 0.5372638780592097, | |
| "grad_norm": 0.6041319370269775, | |
| "learning_rate": 9.128032025472077e-06, | |
| "loss": 0.7884, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 0.5372638780592097, | |
| "eval_loss": 0.5368719696998596, | |
| "eval_runtime": 11.3484, | |
| "eval_samples_per_second": 33.309, | |
| "eval_steps_per_second": 1.146, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 0.5392761023216037, | |
| "grad_norm": 0.644088089466095, | |
| "learning_rate": 9.063791189349841e-06, | |
| "loss": 0.7867, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 0.5412883265839978, | |
| "grad_norm": 0.627928614616394, | |
| "learning_rate": 8.999589307898192e-06, | |
| "loss": 0.7896, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 0.5433005508463918, | |
| "grad_norm": 0.6207029819488525, | |
| "learning_rate": 8.935429052490347e-06, | |
| "loss": 0.7853, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.5433005508463918, | |
| "eval_loss": 0.5371023416519165, | |
| "eval_runtime": 11.3461, | |
| "eval_samples_per_second": 33.316, | |
| "eval_steps_per_second": 1.146, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.5453127751087858, | |
| "grad_norm": 0.541533887386322, | |
| "learning_rate": 8.87131309276751e-06, | |
| "loss": 0.7916, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 0.54732499937118, | |
| "grad_norm": 0.590813934803009, | |
| "learning_rate": 8.807244096527783e-06, | |
| "loss": 0.7948, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 0.549337223633574, | |
| "grad_norm": 0.584229588508606, | |
| "learning_rate": 8.743224729615168e-06, | |
| "loss": 0.7918, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 0.549337223633574, | |
| "eval_loss": 0.5366615653038025, | |
| "eval_runtime": 11.3157, | |
| "eval_samples_per_second": 33.405, | |
| "eval_steps_per_second": 1.149, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 0.551349447895968, | |
| "grad_norm": 0.6746295094490051, | |
| "learning_rate": 8.679257655808645e-06, | |
| "loss": 0.7911, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 0.5533616721583621, | |
| "grad_norm": 0.6765587329864502, | |
| "learning_rate": 8.615345536711331e-06, | |
| "loss": 0.7906, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.5553738964207561, | |
| "grad_norm": 0.5838325619697571, | |
| "learning_rate": 8.551491031639736e-06, | |
| "loss": 0.7937, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.5553738964207561, | |
| "eval_loss": 0.5361348390579224, | |
| "eval_runtime": 11.3123, | |
| "eval_samples_per_second": 33.415, | |
| "eval_steps_per_second": 1.149, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.5573861206831502, | |
| "grad_norm": 0.6001378893852234, | |
| "learning_rate": 8.487696797513108e-06, | |
| "loss": 0.7777, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 0.5593983449455442, | |
| "grad_norm": 0.5667701363563538, | |
| "learning_rate": 8.423965488742885e-06, | |
| "loss": 0.7856, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 0.5614105692079382, | |
| "grad_norm": 0.632291316986084, | |
| "learning_rate": 8.360299757122247e-06, | |
| "loss": 0.7792, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 0.5614105692079382, | |
| "eval_loss": 0.5353109240531921, | |
| "eval_runtime": 11.3749, | |
| "eval_samples_per_second": 33.231, | |
| "eval_steps_per_second": 1.143, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 0.5634227934703323, | |
| "grad_norm": 0.5472155213356018, | |
| "learning_rate": 8.296702251715778e-06, | |
| "loss": 0.7831, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.5654350177327263, | |
| "grad_norm": 0.590352475643158, | |
| "learning_rate": 8.233175618749243e-06, | |
| "loss": 0.7833, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 0.5674472419951203, | |
| "grad_norm": 0.5392365455627441, | |
| "learning_rate": 8.16972250149947e-06, | |
| "loss": 0.7846, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 0.5674472419951203, | |
| "eval_loss": 0.5345659852027893, | |
| "eval_runtime": 11.3797, | |
| "eval_samples_per_second": 33.217, | |
| "eval_steps_per_second": 1.142, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 0.5694594662575144, | |
| "grad_norm": 0.5367996692657471, | |
| "learning_rate": 8.106345540184382e-06, | |
| "loss": 0.7881, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 0.5714716905199084, | |
| "grad_norm": 0.7017585039138794, | |
| "learning_rate": 8.043047371853135e-06, | |
| "loss": 0.7902, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 0.5734839147823025, | |
| "grad_norm": 0.6775383353233337, | |
| "learning_rate": 7.979830630276384e-06, | |
| "loss": 0.795, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.5734839147823025, | |
| "eval_loss": 0.5349369645118713, | |
| "eval_runtime": 11.3477, | |
| "eval_samples_per_second": 33.311, | |
| "eval_steps_per_second": 1.146, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.5754961390446965, | |
| "grad_norm": 0.5782616138458252, | |
| "learning_rate": 7.91669794583671e-06, | |
| "loss": 0.7902, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 0.5775083633070905, | |
| "grad_norm": 0.5419892072677612, | |
| "learning_rate": 7.853651945419155e-06, | |
| "loss": 0.7858, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 0.5795205875694847, | |
| "grad_norm": 0.6611707210540771, | |
| "learning_rate": 7.790695252301938e-06, | |
| "loss": 0.7894, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 0.5795205875694847, | |
| "eval_loss": 0.5343945026397705, | |
| "eval_runtime": 11.4492, | |
| "eval_samples_per_second": 33.015, | |
| "eval_steps_per_second": 1.135, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 0.5815328118318787, | |
| "grad_norm": 0.5788918137550354, | |
| "learning_rate": 7.727830486047288e-06, | |
| "loss": 0.7868, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 0.5835450360942727, | |
| "grad_norm": 0.5480091571807861, | |
| "learning_rate": 7.665060262392461e-06, | |
| "loss": 0.7858, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.5855572603566668, | |
| "grad_norm": 0.730056881904602, | |
| "learning_rate": 7.602387193140887e-06, | |
| "loss": 0.7884, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 0.5855572603566668, | |
| "eval_loss": 0.5339014530181885, | |
| "eval_runtime": 11.3802, | |
| "eval_samples_per_second": 33.216, | |
| "eval_steps_per_second": 1.142, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 0.5875694846190608, | |
| "grad_norm": 0.5774337649345398, | |
| "learning_rate": 7.539813886053502e-06, | |
| "loss": 0.7893, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 0.5895817088814549, | |
| "grad_norm": 0.615470290184021, | |
| "learning_rate": 7.477342944740249e-06, | |
| "loss": 0.7817, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 0.5915939331438489, | |
| "grad_norm": 0.6776989698410034, | |
| "learning_rate": 7.414976968551735e-06, | |
| "loss": 0.7783, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 0.5915939331438489, | |
| "eval_loss": 0.533939003944397, | |
| "eval_runtime": 11.3711, | |
| "eval_samples_per_second": 33.242, | |
| "eval_steps_per_second": 1.143, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 0.5936061574062429, | |
| "grad_norm": 0.5885875821113586, | |
| "learning_rate": 7.352718552471077e-06, | |
| "loss": 0.784, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.595618381668637, | |
| "grad_norm": 0.5772850513458252, | |
| "learning_rate": 7.290570287005931e-06, | |
| "loss": 0.7819, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 0.597630605931031, | |
| "grad_norm": 0.6122897863388062, | |
| "learning_rate": 7.228534758080694e-06, | |
| "loss": 0.7891, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 0.597630605931031, | |
| "eval_loss": 0.5327485799789429, | |
| "eval_runtime": 11.3326, | |
| "eval_samples_per_second": 33.355, | |
| "eval_steps_per_second": 1.147, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 0.599642830193425, | |
| "grad_norm": 0.6210538148880005, | |
| "learning_rate": 7.1666145469289226e-06, | |
| "loss": 0.7832, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 0.6016550544558191, | |
| "grad_norm": 0.593087911605835, | |
| "learning_rate": 7.1048122299859145e-06, | |
| "loss": 0.7888, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 0.6036672787182131, | |
| "grad_norm": 0.5805263519287109, | |
| "learning_rate": 7.043130378781516e-06, | |
| "loss": 0.7825, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.6036672787182131, | |
| "eval_loss": 0.5322030782699585, | |
| "eval_runtime": 11.3763, | |
| "eval_samples_per_second": 33.227, | |
| "eval_steps_per_second": 1.143, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.6056795029806072, | |
| "grad_norm": 0.5463854074478149, | |
| "learning_rate": 6.981571559833122e-06, | |
| "loss": 0.7881, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 0.6076917272430012, | |
| "grad_norm": 0.5730445384979248, | |
| "learning_rate": 6.920138334538878e-06, | |
| "loss": 0.7858, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 0.6097039515053952, | |
| "grad_norm": 0.5871597528457642, | |
| "learning_rate": 6.858833259071108e-06, | |
| "loss": 0.7777, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 0.6097039515053952, | |
| "eval_loss": 0.5328507423400879, | |
| "eval_runtime": 11.3806, | |
| "eval_samples_per_second": 33.215, | |
| "eval_steps_per_second": 1.142, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 0.6117161757677894, | |
| "grad_norm": 0.6252338290214539, | |
| "learning_rate": 6.797658884269962e-06, | |
| "loss": 0.778, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 0.6137284000301834, | |
| "grad_norm": 0.588524580001831, | |
| "learning_rate": 6.736617755537267e-06, | |
| "loss": 0.7772, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.6157406242925774, | |
| "grad_norm": 0.621525228023529, | |
| "learning_rate": 6.675712412730625e-06, | |
| "loss": 0.7832, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 0.6157406242925774, | |
| "eval_loss": 0.5325730443000793, | |
| "eval_runtime": 11.3314, | |
| "eval_samples_per_second": 33.359, | |
| "eval_steps_per_second": 1.147, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 0.6177528485549715, | |
| "grad_norm": 0.5612871646881104, | |
| "learning_rate": 6.614945390057723e-06, | |
| "loss": 0.7831, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 0.6197650728173655, | |
| "grad_norm": 0.5247837901115417, | |
| "learning_rate": 6.554319215970895e-06, | |
| "loss": 0.7828, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 0.6217772970797596, | |
| "grad_norm": 0.5758721232414246, | |
| "learning_rate": 6.493836413061907e-06, | |
| "loss": 0.781, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 0.6217772970797596, | |
| "eval_loss": 0.5314515829086304, | |
| "eval_runtime": 11.3823, | |
| "eval_samples_per_second": 33.21, | |
| "eval_steps_per_second": 1.142, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 0.6237895213421536, | |
| "grad_norm": 0.7134236693382263, | |
| "learning_rate": 6.433499497957006e-06, | |
| "loss": 0.7852, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.6258017456045476, | |
| "grad_norm": 0.5432785153388977, | |
| "learning_rate": 6.373310981212197e-06, | |
| "loss": 0.7776, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 0.6278139698669417, | |
| "grad_norm": 0.6110942959785461, | |
| "learning_rate": 6.3132733672087875e-06, | |
| "loss": 0.787, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 0.6278139698669417, | |
| "eval_loss": 0.5303037166595459, | |
| "eval_runtime": 11.4219, | |
| "eval_samples_per_second": 33.094, | |
| "eval_steps_per_second": 1.138, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 0.6298261941293357, | |
| "grad_norm": 0.5783369541168213, | |
| "learning_rate": 6.253389154049177e-06, | |
| "loss": 0.7807, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 0.6318384183917297, | |
| "grad_norm": 0.5356603860855103, | |
| "learning_rate": 6.19366083345291e-06, | |
| "loss": 0.7801, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 0.6338506426541238, | |
| "grad_norm": 0.5529428124427795, | |
| "learning_rate": 6.134090890653015e-06, | |
| "loss": 0.7774, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.6338506426541238, | |
| "eval_loss": 0.5301904678344727, | |
| "eval_runtime": 11.4476, | |
| "eval_samples_per_second": 33.02, | |
| "eval_steps_per_second": 1.136, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.6358628669165178, | |
| "grad_norm": 0.5553627610206604, | |
| "learning_rate": 6.074681804292581e-06, | |
| "loss": 0.7791, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 0.6378750911789118, | |
| "grad_norm": 0.5281953811645508, | |
| "learning_rate": 6.0154360463216325e-06, | |
| "loss": 0.7769, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 0.639887315441306, | |
| "grad_norm": 0.6406475305557251, | |
| "learning_rate": 5.956356081894259e-06, | |
| "loss": 0.7799, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 0.639887315441306, | |
| "eval_loss": 0.5294053554534912, | |
| "eval_runtime": 11.3422, | |
| "eval_samples_per_second": 33.327, | |
| "eval_steps_per_second": 1.146, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 0.6418995397037, | |
| "grad_norm": 0.49855828285217285, | |
| "learning_rate": 5.897444369266066e-06, | |
| "loss": 0.7759, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 0.6439117639660941, | |
| "grad_norm": 0.5699638724327087, | |
| "learning_rate": 5.838703359691873e-06, | |
| "loss": 0.7673, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.6459239882284881, | |
| "grad_norm": 0.5306676030158997, | |
| "learning_rate": 5.780135497323724e-06, | |
| "loss": 0.7799, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 0.6459239882284881, | |
| "eval_loss": 0.5290261507034302, | |
| "eval_runtime": 11.3435, | |
| "eval_samples_per_second": 33.323, | |
| "eval_steps_per_second": 1.146, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 0.6479362124908821, | |
| "grad_norm": 0.5989037752151489, | |
| "learning_rate": 5.721743219109187e-06, | |
| "loss": 0.7757, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 0.6499484367532762, | |
| "grad_norm": 0.5595914721488953, | |
| "learning_rate": 5.663528954689958e-06, | |
| "loss": 0.7761, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 0.6519606610156702, | |
| "grad_norm": 0.5618345737457275, | |
| "learning_rate": 5.605495126300766e-06, | |
| "loss": 0.779, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 0.6519606610156702, | |
| "eval_loss": 0.529247522354126, | |
| "eval_runtime": 11.3716, | |
| "eval_samples_per_second": 33.241, | |
| "eval_steps_per_second": 1.143, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 0.6539728852780642, | |
| "grad_norm": 0.5271475315093994, | |
| "learning_rate": 5.547644148668585e-06, | |
| "loss": 0.7747, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.6559851095404583, | |
| "grad_norm": 0.5703973770141602, | |
| "learning_rate": 5.489978428912157e-06, | |
| "loss": 0.7801, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 0.6579973338028523, | |
| "grad_norm": 0.570797860622406, | |
| "learning_rate": 5.432500366441843e-06, | |
| "loss": 0.7756, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 0.6579973338028523, | |
| "eval_loss": 0.5275307893753052, | |
| "eval_runtime": 11.3412, | |
| "eval_samples_per_second": 33.33, | |
| "eval_steps_per_second": 1.146, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 0.6600095580652464, | |
| "grad_norm": 0.564414918422699, | |
| "learning_rate": 5.3752123528597746e-06, | |
| "loss": 0.7688, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 0.6620217823276404, | |
| "grad_norm": 0.5405446290969849, | |
| "learning_rate": 5.318116771860351e-06, | |
| "loss": 0.7777, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 0.6640340065900344, | |
| "grad_norm": 0.5645068883895874, | |
| "learning_rate": 5.261215999131055e-06, | |
| "loss": 0.7723, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.6640340065900344, | |
| "eval_loss": 0.5280060172080994, | |
| "eval_runtime": 11.3103, | |
| "eval_samples_per_second": 33.421, | |
| "eval_steps_per_second": 1.149, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.6660462308524285, | |
| "grad_norm": 0.5821409225463867, | |
| "learning_rate": 5.204512402253592e-06, | |
| "loss": 0.7857, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 0.6680584551148225, | |
| "grad_norm": 0.5534176230430603, | |
| "learning_rate": 5.148008340605393e-06, | |
| "loss": 0.7726, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 0.6700706793772165, | |
| "grad_norm": 0.5734113454818726, | |
| "learning_rate": 5.091706165261438e-06, | |
| "loss": 0.7806, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 0.6700706793772165, | |
| "eval_loss": 0.527226984500885, | |
| "eval_runtime": 11.3532, | |
| "eval_samples_per_second": 33.295, | |
| "eval_steps_per_second": 1.145, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 0.6720829036396107, | |
| "grad_norm": 0.5118337273597717, | |
| "learning_rate": 5.035608218896424e-06, | |
| "loss": 0.7794, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 0.6740951279020047, | |
| "grad_norm": 0.520524799823761, | |
| "learning_rate": 4.979716835687296e-06, | |
| "loss": 0.7833, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.6761073521643988, | |
| "grad_norm": 0.5260956883430481, | |
| "learning_rate": 4.924034341216123e-06, | |
| "loss": 0.7722, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 0.6761073521643988, | |
| "eval_loss": 0.5266076326370239, | |
| "eval_runtime": 11.3351, | |
| "eval_samples_per_second": 33.348, | |
| "eval_steps_per_second": 1.147, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 0.6781195764267928, | |
| "grad_norm": 0.5933238863945007, | |
| "learning_rate": 4.868563052373329e-06, | |
| "loss": 0.778, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 0.6801318006891868, | |
| "grad_norm": 0.5882487297058105, | |
| "learning_rate": 4.813305277261294e-06, | |
| "loss": 0.778, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 0.6821440249515809, | |
| "grad_norm": 0.5495398640632629, | |
| "learning_rate": 4.758263315098319e-06, | |
| "loss": 0.7749, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 0.6821440249515809, | |
| "eval_loss": 0.527021050453186, | |
| "eval_runtime": 11.3019, | |
| "eval_samples_per_second": 33.446, | |
| "eval_steps_per_second": 1.15, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 0.6841562492139749, | |
| "grad_norm": 0.5372888445854187, | |
| "learning_rate": 4.703439456122942e-06, | |
| "loss": 0.7726, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.6861684734763689, | |
| "grad_norm": 0.5453928709030151, | |
| "learning_rate": 4.648835981498665e-06, | |
| "loss": 0.7736, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 0.688180697738763, | |
| "grad_norm": 0.534249484539032, | |
| "learning_rate": 4.594455163219025e-06, | |
| "loss": 0.7669, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 0.688180697738763, | |
| "eval_loss": 0.5258325934410095, | |
| "eval_runtime": 11.3315, | |
| "eval_samples_per_second": 33.358, | |
| "eval_steps_per_second": 1.147, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 0.690192922001157, | |
| "grad_norm": 0.602557897567749, | |
| "learning_rate": 4.5402992640130615e-06, | |
| "loss": 0.7776, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 0.6922051462635511, | |
| "grad_norm": 0.6340908408164978, | |
| "learning_rate": 4.486370537251166e-06, | |
| "loss": 0.7724, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 0.6942173705259451, | |
| "grad_norm": 0.5442144870758057, | |
| "learning_rate": 4.43267122685132e-06, | |
| "loss": 0.7678, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.6942173705259451, | |
| "eval_loss": 0.52588951587677, | |
| "eval_runtime": 11.3113, | |
| "eval_samples_per_second": 33.418, | |
| "eval_steps_per_second": 1.149, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.6962295947883391, | |
| "grad_norm": 0.5438702702522278, | |
| "learning_rate": 4.379203567185733e-06, | |
| "loss": 0.7722, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 0.6982418190507332, | |
| "grad_norm": 0.575579822063446, | |
| "learning_rate": 4.325969782987868e-06, | |
| "loss": 0.7806, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 0.7002540433131272, | |
| "grad_norm": 0.53037029504776, | |
| "learning_rate": 4.2729720892598725e-06, | |
| "loss": 0.7677, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 0.7002540433131272, | |
| "eval_loss": 0.5252464413642883, | |
| "eval_runtime": 11.2976, | |
| "eval_samples_per_second": 33.458, | |
| "eval_steps_per_second": 1.151, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 0.7022662675755212, | |
| "grad_norm": 0.5570893883705139, | |
| "learning_rate": 4.220212691180422e-06, | |
| "loss": 0.7674, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 0.7042784918379154, | |
| "grad_norm": 0.564457893371582, | |
| "learning_rate": 4.167693784012948e-06, | |
| "loss": 0.7774, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.7062907161003094, | |
| "grad_norm": 0.6193362474441528, | |
| "learning_rate": 4.115417553014317e-06, | |
| "loss": 0.7739, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 0.7062907161003094, | |
| "eval_loss": 0.5251539349555969, | |
| "eval_runtime": 11.3037, | |
| "eval_samples_per_second": 33.44, | |
| "eval_steps_per_second": 1.15, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 0.7083029403627035, | |
| "grad_norm": 0.5650792121887207, | |
| "learning_rate": 4.063386173343888e-06, | |
| "loss": 0.775, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 0.7103151646250975, | |
| "grad_norm": 0.5598296523094177, | |
| "learning_rate": 4.0116018099730155e-06, | |
| "loss": 0.7736, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 0.7123273888874915, | |
| "grad_norm": 0.5999264717102051, | |
| "learning_rate": 3.960066617594962e-06, | |
| "loss": 0.7728, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 0.7123273888874915, | |
| "eval_loss": 0.5251903533935547, | |
| "eval_runtime": 11.3608, | |
| "eval_samples_per_second": 33.272, | |
| "eval_steps_per_second": 1.144, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 0.7143396131498856, | |
| "grad_norm": 0.5485169291496277, | |
| "learning_rate": 3.908782740535244e-06, | |
| "loss": 0.7663, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.7163518374122796, | |
| "grad_norm": 0.5973437428474426, | |
| "learning_rate": 3.857752312662413e-06, | |
| "loss": 0.7731, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 0.7183640616746736, | |
| "grad_norm": 0.559617280960083, | |
| "learning_rate": 3.8069774572992614e-06, | |
| "loss": 0.7623, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 0.7183640616746736, | |
| "eval_loss": 0.5247710347175598, | |
| "eval_runtime": 11.3529, | |
| "eval_samples_per_second": 33.296, | |
| "eval_steps_per_second": 1.145, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 0.7203762859370677, | |
| "grad_norm": 0.5565606355667114, | |
| "learning_rate": 3.756460287134479e-06, | |
| "loss": 0.7773, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 0.7223885101994617, | |
| "grad_norm": 0.5371571779251099, | |
| "learning_rate": 3.706202904134747e-06, | |
| "loss": 0.7761, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 0.7244007344618558, | |
| "grad_norm": 0.5425861477851868, | |
| "learning_rate": 3.6562073994572624e-06, | |
| "loss": 0.7775, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.7244007344618558, | |
| "eval_loss": 0.5243012309074402, | |
| "eval_runtime": 11.3858, | |
| "eval_samples_per_second": 33.199, | |
| "eval_steps_per_second": 1.142, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.7264129587242498, | |
| "grad_norm": 0.5546737909317017, | |
| "learning_rate": 3.6064758533627496e-06, | |
| "loss": 0.7712, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 0.7284251829866438, | |
| "grad_norm": 0.6678885221481323, | |
| "learning_rate": 3.55701033512889e-06, | |
| "loss": 0.769, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 0.7304374072490379, | |
| "grad_norm": 0.5747791528701782, | |
| "learning_rate": 3.5078129029642192e-06, | |
| "loss": 0.7671, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 0.7304374072490379, | |
| "eval_loss": 0.523876428604126, | |
| "eval_runtime": 11.3643, | |
| "eval_samples_per_second": 33.262, | |
| "eval_steps_per_second": 1.144, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 0.7324496315114319, | |
| "grad_norm": 0.6479108333587646, | |
| "learning_rate": 3.458885603922498e-06, | |
| "loss": 0.7678, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 0.734461855773826, | |
| "grad_norm": 0.5260623693466187, | |
| "learning_rate": 3.4102304738175264e-06, | |
| "loss": 0.7686, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.7364740800362201, | |
| "grad_norm": 0.5565561056137085, | |
| "learning_rate": 3.3618495371384384e-06, | |
| "loss": 0.7722, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 0.7364740800362201, | |
| "eval_loss": 0.5241602659225464, | |
| "eval_runtime": 11.2637, | |
| "eval_samples_per_second": 33.559, | |
| "eval_steps_per_second": 1.154, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 0.7384863042986141, | |
| "grad_norm": 0.5522435307502747, | |
| "learning_rate": 3.3137448069654687e-06, | |
| "loss": 0.7753, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 0.7404985285610082, | |
| "grad_norm": 0.5111953020095825, | |
| "learning_rate": 3.265918284886186e-06, | |
| "loss": 0.7739, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 0.7425107528234022, | |
| "grad_norm": 0.5280485153198242, | |
| "learning_rate": 3.2183719609122146e-06, | |
| "loss": 0.7626, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 0.7425107528234022, | |
| "eval_loss": 0.5227437615394592, | |
| "eval_runtime": 11.3194, | |
| "eval_samples_per_second": 33.394, | |
| "eval_steps_per_second": 1.148, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 0.7445229770857962, | |
| "grad_norm": 0.5183678865432739, | |
| "learning_rate": 3.171107813396418e-06, | |
| "loss": 0.7745, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.7465352013481903, | |
| "grad_norm": 0.5712314248085022, | |
| "learning_rate": 3.124127808950602e-06, | |
| "loss": 0.7711, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 0.7485474256105843, | |
| "grad_norm": 0.5488412380218506, | |
| "learning_rate": 3.0774339023636756e-06, | |
| "loss": 0.7689, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 0.7485474256105843, | |
| "eval_loss": 0.5230608582496643, | |
| "eval_runtime": 11.338, | |
| "eval_samples_per_second": 33.339, | |
| "eval_steps_per_second": 1.147, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 0.7505596498729783, | |
| "grad_norm": 0.5331023335456848, | |
| "learning_rate": 3.0310280365203102e-06, | |
| "loss": 0.7663, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 0.7525718741353724, | |
| "grad_norm": 0.5227448344230652, | |
| "learning_rate": 2.9849121423201054e-06, | |
| "loss": 0.7645, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 0.7545840983977664, | |
| "grad_norm": 0.5383438467979431, | |
| "learning_rate": 2.9390881385972445e-06, | |
| "loss": 0.7624, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.7545840983977664, | |
| "eval_loss": 0.5230525732040405, | |
| "eval_runtime": 11.3076, | |
| "eval_samples_per_second": 33.429, | |
| "eval_steps_per_second": 1.15, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.7565963226601605, | |
| "grad_norm": 0.5267183184623718, | |
| "learning_rate": 2.8935579320406504e-06, | |
| "loss": 0.7744, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 0.7586085469225545, | |
| "grad_norm": 0.5995730757713318, | |
| "learning_rate": 2.8483234171146544e-06, | |
| "loss": 0.77, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 0.7606207711849485, | |
| "grad_norm": 0.5342182517051697, | |
| "learning_rate": 2.803386475980171e-06, | |
| "loss": 0.772, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 0.7606207711849485, | |
| "eval_loss": 0.5222497582435608, | |
| "eval_runtime": 11.6813, | |
| "eval_samples_per_second": 32.36, | |
| "eval_steps_per_second": 1.113, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 0.7626329954473426, | |
| "grad_norm": 0.5149078965187073, | |
| "learning_rate": 2.758748978416369e-06, | |
| "loss": 0.7675, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 0.7646452197097366, | |
| "grad_norm": 0.5688450932502747, | |
| "learning_rate": 2.7144127817428965e-06, | |
| "loss": 0.7655, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.7666574439721306, | |
| "grad_norm": 0.5706648826599121, | |
| "learning_rate": 2.6703797307425792e-06, | |
| "loss": 0.7645, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 0.7666574439721306, | |
| "eval_loss": 0.5218858122825623, | |
| "eval_runtime": 11.6659, | |
| "eval_samples_per_second": 32.402, | |
| "eval_steps_per_second": 1.114, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 0.7686696682345248, | |
| "grad_norm": 0.5271847248077393, | |
| "learning_rate": 2.626651657584672e-06, | |
| "loss": 0.7699, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 0.7706818924969188, | |
| "grad_norm": 0.5311073064804077, | |
| "learning_rate": 2.5832303817486137e-06, | |
| "loss": 0.766, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 0.7726941167593129, | |
| "grad_norm": 0.5762016177177429, | |
| "learning_rate": 2.540117709948332e-06, | |
| "loss": 0.7612, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 0.7726941167593129, | |
| "eval_loss": 0.5214508175849915, | |
| "eval_runtime": 11.4525, | |
| "eval_samples_per_second": 33.006, | |
| "eval_steps_per_second": 1.135, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 0.7747063410217069, | |
| "grad_norm": 0.5659816861152649, | |
| "learning_rate": 2.497315436057064e-06, | |
| "loss": 0.7693, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.7767185652841009, | |
| "grad_norm": 0.530085563659668, | |
| "learning_rate": 2.4548253410327104e-06, | |
| "loss": 0.7598, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 0.778730789546495, | |
| "grad_norm": 0.624070405960083, | |
| "learning_rate": 2.412649192843739e-06, | |
| "loss": 0.7722, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 0.778730789546495, | |
| "eval_loss": 0.5214821100234985, | |
| "eval_runtime": 11.3194, | |
| "eval_samples_per_second": 33.394, | |
| "eval_steps_per_second": 1.148, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 0.780743013808889, | |
| "grad_norm": 0.5348799228668213, | |
| "learning_rate": 2.3707887463956146e-06, | |
| "loss": 0.7615, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 0.782755238071283, | |
| "grad_norm": 0.5490187406539917, | |
| "learning_rate": 2.3292457434577854e-06, | |
| "loss": 0.7714, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 0.7847674623336771, | |
| "grad_norm": 0.5568532943725586, | |
| "learning_rate": 2.2880219125912064e-06, | |
| "loss": 0.7604, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.7847674623336771, | |
| "eval_loss": 0.5214923620223999, | |
| "eval_runtime": 11.3214, | |
| "eval_samples_per_second": 33.388, | |
| "eval_steps_per_second": 1.148, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.7867796865960711, | |
| "grad_norm": 0.5511381030082703, | |
| "learning_rate": 2.2471189690764093e-06, | |
| "loss": 0.7644, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 0.7887919108584652, | |
| "grad_norm": 0.5425460338592529, | |
| "learning_rate": 2.2065386148421486e-06, | |
| "loss": 0.7633, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 0.7908041351208592, | |
| "grad_norm": 0.4867189824581146, | |
| "learning_rate": 2.1662825383945686e-06, | |
| "loss": 0.7674, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 0.7908041351208592, | |
| "eval_loss": 0.5209300518035889, | |
| "eval_runtime": 11.3182, | |
| "eval_samples_per_second": 33.397, | |
| "eval_steps_per_second": 1.149, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 0.7928163593832532, | |
| "grad_norm": 0.5154452919960022, | |
| "learning_rate": 2.1263524147469573e-06, | |
| "loss": 0.7663, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 0.7948285836456473, | |
| "grad_norm": 0.5264437198638916, | |
| "learning_rate": 2.0867499053500473e-06, | |
| "loss": 0.7642, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.7968408079080413, | |
| "grad_norm": 0.5303503274917603, | |
| "learning_rate": 2.047476658022881e-06, | |
| "loss": 0.7722, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 0.7968408079080413, | |
| "eval_loss": 0.5208966135978699, | |
| "eval_runtime": 11.3632, | |
| "eval_samples_per_second": 33.265, | |
| "eval_steps_per_second": 1.144, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 0.7988530321704354, | |
| "grad_norm": 0.5367266535758972, | |
| "learning_rate": 2.0085343068842546e-06, | |
| "loss": 0.753, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 0.8008652564328295, | |
| "grad_norm": 0.5081086754798889, | |
| "learning_rate": 1.9699244722847143e-06, | |
| "loss": 0.7571, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 0.8028774806952235, | |
| "grad_norm": 0.5019336938858032, | |
| "learning_rate": 1.9316487607391465e-06, | |
| "loss": 0.7723, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 0.8028774806952235, | |
| "eval_loss": 0.5206644535064697, | |
| "eval_runtime": 11.3602, | |
| "eval_samples_per_second": 33.274, | |
| "eval_steps_per_second": 1.144, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 0.8048897049576176, | |
| "grad_norm": 0.5184951424598694, | |
| "learning_rate": 1.893708764859924e-06, | |
| "loss": 0.7677, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.8069019292200116, | |
| "grad_norm": 0.5265465974807739, | |
| "learning_rate": 1.8561060632906369e-06, | |
| "loss": 0.7686, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 0.8089141534824056, | |
| "grad_norm": 0.5161654353141785, | |
| "learning_rate": 1.8188422206404165e-06, | |
| "loss": 0.769, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 0.8089141534824056, | |
| "eval_loss": 0.5201809406280518, | |
| "eval_runtime": 11.369, | |
| "eval_samples_per_second": 33.248, | |
| "eval_steps_per_second": 1.143, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 0.8109263777447997, | |
| "grad_norm": 0.5580165982246399, | |
| "learning_rate": 1.7819187874188293e-06, | |
| "loss": 0.7686, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 0.8129386020071937, | |
| "grad_norm": 0.5577532052993774, | |
| "learning_rate": 1.7453372999713557e-06, | |
| "loss": 0.7616, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 0.8149508262695877, | |
| "grad_norm": 0.5307947993278503, | |
| "learning_rate": 1.709099280415476e-06, | |
| "loss": 0.7705, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.8149508262695877, | |
| "eval_loss": 0.5200989842414856, | |
| "eval_runtime": 11.3357, | |
| "eval_samples_per_second": 33.346, | |
| "eval_steps_per_second": 1.147, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.8169630505319818, | |
| "grad_norm": 0.5261068940162659, | |
| "learning_rate": 1.6732062365773272e-06, | |
| "loss": 0.7674, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 0.8189752747943758, | |
| "grad_norm": 0.4946574568748474, | |
| "learning_rate": 1.6376596619289653e-06, | |
| "loss": 0.7654, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 0.8209874990567699, | |
| "grad_norm": 0.5491064786911011, | |
| "learning_rate": 1.6024610355262282e-06, | |
| "loss": 0.7695, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 0.8209874990567699, | |
| "eval_loss": 0.5198547840118408, | |
| "eval_runtime": 11.316, | |
| "eval_samples_per_second": 33.404, | |
| "eval_steps_per_second": 1.149, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 0.8229997233191639, | |
| "grad_norm": 0.5306958556175232, | |
| "learning_rate": 1.5676118219471891e-06, | |
| "loss": 0.7619, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 0.8250119475815579, | |
| "grad_norm": 0.5380471348762512, | |
| "learning_rate": 1.5331134712312235e-06, | |
| "loss": 0.767, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.827024171843952, | |
| "grad_norm": 0.5167573094367981, | |
| "learning_rate": 1.4989674188186598e-06, | |
| "loss": 0.7599, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 0.827024171843952, | |
| "eval_loss": 0.5196862816810608, | |
| "eval_runtime": 11.2973, | |
| "eval_samples_per_second": 33.459, | |
| "eval_steps_per_second": 1.151, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 0.829036396106346, | |
| "grad_norm": 0.5409244894981384, | |
| "learning_rate": 1.4651750854910685e-06, | |
| "loss": 0.7587, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 0.83104862036874, | |
| "grad_norm": 0.5431727170944214, | |
| "learning_rate": 1.4317378773121393e-06, | |
| "loss": 0.7579, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 0.8330608446311342, | |
| "grad_norm": 0.53000807762146, | |
| "learning_rate": 1.3986571855691744e-06, | |
| "loss": 0.7688, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 0.8330608446311342, | |
| "eval_loss": 0.5197826623916626, | |
| "eval_runtime": 11.3928, | |
| "eval_samples_per_second": 33.179, | |
| "eval_steps_per_second": 1.141, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 0.8350730688935282, | |
| "grad_norm": 0.5434339046478271, | |
| "learning_rate": 1.3659343867151975e-06, | |
| "loss": 0.7695, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.8370852931559222, | |
| "grad_norm": 0.5368450284004211, | |
| "learning_rate": 1.3335708423116856e-06, | |
| "loss": 0.7636, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 0.8390975174183163, | |
| "grad_norm": 0.5331200361251831, | |
| "learning_rate": 1.3015678989719116e-06, | |
| "loss": 0.7696, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 0.8390975174183163, | |
| "eval_loss": 0.519400954246521, | |
| "eval_runtime": 11.3064, | |
| "eval_samples_per_second": 33.432, | |
| "eval_steps_per_second": 1.15, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 0.8411097416807103, | |
| "grad_norm": 0.5858904123306274, | |
| "learning_rate": 1.2699268883049154e-06, | |
| "loss": 0.7648, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 0.8431219659431044, | |
| "grad_norm": 0.5302870273590088, | |
| "learning_rate": 1.2386491268600976e-06, | |
| "loss": 0.7553, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 0.8451341902054984, | |
| "grad_norm": 0.4971041679382324, | |
| "learning_rate": 1.2077359160724388e-06, | |
| "loss": 0.7655, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.8451341902054984, | |
| "eval_loss": 0.519396960735321, | |
| "eval_runtime": 11.3912, | |
| "eval_samples_per_second": 33.183, | |
| "eval_steps_per_second": 1.141, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.8471464144678924, | |
| "grad_norm": 0.5351930856704712, | |
| "learning_rate": 1.1771885422083418e-06, | |
| "loss": 0.7603, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 0.8491586387302865, | |
| "grad_norm": 0.4970718026161194, | |
| "learning_rate": 1.1470082763121227e-06, | |
| "loss": 0.7661, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 0.8511708629926805, | |
| "grad_norm": 0.5322678089141846, | |
| "learning_rate": 1.1171963741531178e-06, | |
| "loss": 0.7616, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 0.8511708629926805, | |
| "eval_loss": 0.5193082094192505, | |
| "eval_runtime": 11.3559, | |
| "eval_samples_per_second": 33.287, | |
| "eval_steps_per_second": 1.145, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 0.8531830872550745, | |
| "grad_norm": 0.5380090475082397, | |
| "learning_rate": 1.0877540761734317e-06, | |
| "loss": 0.7623, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 0.8551953115174686, | |
| "grad_norm": 0.5419859290122986, | |
| "learning_rate": 1.0586826074363277e-06, | |
| "loss": 0.761, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.8572075357798626, | |
| "grad_norm": 0.5447313189506531, | |
| "learning_rate": 1.0299831775752478e-06, | |
| "loss": 0.7635, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 0.8572075357798626, | |
| "eval_loss": 0.5189518332481384, | |
| "eval_runtime": 11.3146, | |
| "eval_samples_per_second": 33.408, | |
| "eval_steps_per_second": 1.149, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 0.8592197600422568, | |
| "grad_norm": 0.5054132342338562, | |
| "learning_rate": 1.0016569807434894e-06, | |
| "loss": 0.7553, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 0.8612319843046508, | |
| "grad_norm": 0.5626354217529297, | |
| "learning_rate": 9.737051955645104e-07, | |
| "loss": 0.76, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 0.8632442085670448, | |
| "grad_norm": 0.6139233112335205, | |
| "learning_rate": 9.461289850828936e-07, | |
| "loss": 0.7586, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 0.8632442085670448, | |
| "eval_loss": 0.5188504457473755, | |
| "eval_runtime": 11.3931, | |
| "eval_samples_per_second": 33.178, | |
| "eval_steps_per_second": 1.141, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 0.8652564328294389, | |
| "grad_norm": 0.5168823003768921, | |
| "learning_rate": 9.189294967159457e-07, | |
| "loss": 0.7569, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.8672686570918329, | |
| "grad_norm": 0.5103846192359924, | |
| "learning_rate": 8.921078622059643e-07, | |
| "loss": 0.7598, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 0.8692808813542269, | |
| "grad_norm": 0.5376741290092468, | |
| "learning_rate": 8.656651975731434e-07, | |
| "loss": 0.7687, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 0.8692808813542269, | |
| "eval_loss": 0.5187187790870667, | |
| "eval_runtime": 11.3132, | |
| "eval_samples_per_second": 33.412, | |
| "eval_steps_per_second": 1.149, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 0.871293105616621, | |
| "grad_norm": 0.5139674544334412, | |
| "learning_rate": 8.396026030691329e-07, | |
| "loss": 0.7543, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 0.873305329879015, | |
| "grad_norm": 0.4912608563899994, | |
| "learning_rate": 8.139211631312638e-07, | |
| "loss": 0.759, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 0.8753175541414091, | |
| "grad_norm": 0.5286913514137268, | |
| "learning_rate": 7.886219463374256e-07, | |
| "loss": 0.7579, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.8753175541414091, | |
| "eval_loss": 0.5185059905052185, | |
| "eval_runtime": 11.3249, | |
| "eval_samples_per_second": 33.378, | |
| "eval_steps_per_second": 1.148, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.8773297784038031, | |
| "grad_norm": 0.4960270822048187, | |
| "learning_rate": 7.637060053615963e-07, | |
| "loss": 0.7582, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 0.8793420026661971, | |
| "grad_norm": 0.5134163498878479, | |
| "learning_rate": 7.391743769300541e-07, | |
| "loss": 0.7624, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 0.8813542269285912, | |
| "grad_norm": 0.5594838857650757, | |
| "learning_rate": 7.150280817782296e-07, | |
| "loss": 0.7626, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 0.8813542269285912, | |
| "eval_loss": 0.5184139013290405, | |
| "eval_runtime": 11.3303, | |
| "eval_samples_per_second": 33.362, | |
| "eval_steps_per_second": 1.147, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 0.8833664511909852, | |
| "grad_norm": 0.523009717464447, | |
| "learning_rate": 6.912681246082409e-07, | |
| "loss": 0.7554, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 0.8853786754533792, | |
| "grad_norm": 0.50362229347229, | |
| "learning_rate": 6.678954940470806e-07, | |
| "loss": 0.758, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.8873908997157733, | |
| "grad_norm": 0.5441898107528687, | |
| "learning_rate": 6.449111626054927e-07, | |
| "loss": 0.7573, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 0.8873908997157733, | |
| "eval_loss": 0.5184325575828552, | |
| "eval_runtime": 11.3938, | |
| "eval_samples_per_second": 33.176, | |
| "eval_steps_per_second": 1.141, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 0.8894031239781673, | |
| "grad_norm": 0.520699679851532, | |
| "learning_rate": 6.223160866374967e-07, | |
| "loss": 0.7638, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 0.8914153482405615, | |
| "grad_norm": 0.4745332598686218, | |
| "learning_rate": 6.001112063005998e-07, | |
| "loss": 0.7577, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 0.8934275725029555, | |
| "grad_norm": 0.49645400047302246, | |
| "learning_rate": 5.782974455166767e-07, | |
| "loss": 0.7619, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 0.8934275725029555, | |
| "eval_loss": 0.518170952796936, | |
| "eval_runtime": 11.3133, | |
| "eval_samples_per_second": 33.412, | |
| "eval_steps_per_second": 1.149, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 0.8954397967653495, | |
| "grad_norm": 0.5159271955490112, | |
| "learning_rate": 5.568757119335244e-07, | |
| "loss": 0.7571, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 0.8974520210277436, | |
| "grad_norm": 0.5097435712814331, | |
| "learning_rate": 5.358468968871e-07, | |
| "loss": 0.7697, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 0.8994642452901376, | |
| "grad_norm": 0.5482389330863953, | |
| "learning_rate": 5.152118753644275e-07, | |
| "loss": 0.7682, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 0.8994642452901376, | |
| "eval_loss": 0.5181338787078857, | |
| "eval_runtime": 11.4656, | |
| "eval_samples_per_second": 32.968, | |
| "eval_steps_per_second": 1.134, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 0.9014764695525316, | |
| "grad_norm": 0.5253916382789612, | |
| "learning_rate": 4.949715059671978e-07, | |
| "loss": 0.7656, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 0.9034886938149257, | |
| "grad_norm": 0.4978592097759247, | |
| "learning_rate": 4.7512663087603826e-07, | |
| "loss": 0.7621, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 0.9055009180773197, | |
| "grad_norm": 0.5216113924980164, | |
| "learning_rate": 4.5567807581546664e-07, | |
| "loss": 0.7595, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.9055009180773197, | |
| "eval_loss": 0.5181112885475159, | |
| "eval_runtime": 11.5213, | |
| "eval_samples_per_second": 32.809, | |
| "eval_steps_per_second": 1.128, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.9075131423397138, | |
| "grad_norm": 0.5027504563331604, | |
| "learning_rate": 4.366266500195426e-07, | |
| "loss": 0.7588, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 0.9095253666021078, | |
| "grad_norm": 0.5365561842918396, | |
| "learning_rate": 4.1797314619819285e-07, | |
| "loss": 0.7612, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 0.9115375908645018, | |
| "grad_norm": 0.5316836833953857, | |
| "learning_rate": 3.997183405042238e-07, | |
| "loss": 0.7639, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 0.9115375908645018, | |
| "eval_loss": 0.5180224776268005, | |
| "eval_runtime": 11.5144, | |
| "eval_samples_per_second": 32.828, | |
| "eval_steps_per_second": 1.129, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 0.9135498151268959, | |
| "grad_norm": 0.5350984930992126, | |
| "learning_rate": 3.8186299250103085e-07, | |
| "loss": 0.7582, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 0.9155620393892899, | |
| "grad_norm": 0.5509154796600342, | |
| "learning_rate": 3.644078451309907e-07, | |
| "loss": 0.7686, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 0.9175742636516839, | |
| "grad_norm": 0.5419358611106873, | |
| "learning_rate": 3.47353624684551e-07, | |
| "loss": 0.762, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 0.9175742636516839, | |
| "eval_loss": 0.5179212689399719, | |
| "eval_runtime": 11.4423, | |
| "eval_samples_per_second": 33.035, | |
| "eval_steps_per_second": 1.136, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 0.919586487914078, | |
| "grad_norm": 0.5258903503417969, | |
| "learning_rate": 3.307010407700084e-07, | |
| "loss": 0.7598, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 0.921598712176472, | |
| "grad_norm": 0.519910454750061, | |
| "learning_rate": 3.1445078628398294e-07, | |
| "loss": 0.7589, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 0.9236109364388662, | |
| "grad_norm": 0.5140842795372009, | |
| "learning_rate": 2.986035373825902e-07, | |
| "loss": 0.762, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 0.9236109364388662, | |
| "eval_loss": 0.5178348422050476, | |
| "eval_runtime": 11.4694, | |
| "eval_samples_per_second": 32.957, | |
| "eval_steps_per_second": 1.133, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 0.9256231607012602, | |
| "grad_norm": 0.5274850726127625, | |
| "learning_rate": 2.8315995345329804e-07, | |
| "loss": 0.758, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.9276353849636542, | |
| "grad_norm": 0.5443992018699646, | |
| "learning_rate": 2.681206770875022e-07, | |
| "loss": 0.7614, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 0.9296476092260483, | |
| "grad_norm": 0.5250468254089355, | |
| "learning_rate": 2.5348633405378296e-07, | |
| "loss": 0.7666, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 0.9296476092260483, | |
| "eval_loss": 0.5178038477897644, | |
| "eval_runtime": 11.6986, | |
| "eval_samples_per_second": 32.311, | |
| "eval_steps_per_second": 1.111, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 0.9316598334884423, | |
| "grad_norm": 0.5096211433410645, | |
| "learning_rate": 2.392575332718627e-07, | |
| "loss": 0.7697, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 0.9336720577508363, | |
| "grad_norm": 0.549790620803833, | |
| "learning_rate": 2.2543486678727855e-07, | |
| "loss": 0.7676, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 0.9356842820132304, | |
| "grad_norm": 0.524726152420044, | |
| "learning_rate": 2.120189097467451e-07, | |
| "loss": 0.7673, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 0.9356842820132304, | |
| "eval_loss": 0.5176617503166199, | |
| "eval_runtime": 11.5673, | |
| "eval_samples_per_second": 32.678, | |
| "eval_steps_per_second": 1.124, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 0.9376965062756244, | |
| "grad_norm": 0.5029181838035583, | |
| "learning_rate": 1.9901022037421723e-07, | |
| "loss": 0.7642, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 0.9397087305380185, | |
| "grad_norm": 0.5207979679107666, | |
| "learning_rate": 1.8640933994767073e-07, | |
| "loss": 0.7592, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 0.9417209548004125, | |
| "grad_norm": 0.5468851923942566, | |
| "learning_rate": 1.74216792776577e-07, | |
| "loss": 0.7631, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 0.9417209548004125, | |
| "eval_loss": 0.5177092552185059, | |
| "eval_runtime": 11.4559, | |
| "eval_samples_per_second": 32.996, | |
| "eval_steps_per_second": 1.135, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 0.9437331790628065, | |
| "grad_norm": 0.5044853091239929, | |
| "learning_rate": 1.62433086180086e-07, | |
| "loss": 0.7644, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 0.9457454033252006, | |
| "grad_norm": 0.5245229005813599, | |
| "learning_rate": 1.5105871046592e-07, | |
| "loss": 0.7605, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.9477576275875946, | |
| "grad_norm": 0.49839621782302856, | |
| "learning_rate": 1.400941389099697e-07, | |
| "loss": 0.7565, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 0.9477576275875946, | |
| "eval_loss": 0.5176432132720947, | |
| "eval_runtime": 11.5662, | |
| "eval_samples_per_second": 32.681, | |
| "eval_steps_per_second": 1.124, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 0.9497698518499886, | |
| "grad_norm": 0.4973909556865692, | |
| "learning_rate": 1.2953982773660223e-07, | |
| "loss": 0.7656, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 0.9517820761123827, | |
| "grad_norm": 0.5007102489471436, | |
| "learning_rate": 1.1939621609968088e-07, | |
| "loss": 0.7506, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 0.9537943003747767, | |
| "grad_norm": 0.49358874559402466, | |
| "learning_rate": 1.0966372606428855e-07, | |
| "loss": 0.7562, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 0.9537943003747767, | |
| "eval_loss": 0.5176478624343872, | |
| "eval_runtime": 11.3727, | |
| "eval_samples_per_second": 33.237, | |
| "eval_steps_per_second": 1.143, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 0.9558065246371709, | |
| "grad_norm": 0.5771644115447998, | |
| "learning_rate": 1.0034276258916953e-07, | |
| "loss": 0.766, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 0.9578187488995649, | |
| "grad_norm": 0.5385919213294983, | |
| "learning_rate": 9.14337135098764e-08, | |
| "loss": 0.7605, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 0.9598309731619589, | |
| "grad_norm": 0.5119192004203796, | |
| "learning_rate": 8.293694952263286e-08, | |
| "loss": 0.757, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 0.9598309731619589, | |
| "eval_loss": 0.5176236033439636, | |
| "eval_runtime": 11.3818, | |
| "eval_samples_per_second": 33.211, | |
| "eval_steps_per_second": 1.142, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 0.961843197424353, | |
| "grad_norm": 0.5380053520202637, | |
| "learning_rate": 7.485282416891393e-08, | |
| "loss": 0.7574, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 0.963855421686747, | |
| "grad_norm": 0.5267532467842102, | |
| "learning_rate": 6.718167382072983e-08, | |
| "loss": 0.7668, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 0.965867645949141, | |
| "grad_norm": 0.5199303030967712, | |
| "learning_rate": 5.99238176666328e-08, | |
| "loss": 0.756, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.965867645949141, | |
| "eval_loss": 0.5175907015800476, | |
| "eval_runtime": 11.4752, | |
| "eval_samples_per_second": 32.941, | |
| "eval_steps_per_second": 1.133, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.9678798702115351, | |
| "grad_norm": 0.5405638217926025, | |
| "learning_rate": 5.307955769843443e-08, | |
| "loss": 0.7612, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 0.9698920944739291, | |
| "grad_norm": 0.47063717246055603, | |
| "learning_rate": 4.664917869864338e-08, | |
| "loss": 0.7667, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 0.9719043187363232, | |
| "grad_norm": 0.48465442657470703, | |
| "learning_rate": 4.063294822861163e-08, | |
| "loss": 0.7605, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 0.9719043187363232, | |
| "eval_loss": 0.5175836682319641, | |
| "eval_runtime": 11.3838, | |
| "eval_samples_per_second": 33.205, | |
| "eval_steps_per_second": 1.142, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 0.9739165429987172, | |
| "grad_norm": 0.48423367738723755, | |
| "learning_rate": 3.5031116617404435e-08, | |
| "loss": 0.7574, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 0.9759287672611112, | |
| "grad_norm": 0.5320655107498169, | |
| "learning_rate": 2.9843916951382e-08, | |
| "loss": 0.767, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 0.9779409915235053, | |
| "grad_norm": 0.5267395377159119, | |
| "learning_rate": 2.5071565064506143e-08, | |
| "loss": 0.7593, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 0.9779409915235053, | |
| "eval_loss": 0.5175591707229614, | |
| "eval_runtime": 11.3501, | |
| "eval_samples_per_second": 33.304, | |
| "eval_steps_per_second": 1.145, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 0.9799532157858993, | |
| "grad_norm": 0.514837920665741, | |
| "learning_rate": 2.071425952934969e-08, | |
| "loss": 0.7641, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 0.9819654400482933, | |
| "grad_norm": 0.5345449447631836, | |
| "learning_rate": 1.677218164884753e-08, | |
| "loss": 0.7685, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 0.9839776643106874, | |
| "grad_norm": 0.5339971780776978, | |
| "learning_rate": 1.3245495448739321e-08, | |
| "loss": 0.7612, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 0.9839776643106874, | |
| "eval_loss": 0.5175919532775879, | |
| "eval_runtime": 11.3829, | |
| "eval_samples_per_second": 33.208, | |
| "eval_steps_per_second": 1.142, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 0.9859898885730815, | |
| "grad_norm": 0.49889686703681946, | |
| "learning_rate": 1.013434767075605e-08, | |
| "loss": 0.7692, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.9880021128354756, | |
| "grad_norm": 0.5119482278823853, | |
| "learning_rate": 7.438867766504931e-09, | |
| "loss": 0.7578, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 0.9900143370978696, | |
| "grad_norm": 0.5316244959831238, | |
| "learning_rate": 5.159167892089256e-09, | |
| "loss": 0.7568, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 0.9900143370978696, | |
| "eval_loss": 0.5176030993461609, | |
| "eval_runtime": 11.4046, | |
| "eval_samples_per_second": 33.145, | |
| "eval_steps_per_second": 1.14, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 0.9920265613602636, | |
| "grad_norm": 7.261257648468018, | |
| "learning_rate": 3.2953429034399133e-09, | |
| "loss": 0.7576, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 0.9940387856226577, | |
| "grad_norm": 0.48430758714675903, | |
| "learning_rate": 1.847470352367431e-09, | |
| "loss": 0.7577, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 0.9960510098850517, | |
| "grad_norm": 0.4918181598186493, | |
| "learning_rate": 8.156104833345613e-10, | |
| "loss": 0.7649, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 0.9960510098850517, | |
| "eval_loss": 0.5175663232803345, | |
| "eval_runtime": 11.4598, | |
| "eval_samples_per_second": 32.985, | |
| "eval_steps_per_second": 1.134, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 0.9980632341474457, | |
| "grad_norm": 0.5409220457077026, | |
| "learning_rate": 1.9980623095494645e-10, | |
| "loss": 0.7531, | |
| "step": 49600 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 49697, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.166027593741658e+19, | |
| "train_batch_size": 10, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |