| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.7152317880794703, | |
| "eval_steps": 100, | |
| "global_step": 16400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008278145695364239, | |
| "grad_norm": 2.812814950942993, | |
| "learning_rate": 2.7041942604856512e-06, | |
| "loss": 1.4882, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.016556291390728478, | |
| "grad_norm": 2.6125919818878174, | |
| "learning_rate": 5.463576158940398e-06, | |
| "loss": 1.2862, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.016556291390728478, | |
| "eval_loss": 1.2020893096923828, | |
| "eval_runtime": 1896.4557, | |
| "eval_samples_per_second": 3.185, | |
| "eval_steps_per_second": 3.185, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.024834437086092714, | |
| "grad_norm": 2.393519878387451, | |
| "learning_rate": 8.222958057395145e-06, | |
| "loss": 1.1353, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.033112582781456956, | |
| "grad_norm": 2.5604305267333984, | |
| "learning_rate": 1.0982339955849891e-05, | |
| "loss": 1.0539, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.033112582781456956, | |
| "eval_loss": 1.0201358795166016, | |
| "eval_runtime": 1896.3026, | |
| "eval_samples_per_second": 3.185, | |
| "eval_steps_per_second": 3.185, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.041390728476821195, | |
| "grad_norm": 3.0053958892822266, | |
| "learning_rate": 1.3741721854304637e-05, | |
| "loss": 0.9816, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.04966887417218543, | |
| "grad_norm": 3.028010368347168, | |
| "learning_rate": 1.6501103752759385e-05, | |
| "loss": 0.9372, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.057947019867549666, | |
| "grad_norm": 2.484466552734375, | |
| "learning_rate": 1.926048565121413e-05, | |
| "loss": 0.9137, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.06622516556291391, | |
| "grad_norm": 3.078425884246826, | |
| "learning_rate": 2.2019867549668874e-05, | |
| "loss": 0.8786, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.07450331125827815, | |
| "grad_norm": 2.6672778129577637, | |
| "learning_rate": 2.477924944812362e-05, | |
| "loss": 0.8348, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.08278145695364239, | |
| "grad_norm": 2.8199753761291504, | |
| "learning_rate": 2.753863134657837e-05, | |
| "loss": 0.8417, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.09105960264900662, | |
| "grad_norm": 2.961965322494507, | |
| "learning_rate": 3.0298013245033112e-05, | |
| "loss": 0.8209, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.09933774834437085, | |
| "grad_norm": 2.5217514038085938, | |
| "learning_rate": 3.305739514348786e-05, | |
| "loss": 0.8235, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1076158940397351, | |
| "grad_norm": 2.4740138053894043, | |
| "learning_rate": 3.581677704194261e-05, | |
| "loss": 0.7918, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.11589403973509933, | |
| "grad_norm": 2.5529448986053467, | |
| "learning_rate": 3.8576158940397354e-05, | |
| "loss": 0.7749, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.12417218543046357, | |
| "grad_norm": 2.151698589324951, | |
| "learning_rate": 4.13355408388521e-05, | |
| "loss": 0.7838, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.13245033112582782, | |
| "grad_norm": 1.9026315212249756, | |
| "learning_rate": 4.4094922737306846e-05, | |
| "loss": 0.7668, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.14072847682119205, | |
| "grad_norm": 1.9555529356002808, | |
| "learning_rate": 4.685430463576159e-05, | |
| "loss": 0.7684, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.1490066225165563, | |
| "grad_norm": 2.308894157409668, | |
| "learning_rate": 4.961368653421634e-05, | |
| "loss": 0.7661, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.15728476821192053, | |
| "grad_norm": 2.3362715244293213, | |
| "learning_rate": 5.237306843267108e-05, | |
| "loss": 0.736, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.16556291390728478, | |
| "grad_norm": 1.8228410482406616, | |
| "learning_rate": 5.513245033112583e-05, | |
| "loss": 0.7213, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.173841059602649, | |
| "grad_norm": 1.8289754390716553, | |
| "learning_rate": 5.789183222958058e-05, | |
| "loss": 0.7335, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.18211920529801323, | |
| "grad_norm": 1.4989681243896484, | |
| "learning_rate": 6.065121412803533e-05, | |
| "loss": 0.7326, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.19039735099337748, | |
| "grad_norm": 1.5326098203659058, | |
| "learning_rate": 6.341059602649006e-05, | |
| "loss": 0.7311, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.1986754966887417, | |
| "grad_norm": 1.4897147417068481, | |
| "learning_rate": 6.616997792494481e-05, | |
| "loss": 0.6918, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.20695364238410596, | |
| "grad_norm": 1.634765863418579, | |
| "learning_rate": 6.892935982339957e-05, | |
| "loss": 0.7051, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.2152317880794702, | |
| "grad_norm": 1.4463587999343872, | |
| "learning_rate": 7.168874172185431e-05, | |
| "loss": 0.6955, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.22350993377483444, | |
| "grad_norm": 1.632133960723877, | |
| "learning_rate": 7.444812362030905e-05, | |
| "loss": 0.6901, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.23178807947019867, | |
| "grad_norm": 1.4062328338623047, | |
| "learning_rate": 7.72075055187638e-05, | |
| "loss": 0.6833, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.24006622516556292, | |
| "grad_norm": 1.2914466857910156, | |
| "learning_rate": 7.996688741721855e-05, | |
| "loss": 0.6663, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.24834437086092714, | |
| "grad_norm": 1.4995919466018677, | |
| "learning_rate": 8.272626931567329e-05, | |
| "loss": 0.6959, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.25662251655629137, | |
| "grad_norm": 1.1299749612808228, | |
| "learning_rate": 8.548565121412803e-05, | |
| "loss": 0.6685, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.26490066225165565, | |
| "grad_norm": 1.329004168510437, | |
| "learning_rate": 8.824503311258279e-05, | |
| "loss": 0.6678, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2731788079470199, | |
| "grad_norm": 1.5191948413848877, | |
| "learning_rate": 9.100441501103754e-05, | |
| "loss": 0.6731, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.2814569536423841, | |
| "grad_norm": 1.739169716835022, | |
| "learning_rate": 9.376379690949227e-05, | |
| "loss": 0.6691, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.2897350993377483, | |
| "grad_norm": 1.2906118631362915, | |
| "learning_rate": 9.652317880794703e-05, | |
| "loss": 0.6718, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.2980132450331126, | |
| "grad_norm": 1.289502501487732, | |
| "learning_rate": 9.928256070640178e-05, | |
| "loss": 0.6581, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.30629139072847683, | |
| "grad_norm": 1.3923128843307495, | |
| "learning_rate": 9.999872989402833e-05, | |
| "loss": 0.6589, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.31456953642384106, | |
| "grad_norm": 1.1048816442489624, | |
| "learning_rate": 9.999297790520483e-05, | |
| "loss": 0.6341, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3228476821192053, | |
| "grad_norm": 1.3568603992462158, | |
| "learning_rate": 9.998258777484084e-05, | |
| "loss": 0.6318, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.33112582781456956, | |
| "grad_norm": 0.923786997795105, | |
| "learning_rate": 9.996756046688961e-05, | |
| "loss": 0.6318, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3394039735099338, | |
| "grad_norm": 1.102367877960205, | |
| "learning_rate": 9.994789737552259e-05, | |
| "loss": 0.6193, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.347682119205298, | |
| "grad_norm": 1.0738896131515503, | |
| "learning_rate": 9.992360032500001e-05, | |
| "loss": 0.6184, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.35596026490066224, | |
| "grad_norm": 1.279288649559021, | |
| "learning_rate": 9.98946715695016e-05, | |
| "loss": 0.626, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.36423841059602646, | |
| "grad_norm": 1.2009036540985107, | |
| "learning_rate": 9.986111379291759e-05, | |
| "loss": 0.6305, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.37251655629139074, | |
| "grad_norm": 0.8177038431167603, | |
| "learning_rate": 9.982293010859955e-05, | |
| "loss": 0.6266, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.38079470198675497, | |
| "grad_norm": 1.2464983463287354, | |
| "learning_rate": 9.978012405907165e-05, | |
| "loss": 0.6148, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.3890728476821192, | |
| "grad_norm": 1.2841860055923462, | |
| "learning_rate": 9.973269961570195e-05, | |
| "loss": 0.5946, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.3973509933774834, | |
| "grad_norm": 1.2200431823730469, | |
| "learning_rate": 9.968066117833401e-05, | |
| "loss": 0.6166, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.4056291390728477, | |
| "grad_norm": 1.128247857093811, | |
| "learning_rate": 9.962401357487863e-05, | |
| "loss": 0.5992, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.4139072847682119, | |
| "grad_norm": 1.0683091878890991, | |
| "learning_rate": 9.956276206086597e-05, | |
| "loss": 0.6048, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.42218543046357615, | |
| "grad_norm": 1.1819758415222168, | |
| "learning_rate": 9.949691231895791e-05, | |
| "loss": 0.5944, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.4304635761589404, | |
| "grad_norm": 1.0043411254882812, | |
| "learning_rate": 9.942647045842095e-05, | |
| "loss": 0.5962, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.43874172185430466, | |
| "grad_norm": 1.0588668584823608, | |
| "learning_rate": 9.93514430145593e-05, | |
| "loss": 0.6067, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.4470198675496689, | |
| "grad_norm": 0.9364084601402283, | |
| "learning_rate": 9.927183694810862e-05, | |
| "loss": 0.5928, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.4552980132450331, | |
| "grad_norm": 1.155172348022461, | |
| "learning_rate": 9.918765964459022e-05, | |
| "loss": 0.5987, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.46357615894039733, | |
| "grad_norm": 1.1639224290847778, | |
| "learning_rate": 9.909891891362587e-05, | |
| "loss": 0.5745, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.4718543046357616, | |
| "grad_norm": 0.9658174514770508, | |
| "learning_rate": 9.900562298821323e-05, | |
| "loss": 0.5825, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.48013245033112584, | |
| "grad_norm": 1.118033766746521, | |
| "learning_rate": 9.890778052396205e-05, | |
| "loss": 0.5806, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.48841059602649006, | |
| "grad_norm": 0.9781912565231323, | |
| "learning_rate": 9.880540059829115e-05, | |
| "loss": 0.5712, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.4966887417218543, | |
| "grad_norm": 1.2145684957504272, | |
| "learning_rate": 9.869849270958622e-05, | |
| "loss": 0.5855, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5049668874172185, | |
| "grad_norm": 0.999279260635376, | |
| "learning_rate": 9.858706677631862e-05, | |
| "loss": 0.5843, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.5132450331125827, | |
| "grad_norm": 1.098258137702942, | |
| "learning_rate": 9.847113313612517e-05, | |
| "loss": 0.5605, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.5215231788079471, | |
| "grad_norm": 0.627949059009552, | |
| "learning_rate": 9.835070254484912e-05, | |
| "loss": 0.5538, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.5298013245033113, | |
| "grad_norm": 1.0991902351379395, | |
| "learning_rate": 9.822578617554219e-05, | |
| "loss": 0.5555, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5380794701986755, | |
| "grad_norm": 0.9670843482017517, | |
| "learning_rate": 9.8096395617428e-05, | |
| "loss": 0.5647, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.5463576158940397, | |
| "grad_norm": 0.9838133454322815, | |
| "learning_rate": 9.796254287482693e-05, | |
| "loss": 0.5561, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.554635761589404, | |
| "grad_norm": 1.1465744972229004, | |
| "learning_rate": 9.782424036604234e-05, | |
| "loss": 0.559, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.5629139072847682, | |
| "grad_norm": 1.1423758268356323, | |
| "learning_rate": 9.768150092220849e-05, | |
| "loss": 0.5517, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5711920529801324, | |
| "grad_norm": 1.1365066766738892, | |
| "learning_rate": 9.753433778610008e-05, | |
| "loss": 0.5464, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.5794701986754967, | |
| "grad_norm": 0.81045001745224, | |
| "learning_rate": 9.738276461090371e-05, | |
| "loss": 0.5493, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5877483443708609, | |
| "grad_norm": 1.0236687660217285, | |
| "learning_rate": 9.72267954589511e-05, | |
| "loss": 0.567, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.5960264900662252, | |
| "grad_norm": 0.9495602250099182, | |
| "learning_rate": 9.706644480041455e-05, | |
| "loss": 0.5474, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.6043046357615894, | |
| "grad_norm": 0.960738480091095, | |
| "learning_rate": 9.690172751196437e-05, | |
| "loss": 0.5238, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.6125827814569537, | |
| "grad_norm": 1.0488675832748413, | |
| "learning_rate": 9.67326588753887e-05, | |
| "loss": 0.521, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.6208609271523179, | |
| "grad_norm": 0.8753538727760315, | |
| "learning_rate": 9.65592545761758e-05, | |
| "loss": 0.5232, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.6291390728476821, | |
| "grad_norm": 1.0551217794418335, | |
| "learning_rate": 9.638153070205871e-05, | |
| "loss": 0.5432, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6374172185430463, | |
| "grad_norm": 1.158676028251648, | |
| "learning_rate": 9.619950374152278e-05, | |
| "loss": 0.5416, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.6456953642384106, | |
| "grad_norm": 1.0036752223968506, | |
| "learning_rate": 9.601319058227589e-05, | |
| "loss": 0.5496, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.6539735099337748, | |
| "grad_norm": 0.8905594348907471, | |
| "learning_rate": 9.58226085096817e-05, | |
| "loss": 0.5335, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.6622516556291391, | |
| "grad_norm": 0.9868190884590149, | |
| "learning_rate": 9.562777520515598e-05, | |
| "loss": 0.5094, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6705298013245033, | |
| "grad_norm": 0.9672690629959106, | |
| "learning_rate": 9.542870874452618e-05, | |
| "loss": 0.5061, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.6788079470198676, | |
| "grad_norm": 1.044123888015747, | |
| "learning_rate": 9.52254275963545e-05, | |
| "loss": 0.5253, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.6870860927152318, | |
| "grad_norm": 1.0346958637237549, | |
| "learning_rate": 9.501795062022434e-05, | |
| "loss": 0.5149, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.695364238410596, | |
| "grad_norm": 1.0799248218536377, | |
| "learning_rate": 9.48062970649907e-05, | |
| "loss": 0.5207, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.7036423841059603, | |
| "grad_norm": 0.9847925901412964, | |
| "learning_rate": 9.459048656699427e-05, | |
| "loss": 0.531, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.7119205298013245, | |
| "grad_norm": 1.134179949760437, | |
| "learning_rate": 9.43705391482397e-05, | |
| "loss": 0.5202, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.7201986754966887, | |
| "grad_norm": 0.9750307202339172, | |
| "learning_rate": 9.414647521453798e-05, | |
| "loss": 0.5183, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.7284768211920529, | |
| "grad_norm": 1.372010350227356, | |
| "learning_rate": 9.391831555361341e-05, | |
| "loss": 0.5203, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.7367549668874173, | |
| "grad_norm": 0.9671643376350403, | |
| "learning_rate": 9.36860813331748e-05, | |
| "loss": 0.5313, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.7450331125827815, | |
| "grad_norm": 1.270264983177185, | |
| "learning_rate": 9.344979409895178e-05, | |
| "loss": 0.5236, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.7533112582781457, | |
| "grad_norm": 1.1816293001174927, | |
| "learning_rate": 9.320947577269581e-05, | |
| "loss": 0.518, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.7615894039735099, | |
| "grad_norm": 0.8809527158737183, | |
| "learning_rate": 9.29651486501464e-05, | |
| "loss": 0.5086, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.7698675496688742, | |
| "grad_norm": 0.9570929408073425, | |
| "learning_rate": 9.271683539896257e-05, | |
| "loss": 0.5195, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.7781456953642384, | |
| "grad_norm": 1.147157907485962, | |
| "learning_rate": 9.246455905661983e-05, | |
| "loss": 0.509, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.7864238410596026, | |
| "grad_norm": 0.9548070430755615, | |
| "learning_rate": 9.220834302827295e-05, | |
| "loss": 0.5078, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.7947019867549668, | |
| "grad_norm": 1.0823866128921509, | |
| "learning_rate": 9.194821108458438e-05, | |
| "loss": 0.5088, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.8029801324503312, | |
| "grad_norm": 1.0077965259552002, | |
| "learning_rate": 9.168418735951902e-05, | |
| "loss": 0.4994, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.8112582781456954, | |
| "grad_norm": 1.1390560865402222, | |
| "learning_rate": 9.141629634810516e-05, | |
| "loss": 0.5098, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.8195364238410596, | |
| "grad_norm": 1.1819497346878052, | |
| "learning_rate": 9.114456290416186e-05, | |
| "loss": 0.5012, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.8278145695364238, | |
| "grad_norm": 0.8828374147415161, | |
| "learning_rate": 9.08690122379932e-05, | |
| "loss": 0.4895, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8360927152317881, | |
| "grad_norm": 1.263590693473816, | |
| "learning_rate": 9.058966991404933e-05, | |
| "loss": 0.5088, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.8443708609271523, | |
| "grad_norm": 1.0225087404251099, | |
| "learning_rate": 9.03065618485547e-05, | |
| "loss": 0.5029, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.8526490066225165, | |
| "grad_norm": 1.0702545642852783, | |
| "learning_rate": 9.001971430710368e-05, | |
| "loss": 0.5042, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.8609271523178808, | |
| "grad_norm": 1.1163524389266968, | |
| "learning_rate": 8.972915390222376e-05, | |
| "loss": 0.4973, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.8692052980132451, | |
| "grad_norm": 1.0248407125473022, | |
| "learning_rate": 8.943490759090648e-05, | |
| "loss": 0.4907, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.8774834437086093, | |
| "grad_norm": 1.125511646270752, | |
| "learning_rate": 8.913700267210657e-05, | |
| "loss": 0.4982, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.8857615894039735, | |
| "grad_norm": 1.0324009656906128, | |
| "learning_rate": 8.883546678420917e-05, | |
| "loss": 0.4973, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.8940397350993378, | |
| "grad_norm": 0.9293155670166016, | |
| "learning_rate": 8.853032790246575e-05, | |
| "loss": 0.4938, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.902317880794702, | |
| "grad_norm": 1.0467238426208496, | |
| "learning_rate": 8.822161433639864e-05, | |
| "loss": 0.5071, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.9105960264900662, | |
| "grad_norm": 1.26283597946167, | |
| "learning_rate": 8.790935472717452e-05, | |
| "loss": 0.4943, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.9188741721854304, | |
| "grad_norm": 1.0565682649612427, | |
| "learning_rate": 8.75935780449473e-05, | |
| "loss": 0.4884, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.9271523178807947, | |
| "grad_norm": 1.0465178489685059, | |
| "learning_rate": 8.727431358617042e-05, | |
| "loss": 0.48, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.9354304635761589, | |
| "grad_norm": 0.8586106300354004, | |
| "learning_rate": 8.695159097087872e-05, | |
| "loss": 0.4759, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.9437086092715232, | |
| "grad_norm": 1.185562014579773, | |
| "learning_rate": 8.662544013994054e-05, | |
| "loss": 0.4788, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.9519867549668874, | |
| "grad_norm": 0.8274655938148499, | |
| "learning_rate": 8.62958913522798e-05, | |
| "loss": 0.4831, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.9602649006622517, | |
| "grad_norm": 1.0315494537353516, | |
| "learning_rate": 8.596297518206889e-05, | |
| "loss": 0.4755, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.9685430463576159, | |
| "grad_norm": 1.0181632041931152, | |
| "learning_rate": 8.562672251589188e-05, | |
| "loss": 0.4784, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.9768211920529801, | |
| "grad_norm": 1.0092475414276123, | |
| "learning_rate": 8.528716454987927e-05, | |
| "loss": 0.4638, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.9850993377483444, | |
| "grad_norm": 1.1267504692077637, | |
| "learning_rate": 8.494433278681347e-05, | |
| "loss": 0.4765, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.9933774834437086, | |
| "grad_norm": 1.1000663042068481, | |
| "learning_rate": 8.459825903320628e-05, | |
| "loss": 0.495, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.4767566919326782, | |
| "eval_runtime": 1888.1726, | |
| "eval_samples_per_second": 3.199, | |
| "eval_steps_per_second": 3.199, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.0016556291390728, | |
| "grad_norm": 1.1419239044189453, | |
| "learning_rate": 8.424897539634801e-05, | |
| "loss": 0.4541, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.009933774834437, | |
| "grad_norm": 1.235561728477478, | |
| "learning_rate": 8.389651428132857e-05, | |
| "loss": 0.4453, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.0182119205298013, | |
| "grad_norm": 1.079288363456726, | |
| "learning_rate": 8.354090838803115e-05, | |
| "loss": 0.4274, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.0264900662251655, | |
| "grad_norm": 1.0246189832687378, | |
| "learning_rate": 8.318219070809851e-05, | |
| "loss": 0.4297, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.0347682119205297, | |
| "grad_norm": 1.0732612609863281, | |
| "learning_rate": 8.282039452187206e-05, | |
| "loss": 0.4369, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.0430463576158941, | |
| "grad_norm": 1.0800081491470337, | |
| "learning_rate": 8.245555339530427e-05, | |
| "loss": 0.4245, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.0513245033112584, | |
| "grad_norm": 1.2054654359817505, | |
| "learning_rate": 8.208770117684455e-05, | |
| "loss": 0.4366, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.0596026490066226, | |
| "grad_norm": 0.8937723636627197, | |
| "learning_rate": 8.171687199429901e-05, | |
| "loss": 0.4247, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.0678807947019868, | |
| "grad_norm": 1.1639046669006348, | |
| "learning_rate": 8.13431002516641e-05, | |
| "loss": 0.4334, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.076158940397351, | |
| "grad_norm": 0.9183699488639832, | |
| "learning_rate": 8.096642062593489e-05, | |
| "loss": 0.426, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.0844370860927153, | |
| "grad_norm": 1.05479097366333, | |
| "learning_rate": 8.058686806388772e-05, | |
| "loss": 0.4317, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.0927152317880795, | |
| "grad_norm": 1.0889151096343994, | |
| "learning_rate": 8.020447777883813e-05, | |
| "loss": 0.4214, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.1009933774834437, | |
| "grad_norm": 0.9915281534194946, | |
| "learning_rate": 7.981928524737386e-05, | |
| "loss": 0.437, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.109271523178808, | |
| "grad_norm": 1.159812569618225, | |
| "learning_rate": 7.943132620606341e-05, | |
| "loss": 0.433, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.1175496688741722, | |
| "grad_norm": 1.2697975635528564, | |
| "learning_rate": 7.904063664814065e-05, | |
| "loss": 0.4099, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.1258278145695364, | |
| "grad_norm": 1.1016411781311035, | |
| "learning_rate": 7.86472528201655e-05, | |
| "loss": 0.4172, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.1341059602649006, | |
| "grad_norm": 1.155731439590454, | |
| "learning_rate": 7.825121121866106e-05, | |
| "loss": 0.4262, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.1423841059602649, | |
| "grad_norm": 1.0718499422073364, | |
| "learning_rate": 7.785254858672768e-05, | |
| "loss": 0.4245, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.150662251655629, | |
| "grad_norm": 1.3144177198410034, | |
| "learning_rate": 7.745130191063405e-05, | |
| "loss": 0.4213, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.1589403973509933, | |
| "grad_norm": 1.1578547954559326, | |
| "learning_rate": 7.704750841638581e-05, | |
| "loss": 0.4179, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.1672185430463575, | |
| "grad_norm": 1.2637394666671753, | |
| "learning_rate": 7.664120556627181e-05, | |
| "loss": 0.4392, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.1754966887417218, | |
| "grad_norm": 0.9308638572692871, | |
| "learning_rate": 7.623243105538858e-05, | |
| "loss": 0.4306, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.1837748344370862, | |
| "grad_norm": 1.062219500541687, | |
| "learning_rate": 7.582122280814305e-05, | |
| "loss": 0.4332, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.1920529801324504, | |
| "grad_norm": 1.0162498950958252, | |
| "learning_rate": 7.540761897473421e-05, | |
| "loss": 0.4105, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.2003311258278146, | |
| "grad_norm": 1.3984551429748535, | |
| "learning_rate": 7.499165792761355e-05, | |
| "loss": 0.4216, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.2086092715231789, | |
| "grad_norm": 1.1185648441314697, | |
| "learning_rate": 7.457337825792515e-05, | |
| "loss": 0.4115, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.216887417218543, | |
| "grad_norm": 1.1665916442871094, | |
| "learning_rate": 7.415281877192525e-05, | |
| "loss": 0.4203, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.2251655629139073, | |
| "grad_norm": 1.1253561973571777, | |
| "learning_rate": 7.373001848738202e-05, | |
| "loss": 0.4035, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.2334437086092715, | |
| "grad_norm": 1.0157604217529297, | |
| "learning_rate": 7.330501662995566e-05, | |
| "loss": 0.4215, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.2417218543046358, | |
| "grad_norm": 0.9860717058181763, | |
| "learning_rate": 7.287785262955919e-05, | |
| "loss": 0.4305, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.9860566258430481, | |
| "learning_rate": 7.244856611670025e-05, | |
| "loss": 0.4188, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.2582781456953642, | |
| "grad_norm": 1.1415915489196777, | |
| "learning_rate": 7.201719691880446e-05, | |
| "loss": 0.4273, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.2665562913907285, | |
| "grad_norm": 1.2406401634216309, | |
| "learning_rate": 7.158378505652033e-05, | |
| "loss": 0.4078, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.2748344370860927, | |
| "grad_norm": 1.1644037961959839, | |
| "learning_rate": 7.11483707400063e-05, | |
| "loss": 0.4271, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.283112582781457, | |
| "grad_norm": 1.1102714538574219, | |
| "learning_rate": 7.07109943652002e-05, | |
| "loss": 0.4047, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.2913907284768211, | |
| "grad_norm": 0.9731109142303467, | |
| "learning_rate": 7.027169651007156e-05, | |
| "loss": 0.4123, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.2996688741721854, | |
| "grad_norm": 1.0506606101989746, | |
| "learning_rate": 6.983051793085688e-05, | |
| "loss": 0.4143, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.3079470198675498, | |
| "grad_norm": 1.263192057609558, | |
| "learning_rate": 6.938749955827842e-05, | |
| "loss": 0.4151, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.3162251655629138, | |
| "grad_norm": 0.9878462553024292, | |
| "learning_rate": 6.894268249374689e-05, | |
| "loss": 0.4149, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.3245033112582782, | |
| "grad_norm": 1.1122633218765259, | |
| "learning_rate": 6.84961080055482e-05, | |
| "loss": 0.3948, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.3327814569536423, | |
| "grad_norm": 1.1024750471115112, | |
| "learning_rate": 6.804781752501475e-05, | |
| "loss": 0.4124, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.3410596026490067, | |
| "grad_norm": 1.1353663206100464, | |
| "learning_rate": 6.759785264268154e-05, | |
| "loss": 0.4062, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.349337748344371, | |
| "grad_norm": 1.1376144886016846, | |
| "learning_rate": 6.714625510442773e-05, | |
| "loss": 0.4244, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.3576158940397351, | |
| "grad_norm": 1.0264923572540283, | |
| "learning_rate": 6.669306680760351e-05, | |
| "loss": 0.4061, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.3658940397350994, | |
| "grad_norm": 0.8679251670837402, | |
| "learning_rate": 6.623832979714302e-05, | |
| "loss": 0.4018, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.3741721854304636, | |
| "grad_norm": 1.014840841293335, | |
| "learning_rate": 6.57820862616637e-05, | |
| "loss": 0.4058, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.3824503311258278, | |
| "grad_norm": 0.9928984045982361, | |
| "learning_rate": 6.53243785295521e-05, | |
| "loss": 0.4022, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.390728476821192, | |
| "grad_norm": 1.077910304069519, | |
| "learning_rate": 6.48652490650369e-05, | |
| "loss": 0.4092, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.3990066225165563, | |
| "grad_norm": 1.1118338108062744, | |
| "learning_rate": 6.440474046424923e-05, | |
| "loss": 0.4047, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.4072847682119205, | |
| "grad_norm": 1.0150924921035767, | |
| "learning_rate": 6.394289545127073e-05, | |
| "loss": 0.4052, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.4155629139072847, | |
| "grad_norm": 1.129887342453003, | |
| "learning_rate": 6.34797568741699e-05, | |
| "loss": 0.4124, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.423841059602649, | |
| "grad_norm": 1.0517767667770386, | |
| "learning_rate": 6.30153677010267e-05, | |
| "loss": 0.3971, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.4321192052980132, | |
| "grad_norm": 1.107248067855835, | |
| "learning_rate": 6.254977101594625e-05, | |
| "loss": 0.3871, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.4403973509933774, | |
| "grad_norm": 1.0460799932479858, | |
| "learning_rate": 6.208301001506162e-05, | |
| "loss": 0.3998, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.4486754966887418, | |
| "grad_norm": 1.0304828882217407, | |
| "learning_rate": 6.16151280025263e-05, | |
| "loss": 0.4116, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.4569536423841059, | |
| "grad_norm": 1.321883201599121, | |
| "learning_rate": 6.114616838649656e-05, | |
| "loss": 0.3988, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.4652317880794703, | |
| "grad_norm": 1.1075403690338135, | |
| "learning_rate": 6.067617467510429e-05, | |
| "loss": 0.4081, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.4735099337748343, | |
| "grad_norm": 1.1279836893081665, | |
| "learning_rate": 6.020519047242046e-05, | |
| "loss": 0.4025, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.4817880794701987, | |
| "grad_norm": 1.2322368621826172, | |
| "learning_rate": 5.973325947440972e-05, | |
| "loss": 0.398, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.490066225165563, | |
| "grad_norm": 0.6588311195373535, | |
| "learning_rate": 5.926042546487647e-05, | |
| "loss": 0.4045, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.4983443708609272, | |
| "grad_norm": 0.9893741607666016, | |
| "learning_rate": 5.878673231140279e-05, | |
| "loss": 0.4069, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.5066225165562914, | |
| "grad_norm": 1.0072598457336426, | |
| "learning_rate": 5.831222396127858e-05, | |
| "loss": 0.4099, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.5149006622516556, | |
| "grad_norm": 0.9812730550765991, | |
| "learning_rate": 5.783694443742429e-05, | |
| "loss": 0.3965, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.5231788079470199, | |
| "grad_norm": 1.0679937601089478, | |
| "learning_rate": 5.7360937834306693e-05, | |
| "loss": 0.3856, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.531456953642384, | |
| "grad_norm": 1.1114174127578735, | |
| "learning_rate": 5.688424831384795e-05, | |
| "loss": 0.3869, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.5397350993377483, | |
| "grad_norm": 0.962031900882721, | |
| "learning_rate": 5.640692010132851e-05, | |
| "loss": 0.3917, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.5480132450331126, | |
| "grad_norm": 0.6730946898460388, | |
| "learning_rate": 5.5928997481283976e-05, | |
| "loss": 0.3911, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.5562913907284768, | |
| "grad_norm": 1.235037922859192, | |
| "learning_rate": 5.545052479339662e-05, | |
| "loss": 0.3775, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.564569536423841, | |
| "grad_norm": 0.990074634552002, | |
| "learning_rate": 5.497154642838179e-05, | |
| "loss": 0.396, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.5728476821192054, | |
| "grad_norm": 1.0766572952270508, | |
| "learning_rate": 5.449210682386942e-05, | |
| "loss": 0.3711, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.5811258278145695, | |
| "grad_norm": 0.9285266995429993, | |
| "learning_rate": 5.401225046028131e-05, | |
| "loss": 0.383, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.589403973509934, | |
| "grad_norm": 1.1176483631134033, | |
| "learning_rate": 5.3532021856704504e-05, | |
| "loss": 0.3889, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.597682119205298, | |
| "grad_norm": 1.178252935409546, | |
| "learning_rate": 5.3051465566760895e-05, | |
| "loss": 0.3879, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.6059602649006623, | |
| "grad_norm": 1.2217224836349487, | |
| "learning_rate": 5.2570626174473756e-05, | |
| "loss": 0.384, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.6142384105960264, | |
| "grad_norm": 1.11509370803833, | |
| "learning_rate": 5.208954829013145e-05, | |
| "loss": 0.379, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.6225165562913908, | |
| "grad_norm": 0.9148808121681213, | |
| "learning_rate": 5.1608276546148616e-05, | |
| "loss": 0.3813, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.6307947019867548, | |
| "grad_norm": 0.9564769268035889, | |
| "learning_rate": 5.112685559292542e-05, | |
| "loss": 0.3975, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.6390728476821192, | |
| "grad_norm": 1.0157302618026733, | |
| "learning_rate": 5.064533009470499e-05, | |
| "loss": 0.3754, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.6473509933774835, | |
| "grad_norm": 1.2600700855255127, | |
| "learning_rate": 5.016374472542978e-05, | |
| "loss": 0.3705, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.6556291390728477, | |
| "grad_norm": 1.298558235168457, | |
| "learning_rate": 4.968214416459678e-05, | |
| "loss": 0.3898, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.663907284768212, | |
| "grad_norm": 1.1298922300338745, | |
| "learning_rate": 4.9200573093112384e-05, | |
| "loss": 0.3898, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 1.6721854304635762, | |
| "grad_norm": 0.9608431458473206, | |
| "learning_rate": 4.871907618914714e-05, | |
| "loss": 0.3705, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.6804635761589404, | |
| "grad_norm": 1.0675232410430908, | |
| "learning_rate": 4.823769812399059e-05, | |
| "loss": 0.3881, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 1.6887417218543046, | |
| "grad_norm": 1.101462960243225, | |
| "learning_rate": 4.775648355790691e-05, | |
| "loss": 0.3765, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.6970198675496688, | |
| "grad_norm": 1.147665023803711, | |
| "learning_rate": 4.7275477135991535e-05, | |
| "loss": 0.3836, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.705298013245033, | |
| "grad_norm": 1.0882872343063354, | |
| "learning_rate": 4.679472348402913e-05, | |
| "loss": 0.3841, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.7135761589403975, | |
| "grad_norm": 1.0927859544754028, | |
| "learning_rate": 4.631426720435339e-05, | |
| "loss": 0.3833, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 1.7218543046357615, | |
| "grad_norm": 1.2883074283599854, | |
| "learning_rate": 4.583415287170908e-05, | |
| "loss": 0.3785, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.730132450331126, | |
| "grad_norm": 1.11234450340271, | |
| "learning_rate": 4.535442502911653e-05, | |
| "loss": 0.3805, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 1.73841059602649, | |
| "grad_norm": 1.2244069576263428, | |
| "learning_rate": 4.487512818373906e-05, | |
| "loss": 0.3715, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.7466887417218544, | |
| "grad_norm": 1.1873053312301636, | |
| "learning_rate": 4.439630680275393e-05, | |
| "loss": 0.3939, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 1.7549668874172184, | |
| "grad_norm": 1.0485515594482422, | |
| "learning_rate": 4.391800530922675e-05, | |
| "loss": 0.3776, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.7632450331125828, | |
| "grad_norm": 0.9849908351898193, | |
| "learning_rate": 4.344026807799012e-05, | |
| "loss": 0.3903, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 1.771523178807947, | |
| "grad_norm": 1.3031227588653564, | |
| "learning_rate": 4.296313943152673e-05, | |
| "loss": 0.3715, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.7798013245033113, | |
| "grad_norm": 1.0970308780670166, | |
| "learning_rate": 4.2486663635857286e-05, | |
| "loss": 0.3729, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.7880794701986755, | |
| "grad_norm": 0.9399189949035645, | |
| "learning_rate": 4.201088489643372e-05, | |
| "loss": 0.363, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.7963576158940397, | |
| "grad_norm": 1.198516845703125, | |
| "learning_rate": 4.153584735403795e-05, | |
| "loss": 0.3842, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 1.804635761589404, | |
| "grad_norm": 0.9462034702301025, | |
| "learning_rate": 4.106159508068668e-05, | |
| "loss": 0.369, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.8129139072847682, | |
| "grad_norm": 1.0740764141082764, | |
| "learning_rate": 4.058817207554266e-05, | |
| "loss": 0.354, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 1.8211920529801324, | |
| "grad_norm": 1.0017762184143066, | |
| "learning_rate": 4.011562226083254e-05, | |
| "loss": 0.3705, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.8294701986754967, | |
| "grad_norm": 1.1367080211639404, | |
| "learning_rate": 3.964398947777196e-05, | |
| "loss": 0.3889, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 1.8377483443708609, | |
| "grad_norm": 1.0371805429458618, | |
| "learning_rate": 3.9173317482498176e-05, | |
| "loss": 0.3652, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.846026490066225, | |
| "grad_norm": 1.145815134048462, | |
| "learning_rate": 3.8703649942010535e-05, | |
| "loss": 0.3701, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 1.8543046357615895, | |
| "grad_norm": 1.0999510288238525, | |
| "learning_rate": 3.8235030430119215e-05, | |
| "loss": 0.3573, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.8625827814569536, | |
| "grad_norm": 1.1765943765640259, | |
| "learning_rate": 3.7767502423402645e-05, | |
| "loss": 0.3663, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.870860927152318, | |
| "grad_norm": 1.0484038591384888, | |
| "learning_rate": 3.730110929717393e-05, | |
| "loss": 0.3665, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.879139072847682, | |
| "grad_norm": 0.9852702021598816, | |
| "learning_rate": 3.6835894321456655e-05, | |
| "loss": 0.3674, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 1.8874172185430464, | |
| "grad_norm": 0.5963965654373169, | |
| "learning_rate": 3.6371900656970446e-05, | |
| "loss": 0.3527, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.8956953642384105, | |
| "grad_norm": 0.9792616367340088, | |
| "learning_rate": 3.59091713511268e-05, | |
| "loss": 0.3624, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 1.903973509933775, | |
| "grad_norm": 1.276879072189331, | |
| "learning_rate": 3.5447749334035205e-05, | |
| "loss": 0.3653, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.9122516556291391, | |
| "grad_norm": 1.0514193773269653, | |
| "learning_rate": 3.498767741452028e-05, | |
| "loss": 0.3635, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 1.9205298013245033, | |
| "grad_norm": 1.130402684211731, | |
| "learning_rate": 3.452899827615026e-05, | |
| "loss": 0.3629, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.9288079470198676, | |
| "grad_norm": 0.9695820212364197, | |
| "learning_rate": 3.407175447327685e-05, | |
| "loss": 0.3649, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 1.9370860927152318, | |
| "grad_norm": 1.0880156755447388, | |
| "learning_rate": 3.361598842708727e-05, | |
| "loss": 0.3618, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.945364238410596, | |
| "grad_norm": 1.1198087930679321, | |
| "learning_rate": 3.3161742421668654e-05, | |
| "loss": 0.3672, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 1.9536423841059603, | |
| "grad_norm": 0.8677820563316345, | |
| "learning_rate": 3.2709058600084964e-05, | |
| "loss": 0.3551, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.9619205298013245, | |
| "grad_norm": 0.8376750349998474, | |
| "learning_rate": 3.225797896046724e-05, | |
| "loss": 0.3709, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 1.9701986754966887, | |
| "grad_norm": 1.0590113401412964, | |
| "learning_rate": 3.180854535211721e-05, | |
| "loss": 0.3617, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.978476821192053, | |
| "grad_norm": 1.0090305805206299, | |
| "learning_rate": 3.136079947162456e-05, | |
| "loss": 0.3584, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 1.9867549668874172, | |
| "grad_norm": 0.9464877843856812, | |
| "learning_rate": 3.091478285899862e-05, | |
| "loss": 0.3612, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.9950331125827816, | |
| "grad_norm": 1.2669893503189087, | |
| "learning_rate": 3.0470536893814385e-05, | |
| "loss": 0.3633, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.38875386118888855, | |
| "eval_runtime": 1884.7691, | |
| "eval_samples_per_second": 3.205, | |
| "eval_steps_per_second": 3.205, | |
| "step": 12080 | |
| }, | |
| { | |
| "epoch": 2.0033112582781456, | |
| "grad_norm": 1.1019831895828247, | |
| "learning_rate": 3.0028102791373535e-05, | |
| "loss": 0.3221, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 2.01158940397351, | |
| "grad_norm": 1.095613956451416, | |
| "learning_rate": 2.9587521598880573e-05, | |
| "loss": 0.281, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 2.019867549668874, | |
| "grad_norm": 1.1338448524475098, | |
| "learning_rate": 2.914883419163475e-05, | |
| "loss": 0.2824, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 2.0281456953642385, | |
| "grad_norm": 1.1247187852859497, | |
| "learning_rate": 2.871208126923771e-05, | |
| "loss": 0.2986, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 2.0364238410596025, | |
| "grad_norm": 1.0773439407348633, | |
| "learning_rate": 2.827730335181765e-05, | |
| "loss": 0.2852, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 2.044701986754967, | |
| "grad_norm": 1.343947410583496, | |
| "learning_rate": 2.7844540776269924e-05, | |
| "loss": 0.2905, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 2.052980132450331, | |
| "grad_norm": 0.9737703800201416, | |
| "learning_rate": 2.7413833692514844e-05, | |
| "loss": 0.2791, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 2.0612582781456954, | |
| "grad_norm": 1.0275969505310059, | |
| "learning_rate": 2.698522205977273e-05, | |
| "loss": 0.2887, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 2.0695364238410594, | |
| "grad_norm": 1.1896377801895142, | |
| "learning_rate": 2.655874564285656e-05, | |
| "loss": 0.2845, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.077814569536424, | |
| "grad_norm": 1.0280028581619263, | |
| "learning_rate": 2.613444400848287e-05, | |
| "loss": 0.284, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 2.0860927152317883, | |
| "grad_norm": 1.2673025131225586, | |
| "learning_rate": 2.571235652160091e-05, | |
| "loss": 0.2852, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 2.0943708609271523, | |
| "grad_norm": 1.076798915863037, | |
| "learning_rate": 2.529252234174041e-05, | |
| "loss": 0.2893, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 2.1026490066225167, | |
| "grad_norm": 0.7327485084533691, | |
| "learning_rate": 2.4874980419378647e-05, | |
| "loss": 0.2837, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 2.1109271523178808, | |
| "grad_norm": 1.2723850011825562, | |
| "learning_rate": 2.445976949232676e-05, | |
| "loss": 0.2859, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 2.119205298013245, | |
| "grad_norm": 1.1690808534622192, | |
| "learning_rate": 2.4046928082135733e-05, | |
| "loss": 0.2841, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.127483443708609, | |
| "grad_norm": 1.2063932418823242, | |
| "learning_rate": 2.3636494490522624e-05, | |
| "loss": 0.2772, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 2.1357615894039736, | |
| "grad_norm": 1.2601985931396484, | |
| "learning_rate": 2.3228506795817072e-05, | |
| "loss": 0.2829, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 2.1440397350993377, | |
| "grad_norm": 1.1223324537277222, | |
| "learning_rate": 2.282300284942846e-05, | |
| "loss": 0.293, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 2.152317880794702, | |
| "grad_norm": 1.2425414323806763, | |
| "learning_rate": 2.2420020272334337e-05, | |
| "loss": 0.2946, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.160596026490066, | |
| "grad_norm": 1.176163673400879, | |
| "learning_rate": 2.2019596451590047e-05, | |
| "loss": 0.2888, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 2.1688741721854305, | |
| "grad_norm": 1.073018193244934, | |
| "learning_rate": 2.162176853686006e-05, | |
| "loss": 0.2921, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 2.1771523178807946, | |
| "grad_norm": 1.3194624185562134, | |
| "learning_rate": 2.1226573436971487e-05, | |
| "loss": 0.3014, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 2.185430463576159, | |
| "grad_norm": 1.3240703344345093, | |
| "learning_rate": 2.0834047816489772e-05, | |
| "loss": 0.2828, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 2.193708609271523, | |
| "grad_norm": 1.529966950416565, | |
| "learning_rate": 2.0444228092317057e-05, | |
| "loss": 0.2946, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 2.2019867549668874, | |
| "grad_norm": 1.0397886037826538, | |
| "learning_rate": 2.005715043031369e-05, | |
| "loss": 0.2905, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 2.2102649006622515, | |
| "grad_norm": 0.8956925868988037, | |
| "learning_rate": 1.967285074194283e-05, | |
| "loss": 0.2847, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 2.218543046357616, | |
| "grad_norm": 1.2051565647125244, | |
| "learning_rate": 1.9291364680938688e-05, | |
| "loss": 0.2709, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 2.22682119205298, | |
| "grad_norm": 1.2182625532150269, | |
| "learning_rate": 1.891272763999884e-05, | |
| "loss": 0.2754, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 2.2350993377483444, | |
| "grad_norm": 1.2709006071090698, | |
| "learning_rate": 1.8536974747500556e-05, | |
| "loss": 0.278, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.243377483443709, | |
| "grad_norm": 1.142155647277832, | |
| "learning_rate": 1.8164140864241723e-05, | |
| "loss": 0.2863, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 2.251655629139073, | |
| "grad_norm": 1.218385100364685, | |
| "learning_rate": 1.7794260580206673e-05, | |
| "loss": 0.282, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.2599337748344372, | |
| "grad_norm": 0.6981252431869507, | |
| "learning_rate": 1.742736821135702e-05, | |
| "loss": 0.2756, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 2.2682119205298013, | |
| "grad_norm": 1.193818211555481, | |
| "learning_rate": 1.7063497796447935e-05, | |
| "loss": 0.2711, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 2.2764900662251657, | |
| "grad_norm": 1.1021385192871094, | |
| "learning_rate": 1.670268309387029e-05, | |
| "loss": 0.2723, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 2.2847682119205297, | |
| "grad_norm": 1.4959111213684082, | |
| "learning_rate": 1.634495757851855e-05, | |
| "loss": 0.2821, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 2.293046357615894, | |
| "grad_norm": 1.3577196598052979, | |
| "learning_rate": 1.599035443868518e-05, | |
| "loss": 0.2701, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 2.301324503311258, | |
| "grad_norm": 1.391587495803833, | |
| "learning_rate": 1.5638906572981604e-05, | |
| "loss": 0.2809, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 2.3096026490066226, | |
| "grad_norm": 1.1502324342727661, | |
| "learning_rate": 1.529064658728598e-05, | |
| "loss": 0.2847, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 2.3178807947019866, | |
| "grad_norm": 1.315737009048462, | |
| "learning_rate": 1.4945606791718092e-05, | |
| "loss": 0.2772, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.326158940397351, | |
| "grad_norm": 1.5041948556900024, | |
| "learning_rate": 1.4603819197641883e-05, | |
| "loss": 0.2798, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 2.334437086092715, | |
| "grad_norm": 1.1226636171340942, | |
| "learning_rate": 1.4265315514695488e-05, | |
| "loss": 0.2712, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 2.3427152317880795, | |
| "grad_norm": 0.982099175453186, | |
| "learning_rate": 1.3930127147849314e-05, | |
| "loss": 0.2805, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 2.3509933774834435, | |
| "grad_norm": 1.165469765663147, | |
| "learning_rate": 1.3598285194492521e-05, | |
| "loss": 0.2882, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 2.359271523178808, | |
| "grad_norm": 1.305770754814148, | |
| "learning_rate": 1.326982044154787e-05, | |
| "loss": 0.2837, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 2.3675496688741724, | |
| "grad_norm": 1.5525685548782349, | |
| "learning_rate": 1.2944763362615413e-05, | |
| "loss": 0.2742, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 2.3758278145695364, | |
| "grad_norm": 1.0929417610168457, | |
| "learning_rate": 1.2623144115145342e-05, | |
| "loss": 0.2698, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 2.384105960264901, | |
| "grad_norm": 1.1220225095748901, | |
| "learning_rate": 1.2304992537640092e-05, | |
| "loss": 0.2764, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.392384105960265, | |
| "grad_norm": 1.229873776435852, | |
| "learning_rate": 1.1990338146885977e-05, | |
| "loss": 0.2729, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 2.4006622516556293, | |
| "grad_norm": 1.4030100107192993, | |
| "learning_rate": 1.1679210135214858e-05, | |
| "loss": 0.2764, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.4089403973509933, | |
| "grad_norm": 1.1974605321884155, | |
| "learning_rate": 1.1371637367795735e-05, | |
| "loss": 0.2718, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 2.4172185430463577, | |
| "grad_norm": 1.3544508218765259, | |
| "learning_rate": 1.1067648379956714e-05, | |
| "loss": 0.288, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 2.4254966887417218, | |
| "grad_norm": 1.3159441947937012, | |
| "learning_rate": 1.0767271374537724e-05, | |
| "loss": 0.27, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 2.433774834437086, | |
| "grad_norm": 1.531108021736145, | |
| "learning_rate": 1.0470534219273903e-05, | |
| "loss": 0.2764, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 2.44205298013245, | |
| "grad_norm": 1.3860596418380737, | |
| "learning_rate": 1.0177464444210133e-05, | |
| "loss": 0.2685, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 2.4503311258278146, | |
| "grad_norm": 1.0219587087631226, | |
| "learning_rate": 9.888089239146963e-06, | |
| "loss": 0.2882, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.4586092715231787, | |
| "grad_norm": 1.3057914972305298, | |
| "learning_rate": 9.602435451118047e-06, | |
| "loss": 0.288, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 2.466887417218543, | |
| "grad_norm": 1.2784461975097656, | |
| "learning_rate": 9.320529581899335e-06, | |
| "loss": 0.2797, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 2.475165562913907, | |
| "grad_norm": 1.4044731855392456, | |
| "learning_rate": 9.042397785550405e-06, | |
| "loss": 0.2812, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 2.4834437086092715, | |
| "grad_norm": 1.240498661994934, | |
| "learning_rate": 8.768065865987995e-06, | |
| "loss": 0.2719, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.491721854304636, | |
| "grad_norm": 1.1060854196548462, | |
| "learning_rate": 8.49755927459196e-06, | |
| "loss": 0.2862, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 1.4075932502746582, | |
| "learning_rate": 8.230903107844078e-06, | |
| "loss": 0.2622, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 2.508278145695364, | |
| "grad_norm": 1.277173399925232, | |
| "learning_rate": 7.968122104999676e-06, | |
| "loss": 0.2735, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 2.5165562913907285, | |
| "grad_norm": 1.4734828472137451, | |
| "learning_rate": 7.70924064579236e-06, | |
| "loss": 0.2857, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.524834437086093, | |
| "grad_norm": 1.183908224105835, | |
| "learning_rate": 7.454282748172281e-06, | |
| "loss": 0.2647, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 2.533112582781457, | |
| "grad_norm": 1.2471554279327393, | |
| "learning_rate": 7.2032720660777706e-06, | |
| "loss": 0.272, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 2.5413907284768213, | |
| "grad_norm": 1.2211905717849731, | |
| "learning_rate": 6.95623188724081e-06, | |
| "loss": 0.2815, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 2.5496688741721854, | |
| "grad_norm": 1.453582525253296, | |
| "learning_rate": 6.713185131026567e-06, | |
| "loss": 0.2808, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 2.55794701986755, | |
| "grad_norm": 1.27803635597229, | |
| "learning_rate": 6.474154346306999e-06, | |
| "loss": 0.2719, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 2.566225165562914, | |
| "grad_norm": 1.2783340215682983, | |
| "learning_rate": 6.239161709368774e-06, | |
| "loss": 0.2743, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.5745033112582782, | |
| "grad_norm": 1.371692180633545, | |
| "learning_rate": 6.00822902185601e-06, | |
| "loss": 0.2817, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 2.5827814569536423, | |
| "grad_norm": 1.6115282773971558, | |
| "learning_rate": 5.781377708747493e-06, | |
| "loss": 0.2665, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.5910596026490067, | |
| "grad_norm": 1.3451199531555176, | |
| "learning_rate": 5.558628816368972e-06, | |
| "loss": 0.2767, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 2.5993377483443707, | |
| "grad_norm": 1.2596265077590942, | |
| "learning_rate": 5.340003010440603e-06, | |
| "loss": 0.2721, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 2.607615894039735, | |
| "grad_norm": 1.3860641717910767, | |
| "learning_rate": 5.125520574159654e-06, | |
| "loss": 0.2797, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 2.6158940397350996, | |
| "grad_norm": 1.2152941226959229, | |
| "learning_rate": 4.915201406318676e-06, | |
| "loss": 0.2669, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 2.6241721854304636, | |
| "grad_norm": 1.2556190490722656, | |
| "learning_rate": 4.7090650194594465e-06, | |
| "loss": 0.2756, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 2.6324503311258276, | |
| "grad_norm": 1.2963857650756836, | |
| "learning_rate": 4.50713053806262e-06, | |
| "loss": 0.2737, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 2.640728476821192, | |
| "grad_norm": 1.0009130239486694, | |
| "learning_rate": 4.309416696773455e-06, | |
| "loss": 0.27, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 2.6490066225165565, | |
| "grad_norm": 1.1450060606002808, | |
| "learning_rate": 4.1159418386636895e-06, | |
| "loss": 0.2659, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.6572847682119205, | |
| "grad_norm": 1.2123562097549438, | |
| "learning_rate": 3.926723913529773e-06, | |
| "loss": 0.2739, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 2.6655629139072845, | |
| "grad_norm": 1.3714101314544678, | |
| "learning_rate": 3.7417804762274968e-06, | |
| "loss": 0.2801, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 2.673841059602649, | |
| "grad_norm": 1.3810080289840698, | |
| "learning_rate": 3.5611286850433967e-06, | |
| "loss": 0.2646, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 2.6821192052980134, | |
| "grad_norm": 1.0282268524169922, | |
| "learning_rate": 3.3847853001028495e-06, | |
| "loss": 0.2699, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 2.6903973509933774, | |
| "grad_norm": 1.1786317825317383, | |
| "learning_rate": 3.2127666818151046e-06, | |
| "loss": 0.2717, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 2.698675496688742, | |
| "grad_norm": 1.0810590982437134, | |
| "learning_rate": 3.045088789355488e-06, | |
| "loss": 0.277, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 2.706953642384106, | |
| "grad_norm": 1.194557785987854, | |
| "learning_rate": 2.8817671791847634e-06, | |
| "loss": 0.2652, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 2.7152317880794703, | |
| "grad_norm": 1.1923502683639526, | |
| "learning_rate": 2.7228170036058153e-06, | |
| "loss": 0.2617, | |
| "step": 16400 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 18120, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2874605314388787e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |