{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 421, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023788284269997025, "grad_norm": 1.0281301736831665, "learning_rate": 0.0, "loss": 2.0766, "step": 1 }, { "epoch": 0.004757656853999405, "grad_norm": 1.2767698764801025, "learning_rate": 1e-05, "loss": 2.3525, "step": 2 }, { "epoch": 0.007136485280999108, "grad_norm": 1.3647927045822144, "learning_rate": 2e-05, "loss": 2.4261, "step": 3 }, { "epoch": 0.00951531370799881, "grad_norm": 0.8589039444923401, "learning_rate": 3e-05, "loss": 1.7271, "step": 4 }, { "epoch": 0.011894142134998514, "grad_norm": 1.1814510822296143, "learning_rate": 4e-05, "loss": 2.1379, "step": 5 }, { "epoch": 0.014272970561998216, "grad_norm": 1.012987732887268, "learning_rate": 5e-05, "loss": 1.8598, "step": 6 }, { "epoch": 0.016651798988997917, "grad_norm": 1.052037239074707, "learning_rate": 6e-05, "loss": 1.9315, "step": 7 }, { "epoch": 0.01903062741599762, "grad_norm": 0.8621532320976257, "learning_rate": 7e-05, "loss": 1.7205, "step": 8 }, { "epoch": 0.021409455842997322, "grad_norm": 0.7924615740776062, "learning_rate": 8e-05, "loss": 1.589, "step": 9 }, { "epoch": 0.023788284269997028, "grad_norm": 0.6322035789489746, "learning_rate": 9e-05, "loss": 1.3374, "step": 10 }, { "epoch": 0.02616711269699673, "grad_norm": 0.7854194641113281, "learning_rate": 0.0001, "loss": 1.4292, "step": 11 }, { "epoch": 0.028545941123996433, "grad_norm": 0.6697576642036438, "learning_rate": 9.975669099756692e-05, "loss": 1.206, "step": 12 }, { "epoch": 0.030924769550996135, "grad_norm": 0.7266756296157837, "learning_rate": 9.951338199513383e-05, "loss": 1.2737, "step": 13 }, { "epoch": 0.033303597977995834, "grad_norm": 0.7196414470672607, "learning_rate": 9.927007299270074e-05, "loss": 1.2023, "step": 14 }, { "epoch": 0.03568242640499554, "grad_norm": 0.7868577241897583, "learning_rate": 9.902676399026765e-05, "loss": 1.1486, "step": 15 }, { "epoch": 0.03806125483199524, "grad_norm": 0.7759497761726379, "learning_rate": 9.878345498783455e-05, "loss": 1.112, "step": 16 }, { "epoch": 0.040440083258994945, "grad_norm": 0.7133831977844238, "learning_rate": 9.854014598540146e-05, "loss": 1.0158, "step": 17 }, { "epoch": 0.042818911685994644, "grad_norm": 0.644658088684082, "learning_rate": 9.829683698296837e-05, "loss": 0.9656, "step": 18 }, { "epoch": 0.04519774011299435, "grad_norm": 0.6295652985572815, "learning_rate": 9.805352798053529e-05, "loss": 0.9295, "step": 19 }, { "epoch": 0.047576568539994056, "grad_norm": 0.6460481882095337, "learning_rate": 9.78102189781022e-05, "loss": 0.9851, "step": 20 }, { "epoch": 0.049955396966993755, "grad_norm": 0.5485146045684814, "learning_rate": 9.756690997566911e-05, "loss": 0.864, "step": 21 }, { "epoch": 0.05233422539399346, "grad_norm": 0.6552495360374451, "learning_rate": 9.732360097323602e-05, "loss": 0.7903, "step": 22 }, { "epoch": 0.05471305382099316, "grad_norm": 0.5481061935424805, "learning_rate": 9.708029197080293e-05, "loss": 0.843, "step": 23 }, { "epoch": 0.057091882247992866, "grad_norm": 0.6262676119804382, "learning_rate": 9.683698296836983e-05, "loss": 0.9421, "step": 24 }, { "epoch": 0.059470710674992565, "grad_norm": 0.5493852496147156, "learning_rate": 9.659367396593674e-05, "loss": 0.9045, "step": 25 }, { "epoch": 0.06184953910199227, "grad_norm": 0.542617678642273, "learning_rate": 9.635036496350366e-05, "loss": 0.8028, "step": 26 }, { "epoch": 0.06422836752899197, "grad_norm": 0.4842965602874756, "learning_rate": 9.610705596107057e-05, "loss": 0.716, "step": 27 }, { "epoch": 0.06660719595599167, "grad_norm": 0.5457621216773987, "learning_rate": 9.586374695863748e-05, "loss": 0.7069, "step": 28 }, { "epoch": 0.06898602438299138, "grad_norm": 0.5163421034812927, "learning_rate": 9.562043795620439e-05, "loss": 0.6999, "step": 29 }, { "epoch": 0.07136485280999108, "grad_norm": 0.4519389569759369, "learning_rate": 9.537712895377129e-05, "loss": 0.6219, "step": 30 }, { "epoch": 0.07374368123699078, "grad_norm": 0.5117298364639282, "learning_rate": 9.51338199513382e-05, "loss": 0.597, "step": 31 }, { "epoch": 0.07612250966399048, "grad_norm": 0.5518686771392822, "learning_rate": 9.489051094890511e-05, "loss": 0.6789, "step": 32 }, { "epoch": 0.07850133809099019, "grad_norm": 0.5353817939758301, "learning_rate": 9.464720194647201e-05, "loss": 0.6493, "step": 33 }, { "epoch": 0.08088016651798989, "grad_norm": 0.48304057121276855, "learning_rate": 9.440389294403893e-05, "loss": 0.5638, "step": 34 }, { "epoch": 0.08325899494498959, "grad_norm": 0.42356157302856445, "learning_rate": 9.416058394160584e-05, "loss": 0.5589, "step": 35 }, { "epoch": 0.08563782337198929, "grad_norm": 0.39856433868408203, "learning_rate": 9.391727493917275e-05, "loss": 0.5208, "step": 36 }, { "epoch": 0.088016651798989, "grad_norm": 0.4774276912212372, "learning_rate": 9.367396593673966e-05, "loss": 0.6367, "step": 37 }, { "epoch": 0.0903954802259887, "grad_norm": 0.48944222927093506, "learning_rate": 9.343065693430657e-05, "loss": 0.6261, "step": 38 }, { "epoch": 0.0927743086529884, "grad_norm": 0.4067060053348541, "learning_rate": 9.318734793187348e-05, "loss": 0.5512, "step": 39 }, { "epoch": 0.09515313707998811, "grad_norm": 0.4157446622848511, "learning_rate": 9.29440389294404e-05, "loss": 0.5823, "step": 40 }, { "epoch": 0.09753196550698781, "grad_norm": 0.43023228645324707, "learning_rate": 9.27007299270073e-05, "loss": 0.5499, "step": 41 }, { "epoch": 0.09991079393398751, "grad_norm": 0.40139999985694885, "learning_rate": 9.245742092457421e-05, "loss": 0.5808, "step": 42 }, { "epoch": 0.10228962236098721, "grad_norm": 0.4564228653907776, "learning_rate": 9.221411192214112e-05, "loss": 0.652, "step": 43 }, { "epoch": 0.10466845078798692, "grad_norm": 0.40338802337646484, "learning_rate": 9.197080291970803e-05, "loss": 0.5065, "step": 44 }, { "epoch": 0.10704727921498662, "grad_norm": 0.5054526925086975, "learning_rate": 9.172749391727494e-05, "loss": 0.6194, "step": 45 }, { "epoch": 0.10942610764198632, "grad_norm": 0.44123998284339905, "learning_rate": 9.148418491484186e-05, "loss": 0.5657, "step": 46 }, { "epoch": 0.11180493606898602, "grad_norm": 0.4445621371269226, "learning_rate": 9.124087591240877e-05, "loss": 0.5808, "step": 47 }, { "epoch": 0.11418376449598573, "grad_norm": 0.3913380801677704, "learning_rate": 9.099756690997568e-05, "loss": 0.5816, "step": 48 }, { "epoch": 0.11656259292298543, "grad_norm": 0.43966594338417053, "learning_rate": 9.075425790754258e-05, "loss": 0.4836, "step": 49 }, { "epoch": 0.11894142134998513, "grad_norm": 0.4385397434234619, "learning_rate": 9.051094890510949e-05, "loss": 0.5962, "step": 50 }, { "epoch": 0.12132024977698483, "grad_norm": 0.41545379161834717, "learning_rate": 9.02676399026764e-05, "loss": 0.5748, "step": 51 }, { "epoch": 0.12369907820398454, "grad_norm": 0.3504558801651001, "learning_rate": 9.002433090024331e-05, "loss": 0.5202, "step": 52 }, { "epoch": 0.12607790663098423, "grad_norm": 0.44601067900657654, "learning_rate": 8.978102189781023e-05, "loss": 0.5451, "step": 53 }, { "epoch": 0.12845673505798394, "grad_norm": 0.3965899646282196, "learning_rate": 8.953771289537714e-05, "loss": 0.5578, "step": 54 }, { "epoch": 0.13083556348498365, "grad_norm": 0.44455817341804504, "learning_rate": 8.929440389294405e-05, "loss": 0.4968, "step": 55 }, { "epoch": 0.13321439191198334, "grad_norm": 0.45583397150039673, "learning_rate": 8.905109489051096e-05, "loss": 0.598, "step": 56 }, { "epoch": 0.13559322033898305, "grad_norm": 0.46639132499694824, "learning_rate": 8.880778588807786e-05, "loss": 0.5857, "step": 57 }, { "epoch": 0.13797204876598276, "grad_norm": 0.4358392655849457, "learning_rate": 8.856447688564477e-05, "loss": 0.5299, "step": 58 }, { "epoch": 0.14035087719298245, "grad_norm": 0.4803287088871002, "learning_rate": 8.832116788321168e-05, "loss": 0.5082, "step": 59 }, { "epoch": 0.14272970561998216, "grad_norm": 0.41488590836524963, "learning_rate": 8.80778588807786e-05, "loss": 0.5223, "step": 60 }, { "epoch": 0.14510853404698187, "grad_norm": 0.46010011434555054, "learning_rate": 8.783454987834551e-05, "loss": 0.5125, "step": 61 }, { "epoch": 0.14748736247398156, "grad_norm": 0.3907822370529175, "learning_rate": 8.759124087591242e-05, "loss": 0.5121, "step": 62 }, { "epoch": 0.14986619090098127, "grad_norm": 0.4548039138317108, "learning_rate": 8.734793187347933e-05, "loss": 0.577, "step": 63 }, { "epoch": 0.15224501932798096, "grad_norm": 0.3938556909561157, "learning_rate": 8.710462287104624e-05, "loss": 0.4726, "step": 64 }, { "epoch": 0.15462384775498067, "grad_norm": 0.46850043535232544, "learning_rate": 8.686131386861314e-05, "loss": 0.5203, "step": 65 }, { "epoch": 0.15700267618198038, "grad_norm": 0.43241533637046814, "learning_rate": 8.661800486618005e-05, "loss": 0.4683, "step": 66 }, { "epoch": 0.15938150460898007, "grad_norm": 0.43365252017974854, "learning_rate": 8.637469586374697e-05, "loss": 0.5508, "step": 67 }, { "epoch": 0.16176033303597978, "grad_norm": 0.4271024763584137, "learning_rate": 8.613138686131386e-05, "loss": 0.4992, "step": 68 }, { "epoch": 0.1641391614629795, "grad_norm": 0.4089069366455078, "learning_rate": 8.588807785888078e-05, "loss": 0.4575, "step": 69 }, { "epoch": 0.16651798988997918, "grad_norm": 0.43635204434394836, "learning_rate": 8.564476885644769e-05, "loss": 0.5513, "step": 70 }, { "epoch": 0.1688968183169789, "grad_norm": 0.4246712625026703, "learning_rate": 8.54014598540146e-05, "loss": 0.4759, "step": 71 }, { "epoch": 0.17127564674397858, "grad_norm": 0.463638573884964, "learning_rate": 8.515815085158151e-05, "loss": 0.5323, "step": 72 }, { "epoch": 0.1736544751709783, "grad_norm": 0.4437566101551056, "learning_rate": 8.491484184914842e-05, "loss": 0.5446, "step": 73 }, { "epoch": 0.176033303597978, "grad_norm": 0.4280001223087311, "learning_rate": 8.467153284671534e-05, "loss": 0.5092, "step": 74 }, { "epoch": 0.1784121320249777, "grad_norm": 0.4512069523334503, "learning_rate": 8.442822384428223e-05, "loss": 0.5002, "step": 75 }, { "epoch": 0.1807909604519774, "grad_norm": 0.39247390627861023, "learning_rate": 8.418491484184915e-05, "loss": 0.4522, "step": 76 }, { "epoch": 0.1831697888789771, "grad_norm": 0.531852662563324, "learning_rate": 8.394160583941606e-05, "loss": 0.5507, "step": 77 }, { "epoch": 0.1855486173059768, "grad_norm": 0.49727141857147217, "learning_rate": 8.369829683698297e-05, "loss": 0.519, "step": 78 }, { "epoch": 0.1879274457329765, "grad_norm": 0.46292659640312195, "learning_rate": 8.345498783454988e-05, "loss": 0.5271, "step": 79 }, { "epoch": 0.19030627415997622, "grad_norm": 0.4514133036136627, "learning_rate": 8.32116788321168e-05, "loss": 0.5499, "step": 80 }, { "epoch": 0.1926851025869759, "grad_norm": 0.42186304926872253, "learning_rate": 8.29683698296837e-05, "loss": 0.541, "step": 81 }, { "epoch": 0.19506393101397562, "grad_norm": 0.4527208209037781, "learning_rate": 8.272506082725062e-05, "loss": 0.482, "step": 82 }, { "epoch": 0.1974427594409753, "grad_norm": 0.4818190634250641, "learning_rate": 8.248175182481752e-05, "loss": 0.5627, "step": 83 }, { "epoch": 0.19982158786797502, "grad_norm": 0.44998934864997864, "learning_rate": 8.223844282238443e-05, "loss": 0.5169, "step": 84 }, { "epoch": 0.20220041629497473, "grad_norm": 0.42806658148765564, "learning_rate": 8.199513381995134e-05, "loss": 0.5613, "step": 85 }, { "epoch": 0.20457924472197442, "grad_norm": 0.4187542796134949, "learning_rate": 8.175182481751825e-05, "loss": 0.4584, "step": 86 }, { "epoch": 0.20695807314897413, "grad_norm": 0.5182843804359436, "learning_rate": 8.150851581508516e-05, "loss": 0.5253, "step": 87 }, { "epoch": 0.20933690157597384, "grad_norm": 0.4145914614200592, "learning_rate": 8.126520681265208e-05, "loss": 0.4828, "step": 88 }, { "epoch": 0.21171573000297353, "grad_norm": 0.46647554636001587, "learning_rate": 8.102189781021899e-05, "loss": 0.4496, "step": 89 }, { "epoch": 0.21409455842997324, "grad_norm": 0.4022408723831177, "learning_rate": 8.07785888077859e-05, "loss": 0.4329, "step": 90 }, { "epoch": 0.21647338685697295, "grad_norm": 0.4754613935947418, "learning_rate": 8.05352798053528e-05, "loss": 0.5314, "step": 91 }, { "epoch": 0.21885221528397264, "grad_norm": 0.41736510396003723, "learning_rate": 8.029197080291971e-05, "loss": 0.4906, "step": 92 }, { "epoch": 0.22123104371097235, "grad_norm": 0.4199044406414032, "learning_rate": 8.004866180048662e-05, "loss": 0.495, "step": 93 }, { "epoch": 0.22360987213797204, "grad_norm": 0.4374317526817322, "learning_rate": 7.980535279805353e-05, "loss": 0.533, "step": 94 }, { "epoch": 0.22598870056497175, "grad_norm": 0.4060494899749756, "learning_rate": 7.956204379562045e-05, "loss": 0.4711, "step": 95 }, { "epoch": 0.22836752899197146, "grad_norm": 0.49371036887168884, "learning_rate": 7.931873479318736e-05, "loss": 0.5417, "step": 96 }, { "epoch": 0.23074635741897115, "grad_norm": 0.4903344511985779, "learning_rate": 7.907542579075427e-05, "loss": 0.4595, "step": 97 }, { "epoch": 0.23312518584597086, "grad_norm": 0.4055982530117035, "learning_rate": 7.883211678832118e-05, "loss": 0.4926, "step": 98 }, { "epoch": 0.23550401427297057, "grad_norm": 0.4552580416202545, "learning_rate": 7.858880778588808e-05, "loss": 0.5644, "step": 99 }, { "epoch": 0.23788284269997026, "grad_norm": 0.4261854588985443, "learning_rate": 7.834549878345499e-05, "loss": 0.4915, "step": 100 }, { "epoch": 0.23788284269997026, "eval_loss": 0.48822253942489624, "eval_runtime": 27.0528, "eval_samples_per_second": 27.65, "eval_steps_per_second": 13.825, "step": 100 }, { "epoch": 0.24026167112696997, "grad_norm": 0.48629841208457947, "learning_rate": 7.81021897810219e-05, "loss": 0.4824, "step": 101 }, { "epoch": 0.24264049955396966, "grad_norm": 0.4699622094631195, "learning_rate": 7.785888077858882e-05, "loss": 0.4642, "step": 102 }, { "epoch": 0.24501932798096937, "grad_norm": 0.39009714126586914, "learning_rate": 7.761557177615573e-05, "loss": 0.4614, "step": 103 }, { "epoch": 0.24739815640796908, "grad_norm": 0.4852162003517151, "learning_rate": 7.737226277372264e-05, "loss": 0.5112, "step": 104 }, { "epoch": 0.24977698483496877, "grad_norm": 0.5088614821434021, "learning_rate": 7.712895377128954e-05, "loss": 0.4914, "step": 105 }, { "epoch": 0.25215581326196845, "grad_norm": 0.4846271276473999, "learning_rate": 7.688564476885645e-05, "loss": 0.4832, "step": 106 }, { "epoch": 0.25453464168896817, "grad_norm": 0.48459744453430176, "learning_rate": 7.664233576642336e-05, "loss": 0.5386, "step": 107 }, { "epoch": 0.2569134701159679, "grad_norm": 0.47507190704345703, "learning_rate": 7.639902676399027e-05, "loss": 0.4593, "step": 108 }, { "epoch": 0.2592922985429676, "grad_norm": 0.46605178713798523, "learning_rate": 7.615571776155717e-05, "loss": 0.5265, "step": 109 }, { "epoch": 0.2616711269699673, "grad_norm": 0.4616071283817291, "learning_rate": 7.591240875912408e-05, "loss": 0.4323, "step": 110 }, { "epoch": 0.264049955396967, "grad_norm": 0.5419961214065552, "learning_rate": 7.5669099756691e-05, "loss": 0.5403, "step": 111 }, { "epoch": 0.2664287838239667, "grad_norm": 0.4992631673812866, "learning_rate": 7.542579075425791e-05, "loss": 0.4558, "step": 112 }, { "epoch": 0.2688076122509664, "grad_norm": 0.5071640610694885, "learning_rate": 7.518248175182482e-05, "loss": 0.5385, "step": 113 }, { "epoch": 0.2711864406779661, "grad_norm": 0.5004437565803528, "learning_rate": 7.493917274939173e-05, "loss": 0.5106, "step": 114 }, { "epoch": 0.2735652691049658, "grad_norm": 0.4706288278102875, "learning_rate": 7.469586374695864e-05, "loss": 0.5333, "step": 115 }, { "epoch": 0.2759440975319655, "grad_norm": 0.5518192052841187, "learning_rate": 7.445255474452556e-05, "loss": 0.5945, "step": 116 }, { "epoch": 0.2783229259589652, "grad_norm": 0.4758547246456146, "learning_rate": 7.420924574209245e-05, "loss": 0.533, "step": 117 }, { "epoch": 0.2807017543859649, "grad_norm": 0.4491472840309143, "learning_rate": 7.396593673965937e-05, "loss": 0.4213, "step": 118 }, { "epoch": 0.2830805828129646, "grad_norm": 0.4540884494781494, "learning_rate": 7.372262773722628e-05, "loss": 0.4747, "step": 119 }, { "epoch": 0.2854594112399643, "grad_norm": 0.4252176880836487, "learning_rate": 7.347931873479319e-05, "loss": 0.4639, "step": 120 }, { "epoch": 0.28783823966696404, "grad_norm": 0.4695313274860382, "learning_rate": 7.32360097323601e-05, "loss": 0.5563, "step": 121 }, { "epoch": 0.29021706809396375, "grad_norm": 0.49951449036598206, "learning_rate": 7.299270072992701e-05, "loss": 0.4517, "step": 122 }, { "epoch": 0.2925958965209634, "grad_norm": 0.4763878285884857, "learning_rate": 7.274939172749393e-05, "loss": 0.5223, "step": 123 }, { "epoch": 0.2949747249479631, "grad_norm": 0.49518251419067383, "learning_rate": 7.250608272506084e-05, "loss": 0.5832, "step": 124 }, { "epoch": 0.29735355337496283, "grad_norm": 0.4721708595752716, "learning_rate": 7.226277372262774e-05, "loss": 0.5259, "step": 125 }, { "epoch": 0.29973238180196254, "grad_norm": 0.4356157183647156, "learning_rate": 7.201946472019465e-05, "loss": 0.408, "step": 126 }, { "epoch": 0.30211121022896226, "grad_norm": 0.4771776795387268, "learning_rate": 7.177615571776156e-05, "loss": 0.4909, "step": 127 }, { "epoch": 0.3044900386559619, "grad_norm": 0.5367064476013184, "learning_rate": 7.153284671532847e-05, "loss": 0.556, "step": 128 }, { "epoch": 0.3068688670829616, "grad_norm": 0.43037423491477966, "learning_rate": 7.128953771289538e-05, "loss": 0.3805, "step": 129 }, { "epoch": 0.30924769550996134, "grad_norm": 0.4576215445995331, "learning_rate": 7.10462287104623e-05, "loss": 0.437, "step": 130 }, { "epoch": 0.31162652393696105, "grad_norm": 0.46385103464126587, "learning_rate": 7.080291970802921e-05, "loss": 0.4561, "step": 131 }, { "epoch": 0.31400535236396077, "grad_norm": 0.4961807131767273, "learning_rate": 7.055961070559612e-05, "loss": 0.3691, "step": 132 }, { "epoch": 0.3163841807909605, "grad_norm": 0.5330118536949158, "learning_rate": 7.031630170316302e-05, "loss": 0.5341, "step": 133 }, { "epoch": 0.31876300921796014, "grad_norm": 0.4410998225212097, "learning_rate": 7.007299270072993e-05, "loss": 0.4867, "step": 134 }, { "epoch": 0.32114183764495985, "grad_norm": 0.4794428050518036, "learning_rate": 6.982968369829684e-05, "loss": 0.4818, "step": 135 }, { "epoch": 0.32352066607195956, "grad_norm": 0.49389636516571045, "learning_rate": 6.958637469586375e-05, "loss": 0.5156, "step": 136 }, { "epoch": 0.3258994944989593, "grad_norm": 0.49501174688339233, "learning_rate": 6.934306569343067e-05, "loss": 0.5122, "step": 137 }, { "epoch": 0.328278322925959, "grad_norm": 0.4967164695262909, "learning_rate": 6.909975669099758e-05, "loss": 0.4853, "step": 138 }, { "epoch": 0.33065715135295864, "grad_norm": 0.5276123881340027, "learning_rate": 6.885644768856449e-05, "loss": 0.4703, "step": 139 }, { "epoch": 0.33303597977995836, "grad_norm": 0.5336246490478516, "learning_rate": 6.86131386861314e-05, "loss": 0.4652, "step": 140 }, { "epoch": 0.33541480820695807, "grad_norm": 0.3898225426673889, "learning_rate": 6.83698296836983e-05, "loss": 0.3983, "step": 141 }, { "epoch": 0.3377936366339578, "grad_norm": 0.4705723226070404, "learning_rate": 6.81265206812652e-05, "loss": 0.4646, "step": 142 }, { "epoch": 0.3401724650609575, "grad_norm": 0.4973605275154114, "learning_rate": 6.788321167883211e-05, "loss": 0.4848, "step": 143 }, { "epoch": 0.34255129348795715, "grad_norm": 0.5203383564949036, "learning_rate": 6.763990267639902e-05, "loss": 0.4992, "step": 144 }, { "epoch": 0.34493012191495687, "grad_norm": 0.47000083327293396, "learning_rate": 6.739659367396593e-05, "loss": 0.4665, "step": 145 }, { "epoch": 0.3473089503419566, "grad_norm": 0.4483359456062317, "learning_rate": 6.715328467153285e-05, "loss": 0.4553, "step": 146 }, { "epoch": 0.3496877787689563, "grad_norm": 0.4584805965423584, "learning_rate": 6.690997566909976e-05, "loss": 0.4945, "step": 147 }, { "epoch": 0.352066607195956, "grad_norm": 0.5037974119186401, "learning_rate": 6.666666666666667e-05, "loss": 0.4723, "step": 148 }, { "epoch": 0.3544454356229557, "grad_norm": 0.4981141984462738, "learning_rate": 6.642335766423358e-05, "loss": 0.5417, "step": 149 }, { "epoch": 0.3568242640499554, "grad_norm": 0.49627479910850525, "learning_rate": 6.618004866180048e-05, "loss": 0.4569, "step": 150 }, { "epoch": 0.3592030924769551, "grad_norm": 0.49889135360717773, "learning_rate": 6.593673965936739e-05, "loss": 0.5391, "step": 151 }, { "epoch": 0.3615819209039548, "grad_norm": 0.5222321152687073, "learning_rate": 6.56934306569343e-05, "loss": 0.4482, "step": 152 }, { "epoch": 0.3639607493309545, "grad_norm": 0.48798415064811707, "learning_rate": 6.545012165450122e-05, "loss": 0.5148, "step": 153 }, { "epoch": 0.3663395777579542, "grad_norm": 0.4458797574043274, "learning_rate": 6.520681265206813e-05, "loss": 0.4516, "step": 154 }, { "epoch": 0.3687184061849539, "grad_norm": 0.4997321665287018, "learning_rate": 6.496350364963504e-05, "loss": 0.4769, "step": 155 }, { "epoch": 0.3710972346119536, "grad_norm": 0.5233137011528015, "learning_rate": 6.472019464720195e-05, "loss": 0.5108, "step": 156 }, { "epoch": 0.3734760630389533, "grad_norm": 0.5136488676071167, "learning_rate": 6.447688564476886e-05, "loss": 0.4755, "step": 157 }, { "epoch": 0.375854891465953, "grad_norm": 0.4736205041408539, "learning_rate": 6.423357664233576e-05, "loss": 0.3712, "step": 158 }, { "epoch": 0.37823371989295274, "grad_norm": 0.43539518117904663, "learning_rate": 6.399026763990267e-05, "loss": 0.416, "step": 159 }, { "epoch": 0.38061254831995245, "grad_norm": 0.44481900334358215, "learning_rate": 6.374695863746959e-05, "loss": 0.4628, "step": 160 }, { "epoch": 0.3829913767469521, "grad_norm": 0.45447418093681335, "learning_rate": 6.35036496350365e-05, "loss": 0.4273, "step": 161 }, { "epoch": 0.3853702051739518, "grad_norm": 0.43469730019569397, "learning_rate": 6.326034063260341e-05, "loss": 0.4584, "step": 162 }, { "epoch": 0.38774903360095153, "grad_norm": 0.4581772983074188, "learning_rate": 6.301703163017032e-05, "loss": 0.4379, "step": 163 }, { "epoch": 0.39012786202795124, "grad_norm": 0.42583826184272766, "learning_rate": 6.277372262773723e-05, "loss": 0.4336, "step": 164 }, { "epoch": 0.39250669045495096, "grad_norm": 0.5775184631347656, "learning_rate": 6.253041362530415e-05, "loss": 0.4901, "step": 165 }, { "epoch": 0.3948855188819506, "grad_norm": 0.4797421991825104, "learning_rate": 6.228710462287104e-05, "loss": 0.4319, "step": 166 }, { "epoch": 0.3972643473089503, "grad_norm": 0.5240707397460938, "learning_rate": 6.204379562043796e-05, "loss": 0.4638, "step": 167 }, { "epoch": 0.39964317573595004, "grad_norm": 0.5036562085151672, "learning_rate": 6.180048661800487e-05, "loss": 0.5198, "step": 168 }, { "epoch": 0.40202200416294975, "grad_norm": 0.5376377105712891, "learning_rate": 6.155717761557178e-05, "loss": 0.4564, "step": 169 }, { "epoch": 0.40440083258994947, "grad_norm": 0.5044119954109192, "learning_rate": 6.131386861313869e-05, "loss": 0.4269, "step": 170 }, { "epoch": 0.4067796610169492, "grad_norm": 0.5195197463035583, "learning_rate": 6.10705596107056e-05, "loss": 0.4866, "step": 171 }, { "epoch": 0.40915848944394884, "grad_norm": 0.4676225483417511, "learning_rate": 6.082725060827251e-05, "loss": 0.4509, "step": 172 }, { "epoch": 0.41153731787094855, "grad_norm": 0.4422922134399414, "learning_rate": 6.058394160583942e-05, "loss": 0.4498, "step": 173 }, { "epoch": 0.41391614629794826, "grad_norm": 0.5001342296600342, "learning_rate": 6.034063260340633e-05, "loss": 0.4743, "step": 174 }, { "epoch": 0.416294974724948, "grad_norm": 0.4759523272514343, "learning_rate": 6.0097323600973245e-05, "loss": 0.4514, "step": 175 }, { "epoch": 0.4186738031519477, "grad_norm": 0.46963706612586975, "learning_rate": 5.985401459854016e-05, "loss": 0.4776, "step": 176 }, { "epoch": 0.42105263157894735, "grad_norm": 0.49028345942497253, "learning_rate": 5.961070559610706e-05, "loss": 0.4632, "step": 177 }, { "epoch": 0.42343146000594706, "grad_norm": 0.4388274848461151, "learning_rate": 5.9367396593673974e-05, "loss": 0.4822, "step": 178 }, { "epoch": 0.42581028843294677, "grad_norm": 0.4349250793457031, "learning_rate": 5.9124087591240886e-05, "loss": 0.4468, "step": 179 }, { "epoch": 0.4281891168599465, "grad_norm": 0.4884635806083679, "learning_rate": 5.8880778588807784e-05, "loss": 0.4724, "step": 180 }, { "epoch": 0.4305679452869462, "grad_norm": 0.4598194360733032, "learning_rate": 5.8637469586374696e-05, "loss": 0.4762, "step": 181 }, { "epoch": 0.4329467737139459, "grad_norm": 0.5073289275169373, "learning_rate": 5.83941605839416e-05, "loss": 0.4732, "step": 182 }, { "epoch": 0.43532560214094557, "grad_norm": 0.4630277752876282, "learning_rate": 5.815085158150851e-05, "loss": 0.4301, "step": 183 }, { "epoch": 0.4377044305679453, "grad_norm": 0.42805591225624084, "learning_rate": 5.7907542579075425e-05, "loss": 0.47, "step": 184 }, { "epoch": 0.440083258994945, "grad_norm": 0.6053451299667358, "learning_rate": 5.766423357664234e-05, "loss": 0.4977, "step": 185 }, { "epoch": 0.4424620874219447, "grad_norm": 0.530331552028656, "learning_rate": 5.742092457420924e-05, "loss": 0.4492, "step": 186 }, { "epoch": 0.4448409158489444, "grad_norm": 0.46791204810142517, "learning_rate": 5.7177615571776154e-05, "loss": 0.4402, "step": 187 }, { "epoch": 0.4472197442759441, "grad_norm": 0.543755829334259, "learning_rate": 5.6934306569343066e-05, "loss": 0.5958, "step": 188 }, { "epoch": 0.4495985727029438, "grad_norm": 0.4579405188560486, "learning_rate": 5.669099756690998e-05, "loss": 0.4145, "step": 189 }, { "epoch": 0.4519774011299435, "grad_norm": 0.5464953184127808, "learning_rate": 5.644768856447688e-05, "loss": 0.4869, "step": 190 }, { "epoch": 0.4543562295569432, "grad_norm": 0.4730067551136017, "learning_rate": 5.6204379562043795e-05, "loss": 0.4432, "step": 191 }, { "epoch": 0.4567350579839429, "grad_norm": 0.4859981834888458, "learning_rate": 5.596107055961071e-05, "loss": 0.4376, "step": 192 }, { "epoch": 0.4591138864109426, "grad_norm": 0.4743499159812927, "learning_rate": 5.571776155717762e-05, "loss": 0.4542, "step": 193 }, { "epoch": 0.4614927148379423, "grad_norm": 0.5267781019210815, "learning_rate": 5.5474452554744524e-05, "loss": 0.4684, "step": 194 }, { "epoch": 0.463871543264942, "grad_norm": 0.5235748291015625, "learning_rate": 5.5231143552311436e-05, "loss": 0.4412, "step": 195 }, { "epoch": 0.4662503716919417, "grad_norm": 0.4266457259654999, "learning_rate": 5.498783454987835e-05, "loss": 0.3855, "step": 196 }, { "epoch": 0.46862920011894144, "grad_norm": 0.49032875895500183, "learning_rate": 5.474452554744526e-05, "loss": 0.4413, "step": 197 }, { "epoch": 0.47100802854594115, "grad_norm": 0.5913426280021667, "learning_rate": 5.4501216545012165e-05, "loss": 0.5653, "step": 198 }, { "epoch": 0.4733868569729408, "grad_norm": 0.5137138366699219, "learning_rate": 5.425790754257908e-05, "loss": 0.4835, "step": 199 }, { "epoch": 0.4757656853999405, "grad_norm": 0.43370646238327026, "learning_rate": 5.401459854014599e-05, "loss": 0.41, "step": 200 }, { "epoch": 0.4757656853999405, "eval_loss": 0.4463058412075043, "eval_runtime": 24.577, "eval_samples_per_second": 30.435, "eval_steps_per_second": 15.218, "step": 200 }, { "epoch": 0.47814451382694023, "grad_norm": 0.5559374690055847, "learning_rate": 5.37712895377129e-05, "loss": 0.5726, "step": 201 }, { "epoch": 0.48052334225393994, "grad_norm": 0.48858100175857544, "learning_rate": 5.3527980535279806e-05, "loss": 0.481, "step": 202 }, { "epoch": 0.48290217068093966, "grad_norm": 0.45764482021331787, "learning_rate": 5.328467153284672e-05, "loss": 0.4345, "step": 203 }, { "epoch": 0.4852809991079393, "grad_norm": 0.5081002116203308, "learning_rate": 5.304136253041363e-05, "loss": 0.4091, "step": 204 }, { "epoch": 0.487659827534939, "grad_norm": 0.47981277108192444, "learning_rate": 5.279805352798054e-05, "loss": 0.4198, "step": 205 }, { "epoch": 0.49003865596193874, "grad_norm": 0.5422961115837097, "learning_rate": 5.255474452554745e-05, "loss": 0.4645, "step": 206 }, { "epoch": 0.49241748438893845, "grad_norm": 0.5617045164108276, "learning_rate": 5.231143552311436e-05, "loss": 0.499, "step": 207 }, { "epoch": 0.49479631281593817, "grad_norm": 0.5472312569618225, "learning_rate": 5.206812652068127e-05, "loss": 0.4303, "step": 208 }, { "epoch": 0.4971751412429379, "grad_norm": 0.560845136642456, "learning_rate": 5.182481751824818e-05, "loss": 0.507, "step": 209 }, { "epoch": 0.49955396966993754, "grad_norm": 0.5158256888389587, "learning_rate": 5.158150851581509e-05, "loss": 0.5126, "step": 210 }, { "epoch": 0.5019327980969372, "grad_norm": 0.49810492992401123, "learning_rate": 5.1338199513382e-05, "loss": 0.3799, "step": 211 }, { "epoch": 0.5043116265239369, "grad_norm": 0.5269343852996826, "learning_rate": 5.109489051094891e-05, "loss": 0.5066, "step": 212 }, { "epoch": 0.5066904549509367, "grad_norm": 0.48883453011512756, "learning_rate": 5.0851581508515824e-05, "loss": 0.4593, "step": 213 }, { "epoch": 0.5090692833779363, "grad_norm": 0.5003328919410706, "learning_rate": 5.060827250608273e-05, "loss": 0.4672, "step": 214 }, { "epoch": 0.5114481118049361, "grad_norm": 0.5346857905387878, "learning_rate": 5.036496350364964e-05, "loss": 0.459, "step": 215 }, { "epoch": 0.5138269402319358, "grad_norm": 0.5018550157546997, "learning_rate": 5.012165450121655e-05, "loss": 0.427, "step": 216 }, { "epoch": 0.5162057686589355, "grad_norm": 0.5394279956817627, "learning_rate": 4.987834549878346e-05, "loss": 0.4906, "step": 217 }, { "epoch": 0.5185845970859352, "grad_norm": 0.41929885745048523, "learning_rate": 4.963503649635037e-05, "loss": 0.4719, "step": 218 }, { "epoch": 0.5209634255129348, "grad_norm": 0.5181702375411987, "learning_rate": 4.9391727493917275e-05, "loss": 0.4187, "step": 219 }, { "epoch": 0.5233422539399346, "grad_norm": 0.5157968997955322, "learning_rate": 4.914841849148419e-05, "loss": 0.3988, "step": 220 }, { "epoch": 0.5257210823669343, "grad_norm": 0.5713850259780884, "learning_rate": 4.89051094890511e-05, "loss": 0.4988, "step": 221 }, { "epoch": 0.528099910793934, "grad_norm": 0.5276979804039001, "learning_rate": 4.866180048661801e-05, "loss": 0.4289, "step": 222 }, { "epoch": 0.5304787392209337, "grad_norm": 0.5099546313285828, "learning_rate": 4.8418491484184916e-05, "loss": 0.4143, "step": 223 }, { "epoch": 0.5328575676479334, "grad_norm": 0.5845214128494263, "learning_rate": 4.817518248175183e-05, "loss": 0.528, "step": 224 }, { "epoch": 0.5352363960749331, "grad_norm": 0.5065467357635498, "learning_rate": 4.793187347931874e-05, "loss": 0.4109, "step": 225 }, { "epoch": 0.5376152245019328, "grad_norm": 0.46541669964790344, "learning_rate": 4.7688564476885646e-05, "loss": 0.4202, "step": 226 }, { "epoch": 0.5399940529289325, "grad_norm": 0.571874737739563, "learning_rate": 4.744525547445256e-05, "loss": 0.5088, "step": 227 }, { "epoch": 0.5423728813559322, "grad_norm": 0.542327880859375, "learning_rate": 4.720194647201946e-05, "loss": 0.458, "step": 228 }, { "epoch": 0.5447517097829319, "grad_norm": 0.48291677236557007, "learning_rate": 4.6958637469586375e-05, "loss": 0.3743, "step": 229 }, { "epoch": 0.5471305382099316, "grad_norm": 0.5330439805984497, "learning_rate": 4.6715328467153287e-05, "loss": 0.4217, "step": 230 }, { "epoch": 0.5495093666369313, "grad_norm": 0.47011399269104004, "learning_rate": 4.64720194647202e-05, "loss": 0.4582, "step": 231 }, { "epoch": 0.551888195063931, "grad_norm": 0.5513572692871094, "learning_rate": 4.6228710462287104e-05, "loss": 0.4527, "step": 232 }, { "epoch": 0.5542670234909307, "grad_norm": 0.61734539270401, "learning_rate": 4.5985401459854016e-05, "loss": 0.5963, "step": 233 }, { "epoch": 0.5566458519179304, "grad_norm": 0.4924643933773041, "learning_rate": 4.574209245742093e-05, "loss": 0.3916, "step": 234 }, { "epoch": 0.5590246803449301, "grad_norm": 0.4713283181190491, "learning_rate": 4.549878345498784e-05, "loss": 0.3963, "step": 235 }, { "epoch": 0.5614035087719298, "grad_norm": 0.4790879786014557, "learning_rate": 4.5255474452554745e-05, "loss": 0.4097, "step": 236 }, { "epoch": 0.5637823371989296, "grad_norm": 0.4752825200557709, "learning_rate": 4.5012165450121657e-05, "loss": 0.3671, "step": 237 }, { "epoch": 0.5661611656259292, "grad_norm": 0.6054632663726807, "learning_rate": 4.476885644768857e-05, "loss": 0.478, "step": 238 }, { "epoch": 0.568539994052929, "grad_norm": 0.5796523094177246, "learning_rate": 4.452554744525548e-05, "loss": 0.4677, "step": 239 }, { "epoch": 0.5709188224799286, "grad_norm": 0.7159845232963562, "learning_rate": 4.4282238442822386e-05, "loss": 0.6, "step": 240 }, { "epoch": 0.5732976509069283, "grad_norm": 0.4576638638973236, "learning_rate": 4.40389294403893e-05, "loss": 0.3984, "step": 241 }, { "epoch": 0.5756764793339281, "grad_norm": 0.5089020133018494, "learning_rate": 4.379562043795621e-05, "loss": 0.4245, "step": 242 }, { "epoch": 0.5780553077609277, "grad_norm": 0.4587903618812561, "learning_rate": 4.355231143552312e-05, "loss": 0.399, "step": 243 }, { "epoch": 0.5804341361879275, "grad_norm": 0.46936094760894775, "learning_rate": 4.3309002433090027e-05, "loss": 0.3892, "step": 244 }, { "epoch": 0.5828129646149272, "grad_norm": 0.5407763719558716, "learning_rate": 4.306569343065693e-05, "loss": 0.5055, "step": 245 }, { "epoch": 0.5851917930419268, "grad_norm": 0.5131598114967346, "learning_rate": 4.2822384428223844e-05, "loss": 0.469, "step": 246 }, { "epoch": 0.5875706214689266, "grad_norm": 0.48786240816116333, "learning_rate": 4.2579075425790756e-05, "loss": 0.3993, "step": 247 }, { "epoch": 0.5899494498959262, "grad_norm": 0.5068132281303406, "learning_rate": 4.233576642335767e-05, "loss": 0.4114, "step": 248 }, { "epoch": 0.592328278322926, "grad_norm": 0.5316363573074341, "learning_rate": 4.209245742092457e-05, "loss": 0.446, "step": 249 }, { "epoch": 0.5947071067499257, "grad_norm": 0.511885404586792, "learning_rate": 4.1849148418491485e-05, "loss": 0.4738, "step": 250 }, { "epoch": 0.5970859351769253, "grad_norm": 0.6263471245765686, "learning_rate": 4.16058394160584e-05, "loss": 0.5074, "step": 251 }, { "epoch": 0.5994647636039251, "grad_norm": 0.5359822511672974, "learning_rate": 4.136253041362531e-05, "loss": 0.4239, "step": 252 }, { "epoch": 0.6018435920309247, "grad_norm": 0.5764995217323303, "learning_rate": 4.1119221411192214e-05, "loss": 0.5247, "step": 253 }, { "epoch": 0.6042224204579245, "grad_norm": 0.5081723928451538, "learning_rate": 4.0875912408759126e-05, "loss": 0.4104, "step": 254 }, { "epoch": 0.6066012488849242, "grad_norm": 0.5557652115821838, "learning_rate": 4.063260340632604e-05, "loss": 0.4889, "step": 255 }, { "epoch": 0.6089800773119238, "grad_norm": 0.48941561579704285, "learning_rate": 4.038929440389295e-05, "loss": 0.4557, "step": 256 }, { "epoch": 0.6113589057389236, "grad_norm": 0.565647304058075, "learning_rate": 4.0145985401459855e-05, "loss": 0.4924, "step": 257 }, { "epoch": 0.6137377341659233, "grad_norm": 0.5029006004333496, "learning_rate": 3.990267639902677e-05, "loss": 0.4885, "step": 258 }, { "epoch": 0.616116562592923, "grad_norm": 0.5224466323852539, "learning_rate": 3.965936739659368e-05, "loss": 0.4433, "step": 259 }, { "epoch": 0.6184953910199227, "grad_norm": 0.6890097260475159, "learning_rate": 3.941605839416059e-05, "loss": 0.5399, "step": 260 }, { "epoch": 0.6208742194469223, "grad_norm": 0.48153895139694214, "learning_rate": 3.9172749391727496e-05, "loss": 0.3881, "step": 261 }, { "epoch": 0.6232530478739221, "grad_norm": 0.5819414258003235, "learning_rate": 3.892944038929441e-05, "loss": 0.4568, "step": 262 }, { "epoch": 0.6256318763009218, "grad_norm": 0.4706772267818451, "learning_rate": 3.868613138686132e-05, "loss": 0.4382, "step": 263 }, { "epoch": 0.6280107047279215, "grad_norm": 0.5553036332130432, "learning_rate": 3.8442822384428225e-05, "loss": 0.4611, "step": 264 }, { "epoch": 0.6303895331549212, "grad_norm": 0.5100443959236145, "learning_rate": 3.819951338199514e-05, "loss": 0.424, "step": 265 }, { "epoch": 0.632768361581921, "grad_norm": 0.47650212049484253, "learning_rate": 3.795620437956204e-05, "loss": 0.4642, "step": 266 }, { "epoch": 0.6351471900089206, "grad_norm": 0.5056859254837036, "learning_rate": 3.7712895377128954e-05, "loss": 0.4643, "step": 267 }, { "epoch": 0.6375260184359203, "grad_norm": 0.48869505524635315, "learning_rate": 3.7469586374695866e-05, "loss": 0.4058, "step": 268 }, { "epoch": 0.63990484686292, "grad_norm": 0.5686202645301819, "learning_rate": 3.722627737226278e-05, "loss": 0.4772, "step": 269 }, { "epoch": 0.6422836752899197, "grad_norm": 0.523289680480957, "learning_rate": 3.698296836982968e-05, "loss": 0.4257, "step": 270 }, { "epoch": 0.6446625037169195, "grad_norm": 0.4355628192424774, "learning_rate": 3.6739659367396595e-05, "loss": 0.4035, "step": 271 }, { "epoch": 0.6470413321439191, "grad_norm": 0.5667319297790527, "learning_rate": 3.649635036496351e-05, "loss": 0.5109, "step": 272 }, { "epoch": 0.6494201605709188, "grad_norm": 0.49869677424430847, "learning_rate": 3.625304136253042e-05, "loss": 0.4496, "step": 273 }, { "epoch": 0.6517989889979185, "grad_norm": 0.4514950215816498, "learning_rate": 3.6009732360097324e-05, "loss": 0.3948, "step": 274 }, { "epoch": 0.6541778174249182, "grad_norm": 0.5341079235076904, "learning_rate": 3.5766423357664236e-05, "loss": 0.5351, "step": 275 }, { "epoch": 0.656556645851918, "grad_norm": 0.5494990348815918, "learning_rate": 3.552311435523115e-05, "loss": 0.5355, "step": 276 }, { "epoch": 0.6589354742789176, "grad_norm": 0.5039032697677612, "learning_rate": 3.527980535279806e-05, "loss": 0.388, "step": 277 }, { "epoch": 0.6613143027059173, "grad_norm": 0.543323278427124, "learning_rate": 3.5036496350364965e-05, "loss": 0.5407, "step": 278 }, { "epoch": 0.6636931311329171, "grad_norm": 0.5307977795600891, "learning_rate": 3.479318734793188e-05, "loss": 0.4416, "step": 279 }, { "epoch": 0.6660719595599167, "grad_norm": 0.5810051560401917, "learning_rate": 3.454987834549879e-05, "loss": 0.4984, "step": 280 }, { "epoch": 0.6684507879869165, "grad_norm": 0.5557127594947815, "learning_rate": 3.43065693430657e-05, "loss": 0.4426, "step": 281 }, { "epoch": 0.6708296164139161, "grad_norm": 0.49247419834136963, "learning_rate": 3.40632603406326e-05, "loss": 0.4353, "step": 282 }, { "epoch": 0.6732084448409158, "grad_norm": 0.5093483328819275, "learning_rate": 3.381995133819951e-05, "loss": 0.4141, "step": 283 }, { "epoch": 0.6755872732679156, "grad_norm": 0.5625473856925964, "learning_rate": 3.357664233576642e-05, "loss": 0.5592, "step": 284 }, { "epoch": 0.6779661016949152, "grad_norm": 0.47916609048843384, "learning_rate": 3.3333333333333335e-05, "loss": 0.4519, "step": 285 }, { "epoch": 0.680344930121915, "grad_norm": 0.514183759689331, "learning_rate": 3.309002433090024e-05, "loss": 0.3704, "step": 286 }, { "epoch": 0.6827237585489146, "grad_norm": 0.5638673305511475, "learning_rate": 3.284671532846715e-05, "loss": 0.4813, "step": 287 }, { "epoch": 0.6851025869759143, "grad_norm": 0.436166375875473, "learning_rate": 3.2603406326034064e-05, "loss": 0.4546, "step": 288 }, { "epoch": 0.6874814154029141, "grad_norm": 0.5311592221260071, "learning_rate": 3.2360097323600976e-05, "loss": 0.4948, "step": 289 }, { "epoch": 0.6898602438299137, "grad_norm": 0.5682978630065918, "learning_rate": 3.211678832116788e-05, "loss": 0.4595, "step": 290 }, { "epoch": 0.6922390722569135, "grad_norm": 0.5363884568214417, "learning_rate": 3.187347931873479e-05, "loss": 0.3572, "step": 291 }, { "epoch": 0.6946179006839132, "grad_norm": 0.517888069152832, "learning_rate": 3.1630170316301705e-05, "loss": 0.3949, "step": 292 }, { "epoch": 0.6969967291109129, "grad_norm": 0.5211926698684692, "learning_rate": 3.138686131386862e-05, "loss": 0.4904, "step": 293 }, { "epoch": 0.6993755575379126, "grad_norm": 0.5304282903671265, "learning_rate": 3.114355231143552e-05, "loss": 0.4632, "step": 294 }, { "epoch": 0.7017543859649122, "grad_norm": 0.4735037386417389, "learning_rate": 3.0900243309002434e-05, "loss": 0.4002, "step": 295 }, { "epoch": 0.704133214391912, "grad_norm": 0.5239513516426086, "learning_rate": 3.0656934306569346e-05, "loss": 0.4758, "step": 296 }, { "epoch": 0.7065120428189117, "grad_norm": 0.5408491492271423, "learning_rate": 3.0413625304136255e-05, "loss": 0.4993, "step": 297 }, { "epoch": 0.7088908712459114, "grad_norm": 0.6086730360984802, "learning_rate": 3.0170316301703166e-05, "loss": 0.5048, "step": 298 }, { "epoch": 0.7112696996729111, "grad_norm": 0.4720120131969452, "learning_rate": 2.992700729927008e-05, "loss": 0.4215, "step": 299 }, { "epoch": 0.7136485280999108, "grad_norm": 0.5266088843345642, "learning_rate": 2.9683698296836987e-05, "loss": 0.4553, "step": 300 }, { "epoch": 0.7136485280999108, "eval_loss": 0.42479127645492554, "eval_runtime": 24.6017, "eval_samples_per_second": 30.404, "eval_steps_per_second": 15.202, "step": 300 }, { "epoch": 0.7160273565269105, "grad_norm": 0.5402945280075073, "learning_rate": 2.9440389294403892e-05, "loss": 0.4477, "step": 301 }, { "epoch": 0.7184061849539102, "grad_norm": 0.5225419402122498, "learning_rate": 2.91970802919708e-05, "loss": 0.4112, "step": 302 }, { "epoch": 0.72078501338091, "grad_norm": 0.516687273979187, "learning_rate": 2.8953771289537713e-05, "loss": 0.51, "step": 303 }, { "epoch": 0.7231638418079096, "grad_norm": 0.5380510091781616, "learning_rate": 2.871046228710462e-05, "loss": 0.3956, "step": 304 }, { "epoch": 0.7255426702349093, "grad_norm": 0.5149944424629211, "learning_rate": 2.8467153284671533e-05, "loss": 0.4596, "step": 305 }, { "epoch": 0.727921498661909, "grad_norm": 0.5109550356864929, "learning_rate": 2.822384428223844e-05, "loss": 0.4538, "step": 306 }, { "epoch": 0.7303003270889087, "grad_norm": 0.4751725196838379, "learning_rate": 2.7980535279805354e-05, "loss": 0.3858, "step": 307 }, { "epoch": 0.7326791555159085, "grad_norm": 0.5320805907249451, "learning_rate": 2.7737226277372262e-05, "loss": 0.4388, "step": 308 }, { "epoch": 0.7350579839429081, "grad_norm": 0.4881283938884735, "learning_rate": 2.7493917274939174e-05, "loss": 0.3787, "step": 309 }, { "epoch": 0.7374368123699078, "grad_norm": 0.5177855491638184, "learning_rate": 2.7250608272506083e-05, "loss": 0.4489, "step": 310 }, { "epoch": 0.7398156407969075, "grad_norm": 0.4821816682815552, "learning_rate": 2.7007299270072995e-05, "loss": 0.4178, "step": 311 }, { "epoch": 0.7421944692239072, "grad_norm": 0.5222703218460083, "learning_rate": 2.6763990267639903e-05, "loss": 0.3892, "step": 312 }, { "epoch": 0.744573297650907, "grad_norm": 0.554428219795227, "learning_rate": 2.6520681265206815e-05, "loss": 0.425, "step": 313 }, { "epoch": 0.7469521260779066, "grad_norm": 0.47558993101119995, "learning_rate": 2.6277372262773724e-05, "loss": 0.4414, "step": 314 }, { "epoch": 0.7493309545049064, "grad_norm": 0.5236990451812744, "learning_rate": 2.6034063260340636e-05, "loss": 0.4194, "step": 315 }, { "epoch": 0.751709782931906, "grad_norm": 0.5379797220230103, "learning_rate": 2.5790754257907544e-05, "loss": 0.4434, "step": 316 }, { "epoch": 0.7540886113589057, "grad_norm": 0.7105163931846619, "learning_rate": 2.5547445255474456e-05, "loss": 0.6032, "step": 317 }, { "epoch": 0.7564674397859055, "grad_norm": 0.5832124352455139, "learning_rate": 2.5304136253041365e-05, "loss": 0.5152, "step": 318 }, { "epoch": 0.7588462682129051, "grad_norm": 0.5089643597602844, "learning_rate": 2.5060827250608277e-05, "loss": 0.4333, "step": 319 }, { "epoch": 0.7612250966399049, "grad_norm": 0.509891152381897, "learning_rate": 2.4817518248175185e-05, "loss": 0.4357, "step": 320 }, { "epoch": 0.7636039250669046, "grad_norm": 0.6722425222396851, "learning_rate": 2.4574209245742094e-05, "loss": 0.5146, "step": 321 }, { "epoch": 0.7659827534939042, "grad_norm": 0.5415557622909546, "learning_rate": 2.4330900243309006e-05, "loss": 0.5069, "step": 322 }, { "epoch": 0.768361581920904, "grad_norm": 0.4898138642311096, "learning_rate": 2.4087591240875914e-05, "loss": 0.4025, "step": 323 }, { "epoch": 0.7707404103479036, "grad_norm": 0.47411227226257324, "learning_rate": 2.3844282238442823e-05, "loss": 0.4339, "step": 324 }, { "epoch": 0.7731192387749034, "grad_norm": 0.5482645034790039, "learning_rate": 2.360097323600973e-05, "loss": 0.4023, "step": 325 }, { "epoch": 0.7754980672019031, "grad_norm": 0.4828038513660431, "learning_rate": 2.3357664233576643e-05, "loss": 0.4683, "step": 326 }, { "epoch": 0.7778768956289027, "grad_norm": 0.47106465697288513, "learning_rate": 2.3114355231143552e-05, "loss": 0.3926, "step": 327 }, { "epoch": 0.7802557240559025, "grad_norm": 0.5016714930534363, "learning_rate": 2.2871046228710464e-05, "loss": 0.3897, "step": 328 }, { "epoch": 0.7826345524829021, "grad_norm": 0.48692432045936584, "learning_rate": 2.2627737226277372e-05, "loss": 0.4213, "step": 329 }, { "epoch": 0.7850133809099019, "grad_norm": 0.625466525554657, "learning_rate": 2.2384428223844284e-05, "loss": 0.4761, "step": 330 }, { "epoch": 0.7873922093369016, "grad_norm": 0.5269051194190979, "learning_rate": 2.2141119221411193e-05, "loss": 0.4216, "step": 331 }, { "epoch": 0.7897710377639012, "grad_norm": 0.6055566072463989, "learning_rate": 2.1897810218978105e-05, "loss": 0.5351, "step": 332 }, { "epoch": 0.792149866190901, "grad_norm": 0.5403903126716614, "learning_rate": 2.1654501216545013e-05, "loss": 0.4028, "step": 333 }, { "epoch": 0.7945286946179007, "grad_norm": 0.6619285941123962, "learning_rate": 2.1411192214111922e-05, "loss": 0.4291, "step": 334 }, { "epoch": 0.7969075230449004, "grad_norm": 0.48977240920066833, "learning_rate": 2.1167883211678834e-05, "loss": 0.4171, "step": 335 }, { "epoch": 0.7992863514719001, "grad_norm": 0.5986789464950562, "learning_rate": 2.0924574209245742e-05, "loss": 0.3742, "step": 336 }, { "epoch": 0.8016651798988997, "grad_norm": 0.5153005719184875, "learning_rate": 2.0681265206812654e-05, "loss": 0.4147, "step": 337 }, { "epoch": 0.8040440083258995, "grad_norm": 0.5173148512840271, "learning_rate": 2.0437956204379563e-05, "loss": 0.4648, "step": 338 }, { "epoch": 0.8064228367528992, "grad_norm": 0.5250167846679688, "learning_rate": 2.0194647201946475e-05, "loss": 0.4919, "step": 339 }, { "epoch": 0.8088016651798989, "grad_norm": 0.5532633066177368, "learning_rate": 1.9951338199513383e-05, "loss": 0.3462, "step": 340 }, { "epoch": 0.8111804936068986, "grad_norm": 0.4855538308620453, "learning_rate": 1.9708029197080295e-05, "loss": 0.4902, "step": 341 }, { "epoch": 0.8135593220338984, "grad_norm": 0.597204864025116, "learning_rate": 1.9464720194647204e-05, "loss": 0.4741, "step": 342 }, { "epoch": 0.815938150460898, "grad_norm": 0.5064167380332947, "learning_rate": 1.9221411192214112e-05, "loss": 0.4488, "step": 343 }, { "epoch": 0.8183169788878977, "grad_norm": 0.5640056133270264, "learning_rate": 1.897810218978102e-05, "loss": 0.4408, "step": 344 }, { "epoch": 0.8206958073148974, "grad_norm": 0.5452293753623962, "learning_rate": 1.8734793187347933e-05, "loss": 0.4406, "step": 345 }, { "epoch": 0.8230746357418971, "grad_norm": 0.5527360439300537, "learning_rate": 1.849148418491484e-05, "loss": 0.432, "step": 346 }, { "epoch": 0.8254534641688969, "grad_norm": 0.5353720784187317, "learning_rate": 1.8248175182481753e-05, "loss": 0.4195, "step": 347 }, { "epoch": 0.8278322925958965, "grad_norm": 0.4622812569141388, "learning_rate": 1.8004866180048662e-05, "loss": 0.4146, "step": 348 }, { "epoch": 0.8302111210228962, "grad_norm": 0.5472270250320435, "learning_rate": 1.7761557177615574e-05, "loss": 0.4521, "step": 349 }, { "epoch": 0.832589949449896, "grad_norm": 0.5854270458221436, "learning_rate": 1.7518248175182482e-05, "loss": 0.3974, "step": 350 }, { "epoch": 0.8349687778768956, "grad_norm": 0.4779118597507477, "learning_rate": 1.7274939172749394e-05, "loss": 0.3683, "step": 351 }, { "epoch": 0.8373476063038954, "grad_norm": 0.5334904789924622, "learning_rate": 1.70316301703163e-05, "loss": 0.4319, "step": 352 }, { "epoch": 0.839726434730895, "grad_norm": 0.49920886754989624, "learning_rate": 1.678832116788321e-05, "loss": 0.4115, "step": 353 }, { "epoch": 0.8421052631578947, "grad_norm": 0.6039828658103943, "learning_rate": 1.654501216545012e-05, "loss": 0.4133, "step": 354 }, { "epoch": 0.8444840915848945, "grad_norm": 0.5769008994102478, "learning_rate": 1.6301703163017032e-05, "loss": 0.4316, "step": 355 }, { "epoch": 0.8468629200118941, "grad_norm": 0.5622073411941528, "learning_rate": 1.605839416058394e-05, "loss": 0.5002, "step": 356 }, { "epoch": 0.8492417484388939, "grad_norm": 0.5158018469810486, "learning_rate": 1.5815085158150852e-05, "loss": 0.3537, "step": 357 }, { "epoch": 0.8516205768658935, "grad_norm": 0.5640501379966736, "learning_rate": 1.557177615571776e-05, "loss": 0.5049, "step": 358 }, { "epoch": 0.8539994052928932, "grad_norm": 0.5024840831756592, "learning_rate": 1.5328467153284673e-05, "loss": 0.4185, "step": 359 }, { "epoch": 0.856378233719893, "grad_norm": 0.5789101123809814, "learning_rate": 1.5085158150851583e-05, "loss": 0.4016, "step": 360 }, { "epoch": 0.8587570621468926, "grad_norm": 0.5746119022369385, "learning_rate": 1.4841849148418493e-05, "loss": 0.4421, "step": 361 }, { "epoch": 0.8611358905738924, "grad_norm": 0.5970929861068726, "learning_rate": 1.45985401459854e-05, "loss": 0.5766, "step": 362 }, { "epoch": 0.863514719000892, "grad_norm": 0.5322396755218506, "learning_rate": 1.435523114355231e-05, "loss": 0.4418, "step": 363 }, { "epoch": 0.8658935474278918, "grad_norm": 0.5240874886512756, "learning_rate": 1.411192214111922e-05, "loss": 0.461, "step": 364 }, { "epoch": 0.8682723758548915, "grad_norm": 0.6501151919364929, "learning_rate": 1.3868613138686131e-05, "loss": 0.3628, "step": 365 }, { "epoch": 0.8706512042818911, "grad_norm": 0.5303117036819458, "learning_rate": 1.3625304136253041e-05, "loss": 0.441, "step": 366 }, { "epoch": 0.8730300327088909, "grad_norm": 0.48986294865608215, "learning_rate": 1.3381995133819952e-05, "loss": 0.4371, "step": 367 }, { "epoch": 0.8754088611358906, "grad_norm": 0.5813177824020386, "learning_rate": 1.3138686131386862e-05, "loss": 0.4787, "step": 368 }, { "epoch": 0.8777876895628903, "grad_norm": 0.5057734251022339, "learning_rate": 1.2895377128953772e-05, "loss": 0.3852, "step": 369 }, { "epoch": 0.88016651798989, "grad_norm": 0.5309827923774719, "learning_rate": 1.2652068126520682e-05, "loss": 0.4518, "step": 370 }, { "epoch": 0.8825453464168896, "grad_norm": 0.5106115341186523, "learning_rate": 1.2408759124087593e-05, "loss": 0.414, "step": 371 }, { "epoch": 0.8849241748438894, "grad_norm": 0.6071767210960388, "learning_rate": 1.2165450121654503e-05, "loss": 0.4988, "step": 372 }, { "epoch": 0.8873030032708891, "grad_norm": 0.5328472852706909, "learning_rate": 1.1922141119221411e-05, "loss": 0.5115, "step": 373 }, { "epoch": 0.8896818316978888, "grad_norm": 0.4671655595302582, "learning_rate": 1.1678832116788322e-05, "loss": 0.3692, "step": 374 }, { "epoch": 0.8920606601248885, "grad_norm": 0.5364991426467896, "learning_rate": 1.1435523114355232e-05, "loss": 0.369, "step": 375 }, { "epoch": 0.8944394885518882, "grad_norm": 0.5912413597106934, "learning_rate": 1.1192214111922142e-05, "loss": 0.4404, "step": 376 }, { "epoch": 0.8968183169788879, "grad_norm": 0.54830402135849, "learning_rate": 1.0948905109489052e-05, "loss": 0.3858, "step": 377 }, { "epoch": 0.8991971454058876, "grad_norm": 0.5137721300125122, "learning_rate": 1.0705596107055961e-05, "loss": 0.3976, "step": 378 }, { "epoch": 0.9015759738328873, "grad_norm": 0.5718546509742737, "learning_rate": 1.0462287104622871e-05, "loss": 0.4714, "step": 379 }, { "epoch": 0.903954802259887, "grad_norm": 0.5441231727600098, "learning_rate": 1.0218978102189781e-05, "loss": 0.4706, "step": 380 }, { "epoch": 0.9063336306868867, "grad_norm": 0.5782937407493591, "learning_rate": 9.975669099756692e-06, "loss": 0.4758, "step": 381 }, { "epoch": 0.9087124591138864, "grad_norm": 0.5077878832817078, "learning_rate": 9.732360097323602e-06, "loss": 0.4739, "step": 382 }, { "epoch": 0.9110912875408861, "grad_norm": 0.5690765380859375, "learning_rate": 9.48905109489051e-06, "loss": 0.3926, "step": 383 }, { "epoch": 0.9134701159678859, "grad_norm": 0.5629989504814148, "learning_rate": 9.24574209245742e-06, "loss": 0.4728, "step": 384 }, { "epoch": 0.9158489443948855, "grad_norm": 0.46644964814186096, "learning_rate": 9.002433090024331e-06, "loss": 0.3798, "step": 385 }, { "epoch": 0.9182277728218852, "grad_norm": 0.5197304487228394, "learning_rate": 8.759124087591241e-06, "loss": 0.4181, "step": 386 }, { "epoch": 0.9206066012488849, "grad_norm": 0.5880502462387085, "learning_rate": 8.51581508515815e-06, "loss": 0.5106, "step": 387 }, { "epoch": 0.9229854296758846, "grad_norm": 0.6154156923294067, "learning_rate": 8.27250608272506e-06, "loss": 0.4669, "step": 388 }, { "epoch": 0.9253642581028844, "grad_norm": 0.604262113571167, "learning_rate": 8.02919708029197e-06, "loss": 0.451, "step": 389 }, { "epoch": 0.927743086529884, "grad_norm": 0.5451841354370117, "learning_rate": 7.78588807785888e-06, "loss": 0.4477, "step": 390 }, { "epoch": 0.9301219149568838, "grad_norm": 0.6219511032104492, "learning_rate": 7.542579075425792e-06, "loss": 0.4633, "step": 391 }, { "epoch": 0.9325007433838834, "grad_norm": 0.5371522903442383, "learning_rate": 7.2992700729927e-06, "loss": 0.4109, "step": 392 }, { "epoch": 0.9348795718108831, "grad_norm": 0.5737317800521851, "learning_rate": 7.05596107055961e-06, "loss": 0.434, "step": 393 }, { "epoch": 0.9372584002378829, "grad_norm": 0.6237983703613281, "learning_rate": 6.812652068126521e-06, "loss": 0.4657, "step": 394 }, { "epoch": 0.9396372286648825, "grad_norm": 0.5219160914421082, "learning_rate": 6.569343065693431e-06, "loss": 0.3983, "step": 395 }, { "epoch": 0.9420160570918823, "grad_norm": 0.5657615661621094, "learning_rate": 6.326034063260341e-06, "loss": 0.3901, "step": 396 }, { "epoch": 0.944394885518882, "grad_norm": 0.6130802035331726, "learning_rate": 6.082725060827251e-06, "loss": 0.4844, "step": 397 }, { "epoch": 0.9467737139458816, "grad_norm": 0.6125937104225159, "learning_rate": 5.839416058394161e-06, "loss": 0.4312, "step": 398 }, { "epoch": 0.9491525423728814, "grad_norm": 0.5268482565879822, "learning_rate": 5.596107055961071e-06, "loss": 0.3764, "step": 399 }, { "epoch": 0.951531370799881, "grad_norm": 0.601588785648346, "learning_rate": 5.3527980535279805e-06, "loss": 0.429, "step": 400 }, { "epoch": 0.951531370799881, "eval_loss": 0.4152432680130005, "eval_runtime": 24.6197, "eval_samples_per_second": 30.382, "eval_steps_per_second": 15.191, "step": 400 }, { "epoch": 0.9539101992268808, "grad_norm": 0.6177543997764587, "learning_rate": 5.109489051094891e-06, "loss": 0.4553, "step": 401 }, { "epoch": 0.9562890276538805, "grad_norm": 0.5756453275680542, "learning_rate": 4.866180048661801e-06, "loss": 0.4026, "step": 402 }, { "epoch": 0.9586678560808801, "grad_norm": 0.575804591178894, "learning_rate": 4.62287104622871e-06, "loss": 0.4351, "step": 403 }, { "epoch": 0.9610466845078799, "grad_norm": 0.5242466926574707, "learning_rate": 4.379562043795621e-06, "loss": 0.4606, "step": 404 }, { "epoch": 0.9634255129348795, "grad_norm": 0.5496498346328735, "learning_rate": 4.13625304136253e-06, "loss": 0.5175, "step": 405 }, { "epoch": 0.9658043413618793, "grad_norm": 0.5221502184867859, "learning_rate": 3.89294403892944e-06, "loss": 0.3652, "step": 406 }, { "epoch": 0.968183169788879, "grad_norm": 0.5580674409866333, "learning_rate": 3.64963503649635e-06, "loss": 0.3914, "step": 407 }, { "epoch": 0.9705619982158786, "grad_norm": 0.5373774170875549, "learning_rate": 3.4063260340632603e-06, "loss": 0.399, "step": 408 }, { "epoch": 0.9729408266428784, "grad_norm": 0.44664666056632996, "learning_rate": 3.1630170316301706e-06, "loss": 0.3659, "step": 409 }, { "epoch": 0.975319655069878, "grad_norm": 0.5495840311050415, "learning_rate": 2.9197080291970804e-06, "loss": 0.4548, "step": 410 }, { "epoch": 0.9776984834968778, "grad_norm": 0.6554842591285706, "learning_rate": 2.6763990267639902e-06, "loss": 0.3963, "step": 411 }, { "epoch": 0.9800773119238775, "grad_norm": 0.5366389155387878, "learning_rate": 2.4330900243309005e-06, "loss": 0.391, "step": 412 }, { "epoch": 0.9824561403508771, "grad_norm": 0.5802214741706848, "learning_rate": 2.1897810218978103e-06, "loss": 0.4397, "step": 413 }, { "epoch": 0.9848349687778769, "grad_norm": 0.5553064942359924, "learning_rate": 1.94647201946472e-06, "loss": 0.384, "step": 414 }, { "epoch": 0.9872137972048766, "grad_norm": 0.5031337141990662, "learning_rate": 1.7031630170316302e-06, "loss": 0.3638, "step": 415 }, { "epoch": 0.9895926256318763, "grad_norm": 0.5195873379707336, "learning_rate": 1.4598540145985402e-06, "loss": 0.4275, "step": 416 }, { "epoch": 0.991971454058876, "grad_norm": 0.5617223978042603, "learning_rate": 1.2165450121654502e-06, "loss": 0.4383, "step": 417 }, { "epoch": 0.9943502824858758, "grad_norm": 0.5391303300857544, "learning_rate": 9.7323600973236e-07, "loss": 0.5038, "step": 418 }, { "epoch": 0.9967291109128754, "grad_norm": 0.5291772484779358, "learning_rate": 7.299270072992701e-07, "loss": 0.4119, "step": 419 }, { "epoch": 0.9991079393398751, "grad_norm": 0.52870112657547, "learning_rate": 4.8661800486618e-07, "loss": 0.4272, "step": 420 }, { "epoch": 1.0, "grad_norm": 0.8552002310752869, "learning_rate": 2.4330900243309e-07, "loss": 0.3481, "step": 421 } ], "logging_steps": 1, "max_steps": 421, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4660818389684224e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }