| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.0, |
| "eval_steps": 100, |
| "global_step": 1684, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0023788284269997025, |
| "grad_norm": 1.1563122272491455, |
| "learning_rate": 0.0, |
| "loss": 2.0206, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.004757656853999405, |
| "grad_norm": 1.4585579633712769, |
| "learning_rate": 1e-05, |
| "loss": 2.2693, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.007136485280999108, |
| "grad_norm": 1.5164328813552856, |
| "learning_rate": 2e-05, |
| "loss": 2.3356, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.00951531370799881, |
| "grad_norm": 0.9573532938957214, |
| "learning_rate": 3e-05, |
| "loss": 1.6731, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.011894142134998514, |
| "grad_norm": 1.3015975952148438, |
| "learning_rate": 4e-05, |
| "loss": 2.0563, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.014272970561998216, |
| "grad_norm": 1.1276603937149048, |
| "learning_rate": 5e-05, |
| "loss": 1.7849, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.016651798988997917, |
| "grad_norm": 1.1689510345458984, |
| "learning_rate": 6e-05, |
| "loss": 1.8166, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.01903062741599762, |
| "grad_norm": 0.9140409827232361, |
| "learning_rate": 7e-05, |
| "loss": 1.5923, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.021409455842997322, |
| "grad_norm": 0.8196120262145996, |
| "learning_rate": 8e-05, |
| "loss": 1.4652, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.023788284269997028, |
| "grad_norm": 0.5995307564735413, |
| "learning_rate": 9e-05, |
| "loss": 1.2367, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.02616711269699673, |
| "grad_norm": 0.7658697366714478, |
| "learning_rate": 0.0001, |
| "loss": 1.3004, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.028545941123996433, |
| "grad_norm": 0.6701432466506958, |
| "learning_rate": 9.994026284348866e-05, |
| "loss": 1.1174, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.030924769550996135, |
| "grad_norm": 0.7277427315711975, |
| "learning_rate": 9.98805256869773e-05, |
| "loss": 1.1932, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.033303597977995834, |
| "grad_norm": 0.6935728788375854, |
| "learning_rate": 9.982078853046596e-05, |
| "loss": 1.1501, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.03568242640499554, |
| "grad_norm": 0.7421849370002747, |
| "learning_rate": 9.97610513739546e-05, |
| "loss": 1.0886, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.03806125483199524, |
| "grad_norm": 0.7694389820098877, |
| "learning_rate": 9.970131421744326e-05, |
| "loss": 1.068, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.040440083258994945, |
| "grad_norm": 0.7087035775184631, |
| "learning_rate": 9.96415770609319e-05, |
| "loss": 0.9678, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.042818911685994644, |
| "grad_norm": 0.660852313041687, |
| "learning_rate": 9.958183990442056e-05, |
| "loss": 0.9252, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.04519774011299435, |
| "grad_norm": 0.6802922487258911, |
| "learning_rate": 9.952210274790921e-05, |
| "loss": 0.9095, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.047576568539994056, |
| "grad_norm": 0.5777844786643982, |
| "learning_rate": 9.946236559139786e-05, |
| "loss": 0.9658, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.049955396966993755, |
| "grad_norm": 0.4977063238620758, |
| "learning_rate": 9.940262843488651e-05, |
| "loss": 0.8399, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.05233422539399346, |
| "grad_norm": 0.601166307926178, |
| "learning_rate": 9.934289127837514e-05, |
| "loss": 0.767, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.05471305382099316, |
| "grad_norm": 0.5210549235343933, |
| "learning_rate": 9.928315412186381e-05, |
| "loss": 0.8259, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.057091882247992866, |
| "grad_norm": 0.5965569615364075, |
| "learning_rate": 9.922341696535246e-05, |
| "loss": 0.9172, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.059470710674992565, |
| "grad_norm": 0.5033414959907532, |
| "learning_rate": 9.916367980884111e-05, |
| "loss": 0.8809, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.06184953910199227, |
| "grad_norm": 0.5232973098754883, |
| "learning_rate": 9.910394265232975e-05, |
| "loss": 0.7822, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.06422836752899197, |
| "grad_norm": 0.47480395436286926, |
| "learning_rate": 9.90442054958184e-05, |
| "loss": 0.6984, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.06660719595599167, |
| "grad_norm": 0.5224344730377197, |
| "learning_rate": 9.898446833930706e-05, |
| "loss": 0.6915, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.06898602438299138, |
| "grad_norm": 0.4992648661136627, |
| "learning_rate": 9.892473118279571e-05, |
| "loss": 0.685, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.07136485280999108, |
| "grad_norm": 0.4528586268424988, |
| "learning_rate": 9.886499402628435e-05, |
| "loss": 0.6188, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.07374368123699078, |
| "grad_norm": 0.49073758721351624, |
| "learning_rate": 9.8805256869773e-05, |
| "loss": 0.581, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.07612250966399048, |
| "grad_norm": 0.5179185271263123, |
| "learning_rate": 9.874551971326166e-05, |
| "loss": 0.6578, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.07850133809099019, |
| "grad_norm": 0.49443480372428894, |
| "learning_rate": 9.868578255675031e-05, |
| "loss": 0.6404, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.08088016651798989, |
| "grad_norm": 0.4802263081073761, |
| "learning_rate": 9.862604540023895e-05, |
| "loss": 0.5601, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.08325899494498959, |
| "grad_norm": 0.43969476222991943, |
| "learning_rate": 9.85663082437276e-05, |
| "loss": 0.5594, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.08563782337198929, |
| "grad_norm": 0.40470626950263977, |
| "learning_rate": 9.850657108721625e-05, |
| "loss": 0.5224, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.088016651798989, |
| "grad_norm": 0.48150962591171265, |
| "learning_rate": 9.844683393070491e-05, |
| "loss": 0.6268, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.0903954802259887, |
| "grad_norm": 0.49091798067092896, |
| "learning_rate": 9.838709677419355e-05, |
| "loss": 0.6127, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.0927743086529884, |
| "grad_norm": 0.40109407901763916, |
| "learning_rate": 9.83273596176822e-05, |
| "loss": 0.5474, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.09515313707998811, |
| "grad_norm": 0.4158681333065033, |
| "learning_rate": 9.826762246117085e-05, |
| "loss": 0.5739, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.09753196550698781, |
| "grad_norm": 0.4261043667793274, |
| "learning_rate": 9.820788530465951e-05, |
| "loss": 0.5421, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.09991079393398751, |
| "grad_norm": 0.3990945816040039, |
| "learning_rate": 9.814814814814815e-05, |
| "loss": 0.5748, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.10228962236098721, |
| "grad_norm": 0.45243942737579346, |
| "learning_rate": 9.80884109916368e-05, |
| "loss": 0.6495, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.10466845078798692, |
| "grad_norm": 0.40185514092445374, |
| "learning_rate": 9.802867383512545e-05, |
| "loss": 0.5058, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.10704727921498662, |
| "grad_norm": 0.5162461996078491, |
| "learning_rate": 9.79689366786141e-05, |
| "loss": 0.6102, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.10942610764198632, |
| "grad_norm": 0.4457720220088959, |
| "learning_rate": 9.790919952210275e-05, |
| "loss": 0.5633, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.11180493606898602, |
| "grad_norm": 0.4560127556324005, |
| "learning_rate": 9.78494623655914e-05, |
| "loss": 0.5784, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.11418376449598573, |
| "grad_norm": 0.38972005248069763, |
| "learning_rate": 9.778972520908005e-05, |
| "loss": 0.5795, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.11656259292298543, |
| "grad_norm": 0.43415090441703796, |
| "learning_rate": 9.77299880525687e-05, |
| "loss": 0.4822, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.11894142134998513, |
| "grad_norm": 0.44866281747817993, |
| "learning_rate": 9.767025089605735e-05, |
| "loss": 0.5978, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.12132024977698483, |
| "grad_norm": 0.42778265476226807, |
| "learning_rate": 9.7610513739546e-05, |
| "loss": 0.5713, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.12369907820398454, |
| "grad_norm": 0.37236231565475464, |
| "learning_rate": 9.755077658303465e-05, |
| "loss": 0.5136, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.12607790663098423, |
| "grad_norm": 0.4558245837688446, |
| "learning_rate": 9.74910394265233e-05, |
| "loss": 0.5411, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.12845673505798394, |
| "grad_norm": 0.410610169172287, |
| "learning_rate": 9.743130227001195e-05, |
| "loss": 0.5581, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.13083556348498365, |
| "grad_norm": 0.4050186276435852, |
| "learning_rate": 9.73715651135006e-05, |
| "loss": 0.4935, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.13321439191198334, |
| "grad_norm": 0.45527154207229614, |
| "learning_rate": 9.731182795698925e-05, |
| "loss": 0.5917, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.13559322033898305, |
| "grad_norm": 0.4652462899684906, |
| "learning_rate": 9.72520908004779e-05, |
| "loss": 0.5731, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.13797204876598276, |
| "grad_norm": 0.4191625118255615, |
| "learning_rate": 9.719235364396656e-05, |
| "loss": 0.5287, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.14035087719298245, |
| "grad_norm": 0.5013799071311951, |
| "learning_rate": 9.713261648745519e-05, |
| "loss": 0.508, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.14272970561998216, |
| "grad_norm": 0.4258359372615814, |
| "learning_rate": 9.707287933094386e-05, |
| "loss": 0.5213, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.14510853404698187, |
| "grad_norm": 0.42738595604896545, |
| "learning_rate": 9.70131421744325e-05, |
| "loss": 0.5043, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.14748736247398156, |
| "grad_norm": 0.38936150074005127, |
| "learning_rate": 9.695340501792116e-05, |
| "loss": 0.5113, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.14986619090098127, |
| "grad_norm": 0.46111366152763367, |
| "learning_rate": 9.68936678614098e-05, |
| "loss": 0.5738, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.15224501932798096, |
| "grad_norm": 0.38841429352760315, |
| "learning_rate": 9.683393070489846e-05, |
| "loss": 0.4687, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.15462384775498067, |
| "grad_norm": 0.47296616435050964, |
| "learning_rate": 9.677419354838711e-05, |
| "loss": 0.5165, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.15700267618198038, |
| "grad_norm": 0.4431455433368683, |
| "learning_rate": 9.671445639187576e-05, |
| "loss": 0.4682, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.15938150460898007, |
| "grad_norm": 0.43318790197372437, |
| "learning_rate": 9.66547192353644e-05, |
| "loss": 0.5467, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.16176033303597978, |
| "grad_norm": 0.4447453022003174, |
| "learning_rate": 9.659498207885304e-05, |
| "loss": 0.4962, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.1641391614629795, |
| "grad_norm": 0.4104563891887665, |
| "learning_rate": 9.653524492234171e-05, |
| "loss": 0.4536, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.16651798988997918, |
| "grad_norm": 0.4398542046546936, |
| "learning_rate": 9.647550776583036e-05, |
| "loss": 0.5473, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.1688968183169789, |
| "grad_norm": 0.43461278080940247, |
| "learning_rate": 9.6415770609319e-05, |
| "loss": 0.474, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.17127564674397858, |
| "grad_norm": 0.4528830051422119, |
| "learning_rate": 9.635603345280765e-05, |
| "loss": 0.5247, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.1736544751709783, |
| "grad_norm": 0.45138227939605713, |
| "learning_rate": 9.62962962962963e-05, |
| "loss": 0.542, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.176033303597978, |
| "grad_norm": 0.43729302287101746, |
| "learning_rate": 9.623655913978496e-05, |
| "loss": 0.5053, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.1784121320249777, |
| "grad_norm": 0.4513843357563019, |
| "learning_rate": 9.61768219832736e-05, |
| "loss": 0.4978, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.1807909604519774, |
| "grad_norm": 0.39364778995513916, |
| "learning_rate": 9.611708482676225e-05, |
| "loss": 0.4489, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.1831697888789771, |
| "grad_norm": 0.5211411714553833, |
| "learning_rate": 9.60573476702509e-05, |
| "loss": 0.5445, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.1855486173059768, |
| "grad_norm": 0.49370935559272766, |
| "learning_rate": 9.599761051373956e-05, |
| "loss": 0.5108, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.1879274457329765, |
| "grad_norm": 0.4482332766056061, |
| "learning_rate": 9.59378733572282e-05, |
| "loss": 0.5219, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.19030627415997622, |
| "grad_norm": 0.45065587759017944, |
| "learning_rate": 9.587813620071685e-05, |
| "loss": 0.5503, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.1926851025869759, |
| "grad_norm": 0.4308435320854187, |
| "learning_rate": 9.58183990442055e-05, |
| "loss": 0.5377, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.19506393101397562, |
| "grad_norm": 0.45612701773643494, |
| "learning_rate": 9.575866188769415e-05, |
| "loss": 0.4786, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.1974427594409753, |
| "grad_norm": 0.4834578335285187, |
| "learning_rate": 9.56989247311828e-05, |
| "loss": 0.5582, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.19982158786797502, |
| "grad_norm": 0.45820561051368713, |
| "learning_rate": 9.563918757467145e-05, |
| "loss": 0.5158, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.20220041629497473, |
| "grad_norm": 0.42592278122901917, |
| "learning_rate": 9.55794504181601e-05, |
| "loss": 0.5587, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.20457924472197442, |
| "grad_norm": 0.43931952118873596, |
| "learning_rate": 9.551971326164875e-05, |
| "loss": 0.4604, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.20695807314897413, |
| "grad_norm": 0.504015326499939, |
| "learning_rate": 9.54599761051374e-05, |
| "loss": 0.5234, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.20933690157597384, |
| "grad_norm": 0.41193896532058716, |
| "learning_rate": 9.540023894862605e-05, |
| "loss": 0.4782, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.21171573000297353, |
| "grad_norm": 0.47255855798721313, |
| "learning_rate": 9.53405017921147e-05, |
| "loss": 0.4457, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.21409455842997324, |
| "grad_norm": 0.41772446036338806, |
| "learning_rate": 9.528076463560335e-05, |
| "loss": 0.4297, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.21647338685697295, |
| "grad_norm": 0.47142553329467773, |
| "learning_rate": 9.5221027479092e-05, |
| "loss": 0.5292, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.21885221528397264, |
| "grad_norm": 0.40953025221824646, |
| "learning_rate": 9.516129032258065e-05, |
| "loss": 0.4847, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.22123104371097235, |
| "grad_norm": 0.4205390512943268, |
| "learning_rate": 9.51015531660693e-05, |
| "loss": 0.4924, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.22360987213797204, |
| "grad_norm": 0.4367603063583374, |
| "learning_rate": 9.504181600955795e-05, |
| "loss": 0.5321, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.22598870056497175, |
| "grad_norm": 0.40274521708488464, |
| "learning_rate": 9.49820788530466e-05, |
| "loss": 0.467, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.22836752899197146, |
| "grad_norm": 0.49425846338272095, |
| "learning_rate": 9.492234169653524e-05, |
| "loss": 0.5386, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.23074635741897115, |
| "grad_norm": 0.4841647744178772, |
| "learning_rate": 9.48626045400239e-05, |
| "loss": 0.4556, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.23312518584597086, |
| "grad_norm": 0.41210493445396423, |
| "learning_rate": 9.480286738351255e-05, |
| "loss": 0.4885, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.23550401427297057, |
| "grad_norm": 0.4671674072742462, |
| "learning_rate": 9.47431302270012e-05, |
| "loss": 0.5642, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.23788284269997026, |
| "grad_norm": 0.4276902973651886, |
| "learning_rate": 9.468339307048984e-05, |
| "loss": 0.4897, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.23788284269997026, |
| "eval_loss": 0.4852786660194397, |
| "eval_runtime": 27.1696, |
| "eval_samples_per_second": 27.531, |
| "eval_steps_per_second": 13.765, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.24026167112696997, |
| "grad_norm": 0.48646774888038635, |
| "learning_rate": 9.46236559139785e-05, |
| "loss": 0.4827, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.24264049955396966, |
| "grad_norm": 0.4662117063999176, |
| "learning_rate": 9.456391875746716e-05, |
| "loss": 0.4601, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.24501932798096937, |
| "grad_norm": 0.39756080508232117, |
| "learning_rate": 9.45041816009558e-05, |
| "loss": 0.4574, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.24739815640796908, |
| "grad_norm": 0.4817146062850952, |
| "learning_rate": 9.444444444444444e-05, |
| "loss": 0.5051, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.24977698483496877, |
| "grad_norm": 0.5022764205932617, |
| "learning_rate": 9.438470728793309e-05, |
| "loss": 0.4912, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.25215581326196845, |
| "grad_norm": 0.4829760193824768, |
| "learning_rate": 9.432497013142176e-05, |
| "loss": 0.4778, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.25453464168896817, |
| "grad_norm": 0.4534595310688019, |
| "learning_rate": 9.42652329749104e-05, |
| "loss": 0.5318, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.2569134701159679, |
| "grad_norm": 0.48831233382225037, |
| "learning_rate": 9.420549581839904e-05, |
| "loss": 0.4606, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.2592922985429676, |
| "grad_norm": 0.4774167835712433, |
| "learning_rate": 9.41457586618877e-05, |
| "loss": 0.5262, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.2616711269699673, |
| "grad_norm": 0.46727365255355835, |
| "learning_rate": 9.408602150537636e-05, |
| "loss": 0.4316, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.264049955396967, |
| "grad_norm": 0.547978401184082, |
| "learning_rate": 9.402628434886501e-05, |
| "loss": 0.5375, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.2664287838239667, |
| "grad_norm": 0.5053251385688782, |
| "learning_rate": 9.396654719235364e-05, |
| "loss": 0.4526, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.2688076122509664, |
| "grad_norm": 0.5039639472961426, |
| "learning_rate": 9.39068100358423e-05, |
| "loss": 0.5364, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.2711864406779661, |
| "grad_norm": 0.5594891905784607, |
| "learning_rate": 9.384707287933095e-05, |
| "loss": 0.5086, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.2735652691049658, |
| "grad_norm": 0.45772382616996765, |
| "learning_rate": 9.378733572281961e-05, |
| "loss": 0.5296, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.2759440975319655, |
| "grad_norm": 0.5358996987342834, |
| "learning_rate": 9.372759856630825e-05, |
| "loss": 0.5925, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.2783229259589652, |
| "grad_norm": 0.4769960343837738, |
| "learning_rate": 9.36678614097969e-05, |
| "loss": 0.5323, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.2807017543859649, |
| "grad_norm": 0.4564250409603119, |
| "learning_rate": 9.360812425328555e-05, |
| "loss": 0.4207, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.2830805828129646, |
| "grad_norm": 0.4692419469356537, |
| "learning_rate": 9.35483870967742e-05, |
| "loss": 0.4771, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.2854594112399643, |
| "grad_norm": 0.42338937520980835, |
| "learning_rate": 9.348864994026285e-05, |
| "loss": 0.4622, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.28783823966696404, |
| "grad_norm": 0.46972939372062683, |
| "learning_rate": 9.34289127837515e-05, |
| "loss": 0.5536, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.29021706809396375, |
| "grad_norm": 0.4813671112060547, |
| "learning_rate": 9.336917562724015e-05, |
| "loss": 0.4483, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.2925958965209634, |
| "grad_norm": 0.4687402546405792, |
| "learning_rate": 9.33094384707288e-05, |
| "loss": 0.5196, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.2949747249479631, |
| "grad_norm": 0.48412230610847473, |
| "learning_rate": 9.324970131421745e-05, |
| "loss": 0.583, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.29735355337496283, |
| "grad_norm": 0.4591488838195801, |
| "learning_rate": 9.31899641577061e-05, |
| "loss": 0.5234, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.29973238180196254, |
| "grad_norm": 0.41953685879707336, |
| "learning_rate": 9.313022700119475e-05, |
| "loss": 0.4022, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.30211121022896226, |
| "grad_norm": 0.4601673185825348, |
| "learning_rate": 9.30704898446834e-05, |
| "loss": 0.4862, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.3044900386559619, |
| "grad_norm": 0.5358420014381409, |
| "learning_rate": 9.301075268817204e-05, |
| "loss": 0.5561, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.3068688670829616, |
| "grad_norm": 0.4296037256717682, |
| "learning_rate": 9.29510155316607e-05, |
| "loss": 0.3769, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.30924769550996134, |
| "grad_norm": 0.461069792509079, |
| "learning_rate": 9.289127837514935e-05, |
| "loss": 0.4367, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.31162652393696105, |
| "grad_norm": 0.45425426959991455, |
| "learning_rate": 9.2831541218638e-05, |
| "loss": 0.453, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.31400535236396077, |
| "grad_norm": 0.4828135371208191, |
| "learning_rate": 9.277180406212664e-05, |
| "loss": 0.366, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.3163841807909605, |
| "grad_norm": 0.5032231211662292, |
| "learning_rate": 9.27120669056153e-05, |
| "loss": 0.5314, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.31876300921796014, |
| "grad_norm": 0.42812997102737427, |
| "learning_rate": 9.265232974910395e-05, |
| "loss": 0.4845, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.32114183764495985, |
| "grad_norm": 0.4545380473136902, |
| "learning_rate": 9.25925925925926e-05, |
| "loss": 0.474, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.32352066607195956, |
| "grad_norm": 0.4786789119243622, |
| "learning_rate": 9.253285543608124e-05, |
| "loss": 0.5126, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.3258994944989593, |
| "grad_norm": 0.48447903990745544, |
| "learning_rate": 9.247311827956989e-05, |
| "loss": 0.5048, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.328278322925959, |
| "grad_norm": 0.4667385518550873, |
| "learning_rate": 9.241338112305855e-05, |
| "loss": 0.4826, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.33065715135295864, |
| "grad_norm": 0.5055387616157532, |
| "learning_rate": 9.23536439665472e-05, |
| "loss": 0.4674, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.33303597977995836, |
| "grad_norm": 0.5160491466522217, |
| "learning_rate": 9.229390681003584e-05, |
| "loss": 0.4607, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.33541480820695807, |
| "grad_norm": 0.3776059150695801, |
| "learning_rate": 9.223416965352449e-05, |
| "loss": 0.3956, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.3377936366339578, |
| "grad_norm": 0.4551769196987152, |
| "learning_rate": 9.217443249701314e-05, |
| "loss": 0.462, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.3401724650609575, |
| "grad_norm": 0.4852535128593445, |
| "learning_rate": 9.21146953405018e-05, |
| "loss": 0.4803, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.34255129348795715, |
| "grad_norm": 0.5050077438354492, |
| "learning_rate": 9.205495818399044e-05, |
| "loss": 0.5, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.34493012191495687, |
| "grad_norm": 0.46231594681739807, |
| "learning_rate": 9.199522102747909e-05, |
| "loss": 0.4652, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.3473089503419566, |
| "grad_norm": 0.434261679649353, |
| "learning_rate": 9.193548387096774e-05, |
| "loss": 0.4542, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.3496877787689563, |
| "grad_norm": 0.44038939476013184, |
| "learning_rate": 9.18757467144564e-05, |
| "loss": 0.4922, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.352066607195956, |
| "grad_norm": 0.48634183406829834, |
| "learning_rate": 9.181600955794504e-05, |
| "loss": 0.4709, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.3544454356229557, |
| "grad_norm": 0.47852379083633423, |
| "learning_rate": 9.175627240143369e-05, |
| "loss": 0.5384, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.3568242640499554, |
| "grad_norm": 0.48153775930404663, |
| "learning_rate": 9.169653524492234e-05, |
| "loss": 0.4515, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3592030924769551, |
| "grad_norm": 0.4894790053367615, |
| "learning_rate": 9.163679808841099e-05, |
| "loss": 0.54, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.3615819209039548, |
| "grad_norm": 0.5082180500030518, |
| "learning_rate": 9.157706093189964e-05, |
| "loss": 0.4483, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.3639607493309545, |
| "grad_norm": 0.4721032381057739, |
| "learning_rate": 9.15173237753883e-05, |
| "loss": 0.5123, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.3663395777579542, |
| "grad_norm": 0.4319252371788025, |
| "learning_rate": 9.145758661887694e-05, |
| "loss": 0.4481, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.3687184061849539, |
| "grad_norm": 0.48786380887031555, |
| "learning_rate": 9.13978494623656e-05, |
| "loss": 0.4753, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.3710972346119536, |
| "grad_norm": 0.5081771016120911, |
| "learning_rate": 9.133811230585424e-05, |
| "loss": 0.5056, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.3734760630389533, |
| "grad_norm": 0.4977443218231201, |
| "learning_rate": 9.12783751493429e-05, |
| "loss": 0.4685, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.375854891465953, |
| "grad_norm": 0.4570608139038086, |
| "learning_rate": 9.121863799283154e-05, |
| "loss": 0.3681, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.37823371989295274, |
| "grad_norm": 0.42695313692092896, |
| "learning_rate": 9.11589008363202e-05, |
| "loss": 0.4141, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.38061254831995245, |
| "grad_norm": 0.43205609917640686, |
| "learning_rate": 9.109916367980885e-05, |
| "loss": 0.4596, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.3829913767469521, |
| "grad_norm": 0.4466572701931, |
| "learning_rate": 9.10394265232975e-05, |
| "loss": 0.4247, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.3853702051739518, |
| "grad_norm": 0.4255600571632385, |
| "learning_rate": 9.097968936678615e-05, |
| "loss": 0.4557, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.38774903360095153, |
| "grad_norm": 0.4489794671535492, |
| "learning_rate": 9.09199522102748e-05, |
| "loss": 0.4372, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.39012786202795124, |
| "grad_norm": 0.4137515425682068, |
| "learning_rate": 9.086021505376345e-05, |
| "loss": 0.429, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.39250669045495096, |
| "grad_norm": 0.5630534887313843, |
| "learning_rate": 9.080047789725208e-05, |
| "loss": 0.4875, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.3948855188819506, |
| "grad_norm": 0.45907342433929443, |
| "learning_rate": 9.074074074074075e-05, |
| "loss": 0.4258, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.3972643473089503, |
| "grad_norm": 0.5133002996444702, |
| "learning_rate": 9.06810035842294e-05, |
| "loss": 0.4657, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.39964317573595004, |
| "grad_norm": 0.49119633436203003, |
| "learning_rate": 9.062126642771805e-05, |
| "loss": 0.5184, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.40202200416294975, |
| "grad_norm": 0.5076906681060791, |
| "learning_rate": 9.056152927120668e-05, |
| "loss": 0.4548, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.40440083258994947, |
| "grad_norm": 0.49528005719184875, |
| "learning_rate": 9.050179211469535e-05, |
| "loss": 0.4214, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.4067796610169492, |
| "grad_norm": 0.48812952637672424, |
| "learning_rate": 9.0442054958184e-05, |
| "loss": 0.4825, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.40915848944394884, |
| "grad_norm": 0.44896411895751953, |
| "learning_rate": 9.038231780167265e-05, |
| "loss": 0.4465, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.41153731787094855, |
| "grad_norm": 0.42637521028518677, |
| "learning_rate": 9.032258064516129e-05, |
| "loss": 0.4469, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.41391614629794826, |
| "grad_norm": 0.47798776626586914, |
| "learning_rate": 9.026284348864994e-05, |
| "loss": 0.4689, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.416294974724948, |
| "grad_norm": 0.4603348970413208, |
| "learning_rate": 9.02031063321386e-05, |
| "loss": 0.4494, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.4186738031519477, |
| "grad_norm": 0.44408583641052246, |
| "learning_rate": 9.014336917562725e-05, |
| "loss": 0.4727, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.42105263157894735, |
| "grad_norm": 0.4634898602962494, |
| "learning_rate": 9.008363201911589e-05, |
| "loss": 0.455, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.42343146000594706, |
| "grad_norm": 0.42439034581184387, |
| "learning_rate": 9.002389486260454e-05, |
| "loss": 0.4771, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.42581028843294677, |
| "grad_norm": 0.4173770248889923, |
| "learning_rate": 8.99641577060932e-05, |
| "loss": 0.4437, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.4281891168599465, |
| "grad_norm": 0.5092681050300598, |
| "learning_rate": 8.990442054958185e-05, |
| "loss": 0.4728, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.4305679452869462, |
| "grad_norm": 0.41918063163757324, |
| "learning_rate": 8.984468339307049e-05, |
| "loss": 0.4711, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.4329467737139459, |
| "grad_norm": 0.482857346534729, |
| "learning_rate": 8.978494623655914e-05, |
| "loss": 0.4661, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.43532560214094557, |
| "grad_norm": 0.44419020414352417, |
| "learning_rate": 8.972520908004779e-05, |
| "loss": 0.4264, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.4377044305679453, |
| "grad_norm": 0.4146061837673187, |
| "learning_rate": 8.966547192353645e-05, |
| "loss": 0.4652, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.440083258994945, |
| "grad_norm": 0.5836295485496521, |
| "learning_rate": 8.960573476702509e-05, |
| "loss": 0.4986, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.4424620874219447, |
| "grad_norm": 0.5084491968154907, |
| "learning_rate": 8.954599761051374e-05, |
| "loss": 0.4474, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.4448409158489444, |
| "grad_norm": 0.4597759544849396, |
| "learning_rate": 8.948626045400239e-05, |
| "loss": 0.4368, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.4472197442759441, |
| "grad_norm": 0.5420514345169067, |
| "learning_rate": 8.942652329749104e-05, |
| "loss": 0.5872, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.4495985727029438, |
| "grad_norm": 0.43812426924705505, |
| "learning_rate": 8.936678614097969e-05, |
| "loss": 0.4105, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.4519774011299435, |
| "grad_norm": 0.5247114300727844, |
| "learning_rate": 8.930704898446834e-05, |
| "loss": 0.4847, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.4543562295569432, |
| "grad_norm": 0.470450222492218, |
| "learning_rate": 8.924731182795699e-05, |
| "loss": 0.4386, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.4567350579839429, |
| "grad_norm": 0.47008588910102844, |
| "learning_rate": 8.918757467144564e-05, |
| "loss": 0.4321, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.4591138864109426, |
| "grad_norm": 0.4504205584526062, |
| "learning_rate": 8.912783751493429e-05, |
| "loss": 0.4462, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.4614927148379423, |
| "grad_norm": 0.5049585103988647, |
| "learning_rate": 8.906810035842294e-05, |
| "loss": 0.4664, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.463871543264942, |
| "grad_norm": 0.48764532804489136, |
| "learning_rate": 8.900836320191159e-05, |
| "loss": 0.439, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.4662503716919417, |
| "grad_norm": 0.40391966700553894, |
| "learning_rate": 8.894862604540024e-05, |
| "loss": 0.3806, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.46862920011894144, |
| "grad_norm": 0.46843641996383667, |
| "learning_rate": 8.888888888888889e-05, |
| "loss": 0.4332, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.47100802854594115, |
| "grad_norm": 0.5837143063545227, |
| "learning_rate": 8.882915173237754e-05, |
| "loss": 0.5652, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.4733868569729408, |
| "grad_norm": 0.49761733412742615, |
| "learning_rate": 8.87694145758662e-05, |
| "loss": 0.4827, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.4757656853999405, |
| "grad_norm": 0.43618375062942505, |
| "learning_rate": 8.870967741935484e-05, |
| "loss": 0.4084, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4757656853999405, |
| "eval_loss": 0.4430273473262787, |
| "eval_runtime": 24.6904, |
| "eval_samples_per_second": 30.295, |
| "eval_steps_per_second": 15.148, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.47814451382694023, |
| "grad_norm": 0.5256602168083191, |
| "learning_rate": 8.86499402628435e-05, |
| "loss": 0.5669, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.48052334225393994, |
| "grad_norm": 0.46762749552726746, |
| "learning_rate": 8.859020310633214e-05, |
| "loss": 0.4767, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.48290217068093966, |
| "grad_norm": 0.44799891114234924, |
| "learning_rate": 8.85304659498208e-05, |
| "loss": 0.4338, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.4852809991079393, |
| "grad_norm": 0.5003095269203186, |
| "learning_rate": 8.847072879330945e-05, |
| "loss": 0.409, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.487659827534939, |
| "grad_norm": 0.4540422856807709, |
| "learning_rate": 8.84109916367981e-05, |
| "loss": 0.415, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.49003865596193874, |
| "grad_norm": 0.5260767936706543, |
| "learning_rate": 8.835125448028673e-05, |
| "loss": 0.4591, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.49241748438893845, |
| "grad_norm": 0.5634750723838806, |
| "learning_rate": 8.82915173237754e-05, |
| "loss": 0.4961, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.49479631281593817, |
| "grad_norm": 0.5199817419052124, |
| "learning_rate": 8.823178016726405e-05, |
| "loss": 0.4258, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.4971751412429379, |
| "grad_norm": 0.5310624241828918, |
| "learning_rate": 8.81720430107527e-05, |
| "loss": 0.5003, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.49955396966993754, |
| "grad_norm": 0.4997275769710541, |
| "learning_rate": 8.811230585424133e-05, |
| "loss": 0.5096, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.5019327980969372, |
| "grad_norm": 0.47169461846351624, |
| "learning_rate": 8.805256869772998e-05, |
| "loss": 0.3788, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.5043116265239369, |
| "grad_norm": 0.49595358967781067, |
| "learning_rate": 8.799283154121865e-05, |
| "loss": 0.5021, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.5066904549509367, |
| "grad_norm": 0.46028879284858704, |
| "learning_rate": 8.79330943847073e-05, |
| "loss": 0.4533, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.5090692833779363, |
| "grad_norm": 0.4750324785709381, |
| "learning_rate": 8.787335722819593e-05, |
| "loss": 0.4634, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.5114481118049361, |
| "grad_norm": 0.4960343539714813, |
| "learning_rate": 8.781362007168459e-05, |
| "loss": 0.4516, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.5138269402319358, |
| "grad_norm": 0.4688979983329773, |
| "learning_rate": 8.775388291517325e-05, |
| "loss": 0.4208, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.5162057686589355, |
| "grad_norm": 0.5113106966018677, |
| "learning_rate": 8.76941457586619e-05, |
| "loss": 0.4894, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.5185845970859352, |
| "grad_norm": 0.39522895216941833, |
| "learning_rate": 8.763440860215054e-05, |
| "loss": 0.4661, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.5209634255129348, |
| "grad_norm": 0.5012136101722717, |
| "learning_rate": 8.757467144563919e-05, |
| "loss": 0.4127, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.5233422539399346, |
| "grad_norm": 0.4807124435901642, |
| "learning_rate": 8.751493428912784e-05, |
| "loss": 0.3925, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.5257210823669343, |
| "grad_norm": 0.5423474311828613, |
| "learning_rate": 8.74551971326165e-05, |
| "loss": 0.4961, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.528099910793934, |
| "grad_norm": 0.48710131645202637, |
| "learning_rate": 8.739545997610514e-05, |
| "loss": 0.425, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.5304787392209337, |
| "grad_norm": 0.47647836804389954, |
| "learning_rate": 8.733572281959379e-05, |
| "loss": 0.4065, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.5328575676479334, |
| "grad_norm": 0.5683121085166931, |
| "learning_rate": 8.727598566308244e-05, |
| "loss": 0.5268, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.5352363960749331, |
| "grad_norm": 0.47146156430244446, |
| "learning_rate": 8.72162485065711e-05, |
| "loss": 0.4059, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.5376152245019328, |
| "grad_norm": 0.460894376039505, |
| "learning_rate": 8.715651135005974e-05, |
| "loss": 0.4245, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.5399940529289325, |
| "grad_norm": 0.5514392256736755, |
| "learning_rate": 8.709677419354839e-05, |
| "loss": 0.506, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.5423728813559322, |
| "grad_norm": 0.5026625394821167, |
| "learning_rate": 8.703703703703704e-05, |
| "loss": 0.4574, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.5447517097829319, |
| "grad_norm": 0.45298993587493896, |
| "learning_rate": 8.697729988052569e-05, |
| "loss": 0.3697, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.5471305382099316, |
| "grad_norm": 0.5072413682937622, |
| "learning_rate": 8.691756272401434e-05, |
| "loss": 0.4165, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.5495093666369313, |
| "grad_norm": 0.44066810607910156, |
| "learning_rate": 8.685782556750299e-05, |
| "loss": 0.4554, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.551888195063931, |
| "grad_norm": 0.513289749622345, |
| "learning_rate": 8.679808841099164e-05, |
| "loss": 0.4498, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.5542670234909307, |
| "grad_norm": 0.555833637714386, |
| "learning_rate": 8.673835125448029e-05, |
| "loss": 0.5903, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.5566458519179304, |
| "grad_norm": 0.44822368025779724, |
| "learning_rate": 8.667861409796894e-05, |
| "loss": 0.3928, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.5590246803449301, |
| "grad_norm": 0.44668492674827576, |
| "learning_rate": 8.661887694145759e-05, |
| "loss": 0.3957, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.5614035087719298, |
| "grad_norm": 0.44891655445098877, |
| "learning_rate": 8.655913978494624e-05, |
| "loss": 0.4083, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.5637823371989296, |
| "grad_norm": 0.4478503465652466, |
| "learning_rate": 8.649940262843489e-05, |
| "loss": 0.366, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.5661611656259292, |
| "grad_norm": 0.5855295658111572, |
| "learning_rate": 8.643966547192354e-05, |
| "loss": 0.4751, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.568539994052929, |
| "grad_norm": 0.536170244216919, |
| "learning_rate": 8.637992831541219e-05, |
| "loss": 0.4637, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.5709188224799286, |
| "grad_norm": 0.6756762862205505, |
| "learning_rate": 8.632019115890084e-05, |
| "loss": 0.5934, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5732976509069283, |
| "grad_norm": 0.41015246510505676, |
| "learning_rate": 8.626045400238949e-05, |
| "loss": 0.3901, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.5756764793339281, |
| "grad_norm": 0.4543169438838959, |
| "learning_rate": 8.620071684587814e-05, |
| "loss": 0.4198, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.5780553077609277, |
| "grad_norm": 0.44006866216659546, |
| "learning_rate": 8.614097968936678e-05, |
| "loss": 0.3953, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.5804341361879275, |
| "grad_norm": 0.4517867863178253, |
| "learning_rate": 8.608124253285544e-05, |
| "loss": 0.3867, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.5828129646149272, |
| "grad_norm": 0.49809014797210693, |
| "learning_rate": 8.60215053763441e-05, |
| "loss": 0.5027, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.5851917930419268, |
| "grad_norm": 0.5044611692428589, |
| "learning_rate": 8.596176821983274e-05, |
| "loss": 0.4684, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.5875706214689266, |
| "grad_norm": 0.4469461441040039, |
| "learning_rate": 8.590203106332138e-05, |
| "loss": 0.3945, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.5899494498959262, |
| "grad_norm": 0.4750162959098816, |
| "learning_rate": 8.584229390681004e-05, |
| "loss": 0.4048, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.592328278322926, |
| "grad_norm": 0.4861357808113098, |
| "learning_rate": 8.57825567502987e-05, |
| "loss": 0.4435, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.5947071067499257, |
| "grad_norm": 0.49261415004730225, |
| "learning_rate": 8.572281959378735e-05, |
| "loss": 0.4717, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5970859351769253, |
| "grad_norm": 0.5840692520141602, |
| "learning_rate": 8.566308243727598e-05, |
| "loss": 0.5115, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.5994647636039251, |
| "grad_norm": 0.4797023832798004, |
| "learning_rate": 8.560334528076463e-05, |
| "loss": 0.4224, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.6018435920309247, |
| "grad_norm": 0.5371966361999512, |
| "learning_rate": 8.55436081242533e-05, |
| "loss": 0.5191, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.6042224204579245, |
| "grad_norm": 0.450199693441391, |
| "learning_rate": 8.548387096774195e-05, |
| "loss": 0.4073, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.6066012488849242, |
| "grad_norm": 0.5445570945739746, |
| "learning_rate": 8.542413381123058e-05, |
| "loss": 0.4863, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.6089800773119238, |
| "grad_norm": 0.4752117395401001, |
| "learning_rate": 8.536439665471923e-05, |
| "loss": 0.4522, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.6113589057389236, |
| "grad_norm": 0.5417598485946655, |
| "learning_rate": 8.530465949820788e-05, |
| "loss": 0.4937, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.6137377341659233, |
| "grad_norm": 0.4615798592567444, |
| "learning_rate": 8.524492234169655e-05, |
| "loss": 0.4858, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.616116562592923, |
| "grad_norm": 0.46850234270095825, |
| "learning_rate": 8.518518518518518e-05, |
| "loss": 0.4354, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.6184953910199227, |
| "grad_norm": 0.6526990532875061, |
| "learning_rate": 8.512544802867384e-05, |
| "loss": 0.5441, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.6208742194469223, |
| "grad_norm": 0.42547082901000977, |
| "learning_rate": 8.506571087216249e-05, |
| "loss": 0.3834, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.6232530478739221, |
| "grad_norm": 0.46142813563346863, |
| "learning_rate": 8.500597371565115e-05, |
| "loss": 0.4562, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.6256318763009218, |
| "grad_norm": 0.42186999320983887, |
| "learning_rate": 8.494623655913979e-05, |
| "loss": 0.4341, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.6280107047279215, |
| "grad_norm": 0.5033466815948486, |
| "learning_rate": 8.488649940262844e-05, |
| "loss": 0.4606, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.6303895331549212, |
| "grad_norm": 0.4589903950691223, |
| "learning_rate": 8.482676224611709e-05, |
| "loss": 0.4232, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.632768361581921, |
| "grad_norm": 0.43397510051727295, |
| "learning_rate": 8.476702508960574e-05, |
| "loss": 0.4604, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.6351471900089206, |
| "grad_norm": 0.4586094319820404, |
| "learning_rate": 8.470728793309439e-05, |
| "loss": 0.4637, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.6375260184359203, |
| "grad_norm": 0.4164815843105316, |
| "learning_rate": 8.464755077658304e-05, |
| "loss": 0.3969, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.63990484686292, |
| "grad_norm": 0.5223293304443359, |
| "learning_rate": 8.458781362007169e-05, |
| "loss": 0.4753, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.6422836752899197, |
| "grad_norm": 0.4807034134864807, |
| "learning_rate": 8.452807646356034e-05, |
| "loss": 0.4249, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6446625037169195, |
| "grad_norm": 0.39427343010902405, |
| "learning_rate": 8.446833930704899e-05, |
| "loss": 0.4014, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.6470413321439191, |
| "grad_norm": 0.5013017654418945, |
| "learning_rate": 8.440860215053764e-05, |
| "loss": 0.505, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.6494201605709188, |
| "grad_norm": 0.4361181855201721, |
| "learning_rate": 8.434886499402629e-05, |
| "loss": 0.442, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.6517989889979185, |
| "grad_norm": 0.4051946699619293, |
| "learning_rate": 8.428912783751494e-05, |
| "loss": 0.3891, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.6541778174249182, |
| "grad_norm": 0.48686203360557556, |
| "learning_rate": 8.422939068100359e-05, |
| "loss": 0.5328, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.656556645851918, |
| "grad_norm": 0.4902230203151703, |
| "learning_rate": 8.416965352449224e-05, |
| "loss": 0.5243, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.6589354742789176, |
| "grad_norm": 0.45863983035087585, |
| "learning_rate": 8.410991636798089e-05, |
| "loss": 0.3835, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.6613143027059173, |
| "grad_norm": 0.5191627144813538, |
| "learning_rate": 8.405017921146954e-05, |
| "loss": 0.5338, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.6636931311329171, |
| "grad_norm": 0.47656649351119995, |
| "learning_rate": 8.399044205495819e-05, |
| "loss": 0.4346, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.6660719595599167, |
| "grad_norm": 0.5256597399711609, |
| "learning_rate": 8.393070489844683e-05, |
| "loss": 0.4918, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.6684507879869165, |
| "grad_norm": 0.5173395872116089, |
| "learning_rate": 8.387096774193549e-05, |
| "loss": 0.4374, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.6708296164139161, |
| "grad_norm": 0.4496103525161743, |
| "learning_rate": 8.381123058542414e-05, |
| "loss": 0.4269, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.6732084448409158, |
| "grad_norm": 0.46055564284324646, |
| "learning_rate": 8.375149342891279e-05, |
| "loss": 0.407, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.6755872732679156, |
| "grad_norm": 0.5227301120758057, |
| "learning_rate": 8.369175627240143e-05, |
| "loss": 0.5523, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.6779661016949152, |
| "grad_norm": 0.44740229845046997, |
| "learning_rate": 8.363201911589009e-05, |
| "loss": 0.4504, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.680344930121915, |
| "grad_norm": 0.4591153562068939, |
| "learning_rate": 8.357228195937874e-05, |
| "loss": 0.3691, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.6827237585489146, |
| "grad_norm": 0.5262241959571838, |
| "learning_rate": 8.351254480286739e-05, |
| "loss": 0.4774, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.6851025869759143, |
| "grad_norm": 0.4027566909790039, |
| "learning_rate": 8.345280764635603e-05, |
| "loss": 0.4508, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.6874814154029141, |
| "grad_norm": 0.4727931618690491, |
| "learning_rate": 8.339307048984468e-05, |
| "loss": 0.4829, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.6898602438299137, |
| "grad_norm": 0.5389280915260315, |
| "learning_rate": 8.333333333333334e-05, |
| "loss": 0.4643, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6922390722569135, |
| "grad_norm": 0.46183347702026367, |
| "learning_rate": 8.3273596176822e-05, |
| "loss": 0.348, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.6946179006839132, |
| "grad_norm": 0.4557804465293884, |
| "learning_rate": 8.321385902031063e-05, |
| "loss": 0.3903, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.6969967291109129, |
| "grad_norm": 0.49101144075393677, |
| "learning_rate": 8.315412186379928e-05, |
| "loss": 0.4898, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.6993755575379126, |
| "grad_norm": 0.4704493284225464, |
| "learning_rate": 8.309438470728795e-05, |
| "loss": 0.4569, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.7017543859649122, |
| "grad_norm": 0.42415690422058105, |
| "learning_rate": 8.30346475507766e-05, |
| "loss": 0.3992, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.704133214391912, |
| "grad_norm": 0.48090270161628723, |
| "learning_rate": 8.297491039426523e-05, |
| "loss": 0.473, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.7065120428189117, |
| "grad_norm": 0.4778149724006653, |
| "learning_rate": 8.291517323775388e-05, |
| "loss": 0.4849, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.7088908712459114, |
| "grad_norm": 0.5467464923858643, |
| "learning_rate": 8.285543608124253e-05, |
| "loss": 0.502, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.7112696996729111, |
| "grad_norm": 0.43591874837875366, |
| "learning_rate": 8.27956989247312e-05, |
| "loss": 0.4206, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.7136485280999108, |
| "grad_norm": 0.45697838068008423, |
| "learning_rate": 8.273596176821983e-05, |
| "loss": 0.4414, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7136485280999108, |
| "eval_loss": 0.42019587755203247, |
| "eval_runtime": 24.6142, |
| "eval_samples_per_second": 30.389, |
| "eval_steps_per_second": 15.195, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7160273565269105, |
| "grad_norm": 0.5003380179405212, |
| "learning_rate": 8.267622461170848e-05, |
| "loss": 0.4433, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.7184061849539102, |
| "grad_norm": 0.48988282680511475, |
| "learning_rate": 8.261648745519713e-05, |
| "loss": 0.4097, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.72078501338091, |
| "grad_norm": 0.48710161447525024, |
| "learning_rate": 8.255675029868578e-05, |
| "loss": 0.5047, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.7231638418079096, |
| "grad_norm": 0.49213966727256775, |
| "learning_rate": 8.249701314217443e-05, |
| "loss": 0.3946, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.7255426702349093, |
| "grad_norm": 0.48221632838249207, |
| "learning_rate": 8.243727598566309e-05, |
| "loss": 0.4533, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.727921498661909, |
| "grad_norm": 0.4673924446105957, |
| "learning_rate": 8.237753882915174e-05, |
| "loss": 0.4467, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.7303003270889087, |
| "grad_norm": 0.4333641827106476, |
| "learning_rate": 8.231780167264039e-05, |
| "loss": 0.3813, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.7326791555159085, |
| "grad_norm": 0.48312193155288696, |
| "learning_rate": 8.225806451612904e-05, |
| "loss": 0.4388, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.7350579839429081, |
| "grad_norm": 0.4759059250354767, |
| "learning_rate": 8.219832735961769e-05, |
| "loss": 0.3819, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.7374368123699078, |
| "grad_norm": 0.47491195797920227, |
| "learning_rate": 8.213859020310634e-05, |
| "loss": 0.4421, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.7398156407969075, |
| "grad_norm": 0.4345269799232483, |
| "learning_rate": 8.207885304659499e-05, |
| "loss": 0.4134, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.7421944692239072, |
| "grad_norm": 0.48786696791648865, |
| "learning_rate": 8.201911589008364e-05, |
| "loss": 0.3868, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.744573297650907, |
| "grad_norm": 0.5398519039154053, |
| "learning_rate": 8.195937873357229e-05, |
| "loss": 0.4272, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.7469521260779066, |
| "grad_norm": 0.4489103853702545, |
| "learning_rate": 8.189964157706094e-05, |
| "loss": 0.4376, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.7493309545049064, |
| "grad_norm": 0.4804931581020355, |
| "learning_rate": 8.183990442054959e-05, |
| "loss": 0.4128, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.751709782931906, |
| "grad_norm": 0.5021992921829224, |
| "learning_rate": 8.178016726403824e-05, |
| "loss": 0.4401, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.7540886113589057, |
| "grad_norm": 0.6202664375305176, |
| "learning_rate": 8.172043010752689e-05, |
| "loss": 0.5967, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.7564674397859055, |
| "grad_norm": 0.4981567859649658, |
| "learning_rate": 8.166069295101554e-05, |
| "loss": 0.5046, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.7588462682129051, |
| "grad_norm": 0.4543679654598236, |
| "learning_rate": 8.160095579450419e-05, |
| "loss": 0.4256, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.7612250966399049, |
| "grad_norm": 0.48019784688949585, |
| "learning_rate": 8.154121863799284e-05, |
| "loss": 0.4365, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.7636039250669046, |
| "grad_norm": 0.6639605164527893, |
| "learning_rate": 8.148148148148148e-05, |
| "loss": 0.5101, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.7659827534939042, |
| "grad_norm": 0.4873949885368347, |
| "learning_rate": 8.142174432497014e-05, |
| "loss": 0.4971, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.768361581920904, |
| "grad_norm": 0.4552108943462372, |
| "learning_rate": 8.136200716845879e-05, |
| "loss": 0.3995, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.7707404103479036, |
| "grad_norm": 0.42462024092674255, |
| "learning_rate": 8.130227001194744e-05, |
| "loss": 0.4306, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.7731192387749034, |
| "grad_norm": 0.47372832894325256, |
| "learning_rate": 8.124253285543608e-05, |
| "loss": 0.3956, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.7754980672019031, |
| "grad_norm": 0.41488945484161377, |
| "learning_rate": 8.118279569892473e-05, |
| "loss": 0.4634, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.7778768956289027, |
| "grad_norm": 0.41355305910110474, |
| "learning_rate": 8.112305854241339e-05, |
| "loss": 0.3903, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.7802557240559025, |
| "grad_norm": 0.4431304335594177, |
| "learning_rate": 8.106332138590204e-05, |
| "loss": 0.3842, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.7826345524829021, |
| "grad_norm": 0.41997721791267395, |
| "learning_rate": 8.100358422939068e-05, |
| "loss": 0.4125, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.7850133809099019, |
| "grad_norm": 0.5141931772232056, |
| "learning_rate": 8.094384707287933e-05, |
| "loss": 0.4582, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.7873922093369016, |
| "grad_norm": 0.48614996671676636, |
| "learning_rate": 8.088410991636799e-05, |
| "loss": 0.417, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.7897710377639012, |
| "grad_norm": 0.5671504139900208, |
| "learning_rate": 8.082437275985664e-05, |
| "loss": 0.5316, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.792149866190901, |
| "grad_norm": 0.483743280172348, |
| "learning_rate": 8.076463560334528e-05, |
| "loss": 0.4018, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.7945286946179007, |
| "grad_norm": 0.5026981234550476, |
| "learning_rate": 8.070489844683393e-05, |
| "loss": 0.415, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.7969075230449004, |
| "grad_norm": 0.4384533762931824, |
| "learning_rate": 8.064516129032258e-05, |
| "loss": 0.4116, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.7992863514719001, |
| "grad_norm": 0.5437987446784973, |
| "learning_rate": 8.058542413381124e-05, |
| "loss": 0.3674, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.8016651798988997, |
| "grad_norm": 0.4940624237060547, |
| "learning_rate": 8.052568697729988e-05, |
| "loss": 0.4128, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.8040440083258995, |
| "grad_norm": 0.46025657653808594, |
| "learning_rate": 8.046594982078853e-05, |
| "loss": 0.4514, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.8064228367528992, |
| "grad_norm": 0.48849228024482727, |
| "learning_rate": 8.040621266427718e-05, |
| "loss": 0.4903, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.8088016651798989, |
| "grad_norm": 0.49576497077941895, |
| "learning_rate": 8.034647550776585e-05, |
| "loss": 0.3421, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.8111804936068986, |
| "grad_norm": 0.44594496488571167, |
| "learning_rate": 8.028673835125448e-05, |
| "loss": 0.4878, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.8135593220338984, |
| "grad_norm": 0.535062849521637, |
| "learning_rate": 8.022700119474313e-05, |
| "loss": 0.4713, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.815938150460898, |
| "grad_norm": 0.4524611532688141, |
| "learning_rate": 8.016726403823178e-05, |
| "loss": 0.4416, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.8183169788878977, |
| "grad_norm": 0.5021877288818359, |
| "learning_rate": 8.010752688172043e-05, |
| "loss": 0.4265, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.8206958073148974, |
| "grad_norm": 0.5109665989875793, |
| "learning_rate": 8.004778972520908e-05, |
| "loss": 0.4438, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.8230746357418971, |
| "grad_norm": 0.46363264322280884, |
| "learning_rate": 7.998805256869773e-05, |
| "loss": 0.4209, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.8254534641688969, |
| "grad_norm": 0.47233110666275024, |
| "learning_rate": 7.992831541218638e-05, |
| "loss": 0.4135, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.8278322925958965, |
| "grad_norm": 0.42109566926956177, |
| "learning_rate": 7.986857825567503e-05, |
| "loss": 0.4168, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.8302111210228962, |
| "grad_norm": 0.4773065149784088, |
| "learning_rate": 7.980884109916368e-05, |
| "loss": 0.4437, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.832589949449896, |
| "grad_norm": 0.5352240800857544, |
| "learning_rate": 7.974910394265234e-05, |
| "loss": 0.3954, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.8349687778768956, |
| "grad_norm": 0.3983041048049927, |
| "learning_rate": 7.968936678614099e-05, |
| "loss": 0.3578, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.8373476063038954, |
| "grad_norm": 0.4744306206703186, |
| "learning_rate": 7.962962962962964e-05, |
| "loss": 0.4262, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.839726434730895, |
| "grad_norm": 0.4249997138977051, |
| "learning_rate": 7.956989247311829e-05, |
| "loss": 0.4078, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.8421052631578947, |
| "grad_norm": 0.4846518337726593, |
| "learning_rate": 7.951015531660694e-05, |
| "loss": 0.4084, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.8444840915848945, |
| "grad_norm": 0.5177565217018127, |
| "learning_rate": 7.945041816009559e-05, |
| "loss": 0.4306, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.8468629200118941, |
| "grad_norm": 0.5126465559005737, |
| "learning_rate": 7.939068100358424e-05, |
| "loss": 0.4932, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.8492417484388939, |
| "grad_norm": 0.45076677203178406, |
| "learning_rate": 7.933094384707289e-05, |
| "loss": 0.3486, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.8516205768658935, |
| "grad_norm": 0.4830567538738251, |
| "learning_rate": 7.927120669056152e-05, |
| "loss": 0.4967, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.8539994052928932, |
| "grad_norm": 0.436100572347641, |
| "learning_rate": 7.921146953405019e-05, |
| "loss": 0.4028, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.856378233719893, |
| "grad_norm": 0.5462535619735718, |
| "learning_rate": 7.915173237753884e-05, |
| "loss": 0.3865, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.8587570621468926, |
| "grad_norm": 0.5388931632041931, |
| "learning_rate": 7.909199522102749e-05, |
| "loss": 0.4424, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.8611358905738924, |
| "grad_norm": 0.5704916715621948, |
| "learning_rate": 7.903225806451613e-05, |
| "loss": 0.5744, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.863514719000892, |
| "grad_norm": 0.442672461271286, |
| "learning_rate": 7.897252090800479e-05, |
| "loss": 0.4299, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.8658935474278918, |
| "grad_norm": 0.4462336599826813, |
| "learning_rate": 7.891278375149344e-05, |
| "loss": 0.4498, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.8682723758548915, |
| "grad_norm": 0.5641991496086121, |
| "learning_rate": 7.885304659498209e-05, |
| "loss": 0.3614, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.8706512042818911, |
| "grad_norm": 0.47875943779945374, |
| "learning_rate": 7.879330943847073e-05, |
| "loss": 0.4333, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.8730300327088909, |
| "grad_norm": 0.4294171929359436, |
| "learning_rate": 7.873357228195938e-05, |
| "loss": 0.4294, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.8754088611358906, |
| "grad_norm": 0.499260812997818, |
| "learning_rate": 7.867383512544804e-05, |
| "loss": 0.4785, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.8777876895628903, |
| "grad_norm": 0.4474776089191437, |
| "learning_rate": 7.861409796893669e-05, |
| "loss": 0.3835, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.88016651798989, |
| "grad_norm": 0.4485014081001282, |
| "learning_rate": 7.855436081242533e-05, |
| "loss": 0.4452, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.8825453464168896, |
| "grad_norm": 0.45007187128067017, |
| "learning_rate": 7.849462365591398e-05, |
| "loss": 0.4115, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.8849241748438894, |
| "grad_norm": 0.5280534029006958, |
| "learning_rate": 7.843488649940263e-05, |
| "loss": 0.4891, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.8873030032708891, |
| "grad_norm": 0.45218995213508606, |
| "learning_rate": 7.837514934289129e-05, |
| "loss": 0.4952, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.8896818316978888, |
| "grad_norm": 0.4186769425868988, |
| "learning_rate": 7.831541218637993e-05, |
| "loss": 0.3688, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.8920606601248885, |
| "grad_norm": 0.45719078183174133, |
| "learning_rate": 7.825567502986858e-05, |
| "loss": 0.3713, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.8944394885518882, |
| "grad_norm": 0.5285611748695374, |
| "learning_rate": 7.819593787335723e-05, |
| "loss": 0.4358, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.8968183169788879, |
| "grad_norm": 0.4786120653152466, |
| "learning_rate": 7.81362007168459e-05, |
| "loss": 0.3769, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.8991971454058876, |
| "grad_norm": 0.42901334166526794, |
| "learning_rate": 7.807646356033453e-05, |
| "loss": 0.3876, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.9015759738328873, |
| "grad_norm": 0.5145654082298279, |
| "learning_rate": 7.801672640382318e-05, |
| "loss": 0.4675, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.903954802259887, |
| "grad_norm": 0.49353912472724915, |
| "learning_rate": 7.795698924731183e-05, |
| "loss": 0.4615, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.9063336306868867, |
| "grad_norm": 0.5090721845626831, |
| "learning_rate": 7.789725209080048e-05, |
| "loss": 0.4628, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.9087124591138864, |
| "grad_norm": 0.4594820737838745, |
| "learning_rate": 7.783751493428913e-05, |
| "loss": 0.4605, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.9110912875408861, |
| "grad_norm": 0.5224221348762512, |
| "learning_rate": 7.777777777777778e-05, |
| "loss": 0.3948, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.9134701159678859, |
| "grad_norm": 0.4966912567615509, |
| "learning_rate": 7.771804062126643e-05, |
| "loss": 0.4646, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.9158489443948855, |
| "grad_norm": 0.41586506366729736, |
| "learning_rate": 7.765830346475508e-05, |
| "loss": 0.3713, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.9182277728218852, |
| "grad_norm": 0.44495782256126404, |
| "learning_rate": 7.759856630824373e-05, |
| "loss": 0.4071, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.9206066012488849, |
| "grad_norm": 0.4939152002334595, |
| "learning_rate": 7.753882915173238e-05, |
| "loss": 0.5035, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.9229854296758846, |
| "grad_norm": 0.5020651817321777, |
| "learning_rate": 7.747909199522103e-05, |
| "loss": 0.458, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.9253642581028844, |
| "grad_norm": 0.5235796570777893, |
| "learning_rate": 7.741935483870968e-05, |
| "loss": 0.4411, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.927743086529884, |
| "grad_norm": 0.47786203026771545, |
| "learning_rate": 7.735961768219832e-05, |
| "loss": 0.4452, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.9301219149568838, |
| "grad_norm": 0.5545435547828674, |
| "learning_rate": 7.729988052568698e-05, |
| "loss": 0.4522, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.9325007433838834, |
| "grad_norm": 0.46032214164733887, |
| "learning_rate": 7.724014336917563e-05, |
| "loss": 0.3926, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.9348795718108831, |
| "grad_norm": 0.49607938528060913, |
| "learning_rate": 7.718040621266428e-05, |
| "loss": 0.4277, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.9372584002378829, |
| "grad_norm": 0.558600664138794, |
| "learning_rate": 7.712066905615292e-05, |
| "loss": 0.4564, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.9396372286648825, |
| "grad_norm": 0.47056880593299866, |
| "learning_rate": 7.706093189964157e-05, |
| "loss": 0.3932, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.9420160570918823, |
| "grad_norm": 0.4679563343524933, |
| "learning_rate": 7.700119474313024e-05, |
| "loss": 0.3767, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.944394885518882, |
| "grad_norm": 0.5144210457801819, |
| "learning_rate": 7.694145758661889e-05, |
| "loss": 0.463, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.9467737139458816, |
| "grad_norm": 0.538809597492218, |
| "learning_rate": 7.688172043010752e-05, |
| "loss": 0.422, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.9491525423728814, |
| "grad_norm": 0.4757307767868042, |
| "learning_rate": 7.682198327359617e-05, |
| "loss": 0.3726, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.951531370799881, |
| "grad_norm": 0.5315593481063843, |
| "learning_rate": 7.676224611708484e-05, |
| "loss": 0.4213, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.951531370799881, |
| "eval_loss": 0.40764564275741577, |
| "eval_runtime": 24.5873, |
| "eval_samples_per_second": 30.422, |
| "eval_steps_per_second": 15.211, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.9539101992268808, |
| "grad_norm": 0.5194830298423767, |
| "learning_rate": 7.670250896057349e-05, |
| "loss": 0.4463, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.9562890276538805, |
| "grad_norm": 0.49560976028442383, |
| "learning_rate": 7.664277180406212e-05, |
| "loss": 0.3966, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.9586678560808801, |
| "grad_norm": 0.5115331411361694, |
| "learning_rate": 7.658303464755077e-05, |
| "loss": 0.4229, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.9610466845078799, |
| "grad_norm": 0.4300745725631714, |
| "learning_rate": 7.652329749103942e-05, |
| "loss": 0.4448, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.9634255129348795, |
| "grad_norm": 0.4714706242084503, |
| "learning_rate": 7.646356033452809e-05, |
| "loss": 0.5054, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.9658043413618793, |
| "grad_norm": 0.4393133521080017, |
| "learning_rate": 7.640382317801672e-05, |
| "loss": 0.3574, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.968183169788879, |
| "grad_norm": 0.47342050075531006, |
| "learning_rate": 7.634408602150538e-05, |
| "loss": 0.3789, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.9705619982158786, |
| "grad_norm": 0.4490165114402771, |
| "learning_rate": 7.628434886499403e-05, |
| "loss": 0.386, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.9729408266428784, |
| "grad_norm": 0.3915969133377075, |
| "learning_rate": 7.622461170848269e-05, |
| "loss": 0.3605, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.975319655069878, |
| "grad_norm": 0.4768807888031006, |
| "learning_rate": 7.616487455197133e-05, |
| "loss": 0.4476, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.9776984834968778, |
| "grad_norm": 0.5957807302474976, |
| "learning_rate": 7.610513739545998e-05, |
| "loss": 0.3811, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.9800773119238775, |
| "grad_norm": 0.4522106647491455, |
| "learning_rate": 7.604540023894863e-05, |
| "loss": 0.3792, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.9824561403508771, |
| "grad_norm": 0.49110835790634155, |
| "learning_rate": 7.598566308243728e-05, |
| "loss": 0.4261, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.9848349687778769, |
| "grad_norm": 0.49582499265670776, |
| "learning_rate": 7.592592592592593e-05, |
| "loss": 0.3845, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.9872137972048766, |
| "grad_norm": 0.4370538890361786, |
| "learning_rate": 7.586618876941458e-05, |
| "loss": 0.36, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.9895926256318763, |
| "grad_norm": 0.473349928855896, |
| "learning_rate": 7.580645161290323e-05, |
| "loss": 0.419, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.991971454058876, |
| "grad_norm": 0.5078774094581604, |
| "learning_rate": 7.574671445639188e-05, |
| "loss": 0.4279, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.9943502824858758, |
| "grad_norm": 0.4728812873363495, |
| "learning_rate": 7.568697729988053e-05, |
| "loss": 0.4926, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.9967291109128754, |
| "grad_norm": 0.5215616822242737, |
| "learning_rate": 7.562724014336918e-05, |
| "loss": 0.4064, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.9991079393398751, |
| "grad_norm": 0.5137555003166199, |
| "learning_rate": 7.556750298685783e-05, |
| "loss": 0.4205, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.7743244171142578, |
| "learning_rate": 7.550776583034648e-05, |
| "loss": 0.3431, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.0023788284269997, |
| "grad_norm": 0.46033695340156555, |
| "learning_rate": 7.544802867383513e-05, |
| "loss": 0.3297, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.0047576568539993, |
| "grad_norm": 0.423931747674942, |
| "learning_rate": 7.538829151732378e-05, |
| "loss": 0.377, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.0071364852809992, |
| "grad_norm": 0.45330631732940674, |
| "learning_rate": 7.532855436081243e-05, |
| "loss": 0.4413, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.0095153137079989, |
| "grad_norm": 0.4233931601047516, |
| "learning_rate": 7.526881720430108e-05, |
| "loss": 0.3698, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.0118941421349985, |
| "grad_norm": 0.44395965337753296, |
| "learning_rate": 7.520908004778973e-05, |
| "loss": 0.3871, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.0142729705619982, |
| "grad_norm": 0.47254008054733276, |
| "learning_rate": 7.514934289127837e-05, |
| "loss": 0.3216, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.0166517989889978, |
| "grad_norm": 0.4359164237976074, |
| "learning_rate": 7.508960573476703e-05, |
| "loss": 0.3463, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.0190306274159977, |
| "grad_norm": 0.4071188271045685, |
| "learning_rate": 7.502986857825568e-05, |
| "loss": 0.3649, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.0214094558429974, |
| "grad_norm": 0.4662891924381256, |
| "learning_rate": 7.497013142174433e-05, |
| "loss": 0.3709, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.023788284269997, |
| "grad_norm": 0.4861668348312378, |
| "learning_rate": 7.491039426523297e-05, |
| "loss": 0.3461, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.0261671126969967, |
| "grad_norm": 0.44614556431770325, |
| "learning_rate": 7.485065710872163e-05, |
| "loss": 0.348, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.0285459411239963, |
| "grad_norm": 0.5915560722351074, |
| "learning_rate": 7.479091995221028e-05, |
| "loss": 0.4335, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.0309247695509962, |
| "grad_norm": 0.5173319578170776, |
| "learning_rate": 7.473118279569893e-05, |
| "loss": 0.4246, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.0333035979779959, |
| "grad_norm": 0.5578414797782898, |
| "learning_rate": 7.467144563918757e-05, |
| "loss": 0.45, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.0356824264049955, |
| "grad_norm": 0.4671054184436798, |
| "learning_rate": 7.461170848267622e-05, |
| "loss": 0.4, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.0380612548319952, |
| "grad_norm": 0.5529332756996155, |
| "learning_rate": 7.455197132616488e-05, |
| "loss": 0.3572, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.0404400832589948, |
| "grad_norm": 0.39636898040771484, |
| "learning_rate": 7.449223416965353e-05, |
| "loss": 0.3505, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.0428189116859947, |
| "grad_norm": 0.48718351125717163, |
| "learning_rate": 7.443249701314217e-05, |
| "loss": 0.3856, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.0451977401129944, |
| "grad_norm": 0.45662057399749756, |
| "learning_rate": 7.437275985663082e-05, |
| "loss": 0.3257, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.047576568539994, |
| "grad_norm": 0.5234172344207764, |
| "learning_rate": 7.431302270011949e-05, |
| "loss": 0.3933, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.0499553969669937, |
| "grad_norm": 0.46789664030075073, |
| "learning_rate": 7.425328554360814e-05, |
| "loss": 0.3381, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.0523342253939933, |
| "grad_norm": 0.4628084897994995, |
| "learning_rate": 7.419354838709677e-05, |
| "loss": 0.3924, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.0547130538209932, |
| "grad_norm": 0.44630610942840576, |
| "learning_rate": 7.413381123058542e-05, |
| "loss": 0.3711, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.0570918822479929, |
| "grad_norm": 0.49540749192237854, |
| "learning_rate": 7.407407407407407e-05, |
| "loss": 0.3633, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.0594707106749925, |
| "grad_norm": 0.4955281913280487, |
| "learning_rate": 7.401433691756274e-05, |
| "loss": 0.3842, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.0618495391019922, |
| "grad_norm": 0.512876570224762, |
| "learning_rate": 7.395459976105137e-05, |
| "loss": 0.3781, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.064228367528992, |
| "grad_norm": 0.45539894700050354, |
| "learning_rate": 7.389486260454002e-05, |
| "loss": 0.3227, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.0666071959559917, |
| "grad_norm": 0.48431605100631714, |
| "learning_rate": 7.383512544802867e-05, |
| "loss": 0.3943, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.0689860243829914, |
| "grad_norm": 0.4580828547477722, |
| "learning_rate": 7.377538829151732e-05, |
| "loss": 0.3791, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.071364852809991, |
| "grad_norm": 0.48635613918304443, |
| "learning_rate": 7.371565113500597e-05, |
| "loss": 0.2822, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.0737436812369907, |
| "grad_norm": 0.5143231153488159, |
| "learning_rate": 7.365591397849463e-05, |
| "loss": 0.3912, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.0761225096639904, |
| "grad_norm": 0.44039368629455566, |
| "learning_rate": 7.359617682198328e-05, |
| "loss": 0.2943, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.0785013380909902, |
| "grad_norm": 0.5627120137214661, |
| "learning_rate": 7.353643966547193e-05, |
| "loss": 0.4064, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.08088016651799, |
| "grad_norm": 0.539448618888855, |
| "learning_rate": 7.347670250896058e-05, |
| "loss": 0.432, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.0832589949449896, |
| "grad_norm": 0.5089407563209534, |
| "learning_rate": 7.341696535244923e-05, |
| "loss": 0.3737, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.0856378233719892, |
| "grad_norm": 0.5352144837379456, |
| "learning_rate": 7.335722819593788e-05, |
| "loss": 0.4279, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.088016651798989, |
| "grad_norm": 0.49480384588241577, |
| "learning_rate": 7.329749103942653e-05, |
| "loss": 0.3251, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.0903954802259888, |
| "grad_norm": 0.49400433897972107, |
| "learning_rate": 7.323775388291518e-05, |
| "loss": 0.321, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.0927743086529884, |
| "grad_norm": 0.4779919981956482, |
| "learning_rate": 7.317801672640383e-05, |
| "loss": 0.3446, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.095153137079988, |
| "grad_norm": 0.5801271796226501, |
| "learning_rate": 7.311827956989248e-05, |
| "loss": 0.4476, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.0975319655069877, |
| "grad_norm": 0.49812352657318115, |
| "learning_rate": 7.305854241338113e-05, |
| "loss": 0.3112, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.0999107939339876, |
| "grad_norm": 0.47404006123542786, |
| "learning_rate": 7.299880525686978e-05, |
| "loss": 0.3562, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.1022896223609873, |
| "grad_norm": 0.5289527177810669, |
| "learning_rate": 7.293906810035843e-05, |
| "loss": 0.3986, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.104668450787987, |
| "grad_norm": 0.5155789256095886, |
| "learning_rate": 7.287933094384708e-05, |
| "loss": 0.3943, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.1070472792149866, |
| "grad_norm": 0.5496542453765869, |
| "learning_rate": 7.281959378733573e-05, |
| "loss": 0.398, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.1094261076419862, |
| "grad_norm": 0.47129660844802856, |
| "learning_rate": 7.275985663082438e-05, |
| "loss": 0.3039, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.1118049360689861, |
| "grad_norm": 0.4683108627796173, |
| "learning_rate": 7.270011947431302e-05, |
| "loss": 0.3282, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.1141837644959858, |
| "grad_norm": 0.49074095487594604, |
| "learning_rate": 7.264038231780168e-05, |
| "loss": 0.3097, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.1165625929229854, |
| "grad_norm": 0.47383546829223633, |
| "learning_rate": 7.258064516129033e-05, |
| "loss": 0.3536, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.118941421349985, |
| "grad_norm": 0.5356380939483643, |
| "learning_rate": 7.252090800477898e-05, |
| "loss": 0.3153, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.1213202497769847, |
| "grad_norm": 0.46198195219039917, |
| "learning_rate": 7.246117084826762e-05, |
| "loss": 0.3815, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.1236990782039846, |
| "grad_norm": 0.6104227304458618, |
| "learning_rate": 7.240143369175627e-05, |
| "loss": 0.4299, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.1260779066309843, |
| "grad_norm": 0.4720747172832489, |
| "learning_rate": 7.234169653524493e-05, |
| "loss": 0.3439, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.128456735057984, |
| "grad_norm": 0.5005800724029541, |
| "learning_rate": 7.228195937873358e-05, |
| "loss": 0.35, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.1308355634849836, |
| "grad_norm": 0.48051440715789795, |
| "learning_rate": 7.222222222222222e-05, |
| "loss": 0.3473, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.1332143919119833, |
| "grad_norm": 0.5383554697036743, |
| "learning_rate": 7.216248506571087e-05, |
| "loss": 0.3363, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.1355932203389831, |
| "grad_norm": 0.5020559430122375, |
| "learning_rate": 7.210274790919953e-05, |
| "loss": 0.3175, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.1379720487659828, |
| "grad_norm": 0.523763120174408, |
| "learning_rate": 7.204301075268818e-05, |
| "loss": 0.3663, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.1403508771929824, |
| "grad_norm": 0.45200997591018677, |
| "learning_rate": 7.198327359617682e-05, |
| "loss": 0.3154, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.142729705619982, |
| "grad_norm": 0.47503340244293213, |
| "learning_rate": 7.192353643966547e-05, |
| "loss": 0.2779, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.1451085340469818, |
| "grad_norm": 0.47074899077415466, |
| "learning_rate": 7.186379928315412e-05, |
| "loss": 0.3705, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.1474873624739816, |
| "grad_norm": 0.6446363925933838, |
| "learning_rate": 7.180406212664278e-05, |
| "loss": 0.3343, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.1498661909009813, |
| "grad_norm": 0.516174852848053, |
| "learning_rate": 7.174432497013142e-05, |
| "loss": 0.3437, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.152245019327981, |
| "grad_norm": 0.4703758955001831, |
| "learning_rate": 7.168458781362007e-05, |
| "loss": 0.3604, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.1546238477549806, |
| "grad_norm": 0.4764377176761627, |
| "learning_rate": 7.162485065710872e-05, |
| "loss": 0.3724, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.1570026761819805, |
| "grad_norm": 0.5020294785499573, |
| "learning_rate": 7.156511350059739e-05, |
| "loss": 0.3681, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.1593815046089802, |
| "grad_norm": 0.5461302399635315, |
| "learning_rate": 7.150537634408602e-05, |
| "loss": 0.3805, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.1617603330359798, |
| "grad_norm": 0.5569891333580017, |
| "learning_rate": 7.144563918757467e-05, |
| "loss": 0.4203, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.1641391614629795, |
| "grad_norm": 0.5478572845458984, |
| "learning_rate": 7.138590203106332e-05, |
| "loss": 0.4285, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.1665179898899791, |
| "grad_norm": 0.5138799548149109, |
| "learning_rate": 7.132616487455197e-05, |
| "loss": 0.3738, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.1688968183169788, |
| "grad_norm": 0.5300459265708923, |
| "learning_rate": 7.126642771804062e-05, |
| "loss": 0.411, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.1712756467439787, |
| "grad_norm": 0.4664587378501892, |
| "learning_rate": 7.120669056152927e-05, |
| "loss": 0.3519, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.1736544751709783, |
| "grad_norm": 0.4465605616569519, |
| "learning_rate": 7.114695340501792e-05, |
| "loss": 0.3515, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.176033303597978, |
| "grad_norm": 0.48844248056411743, |
| "learning_rate": 7.108721624850657e-05, |
| "loss": 0.3656, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.1784121320249776, |
| "grad_norm": 0.4901794195175171, |
| "learning_rate": 7.102747909199522e-05, |
| "loss": 0.3904, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.1807909604519775, |
| "grad_norm": 0.5040355324745178, |
| "learning_rate": 7.096774193548388e-05, |
| "loss": 0.3242, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.1831697888789772, |
| "grad_norm": 0.5615472793579102, |
| "learning_rate": 7.090800477897253e-05, |
| "loss": 0.3417, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.1855486173059768, |
| "grad_norm": 0.4797894358634949, |
| "learning_rate": 7.084826762246118e-05, |
| "loss": 0.3423, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.1879274457329765, |
| "grad_norm": 0.4823111891746521, |
| "learning_rate": 7.078853046594983e-05, |
| "loss": 0.35, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.1879274457329765, |
| "eval_loss": 0.4021657109260559, |
| "eval_runtime": 24.5893, |
| "eval_samples_per_second": 30.42, |
| "eval_steps_per_second": 15.21, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.1903062741599761, |
| "grad_norm": 0.5362781286239624, |
| "learning_rate": 7.072879330943848e-05, |
| "loss": 0.3355, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.1926851025869758, |
| "grad_norm": 0.5271429419517517, |
| "learning_rate": 7.066905615292713e-05, |
| "loss": 0.3473, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.1950639310139757, |
| "grad_norm": 0.48875752091407776, |
| "learning_rate": 7.060931899641578e-05, |
| "loss": 0.3527, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.1974427594409753, |
| "grad_norm": 0.5725129842758179, |
| "learning_rate": 7.054958183990443e-05, |
| "loss": 0.3942, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.199821587867975, |
| "grad_norm": 0.5125251412391663, |
| "learning_rate": 7.048984468339306e-05, |
| "loss": 0.3818, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.2022004162949746, |
| "grad_norm": 0.5617442727088928, |
| "learning_rate": 7.043010752688173e-05, |
| "loss": 0.3741, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.2045792447219745, |
| "grad_norm": 0.4992014467716217, |
| "learning_rate": 7.037037037037038e-05, |
| "loss": 0.368, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.2069580731489742, |
| "grad_norm": 0.5008623600006104, |
| "learning_rate": 7.031063321385903e-05, |
| "loss": 0.3235, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.2093369015759738, |
| "grad_norm": 0.49350398778915405, |
| "learning_rate": 7.025089605734767e-05, |
| "loss": 0.3355, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.2117157300029735, |
| "grad_norm": 0.5325546860694885, |
| "learning_rate": 7.019115890083633e-05, |
| "loss": 0.4046, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.2140945584299732, |
| "grad_norm": 0.49078550934791565, |
| "learning_rate": 7.013142174432498e-05, |
| "loss": 0.3521, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.216473386856973, |
| "grad_norm": 0.5535020232200623, |
| "learning_rate": 7.007168458781363e-05, |
| "loss": 0.3914, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.2188522152839727, |
| "grad_norm": 0.4585452079772949, |
| "learning_rate": 7.001194743130227e-05, |
| "loss": 0.2666, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.2212310437109724, |
| "grad_norm": 0.6677319407463074, |
| "learning_rate": 6.995221027479092e-05, |
| "loss": 0.4613, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.223609872137972, |
| "grad_norm": 0.5839059948921204, |
| "learning_rate": 6.989247311827958e-05, |
| "loss": 0.4185, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.2259887005649717, |
| "grad_norm": 0.5040207505226135, |
| "learning_rate": 6.983273596176823e-05, |
| "loss": 0.3351, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.2283675289919715, |
| "grad_norm": 0.5528902411460876, |
| "learning_rate": 6.977299880525687e-05, |
| "loss": 0.3561, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.2307463574189712, |
| "grad_norm": 0.5583405494689941, |
| "learning_rate": 6.971326164874552e-05, |
| "loss": 0.3641, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.2331251858459709, |
| "grad_norm": 0.5596750378608704, |
| "learning_rate": 6.965352449223417e-05, |
| "loss": 0.3536, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.2355040142729705, |
| "grad_norm": 0.5579570531845093, |
| "learning_rate": 6.959378733572283e-05, |
| "loss": 0.4025, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.2378828426999702, |
| "grad_norm": 0.5613878965377808, |
| "learning_rate": 6.953405017921147e-05, |
| "loss": 0.3825, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.24026167112697, |
| "grad_norm": 0.5365767478942871, |
| "learning_rate": 6.947431302270012e-05, |
| "loss": 0.3801, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.2426404995539697, |
| "grad_norm": 0.558989942073822, |
| "learning_rate": 6.941457586618877e-05, |
| "loss": 0.391, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.2450193279809694, |
| "grad_norm": 0.5318464636802673, |
| "learning_rate": 6.935483870967743e-05, |
| "loss": 0.3548, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.247398156407969, |
| "grad_norm": 0.5546141266822815, |
| "learning_rate": 6.929510155316607e-05, |
| "loss": 0.3988, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.2497769848349687, |
| "grad_norm": 0.5842304229736328, |
| "learning_rate": 6.923536439665472e-05, |
| "loss": 0.4038, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.2521558132619686, |
| "grad_norm": 0.5150060653686523, |
| "learning_rate": 6.917562724014337e-05, |
| "loss": 0.3793, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.2545346416889682, |
| "grad_norm": 0.4285811185836792, |
| "learning_rate": 6.911589008363202e-05, |
| "loss": 0.3562, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.2569134701159679, |
| "grad_norm": 0.4896494746208191, |
| "learning_rate": 6.905615292712067e-05, |
| "loss": 0.3246, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.2592922985429675, |
| "grad_norm": 0.5333868861198425, |
| "learning_rate": 6.899641577060932e-05, |
| "loss": 0.4024, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.2616711269699672, |
| "grad_norm": 0.5703678727149963, |
| "learning_rate": 6.893667861409797e-05, |
| "loss": 0.3583, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.264049955396967, |
| "grad_norm": 0.4980472922325134, |
| "learning_rate": 6.887694145758662e-05, |
| "loss": 0.3631, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.2664287838239667, |
| "grad_norm": 0.4485708475112915, |
| "learning_rate": 6.881720430107527e-05, |
| "loss": 0.2715, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.2688076122509664, |
| "grad_norm": 0.454412579536438, |
| "learning_rate": 6.875746714456392e-05, |
| "loss": 0.2944, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.271186440677966, |
| "grad_norm": 0.4816785156726837, |
| "learning_rate": 6.869772998805257e-05, |
| "loss": 0.4177, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.273565269104966, |
| "grad_norm": 0.5463387370109558, |
| "learning_rate": 6.863799283154122e-05, |
| "loss": 0.3994, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.2759440975319656, |
| "grad_norm": 0.5411667823791504, |
| "learning_rate": 6.857825567502987e-05, |
| "loss": 0.3623, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.2783229259589652, |
| "grad_norm": 0.5007591247558594, |
| "learning_rate": 6.851851851851852e-05, |
| "loss": 0.3953, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.280701754385965, |
| "grad_norm": 0.5197054147720337, |
| "learning_rate": 6.845878136200717e-05, |
| "loss": 0.3355, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.2830805828129646, |
| "grad_norm": 0.48684370517730713, |
| "learning_rate": 6.839904420549582e-05, |
| "loss": 0.3826, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.2854594112399642, |
| "grad_norm": 0.5456849932670593, |
| "learning_rate": 6.833930704898447e-05, |
| "loss": 0.3919, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.287838239666964, |
| "grad_norm": 0.4965335428714752, |
| "learning_rate": 6.827956989247311e-05, |
| "loss": 0.3905, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.2902170680939637, |
| "grad_norm": 0.4824836850166321, |
| "learning_rate": 6.821983273596178e-05, |
| "loss": 0.3212, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.2925958965209634, |
| "grad_norm": 0.5223451256752014, |
| "learning_rate": 6.816009557945043e-05, |
| "loss": 0.3359, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.294974724947963, |
| "grad_norm": 0.4838787317276001, |
| "learning_rate": 6.810035842293908e-05, |
| "loss": 0.2781, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.297353553374963, |
| "grad_norm": 0.4827715754508972, |
| "learning_rate": 6.804062126642771e-05, |
| "loss": 0.3118, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.2997323818019626, |
| "grad_norm": 0.49691149592399597, |
| "learning_rate": 6.798088410991638e-05, |
| "loss": 0.3527, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.3021112102289623, |
| "grad_norm": 0.49155953526496887, |
| "learning_rate": 6.792114695340503e-05, |
| "loss": 0.3626, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.304490038655962, |
| "grad_norm": 0.5283527374267578, |
| "learning_rate": 6.786140979689368e-05, |
| "loss": 0.3875, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.3068688670829616, |
| "grad_norm": 0.5794263482093811, |
| "learning_rate": 6.780167264038231e-05, |
| "loss": 0.3357, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.3092476955099612, |
| "grad_norm": 0.5505433082580566, |
| "learning_rate": 6.774193548387096e-05, |
| "loss": 0.4037, |
| "step": 551 |
| }, |
| { |
| "epoch": 1.311626523936961, |
| "grad_norm": 0.48500585556030273, |
| "learning_rate": 6.768219832735963e-05, |
| "loss": 0.3564, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.3140053523639608, |
| "grad_norm": 0.5165260434150696, |
| "learning_rate": 6.762246117084828e-05, |
| "loss": 0.3726, |
| "step": 553 |
| }, |
| { |
| "epoch": 1.3163841807909604, |
| "grad_norm": 0.5240358710289001, |
| "learning_rate": 6.756272401433692e-05, |
| "loss": 0.3444, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.31876300921796, |
| "grad_norm": 0.45539531111717224, |
| "learning_rate": 6.750298685782557e-05, |
| "loss": 0.314, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.32114183764496, |
| "grad_norm": 0.48855656385421753, |
| "learning_rate": 6.744324970131423e-05, |
| "loss": 0.3942, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.3235206660719596, |
| "grad_norm": 0.4930460751056671, |
| "learning_rate": 6.738351254480288e-05, |
| "loss": 0.3297, |
| "step": 557 |
| }, |
| { |
| "epoch": 1.3258994944989593, |
| "grad_norm": 0.4979415833950043, |
| "learning_rate": 6.732377538829152e-05, |
| "loss": 0.343, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.328278322925959, |
| "grad_norm": 0.47355780005455017, |
| "learning_rate": 6.726403823178017e-05, |
| "loss": 0.3013, |
| "step": 559 |
| }, |
| { |
| "epoch": 1.3306571513529586, |
| "grad_norm": 0.5629305839538574, |
| "learning_rate": 6.720430107526882e-05, |
| "loss": 0.3732, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.3330359797799582, |
| "grad_norm": 0.44539135694503784, |
| "learning_rate": 6.714456391875748e-05, |
| "loss": 0.3009, |
| "step": 561 |
| }, |
| { |
| "epoch": 1.3354148082069581, |
| "grad_norm": 0.5273521542549133, |
| "learning_rate": 6.708482676224612e-05, |
| "loss": 0.3454, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.3377936366339578, |
| "grad_norm": 0.5609079003334045, |
| "learning_rate": 6.702508960573477e-05, |
| "loss": 0.3395, |
| "step": 563 |
| }, |
| { |
| "epoch": 1.3401724650609574, |
| "grad_norm": 0.4624035954475403, |
| "learning_rate": 6.696535244922342e-05, |
| "loss": 0.3244, |
| "step": 564 |
| }, |
| { |
| "epoch": 1.342551293487957, |
| "grad_norm": 0.4840734302997589, |
| "learning_rate": 6.690561529271207e-05, |
| "loss": 0.3705, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.344930121914957, |
| "grad_norm": 0.5756794810295105, |
| "learning_rate": 6.684587813620072e-05, |
| "loss": 0.3406, |
| "step": 566 |
| }, |
| { |
| "epoch": 1.3473089503419566, |
| "grad_norm": 0.5743434429168701, |
| "learning_rate": 6.678614097968937e-05, |
| "loss": 0.4137, |
| "step": 567 |
| }, |
| { |
| "epoch": 1.3496877787689563, |
| "grad_norm": 0.5068850517272949, |
| "learning_rate": 6.672640382317802e-05, |
| "loss": 0.4139, |
| "step": 568 |
| }, |
| { |
| "epoch": 1.352066607195956, |
| "grad_norm": 0.5004804134368896, |
| "learning_rate": 6.666666666666667e-05, |
| "loss": 0.3534, |
| "step": 569 |
| }, |
| { |
| "epoch": 1.3544454356229556, |
| "grad_norm": 0.5731816291809082, |
| "learning_rate": 6.660692951015532e-05, |
| "loss": 0.3521, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.3568242640499553, |
| "grad_norm": 0.5832188129425049, |
| "learning_rate": 6.654719235364397e-05, |
| "loss": 0.3541, |
| "step": 571 |
| }, |
| { |
| "epoch": 1.3592030924769551, |
| "grad_norm": 0.4894527792930603, |
| "learning_rate": 6.648745519713262e-05, |
| "loss": 0.36, |
| "step": 572 |
| }, |
| { |
| "epoch": 1.3615819209039548, |
| "grad_norm": 0.5579627752304077, |
| "learning_rate": 6.642771804062127e-05, |
| "loss": 0.4035, |
| "step": 573 |
| }, |
| { |
| "epoch": 1.3639607493309545, |
| "grad_norm": 0.5148259997367859, |
| "learning_rate": 6.636798088410992e-05, |
| "loss": 0.3376, |
| "step": 574 |
| }, |
| { |
| "epoch": 1.3663395777579543, |
| "grad_norm": 0.5262428522109985, |
| "learning_rate": 6.630824372759857e-05, |
| "loss": 0.375, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.368718406184954, |
| "grad_norm": 0.4974241554737091, |
| "learning_rate": 6.624850657108722e-05, |
| "loss": 0.3895, |
| "step": 576 |
| }, |
| { |
| "epoch": 1.3710972346119537, |
| "grad_norm": 0.6021261215209961, |
| "learning_rate": 6.618876941457587e-05, |
| "loss": 0.4076, |
| "step": 577 |
| }, |
| { |
| "epoch": 1.3734760630389533, |
| "grad_norm": 0.5429256558418274, |
| "learning_rate": 6.612903225806452e-05, |
| "loss": 0.3665, |
| "step": 578 |
| }, |
| { |
| "epoch": 1.375854891465953, |
| "grad_norm": 0.5747050046920776, |
| "learning_rate": 6.606929510155317e-05, |
| "loss": 0.3845, |
| "step": 579 |
| }, |
| { |
| "epoch": 1.3782337198929526, |
| "grad_norm": 0.5203744173049927, |
| "learning_rate": 6.600955794504182e-05, |
| "loss": 0.2655, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.3806125483199525, |
| "grad_norm": 0.5207931995391846, |
| "learning_rate": 6.594982078853047e-05, |
| "loss": 0.3998, |
| "step": 581 |
| }, |
| { |
| "epoch": 1.3829913767469522, |
| "grad_norm": 0.4678313732147217, |
| "learning_rate": 6.589008363201912e-05, |
| "loss": 0.2987, |
| "step": 582 |
| }, |
| { |
| "epoch": 1.3853702051739518, |
| "grad_norm": 0.5557482242584229, |
| "learning_rate": 6.583034647550776e-05, |
| "loss": 0.399, |
| "step": 583 |
| }, |
| { |
| "epoch": 1.3877490336009515, |
| "grad_norm": 0.5944557785987854, |
| "learning_rate": 6.577060931899642e-05, |
| "loss": 0.4344, |
| "step": 584 |
| }, |
| { |
| "epoch": 1.3901278620279514, |
| "grad_norm": 0.5242769718170166, |
| "learning_rate": 6.571087216248507e-05, |
| "loss": 0.3141, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.392506690454951, |
| "grad_norm": 0.5812731385231018, |
| "learning_rate": 6.565113500597372e-05, |
| "loss": 0.3739, |
| "step": 586 |
| }, |
| { |
| "epoch": 1.3948855188819507, |
| "grad_norm": 0.5420759320259094, |
| "learning_rate": 6.559139784946236e-05, |
| "loss": 0.3644, |
| "step": 587 |
| }, |
| { |
| "epoch": 1.3972643473089503, |
| "grad_norm": 0.565941333770752, |
| "learning_rate": 6.553166069295101e-05, |
| "loss": 0.4276, |
| "step": 588 |
| }, |
| { |
| "epoch": 1.39964317573595, |
| "grad_norm": 0.502975344657898, |
| "learning_rate": 6.547192353643968e-05, |
| "loss": 0.3868, |
| "step": 589 |
| }, |
| { |
| "epoch": 1.4020220041629496, |
| "grad_norm": 0.5543567538261414, |
| "learning_rate": 6.541218637992833e-05, |
| "loss": 0.3978, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.4044008325899495, |
| "grad_norm": 0.5115704536437988, |
| "learning_rate": 6.535244922341696e-05, |
| "loss": 0.3002, |
| "step": 591 |
| }, |
| { |
| "epoch": 1.4067796610169492, |
| "grad_norm": 0.50956130027771, |
| "learning_rate": 6.529271206690561e-05, |
| "loss": 0.3456, |
| "step": 592 |
| }, |
| { |
| "epoch": 1.4091584894439488, |
| "grad_norm": 0.6440345048904419, |
| "learning_rate": 6.523297491039428e-05, |
| "loss": 0.4266, |
| "step": 593 |
| }, |
| { |
| "epoch": 1.4115373178709485, |
| "grad_norm": 0.49989134073257446, |
| "learning_rate": 6.517323775388293e-05, |
| "loss": 0.364, |
| "step": 594 |
| }, |
| { |
| "epoch": 1.4139161462979484, |
| "grad_norm": 0.5295883417129517, |
| "learning_rate": 6.511350059737156e-05, |
| "loss": 0.3685, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.416294974724948, |
| "grad_norm": 0.5092173218727112, |
| "learning_rate": 6.505376344086021e-05, |
| "loss": 0.3517, |
| "step": 596 |
| }, |
| { |
| "epoch": 1.4186738031519477, |
| "grad_norm": 0.43385863304138184, |
| "learning_rate": 6.499402628434886e-05, |
| "loss": 0.2672, |
| "step": 597 |
| }, |
| { |
| "epoch": 1.4210526315789473, |
| "grad_norm": 0.5401270985603333, |
| "learning_rate": 6.493428912783753e-05, |
| "loss": 0.3773, |
| "step": 598 |
| }, |
| { |
| "epoch": 1.423431460005947, |
| "grad_norm": 0.45737171173095703, |
| "learning_rate": 6.487455197132617e-05, |
| "loss": 0.2862, |
| "step": 599 |
| }, |
| { |
| "epoch": 1.4258102884329467, |
| "grad_norm": 0.5563622713088989, |
| "learning_rate": 6.481481481481482e-05, |
| "loss": 0.3362, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.4258102884329467, |
| "eval_loss": 0.39686334133148193, |
| "eval_runtime": 24.7459, |
| "eval_samples_per_second": 30.227, |
| "eval_steps_per_second": 15.114, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.4281891168599465, |
| "grad_norm": 0.5734651684761047, |
| "learning_rate": 6.475507765830347e-05, |
| "loss": 0.4083, |
| "step": 601 |
| }, |
| { |
| "epoch": 1.4305679452869462, |
| "grad_norm": 0.45734620094299316, |
| "learning_rate": 6.469534050179213e-05, |
| "loss": 0.3329, |
| "step": 602 |
| }, |
| { |
| "epoch": 1.4329467737139459, |
| "grad_norm": 0.49928852915763855, |
| "learning_rate": 6.463560334528077e-05, |
| "loss": 0.3785, |
| "step": 603 |
| }, |
| { |
| "epoch": 1.4353256021409455, |
| "grad_norm": 0.48517173528671265, |
| "learning_rate": 6.457586618876942e-05, |
| "loss": 0.3299, |
| "step": 604 |
| }, |
| { |
| "epoch": 1.4377044305679454, |
| "grad_norm": 0.4913112223148346, |
| "learning_rate": 6.451612903225807e-05, |
| "loss": 0.3249, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.440083258994945, |
| "grad_norm": 0.5694173574447632, |
| "learning_rate": 6.445639187574672e-05, |
| "loss": 0.3494, |
| "step": 606 |
| }, |
| { |
| "epoch": 1.4424620874219447, |
| "grad_norm": 0.47303032875061035, |
| "learning_rate": 6.439665471923537e-05, |
| "loss": 0.347, |
| "step": 607 |
| }, |
| { |
| "epoch": 1.4448409158489444, |
| "grad_norm": 0.5557957291603088, |
| "learning_rate": 6.433691756272402e-05, |
| "loss": 0.4251, |
| "step": 608 |
| }, |
| { |
| "epoch": 1.447219744275944, |
| "grad_norm": 0.5228065848350525, |
| "learning_rate": 6.427718040621267e-05, |
| "loss": 0.3118, |
| "step": 609 |
| }, |
| { |
| "epoch": 1.4495985727029437, |
| "grad_norm": 0.5313312411308289, |
| "learning_rate": 6.421744324970132e-05, |
| "loss": 0.3826, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.4519774011299436, |
| "grad_norm": 0.5596655607223511, |
| "learning_rate": 6.415770609318996e-05, |
| "loss": 0.4413, |
| "step": 611 |
| }, |
| { |
| "epoch": 1.4543562295569432, |
| "grad_norm": 0.575697124004364, |
| "learning_rate": 6.409796893667862e-05, |
| "loss": 0.4097, |
| "step": 612 |
| }, |
| { |
| "epoch": 1.4567350579839429, |
| "grad_norm": 0.4731719493865967, |
| "learning_rate": 6.403823178016727e-05, |
| "loss": 0.3443, |
| "step": 613 |
| }, |
| { |
| "epoch": 1.4591138864109425, |
| "grad_norm": 0.5273444056510925, |
| "learning_rate": 6.397849462365592e-05, |
| "loss": 0.3843, |
| "step": 614 |
| }, |
| { |
| "epoch": 1.4614927148379424, |
| "grad_norm": 0.5462388396263123, |
| "learning_rate": 6.391875746714456e-05, |
| "loss": 0.3972, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.463871543264942, |
| "grad_norm": 0.45089593529701233, |
| "learning_rate": 6.385902031063322e-05, |
| "loss": 0.3287, |
| "step": 616 |
| }, |
| { |
| "epoch": 1.4662503716919417, |
| "grad_norm": 0.47337663173675537, |
| "learning_rate": 6.379928315412187e-05, |
| "loss": 0.3347, |
| "step": 617 |
| }, |
| { |
| "epoch": 1.4686292001189414, |
| "grad_norm": 0.5648345947265625, |
| "learning_rate": 6.373954599761052e-05, |
| "loss": 0.3703, |
| "step": 618 |
| }, |
| { |
| "epoch": 1.471008028545941, |
| "grad_norm": 0.5006834268569946, |
| "learning_rate": 6.367980884109916e-05, |
| "loss": 0.3699, |
| "step": 619 |
| }, |
| { |
| "epoch": 1.4733868569729407, |
| "grad_norm": 0.51920086145401, |
| "learning_rate": 6.362007168458781e-05, |
| "loss": 0.3799, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.4757656853999406, |
| "grad_norm": 0.5321612358093262, |
| "learning_rate": 6.356033452807647e-05, |
| "loss": 0.3866, |
| "step": 621 |
| }, |
| { |
| "epoch": 1.4781445138269402, |
| "grad_norm": 0.5410431623458862, |
| "learning_rate": 6.350059737156512e-05, |
| "loss": 0.3777, |
| "step": 622 |
| }, |
| { |
| "epoch": 1.48052334225394, |
| "grad_norm": 0.5751019716262817, |
| "learning_rate": 6.344086021505376e-05, |
| "loss": 0.3585, |
| "step": 623 |
| }, |
| { |
| "epoch": 1.4829021706809398, |
| "grad_norm": 0.5092802047729492, |
| "learning_rate": 6.338112305854241e-05, |
| "loss": 0.3229, |
| "step": 624 |
| }, |
| { |
| "epoch": 1.4852809991079394, |
| "grad_norm": 0.5037229061126709, |
| "learning_rate": 6.332138590203107e-05, |
| "loss": 0.3243, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.487659827534939, |
| "grad_norm": 0.5060740113258362, |
| "learning_rate": 6.326164874551972e-05, |
| "loss": 0.4023, |
| "step": 626 |
| }, |
| { |
| "epoch": 1.4900386559619387, |
| "grad_norm": 0.5434842705726624, |
| "learning_rate": 6.320191158900836e-05, |
| "loss": 0.3541, |
| "step": 627 |
| }, |
| { |
| "epoch": 1.4924174843889384, |
| "grad_norm": 0.5165389180183411, |
| "learning_rate": 6.314217443249701e-05, |
| "loss": 0.317, |
| "step": 628 |
| }, |
| { |
| "epoch": 1.494796312815938, |
| "grad_norm": 0.4614243805408478, |
| "learning_rate": 6.308243727598566e-05, |
| "loss": 0.3269, |
| "step": 629 |
| }, |
| { |
| "epoch": 1.497175141242938, |
| "grad_norm": 0.5254392027854919, |
| "learning_rate": 6.302270011947432e-05, |
| "loss": 0.3223, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.4995539696699376, |
| "grad_norm": 0.54349285364151, |
| "learning_rate": 6.296296296296296e-05, |
| "loss": 0.3861, |
| "step": 631 |
| }, |
| { |
| "epoch": 1.5019327980969372, |
| "grad_norm": 0.6002068519592285, |
| "learning_rate": 6.290322580645161e-05, |
| "loss": 0.4794, |
| "step": 632 |
| }, |
| { |
| "epoch": 1.504311626523937, |
| "grad_norm": 0.5521604418754578, |
| "learning_rate": 6.284348864994026e-05, |
| "loss": 0.355, |
| "step": 633 |
| }, |
| { |
| "epoch": 1.5066904549509368, |
| "grad_norm": 0.5643691420555115, |
| "learning_rate": 6.278375149342891e-05, |
| "loss": 0.4274, |
| "step": 634 |
| }, |
| { |
| "epoch": 1.5090692833779364, |
| "grad_norm": 0.5313323736190796, |
| "learning_rate": 6.272401433691756e-05, |
| "loss": 0.3817, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.511448111804936, |
| "grad_norm": 0.5157251358032227, |
| "learning_rate": 6.266427718040621e-05, |
| "loss": 0.3636, |
| "step": 636 |
| }, |
| { |
| "epoch": 1.5138269402319358, |
| "grad_norm": 0.6033576130867004, |
| "learning_rate": 6.260454002389486e-05, |
| "loss": 0.3274, |
| "step": 637 |
| }, |
| { |
| "epoch": 1.5162057686589354, |
| "grad_norm": 0.5338175296783447, |
| "learning_rate": 6.254480286738351e-05, |
| "loss": 0.4118, |
| "step": 638 |
| }, |
| { |
| "epoch": 1.518584597085935, |
| "grad_norm": 0.5195765495300293, |
| "learning_rate": 6.248506571087216e-05, |
| "loss": 0.3153, |
| "step": 639 |
| }, |
| { |
| "epoch": 1.5209634255129347, |
| "grad_norm": 0.4563392996788025, |
| "learning_rate": 6.242532855436081e-05, |
| "loss": 0.3512, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.5233422539399346, |
| "grad_norm": 0.5181533694267273, |
| "learning_rate": 6.236559139784946e-05, |
| "loss": 0.3473, |
| "step": 641 |
| }, |
| { |
| "epoch": 1.5257210823669343, |
| "grad_norm": 0.5112624168395996, |
| "learning_rate": 6.230585424133811e-05, |
| "loss": 0.3215, |
| "step": 642 |
| }, |
| { |
| "epoch": 1.5280999107939341, |
| "grad_norm": 0.48247793316841125, |
| "learning_rate": 6.224611708482677e-05, |
| "loss": 0.3685, |
| "step": 643 |
| }, |
| { |
| "epoch": 1.5304787392209338, |
| "grad_norm": 0.4956831932067871, |
| "learning_rate": 6.218637992831542e-05, |
| "loss": 0.3542, |
| "step": 644 |
| }, |
| { |
| "epoch": 1.5328575676479335, |
| "grad_norm": 0.5187450051307678, |
| "learning_rate": 6.212664277180407e-05, |
| "loss": 0.3949, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.5352363960749331, |
| "grad_norm": 0.4967552721500397, |
| "learning_rate": 6.206690561529272e-05, |
| "loss": 0.4084, |
| "step": 646 |
| }, |
| { |
| "epoch": 1.5376152245019328, |
| "grad_norm": 0.5258282423019409, |
| "learning_rate": 6.200716845878137e-05, |
| "loss": 0.3735, |
| "step": 647 |
| }, |
| { |
| "epoch": 1.5399940529289324, |
| "grad_norm": 0.42625510692596436, |
| "learning_rate": 6.194743130227002e-05, |
| "loss": 0.278, |
| "step": 648 |
| }, |
| { |
| "epoch": 1.542372881355932, |
| "grad_norm": 0.5355501770973206, |
| "learning_rate": 6.188769414575867e-05, |
| "loss": 0.3549, |
| "step": 649 |
| }, |
| { |
| "epoch": 1.5447517097829317, |
| "grad_norm": 0.48417362570762634, |
| "learning_rate": 6.182795698924732e-05, |
| "loss": 0.3222, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.5471305382099316, |
| "grad_norm": 0.5076582431793213, |
| "learning_rate": 6.176821983273597e-05, |
| "loss": 0.3242, |
| "step": 651 |
| }, |
| { |
| "epoch": 1.5495093666369313, |
| "grad_norm": 0.49634814262390137, |
| "learning_rate": 6.17084826762246e-05, |
| "loss": 0.3095, |
| "step": 652 |
| }, |
| { |
| "epoch": 1.5518881950639312, |
| "grad_norm": 0.5837588310241699, |
| "learning_rate": 6.164874551971327e-05, |
| "loss": 0.4119, |
| "step": 653 |
| }, |
| { |
| "epoch": 1.5542670234909308, |
| "grad_norm": 0.5286645293235779, |
| "learning_rate": 6.158900836320192e-05, |
| "loss": 0.3891, |
| "step": 654 |
| }, |
| { |
| "epoch": 1.5566458519179305, |
| "grad_norm": 0.4566105306148529, |
| "learning_rate": 6.152927120669057e-05, |
| "loss": 0.2904, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.5590246803449301, |
| "grad_norm": 0.6136950254440308, |
| "learning_rate": 6.14695340501792e-05, |
| "loss": 0.3836, |
| "step": 656 |
| }, |
| { |
| "epoch": 1.5614035087719298, |
| "grad_norm": 0.7156520485877991, |
| "learning_rate": 6.140979689366786e-05, |
| "loss": 0.2578, |
| "step": 657 |
| }, |
| { |
| "epoch": 1.5637823371989295, |
| "grad_norm": 0.5305624604225159, |
| "learning_rate": 6.135005973715652e-05, |
| "loss": 0.37, |
| "step": 658 |
| }, |
| { |
| "epoch": 1.566161165625929, |
| "grad_norm": 0.5822964310646057, |
| "learning_rate": 6.129032258064517e-05, |
| "loss": 0.4268, |
| "step": 659 |
| }, |
| { |
| "epoch": 1.568539994052929, |
| "grad_norm": 0.4895828664302826, |
| "learning_rate": 6.12305854241338e-05, |
| "loss": 0.317, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.5709188224799286, |
| "grad_norm": 0.4806120693683624, |
| "learning_rate": 6.117084826762246e-05, |
| "loss": 0.3224, |
| "step": 661 |
| }, |
| { |
| "epoch": 1.5732976509069283, |
| "grad_norm": 0.5153700113296509, |
| "learning_rate": 6.111111111111112e-05, |
| "loss": 0.387, |
| "step": 662 |
| }, |
| { |
| "epoch": 1.5756764793339282, |
| "grad_norm": 0.4535806477069855, |
| "learning_rate": 6.105137395459977e-05, |
| "loss": 0.3369, |
| "step": 663 |
| }, |
| { |
| "epoch": 1.5780553077609278, |
| "grad_norm": 0.5814690589904785, |
| "learning_rate": 6.0991636798088415e-05, |
| "loss": 0.5091, |
| "step": 664 |
| }, |
| { |
| "epoch": 1.5804341361879275, |
| "grad_norm": 0.5779417157173157, |
| "learning_rate": 6.093189964157706e-05, |
| "loss": 0.4162, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.5828129646149272, |
| "grad_norm": 0.5305196046829224, |
| "learning_rate": 6.087216248506571e-05, |
| "loss": 0.3734, |
| "step": 666 |
| }, |
| { |
| "epoch": 1.5851917930419268, |
| "grad_norm": 0.5372288823127747, |
| "learning_rate": 6.0812425328554366e-05, |
| "loss": 0.2981, |
| "step": 667 |
| }, |
| { |
| "epoch": 1.5875706214689265, |
| "grad_norm": 0.49461305141448975, |
| "learning_rate": 6.0752688172043016e-05, |
| "loss": 0.3412, |
| "step": 668 |
| }, |
| { |
| "epoch": 1.5899494498959261, |
| "grad_norm": 0.49554646015167236, |
| "learning_rate": 6.069295101553166e-05, |
| "loss": 0.3639, |
| "step": 669 |
| }, |
| { |
| "epoch": 1.592328278322926, |
| "grad_norm": 0.515537679195404, |
| "learning_rate": 6.063321385902031e-05, |
| "loss": 0.3132, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.5947071067499257, |
| "grad_norm": 0.5863080024719238, |
| "learning_rate": 6.057347670250897e-05, |
| "loss": 0.3877, |
| "step": 671 |
| }, |
| { |
| "epoch": 1.5970859351769253, |
| "grad_norm": 0.5367560982704163, |
| "learning_rate": 6.051373954599762e-05, |
| "loss": 0.3591, |
| "step": 672 |
| }, |
| { |
| "epoch": 1.5994647636039252, |
| "grad_norm": 0.4917997419834137, |
| "learning_rate": 6.045400238948626e-05, |
| "loss": 0.3451, |
| "step": 673 |
| }, |
| { |
| "epoch": 1.6018435920309249, |
| "grad_norm": 0.539308488368988, |
| "learning_rate": 6.039426523297491e-05, |
| "loss": 0.3696, |
| "step": 674 |
| }, |
| { |
| "epoch": 1.6042224204579245, |
| "grad_norm": 0.6007647514343262, |
| "learning_rate": 6.033452807646356e-05, |
| "loss": 0.3717, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.6066012488849242, |
| "grad_norm": 0.4990600347518921, |
| "learning_rate": 6.027479091995222e-05, |
| "loss": 0.2856, |
| "step": 676 |
| }, |
| { |
| "epoch": 1.6089800773119238, |
| "grad_norm": 0.5643115043640137, |
| "learning_rate": 6.021505376344086e-05, |
| "loss": 0.3852, |
| "step": 677 |
| }, |
| { |
| "epoch": 1.6113589057389235, |
| "grad_norm": 0.5239197611808777, |
| "learning_rate": 6.015531660692951e-05, |
| "loss": 0.3479, |
| "step": 678 |
| }, |
| { |
| "epoch": 1.6137377341659231, |
| "grad_norm": 0.576170802116394, |
| "learning_rate": 6.009557945041816e-05, |
| "loss": 0.3959, |
| "step": 679 |
| }, |
| { |
| "epoch": 1.616116562592923, |
| "grad_norm": 0.5128218531608582, |
| "learning_rate": 6.0035842293906806e-05, |
| "loss": 0.2702, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.6184953910199227, |
| "grad_norm": 0.5164602994918823, |
| "learning_rate": 5.997610513739546e-05, |
| "loss": 0.3526, |
| "step": 681 |
| }, |
| { |
| "epoch": 1.6208742194469223, |
| "grad_norm": 0.6454161405563354, |
| "learning_rate": 5.991636798088411e-05, |
| "loss": 0.4639, |
| "step": 682 |
| }, |
| { |
| "epoch": 1.6232530478739222, |
| "grad_norm": 0.6234579086303711, |
| "learning_rate": 5.9856630824372764e-05, |
| "loss": 0.4439, |
| "step": 683 |
| }, |
| { |
| "epoch": 1.6256318763009219, |
| "grad_norm": 0.5979102849960327, |
| "learning_rate": 5.979689366786141e-05, |
| "loss": 0.3348, |
| "step": 684 |
| }, |
| { |
| "epoch": 1.6280107047279215, |
| "grad_norm": 0.4938049018383026, |
| "learning_rate": 5.9737156511350064e-05, |
| "loss": 0.3097, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.6303895331549212, |
| "grad_norm": 0.5276683568954468, |
| "learning_rate": 5.9677419354838715e-05, |
| "loss": 0.3375, |
| "step": 686 |
| }, |
| { |
| "epoch": 1.6327683615819208, |
| "grad_norm": 0.5086409449577332, |
| "learning_rate": 5.9617682198327365e-05, |
| "loss": 0.3752, |
| "step": 687 |
| }, |
| { |
| "epoch": 1.6351471900089205, |
| "grad_norm": 0.4907076954841614, |
| "learning_rate": 5.955794504181601e-05, |
| "loss": 0.3211, |
| "step": 688 |
| }, |
| { |
| "epoch": 1.6375260184359202, |
| "grad_norm": 0.534393310546875, |
| "learning_rate": 5.949820788530466e-05, |
| "loss": 0.3701, |
| "step": 689 |
| }, |
| { |
| "epoch": 1.63990484686292, |
| "grad_norm": 0.5492992997169495, |
| "learning_rate": 5.9438470728793316e-05, |
| "loss": 0.4018, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.6422836752899197, |
| "grad_norm": 0.5407986640930176, |
| "learning_rate": 5.9378733572281966e-05, |
| "loss": 0.2754, |
| "step": 691 |
| }, |
| { |
| "epoch": 1.6446625037169196, |
| "grad_norm": 0.5436089038848877, |
| "learning_rate": 5.931899641577061e-05, |
| "loss": 0.3546, |
| "step": 692 |
| }, |
| { |
| "epoch": 1.6470413321439192, |
| "grad_norm": 0.4981859028339386, |
| "learning_rate": 5.925925925925926e-05, |
| "loss": 0.3077, |
| "step": 693 |
| }, |
| { |
| "epoch": 1.649420160570919, |
| "grad_norm": 0.6169450283050537, |
| "learning_rate": 5.919952210274792e-05, |
| "loss": 0.4087, |
| "step": 694 |
| }, |
| { |
| "epoch": 1.6517989889979185, |
| "grad_norm": 0.4453374147415161, |
| "learning_rate": 5.913978494623657e-05, |
| "loss": 0.3179, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.6541778174249182, |
| "grad_norm": 0.5288130044937134, |
| "learning_rate": 5.908004778972521e-05, |
| "loss": 0.3406, |
| "step": 696 |
| }, |
| { |
| "epoch": 1.6565566458519179, |
| "grad_norm": 0.4710347056388855, |
| "learning_rate": 5.902031063321386e-05, |
| "loss": 0.2985, |
| "step": 697 |
| }, |
| { |
| "epoch": 1.6589354742789175, |
| "grad_norm": 0.48624956607818604, |
| "learning_rate": 5.8960573476702505e-05, |
| "loss": 0.3804, |
| "step": 698 |
| }, |
| { |
| "epoch": 1.6613143027059172, |
| "grad_norm": 0.5085429549217224, |
| "learning_rate": 5.890083632019117e-05, |
| "loss": 0.3055, |
| "step": 699 |
| }, |
| { |
| "epoch": 1.663693131132917, |
| "grad_norm": 0.5178834199905396, |
| "learning_rate": 5.884109916367981e-05, |
| "loss": 0.485, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.663693131132917, |
| "eval_loss": 0.38970109820365906, |
| "eval_runtime": 24.7392, |
| "eval_samples_per_second": 30.235, |
| "eval_steps_per_second": 15.118, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.6660719595599167, |
| "grad_norm": 0.5501241683959961, |
| "learning_rate": 5.878136200716846e-05, |
| "loss": 0.3649, |
| "step": 701 |
| }, |
| { |
| "epoch": 1.6684507879869166, |
| "grad_norm": 0.485432893037796, |
| "learning_rate": 5.8721624850657106e-05, |
| "loss": 0.3161, |
| "step": 702 |
| }, |
| { |
| "epoch": 1.6708296164139163, |
| "grad_norm": 0.49101677536964417, |
| "learning_rate": 5.8661887694145756e-05, |
| "loss": 0.3169, |
| "step": 703 |
| }, |
| { |
| "epoch": 1.673208444840916, |
| "grad_norm": 0.5963920950889587, |
| "learning_rate": 5.860215053763441e-05, |
| "loss": 0.393, |
| "step": 704 |
| }, |
| { |
| "epoch": 1.6755872732679156, |
| "grad_norm": 0.49317649006843567, |
| "learning_rate": 5.8542413381123063e-05, |
| "loss": 0.2979, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.6779661016949152, |
| "grad_norm": 0.5598815679550171, |
| "learning_rate": 5.848267622461171e-05, |
| "loss": 0.3924, |
| "step": 706 |
| }, |
| { |
| "epoch": 1.6803449301219149, |
| "grad_norm": 0.4852607250213623, |
| "learning_rate": 5.842293906810036e-05, |
| "loss": 0.278, |
| "step": 707 |
| }, |
| { |
| "epoch": 1.6827237585489145, |
| "grad_norm": 0.5489526987075806, |
| "learning_rate": 5.8363201911589014e-05, |
| "loss": 0.3864, |
| "step": 708 |
| }, |
| { |
| "epoch": 1.6851025869759142, |
| "grad_norm": 0.4885106086730957, |
| "learning_rate": 5.8303464755077665e-05, |
| "loss": 0.3562, |
| "step": 709 |
| }, |
| { |
| "epoch": 1.687481415402914, |
| "grad_norm": 0.47927844524383545, |
| "learning_rate": 5.824372759856631e-05, |
| "loss": 0.3441, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.6898602438299137, |
| "grad_norm": 0.5159226655960083, |
| "learning_rate": 5.818399044205496e-05, |
| "loss": 0.3259, |
| "step": 711 |
| }, |
| { |
| "epoch": 1.6922390722569136, |
| "grad_norm": 0.5427981615066528, |
| "learning_rate": 5.812425328554361e-05, |
| "loss": 0.3629, |
| "step": 712 |
| }, |
| { |
| "epoch": 1.6946179006839133, |
| "grad_norm": 0.5536279678344727, |
| "learning_rate": 5.8064516129032266e-05, |
| "loss": 0.3578, |
| "step": 713 |
| }, |
| { |
| "epoch": 1.696996729110913, |
| "grad_norm": 0.5016468167304993, |
| "learning_rate": 5.800477897252091e-05, |
| "loss": 0.3633, |
| "step": 714 |
| }, |
| { |
| "epoch": 1.6993755575379126, |
| "grad_norm": 0.49730172753334045, |
| "learning_rate": 5.794504181600956e-05, |
| "loss": 0.3682, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.7017543859649122, |
| "grad_norm": 0.5634946823120117, |
| "learning_rate": 5.788530465949821e-05, |
| "loss": 0.4343, |
| "step": 716 |
| }, |
| { |
| "epoch": 1.704133214391912, |
| "grad_norm": 0.6075212955474854, |
| "learning_rate": 5.782556750298687e-05, |
| "loss": 0.4506, |
| "step": 717 |
| }, |
| { |
| "epoch": 1.7065120428189116, |
| "grad_norm": 0.4885086119174957, |
| "learning_rate": 5.776583034647551e-05, |
| "loss": 0.3652, |
| "step": 718 |
| }, |
| { |
| "epoch": 1.7088908712459114, |
| "grad_norm": 0.5155897736549377, |
| "learning_rate": 5.770609318996416e-05, |
| "loss": 0.3198, |
| "step": 719 |
| }, |
| { |
| "epoch": 1.711269699672911, |
| "grad_norm": 0.5561105608940125, |
| "learning_rate": 5.764635603345281e-05, |
| "loss": 0.4273, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.7136485280999108, |
| "grad_norm": 0.6169248223304749, |
| "learning_rate": 5.7586618876941455e-05, |
| "loss": 0.3704, |
| "step": 721 |
| }, |
| { |
| "epoch": 1.7160273565269106, |
| "grad_norm": 0.5170378684997559, |
| "learning_rate": 5.752688172043011e-05, |
| "loss": 0.4371, |
| "step": 722 |
| }, |
| { |
| "epoch": 1.7184061849539103, |
| "grad_norm": 0.4240604341030121, |
| "learning_rate": 5.746714456391876e-05, |
| "loss": 0.302, |
| "step": 723 |
| }, |
| { |
| "epoch": 1.72078501338091, |
| "grad_norm": 0.6027634739875793, |
| "learning_rate": 5.740740740740741e-05, |
| "loss": 0.3999, |
| "step": 724 |
| }, |
| { |
| "epoch": 1.7231638418079096, |
| "grad_norm": 0.4671403467655182, |
| "learning_rate": 5.7347670250896056e-05, |
| "loss": 0.3012, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.7255426702349093, |
| "grad_norm": 0.4920845925807953, |
| "learning_rate": 5.7287933094384706e-05, |
| "loss": 0.3595, |
| "step": 726 |
| }, |
| { |
| "epoch": 1.727921498661909, |
| "grad_norm": 0.48627743124961853, |
| "learning_rate": 5.722819593787336e-05, |
| "loss": 0.3263, |
| "step": 727 |
| }, |
| { |
| "epoch": 1.7303003270889086, |
| "grad_norm": 0.4806533455848694, |
| "learning_rate": 5.7168458781362014e-05, |
| "loss": 0.302, |
| "step": 728 |
| }, |
| { |
| "epoch": 1.7326791555159085, |
| "grad_norm": 0.5916662812232971, |
| "learning_rate": 5.710872162485066e-05, |
| "loss": 0.4725, |
| "step": 729 |
| }, |
| { |
| "epoch": 1.735057983942908, |
| "grad_norm": 0.5074958801269531, |
| "learning_rate": 5.704898446833931e-05, |
| "loss": 0.365, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.7374368123699078, |
| "grad_norm": 0.4481968283653259, |
| "learning_rate": 5.6989247311827965e-05, |
| "loss": 0.2967, |
| "step": 731 |
| }, |
| { |
| "epoch": 1.7398156407969076, |
| "grad_norm": 0.48121869564056396, |
| "learning_rate": 5.6929510155316615e-05, |
| "loss": 0.2601, |
| "step": 732 |
| }, |
| { |
| "epoch": 1.7421944692239073, |
| "grad_norm": 0.6281145811080933, |
| "learning_rate": 5.686977299880526e-05, |
| "loss": 0.3374, |
| "step": 733 |
| }, |
| { |
| "epoch": 1.744573297650907, |
| "grad_norm": 0.5491541624069214, |
| "learning_rate": 5.681003584229391e-05, |
| "loss": 0.3351, |
| "step": 734 |
| }, |
| { |
| "epoch": 1.7469521260779066, |
| "grad_norm": 0.5338577032089233, |
| "learning_rate": 5.675029868578255e-05, |
| "loss": 0.3048, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.7493309545049063, |
| "grad_norm": 0.5534673929214478, |
| "learning_rate": 5.6690561529271216e-05, |
| "loss": 0.3444, |
| "step": 736 |
| }, |
| { |
| "epoch": 1.751709782931906, |
| "grad_norm": 0.5476568341255188, |
| "learning_rate": 5.663082437275986e-05, |
| "loss": 0.3367, |
| "step": 737 |
| }, |
| { |
| "epoch": 1.7540886113589056, |
| "grad_norm": 0.5127911567687988, |
| "learning_rate": 5.657108721624851e-05, |
| "loss": 0.4117, |
| "step": 738 |
| }, |
| { |
| "epoch": 1.7564674397859055, |
| "grad_norm": 0.5235200524330139, |
| "learning_rate": 5.6511350059737153e-05, |
| "loss": 0.2968, |
| "step": 739 |
| }, |
| { |
| "epoch": 1.7588462682129051, |
| "grad_norm": 0.5279961228370667, |
| "learning_rate": 5.645161290322582e-05, |
| "loss": 0.337, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.761225096639905, |
| "grad_norm": 0.5380825400352478, |
| "learning_rate": 5.639187574671446e-05, |
| "loss": 0.3376, |
| "step": 741 |
| }, |
| { |
| "epoch": 1.7636039250669047, |
| "grad_norm": 0.5725474953651428, |
| "learning_rate": 5.633213859020311e-05, |
| "loss": 0.3843, |
| "step": 742 |
| }, |
| { |
| "epoch": 1.7659827534939043, |
| "grad_norm": 0.47840648889541626, |
| "learning_rate": 5.6272401433691755e-05, |
| "loss": 0.3816, |
| "step": 743 |
| }, |
| { |
| "epoch": 1.768361581920904, |
| "grad_norm": 0.5369259119033813, |
| "learning_rate": 5.6212664277180405e-05, |
| "loss": 0.3751, |
| "step": 744 |
| }, |
| { |
| "epoch": 1.7707404103479036, |
| "grad_norm": 0.5530030131340027, |
| "learning_rate": 5.615292712066906e-05, |
| "loss": 0.3387, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.7731192387749033, |
| "grad_norm": 0.47207772731781006, |
| "learning_rate": 5.609318996415771e-05, |
| "loss": 0.3897, |
| "step": 746 |
| }, |
| { |
| "epoch": 1.775498067201903, |
| "grad_norm": 0.49299487471580505, |
| "learning_rate": 5.6033452807646356e-05, |
| "loss": 0.3318, |
| "step": 747 |
| }, |
| { |
| "epoch": 1.7778768956289026, |
| "grad_norm": 0.48968756198883057, |
| "learning_rate": 5.5973715651135006e-05, |
| "loss": 0.3949, |
| "step": 748 |
| }, |
| { |
| "epoch": 1.7802557240559025, |
| "grad_norm": 0.4778074026107788, |
| "learning_rate": 5.5913978494623656e-05, |
| "loss": 0.3891, |
| "step": 749 |
| }, |
| { |
| "epoch": 1.7826345524829021, |
| "grad_norm": 0.536257803440094, |
| "learning_rate": 5.5854241338112313e-05, |
| "loss": 0.3005, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.785013380909902, |
| "grad_norm": 0.5621615648269653, |
| "learning_rate": 5.579450418160096e-05, |
| "loss": 0.384, |
| "step": 751 |
| }, |
| { |
| "epoch": 1.7873922093369017, |
| "grad_norm": 0.4926300644874573, |
| "learning_rate": 5.573476702508961e-05, |
| "loss": 0.3277, |
| "step": 752 |
| }, |
| { |
| "epoch": 1.7897710377639013, |
| "grad_norm": 0.49344128370285034, |
| "learning_rate": 5.567502986857826e-05, |
| "loss": 0.3386, |
| "step": 753 |
| }, |
| { |
| "epoch": 1.792149866190901, |
| "grad_norm": 0.4578976333141327, |
| "learning_rate": 5.5615292712066915e-05, |
| "loss": 0.3371, |
| "step": 754 |
| }, |
| { |
| "epoch": 1.7945286946179007, |
| "grad_norm": 0.5874441266059875, |
| "learning_rate": 5.555555555555556e-05, |
| "loss": 0.3814, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.7969075230449003, |
| "grad_norm": 0.548207700252533, |
| "learning_rate": 5.549581839904421e-05, |
| "loss": 0.377, |
| "step": 756 |
| }, |
| { |
| "epoch": 1.7992863514719, |
| "grad_norm": 0.4734908938407898, |
| "learning_rate": 5.543608124253286e-05, |
| "loss": 0.3264, |
| "step": 757 |
| }, |
| { |
| "epoch": 1.8016651798988996, |
| "grad_norm": 0.5327095985412598, |
| "learning_rate": 5.53763440860215e-05, |
| "loss": 0.3742, |
| "step": 758 |
| }, |
| { |
| "epoch": 1.8040440083258995, |
| "grad_norm": 0.562744140625, |
| "learning_rate": 5.531660692951016e-05, |
| "loss": 0.3545, |
| "step": 759 |
| }, |
| { |
| "epoch": 1.8064228367528992, |
| "grad_norm": 0.47441789507865906, |
| "learning_rate": 5.525686977299881e-05, |
| "loss": 0.3274, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.808801665179899, |
| "grad_norm": 0.5133342742919922, |
| "learning_rate": 5.519713261648746e-05, |
| "loss": 0.3693, |
| "step": 761 |
| }, |
| { |
| "epoch": 1.8111804936068987, |
| "grad_norm": 0.5697010159492493, |
| "learning_rate": 5.5137395459976104e-05, |
| "loss": 0.3936, |
| "step": 762 |
| }, |
| { |
| "epoch": 1.8135593220338984, |
| "grad_norm": 0.5646381974220276, |
| "learning_rate": 5.507765830346476e-05, |
| "loss": 0.4184, |
| "step": 763 |
| }, |
| { |
| "epoch": 1.815938150460898, |
| "grad_norm": 0.4727121889591217, |
| "learning_rate": 5.501792114695341e-05, |
| "loss": 0.3406, |
| "step": 764 |
| }, |
| { |
| "epoch": 1.8183169788878977, |
| "grad_norm": 0.5393305420875549, |
| "learning_rate": 5.495818399044206e-05, |
| "loss": 0.3324, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.8206958073148973, |
| "grad_norm": 0.5477187037467957, |
| "learning_rate": 5.4898446833930705e-05, |
| "loss": 0.3, |
| "step": 766 |
| }, |
| { |
| "epoch": 1.823074635741897, |
| "grad_norm": 0.6073659062385559, |
| "learning_rate": 5.4838709677419355e-05, |
| "loss": 0.439, |
| "step": 767 |
| }, |
| { |
| "epoch": 1.8254534641688969, |
| "grad_norm": 0.5603262186050415, |
| "learning_rate": 5.477897252090801e-05, |
| "loss": 0.363, |
| "step": 768 |
| }, |
| { |
| "epoch": 1.8278322925958965, |
| "grad_norm": 0.507232129573822, |
| "learning_rate": 5.471923536439666e-05, |
| "loss": 0.307, |
| "step": 769 |
| }, |
| { |
| "epoch": 1.8302111210228962, |
| "grad_norm": 0.605401337146759, |
| "learning_rate": 5.4659498207885306e-05, |
| "loss": 0.3638, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.832589949449896, |
| "grad_norm": 0.4511045515537262, |
| "learning_rate": 5.4599761051373956e-05, |
| "loss": 0.2924, |
| "step": 771 |
| }, |
| { |
| "epoch": 1.8349687778768957, |
| "grad_norm": 0.5773246884346008, |
| "learning_rate": 5.45400238948626e-05, |
| "loss": 0.3979, |
| "step": 772 |
| }, |
| { |
| "epoch": 1.8373476063038954, |
| "grad_norm": 0.5383531451225281, |
| "learning_rate": 5.4480286738351264e-05, |
| "loss": 0.3491, |
| "step": 773 |
| }, |
| { |
| "epoch": 1.839726434730895, |
| "grad_norm": 0.48941031098365784, |
| "learning_rate": 5.442054958183991e-05, |
| "loss": 0.3749, |
| "step": 774 |
| }, |
| { |
| "epoch": 1.8421052631578947, |
| "grad_norm": 0.5394012331962585, |
| "learning_rate": 5.436081242532856e-05, |
| "loss": 0.3267, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.8444840915848943, |
| "grad_norm": 0.5717118382453918, |
| "learning_rate": 5.43010752688172e-05, |
| "loss": 0.3667, |
| "step": 776 |
| }, |
| { |
| "epoch": 1.846862920011894, |
| "grad_norm": 0.634171724319458, |
| "learning_rate": 5.4241338112305865e-05, |
| "loss": 0.4774, |
| "step": 777 |
| }, |
| { |
| "epoch": 1.8492417484388939, |
| "grad_norm": 0.5577214956283569, |
| "learning_rate": 5.418160095579451e-05, |
| "loss": 0.393, |
| "step": 778 |
| }, |
| { |
| "epoch": 1.8516205768658935, |
| "grad_norm": 0.6162117123603821, |
| "learning_rate": 5.412186379928316e-05, |
| "loss": 0.416, |
| "step": 779 |
| }, |
| { |
| "epoch": 1.8539994052928932, |
| "grad_norm": 0.48546504974365234, |
| "learning_rate": 5.40621266427718e-05, |
| "loss": 0.2887, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.856378233719893, |
| "grad_norm": 0.5625360012054443, |
| "learning_rate": 5.400238948626045e-05, |
| "loss": 0.3789, |
| "step": 781 |
| }, |
| { |
| "epoch": 1.8587570621468927, |
| "grad_norm": 0.5299000144004822, |
| "learning_rate": 5.394265232974911e-05, |
| "loss": 0.3646, |
| "step": 782 |
| }, |
| { |
| "epoch": 1.8611358905738924, |
| "grad_norm": 0.5966357588768005, |
| "learning_rate": 5.388291517323776e-05, |
| "loss": 0.3386, |
| "step": 783 |
| }, |
| { |
| "epoch": 1.863514719000892, |
| "grad_norm": 0.520456075668335, |
| "learning_rate": 5.3823178016726403e-05, |
| "loss": 0.3239, |
| "step": 784 |
| }, |
| { |
| "epoch": 1.8658935474278917, |
| "grad_norm": 0.5554583668708801, |
| "learning_rate": 5.3763440860215054e-05, |
| "loss": 0.3616, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.8682723758548914, |
| "grad_norm": 0.5244592428207397, |
| "learning_rate": 5.370370370370371e-05, |
| "loss": 0.3714, |
| "step": 786 |
| }, |
| { |
| "epoch": 1.870651204281891, |
| "grad_norm": 0.4749840497970581, |
| "learning_rate": 5.364396654719236e-05, |
| "loss": 0.3731, |
| "step": 787 |
| }, |
| { |
| "epoch": 1.873030032708891, |
| "grad_norm": 0.5398964881896973, |
| "learning_rate": 5.3584229390681005e-05, |
| "loss": 0.3281, |
| "step": 788 |
| }, |
| { |
| "epoch": 1.8754088611358906, |
| "grad_norm": 0.5688257217407227, |
| "learning_rate": 5.3524492234169655e-05, |
| "loss": 0.3677, |
| "step": 789 |
| }, |
| { |
| "epoch": 1.8777876895628904, |
| "grad_norm": 0.5520471930503845, |
| "learning_rate": 5.34647550776583e-05, |
| "loss": 0.3801, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.88016651798989, |
| "grad_norm": 0.547389566898346, |
| "learning_rate": 5.340501792114696e-05, |
| "loss": 0.368, |
| "step": 791 |
| }, |
| { |
| "epoch": 1.8825453464168898, |
| "grad_norm": 0.47842028737068176, |
| "learning_rate": 5.3345280764635606e-05, |
| "loss": 0.3322, |
| "step": 792 |
| }, |
| { |
| "epoch": 1.8849241748438894, |
| "grad_norm": 0.5586636066436768, |
| "learning_rate": 5.3285543608124256e-05, |
| "loss": 0.3678, |
| "step": 793 |
| }, |
| { |
| "epoch": 1.887303003270889, |
| "grad_norm": 0.5233385562896729, |
| "learning_rate": 5.32258064516129e-05, |
| "loss": 0.3498, |
| "step": 794 |
| }, |
| { |
| "epoch": 1.8896818316978887, |
| "grad_norm": 0.4903348684310913, |
| "learning_rate": 5.316606929510155e-05, |
| "loss": 0.3427, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.8920606601248884, |
| "grad_norm": 0.48647114634513855, |
| "learning_rate": 5.310633213859021e-05, |
| "loss": 0.2726, |
| "step": 796 |
| }, |
| { |
| "epoch": 1.894439488551888, |
| "grad_norm": 0.5279799699783325, |
| "learning_rate": 5.304659498207886e-05, |
| "loss": 0.3228, |
| "step": 797 |
| }, |
| { |
| "epoch": 1.896818316978888, |
| "grad_norm": 0.6157146692276001, |
| "learning_rate": 5.29868578255675e-05, |
| "loss": 0.3942, |
| "step": 798 |
| }, |
| { |
| "epoch": 1.8991971454058876, |
| "grad_norm": 0.5570871829986572, |
| "learning_rate": 5.292712066905615e-05, |
| "loss": 0.375, |
| "step": 799 |
| }, |
| { |
| "epoch": 1.9015759738328875, |
| "grad_norm": 0.48831409215927124, |
| "learning_rate": 5.286738351254481e-05, |
| "loss": 0.3498, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.9015759738328875, |
| "eval_loss": 0.3837396204471588, |
| "eval_runtime": 24.7664, |
| "eval_samples_per_second": 30.202, |
| "eval_steps_per_second": 15.101, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.9039548022598871, |
| "grad_norm": 0.5372702479362488, |
| "learning_rate": 5.280764635603346e-05, |
| "loss": 0.3304, |
| "step": 801 |
| }, |
| { |
| "epoch": 1.9063336306868868, |
| "grad_norm": 0.4852701425552368, |
| "learning_rate": 5.27479091995221e-05, |
| "loss": 0.3299, |
| "step": 802 |
| }, |
| { |
| "epoch": 1.9087124591138864, |
| "grad_norm": 0.614133894443512, |
| "learning_rate": 5.268817204301075e-05, |
| "loss": 0.3738, |
| "step": 803 |
| }, |
| { |
| "epoch": 1.911091287540886, |
| "grad_norm": 0.5791040658950806, |
| "learning_rate": 5.26284348864994e-05, |
| "loss": 0.3164, |
| "step": 804 |
| }, |
| { |
| "epoch": 1.9134701159678857, |
| "grad_norm": 0.542182207107544, |
| "learning_rate": 5.256869772998806e-05, |
| "loss": 0.3784, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.9158489443948854, |
| "grad_norm": 0.5163738131523132, |
| "learning_rate": 5.25089605734767e-05, |
| "loss": 0.3684, |
| "step": 806 |
| }, |
| { |
| "epoch": 1.918227772821885, |
| "grad_norm": 0.5529613494873047, |
| "learning_rate": 5.2449223416965354e-05, |
| "loss": 0.3658, |
| "step": 807 |
| }, |
| { |
| "epoch": 1.920606601248885, |
| "grad_norm": 0.545074462890625, |
| "learning_rate": 5.2389486260454004e-05, |
| "loss": 0.3343, |
| "step": 808 |
| }, |
| { |
| "epoch": 1.9229854296758846, |
| "grad_norm": 0.5097286701202393, |
| "learning_rate": 5.232974910394266e-05, |
| "loss": 0.3189, |
| "step": 809 |
| }, |
| { |
| "epoch": 1.9253642581028845, |
| "grad_norm": 0.5587269067764282, |
| "learning_rate": 5.2270011947431304e-05, |
| "loss": 0.2856, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.9277430865298841, |
| "grad_norm": 0.47120344638824463, |
| "learning_rate": 5.2210274790919955e-05, |
| "loss": 0.336, |
| "step": 811 |
| }, |
| { |
| "epoch": 1.9301219149568838, |
| "grad_norm": 0.5026476979255676, |
| "learning_rate": 5.2150537634408605e-05, |
| "loss": 0.3642, |
| "step": 812 |
| }, |
| { |
| "epoch": 1.9325007433838834, |
| "grad_norm": 0.4788722097873688, |
| "learning_rate": 5.209080047789725e-05, |
| "loss": 0.3609, |
| "step": 813 |
| }, |
| { |
| "epoch": 1.934879571810883, |
| "grad_norm": 0.5492738485336304, |
| "learning_rate": 5.2031063321385906e-05, |
| "loss": 0.3353, |
| "step": 814 |
| }, |
| { |
| "epoch": 1.9372584002378828, |
| "grad_norm": 0.5270171165466309, |
| "learning_rate": 5.1971326164874556e-05, |
| "loss": 0.2813, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.9396372286648824, |
| "grad_norm": 0.5411754250526428, |
| "learning_rate": 5.1911589008363206e-05, |
| "loss": 0.3842, |
| "step": 816 |
| }, |
| { |
| "epoch": 1.9420160570918823, |
| "grad_norm": 0.5464585423469543, |
| "learning_rate": 5.185185185185185e-05, |
| "loss": 0.339, |
| "step": 817 |
| }, |
| { |
| "epoch": 1.944394885518882, |
| "grad_norm": 0.5153761506080627, |
| "learning_rate": 5.17921146953405e-05, |
| "loss": 0.3486, |
| "step": 818 |
| }, |
| { |
| "epoch": 1.9467737139458816, |
| "grad_norm": 0.5572924613952637, |
| "learning_rate": 5.173237753882916e-05, |
| "loss": 0.39, |
| "step": 819 |
| }, |
| { |
| "epoch": 1.9491525423728815, |
| "grad_norm": 0.5603399872779846, |
| "learning_rate": 5.167264038231781e-05, |
| "loss": 0.3799, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.9515313707998811, |
| "grad_norm": 0.5274227857589722, |
| "learning_rate": 5.161290322580645e-05, |
| "loss": 0.3355, |
| "step": 821 |
| }, |
| { |
| "epoch": 1.9539101992268808, |
| "grad_norm": 0.5358468890190125, |
| "learning_rate": 5.15531660692951e-05, |
| "loss": 0.3252, |
| "step": 822 |
| }, |
| { |
| "epoch": 1.9562890276538805, |
| "grad_norm": 0.5450243353843689, |
| "learning_rate": 5.149342891278376e-05, |
| "loss": 0.3685, |
| "step": 823 |
| }, |
| { |
| "epoch": 1.9586678560808801, |
| "grad_norm": 0.5191539525985718, |
| "learning_rate": 5.143369175627241e-05, |
| "loss": 0.3151, |
| "step": 824 |
| }, |
| { |
| "epoch": 1.9610466845078798, |
| "grad_norm": 0.48101523518562317, |
| "learning_rate": 5.137395459976105e-05, |
| "loss": 0.3056, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.9634255129348794, |
| "grad_norm": 0.4933248460292816, |
| "learning_rate": 5.13142174432497e-05, |
| "loss": 0.3303, |
| "step": 826 |
| }, |
| { |
| "epoch": 1.9658043413618793, |
| "grad_norm": 0.5369696617126465, |
| "learning_rate": 5.1254480286738346e-05, |
| "loss": 0.3708, |
| "step": 827 |
| }, |
| { |
| "epoch": 1.968183169788879, |
| "grad_norm": 0.4536949396133423, |
| "learning_rate": 5.119474313022701e-05, |
| "loss": 0.3057, |
| "step": 828 |
| }, |
| { |
| "epoch": 1.9705619982158786, |
| "grad_norm": 0.5654657483100891, |
| "learning_rate": 5.1135005973715653e-05, |
| "loss": 0.4042, |
| "step": 829 |
| }, |
| { |
| "epoch": 1.9729408266428785, |
| "grad_norm": 0.5114840865135193, |
| "learning_rate": 5.1075268817204304e-05, |
| "loss": 0.3863, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.9753196550698782, |
| "grad_norm": 0.5588715672492981, |
| "learning_rate": 5.101553166069295e-05, |
| "loss": 0.3143, |
| "step": 831 |
| }, |
| { |
| "epoch": 1.9776984834968778, |
| "grad_norm": 0.5302734375, |
| "learning_rate": 5.095579450418161e-05, |
| "loss": 0.3683, |
| "step": 832 |
| }, |
| { |
| "epoch": 1.9800773119238775, |
| "grad_norm": 0.4852687120437622, |
| "learning_rate": 5.0896057347670255e-05, |
| "loss": 0.2965, |
| "step": 833 |
| }, |
| { |
| "epoch": 1.9824561403508771, |
| "grad_norm": 0.5499604940414429, |
| "learning_rate": 5.0836320191158905e-05, |
| "loss": 0.3342, |
| "step": 834 |
| }, |
| { |
| "epoch": 1.9848349687778768, |
| "grad_norm": 0.5646479725837708, |
| "learning_rate": 5.077658303464755e-05, |
| "loss": 0.3291, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.9872137972048765, |
| "grad_norm": 0.4746275246143341, |
| "learning_rate": 5.07168458781362e-05, |
| "loss": 0.3243, |
| "step": 836 |
| }, |
| { |
| "epoch": 1.9895926256318763, |
| "grad_norm": 0.5071300864219666, |
| "learning_rate": 5.0657108721624856e-05, |
| "loss": 0.3593, |
| "step": 837 |
| }, |
| { |
| "epoch": 1.991971454058876, |
| "grad_norm": 0.4464685916900635, |
| "learning_rate": 5.0597371565113506e-05, |
| "loss": 0.3016, |
| "step": 838 |
| }, |
| { |
| "epoch": 1.9943502824858759, |
| "grad_norm": 0.5198728442192078, |
| "learning_rate": 5.053763440860215e-05, |
| "loss": 0.4076, |
| "step": 839 |
| }, |
| { |
| "epoch": 1.9967291109128755, |
| "grad_norm": 0.4641326367855072, |
| "learning_rate": 5.04778972520908e-05, |
| "loss": 0.3481, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.9991079393398752, |
| "grad_norm": 0.5178696513175964, |
| "learning_rate": 5.041816009557945e-05, |
| "loss": 0.3941, |
| "step": 841 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.9851695895195007, |
| "learning_rate": 5.035842293906811e-05, |
| "loss": 0.3565, |
| "step": 842 |
| }, |
| { |
| "epoch": 2.0023788284269997, |
| "grad_norm": 0.47530972957611084, |
| "learning_rate": 5.029868578255675e-05, |
| "loss": 0.319, |
| "step": 843 |
| }, |
| { |
| "epoch": 2.0047576568539993, |
| "grad_norm": 0.42570582032203674, |
| "learning_rate": 5.02389486260454e-05, |
| "loss": 0.2761, |
| "step": 844 |
| }, |
| { |
| "epoch": 2.007136485280999, |
| "grad_norm": 0.45585909485816956, |
| "learning_rate": 5.017921146953405e-05, |
| "loss": 0.3012, |
| "step": 845 |
| }, |
| { |
| "epoch": 2.0095153137079986, |
| "grad_norm": 0.4082046449184418, |
| "learning_rate": 5.011947431302271e-05, |
| "loss": 0.2773, |
| "step": 846 |
| }, |
| { |
| "epoch": 2.0118941421349987, |
| "grad_norm": 0.4512028992176056, |
| "learning_rate": 5.005973715651135e-05, |
| "loss": 0.32, |
| "step": 847 |
| }, |
| { |
| "epoch": 2.0142729705619984, |
| "grad_norm": 0.5121605396270752, |
| "learning_rate": 5e-05, |
| "loss": 0.3279, |
| "step": 848 |
| }, |
| { |
| "epoch": 2.016651798988998, |
| "grad_norm": 0.5313460230827332, |
| "learning_rate": 4.994026284348865e-05, |
| "loss": 0.338, |
| "step": 849 |
| }, |
| { |
| "epoch": 2.0190306274159977, |
| "grad_norm": 0.45323655009269714, |
| "learning_rate": 4.98805256869773e-05, |
| "loss": 0.2707, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.0214094558429974, |
| "grad_norm": 0.5094829201698303, |
| "learning_rate": 4.982078853046595e-05, |
| "loss": 0.2984, |
| "step": 851 |
| }, |
| { |
| "epoch": 2.023788284269997, |
| "grad_norm": 0.4851239323616028, |
| "learning_rate": 4.9761051373954604e-05, |
| "loss": 0.2722, |
| "step": 852 |
| }, |
| { |
| "epoch": 2.0261671126969967, |
| "grad_norm": 0.4356750547885895, |
| "learning_rate": 4.9701314217443254e-05, |
| "loss": 0.2569, |
| "step": 853 |
| }, |
| { |
| "epoch": 2.0285459411239963, |
| "grad_norm": 0.5339999794960022, |
| "learning_rate": 4.9641577060931904e-05, |
| "loss": 0.306, |
| "step": 854 |
| }, |
| { |
| "epoch": 2.030924769550996, |
| "grad_norm": 0.6356980204582214, |
| "learning_rate": 4.9581839904420555e-05, |
| "loss": 0.3564, |
| "step": 855 |
| }, |
| { |
| "epoch": 2.0333035979779956, |
| "grad_norm": 0.4849913418292999, |
| "learning_rate": 4.95221027479092e-05, |
| "loss": 0.3277, |
| "step": 856 |
| }, |
| { |
| "epoch": 2.0356824264049957, |
| "grad_norm": 0.5072147250175476, |
| "learning_rate": 4.9462365591397855e-05, |
| "loss": 0.305, |
| "step": 857 |
| }, |
| { |
| "epoch": 2.0380612548319954, |
| "grad_norm": 0.5672339797019958, |
| "learning_rate": 4.94026284348865e-05, |
| "loss": 0.3457, |
| "step": 858 |
| }, |
| { |
| "epoch": 2.040440083258995, |
| "grad_norm": 0.5494199991226196, |
| "learning_rate": 4.9342891278375156e-05, |
| "loss": 0.3739, |
| "step": 859 |
| }, |
| { |
| "epoch": 2.0428189116859947, |
| "grad_norm": 0.5226100087165833, |
| "learning_rate": 4.92831541218638e-05, |
| "loss": 0.2661, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.0451977401129944, |
| "grad_norm": 0.49554064869880676, |
| "learning_rate": 4.9223416965352456e-05, |
| "loss": 0.2994, |
| "step": 861 |
| }, |
| { |
| "epoch": 2.047576568539994, |
| "grad_norm": 0.5684688091278076, |
| "learning_rate": 4.91636798088411e-05, |
| "loss": 0.353, |
| "step": 862 |
| }, |
| { |
| "epoch": 2.0499553969669937, |
| "grad_norm": 0.5614392161369324, |
| "learning_rate": 4.910394265232976e-05, |
| "loss": 0.2898, |
| "step": 863 |
| }, |
| { |
| "epoch": 2.0523342253939933, |
| "grad_norm": 0.522982120513916, |
| "learning_rate": 4.90442054958184e-05, |
| "loss": 0.277, |
| "step": 864 |
| }, |
| { |
| "epoch": 2.054713053820993, |
| "grad_norm": 0.48135489225387573, |
| "learning_rate": 4.898446833930705e-05, |
| "loss": 0.2326, |
| "step": 865 |
| }, |
| { |
| "epoch": 2.0570918822479927, |
| "grad_norm": 0.505936324596405, |
| "learning_rate": 4.89247311827957e-05, |
| "loss": 0.2519, |
| "step": 866 |
| }, |
| { |
| "epoch": 2.0594707106749928, |
| "grad_norm": 0.5319898724555969, |
| "learning_rate": 4.886499402628435e-05, |
| "loss": 0.2914, |
| "step": 867 |
| }, |
| { |
| "epoch": 2.0618495391019924, |
| "grad_norm": 0.5627797842025757, |
| "learning_rate": 4.8805256869773e-05, |
| "loss": 0.2827, |
| "step": 868 |
| }, |
| { |
| "epoch": 2.064228367528992, |
| "grad_norm": 0.5255232453346252, |
| "learning_rate": 4.874551971326165e-05, |
| "loss": 0.2431, |
| "step": 869 |
| }, |
| { |
| "epoch": 2.0666071959559917, |
| "grad_norm": 0.58738774061203, |
| "learning_rate": 4.86857825567503e-05, |
| "loss": 0.2937, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.0689860243829914, |
| "grad_norm": 0.4902366101741791, |
| "learning_rate": 4.862604540023895e-05, |
| "loss": 0.2908, |
| "step": 871 |
| }, |
| { |
| "epoch": 2.071364852809991, |
| "grad_norm": 0.5246136784553528, |
| "learning_rate": 4.8566308243727596e-05, |
| "loss": 0.2602, |
| "step": 872 |
| }, |
| { |
| "epoch": 2.0737436812369907, |
| "grad_norm": 0.555491030216217, |
| "learning_rate": 4.850657108721625e-05, |
| "loss": 0.2981, |
| "step": 873 |
| }, |
| { |
| "epoch": 2.0761225096639904, |
| "grad_norm": 0.5617752075195312, |
| "learning_rate": 4.84468339307049e-05, |
| "loss": 0.3404, |
| "step": 874 |
| }, |
| { |
| "epoch": 2.07850133809099, |
| "grad_norm": 0.5913593769073486, |
| "learning_rate": 4.8387096774193554e-05, |
| "loss": 0.3252, |
| "step": 875 |
| }, |
| { |
| "epoch": 2.0808801665179897, |
| "grad_norm": 0.5444798469543457, |
| "learning_rate": 4.83273596176822e-05, |
| "loss": 0.2884, |
| "step": 876 |
| }, |
| { |
| "epoch": 2.08325899494499, |
| "grad_norm": 0.5903862714767456, |
| "learning_rate": 4.8267622461170854e-05, |
| "loss": 0.2807, |
| "step": 877 |
| }, |
| { |
| "epoch": 2.0856378233719894, |
| "grad_norm": 0.49321287870407104, |
| "learning_rate": 4.82078853046595e-05, |
| "loss": 0.2533, |
| "step": 878 |
| }, |
| { |
| "epoch": 2.088016651798989, |
| "grad_norm": 0.5765815377235413, |
| "learning_rate": 4.814814814814815e-05, |
| "loss": 0.3025, |
| "step": 879 |
| }, |
| { |
| "epoch": 2.0903954802259888, |
| "grad_norm": 0.6189472079277039, |
| "learning_rate": 4.80884109916368e-05, |
| "loss": 0.3258, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.0927743086529884, |
| "grad_norm": 0.48916128277778625, |
| "learning_rate": 4.802867383512545e-05, |
| "loss": 0.2493, |
| "step": 881 |
| }, |
| { |
| "epoch": 2.095153137079988, |
| "grad_norm": 0.5639986991882324, |
| "learning_rate": 4.79689366786141e-05, |
| "loss": 0.3068, |
| "step": 882 |
| }, |
| { |
| "epoch": 2.0975319655069877, |
| "grad_norm": 0.6248376965522766, |
| "learning_rate": 4.790919952210275e-05, |
| "loss": 0.2955, |
| "step": 883 |
| }, |
| { |
| "epoch": 2.0999107939339874, |
| "grad_norm": 0.5759831666946411, |
| "learning_rate": 4.78494623655914e-05, |
| "loss": 0.3161, |
| "step": 884 |
| }, |
| { |
| "epoch": 2.102289622360987, |
| "grad_norm": 0.5416770577430725, |
| "learning_rate": 4.778972520908005e-05, |
| "loss": 0.2909, |
| "step": 885 |
| }, |
| { |
| "epoch": 2.1046684507879867, |
| "grad_norm": 0.5953570604324341, |
| "learning_rate": 4.77299880525687e-05, |
| "loss": 0.3355, |
| "step": 886 |
| }, |
| { |
| "epoch": 2.107047279214987, |
| "grad_norm": 0.5626474022865295, |
| "learning_rate": 4.767025089605735e-05, |
| "loss": 0.2999, |
| "step": 887 |
| }, |
| { |
| "epoch": 2.1094261076419865, |
| "grad_norm": 0.535835325717926, |
| "learning_rate": 4.7610513739546e-05, |
| "loss": 0.2792, |
| "step": 888 |
| }, |
| { |
| "epoch": 2.111804936068986, |
| "grad_norm": 0.4889310598373413, |
| "learning_rate": 4.755077658303465e-05, |
| "loss": 0.2638, |
| "step": 889 |
| }, |
| { |
| "epoch": 2.1141837644959858, |
| "grad_norm": 0.5014443397521973, |
| "learning_rate": 4.74910394265233e-05, |
| "loss": 0.2865, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.1165625929229854, |
| "grad_norm": 0.5222111344337463, |
| "learning_rate": 4.743130227001195e-05, |
| "loss": 0.2829, |
| "step": 891 |
| }, |
| { |
| "epoch": 2.118941421349985, |
| "grad_norm": 0.5849918723106384, |
| "learning_rate": 4.73715651135006e-05, |
| "loss": 0.3257, |
| "step": 892 |
| }, |
| { |
| "epoch": 2.1213202497769847, |
| "grad_norm": 0.516745924949646, |
| "learning_rate": 4.731182795698925e-05, |
| "loss": 0.3169, |
| "step": 893 |
| }, |
| { |
| "epoch": 2.1236990782039844, |
| "grad_norm": 0.5032657384872437, |
| "learning_rate": 4.72520908004779e-05, |
| "loss": 0.2685, |
| "step": 894 |
| }, |
| { |
| "epoch": 2.126077906630984, |
| "grad_norm": 0.5356237888336182, |
| "learning_rate": 4.7192353643966546e-05, |
| "loss": 0.2767, |
| "step": 895 |
| }, |
| { |
| "epoch": 2.128456735057984, |
| "grad_norm": 0.6084557771682739, |
| "learning_rate": 4.71326164874552e-05, |
| "loss": 0.2951, |
| "step": 896 |
| }, |
| { |
| "epoch": 2.130835563484984, |
| "grad_norm": 0.5304860472679138, |
| "learning_rate": 4.707287933094385e-05, |
| "loss": 0.2738, |
| "step": 897 |
| }, |
| { |
| "epoch": 2.1332143919119835, |
| "grad_norm": 0.5701313018798828, |
| "learning_rate": 4.7013142174432504e-05, |
| "loss": 0.3126, |
| "step": 898 |
| }, |
| { |
| "epoch": 2.135593220338983, |
| "grad_norm": 0.5780851244926453, |
| "learning_rate": 4.695340501792115e-05, |
| "loss": 0.2865, |
| "step": 899 |
| }, |
| { |
| "epoch": 2.137972048765983, |
| "grad_norm": 0.4513755440711975, |
| "learning_rate": 4.6893667861409805e-05, |
| "loss": 0.2668, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.137972048765983, |
| "eval_loss": 0.39199650287628174, |
| "eval_runtime": 24.8995, |
| "eval_samples_per_second": 30.041, |
| "eval_steps_per_second": 15.02, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.1403508771929824, |
| "grad_norm": 0.5768704414367676, |
| "learning_rate": 4.683393070489845e-05, |
| "loss": 0.256, |
| "step": 901 |
| }, |
| { |
| "epoch": 2.142729705619982, |
| "grad_norm": 0.5071364045143127, |
| "learning_rate": 4.67741935483871e-05, |
| "loss": 0.2681, |
| "step": 902 |
| }, |
| { |
| "epoch": 2.1451085340469818, |
| "grad_norm": 0.6088786721229553, |
| "learning_rate": 4.671445639187575e-05, |
| "loss": 0.2721, |
| "step": 903 |
| }, |
| { |
| "epoch": 2.1474873624739814, |
| "grad_norm": 0.5659234523773193, |
| "learning_rate": 4.66547192353644e-05, |
| "loss": 0.3302, |
| "step": 904 |
| }, |
| { |
| "epoch": 2.149866190900981, |
| "grad_norm": 0.5510467886924744, |
| "learning_rate": 4.659498207885305e-05, |
| "loss": 0.2804, |
| "step": 905 |
| }, |
| { |
| "epoch": 2.1522450193279807, |
| "grad_norm": 0.6317784190177917, |
| "learning_rate": 4.65352449223417e-05, |
| "loss": 0.4077, |
| "step": 906 |
| }, |
| { |
| "epoch": 2.154623847754981, |
| "grad_norm": 0.5438193082809448, |
| "learning_rate": 4.647550776583035e-05, |
| "loss": 0.349, |
| "step": 907 |
| }, |
| { |
| "epoch": 2.1570026761819805, |
| "grad_norm": 0.5480299592018127, |
| "learning_rate": 4.6415770609319e-05, |
| "loss": 0.2482, |
| "step": 908 |
| }, |
| { |
| "epoch": 2.15938150460898, |
| "grad_norm": 0.5093241333961487, |
| "learning_rate": 4.635603345280765e-05, |
| "loss": 0.285, |
| "step": 909 |
| }, |
| { |
| "epoch": 2.16176033303598, |
| "grad_norm": 0.600224494934082, |
| "learning_rate": 4.62962962962963e-05, |
| "loss": 0.2936, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.1641391614629795, |
| "grad_norm": 0.5286124348640442, |
| "learning_rate": 4.6236559139784944e-05, |
| "loss": 0.3155, |
| "step": 911 |
| }, |
| { |
| "epoch": 2.166517989889979, |
| "grad_norm": 0.5486891865730286, |
| "learning_rate": 4.61768219832736e-05, |
| "loss": 0.2868, |
| "step": 912 |
| }, |
| { |
| "epoch": 2.168896818316979, |
| "grad_norm": 0.5705512166023254, |
| "learning_rate": 4.6117084826762245e-05, |
| "loss": 0.2534, |
| "step": 913 |
| }, |
| { |
| "epoch": 2.1712756467439784, |
| "grad_norm": 0.5283891558647156, |
| "learning_rate": 4.60573476702509e-05, |
| "loss": 0.2449, |
| "step": 914 |
| }, |
| { |
| "epoch": 2.173654475170978, |
| "grad_norm": 0.5561732053756714, |
| "learning_rate": 4.5997610513739546e-05, |
| "loss": 0.2469, |
| "step": 915 |
| }, |
| { |
| "epoch": 2.176033303597978, |
| "grad_norm": 0.5191594362258911, |
| "learning_rate": 4.59378733572282e-05, |
| "loss": 0.266, |
| "step": 916 |
| }, |
| { |
| "epoch": 2.178412132024978, |
| "grad_norm": 0.5965383648872375, |
| "learning_rate": 4.5878136200716846e-05, |
| "loss": 0.3015, |
| "step": 917 |
| }, |
| { |
| "epoch": 2.1807909604519775, |
| "grad_norm": 0.5929700136184692, |
| "learning_rate": 4.5818399044205496e-05, |
| "loss": 0.3173, |
| "step": 918 |
| }, |
| { |
| "epoch": 2.183169788878977, |
| "grad_norm": 0.5193952918052673, |
| "learning_rate": 4.575866188769415e-05, |
| "loss": 0.3002, |
| "step": 919 |
| }, |
| { |
| "epoch": 2.185548617305977, |
| "grad_norm": 0.5817952752113342, |
| "learning_rate": 4.56989247311828e-05, |
| "loss": 0.3217, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.1879274457329765, |
| "grad_norm": 0.5975373983383179, |
| "learning_rate": 4.563918757467145e-05, |
| "loss": 0.3132, |
| "step": 921 |
| }, |
| { |
| "epoch": 2.190306274159976, |
| "grad_norm": 0.5984764695167542, |
| "learning_rate": 4.55794504181601e-05, |
| "loss": 0.2953, |
| "step": 922 |
| }, |
| { |
| "epoch": 2.192685102586976, |
| "grad_norm": 0.6337423324584961, |
| "learning_rate": 4.551971326164875e-05, |
| "loss": 0.2945, |
| "step": 923 |
| }, |
| { |
| "epoch": 2.1950639310139755, |
| "grad_norm": 0.6041463613510132, |
| "learning_rate": 4.54599761051374e-05, |
| "loss": 0.2643, |
| "step": 924 |
| }, |
| { |
| "epoch": 2.197442759440975, |
| "grad_norm": 0.5808432102203369, |
| "learning_rate": 4.540023894862604e-05, |
| "loss": 0.3021, |
| "step": 925 |
| }, |
| { |
| "epoch": 2.199821587867975, |
| "grad_norm": 0.5449970364570618, |
| "learning_rate": 4.53405017921147e-05, |
| "loss": 0.2452, |
| "step": 926 |
| }, |
| { |
| "epoch": 2.202200416294975, |
| "grad_norm": 0.5480767488479614, |
| "learning_rate": 4.528076463560334e-05, |
| "loss": 0.2762, |
| "step": 927 |
| }, |
| { |
| "epoch": 2.2045792447219745, |
| "grad_norm": 0.5530625581741333, |
| "learning_rate": 4.5221027479092e-05, |
| "loss": 0.3128, |
| "step": 928 |
| }, |
| { |
| "epoch": 2.206958073148974, |
| "grad_norm": 0.5787012577056885, |
| "learning_rate": 4.516129032258064e-05, |
| "loss": 0.2905, |
| "step": 929 |
| }, |
| { |
| "epoch": 2.209336901575974, |
| "grad_norm": 0.5901174545288086, |
| "learning_rate": 4.51015531660693e-05, |
| "loss": 0.2822, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.2117157300029735, |
| "grad_norm": 0.571593165397644, |
| "learning_rate": 4.5041816009557944e-05, |
| "loss": 0.3344, |
| "step": 931 |
| }, |
| { |
| "epoch": 2.214094558429973, |
| "grad_norm": 0.6192464828491211, |
| "learning_rate": 4.49820788530466e-05, |
| "loss": 0.294, |
| "step": 932 |
| }, |
| { |
| "epoch": 2.216473386856973, |
| "grad_norm": 0.5928755402565002, |
| "learning_rate": 4.4922341696535244e-05, |
| "loss": 0.2952, |
| "step": 933 |
| }, |
| { |
| "epoch": 2.2188522152839725, |
| "grad_norm": 0.5460443496704102, |
| "learning_rate": 4.4862604540023894e-05, |
| "loss": 0.2638, |
| "step": 934 |
| }, |
| { |
| "epoch": 2.221231043710972, |
| "grad_norm": 0.47907713055610657, |
| "learning_rate": 4.4802867383512545e-05, |
| "loss": 0.2629, |
| "step": 935 |
| }, |
| { |
| "epoch": 2.2236098721379722, |
| "grad_norm": 0.552532434463501, |
| "learning_rate": 4.4743130227001195e-05, |
| "loss": 0.2394, |
| "step": 936 |
| }, |
| { |
| "epoch": 2.225988700564972, |
| "grad_norm": 0.6192559599876404, |
| "learning_rate": 4.4683393070489845e-05, |
| "loss": 0.2764, |
| "step": 937 |
| }, |
| { |
| "epoch": 2.2283675289919715, |
| "grad_norm": 0.6276203393936157, |
| "learning_rate": 4.4623655913978496e-05, |
| "loss": 0.3492, |
| "step": 938 |
| }, |
| { |
| "epoch": 2.230746357418971, |
| "grad_norm": 0.5791794061660767, |
| "learning_rate": 4.4563918757467146e-05, |
| "loss": 0.2908, |
| "step": 939 |
| }, |
| { |
| "epoch": 2.233125185845971, |
| "grad_norm": 0.5309539437294006, |
| "learning_rate": 4.4504181600955796e-05, |
| "loss": 0.2684, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.2355040142729705, |
| "grad_norm": 0.5494111180305481, |
| "learning_rate": 4.4444444444444447e-05, |
| "loss": 0.2727, |
| "step": 941 |
| }, |
| { |
| "epoch": 2.23788284269997, |
| "grad_norm": 0.5954378843307495, |
| "learning_rate": 4.43847072879331e-05, |
| "loss": 0.275, |
| "step": 942 |
| }, |
| { |
| "epoch": 2.24026167112697, |
| "grad_norm": 0.5295591950416565, |
| "learning_rate": 4.432497013142175e-05, |
| "loss": 0.259, |
| "step": 943 |
| }, |
| { |
| "epoch": 2.2426404995539695, |
| "grad_norm": 0.6501302123069763, |
| "learning_rate": 4.42652329749104e-05, |
| "loss": 0.3046, |
| "step": 944 |
| }, |
| { |
| "epoch": 2.2450193279809696, |
| "grad_norm": 0.6218990087509155, |
| "learning_rate": 4.420549581839905e-05, |
| "loss": 0.3367, |
| "step": 945 |
| }, |
| { |
| "epoch": 2.2473981564079692, |
| "grad_norm": 0.549957275390625, |
| "learning_rate": 4.41457586618877e-05, |
| "loss": 0.2916, |
| "step": 946 |
| }, |
| { |
| "epoch": 2.249776984834969, |
| "grad_norm": 0.6548086404800415, |
| "learning_rate": 4.408602150537635e-05, |
| "loss": 0.3352, |
| "step": 947 |
| }, |
| { |
| "epoch": 2.2521558132619686, |
| "grad_norm": 0.5784884095191956, |
| "learning_rate": 4.402628434886499e-05, |
| "loss": 0.3103, |
| "step": 948 |
| }, |
| { |
| "epoch": 2.254534641688968, |
| "grad_norm": 0.5839419364929199, |
| "learning_rate": 4.396654719235365e-05, |
| "loss": 0.3194, |
| "step": 949 |
| }, |
| { |
| "epoch": 2.256913470115968, |
| "grad_norm": 0.6073633432388306, |
| "learning_rate": 4.390681003584229e-05, |
| "loss": 0.3136, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.2592922985429675, |
| "grad_norm": 0.5259355902671814, |
| "learning_rate": 4.384707287933095e-05, |
| "loss": 0.2504, |
| "step": 951 |
| }, |
| { |
| "epoch": 2.261671126969967, |
| "grad_norm": 0.5521760582923889, |
| "learning_rate": 4.378733572281959e-05, |
| "loss": 0.2783, |
| "step": 952 |
| }, |
| { |
| "epoch": 2.264049955396967, |
| "grad_norm": 0.6189229488372803, |
| "learning_rate": 4.372759856630825e-05, |
| "loss": 0.2586, |
| "step": 953 |
| }, |
| { |
| "epoch": 2.2664287838239665, |
| "grad_norm": 0.5720909833908081, |
| "learning_rate": 4.3667861409796894e-05, |
| "loss": 0.2539, |
| "step": 954 |
| }, |
| { |
| "epoch": 2.268807612250966, |
| "grad_norm": 0.5107256174087524, |
| "learning_rate": 4.360812425328555e-05, |
| "loss": 0.2295, |
| "step": 955 |
| }, |
| { |
| "epoch": 2.2711864406779663, |
| "grad_norm": 0.5512247681617737, |
| "learning_rate": 4.3548387096774194e-05, |
| "loss": 0.323, |
| "step": 956 |
| }, |
| { |
| "epoch": 2.273565269104966, |
| "grad_norm": 0.6190577745437622, |
| "learning_rate": 4.3488649940262845e-05, |
| "loss": 0.3358, |
| "step": 957 |
| }, |
| { |
| "epoch": 2.2759440975319656, |
| "grad_norm": 0.5246328711509705, |
| "learning_rate": 4.3428912783751495e-05, |
| "loss": 0.2112, |
| "step": 958 |
| }, |
| { |
| "epoch": 2.2783229259589652, |
| "grad_norm": 0.6078363656997681, |
| "learning_rate": 4.3369175627240145e-05, |
| "loss": 0.2555, |
| "step": 959 |
| }, |
| { |
| "epoch": 2.280701754385965, |
| "grad_norm": 0.6270473599433899, |
| "learning_rate": 4.3309438470728796e-05, |
| "loss": 0.292, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.2830805828129646, |
| "grad_norm": 0.6397281885147095, |
| "learning_rate": 4.3249701314217446e-05, |
| "loss": 0.2761, |
| "step": 961 |
| }, |
| { |
| "epoch": 2.285459411239964, |
| "grad_norm": 0.6567078828811646, |
| "learning_rate": 4.3189964157706096e-05, |
| "loss": 0.2742, |
| "step": 962 |
| }, |
| { |
| "epoch": 2.287838239666964, |
| "grad_norm": 0.5602155327796936, |
| "learning_rate": 4.3130227001194746e-05, |
| "loss": 0.2316, |
| "step": 963 |
| }, |
| { |
| "epoch": 2.2902170680939635, |
| "grad_norm": 0.5947728157043457, |
| "learning_rate": 4.307048984468339e-05, |
| "loss": 0.3116, |
| "step": 964 |
| }, |
| { |
| "epoch": 2.2925958965209636, |
| "grad_norm": 0.5324766039848328, |
| "learning_rate": 4.301075268817205e-05, |
| "loss": 0.2906, |
| "step": 965 |
| }, |
| { |
| "epoch": 2.2949747249479633, |
| "grad_norm": 0.5670318603515625, |
| "learning_rate": 4.295101553166069e-05, |
| "loss": 0.3218, |
| "step": 966 |
| }, |
| { |
| "epoch": 2.297353553374963, |
| "grad_norm": 0.6235133409500122, |
| "learning_rate": 4.289127837514935e-05, |
| "loss": 0.2306, |
| "step": 967 |
| }, |
| { |
| "epoch": 2.2997323818019626, |
| "grad_norm": 0.6106896996498108, |
| "learning_rate": 4.283154121863799e-05, |
| "loss": 0.3038, |
| "step": 968 |
| }, |
| { |
| "epoch": 2.3021112102289623, |
| "grad_norm": 0.6239914298057556, |
| "learning_rate": 4.277180406212665e-05, |
| "loss": 0.289, |
| "step": 969 |
| }, |
| { |
| "epoch": 2.304490038655962, |
| "grad_norm": 0.588311493396759, |
| "learning_rate": 4.271206690561529e-05, |
| "loss": 0.301, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.3068688670829616, |
| "grad_norm": 0.6421441435813904, |
| "learning_rate": 4.265232974910394e-05, |
| "loss": 0.3437, |
| "step": 971 |
| }, |
| { |
| "epoch": 2.3092476955099612, |
| "grad_norm": 0.6681007742881775, |
| "learning_rate": 4.259259259259259e-05, |
| "loss": 0.33, |
| "step": 972 |
| }, |
| { |
| "epoch": 2.311626523936961, |
| "grad_norm": 0.6258997321128845, |
| "learning_rate": 4.253285543608124e-05, |
| "loss": 0.297, |
| "step": 973 |
| }, |
| { |
| "epoch": 2.314005352363961, |
| "grad_norm": 0.572743833065033, |
| "learning_rate": 4.247311827956989e-05, |
| "loss": 0.2313, |
| "step": 974 |
| }, |
| { |
| "epoch": 2.3163841807909606, |
| "grad_norm": 0.5629462003707886, |
| "learning_rate": 4.241338112305854e-05, |
| "loss": 0.2776, |
| "step": 975 |
| }, |
| { |
| "epoch": 2.3187630092179603, |
| "grad_norm": 0.5810324549674988, |
| "learning_rate": 4.2353643966547194e-05, |
| "loss": 0.2514, |
| "step": 976 |
| }, |
| { |
| "epoch": 2.32114183764496, |
| "grad_norm": 0.6199133396148682, |
| "learning_rate": 4.2293906810035844e-05, |
| "loss": 0.3355, |
| "step": 977 |
| }, |
| { |
| "epoch": 2.3235206660719596, |
| "grad_norm": 0.4951048195362091, |
| "learning_rate": 4.2234169653524494e-05, |
| "loss": 0.2709, |
| "step": 978 |
| }, |
| { |
| "epoch": 2.3258994944989593, |
| "grad_norm": 0.5519804358482361, |
| "learning_rate": 4.2174432497013144e-05, |
| "loss": 0.263, |
| "step": 979 |
| }, |
| { |
| "epoch": 2.328278322925959, |
| "grad_norm": 0.5669978857040405, |
| "learning_rate": 4.2114695340501795e-05, |
| "loss": 0.2733, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.3306571513529586, |
| "grad_norm": 0.5783933401107788, |
| "learning_rate": 4.2054958183990445e-05, |
| "loss": 0.3205, |
| "step": 981 |
| }, |
| { |
| "epoch": 2.3330359797799582, |
| "grad_norm": 0.5702626705169678, |
| "learning_rate": 4.1995221027479095e-05, |
| "loss": 0.2642, |
| "step": 982 |
| }, |
| { |
| "epoch": 2.335414808206958, |
| "grad_norm": 0.5818063020706177, |
| "learning_rate": 4.1935483870967746e-05, |
| "loss": 0.2586, |
| "step": 983 |
| }, |
| { |
| "epoch": 2.3377936366339576, |
| "grad_norm": 0.6229294538497925, |
| "learning_rate": 4.1875746714456396e-05, |
| "loss": 0.3086, |
| "step": 984 |
| }, |
| { |
| "epoch": 2.3401724650609577, |
| "grad_norm": 0.5085659027099609, |
| "learning_rate": 4.1816009557945046e-05, |
| "loss": 0.3015, |
| "step": 985 |
| }, |
| { |
| "epoch": 2.3425512934879573, |
| "grad_norm": 0.5718642473220825, |
| "learning_rate": 4.1756272401433697e-05, |
| "loss": 0.3043, |
| "step": 986 |
| }, |
| { |
| "epoch": 2.344930121914957, |
| "grad_norm": 0.6011469960212708, |
| "learning_rate": 4.169653524492234e-05, |
| "loss": 0.2431, |
| "step": 987 |
| }, |
| { |
| "epoch": 2.3473089503419566, |
| "grad_norm": 0.5666202306747437, |
| "learning_rate": 4.1636798088411e-05, |
| "loss": 0.2424, |
| "step": 988 |
| }, |
| { |
| "epoch": 2.3496877787689563, |
| "grad_norm": 0.4900258481502533, |
| "learning_rate": 4.157706093189964e-05, |
| "loss": 0.2791, |
| "step": 989 |
| }, |
| { |
| "epoch": 2.352066607195956, |
| "grad_norm": 0.5660254955291748, |
| "learning_rate": 4.15173237753883e-05, |
| "loss": 0.2993, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.3544454356229556, |
| "grad_norm": 0.5348349213600159, |
| "learning_rate": 4.145758661887694e-05, |
| "loss": 0.289, |
| "step": 991 |
| }, |
| { |
| "epoch": 2.3568242640499553, |
| "grad_norm": 0.6012505292892456, |
| "learning_rate": 4.13978494623656e-05, |
| "loss": 0.3272, |
| "step": 992 |
| }, |
| { |
| "epoch": 2.359203092476955, |
| "grad_norm": 0.6564033627510071, |
| "learning_rate": 4.133811230585424e-05, |
| "loss": 0.3512, |
| "step": 993 |
| }, |
| { |
| "epoch": 2.361581920903955, |
| "grad_norm": 0.6032342314720154, |
| "learning_rate": 4.127837514934289e-05, |
| "loss": 0.2783, |
| "step": 994 |
| }, |
| { |
| "epoch": 2.3639607493309547, |
| "grad_norm": 0.5665048360824585, |
| "learning_rate": 4.121863799283154e-05, |
| "loss": 0.2502, |
| "step": 995 |
| }, |
| { |
| "epoch": 2.3663395777579543, |
| "grad_norm": 0.5682721138000488, |
| "learning_rate": 4.115890083632019e-05, |
| "loss": 0.2321, |
| "step": 996 |
| }, |
| { |
| "epoch": 2.368718406184954, |
| "grad_norm": 0.5423948168754578, |
| "learning_rate": 4.109916367980884e-05, |
| "loss": 0.303, |
| "step": 997 |
| }, |
| { |
| "epoch": 2.3710972346119537, |
| "grad_norm": 0.6970639824867249, |
| "learning_rate": 4.1039426523297493e-05, |
| "loss": 0.3652, |
| "step": 998 |
| }, |
| { |
| "epoch": 2.3734760630389533, |
| "grad_norm": 0.5611357688903809, |
| "learning_rate": 4.0979689366786144e-05, |
| "loss": 0.3083, |
| "step": 999 |
| }, |
| { |
| "epoch": 2.375854891465953, |
| "grad_norm": 0.5091648697853088, |
| "learning_rate": 4.0919952210274794e-05, |
| "loss": 0.2621, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.375854891465953, |
| "eval_loss": 0.39063429832458496, |
| "eval_runtime": 24.8071, |
| "eval_samples_per_second": 30.153, |
| "eval_steps_per_second": 15.076, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.3782337198929526, |
| "grad_norm": 0.657648503780365, |
| "learning_rate": 4.0860215053763444e-05, |
| "loss": 0.3406, |
| "step": 1001 |
| }, |
| { |
| "epoch": 2.3806125483199523, |
| "grad_norm": 0.5850951671600342, |
| "learning_rate": 4.0800477897252095e-05, |
| "loss": 0.2944, |
| "step": 1002 |
| }, |
| { |
| "epoch": 2.382991376746952, |
| "grad_norm": 0.5571763515472412, |
| "learning_rate": 4.074074074074074e-05, |
| "loss": 0.248, |
| "step": 1003 |
| }, |
| { |
| "epoch": 2.3853702051739516, |
| "grad_norm": 0.5550372004508972, |
| "learning_rate": 4.0681003584229395e-05, |
| "loss": 0.3049, |
| "step": 1004 |
| }, |
| { |
| "epoch": 2.3877490336009517, |
| "grad_norm": 0.5554943680763245, |
| "learning_rate": 4.062126642771804e-05, |
| "loss": 0.3124, |
| "step": 1005 |
| }, |
| { |
| "epoch": 2.3901278620279514, |
| "grad_norm": 0.5197229385375977, |
| "learning_rate": 4.0561529271206696e-05, |
| "loss": 0.2344, |
| "step": 1006 |
| }, |
| { |
| "epoch": 2.392506690454951, |
| "grad_norm": 0.5891127586364746, |
| "learning_rate": 4.050179211469534e-05, |
| "loss": 0.305, |
| "step": 1007 |
| }, |
| { |
| "epoch": 2.3948855188819507, |
| "grad_norm": 0.6232504844665527, |
| "learning_rate": 4.0442054958183996e-05, |
| "loss": 0.3513, |
| "step": 1008 |
| }, |
| { |
| "epoch": 2.3972643473089503, |
| "grad_norm": 0.7023825645446777, |
| "learning_rate": 4.038231780167264e-05, |
| "loss": 0.3333, |
| "step": 1009 |
| }, |
| { |
| "epoch": 2.39964317573595, |
| "grad_norm": 0.6072371006011963, |
| "learning_rate": 4.032258064516129e-05, |
| "loss": 0.2898, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.4020220041629496, |
| "grad_norm": 0.5343210101127625, |
| "learning_rate": 4.026284348864994e-05, |
| "loss": 0.2502, |
| "step": 1011 |
| }, |
| { |
| "epoch": 2.4044008325899493, |
| "grad_norm": 0.5760728120803833, |
| "learning_rate": 4.020310633213859e-05, |
| "loss": 0.2727, |
| "step": 1012 |
| }, |
| { |
| "epoch": 2.406779661016949, |
| "grad_norm": 0.5475410223007202, |
| "learning_rate": 4.014336917562724e-05, |
| "loss": 0.3089, |
| "step": 1013 |
| }, |
| { |
| "epoch": 2.409158489443949, |
| "grad_norm": 0.6408705115318298, |
| "learning_rate": 4.008363201911589e-05, |
| "loss": 0.2844, |
| "step": 1014 |
| }, |
| { |
| "epoch": 2.4115373178709487, |
| "grad_norm": 0.5707472562789917, |
| "learning_rate": 4.002389486260454e-05, |
| "loss": 0.2819, |
| "step": 1015 |
| }, |
| { |
| "epoch": 2.4139161462979484, |
| "grad_norm": 0.5893364548683167, |
| "learning_rate": 3.996415770609319e-05, |
| "loss": 0.3223, |
| "step": 1016 |
| }, |
| { |
| "epoch": 2.416294974724948, |
| "grad_norm": 0.5145408511161804, |
| "learning_rate": 3.990442054958184e-05, |
| "loss": 0.2778, |
| "step": 1017 |
| }, |
| { |
| "epoch": 2.4186738031519477, |
| "grad_norm": 0.6715821623802185, |
| "learning_rate": 3.984468339307049e-05, |
| "loss": 0.3036, |
| "step": 1018 |
| }, |
| { |
| "epoch": 2.4210526315789473, |
| "grad_norm": 0.5638325810432434, |
| "learning_rate": 3.978494623655914e-05, |
| "loss": 0.3206, |
| "step": 1019 |
| }, |
| { |
| "epoch": 2.423431460005947, |
| "grad_norm": 0.5965414643287659, |
| "learning_rate": 3.972520908004779e-05, |
| "loss": 0.2817, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.4258102884329467, |
| "grad_norm": 0.6358485221862793, |
| "learning_rate": 3.9665471923536444e-05, |
| "loss": 0.3195, |
| "step": 1021 |
| }, |
| { |
| "epoch": 2.4281891168599463, |
| "grad_norm": 0.5645793080329895, |
| "learning_rate": 3.9605734767025094e-05, |
| "loss": 0.2882, |
| "step": 1022 |
| }, |
| { |
| "epoch": 2.4305679452869464, |
| "grad_norm": 0.6534497737884521, |
| "learning_rate": 3.9545997610513744e-05, |
| "loss": 0.2937, |
| "step": 1023 |
| }, |
| { |
| "epoch": 2.432946773713946, |
| "grad_norm": 0.5793471932411194, |
| "learning_rate": 3.9486260454002395e-05, |
| "loss": 0.2921, |
| "step": 1024 |
| }, |
| { |
| "epoch": 2.4353256021409457, |
| "grad_norm": 0.6896522045135498, |
| "learning_rate": 3.9426523297491045e-05, |
| "loss": 0.2904, |
| "step": 1025 |
| }, |
| { |
| "epoch": 2.4377044305679454, |
| "grad_norm": 0.6361718773841858, |
| "learning_rate": 3.936678614097969e-05, |
| "loss": 0.3316, |
| "step": 1026 |
| }, |
| { |
| "epoch": 2.440083258994945, |
| "grad_norm": 0.5773645043373108, |
| "learning_rate": 3.9307048984468345e-05, |
| "loss": 0.3056, |
| "step": 1027 |
| }, |
| { |
| "epoch": 2.4424620874219447, |
| "grad_norm": 0.5956210494041443, |
| "learning_rate": 3.924731182795699e-05, |
| "loss": 0.2966, |
| "step": 1028 |
| }, |
| { |
| "epoch": 2.4448409158489444, |
| "grad_norm": 0.50902259349823, |
| "learning_rate": 3.9187574671445646e-05, |
| "loss": 0.2504, |
| "step": 1029 |
| }, |
| { |
| "epoch": 2.447219744275944, |
| "grad_norm": 0.6098092794418335, |
| "learning_rate": 3.912783751493429e-05, |
| "loss": 0.2785, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.4495985727029437, |
| "grad_norm": 0.6550672054290771, |
| "learning_rate": 3.906810035842295e-05, |
| "loss": 0.283, |
| "step": 1031 |
| }, |
| { |
| "epoch": 2.4519774011299433, |
| "grad_norm": 0.5612165927886963, |
| "learning_rate": 3.900836320191159e-05, |
| "loss": 0.2738, |
| "step": 1032 |
| }, |
| { |
| "epoch": 2.454356229556943, |
| "grad_norm": 0.5695220232009888, |
| "learning_rate": 3.894862604540024e-05, |
| "loss": 0.2887, |
| "step": 1033 |
| }, |
| { |
| "epoch": 2.456735057983943, |
| "grad_norm": 0.559490978717804, |
| "learning_rate": 3.888888888888889e-05, |
| "loss": 0.2697, |
| "step": 1034 |
| }, |
| { |
| "epoch": 2.4591138864109428, |
| "grad_norm": 0.5419275760650635, |
| "learning_rate": 3.882915173237754e-05, |
| "loss": 0.243, |
| "step": 1035 |
| }, |
| { |
| "epoch": 2.4614927148379424, |
| "grad_norm": 0.6212007999420166, |
| "learning_rate": 3.876941457586619e-05, |
| "loss": 0.2965, |
| "step": 1036 |
| }, |
| { |
| "epoch": 2.463871543264942, |
| "grad_norm": 0.5144377946853638, |
| "learning_rate": 3.870967741935484e-05, |
| "loss": 0.2576, |
| "step": 1037 |
| }, |
| { |
| "epoch": 2.4662503716919417, |
| "grad_norm": 0.6120803952217102, |
| "learning_rate": 3.864994026284349e-05, |
| "loss": 0.3826, |
| "step": 1038 |
| }, |
| { |
| "epoch": 2.4686292001189414, |
| "grad_norm": 0.6212862133979797, |
| "learning_rate": 3.859020310633214e-05, |
| "loss": 0.3232, |
| "step": 1039 |
| }, |
| { |
| "epoch": 2.471008028545941, |
| "grad_norm": 0.6324489116668701, |
| "learning_rate": 3.8530465949820786e-05, |
| "loss": 0.3372, |
| "step": 1040 |
| }, |
| { |
| "epoch": 2.4733868569729407, |
| "grad_norm": 0.5289970636367798, |
| "learning_rate": 3.847072879330944e-05, |
| "loss": 0.3208, |
| "step": 1041 |
| }, |
| { |
| "epoch": 2.4757656853999404, |
| "grad_norm": 0.5548078417778015, |
| "learning_rate": 3.8410991636798086e-05, |
| "loss": 0.2724, |
| "step": 1042 |
| }, |
| { |
| "epoch": 2.4781445138269405, |
| "grad_norm": 0.5034798383712769, |
| "learning_rate": 3.8351254480286743e-05, |
| "loss": 0.2471, |
| "step": 1043 |
| }, |
| { |
| "epoch": 2.48052334225394, |
| "grad_norm": 0.5730745196342468, |
| "learning_rate": 3.829151732377539e-05, |
| "loss": 0.3225, |
| "step": 1044 |
| }, |
| { |
| "epoch": 2.4829021706809398, |
| "grad_norm": 0.580128014087677, |
| "learning_rate": 3.8231780167264044e-05, |
| "loss": 0.2591, |
| "step": 1045 |
| }, |
| { |
| "epoch": 2.4852809991079394, |
| "grad_norm": 0.5486919283866882, |
| "learning_rate": 3.817204301075269e-05, |
| "loss": 0.2867, |
| "step": 1046 |
| }, |
| { |
| "epoch": 2.487659827534939, |
| "grad_norm": 0.5794557332992554, |
| "learning_rate": 3.8112305854241345e-05, |
| "loss": 0.3313, |
| "step": 1047 |
| }, |
| { |
| "epoch": 2.4900386559619387, |
| "grad_norm": 0.5111564993858337, |
| "learning_rate": 3.805256869772999e-05, |
| "loss": 0.2931, |
| "step": 1048 |
| }, |
| { |
| "epoch": 2.4924174843889384, |
| "grad_norm": 0.5828002095222473, |
| "learning_rate": 3.799283154121864e-05, |
| "loss": 0.2933, |
| "step": 1049 |
| }, |
| { |
| "epoch": 2.494796312815938, |
| "grad_norm": 0.6088399291038513, |
| "learning_rate": 3.793309438470729e-05, |
| "loss": 0.2784, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.4971751412429377, |
| "grad_norm": 0.5388132929801941, |
| "learning_rate": 3.787335722819594e-05, |
| "loss": 0.263, |
| "step": 1051 |
| }, |
| { |
| "epoch": 2.4995539696699374, |
| "grad_norm": 0.6311586499214172, |
| "learning_rate": 3.781362007168459e-05, |
| "loss": 0.2742, |
| "step": 1052 |
| }, |
| { |
| "epoch": 2.501932798096937, |
| "grad_norm": 0.6956512331962585, |
| "learning_rate": 3.775388291517324e-05, |
| "loss": 0.2726, |
| "step": 1053 |
| }, |
| { |
| "epoch": 2.504311626523937, |
| "grad_norm": 0.5791674852371216, |
| "learning_rate": 3.769414575866189e-05, |
| "loss": 0.2594, |
| "step": 1054 |
| }, |
| { |
| "epoch": 2.506690454950937, |
| "grad_norm": 0.6453083157539368, |
| "learning_rate": 3.763440860215054e-05, |
| "loss": 0.299, |
| "step": 1055 |
| }, |
| { |
| "epoch": 2.5090692833779364, |
| "grad_norm": 0.6255761384963989, |
| "learning_rate": 3.7574671445639184e-05, |
| "loss": 0.284, |
| "step": 1056 |
| }, |
| { |
| "epoch": 2.511448111804936, |
| "grad_norm": 0.5033700466156006, |
| "learning_rate": 3.751493428912784e-05, |
| "loss": 0.2419, |
| "step": 1057 |
| }, |
| { |
| "epoch": 2.5138269402319358, |
| "grad_norm": 0.6552553772926331, |
| "learning_rate": 3.7455197132616484e-05, |
| "loss": 0.326, |
| "step": 1058 |
| }, |
| { |
| "epoch": 2.5162057686589354, |
| "grad_norm": 0.5841801762580872, |
| "learning_rate": 3.739545997610514e-05, |
| "loss": 0.2795, |
| "step": 1059 |
| }, |
| { |
| "epoch": 2.518584597085935, |
| "grad_norm": 0.7399122714996338, |
| "learning_rate": 3.7335722819593785e-05, |
| "loss": 0.3375, |
| "step": 1060 |
| }, |
| { |
| "epoch": 2.5209634255129347, |
| "grad_norm": 0.5951675176620483, |
| "learning_rate": 3.727598566308244e-05, |
| "loss": 0.2637, |
| "step": 1061 |
| }, |
| { |
| "epoch": 2.5233422539399344, |
| "grad_norm": 0.5338478088378906, |
| "learning_rate": 3.7216248506571086e-05, |
| "loss": 0.3215, |
| "step": 1062 |
| }, |
| { |
| "epoch": 2.5257210823669345, |
| "grad_norm": 0.6295543909072876, |
| "learning_rate": 3.715651135005974e-05, |
| "loss": 0.337, |
| "step": 1063 |
| }, |
| { |
| "epoch": 2.528099910793934, |
| "grad_norm": 0.5548101663589478, |
| "learning_rate": 3.7096774193548386e-05, |
| "loss": 0.2745, |
| "step": 1064 |
| }, |
| { |
| "epoch": 2.530478739220934, |
| "grad_norm": 0.5178088545799255, |
| "learning_rate": 3.7037037037037037e-05, |
| "loss": 0.23, |
| "step": 1065 |
| }, |
| { |
| "epoch": 2.5328575676479335, |
| "grad_norm": 0.5105859041213989, |
| "learning_rate": 3.697729988052569e-05, |
| "loss": 0.1886, |
| "step": 1066 |
| }, |
| { |
| "epoch": 2.535236396074933, |
| "grad_norm": 0.5834839344024658, |
| "learning_rate": 3.691756272401434e-05, |
| "loss": 0.2866, |
| "step": 1067 |
| }, |
| { |
| "epoch": 2.5376152245019328, |
| "grad_norm": 0.6643683314323425, |
| "learning_rate": 3.685782556750299e-05, |
| "loss": 0.3365, |
| "step": 1068 |
| }, |
| { |
| "epoch": 2.5399940529289324, |
| "grad_norm": 0.5959658622741699, |
| "learning_rate": 3.679808841099164e-05, |
| "loss": 0.3177, |
| "step": 1069 |
| }, |
| { |
| "epoch": 2.542372881355932, |
| "grad_norm": 0.587335467338562, |
| "learning_rate": 3.673835125448029e-05, |
| "loss": 0.3069, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.5447517097829317, |
| "grad_norm": 0.6357367038726807, |
| "learning_rate": 3.667861409796894e-05, |
| "loss": 0.3166, |
| "step": 1071 |
| }, |
| { |
| "epoch": 2.547130538209932, |
| "grad_norm": 0.5437172055244446, |
| "learning_rate": 3.661887694145759e-05, |
| "loss": 0.2837, |
| "step": 1072 |
| }, |
| { |
| "epoch": 2.549509366636931, |
| "grad_norm": 0.5983290076255798, |
| "learning_rate": 3.655913978494624e-05, |
| "loss": 0.2586, |
| "step": 1073 |
| }, |
| { |
| "epoch": 2.551888195063931, |
| "grad_norm": 0.6598303914070129, |
| "learning_rate": 3.649940262843489e-05, |
| "loss": 0.2916, |
| "step": 1074 |
| }, |
| { |
| "epoch": 2.554267023490931, |
| "grad_norm": 0.5528435707092285, |
| "learning_rate": 3.643966547192354e-05, |
| "loss": 0.2843, |
| "step": 1075 |
| }, |
| { |
| "epoch": 2.5566458519179305, |
| "grad_norm": 0.7062620520591736, |
| "learning_rate": 3.637992831541219e-05, |
| "loss": 0.3117, |
| "step": 1076 |
| }, |
| { |
| "epoch": 2.55902468034493, |
| "grad_norm": 0.5995641946792603, |
| "learning_rate": 3.632019115890084e-05, |
| "loss": 0.3298, |
| "step": 1077 |
| }, |
| { |
| "epoch": 2.56140350877193, |
| "grad_norm": 0.6305801272392273, |
| "learning_rate": 3.626045400238949e-05, |
| "loss": 0.3426, |
| "step": 1078 |
| }, |
| { |
| "epoch": 2.5637823371989295, |
| "grad_norm": 0.6942247748374939, |
| "learning_rate": 3.6200716845878134e-05, |
| "loss": 0.2996, |
| "step": 1079 |
| }, |
| { |
| "epoch": 2.566161165625929, |
| "grad_norm": 0.6617063879966736, |
| "learning_rate": 3.614097968936679e-05, |
| "loss": 0.2998, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.568539994052929, |
| "grad_norm": 0.5509942173957825, |
| "learning_rate": 3.6081242532855435e-05, |
| "loss": 0.2671, |
| "step": 1081 |
| }, |
| { |
| "epoch": 2.5709188224799284, |
| "grad_norm": 0.6745384931564331, |
| "learning_rate": 3.602150537634409e-05, |
| "loss": 0.2895, |
| "step": 1082 |
| }, |
| { |
| "epoch": 2.5732976509069285, |
| "grad_norm": 0.6246810555458069, |
| "learning_rate": 3.5961768219832735e-05, |
| "loss": 0.2544, |
| "step": 1083 |
| }, |
| { |
| "epoch": 2.575676479333928, |
| "grad_norm": 0.6448361873626709, |
| "learning_rate": 3.590203106332139e-05, |
| "loss": 0.3294, |
| "step": 1084 |
| }, |
| { |
| "epoch": 2.578055307760928, |
| "grad_norm": 0.6272534728050232, |
| "learning_rate": 3.5842293906810036e-05, |
| "loss": 0.2618, |
| "step": 1085 |
| }, |
| { |
| "epoch": 2.5804341361879275, |
| "grad_norm": 0.6175225377082825, |
| "learning_rate": 3.578255675029869e-05, |
| "loss": 0.3305, |
| "step": 1086 |
| }, |
| { |
| "epoch": 2.582812964614927, |
| "grad_norm": 0.5742083191871643, |
| "learning_rate": 3.5722819593787336e-05, |
| "loss": 0.2782, |
| "step": 1087 |
| }, |
| { |
| "epoch": 2.585191793041927, |
| "grad_norm": 0.6171672344207764, |
| "learning_rate": 3.566308243727599e-05, |
| "loss": 0.2625, |
| "step": 1088 |
| }, |
| { |
| "epoch": 2.5875706214689265, |
| "grad_norm": 0.5434128642082214, |
| "learning_rate": 3.560334528076464e-05, |
| "loss": 0.2562, |
| "step": 1089 |
| }, |
| { |
| "epoch": 2.589949449895926, |
| "grad_norm": 0.6696468591690063, |
| "learning_rate": 3.554360812425329e-05, |
| "loss": 0.3718, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.592328278322926, |
| "grad_norm": 0.574690043926239, |
| "learning_rate": 3.548387096774194e-05, |
| "loss": 0.2481, |
| "step": 1091 |
| }, |
| { |
| "epoch": 2.594707106749926, |
| "grad_norm": 0.606299638748169, |
| "learning_rate": 3.542413381123059e-05, |
| "loss": 0.2644, |
| "step": 1092 |
| }, |
| { |
| "epoch": 2.597085935176925, |
| "grad_norm": 0.6124521493911743, |
| "learning_rate": 3.536439665471924e-05, |
| "loss": 0.2608, |
| "step": 1093 |
| }, |
| { |
| "epoch": 2.599464763603925, |
| "grad_norm": 0.5545802712440491, |
| "learning_rate": 3.530465949820789e-05, |
| "loss": 0.2714, |
| "step": 1094 |
| }, |
| { |
| "epoch": 2.601843592030925, |
| "grad_norm": 0.6126631498336792, |
| "learning_rate": 3.524492234169653e-05, |
| "loss": 0.2823, |
| "step": 1095 |
| }, |
| { |
| "epoch": 2.6042224204579245, |
| "grad_norm": 0.5954961180686951, |
| "learning_rate": 3.518518518518519e-05, |
| "loss": 0.2937, |
| "step": 1096 |
| }, |
| { |
| "epoch": 2.606601248884924, |
| "grad_norm": 0.616807222366333, |
| "learning_rate": 3.512544802867383e-05, |
| "loss": 0.3133, |
| "step": 1097 |
| }, |
| { |
| "epoch": 2.608980077311924, |
| "grad_norm": 0.6112419962882996, |
| "learning_rate": 3.506571087216249e-05, |
| "loss": 0.2772, |
| "step": 1098 |
| }, |
| { |
| "epoch": 2.6113589057389235, |
| "grad_norm": 0.552832841873169, |
| "learning_rate": 3.500597371565113e-05, |
| "loss": 0.2726, |
| "step": 1099 |
| }, |
| { |
| "epoch": 2.613737734165923, |
| "grad_norm": 0.5582488775253296, |
| "learning_rate": 3.494623655913979e-05, |
| "loss": 0.2947, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.613737734165923, |
| "eval_loss": 0.38853010535240173, |
| "eval_runtime": 24.8956, |
| "eval_samples_per_second": 30.045, |
| "eval_steps_per_second": 15.023, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.6161165625929232, |
| "grad_norm": 0.5219835638999939, |
| "learning_rate": 3.4886499402628434e-05, |
| "loss": 0.3032, |
| "step": 1101 |
| }, |
| { |
| "epoch": 2.6184953910199225, |
| "grad_norm": 0.612126350402832, |
| "learning_rate": 3.4826762246117084e-05, |
| "loss": 0.3414, |
| "step": 1102 |
| }, |
| { |
| "epoch": 2.6208742194469226, |
| "grad_norm": 0.5982509255409241, |
| "learning_rate": 3.4767025089605734e-05, |
| "loss": 0.2909, |
| "step": 1103 |
| }, |
| { |
| "epoch": 2.623253047873922, |
| "grad_norm": 0.6476831436157227, |
| "learning_rate": 3.4707287933094385e-05, |
| "loss": 0.3255, |
| "step": 1104 |
| }, |
| { |
| "epoch": 2.625631876300922, |
| "grad_norm": 0.5892149806022644, |
| "learning_rate": 3.4647550776583035e-05, |
| "loss": 0.2788, |
| "step": 1105 |
| }, |
| { |
| "epoch": 2.6280107047279215, |
| "grad_norm": 0.5823227763175964, |
| "learning_rate": 3.4587813620071685e-05, |
| "loss": 0.2953, |
| "step": 1106 |
| }, |
| { |
| "epoch": 2.630389533154921, |
| "grad_norm": 0.5910363793373108, |
| "learning_rate": 3.4528076463560336e-05, |
| "loss": 0.2619, |
| "step": 1107 |
| }, |
| { |
| "epoch": 2.632768361581921, |
| "grad_norm": 0.6204510927200317, |
| "learning_rate": 3.4468339307048986e-05, |
| "loss": 0.2771, |
| "step": 1108 |
| }, |
| { |
| "epoch": 2.6351471900089205, |
| "grad_norm": 0.5862094759941101, |
| "learning_rate": 3.4408602150537636e-05, |
| "loss": 0.2577, |
| "step": 1109 |
| }, |
| { |
| "epoch": 2.63752601843592, |
| "grad_norm": 0.5883625149726868, |
| "learning_rate": 3.4348864994026287e-05, |
| "loss": 0.2714, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.63990484686292, |
| "grad_norm": 0.594275951385498, |
| "learning_rate": 3.428912783751494e-05, |
| "loss": 0.2588, |
| "step": 1111 |
| }, |
| { |
| "epoch": 2.64228367528992, |
| "grad_norm": 0.628243088722229, |
| "learning_rate": 3.422939068100359e-05, |
| "loss": 0.2898, |
| "step": 1112 |
| }, |
| { |
| "epoch": 2.6446625037169196, |
| "grad_norm": 0.597488284111023, |
| "learning_rate": 3.416965352449224e-05, |
| "loss": 0.3002, |
| "step": 1113 |
| }, |
| { |
| "epoch": 2.6470413321439192, |
| "grad_norm": 0.5316476225852966, |
| "learning_rate": 3.410991636798089e-05, |
| "loss": 0.2652, |
| "step": 1114 |
| }, |
| { |
| "epoch": 2.649420160570919, |
| "grad_norm": 0.4937030076980591, |
| "learning_rate": 3.405017921146954e-05, |
| "loss": 0.2695, |
| "step": 1115 |
| }, |
| { |
| "epoch": 2.6517989889979185, |
| "grad_norm": 0.5909802913665771, |
| "learning_rate": 3.399044205495819e-05, |
| "loss": 0.2897, |
| "step": 1116 |
| }, |
| { |
| "epoch": 2.654177817424918, |
| "grad_norm": 0.6558974981307983, |
| "learning_rate": 3.393070489844684e-05, |
| "loss": 0.2713, |
| "step": 1117 |
| }, |
| { |
| "epoch": 2.656556645851918, |
| "grad_norm": 0.5037103295326233, |
| "learning_rate": 3.387096774193548e-05, |
| "loss": 0.2507, |
| "step": 1118 |
| }, |
| { |
| "epoch": 2.6589354742789175, |
| "grad_norm": 0.5422642827033997, |
| "learning_rate": 3.381123058542414e-05, |
| "loss": 0.2866, |
| "step": 1119 |
| }, |
| { |
| "epoch": 2.661314302705917, |
| "grad_norm": 0.6684660315513611, |
| "learning_rate": 3.375149342891278e-05, |
| "loss": 0.2693, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.6636931311329173, |
| "grad_norm": 0.6448227167129517, |
| "learning_rate": 3.369175627240144e-05, |
| "loss": 0.3011, |
| "step": 1121 |
| }, |
| { |
| "epoch": 2.6660719595599165, |
| "grad_norm": 0.5330891609191895, |
| "learning_rate": 3.3632019115890083e-05, |
| "loss": 0.2492, |
| "step": 1122 |
| }, |
| { |
| "epoch": 2.6684507879869166, |
| "grad_norm": 0.6081724166870117, |
| "learning_rate": 3.357228195937874e-05, |
| "loss": 0.2665, |
| "step": 1123 |
| }, |
| { |
| "epoch": 2.6708296164139163, |
| "grad_norm": 0.6162919402122498, |
| "learning_rate": 3.3512544802867384e-05, |
| "loss": 0.3346, |
| "step": 1124 |
| }, |
| { |
| "epoch": 2.673208444840916, |
| "grad_norm": 0.681792676448822, |
| "learning_rate": 3.3452807646356034e-05, |
| "loss": 0.3543, |
| "step": 1125 |
| }, |
| { |
| "epoch": 2.6755872732679156, |
| "grad_norm": 0.675923228263855, |
| "learning_rate": 3.3393070489844685e-05, |
| "loss": 0.2943, |
| "step": 1126 |
| }, |
| { |
| "epoch": 2.6779661016949152, |
| "grad_norm": 0.6438702940940857, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 0.2654, |
| "step": 1127 |
| }, |
| { |
| "epoch": 2.680344930121915, |
| "grad_norm": 0.5402166247367859, |
| "learning_rate": 3.3273596176821985e-05, |
| "loss": 0.2676, |
| "step": 1128 |
| }, |
| { |
| "epoch": 2.6827237585489145, |
| "grad_norm": 0.5861081480979919, |
| "learning_rate": 3.3213859020310636e-05, |
| "loss": 0.2892, |
| "step": 1129 |
| }, |
| { |
| "epoch": 2.685102586975914, |
| "grad_norm": 0.6178301572799683, |
| "learning_rate": 3.3154121863799286e-05, |
| "loss": 0.3191, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.687481415402914, |
| "grad_norm": 0.6531718373298645, |
| "learning_rate": 3.3094384707287936e-05, |
| "loss": 0.3261, |
| "step": 1131 |
| }, |
| { |
| "epoch": 2.689860243829914, |
| "grad_norm": 0.6978683471679688, |
| "learning_rate": 3.3034647550776586e-05, |
| "loss": 0.2942, |
| "step": 1132 |
| }, |
| { |
| "epoch": 2.6922390722569136, |
| "grad_norm": 0.571123480796814, |
| "learning_rate": 3.297491039426524e-05, |
| "loss": 0.2324, |
| "step": 1133 |
| }, |
| { |
| "epoch": 2.6946179006839133, |
| "grad_norm": 0.6293614506721497, |
| "learning_rate": 3.291517323775388e-05, |
| "loss": 0.3065, |
| "step": 1134 |
| }, |
| { |
| "epoch": 2.696996729110913, |
| "grad_norm": 0.6265407800674438, |
| "learning_rate": 3.285543608124254e-05, |
| "loss": 0.2506, |
| "step": 1135 |
| }, |
| { |
| "epoch": 2.6993755575379126, |
| "grad_norm": 0.5328508019447327, |
| "learning_rate": 3.279569892473118e-05, |
| "loss": 0.2653, |
| "step": 1136 |
| }, |
| { |
| "epoch": 2.7017543859649122, |
| "grad_norm": 0.6570659875869751, |
| "learning_rate": 3.273596176821984e-05, |
| "loss": 0.2438, |
| "step": 1137 |
| }, |
| { |
| "epoch": 2.704133214391912, |
| "grad_norm": 0.7575013637542725, |
| "learning_rate": 3.267622461170848e-05, |
| "loss": 0.4259, |
| "step": 1138 |
| }, |
| { |
| "epoch": 2.7065120428189116, |
| "grad_norm": 0.6472057700157166, |
| "learning_rate": 3.261648745519714e-05, |
| "loss": 0.2812, |
| "step": 1139 |
| }, |
| { |
| "epoch": 2.708890871245911, |
| "grad_norm": 0.5538692474365234, |
| "learning_rate": 3.255675029868578e-05, |
| "loss": 0.2403, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.7112696996729113, |
| "grad_norm": 0.5290868282318115, |
| "learning_rate": 3.249701314217443e-05, |
| "loss": 0.248, |
| "step": 1141 |
| }, |
| { |
| "epoch": 2.7136485280999105, |
| "grad_norm": 0.6288734078407288, |
| "learning_rate": 3.243727598566308e-05, |
| "loss": 0.3678, |
| "step": 1142 |
| }, |
| { |
| "epoch": 2.7160273565269106, |
| "grad_norm": 0.6795669794082642, |
| "learning_rate": 3.237753882915173e-05, |
| "loss": 0.2997, |
| "step": 1143 |
| }, |
| { |
| "epoch": 2.7184061849539103, |
| "grad_norm": 0.6042711138725281, |
| "learning_rate": 3.231780167264038e-05, |
| "loss": 0.2916, |
| "step": 1144 |
| }, |
| { |
| "epoch": 2.72078501338091, |
| "grad_norm": 0.6086244583129883, |
| "learning_rate": 3.2258064516129034e-05, |
| "loss": 0.2755, |
| "step": 1145 |
| }, |
| { |
| "epoch": 2.7231638418079096, |
| "grad_norm": 0.5196691751480103, |
| "learning_rate": 3.2198327359617684e-05, |
| "loss": 0.2972, |
| "step": 1146 |
| }, |
| { |
| "epoch": 2.7255426702349093, |
| "grad_norm": 0.5514973998069763, |
| "learning_rate": 3.2138590203106334e-05, |
| "loss": 0.2221, |
| "step": 1147 |
| }, |
| { |
| "epoch": 2.727921498661909, |
| "grad_norm": 0.5679486989974976, |
| "learning_rate": 3.207885304659498e-05, |
| "loss": 0.3141, |
| "step": 1148 |
| }, |
| { |
| "epoch": 2.7303003270889086, |
| "grad_norm": 0.6501855254173279, |
| "learning_rate": 3.2019115890083635e-05, |
| "loss": 0.3131, |
| "step": 1149 |
| }, |
| { |
| "epoch": 2.7326791555159087, |
| "grad_norm": 0.5475935935974121, |
| "learning_rate": 3.195937873357228e-05, |
| "loss": 0.2932, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.735057983942908, |
| "grad_norm": 0.6490268707275391, |
| "learning_rate": 3.1899641577060935e-05, |
| "loss": 0.2759, |
| "step": 1151 |
| }, |
| { |
| "epoch": 2.737436812369908, |
| "grad_norm": 0.6104749441146851, |
| "learning_rate": 3.183990442054958e-05, |
| "loss": 0.3175, |
| "step": 1152 |
| }, |
| { |
| "epoch": 2.7398156407969076, |
| "grad_norm": 0.581358790397644, |
| "learning_rate": 3.1780167264038236e-05, |
| "loss": 0.3054, |
| "step": 1153 |
| }, |
| { |
| "epoch": 2.7421944692239073, |
| "grad_norm": 0.5382530093193054, |
| "learning_rate": 3.172043010752688e-05, |
| "loss": 0.2848, |
| "step": 1154 |
| }, |
| { |
| "epoch": 2.744573297650907, |
| "grad_norm": 0.6220831871032715, |
| "learning_rate": 3.1660692951015537e-05, |
| "loss": 0.3404, |
| "step": 1155 |
| }, |
| { |
| "epoch": 2.7469521260779066, |
| "grad_norm": 0.6008325815200806, |
| "learning_rate": 3.160095579450418e-05, |
| "loss": 0.2543, |
| "step": 1156 |
| }, |
| { |
| "epoch": 2.7493309545049063, |
| "grad_norm": 0.5742363929748535, |
| "learning_rate": 3.154121863799283e-05, |
| "loss": 0.259, |
| "step": 1157 |
| }, |
| { |
| "epoch": 2.751709782931906, |
| "grad_norm": 0.5634133219718933, |
| "learning_rate": 3.148148148148148e-05, |
| "loss": 0.2698, |
| "step": 1158 |
| }, |
| { |
| "epoch": 2.7540886113589056, |
| "grad_norm": 0.5804802775382996, |
| "learning_rate": 3.142174432497013e-05, |
| "loss": 0.2744, |
| "step": 1159 |
| }, |
| { |
| "epoch": 2.7564674397859052, |
| "grad_norm": 0.6175990700721741, |
| "learning_rate": 3.136200716845878e-05, |
| "loss": 0.2503, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.7588462682129054, |
| "grad_norm": 0.6470534801483154, |
| "learning_rate": 3.130227001194743e-05, |
| "loss": 0.3464, |
| "step": 1161 |
| }, |
| { |
| "epoch": 2.761225096639905, |
| "grad_norm": 0.6418605446815491, |
| "learning_rate": 3.124253285543608e-05, |
| "loss": 0.2886, |
| "step": 1162 |
| }, |
| { |
| "epoch": 2.7636039250669047, |
| "grad_norm": 0.8086894750595093, |
| "learning_rate": 3.118279569892473e-05, |
| "loss": 0.2267, |
| "step": 1163 |
| }, |
| { |
| "epoch": 2.7659827534939043, |
| "grad_norm": 0.616875410079956, |
| "learning_rate": 3.112305854241338e-05, |
| "loss": 0.3042, |
| "step": 1164 |
| }, |
| { |
| "epoch": 2.768361581920904, |
| "grad_norm": 0.5028004050254822, |
| "learning_rate": 3.106332138590203e-05, |
| "loss": 0.2081, |
| "step": 1165 |
| }, |
| { |
| "epoch": 2.7707404103479036, |
| "grad_norm": 0.5773189067840576, |
| "learning_rate": 3.100358422939068e-05, |
| "loss": 0.3195, |
| "step": 1166 |
| }, |
| { |
| "epoch": 2.7731192387749033, |
| "grad_norm": 0.7181592583656311, |
| "learning_rate": 3.0943847072879333e-05, |
| "loss": 0.3569, |
| "step": 1167 |
| }, |
| { |
| "epoch": 2.775498067201903, |
| "grad_norm": 0.5970394611358643, |
| "learning_rate": 3.0884109916367984e-05, |
| "loss": 0.264, |
| "step": 1168 |
| }, |
| { |
| "epoch": 2.7778768956289026, |
| "grad_norm": 1.4490431547164917, |
| "learning_rate": 3.0824372759856634e-05, |
| "loss": 0.276, |
| "step": 1169 |
| }, |
| { |
| "epoch": 2.7802557240559027, |
| "grad_norm": 0.6196287870407104, |
| "learning_rate": 3.0764635603345284e-05, |
| "loss": 0.296, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.782634552482902, |
| "grad_norm": 0.6437617540359497, |
| "learning_rate": 3.070489844683393e-05, |
| "loss": 0.268, |
| "step": 1171 |
| }, |
| { |
| "epoch": 2.785013380909902, |
| "grad_norm": 0.6326783895492554, |
| "learning_rate": 3.0645161290322585e-05, |
| "loss": 0.329, |
| "step": 1172 |
| }, |
| { |
| "epoch": 2.7873922093369017, |
| "grad_norm": 0.5962085127830505, |
| "learning_rate": 3.058542413381123e-05, |
| "loss": 0.2598, |
| "step": 1173 |
| }, |
| { |
| "epoch": 2.7897710377639013, |
| "grad_norm": 0.7013174891471863, |
| "learning_rate": 3.0525686977299886e-05, |
| "loss": 0.308, |
| "step": 1174 |
| }, |
| { |
| "epoch": 2.792149866190901, |
| "grad_norm": 0.6110320687294006, |
| "learning_rate": 3.046594982078853e-05, |
| "loss": 0.2844, |
| "step": 1175 |
| }, |
| { |
| "epoch": 2.7945286946179007, |
| "grad_norm": 0.5971323251724243, |
| "learning_rate": 3.0406212664277183e-05, |
| "loss": 0.2795, |
| "step": 1176 |
| }, |
| { |
| "epoch": 2.7969075230449003, |
| "grad_norm": 0.6299296021461487, |
| "learning_rate": 3.034647550776583e-05, |
| "loss": 0.2621, |
| "step": 1177 |
| }, |
| { |
| "epoch": 2.7992863514719, |
| "grad_norm": 0.6195304989814758, |
| "learning_rate": 3.0286738351254483e-05, |
| "loss": 0.3349, |
| "step": 1178 |
| }, |
| { |
| "epoch": 2.8016651798988996, |
| "grad_norm": 0.49884721636772156, |
| "learning_rate": 3.022700119474313e-05, |
| "loss": 0.2469, |
| "step": 1179 |
| }, |
| { |
| "epoch": 2.8040440083258993, |
| "grad_norm": 0.5852887034416199, |
| "learning_rate": 3.016726403823178e-05, |
| "loss": 0.2885, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.8064228367528994, |
| "grad_norm": 0.6772944331169128, |
| "learning_rate": 3.010752688172043e-05, |
| "loss": 0.283, |
| "step": 1181 |
| }, |
| { |
| "epoch": 2.808801665179899, |
| "grad_norm": 0.5862092971801758, |
| "learning_rate": 3.004778972520908e-05, |
| "loss": 0.3337, |
| "step": 1182 |
| }, |
| { |
| "epoch": 2.8111804936068987, |
| "grad_norm": 0.5878643989562988, |
| "learning_rate": 2.998805256869773e-05, |
| "loss": 0.2948, |
| "step": 1183 |
| }, |
| { |
| "epoch": 2.8135593220338984, |
| "grad_norm": 0.5235293507575989, |
| "learning_rate": 2.9928315412186382e-05, |
| "loss": 0.2513, |
| "step": 1184 |
| }, |
| { |
| "epoch": 2.815938150460898, |
| "grad_norm": 0.5556120276451111, |
| "learning_rate": 2.9868578255675032e-05, |
| "loss": 0.2719, |
| "step": 1185 |
| }, |
| { |
| "epoch": 2.8183169788878977, |
| "grad_norm": 0.6422498822212219, |
| "learning_rate": 2.9808841099163682e-05, |
| "loss": 0.3302, |
| "step": 1186 |
| }, |
| { |
| "epoch": 2.8206958073148973, |
| "grad_norm": 0.6159522533416748, |
| "learning_rate": 2.974910394265233e-05, |
| "loss": 0.3088, |
| "step": 1187 |
| }, |
| { |
| "epoch": 2.823074635741897, |
| "grad_norm": 0.5432312488555908, |
| "learning_rate": 2.9689366786140983e-05, |
| "loss": 0.2808, |
| "step": 1188 |
| }, |
| { |
| "epoch": 2.8254534641688966, |
| "grad_norm": 0.6293365359306335, |
| "learning_rate": 2.962962962962963e-05, |
| "loss": 0.2902, |
| "step": 1189 |
| }, |
| { |
| "epoch": 2.8278322925958967, |
| "grad_norm": 0.6163249015808105, |
| "learning_rate": 2.9569892473118284e-05, |
| "loss": 0.2708, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.830211121022896, |
| "grad_norm": 0.5814666748046875, |
| "learning_rate": 2.951015531660693e-05, |
| "loss": 0.2855, |
| "step": 1191 |
| }, |
| { |
| "epoch": 2.832589949449896, |
| "grad_norm": 0.5678106546401978, |
| "learning_rate": 2.9450418160095584e-05, |
| "loss": 0.2909, |
| "step": 1192 |
| }, |
| { |
| "epoch": 2.8349687778768957, |
| "grad_norm": 0.5888578295707703, |
| "learning_rate": 2.939068100358423e-05, |
| "loss": 0.2653, |
| "step": 1193 |
| }, |
| { |
| "epoch": 2.8373476063038954, |
| "grad_norm": 0.5649316310882568, |
| "learning_rate": 2.9330943847072878e-05, |
| "loss": 0.3173, |
| "step": 1194 |
| }, |
| { |
| "epoch": 2.839726434730895, |
| "grad_norm": 0.6372131109237671, |
| "learning_rate": 2.9271206690561532e-05, |
| "loss": 0.3098, |
| "step": 1195 |
| }, |
| { |
| "epoch": 2.8421052631578947, |
| "grad_norm": 0.6272296905517578, |
| "learning_rate": 2.921146953405018e-05, |
| "loss": 0.3157, |
| "step": 1196 |
| }, |
| { |
| "epoch": 2.8444840915848943, |
| "grad_norm": 0.6271634101867676, |
| "learning_rate": 2.9151732377538832e-05, |
| "loss": 0.3352, |
| "step": 1197 |
| }, |
| { |
| "epoch": 2.846862920011894, |
| "grad_norm": 0.5737547874450684, |
| "learning_rate": 2.909199522102748e-05, |
| "loss": 0.2748, |
| "step": 1198 |
| }, |
| { |
| "epoch": 2.849241748438894, |
| "grad_norm": 0.6345760226249695, |
| "learning_rate": 2.9032258064516133e-05, |
| "loss": 0.2769, |
| "step": 1199 |
| }, |
| { |
| "epoch": 2.8516205768658933, |
| "grad_norm": 0.5594867467880249, |
| "learning_rate": 2.897252090800478e-05, |
| "loss": 0.2573, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.8516205768658933, |
| "eval_loss": 0.3864609897136688, |
| "eval_runtime": 24.8837, |
| "eval_samples_per_second": 30.06, |
| "eval_steps_per_second": 15.03, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.8539994052928934, |
| "grad_norm": 0.5716099143028259, |
| "learning_rate": 2.8912783751493434e-05, |
| "loss": 0.2803, |
| "step": 1201 |
| }, |
| { |
| "epoch": 2.856378233719893, |
| "grad_norm": 0.7210864424705505, |
| "learning_rate": 2.885304659498208e-05, |
| "loss": 0.2867, |
| "step": 1202 |
| }, |
| { |
| "epoch": 2.8587570621468927, |
| "grad_norm": 0.7296307682991028, |
| "learning_rate": 2.8793309438470727e-05, |
| "loss": 0.3317, |
| "step": 1203 |
| }, |
| { |
| "epoch": 2.8611358905738924, |
| "grad_norm": 0.7070258259773254, |
| "learning_rate": 2.873357228195938e-05, |
| "loss": 0.3031, |
| "step": 1204 |
| }, |
| { |
| "epoch": 2.863514719000892, |
| "grad_norm": 0.6060933470726013, |
| "learning_rate": 2.8673835125448028e-05, |
| "loss": 0.2816, |
| "step": 1205 |
| }, |
| { |
| "epoch": 2.8658935474278917, |
| "grad_norm": 0.6421394944190979, |
| "learning_rate": 2.861409796893668e-05, |
| "loss": 0.2717, |
| "step": 1206 |
| }, |
| { |
| "epoch": 2.8682723758548914, |
| "grad_norm": 0.6352380514144897, |
| "learning_rate": 2.855436081242533e-05, |
| "loss": 0.3115, |
| "step": 1207 |
| }, |
| { |
| "epoch": 2.870651204281891, |
| "grad_norm": 0.6790797114372253, |
| "learning_rate": 2.8494623655913982e-05, |
| "loss": 0.3528, |
| "step": 1208 |
| }, |
| { |
| "epoch": 2.8730300327088907, |
| "grad_norm": 0.5601657629013062, |
| "learning_rate": 2.843488649940263e-05, |
| "loss": 0.2254, |
| "step": 1209 |
| }, |
| { |
| "epoch": 2.875408861135891, |
| "grad_norm": 0.5759854912757874, |
| "learning_rate": 2.8375149342891276e-05, |
| "loss": 0.284, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.8777876895628904, |
| "grad_norm": 0.6258363127708435, |
| "learning_rate": 2.831541218637993e-05, |
| "loss": 0.28, |
| "step": 1211 |
| }, |
| { |
| "epoch": 2.88016651798989, |
| "grad_norm": 0.7181396484375, |
| "learning_rate": 2.8255675029868577e-05, |
| "loss": 0.3192, |
| "step": 1212 |
| }, |
| { |
| "epoch": 2.8825453464168898, |
| "grad_norm": 0.6534887552261353, |
| "learning_rate": 2.819593787335723e-05, |
| "loss": 0.3195, |
| "step": 1213 |
| }, |
| { |
| "epoch": 2.8849241748438894, |
| "grad_norm": 0.6765838265419006, |
| "learning_rate": 2.8136200716845877e-05, |
| "loss": 0.3031, |
| "step": 1214 |
| }, |
| { |
| "epoch": 2.887303003270889, |
| "grad_norm": 0.6807898879051208, |
| "learning_rate": 2.807646356033453e-05, |
| "loss": 0.315, |
| "step": 1215 |
| }, |
| { |
| "epoch": 2.8896818316978887, |
| "grad_norm": 0.6026751399040222, |
| "learning_rate": 2.8016726403823178e-05, |
| "loss": 0.2811, |
| "step": 1216 |
| }, |
| { |
| "epoch": 2.8920606601248884, |
| "grad_norm": 0.5754644870758057, |
| "learning_rate": 2.7956989247311828e-05, |
| "loss": 0.2772, |
| "step": 1217 |
| }, |
| { |
| "epoch": 2.894439488551888, |
| "grad_norm": 0.5280768871307373, |
| "learning_rate": 2.789725209080048e-05, |
| "loss": 0.2345, |
| "step": 1218 |
| }, |
| { |
| "epoch": 2.896818316978888, |
| "grad_norm": 0.6319217681884766, |
| "learning_rate": 2.783751493428913e-05, |
| "loss": 0.2649, |
| "step": 1219 |
| }, |
| { |
| "epoch": 2.8991971454058874, |
| "grad_norm": 0.5606786608695984, |
| "learning_rate": 2.777777777777778e-05, |
| "loss": 0.2557, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.9015759738328875, |
| "grad_norm": 0.6417257785797119, |
| "learning_rate": 2.771804062126643e-05, |
| "loss": 0.3148, |
| "step": 1221 |
| }, |
| { |
| "epoch": 2.903954802259887, |
| "grad_norm": 0.5431168079376221, |
| "learning_rate": 2.765830346475508e-05, |
| "loss": 0.2775, |
| "step": 1222 |
| }, |
| { |
| "epoch": 2.9063336306868868, |
| "grad_norm": 0.6295545697212219, |
| "learning_rate": 2.759856630824373e-05, |
| "loss": 0.3051, |
| "step": 1223 |
| }, |
| { |
| "epoch": 2.9087124591138864, |
| "grad_norm": 0.6698517203330994, |
| "learning_rate": 2.753882915173238e-05, |
| "loss": 0.3033, |
| "step": 1224 |
| }, |
| { |
| "epoch": 2.911091287540886, |
| "grad_norm": 0.5729504227638245, |
| "learning_rate": 2.747909199522103e-05, |
| "loss": 0.26, |
| "step": 1225 |
| }, |
| { |
| "epoch": 2.9134701159678857, |
| "grad_norm": 0.5817504525184631, |
| "learning_rate": 2.7419354838709678e-05, |
| "loss": 0.2457, |
| "step": 1226 |
| }, |
| { |
| "epoch": 2.9158489443948854, |
| "grad_norm": 0.6074779629707336, |
| "learning_rate": 2.735961768219833e-05, |
| "loss": 0.2623, |
| "step": 1227 |
| }, |
| { |
| "epoch": 2.918227772821885, |
| "grad_norm": 0.7250639796257019, |
| "learning_rate": 2.7299880525686978e-05, |
| "loss": 0.3272, |
| "step": 1228 |
| }, |
| { |
| "epoch": 2.9206066012488847, |
| "grad_norm": 0.6558791399002075, |
| "learning_rate": 2.7240143369175632e-05, |
| "loss": 0.2699, |
| "step": 1229 |
| }, |
| { |
| "epoch": 2.922985429675885, |
| "grad_norm": 0.5835295915603638, |
| "learning_rate": 2.718040621266428e-05, |
| "loss": 0.3002, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.9253642581028845, |
| "grad_norm": 0.6902837157249451, |
| "learning_rate": 2.7120669056152932e-05, |
| "loss": 0.3135, |
| "step": 1231 |
| }, |
| { |
| "epoch": 2.927743086529884, |
| "grad_norm": 0.5926578640937805, |
| "learning_rate": 2.706093189964158e-05, |
| "loss": 0.2616, |
| "step": 1232 |
| }, |
| { |
| "epoch": 2.930121914956884, |
| "grad_norm": 0.5405444502830505, |
| "learning_rate": 2.7001194743130226e-05, |
| "loss": 0.2587, |
| "step": 1233 |
| }, |
| { |
| "epoch": 2.9325007433838834, |
| "grad_norm": 0.606576144695282, |
| "learning_rate": 2.694145758661888e-05, |
| "loss": 0.2717, |
| "step": 1234 |
| }, |
| { |
| "epoch": 2.934879571810883, |
| "grad_norm": 0.6612190008163452, |
| "learning_rate": 2.6881720430107527e-05, |
| "loss": 0.3005, |
| "step": 1235 |
| }, |
| { |
| "epoch": 2.9372584002378828, |
| "grad_norm": 0.6425749063491821, |
| "learning_rate": 2.682198327359618e-05, |
| "loss": 0.2606, |
| "step": 1236 |
| }, |
| { |
| "epoch": 2.9396372286648824, |
| "grad_norm": 0.6714048385620117, |
| "learning_rate": 2.6762246117084827e-05, |
| "loss": 0.3116, |
| "step": 1237 |
| }, |
| { |
| "epoch": 2.942016057091882, |
| "grad_norm": 0.6480368971824646, |
| "learning_rate": 2.670250896057348e-05, |
| "loss": 0.3088, |
| "step": 1238 |
| }, |
| { |
| "epoch": 2.944394885518882, |
| "grad_norm": 0.6665281653404236, |
| "learning_rate": 2.6642771804062128e-05, |
| "loss": 0.3281, |
| "step": 1239 |
| }, |
| { |
| "epoch": 2.9467737139458814, |
| "grad_norm": 0.5490178465843201, |
| "learning_rate": 2.6583034647550775e-05, |
| "loss": 0.2708, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.9491525423728815, |
| "grad_norm": 0.6380129456520081, |
| "learning_rate": 2.652329749103943e-05, |
| "loss": 0.2738, |
| "step": 1241 |
| }, |
| { |
| "epoch": 2.951531370799881, |
| "grad_norm": 0.6085153818130493, |
| "learning_rate": 2.6463560334528076e-05, |
| "loss": 0.2426, |
| "step": 1242 |
| }, |
| { |
| "epoch": 2.953910199226881, |
| "grad_norm": 0.6035470366477966, |
| "learning_rate": 2.640382317801673e-05, |
| "loss": 0.3003, |
| "step": 1243 |
| }, |
| { |
| "epoch": 2.9562890276538805, |
| "grad_norm": 0.6204206943511963, |
| "learning_rate": 2.6344086021505376e-05, |
| "loss": 0.2981, |
| "step": 1244 |
| }, |
| { |
| "epoch": 2.95866785608088, |
| "grad_norm": 0.629393458366394, |
| "learning_rate": 2.628434886499403e-05, |
| "loss": 0.3259, |
| "step": 1245 |
| }, |
| { |
| "epoch": 2.96104668450788, |
| "grad_norm": 0.6644812226295471, |
| "learning_rate": 2.6224611708482677e-05, |
| "loss": 0.2825, |
| "step": 1246 |
| }, |
| { |
| "epoch": 2.9634255129348794, |
| "grad_norm": 0.6230280995368958, |
| "learning_rate": 2.616487455197133e-05, |
| "loss": 0.2443, |
| "step": 1247 |
| }, |
| { |
| "epoch": 2.9658043413618795, |
| "grad_norm": 0.6109925508499146, |
| "learning_rate": 2.6105137395459977e-05, |
| "loss": 0.3038, |
| "step": 1248 |
| }, |
| { |
| "epoch": 2.9681831697888788, |
| "grad_norm": 0.5434057712554932, |
| "learning_rate": 2.6045400238948624e-05, |
| "loss": 0.2835, |
| "step": 1249 |
| }, |
| { |
| "epoch": 2.970561998215879, |
| "grad_norm": 0.6708266735076904, |
| "learning_rate": 2.5985663082437278e-05, |
| "loss": 0.3569, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.9729408266428785, |
| "grad_norm": 0.614422619342804, |
| "learning_rate": 2.5925925925925925e-05, |
| "loss": 0.3171, |
| "step": 1251 |
| }, |
| { |
| "epoch": 2.975319655069878, |
| "grad_norm": 0.6892760992050171, |
| "learning_rate": 2.586618876941458e-05, |
| "loss": 0.2787, |
| "step": 1252 |
| }, |
| { |
| "epoch": 2.977698483496878, |
| "grad_norm": 0.6347784399986267, |
| "learning_rate": 2.5806451612903226e-05, |
| "loss": 0.2649, |
| "step": 1253 |
| }, |
| { |
| "epoch": 2.9800773119238775, |
| "grad_norm": 0.5815712809562683, |
| "learning_rate": 2.574671445639188e-05, |
| "loss": 0.332, |
| "step": 1254 |
| }, |
| { |
| "epoch": 2.982456140350877, |
| "grad_norm": 0.57394939661026, |
| "learning_rate": 2.5686977299880526e-05, |
| "loss": 0.2549, |
| "step": 1255 |
| }, |
| { |
| "epoch": 2.984834968777877, |
| "grad_norm": 0.5753270387649536, |
| "learning_rate": 2.5627240143369173e-05, |
| "loss": 0.291, |
| "step": 1256 |
| }, |
| { |
| "epoch": 2.9872137972048765, |
| "grad_norm": 0.7134138941764832, |
| "learning_rate": 2.5567502986857827e-05, |
| "loss": 0.3167, |
| "step": 1257 |
| }, |
| { |
| "epoch": 2.989592625631876, |
| "grad_norm": 0.5441657900810242, |
| "learning_rate": 2.5507765830346474e-05, |
| "loss": 0.2621, |
| "step": 1258 |
| }, |
| { |
| "epoch": 2.991971454058876, |
| "grad_norm": 0.5922890305519104, |
| "learning_rate": 2.5448028673835127e-05, |
| "loss": 0.2679, |
| "step": 1259 |
| }, |
| { |
| "epoch": 2.994350282485876, |
| "grad_norm": 0.559668779373169, |
| "learning_rate": 2.5388291517323774e-05, |
| "loss": 0.3293, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.9967291109128755, |
| "grad_norm": 0.5969208478927612, |
| "learning_rate": 2.5328554360812428e-05, |
| "loss": 0.2749, |
| "step": 1261 |
| }, |
| { |
| "epoch": 2.999107939339875, |
| "grad_norm": 0.609047532081604, |
| "learning_rate": 2.5268817204301075e-05, |
| "loss": 0.3221, |
| "step": 1262 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 1.1574978828430176, |
| "learning_rate": 2.5209080047789725e-05, |
| "loss": 0.2991, |
| "step": 1263 |
| }, |
| { |
| "epoch": 3.0023788284269997, |
| "grad_norm": 0.5617911219596863, |
| "learning_rate": 2.5149342891278375e-05, |
| "loss": 0.2266, |
| "step": 1264 |
| }, |
| { |
| "epoch": 3.0047576568539993, |
| "grad_norm": 0.519255518913269, |
| "learning_rate": 2.5089605734767026e-05, |
| "loss": 0.2256, |
| "step": 1265 |
| }, |
| { |
| "epoch": 3.007136485280999, |
| "grad_norm": 0.5122116208076477, |
| "learning_rate": 2.5029868578255676e-05, |
| "loss": 0.2438, |
| "step": 1266 |
| }, |
| { |
| "epoch": 3.0095153137079986, |
| "grad_norm": 0.5582380890846252, |
| "learning_rate": 2.4970131421744326e-05, |
| "loss": 0.284, |
| "step": 1267 |
| }, |
| { |
| "epoch": 3.0118941421349987, |
| "grad_norm": 0.5494282841682434, |
| "learning_rate": 2.4910394265232977e-05, |
| "loss": 0.2118, |
| "step": 1268 |
| }, |
| { |
| "epoch": 3.0142729705619984, |
| "grad_norm": 0.585095226764679, |
| "learning_rate": 2.4850657108721627e-05, |
| "loss": 0.2572, |
| "step": 1269 |
| }, |
| { |
| "epoch": 3.016651798988998, |
| "grad_norm": 0.5811973810195923, |
| "learning_rate": 2.4790919952210277e-05, |
| "loss": 0.2532, |
| "step": 1270 |
| }, |
| { |
| "epoch": 3.0190306274159977, |
| "grad_norm": 0.5602511763572693, |
| "learning_rate": 2.4731182795698928e-05, |
| "loss": 0.2175, |
| "step": 1271 |
| }, |
| { |
| "epoch": 3.0214094558429974, |
| "grad_norm": 0.5386038422584534, |
| "learning_rate": 2.4671445639187578e-05, |
| "loss": 0.1973, |
| "step": 1272 |
| }, |
| { |
| "epoch": 3.023788284269997, |
| "grad_norm": 0.5802236795425415, |
| "learning_rate": 2.4611708482676228e-05, |
| "loss": 0.2655, |
| "step": 1273 |
| }, |
| { |
| "epoch": 3.0261671126969967, |
| "grad_norm": 0.652818500995636, |
| "learning_rate": 2.455197132616488e-05, |
| "loss": 0.2907, |
| "step": 1274 |
| }, |
| { |
| "epoch": 3.0285459411239963, |
| "grad_norm": 0.6712412238121033, |
| "learning_rate": 2.4492234169653525e-05, |
| "loss": 0.2186, |
| "step": 1275 |
| }, |
| { |
| "epoch": 3.030924769550996, |
| "grad_norm": 0.6123949885368347, |
| "learning_rate": 2.4432497013142176e-05, |
| "loss": 0.2088, |
| "step": 1276 |
| }, |
| { |
| "epoch": 3.0333035979779956, |
| "grad_norm": 0.6561902761459351, |
| "learning_rate": 2.4372759856630826e-05, |
| "loss": 0.279, |
| "step": 1277 |
| }, |
| { |
| "epoch": 3.0356824264049957, |
| "grad_norm": 0.6276388168334961, |
| "learning_rate": 2.4313022700119476e-05, |
| "loss": 0.271, |
| "step": 1278 |
| }, |
| { |
| "epoch": 3.0380612548319954, |
| "grad_norm": 0.6087429523468018, |
| "learning_rate": 2.4253285543608127e-05, |
| "loss": 0.2306, |
| "step": 1279 |
| }, |
| { |
| "epoch": 3.040440083258995, |
| "grad_norm": 0.5728775858879089, |
| "learning_rate": 2.4193548387096777e-05, |
| "loss": 0.2348, |
| "step": 1280 |
| }, |
| { |
| "epoch": 3.0428189116859947, |
| "grad_norm": 0.5878280401229858, |
| "learning_rate": 2.4133811230585427e-05, |
| "loss": 0.2411, |
| "step": 1281 |
| }, |
| { |
| "epoch": 3.0451977401129944, |
| "grad_norm": 0.5655471086502075, |
| "learning_rate": 2.4074074074074074e-05, |
| "loss": 0.2077, |
| "step": 1282 |
| }, |
| { |
| "epoch": 3.047576568539994, |
| "grad_norm": 0.5882824659347534, |
| "learning_rate": 2.4014336917562724e-05, |
| "loss": 0.227, |
| "step": 1283 |
| }, |
| { |
| "epoch": 3.0499553969669937, |
| "grad_norm": 0.6607369184494019, |
| "learning_rate": 2.3954599761051375e-05, |
| "loss": 0.2578, |
| "step": 1284 |
| }, |
| { |
| "epoch": 3.0523342253939933, |
| "grad_norm": 0.5448257327079773, |
| "learning_rate": 2.3894862604540025e-05, |
| "loss": 0.2349, |
| "step": 1285 |
| }, |
| { |
| "epoch": 3.054713053820993, |
| "grad_norm": 0.6125568747520447, |
| "learning_rate": 2.3835125448028675e-05, |
| "loss": 0.2324, |
| "step": 1286 |
| }, |
| { |
| "epoch": 3.0570918822479927, |
| "grad_norm": 0.6578051447868347, |
| "learning_rate": 2.3775388291517326e-05, |
| "loss": 0.2301, |
| "step": 1287 |
| }, |
| { |
| "epoch": 3.0594707106749928, |
| "grad_norm": 0.6022703051567078, |
| "learning_rate": 2.3715651135005976e-05, |
| "loss": 0.2335, |
| "step": 1288 |
| }, |
| { |
| "epoch": 3.0618495391019924, |
| "grad_norm": 0.6820448637008667, |
| "learning_rate": 2.3655913978494626e-05, |
| "loss": 0.2426, |
| "step": 1289 |
| }, |
| { |
| "epoch": 3.064228367528992, |
| "grad_norm": 0.6197579503059387, |
| "learning_rate": 2.3596176821983273e-05, |
| "loss": 0.2528, |
| "step": 1290 |
| }, |
| { |
| "epoch": 3.0666071959559917, |
| "grad_norm": 0.6325532793998718, |
| "learning_rate": 2.3536439665471923e-05, |
| "loss": 0.2398, |
| "step": 1291 |
| }, |
| { |
| "epoch": 3.0689860243829914, |
| "grad_norm": 0.6155019402503967, |
| "learning_rate": 2.3476702508960574e-05, |
| "loss": 0.2767, |
| "step": 1292 |
| }, |
| { |
| "epoch": 3.071364852809991, |
| "grad_norm": 0.6121396422386169, |
| "learning_rate": 2.3416965352449224e-05, |
| "loss": 0.2391, |
| "step": 1293 |
| }, |
| { |
| "epoch": 3.0737436812369907, |
| "grad_norm": 0.553560733795166, |
| "learning_rate": 2.3357228195937874e-05, |
| "loss": 0.2009, |
| "step": 1294 |
| }, |
| { |
| "epoch": 3.0761225096639904, |
| "grad_norm": 0.5787554383277893, |
| "learning_rate": 2.3297491039426525e-05, |
| "loss": 0.2359, |
| "step": 1295 |
| }, |
| { |
| "epoch": 3.07850133809099, |
| "grad_norm": 0.6845440864562988, |
| "learning_rate": 2.3237753882915175e-05, |
| "loss": 0.2573, |
| "step": 1296 |
| }, |
| { |
| "epoch": 3.0808801665179897, |
| "grad_norm": 0.6331266760826111, |
| "learning_rate": 2.3178016726403825e-05, |
| "loss": 0.2295, |
| "step": 1297 |
| }, |
| { |
| "epoch": 3.08325899494499, |
| "grad_norm": 0.640466034412384, |
| "learning_rate": 2.3118279569892472e-05, |
| "loss": 0.2456, |
| "step": 1298 |
| }, |
| { |
| "epoch": 3.0856378233719894, |
| "grad_norm": 0.7017742991447449, |
| "learning_rate": 2.3058542413381122e-05, |
| "loss": 0.2793, |
| "step": 1299 |
| }, |
| { |
| "epoch": 3.088016651798989, |
| "grad_norm": 0.7358404397964478, |
| "learning_rate": 2.2998805256869773e-05, |
| "loss": 0.277, |
| "step": 1300 |
| }, |
| { |
| "epoch": 3.088016651798989, |
| "eval_loss": 0.4049249291419983, |
| "eval_runtime": 24.7093, |
| "eval_samples_per_second": 30.272, |
| "eval_steps_per_second": 15.136, |
| "step": 1300 |
| }, |
| { |
| "epoch": 3.0903954802259888, |
| "grad_norm": 0.64457106590271, |
| "learning_rate": 2.2939068100358423e-05, |
| "loss": 0.2707, |
| "step": 1301 |
| }, |
| { |
| "epoch": 3.0927743086529884, |
| "grad_norm": 0.672550618648529, |
| "learning_rate": 2.2879330943847073e-05, |
| "loss": 0.248, |
| "step": 1302 |
| }, |
| { |
| "epoch": 3.095153137079988, |
| "grad_norm": 0.6365009546279907, |
| "learning_rate": 2.2819593787335724e-05, |
| "loss": 0.2387, |
| "step": 1303 |
| }, |
| { |
| "epoch": 3.0975319655069877, |
| "grad_norm": 0.6442080736160278, |
| "learning_rate": 2.2759856630824374e-05, |
| "loss": 0.2216, |
| "step": 1304 |
| }, |
| { |
| "epoch": 3.0999107939339874, |
| "grad_norm": 0.6259413361549377, |
| "learning_rate": 2.270011947431302e-05, |
| "loss": 0.2541, |
| "step": 1305 |
| }, |
| { |
| "epoch": 3.102289622360987, |
| "grad_norm": 0.6365742683410645, |
| "learning_rate": 2.264038231780167e-05, |
| "loss": 0.2415, |
| "step": 1306 |
| }, |
| { |
| "epoch": 3.1046684507879867, |
| "grad_norm": 0.5824887156486511, |
| "learning_rate": 2.258064516129032e-05, |
| "loss": 0.2529, |
| "step": 1307 |
| }, |
| { |
| "epoch": 3.107047279214987, |
| "grad_norm": 0.6794803142547607, |
| "learning_rate": 2.2520908004778972e-05, |
| "loss": 0.23, |
| "step": 1308 |
| }, |
| { |
| "epoch": 3.1094261076419865, |
| "grad_norm": 0.6019396185874939, |
| "learning_rate": 2.2461170848267622e-05, |
| "loss": 0.2162, |
| "step": 1309 |
| }, |
| { |
| "epoch": 3.111804936068986, |
| "grad_norm": 0.5950125455856323, |
| "learning_rate": 2.2401433691756272e-05, |
| "loss": 0.29, |
| "step": 1310 |
| }, |
| { |
| "epoch": 3.1141837644959858, |
| "grad_norm": 0.6780076026916504, |
| "learning_rate": 2.2341696535244923e-05, |
| "loss": 0.2402, |
| "step": 1311 |
| }, |
| { |
| "epoch": 3.1165625929229854, |
| "grad_norm": 0.6014128923416138, |
| "learning_rate": 2.2281959378733573e-05, |
| "loss": 0.2104, |
| "step": 1312 |
| }, |
| { |
| "epoch": 3.118941421349985, |
| "grad_norm": 0.6915101408958435, |
| "learning_rate": 2.2222222222222223e-05, |
| "loss": 0.3026, |
| "step": 1313 |
| }, |
| { |
| "epoch": 3.1213202497769847, |
| "grad_norm": 0.6975738406181335, |
| "learning_rate": 2.2162485065710874e-05, |
| "loss": 0.2436, |
| "step": 1314 |
| }, |
| { |
| "epoch": 3.1236990782039844, |
| "grad_norm": 0.7148664593696594, |
| "learning_rate": 2.2102747909199524e-05, |
| "loss": 0.2506, |
| "step": 1315 |
| }, |
| { |
| "epoch": 3.126077906630984, |
| "grad_norm": 0.6675700545310974, |
| "learning_rate": 2.2043010752688174e-05, |
| "loss": 0.2308, |
| "step": 1316 |
| }, |
| { |
| "epoch": 3.128456735057984, |
| "grad_norm": 0.6789043545722961, |
| "learning_rate": 2.1983273596176824e-05, |
| "loss": 0.2353, |
| "step": 1317 |
| }, |
| { |
| "epoch": 3.130835563484984, |
| "grad_norm": 0.6762030124664307, |
| "learning_rate": 2.1923536439665475e-05, |
| "loss": 0.2479, |
| "step": 1318 |
| }, |
| { |
| "epoch": 3.1332143919119835, |
| "grad_norm": 0.6441430449485779, |
| "learning_rate": 2.1863799283154125e-05, |
| "loss": 0.1835, |
| "step": 1319 |
| }, |
| { |
| "epoch": 3.135593220338983, |
| "grad_norm": 0.6008920073509216, |
| "learning_rate": 2.1804062126642775e-05, |
| "loss": 0.2179, |
| "step": 1320 |
| }, |
| { |
| "epoch": 3.137972048765983, |
| "grad_norm": 0.7051548361778259, |
| "learning_rate": 2.1744324970131422e-05, |
| "loss": 0.2748, |
| "step": 1321 |
| }, |
| { |
| "epoch": 3.1403508771929824, |
| "grad_norm": 0.6684269905090332, |
| "learning_rate": 2.1684587813620073e-05, |
| "loss": 0.2622, |
| "step": 1322 |
| }, |
| { |
| "epoch": 3.142729705619982, |
| "grad_norm": 0.5970059633255005, |
| "learning_rate": 2.1624850657108723e-05, |
| "loss": 0.2305, |
| "step": 1323 |
| }, |
| { |
| "epoch": 3.1451085340469818, |
| "grad_norm": 0.6092391014099121, |
| "learning_rate": 2.1565113500597373e-05, |
| "loss": 0.2169, |
| "step": 1324 |
| }, |
| { |
| "epoch": 3.1474873624739814, |
| "grad_norm": 0.7716182470321655, |
| "learning_rate": 2.1505376344086024e-05, |
| "loss": 0.2244, |
| "step": 1325 |
| }, |
| { |
| "epoch": 3.149866190900981, |
| "grad_norm": 0.7600685954093933, |
| "learning_rate": 2.1445639187574674e-05, |
| "loss": 0.2807, |
| "step": 1326 |
| }, |
| { |
| "epoch": 3.1522450193279807, |
| "grad_norm": 0.6617460250854492, |
| "learning_rate": 2.1385902031063324e-05, |
| "loss": 0.2643, |
| "step": 1327 |
| }, |
| { |
| "epoch": 3.154623847754981, |
| "grad_norm": 0.6389066576957703, |
| "learning_rate": 2.132616487455197e-05, |
| "loss": 0.2674, |
| "step": 1328 |
| }, |
| { |
| "epoch": 3.1570026761819805, |
| "grad_norm": 0.6522324085235596, |
| "learning_rate": 2.126642771804062e-05, |
| "loss": 0.2083, |
| "step": 1329 |
| }, |
| { |
| "epoch": 3.15938150460898, |
| "grad_norm": 0.7513405084609985, |
| "learning_rate": 2.120669056152927e-05, |
| "loss": 0.2785, |
| "step": 1330 |
| }, |
| { |
| "epoch": 3.16176033303598, |
| "grad_norm": 0.5705693364143372, |
| "learning_rate": 2.1146953405017922e-05, |
| "loss": 0.1822, |
| "step": 1331 |
| }, |
| { |
| "epoch": 3.1641391614629795, |
| "grad_norm": 0.6877608299255371, |
| "learning_rate": 2.1087216248506572e-05, |
| "loss": 0.2754, |
| "step": 1332 |
| }, |
| { |
| "epoch": 3.166517989889979, |
| "grad_norm": 0.6934994459152222, |
| "learning_rate": 2.1027479091995223e-05, |
| "loss": 0.1944, |
| "step": 1333 |
| }, |
| { |
| "epoch": 3.168896818316979, |
| "grad_norm": 0.7194043397903442, |
| "learning_rate": 2.0967741935483873e-05, |
| "loss": 0.2657, |
| "step": 1334 |
| }, |
| { |
| "epoch": 3.1712756467439784, |
| "grad_norm": 0.6782123446464539, |
| "learning_rate": 2.0908004778972523e-05, |
| "loss": 0.2531, |
| "step": 1335 |
| }, |
| { |
| "epoch": 3.173654475170978, |
| "grad_norm": 0.7624220848083496, |
| "learning_rate": 2.084826762246117e-05, |
| "loss": 0.2082, |
| "step": 1336 |
| }, |
| { |
| "epoch": 3.176033303597978, |
| "grad_norm": 0.6336691975593567, |
| "learning_rate": 2.078853046594982e-05, |
| "loss": 0.2304, |
| "step": 1337 |
| }, |
| { |
| "epoch": 3.178412132024978, |
| "grad_norm": 0.6183249950408936, |
| "learning_rate": 2.072879330943847e-05, |
| "loss": 0.1725, |
| "step": 1338 |
| }, |
| { |
| "epoch": 3.1807909604519775, |
| "grad_norm": 0.6695713400840759, |
| "learning_rate": 2.066905615292712e-05, |
| "loss": 0.1807, |
| "step": 1339 |
| }, |
| { |
| "epoch": 3.183169788878977, |
| "grad_norm": 0.5882018208503723, |
| "learning_rate": 2.060931899641577e-05, |
| "loss": 0.2072, |
| "step": 1340 |
| }, |
| { |
| "epoch": 3.185548617305977, |
| "grad_norm": 0.6536471843719482, |
| "learning_rate": 2.054958183990442e-05, |
| "loss": 0.2962, |
| "step": 1341 |
| }, |
| { |
| "epoch": 3.1879274457329765, |
| "grad_norm": 0.6349655985832214, |
| "learning_rate": 2.0489844683393072e-05, |
| "loss": 0.2309, |
| "step": 1342 |
| }, |
| { |
| "epoch": 3.190306274159976, |
| "grad_norm": 0.6827989816665649, |
| "learning_rate": 2.0430107526881722e-05, |
| "loss": 0.2236, |
| "step": 1343 |
| }, |
| { |
| "epoch": 3.192685102586976, |
| "grad_norm": 0.6630003452301025, |
| "learning_rate": 2.037037037037037e-05, |
| "loss": 0.2739, |
| "step": 1344 |
| }, |
| { |
| "epoch": 3.1950639310139755, |
| "grad_norm": 0.5332040190696716, |
| "learning_rate": 2.031063321385902e-05, |
| "loss": 0.1947, |
| "step": 1345 |
| }, |
| { |
| "epoch": 3.197442759440975, |
| "grad_norm": 0.624686062335968, |
| "learning_rate": 2.025089605734767e-05, |
| "loss": 0.2403, |
| "step": 1346 |
| }, |
| { |
| "epoch": 3.199821587867975, |
| "grad_norm": 0.6674289703369141, |
| "learning_rate": 2.019115890083632e-05, |
| "loss": 0.2448, |
| "step": 1347 |
| }, |
| { |
| "epoch": 3.202200416294975, |
| "grad_norm": 0.6246338486671448, |
| "learning_rate": 2.013142174432497e-05, |
| "loss": 0.2025, |
| "step": 1348 |
| }, |
| { |
| "epoch": 3.2045792447219745, |
| "grad_norm": 0.6760092377662659, |
| "learning_rate": 2.007168458781362e-05, |
| "loss": 0.2186, |
| "step": 1349 |
| }, |
| { |
| "epoch": 3.206958073148974, |
| "grad_norm": 0.6240797638893127, |
| "learning_rate": 2.001194743130227e-05, |
| "loss": 0.1938, |
| "step": 1350 |
| }, |
| { |
| "epoch": 3.209336901575974, |
| "grad_norm": 0.5926909446716309, |
| "learning_rate": 1.995221027479092e-05, |
| "loss": 0.2389, |
| "step": 1351 |
| }, |
| { |
| "epoch": 3.2117157300029735, |
| "grad_norm": 0.623314619064331, |
| "learning_rate": 1.989247311827957e-05, |
| "loss": 0.2479, |
| "step": 1352 |
| }, |
| { |
| "epoch": 3.214094558429973, |
| "grad_norm": 0.6462867259979248, |
| "learning_rate": 1.9832735961768222e-05, |
| "loss": 0.2363, |
| "step": 1353 |
| }, |
| { |
| "epoch": 3.216473386856973, |
| "grad_norm": 0.6551673412322998, |
| "learning_rate": 1.9772998805256872e-05, |
| "loss": 0.2214, |
| "step": 1354 |
| }, |
| { |
| "epoch": 3.2188522152839725, |
| "grad_norm": 0.6646662354469299, |
| "learning_rate": 1.9713261648745522e-05, |
| "loss": 0.1996, |
| "step": 1355 |
| }, |
| { |
| "epoch": 3.221231043710972, |
| "grad_norm": 0.6474018096923828, |
| "learning_rate": 1.9653524492234173e-05, |
| "loss": 0.2457, |
| "step": 1356 |
| }, |
| { |
| "epoch": 3.2236098721379722, |
| "grad_norm": 0.6855640411376953, |
| "learning_rate": 1.9593787335722823e-05, |
| "loss": 0.271, |
| "step": 1357 |
| }, |
| { |
| "epoch": 3.225988700564972, |
| "grad_norm": 0.6802095174789429, |
| "learning_rate": 1.9534050179211473e-05, |
| "loss": 0.2271, |
| "step": 1358 |
| }, |
| { |
| "epoch": 3.2283675289919715, |
| "grad_norm": 0.6579050421714783, |
| "learning_rate": 1.947431302270012e-05, |
| "loss": 0.2601, |
| "step": 1359 |
| }, |
| { |
| "epoch": 3.230746357418971, |
| "grad_norm": 0.6538249850273132, |
| "learning_rate": 1.941457586618877e-05, |
| "loss": 0.257, |
| "step": 1360 |
| }, |
| { |
| "epoch": 3.233125185845971, |
| "grad_norm": 0.6673462986946106, |
| "learning_rate": 1.935483870967742e-05, |
| "loss": 0.2185, |
| "step": 1361 |
| }, |
| { |
| "epoch": 3.2355040142729705, |
| "grad_norm": 0.7256568074226379, |
| "learning_rate": 1.929510155316607e-05, |
| "loss": 0.2904, |
| "step": 1362 |
| }, |
| { |
| "epoch": 3.23788284269997, |
| "grad_norm": 0.5458927750587463, |
| "learning_rate": 1.923536439665472e-05, |
| "loss": 0.1592, |
| "step": 1363 |
| }, |
| { |
| "epoch": 3.24026167112697, |
| "grad_norm": 0.6696286201477051, |
| "learning_rate": 1.9175627240143372e-05, |
| "loss": 0.2131, |
| "step": 1364 |
| }, |
| { |
| "epoch": 3.2426404995539695, |
| "grad_norm": 0.7205179333686829, |
| "learning_rate": 1.9115890083632022e-05, |
| "loss": 0.2046, |
| "step": 1365 |
| }, |
| { |
| "epoch": 3.2450193279809696, |
| "grad_norm": 0.7840339541435242, |
| "learning_rate": 1.9056152927120672e-05, |
| "loss": 0.2388, |
| "step": 1366 |
| }, |
| { |
| "epoch": 3.2473981564079692, |
| "grad_norm": 0.7273231744766235, |
| "learning_rate": 1.899641577060932e-05, |
| "loss": 0.2431, |
| "step": 1367 |
| }, |
| { |
| "epoch": 3.249776984834969, |
| "grad_norm": 0.7814990282058716, |
| "learning_rate": 1.893667861409797e-05, |
| "loss": 0.2464, |
| "step": 1368 |
| }, |
| { |
| "epoch": 3.2521558132619686, |
| "grad_norm": 0.8068521618843079, |
| "learning_rate": 1.887694145758662e-05, |
| "loss": 0.299, |
| "step": 1369 |
| }, |
| { |
| "epoch": 3.254534641688968, |
| "grad_norm": 0.6146328449249268, |
| "learning_rate": 1.881720430107527e-05, |
| "loss": 0.2077, |
| "step": 1370 |
| }, |
| { |
| "epoch": 3.256913470115968, |
| "grad_norm": 0.6214416027069092, |
| "learning_rate": 1.875746714456392e-05, |
| "loss": 0.243, |
| "step": 1371 |
| }, |
| { |
| "epoch": 3.2592922985429675, |
| "grad_norm": 0.6775934100151062, |
| "learning_rate": 1.869772998805257e-05, |
| "loss": 0.2181, |
| "step": 1372 |
| }, |
| { |
| "epoch": 3.261671126969967, |
| "grad_norm": 0.6707413792610168, |
| "learning_rate": 1.863799283154122e-05, |
| "loss": 0.2385, |
| "step": 1373 |
| }, |
| { |
| "epoch": 3.264049955396967, |
| "grad_norm": 0.6954108476638794, |
| "learning_rate": 1.857825567502987e-05, |
| "loss": 0.2367, |
| "step": 1374 |
| }, |
| { |
| "epoch": 3.2664287838239665, |
| "grad_norm": 0.683694064617157, |
| "learning_rate": 1.8518518518518518e-05, |
| "loss": 0.2761, |
| "step": 1375 |
| }, |
| { |
| "epoch": 3.268807612250966, |
| "grad_norm": 0.6562153697013855, |
| "learning_rate": 1.845878136200717e-05, |
| "loss": 0.1777, |
| "step": 1376 |
| }, |
| { |
| "epoch": 3.2711864406779663, |
| "grad_norm": 0.6518459916114807, |
| "learning_rate": 1.839904420549582e-05, |
| "loss": 0.228, |
| "step": 1377 |
| }, |
| { |
| "epoch": 3.273565269104966, |
| "grad_norm": 0.5777814984321594, |
| "learning_rate": 1.833930704898447e-05, |
| "loss": 0.22, |
| "step": 1378 |
| }, |
| { |
| "epoch": 3.2759440975319656, |
| "grad_norm": 0.6501063108444214, |
| "learning_rate": 1.827956989247312e-05, |
| "loss": 0.2363, |
| "step": 1379 |
| }, |
| { |
| "epoch": 3.2783229259589652, |
| "grad_norm": 0.6595446467399597, |
| "learning_rate": 1.821983273596177e-05, |
| "loss": 0.2106, |
| "step": 1380 |
| }, |
| { |
| "epoch": 3.280701754385965, |
| "grad_norm": 0.66554856300354, |
| "learning_rate": 1.816009557945042e-05, |
| "loss": 0.2224, |
| "step": 1381 |
| }, |
| { |
| "epoch": 3.2830805828129646, |
| "grad_norm": 0.6982068419456482, |
| "learning_rate": 1.8100358422939067e-05, |
| "loss": 0.2215, |
| "step": 1382 |
| }, |
| { |
| "epoch": 3.285459411239964, |
| "grad_norm": 0.7318804860115051, |
| "learning_rate": 1.8040621266427717e-05, |
| "loss": 0.2905, |
| "step": 1383 |
| }, |
| { |
| "epoch": 3.287838239666964, |
| "grad_norm": 0.685772716999054, |
| "learning_rate": 1.7980884109916368e-05, |
| "loss": 0.2241, |
| "step": 1384 |
| }, |
| { |
| "epoch": 3.2902170680939635, |
| "grad_norm": 0.662263035774231, |
| "learning_rate": 1.7921146953405018e-05, |
| "loss": 0.2333, |
| "step": 1385 |
| }, |
| { |
| "epoch": 3.2925958965209636, |
| "grad_norm": 0.6051913499832153, |
| "learning_rate": 1.7861409796893668e-05, |
| "loss": 0.2162, |
| "step": 1386 |
| }, |
| { |
| "epoch": 3.2949747249479633, |
| "grad_norm": 0.6377174258232117, |
| "learning_rate": 1.780167264038232e-05, |
| "loss": 0.1745, |
| "step": 1387 |
| }, |
| { |
| "epoch": 3.297353553374963, |
| "grad_norm": 0.6460472345352173, |
| "learning_rate": 1.774193548387097e-05, |
| "loss": 0.2591, |
| "step": 1388 |
| }, |
| { |
| "epoch": 3.2997323818019626, |
| "grad_norm": 0.5293865203857422, |
| "learning_rate": 1.768219832735962e-05, |
| "loss": 0.188, |
| "step": 1389 |
| }, |
| { |
| "epoch": 3.3021112102289623, |
| "grad_norm": 0.9448232650756836, |
| "learning_rate": 1.7622461170848266e-05, |
| "loss": 0.2767, |
| "step": 1390 |
| }, |
| { |
| "epoch": 3.304490038655962, |
| "grad_norm": 0.6465947031974792, |
| "learning_rate": 1.7562724014336916e-05, |
| "loss": 0.2408, |
| "step": 1391 |
| }, |
| { |
| "epoch": 3.3068688670829616, |
| "grad_norm": 0.7603315711021423, |
| "learning_rate": 1.7502986857825567e-05, |
| "loss": 0.2325, |
| "step": 1392 |
| }, |
| { |
| "epoch": 3.3092476955099612, |
| "grad_norm": 0.6722696423530579, |
| "learning_rate": 1.7443249701314217e-05, |
| "loss": 0.2089, |
| "step": 1393 |
| }, |
| { |
| "epoch": 3.311626523936961, |
| "grad_norm": 0.7081143856048584, |
| "learning_rate": 1.7383512544802867e-05, |
| "loss": 0.2684, |
| "step": 1394 |
| }, |
| { |
| "epoch": 3.314005352363961, |
| "grad_norm": 0.6449976563453674, |
| "learning_rate": 1.7323775388291518e-05, |
| "loss": 0.2041, |
| "step": 1395 |
| }, |
| { |
| "epoch": 3.3163841807909606, |
| "grad_norm": 0.8436914682388306, |
| "learning_rate": 1.7264038231780168e-05, |
| "loss": 0.2326, |
| "step": 1396 |
| }, |
| { |
| "epoch": 3.3187630092179603, |
| "grad_norm": 0.6485365033149719, |
| "learning_rate": 1.7204301075268818e-05, |
| "loss": 0.2081, |
| "step": 1397 |
| }, |
| { |
| "epoch": 3.32114183764496, |
| "grad_norm": 0.6128183603286743, |
| "learning_rate": 1.714456391875747e-05, |
| "loss": 0.1998, |
| "step": 1398 |
| }, |
| { |
| "epoch": 3.3235206660719596, |
| "grad_norm": 0.6225998401641846, |
| "learning_rate": 1.708482676224612e-05, |
| "loss": 0.1919, |
| "step": 1399 |
| }, |
| { |
| "epoch": 3.3258994944989593, |
| "grad_norm": 0.7354225516319275, |
| "learning_rate": 1.702508960573477e-05, |
| "loss": 0.2243, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.3258994944989593, |
| "eval_loss": 0.40811267495155334, |
| "eval_runtime": 24.7259, |
| "eval_samples_per_second": 30.252, |
| "eval_steps_per_second": 15.126, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.328278322925959, |
| "grad_norm": 0.693684458732605, |
| "learning_rate": 1.696535244922342e-05, |
| "loss": 0.1834, |
| "step": 1401 |
| }, |
| { |
| "epoch": 3.3306571513529586, |
| "grad_norm": 0.7312796711921692, |
| "learning_rate": 1.690561529271207e-05, |
| "loss": 0.2561, |
| "step": 1402 |
| }, |
| { |
| "epoch": 3.3330359797799582, |
| "grad_norm": 0.6952280402183533, |
| "learning_rate": 1.684587813620072e-05, |
| "loss": 0.2586, |
| "step": 1403 |
| }, |
| { |
| "epoch": 3.335414808206958, |
| "grad_norm": 0.6589756011962891, |
| "learning_rate": 1.678614097968937e-05, |
| "loss": 0.2643, |
| "step": 1404 |
| }, |
| { |
| "epoch": 3.3377936366339576, |
| "grad_norm": 0.5879402756690979, |
| "learning_rate": 1.6726403823178017e-05, |
| "loss": 0.2092, |
| "step": 1405 |
| }, |
| { |
| "epoch": 3.3401724650609577, |
| "grad_norm": 0.5845806002616882, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 0.2077, |
| "step": 1406 |
| }, |
| { |
| "epoch": 3.3425512934879573, |
| "grad_norm": 0.7089442014694214, |
| "learning_rate": 1.6606929510155318e-05, |
| "loss": 0.2417, |
| "step": 1407 |
| }, |
| { |
| "epoch": 3.344930121914957, |
| "grad_norm": 0.6234202980995178, |
| "learning_rate": 1.6547192353643968e-05, |
| "loss": 0.1982, |
| "step": 1408 |
| }, |
| { |
| "epoch": 3.3473089503419566, |
| "grad_norm": 0.6180989742279053, |
| "learning_rate": 1.648745519713262e-05, |
| "loss": 0.1913, |
| "step": 1409 |
| }, |
| { |
| "epoch": 3.3496877787689563, |
| "grad_norm": 0.6704964637756348, |
| "learning_rate": 1.642771804062127e-05, |
| "loss": 0.2388, |
| "step": 1410 |
| }, |
| { |
| "epoch": 3.352066607195956, |
| "grad_norm": 0.6707442998886108, |
| "learning_rate": 1.636798088410992e-05, |
| "loss": 0.2897, |
| "step": 1411 |
| }, |
| { |
| "epoch": 3.3544454356229556, |
| "grad_norm": 0.7604225873947144, |
| "learning_rate": 1.630824372759857e-05, |
| "loss": 0.2933, |
| "step": 1412 |
| }, |
| { |
| "epoch": 3.3568242640499553, |
| "grad_norm": 0.7985626459121704, |
| "learning_rate": 1.6248506571087216e-05, |
| "loss": 0.2905, |
| "step": 1413 |
| }, |
| { |
| "epoch": 3.359203092476955, |
| "grad_norm": 0.6078615188598633, |
| "learning_rate": 1.6188769414575866e-05, |
| "loss": 0.2217, |
| "step": 1414 |
| }, |
| { |
| "epoch": 3.361581920903955, |
| "grad_norm": 0.8050974011421204, |
| "learning_rate": 1.6129032258064517e-05, |
| "loss": 0.2299, |
| "step": 1415 |
| }, |
| { |
| "epoch": 3.3639607493309547, |
| "grad_norm": 0.675726056098938, |
| "learning_rate": 1.6069295101553167e-05, |
| "loss": 0.1939, |
| "step": 1416 |
| }, |
| { |
| "epoch": 3.3663395777579543, |
| "grad_norm": 0.6330052614212036, |
| "learning_rate": 1.6009557945041817e-05, |
| "loss": 0.2316, |
| "step": 1417 |
| }, |
| { |
| "epoch": 3.368718406184954, |
| "grad_norm": 0.7237552404403687, |
| "learning_rate": 1.5949820788530468e-05, |
| "loss": 0.2821, |
| "step": 1418 |
| }, |
| { |
| "epoch": 3.3710972346119537, |
| "grad_norm": 0.6474528312683105, |
| "learning_rate": 1.5890083632019118e-05, |
| "loss": 0.2136, |
| "step": 1419 |
| }, |
| { |
| "epoch": 3.3734760630389533, |
| "grad_norm": 0.7798577547073364, |
| "learning_rate": 1.5830346475507768e-05, |
| "loss": 0.223, |
| "step": 1420 |
| }, |
| { |
| "epoch": 3.375854891465953, |
| "grad_norm": 0.7392546534538269, |
| "learning_rate": 1.5770609318996415e-05, |
| "loss": 0.2513, |
| "step": 1421 |
| }, |
| { |
| "epoch": 3.3782337198929526, |
| "grad_norm": 0.7369937896728516, |
| "learning_rate": 1.5710872162485066e-05, |
| "loss": 0.2672, |
| "step": 1422 |
| }, |
| { |
| "epoch": 3.3806125483199523, |
| "grad_norm": 0.7227432131767273, |
| "learning_rate": 1.5651135005973716e-05, |
| "loss": 0.2551, |
| "step": 1423 |
| }, |
| { |
| "epoch": 3.382991376746952, |
| "grad_norm": 0.6788824200630188, |
| "learning_rate": 1.5591397849462366e-05, |
| "loss": 0.2378, |
| "step": 1424 |
| }, |
| { |
| "epoch": 3.3853702051739516, |
| "grad_norm": 0.6660415530204773, |
| "learning_rate": 1.5531660692951016e-05, |
| "loss": 0.22, |
| "step": 1425 |
| }, |
| { |
| "epoch": 3.3877490336009517, |
| "grad_norm": 0.6559244394302368, |
| "learning_rate": 1.5471923536439667e-05, |
| "loss": 0.2282, |
| "step": 1426 |
| }, |
| { |
| "epoch": 3.3901278620279514, |
| "grad_norm": 0.7439149618148804, |
| "learning_rate": 1.5412186379928317e-05, |
| "loss": 0.251, |
| "step": 1427 |
| }, |
| { |
| "epoch": 3.392506690454951, |
| "grad_norm": 0.6336953639984131, |
| "learning_rate": 1.5352449223416964e-05, |
| "loss": 0.2226, |
| "step": 1428 |
| }, |
| { |
| "epoch": 3.3948855188819507, |
| "grad_norm": 0.6222031712532043, |
| "learning_rate": 1.5292712066905614e-05, |
| "loss": 0.2445, |
| "step": 1429 |
| }, |
| { |
| "epoch": 3.3972643473089503, |
| "grad_norm": 0.6259122490882874, |
| "learning_rate": 1.5232974910394265e-05, |
| "loss": 0.2376, |
| "step": 1430 |
| }, |
| { |
| "epoch": 3.39964317573595, |
| "grad_norm": 0.6915052533149719, |
| "learning_rate": 1.5173237753882915e-05, |
| "loss": 0.2365, |
| "step": 1431 |
| }, |
| { |
| "epoch": 3.4020220041629496, |
| "grad_norm": 0.7592293620109558, |
| "learning_rate": 1.5113500597371565e-05, |
| "loss": 0.2395, |
| "step": 1432 |
| }, |
| { |
| "epoch": 3.4044008325899493, |
| "grad_norm": 0.7728424668312073, |
| "learning_rate": 1.5053763440860215e-05, |
| "loss": 0.2445, |
| "step": 1433 |
| }, |
| { |
| "epoch": 3.406779661016949, |
| "grad_norm": 0.602087140083313, |
| "learning_rate": 1.4994026284348866e-05, |
| "loss": 0.2054, |
| "step": 1434 |
| }, |
| { |
| "epoch": 3.409158489443949, |
| "grad_norm": 0.5934260487556458, |
| "learning_rate": 1.4934289127837516e-05, |
| "loss": 0.178, |
| "step": 1435 |
| }, |
| { |
| "epoch": 3.4115373178709487, |
| "grad_norm": 0.6058094501495361, |
| "learning_rate": 1.4874551971326165e-05, |
| "loss": 0.1932, |
| "step": 1436 |
| }, |
| { |
| "epoch": 3.4139161462979484, |
| "grad_norm": 0.6478481888771057, |
| "learning_rate": 1.4814814814814815e-05, |
| "loss": 0.2139, |
| "step": 1437 |
| }, |
| { |
| "epoch": 3.416294974724948, |
| "grad_norm": 0.7313652634620667, |
| "learning_rate": 1.4755077658303465e-05, |
| "loss": 0.2884, |
| "step": 1438 |
| }, |
| { |
| "epoch": 3.4186738031519477, |
| "grad_norm": 0.6436812281608582, |
| "learning_rate": 1.4695340501792116e-05, |
| "loss": 0.2411, |
| "step": 1439 |
| }, |
| { |
| "epoch": 3.4210526315789473, |
| "grad_norm": 0.5855724215507507, |
| "learning_rate": 1.4635603345280766e-05, |
| "loss": 0.2268, |
| "step": 1440 |
| }, |
| { |
| "epoch": 3.423431460005947, |
| "grad_norm": 0.6639002561569214, |
| "learning_rate": 1.4575866188769416e-05, |
| "loss": 0.2157, |
| "step": 1441 |
| }, |
| { |
| "epoch": 3.4258102884329467, |
| "grad_norm": 0.6343371272087097, |
| "learning_rate": 1.4516129032258066e-05, |
| "loss": 0.2036, |
| "step": 1442 |
| }, |
| { |
| "epoch": 3.4281891168599463, |
| "grad_norm": 0.6803449988365173, |
| "learning_rate": 1.4456391875746717e-05, |
| "loss": 0.2474, |
| "step": 1443 |
| }, |
| { |
| "epoch": 3.4305679452869464, |
| "grad_norm": 0.5939714908599854, |
| "learning_rate": 1.4396654719235364e-05, |
| "loss": 0.1785, |
| "step": 1444 |
| }, |
| { |
| "epoch": 3.432946773713946, |
| "grad_norm": 0.7304908037185669, |
| "learning_rate": 1.4336917562724014e-05, |
| "loss": 0.2588, |
| "step": 1445 |
| }, |
| { |
| "epoch": 3.4353256021409457, |
| "grad_norm": 0.5911548733711243, |
| "learning_rate": 1.4277180406212664e-05, |
| "loss": 0.2105, |
| "step": 1446 |
| }, |
| { |
| "epoch": 3.4377044305679454, |
| "grad_norm": 0.6603512167930603, |
| "learning_rate": 1.4217443249701315e-05, |
| "loss": 0.2324, |
| "step": 1447 |
| }, |
| { |
| "epoch": 3.440083258994945, |
| "grad_norm": 0.618499755859375, |
| "learning_rate": 1.4157706093189965e-05, |
| "loss": 0.221, |
| "step": 1448 |
| }, |
| { |
| "epoch": 3.4424620874219447, |
| "grad_norm": 0.7231235504150391, |
| "learning_rate": 1.4097968936678615e-05, |
| "loss": 0.2625, |
| "step": 1449 |
| }, |
| { |
| "epoch": 3.4448409158489444, |
| "grad_norm": 0.6488828659057617, |
| "learning_rate": 1.4038231780167265e-05, |
| "loss": 0.2161, |
| "step": 1450 |
| }, |
| { |
| "epoch": 3.447219744275944, |
| "grad_norm": 0.695941150188446, |
| "learning_rate": 1.3978494623655914e-05, |
| "loss": 0.1978, |
| "step": 1451 |
| }, |
| { |
| "epoch": 3.4495985727029437, |
| "grad_norm": 0.6400462985038757, |
| "learning_rate": 1.3918757467144564e-05, |
| "loss": 0.2576, |
| "step": 1452 |
| }, |
| { |
| "epoch": 3.4519774011299433, |
| "grad_norm": 0.6457123160362244, |
| "learning_rate": 1.3859020310633215e-05, |
| "loss": 0.2296, |
| "step": 1453 |
| }, |
| { |
| "epoch": 3.454356229556943, |
| "grad_norm": 0.6414039731025696, |
| "learning_rate": 1.3799283154121865e-05, |
| "loss": 0.2452, |
| "step": 1454 |
| }, |
| { |
| "epoch": 3.456735057983943, |
| "grad_norm": 0.7506712675094604, |
| "learning_rate": 1.3739545997610515e-05, |
| "loss": 0.2473, |
| "step": 1455 |
| }, |
| { |
| "epoch": 3.4591138864109428, |
| "grad_norm": 0.7017701864242554, |
| "learning_rate": 1.3679808841099166e-05, |
| "loss": 0.2143, |
| "step": 1456 |
| }, |
| { |
| "epoch": 3.4614927148379424, |
| "grad_norm": 0.6323108077049255, |
| "learning_rate": 1.3620071684587816e-05, |
| "loss": 0.2232, |
| "step": 1457 |
| }, |
| { |
| "epoch": 3.463871543264942, |
| "grad_norm": 0.6644439101219177, |
| "learning_rate": 1.3560334528076466e-05, |
| "loss": 0.2429, |
| "step": 1458 |
| }, |
| { |
| "epoch": 3.4662503716919417, |
| "grad_norm": 0.7907066345214844, |
| "learning_rate": 1.3500597371565113e-05, |
| "loss": 0.2537, |
| "step": 1459 |
| }, |
| { |
| "epoch": 3.4686292001189414, |
| "grad_norm": 0.6679920554161072, |
| "learning_rate": 1.3440860215053763e-05, |
| "loss": 0.1961, |
| "step": 1460 |
| }, |
| { |
| "epoch": 3.471008028545941, |
| "grad_norm": 0.7501968145370483, |
| "learning_rate": 1.3381123058542414e-05, |
| "loss": 0.2933, |
| "step": 1461 |
| }, |
| { |
| "epoch": 3.4733868569729407, |
| "grad_norm": 0.6941911578178406, |
| "learning_rate": 1.3321385902031064e-05, |
| "loss": 0.2156, |
| "step": 1462 |
| }, |
| { |
| "epoch": 3.4757656853999404, |
| "grad_norm": 0.6447787284851074, |
| "learning_rate": 1.3261648745519714e-05, |
| "loss": 0.1829, |
| "step": 1463 |
| }, |
| { |
| "epoch": 3.4781445138269405, |
| "grad_norm": 0.6218757629394531, |
| "learning_rate": 1.3201911589008365e-05, |
| "loss": 0.2292, |
| "step": 1464 |
| }, |
| { |
| "epoch": 3.48052334225394, |
| "grad_norm": 0.7498683929443359, |
| "learning_rate": 1.3142174432497015e-05, |
| "loss": 0.2756, |
| "step": 1465 |
| }, |
| { |
| "epoch": 3.4829021706809398, |
| "grad_norm": 0.6936948299407959, |
| "learning_rate": 1.3082437275985665e-05, |
| "loss": 0.2607, |
| "step": 1466 |
| }, |
| { |
| "epoch": 3.4852809991079394, |
| "grad_norm": 0.6592556238174438, |
| "learning_rate": 1.3022700119474312e-05, |
| "loss": 0.1836, |
| "step": 1467 |
| }, |
| { |
| "epoch": 3.487659827534939, |
| "grad_norm": 0.6678502559661865, |
| "learning_rate": 1.2962962962962962e-05, |
| "loss": 0.249, |
| "step": 1468 |
| }, |
| { |
| "epoch": 3.4900386559619387, |
| "grad_norm": 0.5917351841926575, |
| "learning_rate": 1.2903225806451613e-05, |
| "loss": 0.2522, |
| "step": 1469 |
| }, |
| { |
| "epoch": 3.4924174843889384, |
| "grad_norm": 0.7131730914115906, |
| "learning_rate": 1.2843488649940263e-05, |
| "loss": 0.2573, |
| "step": 1470 |
| }, |
| { |
| "epoch": 3.494796312815938, |
| "grad_norm": 0.6857194900512695, |
| "learning_rate": 1.2783751493428913e-05, |
| "loss": 0.2162, |
| "step": 1471 |
| }, |
| { |
| "epoch": 3.4971751412429377, |
| "grad_norm": 0.5957016348838806, |
| "learning_rate": 1.2724014336917564e-05, |
| "loss": 0.1861, |
| "step": 1472 |
| }, |
| { |
| "epoch": 3.4995539696699374, |
| "grad_norm": 0.6489928364753723, |
| "learning_rate": 1.2664277180406214e-05, |
| "loss": 0.2385, |
| "step": 1473 |
| }, |
| { |
| "epoch": 3.501932798096937, |
| "grad_norm": 0.6470348834991455, |
| "learning_rate": 1.2604540023894863e-05, |
| "loss": 0.261, |
| "step": 1474 |
| }, |
| { |
| "epoch": 3.504311626523937, |
| "grad_norm": 0.6595022678375244, |
| "learning_rate": 1.2544802867383513e-05, |
| "loss": 0.2107, |
| "step": 1475 |
| }, |
| { |
| "epoch": 3.506690454950937, |
| "grad_norm": 0.6955150961875916, |
| "learning_rate": 1.2485065710872163e-05, |
| "loss": 0.2775, |
| "step": 1476 |
| }, |
| { |
| "epoch": 3.5090692833779364, |
| "grad_norm": 0.5981859564781189, |
| "learning_rate": 1.2425328554360813e-05, |
| "loss": 0.2169, |
| "step": 1477 |
| }, |
| { |
| "epoch": 3.511448111804936, |
| "grad_norm": 0.7145429849624634, |
| "learning_rate": 1.2365591397849464e-05, |
| "loss": 0.236, |
| "step": 1478 |
| }, |
| { |
| "epoch": 3.5138269402319358, |
| "grad_norm": 0.7278533577919006, |
| "learning_rate": 1.2305854241338114e-05, |
| "loss": 0.2556, |
| "step": 1479 |
| }, |
| { |
| "epoch": 3.5162057686589354, |
| "grad_norm": 0.6920650005340576, |
| "learning_rate": 1.2246117084826763e-05, |
| "loss": 0.2613, |
| "step": 1480 |
| }, |
| { |
| "epoch": 3.518584597085935, |
| "grad_norm": 0.6695207357406616, |
| "learning_rate": 1.2186379928315413e-05, |
| "loss": 0.2451, |
| "step": 1481 |
| }, |
| { |
| "epoch": 3.5209634255129347, |
| "grad_norm": 0.736623227596283, |
| "learning_rate": 1.2126642771804063e-05, |
| "loss": 0.2641, |
| "step": 1482 |
| }, |
| { |
| "epoch": 3.5233422539399344, |
| "grad_norm": 0.6805233955383301, |
| "learning_rate": 1.2066905615292714e-05, |
| "loss": 0.2185, |
| "step": 1483 |
| }, |
| { |
| "epoch": 3.5257210823669345, |
| "grad_norm": 0.6742261052131653, |
| "learning_rate": 1.2007168458781362e-05, |
| "loss": 0.2401, |
| "step": 1484 |
| }, |
| { |
| "epoch": 3.528099910793934, |
| "grad_norm": 0.5891537666320801, |
| "learning_rate": 1.1947431302270013e-05, |
| "loss": 0.2007, |
| "step": 1485 |
| }, |
| { |
| "epoch": 3.530478739220934, |
| "grad_norm": 0.7856806516647339, |
| "learning_rate": 1.1887694145758663e-05, |
| "loss": 0.2935, |
| "step": 1486 |
| }, |
| { |
| "epoch": 3.5328575676479335, |
| "grad_norm": 0.657417356967926, |
| "learning_rate": 1.1827956989247313e-05, |
| "loss": 0.2079, |
| "step": 1487 |
| }, |
| { |
| "epoch": 3.535236396074933, |
| "grad_norm": 0.817315936088562, |
| "learning_rate": 1.1768219832735962e-05, |
| "loss": 0.2897, |
| "step": 1488 |
| }, |
| { |
| "epoch": 3.5376152245019328, |
| "grad_norm": 0.67595374584198, |
| "learning_rate": 1.1708482676224612e-05, |
| "loss": 0.2737, |
| "step": 1489 |
| }, |
| { |
| "epoch": 3.5399940529289324, |
| "grad_norm": 0.6928833723068237, |
| "learning_rate": 1.1648745519713262e-05, |
| "loss": 0.2437, |
| "step": 1490 |
| }, |
| { |
| "epoch": 3.542372881355932, |
| "grad_norm": 0.6645117402076721, |
| "learning_rate": 1.1589008363201913e-05, |
| "loss": 0.1768, |
| "step": 1491 |
| }, |
| { |
| "epoch": 3.5447517097829317, |
| "grad_norm": 0.7368005514144897, |
| "learning_rate": 1.1529271206690561e-05, |
| "loss": 0.2562, |
| "step": 1492 |
| }, |
| { |
| "epoch": 3.547130538209932, |
| "grad_norm": 0.6753305792808533, |
| "learning_rate": 1.1469534050179212e-05, |
| "loss": 0.2068, |
| "step": 1493 |
| }, |
| { |
| "epoch": 3.549509366636931, |
| "grad_norm": 0.6689797043800354, |
| "learning_rate": 1.1409796893667862e-05, |
| "loss": 0.2141, |
| "step": 1494 |
| }, |
| { |
| "epoch": 3.551888195063931, |
| "grad_norm": 0.7002167701721191, |
| "learning_rate": 1.135005973715651e-05, |
| "loss": 0.2314, |
| "step": 1495 |
| }, |
| { |
| "epoch": 3.554267023490931, |
| "grad_norm": 0.6405853629112244, |
| "learning_rate": 1.129032258064516e-05, |
| "loss": 0.2289, |
| "step": 1496 |
| }, |
| { |
| "epoch": 3.5566458519179305, |
| "grad_norm": 0.7196563482284546, |
| "learning_rate": 1.1230585424133811e-05, |
| "loss": 0.279, |
| "step": 1497 |
| }, |
| { |
| "epoch": 3.55902468034493, |
| "grad_norm": 0.7163687348365784, |
| "learning_rate": 1.1170848267622461e-05, |
| "loss": 0.2054, |
| "step": 1498 |
| }, |
| { |
| "epoch": 3.56140350877193, |
| "grad_norm": 0.6668791174888611, |
| "learning_rate": 1.1111111111111112e-05, |
| "loss": 0.2417, |
| "step": 1499 |
| }, |
| { |
| "epoch": 3.5637823371989295, |
| "grad_norm": 0.8013843894004822, |
| "learning_rate": 1.1051373954599762e-05, |
| "loss": 0.2523, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.5637823371989295, |
| "eval_loss": 0.4071366488933563, |
| "eval_runtime": 24.9101, |
| "eval_samples_per_second": 30.028, |
| "eval_steps_per_second": 15.014, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.566161165625929, |
| "grad_norm": 0.649554431438446, |
| "learning_rate": 1.0991636798088412e-05, |
| "loss": 0.262, |
| "step": 1501 |
| }, |
| { |
| "epoch": 3.568539994052929, |
| "grad_norm": 0.6857299208641052, |
| "learning_rate": 1.0931899641577063e-05, |
| "loss": 0.2615, |
| "step": 1502 |
| }, |
| { |
| "epoch": 3.5709188224799284, |
| "grad_norm": 0.7175583839416504, |
| "learning_rate": 1.0872162485065711e-05, |
| "loss": 0.258, |
| "step": 1503 |
| }, |
| { |
| "epoch": 3.5732976509069285, |
| "grad_norm": 0.6463188529014587, |
| "learning_rate": 1.0812425328554361e-05, |
| "loss": 0.2259, |
| "step": 1504 |
| }, |
| { |
| "epoch": 3.575676479333928, |
| "grad_norm": 0.7195451855659485, |
| "learning_rate": 1.0752688172043012e-05, |
| "loss": 0.2458, |
| "step": 1505 |
| }, |
| { |
| "epoch": 3.578055307760928, |
| "grad_norm": 0.7340347766876221, |
| "learning_rate": 1.0692951015531662e-05, |
| "loss": 0.2478, |
| "step": 1506 |
| }, |
| { |
| "epoch": 3.5804341361879275, |
| "grad_norm": 0.7194491028785706, |
| "learning_rate": 1.063321385902031e-05, |
| "loss": 0.2369, |
| "step": 1507 |
| }, |
| { |
| "epoch": 3.582812964614927, |
| "grad_norm": 0.7110004425048828, |
| "learning_rate": 1.0573476702508961e-05, |
| "loss": 0.257, |
| "step": 1508 |
| }, |
| { |
| "epoch": 3.585191793041927, |
| "grad_norm": 0.6778460144996643, |
| "learning_rate": 1.0513739545997611e-05, |
| "loss": 0.2128, |
| "step": 1509 |
| }, |
| { |
| "epoch": 3.5875706214689265, |
| "grad_norm": 0.7083544135093689, |
| "learning_rate": 1.0454002389486262e-05, |
| "loss": 0.2743, |
| "step": 1510 |
| }, |
| { |
| "epoch": 3.589949449895926, |
| "grad_norm": 0.7141408324241638, |
| "learning_rate": 1.039426523297491e-05, |
| "loss": 0.2752, |
| "step": 1511 |
| }, |
| { |
| "epoch": 3.592328278322926, |
| "grad_norm": 0.6944218873977661, |
| "learning_rate": 1.033452807646356e-05, |
| "loss": 0.2627, |
| "step": 1512 |
| }, |
| { |
| "epoch": 3.594707106749926, |
| "grad_norm": 0.5565508604049683, |
| "learning_rate": 1.027479091995221e-05, |
| "loss": 0.1928, |
| "step": 1513 |
| }, |
| { |
| "epoch": 3.597085935176925, |
| "grad_norm": 0.5949556231498718, |
| "learning_rate": 1.0215053763440861e-05, |
| "loss": 0.2007, |
| "step": 1514 |
| }, |
| { |
| "epoch": 3.599464763603925, |
| "grad_norm": 0.6212142109870911, |
| "learning_rate": 1.015531660692951e-05, |
| "loss": 0.1798, |
| "step": 1515 |
| }, |
| { |
| "epoch": 3.601843592030925, |
| "grad_norm": 0.7298859357833862, |
| "learning_rate": 1.009557945041816e-05, |
| "loss": 0.2466, |
| "step": 1516 |
| }, |
| { |
| "epoch": 3.6042224204579245, |
| "grad_norm": 0.8149348497390747, |
| "learning_rate": 1.003584229390681e-05, |
| "loss": 0.3083, |
| "step": 1517 |
| }, |
| { |
| "epoch": 3.606601248884924, |
| "grad_norm": 0.7248669862747192, |
| "learning_rate": 9.97610513739546e-06, |
| "loss": 0.2124, |
| "step": 1518 |
| }, |
| { |
| "epoch": 3.608980077311924, |
| "grad_norm": 0.6633000373840332, |
| "learning_rate": 9.916367980884111e-06, |
| "loss": 0.2021, |
| "step": 1519 |
| }, |
| { |
| "epoch": 3.6113589057389235, |
| "grad_norm": 0.5681532621383667, |
| "learning_rate": 9.856630824372761e-06, |
| "loss": 0.1449, |
| "step": 1520 |
| }, |
| { |
| "epoch": 3.613737734165923, |
| "grad_norm": 0.6119810938835144, |
| "learning_rate": 9.796893667861412e-06, |
| "loss": 0.2125, |
| "step": 1521 |
| }, |
| { |
| "epoch": 3.6161165625929232, |
| "grad_norm": 0.6357595920562744, |
| "learning_rate": 9.73715651135006e-06, |
| "loss": 0.2047, |
| "step": 1522 |
| }, |
| { |
| "epoch": 3.6184953910199225, |
| "grad_norm": 0.6470745205879211, |
| "learning_rate": 9.67741935483871e-06, |
| "loss": 0.2453, |
| "step": 1523 |
| }, |
| { |
| "epoch": 3.6208742194469226, |
| "grad_norm": 0.6681517958641052, |
| "learning_rate": 9.61768219832736e-06, |
| "loss": 0.213, |
| "step": 1524 |
| }, |
| { |
| "epoch": 3.623253047873922, |
| "grad_norm": 0.6562544107437134, |
| "learning_rate": 9.557945041816011e-06, |
| "loss": 0.2493, |
| "step": 1525 |
| }, |
| { |
| "epoch": 3.625631876300922, |
| "grad_norm": 0.7458012700080872, |
| "learning_rate": 9.49820788530466e-06, |
| "loss": 0.2309, |
| "step": 1526 |
| }, |
| { |
| "epoch": 3.6280107047279215, |
| "grad_norm": 0.6401541233062744, |
| "learning_rate": 9.43847072879331e-06, |
| "loss": 0.2141, |
| "step": 1527 |
| }, |
| { |
| "epoch": 3.630389533154921, |
| "grad_norm": 0.6830606460571289, |
| "learning_rate": 9.37873357228196e-06, |
| "loss": 0.2194, |
| "step": 1528 |
| }, |
| { |
| "epoch": 3.632768361581921, |
| "grad_norm": 0.613207995891571, |
| "learning_rate": 9.31899641577061e-06, |
| "loss": 0.2095, |
| "step": 1529 |
| }, |
| { |
| "epoch": 3.6351471900089205, |
| "grad_norm": 0.7548460960388184, |
| "learning_rate": 9.259259259259259e-06, |
| "loss": 0.2218, |
| "step": 1530 |
| }, |
| { |
| "epoch": 3.63752601843592, |
| "grad_norm": 0.6122269034385681, |
| "learning_rate": 9.19952210274791e-06, |
| "loss": 0.2229, |
| "step": 1531 |
| }, |
| { |
| "epoch": 3.63990484686292, |
| "grad_norm": 0.763201117515564, |
| "learning_rate": 9.13978494623656e-06, |
| "loss": 0.2915, |
| "step": 1532 |
| }, |
| { |
| "epoch": 3.64228367528992, |
| "grad_norm": 0.5817426443099976, |
| "learning_rate": 9.08004778972521e-06, |
| "loss": 0.2133, |
| "step": 1533 |
| }, |
| { |
| "epoch": 3.6446625037169196, |
| "grad_norm": 0.6793233752250671, |
| "learning_rate": 9.020310633213859e-06, |
| "loss": 0.1959, |
| "step": 1534 |
| }, |
| { |
| "epoch": 3.6470413321439192, |
| "grad_norm": 0.7488880157470703, |
| "learning_rate": 8.960573476702509e-06, |
| "loss": 0.2454, |
| "step": 1535 |
| }, |
| { |
| "epoch": 3.649420160570919, |
| "grad_norm": 0.6657071709632874, |
| "learning_rate": 8.90083632019116e-06, |
| "loss": 0.2157, |
| "step": 1536 |
| }, |
| { |
| "epoch": 3.6517989889979185, |
| "grad_norm": 0.6894775629043579, |
| "learning_rate": 8.84109916367981e-06, |
| "loss": 0.2498, |
| "step": 1537 |
| }, |
| { |
| "epoch": 3.654177817424918, |
| "grad_norm": 0.6401564478874207, |
| "learning_rate": 8.781362007168458e-06, |
| "loss": 0.1939, |
| "step": 1538 |
| }, |
| { |
| "epoch": 3.656556645851918, |
| "grad_norm": 0.7901315689086914, |
| "learning_rate": 8.721624850657108e-06, |
| "loss": 0.2725, |
| "step": 1539 |
| }, |
| { |
| "epoch": 3.6589354742789175, |
| "grad_norm": 0.6278257369995117, |
| "learning_rate": 8.661887694145759e-06, |
| "loss": 0.2293, |
| "step": 1540 |
| }, |
| { |
| "epoch": 3.661314302705917, |
| "grad_norm": 0.6456495523452759, |
| "learning_rate": 8.602150537634409e-06, |
| "loss": 0.2433, |
| "step": 1541 |
| }, |
| { |
| "epoch": 3.6636931311329173, |
| "grad_norm": 0.6201629042625427, |
| "learning_rate": 8.54241338112306e-06, |
| "loss": 0.1798, |
| "step": 1542 |
| }, |
| { |
| "epoch": 3.6660719595599165, |
| "grad_norm": 0.7407602071762085, |
| "learning_rate": 8.48267622461171e-06, |
| "loss": 0.2428, |
| "step": 1543 |
| }, |
| { |
| "epoch": 3.6684507879869166, |
| "grad_norm": 0.7235687375068665, |
| "learning_rate": 8.42293906810036e-06, |
| "loss": 0.2421, |
| "step": 1544 |
| }, |
| { |
| "epoch": 3.6708296164139163, |
| "grad_norm": 0.6602762341499329, |
| "learning_rate": 8.363201911589009e-06, |
| "loss": 0.2513, |
| "step": 1545 |
| }, |
| { |
| "epoch": 3.673208444840916, |
| "grad_norm": 0.7141433954238892, |
| "learning_rate": 8.303464755077659e-06, |
| "loss": 0.2656, |
| "step": 1546 |
| }, |
| { |
| "epoch": 3.6755872732679156, |
| "grad_norm": 0.6279122233390808, |
| "learning_rate": 8.24372759856631e-06, |
| "loss": 0.1955, |
| "step": 1547 |
| }, |
| { |
| "epoch": 3.6779661016949152, |
| "grad_norm": 0.6232267618179321, |
| "learning_rate": 8.18399044205496e-06, |
| "loss": 0.1934, |
| "step": 1548 |
| }, |
| { |
| "epoch": 3.680344930121915, |
| "grad_norm": 0.7122899889945984, |
| "learning_rate": 8.124253285543608e-06, |
| "loss": 0.2608, |
| "step": 1549 |
| }, |
| { |
| "epoch": 3.6827237585489145, |
| "grad_norm": 0.6872605085372925, |
| "learning_rate": 8.064516129032258e-06, |
| "loss": 0.255, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.685102586975914, |
| "grad_norm": 0.7177041172981262, |
| "learning_rate": 8.004778972520909e-06, |
| "loss": 0.308, |
| "step": 1551 |
| }, |
| { |
| "epoch": 3.687481415402914, |
| "grad_norm": 0.6372491121292114, |
| "learning_rate": 7.945041816009559e-06, |
| "loss": 0.194, |
| "step": 1552 |
| }, |
| { |
| "epoch": 3.689860243829914, |
| "grad_norm": 0.6712515354156494, |
| "learning_rate": 7.885304659498208e-06, |
| "loss": 0.211, |
| "step": 1553 |
| }, |
| { |
| "epoch": 3.6922390722569136, |
| "grad_norm": 0.6724277138710022, |
| "learning_rate": 7.825567502986858e-06, |
| "loss": 0.22, |
| "step": 1554 |
| }, |
| { |
| "epoch": 3.6946179006839133, |
| "grad_norm": 0.729597806930542, |
| "learning_rate": 7.765830346475508e-06, |
| "loss": 0.2722, |
| "step": 1555 |
| }, |
| { |
| "epoch": 3.696996729110913, |
| "grad_norm": 0.7727295160293579, |
| "learning_rate": 7.706093189964159e-06, |
| "loss": 0.2225, |
| "step": 1556 |
| }, |
| { |
| "epoch": 3.6993755575379126, |
| "grad_norm": 0.8393397927284241, |
| "learning_rate": 7.646356033452807e-06, |
| "loss": 0.253, |
| "step": 1557 |
| }, |
| { |
| "epoch": 3.7017543859649122, |
| "grad_norm": 0.5970509052276611, |
| "learning_rate": 7.586618876941457e-06, |
| "loss": 0.2052, |
| "step": 1558 |
| }, |
| { |
| "epoch": 3.704133214391912, |
| "grad_norm": 0.6462686061859131, |
| "learning_rate": 7.526881720430108e-06, |
| "loss": 0.2158, |
| "step": 1559 |
| }, |
| { |
| "epoch": 3.7065120428189116, |
| "grad_norm": 0.6514200568199158, |
| "learning_rate": 7.467144563918758e-06, |
| "loss": 0.2331, |
| "step": 1560 |
| }, |
| { |
| "epoch": 3.708890871245911, |
| "grad_norm": 0.588154137134552, |
| "learning_rate": 7.4074074074074075e-06, |
| "loss": 0.2044, |
| "step": 1561 |
| }, |
| { |
| "epoch": 3.7112696996729113, |
| "grad_norm": 0.7149415016174316, |
| "learning_rate": 7.347670250896058e-06, |
| "loss": 0.2486, |
| "step": 1562 |
| }, |
| { |
| "epoch": 3.7136485280999105, |
| "grad_norm": 0.6742112636566162, |
| "learning_rate": 7.287933094384708e-06, |
| "loss": 0.2379, |
| "step": 1563 |
| }, |
| { |
| "epoch": 3.7160273565269106, |
| "grad_norm": 0.6923094987869263, |
| "learning_rate": 7.228195937873358e-06, |
| "loss": 0.2474, |
| "step": 1564 |
| }, |
| { |
| "epoch": 3.7184061849539103, |
| "grad_norm": 0.7106072902679443, |
| "learning_rate": 7.168458781362007e-06, |
| "loss": 0.2541, |
| "step": 1565 |
| }, |
| { |
| "epoch": 3.72078501338091, |
| "grad_norm": 0.6844406127929688, |
| "learning_rate": 7.108721624850657e-06, |
| "loss": 0.2154, |
| "step": 1566 |
| }, |
| { |
| "epoch": 3.7231638418079096, |
| "grad_norm": 0.7566594481468201, |
| "learning_rate": 7.048984468339308e-06, |
| "loss": 0.2432, |
| "step": 1567 |
| }, |
| { |
| "epoch": 3.7255426702349093, |
| "grad_norm": 0.6934136152267456, |
| "learning_rate": 6.989247311827957e-06, |
| "loss": 0.1982, |
| "step": 1568 |
| }, |
| { |
| "epoch": 3.727921498661909, |
| "grad_norm": 0.6883266568183899, |
| "learning_rate": 6.929510155316607e-06, |
| "loss": 0.2299, |
| "step": 1569 |
| }, |
| { |
| "epoch": 3.7303003270889086, |
| "grad_norm": 0.6047712564468384, |
| "learning_rate": 6.869772998805258e-06, |
| "loss": 0.193, |
| "step": 1570 |
| }, |
| { |
| "epoch": 3.7326791555159087, |
| "grad_norm": 0.7365937232971191, |
| "learning_rate": 6.810035842293908e-06, |
| "loss": 0.224, |
| "step": 1571 |
| }, |
| { |
| "epoch": 3.735057983942908, |
| "grad_norm": 0.7228516936302185, |
| "learning_rate": 6.7502986857825566e-06, |
| "loss": 0.2644, |
| "step": 1572 |
| }, |
| { |
| "epoch": 3.737436812369908, |
| "grad_norm": 0.6381697058677673, |
| "learning_rate": 6.690561529271207e-06, |
| "loss": 0.1966, |
| "step": 1573 |
| }, |
| { |
| "epoch": 3.7398156407969076, |
| "grad_norm": 0.7666082978248596, |
| "learning_rate": 6.630824372759857e-06, |
| "loss": 0.2725, |
| "step": 1574 |
| }, |
| { |
| "epoch": 3.7421944692239073, |
| "grad_norm": 0.6610901951789856, |
| "learning_rate": 6.5710872162485075e-06, |
| "loss": 0.2398, |
| "step": 1575 |
| }, |
| { |
| "epoch": 3.744573297650907, |
| "grad_norm": 0.7147103548049927, |
| "learning_rate": 6.511350059737156e-06, |
| "loss": 0.246, |
| "step": 1576 |
| }, |
| { |
| "epoch": 3.7469521260779066, |
| "grad_norm": 0.681191086769104, |
| "learning_rate": 6.451612903225806e-06, |
| "loss": 0.2185, |
| "step": 1577 |
| }, |
| { |
| "epoch": 3.7493309545049063, |
| "grad_norm": 0.6059114336967468, |
| "learning_rate": 6.391875746714457e-06, |
| "loss": 0.2127, |
| "step": 1578 |
| }, |
| { |
| "epoch": 3.751709782931906, |
| "grad_norm": 0.7046500444412231, |
| "learning_rate": 6.332138590203107e-06, |
| "loss": 0.226, |
| "step": 1579 |
| }, |
| { |
| "epoch": 3.7540886113589056, |
| "grad_norm": 0.6811462640762329, |
| "learning_rate": 6.2724014336917564e-06, |
| "loss": 0.2474, |
| "step": 1580 |
| }, |
| { |
| "epoch": 3.7564674397859052, |
| "grad_norm": 0.7344135642051697, |
| "learning_rate": 6.212664277180407e-06, |
| "loss": 0.2569, |
| "step": 1581 |
| }, |
| { |
| "epoch": 3.7588462682129054, |
| "grad_norm": 0.7511917352676392, |
| "learning_rate": 6.152927120669057e-06, |
| "loss": 0.2848, |
| "step": 1582 |
| }, |
| { |
| "epoch": 3.761225096639905, |
| "grad_norm": 0.7010191679000854, |
| "learning_rate": 6.0931899641577065e-06, |
| "loss": 0.2017, |
| "step": 1583 |
| }, |
| { |
| "epoch": 3.7636039250669047, |
| "grad_norm": 0.722025990486145, |
| "learning_rate": 6.033452807646357e-06, |
| "loss": 0.2473, |
| "step": 1584 |
| }, |
| { |
| "epoch": 3.7659827534939043, |
| "grad_norm": 0.6399304866790771, |
| "learning_rate": 5.973715651135006e-06, |
| "loss": 0.2219, |
| "step": 1585 |
| }, |
| { |
| "epoch": 3.768361581920904, |
| "grad_norm": 0.6593719720840454, |
| "learning_rate": 5.9139784946236566e-06, |
| "loss": 0.2105, |
| "step": 1586 |
| }, |
| { |
| "epoch": 3.7707404103479036, |
| "grad_norm": 0.6997963786125183, |
| "learning_rate": 5.854241338112306e-06, |
| "loss": 0.2317, |
| "step": 1587 |
| }, |
| { |
| "epoch": 3.7731192387749033, |
| "grad_norm": 0.7119143009185791, |
| "learning_rate": 5.794504181600956e-06, |
| "loss": 0.2567, |
| "step": 1588 |
| }, |
| { |
| "epoch": 3.775498067201903, |
| "grad_norm": 0.6408453583717346, |
| "learning_rate": 5.734767025089606e-06, |
| "loss": 0.2252, |
| "step": 1589 |
| }, |
| { |
| "epoch": 3.7778768956289026, |
| "grad_norm": 0.6919389963150024, |
| "learning_rate": 5.675029868578255e-06, |
| "loss": 0.1793, |
| "step": 1590 |
| }, |
| { |
| "epoch": 3.7802557240559027, |
| "grad_norm": 0.6644212007522583, |
| "learning_rate": 5.6152927120669055e-06, |
| "loss": 0.2008, |
| "step": 1591 |
| }, |
| { |
| "epoch": 3.782634552482902, |
| "grad_norm": 0.7227513194084167, |
| "learning_rate": 5.555555555555556e-06, |
| "loss": 0.2518, |
| "step": 1592 |
| }, |
| { |
| "epoch": 3.785013380909902, |
| "grad_norm": 0.7298300266265869, |
| "learning_rate": 5.495818399044206e-06, |
| "loss": 0.2362, |
| "step": 1593 |
| }, |
| { |
| "epoch": 3.7873922093369017, |
| "grad_norm": 0.5880789160728455, |
| "learning_rate": 5.436081242532856e-06, |
| "loss": 0.1614, |
| "step": 1594 |
| }, |
| { |
| "epoch": 3.7897710377639013, |
| "grad_norm": 0.7107828259468079, |
| "learning_rate": 5.376344086021506e-06, |
| "loss": 0.2027, |
| "step": 1595 |
| }, |
| { |
| "epoch": 3.792149866190901, |
| "grad_norm": 0.7066324949264526, |
| "learning_rate": 5.316606929510155e-06, |
| "loss": 0.2406, |
| "step": 1596 |
| }, |
| { |
| "epoch": 3.7945286946179007, |
| "grad_norm": 0.7864248156547546, |
| "learning_rate": 5.256869772998806e-06, |
| "loss": 0.3063, |
| "step": 1597 |
| }, |
| { |
| "epoch": 3.7969075230449003, |
| "grad_norm": 0.8586356043815613, |
| "learning_rate": 5.197132616487455e-06, |
| "loss": 0.2634, |
| "step": 1598 |
| }, |
| { |
| "epoch": 3.7992863514719, |
| "grad_norm": 0.6787355542182922, |
| "learning_rate": 5.137395459976105e-06, |
| "loss": 0.2439, |
| "step": 1599 |
| }, |
| { |
| "epoch": 3.8016651798988996, |
| "grad_norm": 0.6626994013786316, |
| "learning_rate": 5.077658303464755e-06, |
| "loss": 0.2148, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.8016651798988996, |
| "eval_loss": 0.406727135181427, |
| "eval_runtime": 24.8753, |
| "eval_samples_per_second": 30.07, |
| "eval_steps_per_second": 15.035, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.8040440083258993, |
| "grad_norm": 0.7175625562667847, |
| "learning_rate": 5.017921146953405e-06, |
| "loss": 0.2611, |
| "step": 1601 |
| }, |
| { |
| "epoch": 3.8064228367528994, |
| "grad_norm": 0.6542512774467468, |
| "learning_rate": 4.9581839904420555e-06, |
| "loss": 0.2489, |
| "step": 1602 |
| }, |
| { |
| "epoch": 3.808801665179899, |
| "grad_norm": 0.5721225738525391, |
| "learning_rate": 4.898446833930706e-06, |
| "loss": 0.1609, |
| "step": 1603 |
| }, |
| { |
| "epoch": 3.8111804936068987, |
| "grad_norm": 0.6456505060195923, |
| "learning_rate": 4.838709677419355e-06, |
| "loss": 0.2108, |
| "step": 1604 |
| }, |
| { |
| "epoch": 3.8135593220338984, |
| "grad_norm": 0.7169181108474731, |
| "learning_rate": 4.7789725209080055e-06, |
| "loss": 0.2584, |
| "step": 1605 |
| }, |
| { |
| "epoch": 3.815938150460898, |
| "grad_norm": 0.7455881237983704, |
| "learning_rate": 4.719235364396655e-06, |
| "loss": 0.2769, |
| "step": 1606 |
| }, |
| { |
| "epoch": 3.8183169788878977, |
| "grad_norm": 0.706899881362915, |
| "learning_rate": 4.659498207885305e-06, |
| "loss": 0.188, |
| "step": 1607 |
| }, |
| { |
| "epoch": 3.8206958073148973, |
| "grad_norm": 0.7342420220375061, |
| "learning_rate": 4.599761051373955e-06, |
| "loss": 0.2373, |
| "step": 1608 |
| }, |
| { |
| "epoch": 3.823074635741897, |
| "grad_norm": 0.6335705518722534, |
| "learning_rate": 4.540023894862605e-06, |
| "loss": 0.2299, |
| "step": 1609 |
| }, |
| { |
| "epoch": 3.8254534641688966, |
| "grad_norm": 0.7097893953323364, |
| "learning_rate": 4.4802867383512545e-06, |
| "loss": 0.321, |
| "step": 1610 |
| }, |
| { |
| "epoch": 3.8278322925958967, |
| "grad_norm": 0.7030773758888245, |
| "learning_rate": 4.420549581839905e-06, |
| "loss": 0.2354, |
| "step": 1611 |
| }, |
| { |
| "epoch": 3.830211121022896, |
| "grad_norm": 0.7927830815315247, |
| "learning_rate": 4.360812425328554e-06, |
| "loss": 0.2692, |
| "step": 1612 |
| }, |
| { |
| "epoch": 3.832589949449896, |
| "grad_norm": 0.7883430123329163, |
| "learning_rate": 4.3010752688172045e-06, |
| "loss": 0.2383, |
| "step": 1613 |
| }, |
| { |
| "epoch": 3.8349687778768957, |
| "grad_norm": 0.777301549911499, |
| "learning_rate": 4.241338112305855e-06, |
| "loss": 0.2841, |
| "step": 1614 |
| }, |
| { |
| "epoch": 3.8373476063038954, |
| "grad_norm": 0.7094290852546692, |
| "learning_rate": 4.181600955794504e-06, |
| "loss": 0.2444, |
| "step": 1615 |
| }, |
| { |
| "epoch": 3.839726434730895, |
| "grad_norm": 0.6646528244018555, |
| "learning_rate": 4.121863799283155e-06, |
| "loss": 0.2186, |
| "step": 1616 |
| }, |
| { |
| "epoch": 3.8421052631578947, |
| "grad_norm": 0.6202448010444641, |
| "learning_rate": 4.062126642771804e-06, |
| "loss": 0.1955, |
| "step": 1617 |
| }, |
| { |
| "epoch": 3.8444840915848943, |
| "grad_norm": 0.7290095090866089, |
| "learning_rate": 4.002389486260454e-06, |
| "loss": 0.2387, |
| "step": 1618 |
| }, |
| { |
| "epoch": 3.846862920011894, |
| "grad_norm": 0.832423746585846, |
| "learning_rate": 3.942652329749104e-06, |
| "loss": 0.2889, |
| "step": 1619 |
| }, |
| { |
| "epoch": 3.849241748438894, |
| "grad_norm": 0.7755414247512817, |
| "learning_rate": 3.882915173237754e-06, |
| "loss": 0.1783, |
| "step": 1620 |
| }, |
| { |
| "epoch": 3.8516205768658933, |
| "grad_norm": 0.6566904187202454, |
| "learning_rate": 3.8231780167264036e-06, |
| "loss": 0.2044, |
| "step": 1621 |
| }, |
| { |
| "epoch": 3.8539994052928934, |
| "grad_norm": 0.707445502281189, |
| "learning_rate": 3.763440860215054e-06, |
| "loss": 0.3036, |
| "step": 1622 |
| }, |
| { |
| "epoch": 3.856378233719893, |
| "grad_norm": 0.6451879143714905, |
| "learning_rate": 3.7037037037037037e-06, |
| "loss": 0.3102, |
| "step": 1623 |
| }, |
| { |
| "epoch": 3.8587570621468927, |
| "grad_norm": 0.6781940460205078, |
| "learning_rate": 3.643966547192354e-06, |
| "loss": 0.196, |
| "step": 1624 |
| }, |
| { |
| "epoch": 3.8611358905738924, |
| "grad_norm": 0.6479291319847107, |
| "learning_rate": 3.5842293906810035e-06, |
| "loss": 0.2299, |
| "step": 1625 |
| }, |
| { |
| "epoch": 3.863514719000892, |
| "grad_norm": 0.7208328247070312, |
| "learning_rate": 3.524492234169654e-06, |
| "loss": 0.2393, |
| "step": 1626 |
| }, |
| { |
| "epoch": 3.8658935474278917, |
| "grad_norm": 0.6611060500144958, |
| "learning_rate": 3.4647550776583037e-06, |
| "loss": 0.2117, |
| "step": 1627 |
| }, |
| { |
| "epoch": 3.8682723758548914, |
| "grad_norm": 0.6570304036140442, |
| "learning_rate": 3.405017921146954e-06, |
| "loss": 0.2343, |
| "step": 1628 |
| }, |
| { |
| "epoch": 3.870651204281891, |
| "grad_norm": 0.7514824271202087, |
| "learning_rate": 3.3452807646356034e-06, |
| "loss": 0.238, |
| "step": 1629 |
| }, |
| { |
| "epoch": 3.8730300327088907, |
| "grad_norm": 0.6288111209869385, |
| "learning_rate": 3.2855436081242537e-06, |
| "loss": 0.2154, |
| "step": 1630 |
| }, |
| { |
| "epoch": 3.875408861135891, |
| "grad_norm": 0.7032327651977539, |
| "learning_rate": 3.225806451612903e-06, |
| "loss": 0.2279, |
| "step": 1631 |
| }, |
| { |
| "epoch": 3.8777876895628904, |
| "grad_norm": 0.6228746175765991, |
| "learning_rate": 3.1660692951015535e-06, |
| "loss": 0.1931, |
| "step": 1632 |
| }, |
| { |
| "epoch": 3.88016651798989, |
| "grad_norm": 0.6107305884361267, |
| "learning_rate": 3.1063321385902034e-06, |
| "loss": 0.2251, |
| "step": 1633 |
| }, |
| { |
| "epoch": 3.8825453464168898, |
| "grad_norm": 0.6516736149787903, |
| "learning_rate": 3.0465949820788532e-06, |
| "loss": 0.2714, |
| "step": 1634 |
| }, |
| { |
| "epoch": 3.8849241748438894, |
| "grad_norm": 0.7341564893722534, |
| "learning_rate": 2.986857825567503e-06, |
| "loss": 0.241, |
| "step": 1635 |
| }, |
| { |
| "epoch": 3.887303003270889, |
| "grad_norm": 0.6014671325683594, |
| "learning_rate": 2.927120669056153e-06, |
| "loss": 0.1807, |
| "step": 1636 |
| }, |
| { |
| "epoch": 3.8896818316978887, |
| "grad_norm": 0.7346065044403076, |
| "learning_rate": 2.867383512544803e-06, |
| "loss": 0.2421, |
| "step": 1637 |
| }, |
| { |
| "epoch": 3.8920606601248884, |
| "grad_norm": 0.7137173414230347, |
| "learning_rate": 2.8076463560334528e-06, |
| "loss": 0.2606, |
| "step": 1638 |
| }, |
| { |
| "epoch": 3.894439488551888, |
| "grad_norm": 0.7425440549850464, |
| "learning_rate": 2.747909199522103e-06, |
| "loss": 0.218, |
| "step": 1639 |
| }, |
| { |
| "epoch": 3.896818316978888, |
| "grad_norm": 0.5857921838760376, |
| "learning_rate": 2.688172043010753e-06, |
| "loss": 0.1838, |
| "step": 1640 |
| }, |
| { |
| "epoch": 3.8991971454058874, |
| "grad_norm": 0.5919234156608582, |
| "learning_rate": 2.628434886499403e-06, |
| "loss": 0.1848, |
| "step": 1641 |
| }, |
| { |
| "epoch": 3.9015759738328875, |
| "grad_norm": 0.7037178874015808, |
| "learning_rate": 2.5686977299880527e-06, |
| "loss": 0.2417, |
| "step": 1642 |
| }, |
| { |
| "epoch": 3.903954802259887, |
| "grad_norm": 0.7201714515686035, |
| "learning_rate": 2.5089605734767026e-06, |
| "loss": 0.2424, |
| "step": 1643 |
| }, |
| { |
| "epoch": 3.9063336306868868, |
| "grad_norm": 0.7508910298347473, |
| "learning_rate": 2.449223416965353e-06, |
| "loss": 0.2837, |
| "step": 1644 |
| }, |
| { |
| "epoch": 3.9087124591138864, |
| "grad_norm": 0.7130612730979919, |
| "learning_rate": 2.3894862604540028e-06, |
| "loss": 0.2364, |
| "step": 1645 |
| }, |
| { |
| "epoch": 3.911091287540886, |
| "grad_norm": 0.6658805012702942, |
| "learning_rate": 2.3297491039426526e-06, |
| "loss": 0.215, |
| "step": 1646 |
| }, |
| { |
| "epoch": 3.9134701159678857, |
| "grad_norm": 0.6496844291687012, |
| "learning_rate": 2.2700119474313025e-06, |
| "loss": 0.2213, |
| "step": 1647 |
| }, |
| { |
| "epoch": 3.9158489443948854, |
| "grad_norm": 0.6499704122543335, |
| "learning_rate": 2.2102747909199524e-06, |
| "loss": 0.2403, |
| "step": 1648 |
| }, |
| { |
| "epoch": 3.918227772821885, |
| "grad_norm": 0.7895413637161255, |
| "learning_rate": 2.1505376344086023e-06, |
| "loss": 0.2743, |
| "step": 1649 |
| }, |
| { |
| "epoch": 3.9206066012488847, |
| "grad_norm": 0.6984461545944214, |
| "learning_rate": 2.090800477897252e-06, |
| "loss": 0.2492, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.922985429675885, |
| "grad_norm": 0.6958913207054138, |
| "learning_rate": 2.031063321385902e-06, |
| "loss": 0.223, |
| "step": 1651 |
| }, |
| { |
| "epoch": 3.9253642581028845, |
| "grad_norm": 0.8210883736610413, |
| "learning_rate": 1.971326164874552e-06, |
| "loss": 0.3268, |
| "step": 1652 |
| }, |
| { |
| "epoch": 3.927743086529884, |
| "grad_norm": 0.7338096499443054, |
| "learning_rate": 1.9115890083632018e-06, |
| "loss": 0.2821, |
| "step": 1653 |
| }, |
| { |
| "epoch": 3.930121914956884, |
| "grad_norm": 0.7852990031242371, |
| "learning_rate": 1.8518518518518519e-06, |
| "loss": 0.2819, |
| "step": 1654 |
| }, |
| { |
| "epoch": 3.9325007433838834, |
| "grad_norm": 0.587360680103302, |
| "learning_rate": 1.7921146953405017e-06, |
| "loss": 0.219, |
| "step": 1655 |
| }, |
| { |
| "epoch": 3.934879571810883, |
| "grad_norm": 0.5991771221160889, |
| "learning_rate": 1.7323775388291518e-06, |
| "loss": 0.1996, |
| "step": 1656 |
| }, |
| { |
| "epoch": 3.9372584002378828, |
| "grad_norm": 0.7518739104270935, |
| "learning_rate": 1.6726403823178017e-06, |
| "loss": 0.2585, |
| "step": 1657 |
| }, |
| { |
| "epoch": 3.9396372286648824, |
| "grad_norm": 0.725003719329834, |
| "learning_rate": 1.6129032258064516e-06, |
| "loss": 0.2435, |
| "step": 1658 |
| }, |
| { |
| "epoch": 3.942016057091882, |
| "grad_norm": 0.5956133604049683, |
| "learning_rate": 1.5531660692951017e-06, |
| "loss": 0.2052, |
| "step": 1659 |
| }, |
| { |
| "epoch": 3.944394885518882, |
| "grad_norm": 0.5697284936904907, |
| "learning_rate": 1.4934289127837516e-06, |
| "loss": 0.1942, |
| "step": 1660 |
| }, |
| { |
| "epoch": 3.9467737139458814, |
| "grad_norm": 0.6322106122970581, |
| "learning_rate": 1.4336917562724014e-06, |
| "loss": 0.1886, |
| "step": 1661 |
| }, |
| { |
| "epoch": 3.9491525423728815, |
| "grad_norm": 0.6705097556114197, |
| "learning_rate": 1.3739545997610515e-06, |
| "loss": 0.2222, |
| "step": 1662 |
| }, |
| { |
| "epoch": 3.951531370799881, |
| "grad_norm": 0.8009798526763916, |
| "learning_rate": 1.3142174432497014e-06, |
| "loss": 0.2858, |
| "step": 1663 |
| }, |
| { |
| "epoch": 3.953910199226881, |
| "grad_norm": 0.5885463356971741, |
| "learning_rate": 1.2544802867383513e-06, |
| "loss": 0.1702, |
| "step": 1664 |
| }, |
| { |
| "epoch": 3.9562890276538805, |
| "grad_norm": 0.7516363263130188, |
| "learning_rate": 1.1947431302270014e-06, |
| "loss": 0.2719, |
| "step": 1665 |
| }, |
| { |
| "epoch": 3.95866785608088, |
| "grad_norm": 0.6109394431114197, |
| "learning_rate": 1.1350059737156513e-06, |
| "loss": 0.2404, |
| "step": 1666 |
| }, |
| { |
| "epoch": 3.96104668450788, |
| "grad_norm": 0.6410830020904541, |
| "learning_rate": 1.0752688172043011e-06, |
| "loss": 0.2229, |
| "step": 1667 |
| }, |
| { |
| "epoch": 3.9634255129348794, |
| "grad_norm": 0.6514939665794373, |
| "learning_rate": 1.015531660692951e-06, |
| "loss": 0.2404, |
| "step": 1668 |
| }, |
| { |
| "epoch": 3.9658043413618795, |
| "grad_norm": 0.7575217485427856, |
| "learning_rate": 9.557945041816009e-07, |
| "loss": 0.22, |
| "step": 1669 |
| }, |
| { |
| "epoch": 3.9681831697888788, |
| "grad_norm": 0.675889790058136, |
| "learning_rate": 8.960573476702509e-07, |
| "loss": 0.2421, |
| "step": 1670 |
| }, |
| { |
| "epoch": 3.970561998215879, |
| "grad_norm": 0.7055429816246033, |
| "learning_rate": 8.363201911589009e-07, |
| "loss": 0.2476, |
| "step": 1671 |
| }, |
| { |
| "epoch": 3.9729408266428785, |
| "grad_norm": 0.7240319848060608, |
| "learning_rate": 7.765830346475508e-07, |
| "loss": 0.2222, |
| "step": 1672 |
| }, |
| { |
| "epoch": 3.975319655069878, |
| "grad_norm": 0.7141379714012146, |
| "learning_rate": 7.168458781362007e-07, |
| "loss": 0.1832, |
| "step": 1673 |
| }, |
| { |
| "epoch": 3.977698483496878, |
| "grad_norm": 0.6299167275428772, |
| "learning_rate": 6.571087216248507e-07, |
| "loss": 0.2301, |
| "step": 1674 |
| }, |
| { |
| "epoch": 3.9800773119238775, |
| "grad_norm": 0.7430551052093506, |
| "learning_rate": 5.973715651135007e-07, |
| "loss": 0.2718, |
| "step": 1675 |
| }, |
| { |
| "epoch": 3.982456140350877, |
| "grad_norm": 0.6470373272895813, |
| "learning_rate": 5.376344086021506e-07, |
| "loss": 0.2408, |
| "step": 1676 |
| }, |
| { |
| "epoch": 3.984834968777877, |
| "grad_norm": 0.6226888298988342, |
| "learning_rate": 4.778972520908004e-07, |
| "loss": 0.2267, |
| "step": 1677 |
| }, |
| { |
| "epoch": 3.9872137972048765, |
| "grad_norm": 0.6659932732582092, |
| "learning_rate": 4.1816009557945043e-07, |
| "loss": 0.2575, |
| "step": 1678 |
| }, |
| { |
| "epoch": 3.989592625631876, |
| "grad_norm": 0.6825204491615295, |
| "learning_rate": 3.5842293906810036e-07, |
| "loss": 0.2001, |
| "step": 1679 |
| }, |
| { |
| "epoch": 3.991971454058876, |
| "grad_norm": 0.6531214118003845, |
| "learning_rate": 2.9868578255675034e-07, |
| "loss": 0.2078, |
| "step": 1680 |
| }, |
| { |
| "epoch": 3.994350282485876, |
| "grad_norm": 0.674887478351593, |
| "learning_rate": 2.389486260454002e-07, |
| "loss": 0.2757, |
| "step": 1681 |
| }, |
| { |
| "epoch": 3.9967291109128755, |
| "grad_norm": 0.704774796962738, |
| "learning_rate": 1.7921146953405018e-07, |
| "loss": 0.2122, |
| "step": 1682 |
| }, |
| { |
| "epoch": 3.999107939339875, |
| "grad_norm": 0.6378912329673767, |
| "learning_rate": 1.194743130227001e-07, |
| "loss": 0.2269, |
| "step": 1683 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 1.035918951034546, |
| "learning_rate": 5.973715651135006e-08, |
| "loss": 0.2651, |
| "step": 1684 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1684, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.855004092650957e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |