| { |
| "best_global_step": null, |
| "best_metric": 1.3073337078094482, |
| "best_model_checkpoint": null, |
| "epoch": 0.7067795033944015, |
| "eval_steps": 50, |
| "global_step": 1900, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0018599460615642146, |
| "grad_norm": 3.400698661804199, |
| "learning_rate": 1.4037033750100226e-06, |
| "loss": 2.6219, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.003719892123128429, |
| "grad_norm": 3.7983412742614746, |
| "learning_rate": 3.1583325937725507e-06, |
| "loss": 2.5432, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.005579838184692644, |
| "grad_norm": 3.0861151218414307, |
| "learning_rate": 4.912961812535079e-06, |
| "loss": 2.3792, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.007439784246256858, |
| "grad_norm": 2.717170476913452, |
| "learning_rate": 6.667591031297607e-06, |
| "loss": 2.3455, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.009299730307821073, |
| "grad_norm": 2.2805142402648926, |
| "learning_rate": 8.422220250060135e-06, |
| "loss": 2.1424, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.011159676369385288, |
| "grad_norm": 1.9744335412979126, |
| "learning_rate": 1.0176849468822663e-05, |
| "loss": 1.8742, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.013019622430949503, |
| "grad_norm": 1.8863332271575928, |
| "learning_rate": 1.1931478687585193e-05, |
| "loss": 1.7454, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.014879568492513717, |
| "grad_norm": 1.5830954313278198, |
| "learning_rate": 1.368610790634772e-05, |
| "loss": 1.7088, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.016739514554077933, |
| "grad_norm": 1.8321353197097778, |
| "learning_rate": 1.544073712511025e-05, |
| "loss": 1.632, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.018599460615642147, |
| "grad_norm": 1.4320825338363647, |
| "learning_rate": 1.7195366343872776e-05, |
| "loss": 1.4594, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.018599460615642147, |
| "eval_loss": 1.5426534414291382, |
| "eval_runtime": 60.7069, |
| "eval_samples_per_second": 164.726, |
| "eval_steps_per_second": 5.156, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.02045940667720636, |
| "grad_norm": 1.4581339359283447, |
| "learning_rate": 1.8949995562635306e-05, |
| "loss": 1.4841, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.022319352738770577, |
| "grad_norm": 1.3386048078536987, |
| "learning_rate": 2.0704624781397832e-05, |
| "loss": 1.5089, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.02417929880033479, |
| "grad_norm": 1.3601100444793701, |
| "learning_rate": 2.245925400016036e-05, |
| "loss": 1.4723, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.026039244861899007, |
| "grad_norm": 1.2655599117279053, |
| "learning_rate": 2.4213883218922888e-05, |
| "loss": 1.4864, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.02789919092346322, |
| "grad_norm": 1.3355369567871094, |
| "learning_rate": 2.5968512437685417e-05, |
| "loss": 1.4451, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.029759136985027433, |
| "grad_norm": 1.274318814277649, |
| "learning_rate": 2.7723141656447947e-05, |
| "loss": 1.4944, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.031619083046591646, |
| "grad_norm": 2.0729751586914062, |
| "learning_rate": 2.9477770875210473e-05, |
| "loss": 1.437, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.03347902910815587, |
| "grad_norm": 1.2173129320144653, |
| "learning_rate": 3.1232400093973e-05, |
| "loss": 1.482, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.03533897516972008, |
| "grad_norm": 1.2413370609283447, |
| "learning_rate": 3.298702931273553e-05, |
| "loss": 1.3871, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.03719892123128429, |
| "grad_norm": 1.4032536745071411, |
| "learning_rate": 3.4741658531498055e-05, |
| "loss": 1.436, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.03719892123128429, |
| "eval_loss": 1.4507780075073242, |
| "eval_runtime": 60.5546, |
| "eval_samples_per_second": 165.14, |
| "eval_steps_per_second": 5.169, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.039058867292848506, |
| "grad_norm": 1.408437967300415, |
| "learning_rate": 3.509220060941937e-05, |
| "loss": 1.4291, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.04091881335441272, |
| "grad_norm": 1.2020059823989868, |
| "learning_rate": 3.509064158950106e-05, |
| "loss": 1.4322, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.04277875941597694, |
| "grad_norm": 1.2841988801956177, |
| "learning_rate": 3.5087883436606155e-05, |
| "loss": 1.4509, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.04463870547754115, |
| "grad_norm": 1.1238864660263062, |
| "learning_rate": 3.508392633925074e-05, |
| "loss": 1.4467, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.046498651539105366, |
| "grad_norm": 1.285390019416809, |
| "learning_rate": 3.507877056789716e-05, |
| "loss": 1.4269, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.04835859760066958, |
| "grad_norm": 1.1746469736099243, |
| "learning_rate": 3.507241647493555e-05, |
| "loss": 1.4636, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.05021854366223379, |
| "grad_norm": 1.177941083908081, |
| "learning_rate": 3.506486449465971e-05, |
| "loss": 1.4078, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.05207848972379801, |
| "grad_norm": 1.2271251678466797, |
| "learning_rate": 3.505611514323747e-05, |
| "loss": 1.403, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.053938435785362226, |
| "grad_norm": 1.1131396293640137, |
| "learning_rate": 3.5046169018675374e-05, |
| "loss": 1.4511, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.05579838184692644, |
| "grad_norm": 1.195426106452942, |
| "learning_rate": 3.503502680077782e-05, |
| "loss": 1.3926, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.05579838184692644, |
| "eval_loss": 1.428297519683838, |
| "eval_runtime": 60.4864, |
| "eval_samples_per_second": 165.326, |
| "eval_steps_per_second": 5.175, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.05765832790849065, |
| "grad_norm": 1.2685747146606445, |
| "learning_rate": 3.5022689251100616e-05, |
| "loss": 1.4539, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.059518273970054866, |
| "grad_norm": 1.09740149974823, |
| "learning_rate": 3.500915721289888e-05, |
| "loss": 1.431, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.06137822003161908, |
| "grad_norm": 1.1276105642318726, |
| "learning_rate": 3.499443161106944e-05, |
| "loss": 1.3862, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.06323816609318329, |
| "grad_norm": 1.1806763410568237, |
| "learning_rate": 3.497851345208764e-05, |
| "loss": 1.4269, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0650981121547475, |
| "grad_norm": 1.079424500465393, |
| "learning_rate": 3.496140382393849e-05, |
| "loss": 1.3912, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.06695805821631173, |
| "grad_norm": 1.0723962783813477, |
| "learning_rate": 3.4943103896042344e-05, |
| "loss": 1.3961, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.06881800427787595, |
| "grad_norm": 1.1186383962631226, |
| "learning_rate": 3.492361491917497e-05, |
| "loss": 1.4213, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.07067795033944016, |
| "grad_norm": 1.0343255996704102, |
| "learning_rate": 3.4902938225382055e-05, |
| "loss": 1.3989, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.07253789640100437, |
| "grad_norm": 1.141801357269287, |
| "learning_rate": 3.488107522788814e-05, |
| "loss": 1.4074, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.07439784246256859, |
| "grad_norm": 1.0875025987625122, |
| "learning_rate": 3.485802742100007e-05, |
| "loss": 1.4185, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07439784246256859, |
| "eval_loss": 1.4180645942687988, |
| "eval_runtime": 60.4683, |
| "eval_samples_per_second": 165.376, |
| "eval_steps_per_second": 5.176, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0762577885241328, |
| "grad_norm": 1.1400611400604248, |
| "learning_rate": 3.483379638000484e-05, |
| "loss": 1.4337, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.07811773458569701, |
| "grad_norm": 1.0570169687271118, |
| "learning_rate": 3.480838376106189e-05, |
| "loss": 1.4042, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.07997768064726123, |
| "grad_norm": 1.4329943656921387, |
| "learning_rate": 3.478179130108999e-05, |
| "loss": 1.3974, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.08183762670882544, |
| "grad_norm": 1.1569150686264038, |
| "learning_rate": 3.475402081764844e-05, |
| "loss": 1.4402, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.08369757277038965, |
| "grad_norm": 1.0719656944274902, |
| "learning_rate": 3.4725074208812906e-05, |
| "loss": 1.4071, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.08555751883195388, |
| "grad_norm": 0.9809865951538086, |
| "learning_rate": 3.4694953453045645e-05, |
| "loss": 1.4349, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.0874174648935181, |
| "grad_norm": 1.3142799139022827, |
| "learning_rate": 3.466366060906031e-05, |
| "loss": 1.3969, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.0892774109550823, |
| "grad_norm": 1.1862435340881348, |
| "learning_rate": 3.463119781568121e-05, |
| "loss": 1.4122, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.09113735701664652, |
| "grad_norm": 1.1050539016723633, |
| "learning_rate": 3.459756729169715e-05, |
| "loss": 1.3759, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.09299730307821073, |
| "grad_norm": 1.0910768508911133, |
| "learning_rate": 3.456277133570978e-05, |
| "loss": 1.4006, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.09299730307821073, |
| "eval_loss": 1.4099781513214111, |
| "eval_runtime": 60.4718, |
| "eval_samples_per_second": 165.366, |
| "eval_steps_per_second": 5.176, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.09485724913977495, |
| "grad_norm": 1.0490922927856445, |
| "learning_rate": 3.452681232597646e-05, |
| "loss": 1.3863, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.09671719520133916, |
| "grad_norm": 1.0440068244934082, |
| "learning_rate": 3.448969272024775e-05, |
| "loss": 1.4116, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.09857714126290337, |
| "grad_norm": 1.1154160499572754, |
| "learning_rate": 3.4451415055599386e-05, |
| "loss": 1.4171, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.10043708732446759, |
| "grad_norm": 1.091792106628418, |
| "learning_rate": 3.4411981948258904e-05, |
| "loss": 1.3739, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.1022970333860318, |
| "grad_norm": 1.1216192245483398, |
| "learning_rate": 3.437139609342681e-05, |
| "loss": 1.3473, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.10415697944759603, |
| "grad_norm": 1.0132431983947754, |
| "learning_rate": 3.4329660265092366e-05, |
| "loss": 1.4201, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.10601692550916024, |
| "grad_norm": 1.0745493173599243, |
| "learning_rate": 3.4286777315844006e-05, |
| "loss": 1.3943, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.10787687157072445, |
| "grad_norm": 1.173161268234253, |
| "learning_rate": 3.4242750176674336e-05, |
| "loss": 1.4077, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.10973681763228867, |
| "grad_norm": 1.085132122039795, |
| "learning_rate": 3.419758185677985e-05, |
| "loss": 1.3981, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.11159676369385288, |
| "grad_norm": 1.049797534942627, |
| "learning_rate": 3.41512754433552e-05, |
| "loss": 1.4143, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.11159676369385288, |
| "eval_loss": 1.3975080251693726, |
| "eval_runtime": 60.4745, |
| "eval_samples_per_second": 165.359, |
| "eval_steps_per_second": 5.176, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.11345670975541709, |
| "grad_norm": 1.0642030239105225, |
| "learning_rate": 3.4103834101382244e-05, |
| "loss": 1.4265, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.1153166558169813, |
| "grad_norm": 1.074144721031189, |
| "learning_rate": 3.405526107341368e-05, |
| "loss": 1.3677, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.11717660187854552, |
| "grad_norm": 1.2073023319244385, |
| "learning_rate": 3.4005559679351445e-05, |
| "loss": 1.3879, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.11903654794010973, |
| "grad_norm": 1.0445431470870972, |
| "learning_rate": 3.395473331621981e-05, |
| "loss": 1.3625, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.12089649400167395, |
| "grad_norm": 1.1690475940704346, |
| "learning_rate": 3.3902785457933166e-05, |
| "loss": 1.414, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.12275644006323816, |
| "grad_norm": 0.9960464239120483, |
| "learning_rate": 3.3849719655058636e-05, |
| "loss": 1.386, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.12461638612480239, |
| "grad_norm": 1.1535345315933228, |
| "learning_rate": 3.379553953457336e-05, |
| "loss": 1.3309, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.12647633218636659, |
| "grad_norm": 1.1487047672271729, |
| "learning_rate": 3.3740248799616596e-05, |
| "loss": 1.4549, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.1283362782479308, |
| "grad_norm": 1.0776844024658203, |
| "learning_rate": 3.368385122923663e-05, |
| "loss": 1.382, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.130196224309495, |
| "grad_norm": 1.1013463735580444, |
| "learning_rate": 3.362635067813248e-05, |
| "loss": 1.4432, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.130196224309495, |
| "eval_loss": 1.3882778882980347, |
| "eval_runtime": 60.4797, |
| "eval_samples_per_second": 165.345, |
| "eval_steps_per_second": 5.175, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.13205617037105924, |
| "grad_norm": 1.1181257963180542, |
| "learning_rate": 3.356775107639044e-05, |
| "loss": 1.3779, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.13391611643262347, |
| "grad_norm": 1.0939253568649292, |
| "learning_rate": 3.350805642921544e-05, |
| "loss": 1.3873, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.13577606249418767, |
| "grad_norm": 1.0570895671844482, |
| "learning_rate": 3.3447270816657335e-05, |
| "loss": 1.3408, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.1376360085557519, |
| "grad_norm": 1.0405073165893555, |
| "learning_rate": 3.338539839333198e-05, |
| "loss": 1.3577, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.1394959546173161, |
| "grad_norm": 1.0169920921325684, |
| "learning_rate": 3.332244338813734e-05, |
| "loss": 1.3905, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.14135590067888032, |
| "grad_norm": 0.9779713749885559, |
| "learning_rate": 3.325841010396438e-05, |
| "loss": 1.4239, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.14321584674044452, |
| "grad_norm": 1.1131649017333984, |
| "learning_rate": 3.319330291740301e-05, |
| "loss": 1.3793, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.14507579280200875, |
| "grad_norm": 1.091145396232605, |
| "learning_rate": 3.312712627844296e-05, |
| "loss": 1.3784, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.14693573886357295, |
| "grad_norm": 0.997805118560791, |
| "learning_rate": 3.3059884710169595e-05, |
| "loss": 1.3277, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.14879568492513717, |
| "grad_norm": 1.102238416671753, |
| "learning_rate": 3.299158280845478e-05, |
| "loss": 1.3698, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.14879568492513717, |
| "eval_loss": 1.3824845552444458, |
| "eval_runtime": 60.5144, |
| "eval_samples_per_second": 165.25, |
| "eval_steps_per_second": 5.172, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.15065563098670137, |
| "grad_norm": 1.0498243570327759, |
| "learning_rate": 3.292222524164277e-05, |
| "loss": 1.3825, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.1525155770482656, |
| "grad_norm": 1.1161103248596191, |
| "learning_rate": 3.2851816750231135e-05, |
| "loss": 1.4245, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.15437552310982983, |
| "grad_norm": 1.0065839290618896, |
| "learning_rate": 3.278036214654672e-05, |
| "loss": 1.3448, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.15623546917139403, |
| "grad_norm": 1.109067440032959, |
| "learning_rate": 3.2707866314416786e-05, |
| "loss": 1.4031, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.15809541523295825, |
| "grad_norm": 1.0229747295379639, |
| "learning_rate": 3.263433420883514e-05, |
| "loss": 1.3632, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.15995536129452245, |
| "grad_norm": 0.9845519065856934, |
| "learning_rate": 3.255977085562354e-05, |
| "loss": 1.3494, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.16181530735608668, |
| "grad_norm": 1.106702446937561, |
| "learning_rate": 3.248418135108813e-05, |
| "loss": 1.3682, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.16367525341765088, |
| "grad_norm": 0.9956566095352173, |
| "learning_rate": 3.240757086167112e-05, |
| "loss": 1.3687, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.1655351994792151, |
| "grad_norm": 1.006332516670227, |
| "learning_rate": 3.2329944623597715e-05, |
| "loss": 1.3089, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.1673951455407793, |
| "grad_norm": 0.977312445640564, |
| "learning_rate": 3.2251307942518165e-05, |
| "loss": 1.3686, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1673951455407793, |
| "eval_loss": 1.3752514123916626, |
| "eval_runtime": 60.5893, |
| "eval_samples_per_second": 165.046, |
| "eval_steps_per_second": 5.166, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.16925509160234353, |
| "grad_norm": 1.0094255208969116, |
| "learning_rate": 3.2171666193145165e-05, |
| "loss": 1.3994, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.17111503766390776, |
| "grad_norm": 1.0625929832458496, |
| "learning_rate": 3.209102481888649e-05, |
| "loss": 1.404, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.17297498372547196, |
| "grad_norm": 1.0466253757476807, |
| "learning_rate": 3.2009389331472956e-05, |
| "loss": 1.3521, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.1748349297870362, |
| "grad_norm": 1.0407981872558594, |
| "learning_rate": 3.192676531058168e-05, |
| "loss": 1.3876, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.17669487584860039, |
| "grad_norm": 1.0586848258972168, |
| "learning_rate": 3.184315840345474e-05, |
| "loss": 1.3607, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.1785548219101646, |
| "grad_norm": 1.0073552131652832, |
| "learning_rate": 3.175857432451318e-05, |
| "loss": 1.3726, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.1804147679717288, |
| "grad_norm": 1.0010629892349243, |
| "learning_rate": 3.167301885496645e-05, |
| "loss": 1.3581, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.18227471403329304, |
| "grad_norm": 1.002271056175232, |
| "learning_rate": 3.158649784241722e-05, |
| "loss": 1.3611, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.18413466009485724, |
| "grad_norm": 0.9680567979812622, |
| "learning_rate": 3.149901720046178e-05, |
| "loss": 1.363, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.18599460615642147, |
| "grad_norm": 1.0661317110061646, |
| "learning_rate": 3.1410582908285814e-05, |
| "loss": 1.3698, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.18599460615642147, |
| "eval_loss": 1.368044137954712, |
| "eval_runtime": 60.5679, |
| "eval_samples_per_second": 165.104, |
| "eval_steps_per_second": 5.168, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.18785455221798567, |
| "grad_norm": 1.1560924053192139, |
| "learning_rate": 3.132120101025571e-05, |
| "loss": 1.401, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.1897144982795499, |
| "grad_norm": 1.0191872119903564, |
| "learning_rate": 3.1230877615505466e-05, |
| "loss": 1.3452, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.19157444434111412, |
| "grad_norm": 1.4533931016921997, |
| "learning_rate": 3.113961889751914e-05, |
| "loss": 1.3616, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.19343439040267832, |
| "grad_norm": 0.9897879362106323, |
| "learning_rate": 3.104743109370887e-05, |
| "loss": 1.3048, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.19529433646424255, |
| "grad_norm": 1.078536868095398, |
| "learning_rate": 3.09543205049886e-05, |
| "loss": 1.3072, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.19715428252580675, |
| "grad_norm": 1.0671093463897705, |
| "learning_rate": 3.0860293495343384e-05, |
| "loss": 1.369, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.19901422858737097, |
| "grad_norm": 1.077250599861145, |
| "learning_rate": 3.076535649139443e-05, |
| "loss": 1.3864, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.20087417464893517, |
| "grad_norm": 1.0758692026138306, |
| "learning_rate": 3.0669515981959844e-05, |
| "loss": 1.328, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.2027341207104994, |
| "grad_norm": 1.1256663799285889, |
| "learning_rate": 3.057277851761114e-05, |
| "loss": 1.3823, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.2045940667720636, |
| "grad_norm": 1.2285213470458984, |
| "learning_rate": 3.0475150710225507e-05, |
| "loss": 1.3729, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.2045940667720636, |
| "eval_loss": 1.3673256635665894, |
| "eval_runtime": 60.4936, |
| "eval_samples_per_second": 165.307, |
| "eval_steps_per_second": 5.174, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.20645401283362783, |
| "grad_norm": 1.0815597772598267, |
| "learning_rate": 3.0376639232533898e-05, |
| "loss": 1.3791, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.20831395889519205, |
| "grad_norm": 1.1154593229293823, |
| "learning_rate": 3.0277250817664945e-05, |
| "loss": 1.4125, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.21017390495675625, |
| "grad_norm": 1.1166361570358276, |
| "learning_rate": 3.017699225868479e-05, |
| "loss": 1.3984, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.21203385101832048, |
| "grad_norm": 1.0057345628738403, |
| "learning_rate": 3.007587040813276e-05, |
| "loss": 1.3763, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.21389379707988468, |
| "grad_norm": 1.091741681098938, |
| "learning_rate": 2.9973892177553013e-05, |
| "loss": 1.3778, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.2157537431414489, |
| "grad_norm": 1.0506634712219238, |
| "learning_rate": 2.987106453702215e-05, |
| "loss": 1.3768, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.2176136892030131, |
| "grad_norm": 1.0511835813522339, |
| "learning_rate": 2.9767394514672807e-05, |
| "loss": 1.3113, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.21947363526457733, |
| "grad_norm": 1.0758532285690308, |
| "learning_rate": 2.9662889196213302e-05, |
| "loss": 1.433, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.22133358132614153, |
| "grad_norm": 1.0389028787612915, |
| "learning_rate": 2.955755572444333e-05, |
| "loss": 1.3394, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.22319352738770576, |
| "grad_norm": 0.9888710379600525, |
| "learning_rate": 2.9451401298765766e-05, |
| "loss": 1.3657, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.22319352738770576, |
| "eval_loss": 1.358090877532959, |
| "eval_runtime": 60.5018, |
| "eval_samples_per_second": 165.284, |
| "eval_steps_per_second": 5.173, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.22505347344926996, |
| "grad_norm": 1.0805813074111938, |
| "learning_rate": 2.9344433174694606e-05, |
| "loss": 1.3697, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.22691341951083419, |
| "grad_norm": 0.9579132199287415, |
| "learning_rate": 2.9236658663359032e-05, |
| "loss": 1.3599, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.2287733655723984, |
| "grad_norm": 1.103060007095337, |
| "learning_rate": 2.912808513100373e-05, |
| "loss": 1.3386, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.2306333116339626, |
| "grad_norm": 0.9141831994056702, |
| "learning_rate": 2.901871999848541e-05, |
| "loss": 1.3891, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.23249325769552684, |
| "grad_norm": 0.9142250418663025, |
| "learning_rate": 2.8908570740765607e-05, |
| "loss": 1.3781, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.23435320375709104, |
| "grad_norm": 0.952091634273529, |
| "learning_rate": 2.8797644886399776e-05, |
| "loss": 1.3449, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.23621314981865527, |
| "grad_norm": 1.228947401046753, |
| "learning_rate": 2.8685950017022712e-05, |
| "loss": 1.3673, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.23807309588021947, |
| "grad_norm": 0.9899388551712036, |
| "learning_rate": 2.857349376683036e-05, |
| "loss": 1.3736, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.2399330419417837, |
| "grad_norm": 1.0279674530029297, |
| "learning_rate": 2.8460283822058048e-05, |
| "loss": 1.3377, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.2417929880033479, |
| "grad_norm": 1.1311445236206055, |
| "learning_rate": 2.8346327920455112e-05, |
| "loss": 1.4113, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.2417929880033479, |
| "eval_loss": 1.3546315431594849, |
| "eval_runtime": 60.4585, |
| "eval_samples_per_second": 165.403, |
| "eval_steps_per_second": 5.177, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.24365293406491212, |
| "grad_norm": 1.0192480087280273, |
| "learning_rate": 2.8231633850756056e-05, |
| "loss": 1.4002, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.24551288012647632, |
| "grad_norm": 0.9501408338546753, |
| "learning_rate": 2.8116209452148195e-05, |
| "loss": 1.3385, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.24737282618804055, |
| "grad_norm": 0.9931688904762268, |
| "learning_rate": 2.800006261373584e-05, |
| "loss": 1.3638, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.24923277224960477, |
| "grad_norm": 1.0014399290084839, |
| "learning_rate": 2.7883201274001122e-05, |
| "loss": 1.4008, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.25109271831116897, |
| "grad_norm": 0.9573982357978821, |
| "learning_rate": 2.7765633420261374e-05, |
| "loss": 1.3607, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.25295266437273317, |
| "grad_norm": 1.4360772371292114, |
| "learning_rate": 2.7647367088123233e-05, |
| "loss": 1.3409, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.2548126104342974, |
| "grad_norm": 0.9750449061393738, |
| "learning_rate": 2.7528410360933393e-05, |
| "loss": 1.3392, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.2566725564958616, |
| "grad_norm": 1.0708160400390625, |
| "learning_rate": 2.740877136922615e-05, |
| "loss": 1.3225, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.2585325025574258, |
| "grad_norm": 0.9025946259498596, |
| "learning_rate": 2.728845829016766e-05, |
| "loss": 1.3537, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.26039244861899, |
| "grad_norm": 0.9979115128517151, |
| "learning_rate": 2.7167479346997062e-05, |
| "loss": 1.3367, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.26039244861899, |
| "eval_loss": 1.350304126739502, |
| "eval_runtime": 60.5478, |
| "eval_samples_per_second": 165.159, |
| "eval_steps_per_second": 5.169, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.2622523946805543, |
| "grad_norm": 0.9901404976844788, |
| "learning_rate": 2.7045842808464416e-05, |
| "loss": 1.3596, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.2641123407421185, |
| "grad_norm": 1.103663444519043, |
| "learning_rate": 2.692355698826556e-05, |
| "loss": 1.3654, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.2659722868036827, |
| "grad_norm": 0.9586177468299866, |
| "learning_rate": 2.680063024447386e-05, |
| "loss": 1.3472, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.26783223286524693, |
| "grad_norm": 0.9534765481948853, |
| "learning_rate": 2.6677070978968968e-05, |
| "loss": 1.3306, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.26969217892681113, |
| "grad_norm": 1.0054278373718262, |
| "learning_rate": 2.655288763686255e-05, |
| "loss": 1.392, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.27155212498837533, |
| "grad_norm": 1.0037821531295776, |
| "learning_rate": 2.642808870592108e-05, |
| "loss": 1.3624, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.27341207104993953, |
| "grad_norm": 1.0708328485488892, |
| "learning_rate": 2.6302682715985714e-05, |
| "loss": 1.3319, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.2752720171115038, |
| "grad_norm": 1.0329240560531616, |
| "learning_rate": 2.617667823838928e-05, |
| "loss": 1.3502, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.277131963173068, |
| "grad_norm": 0.9648259878158569, |
| "learning_rate": 2.6050083885370444e-05, |
| "loss": 1.358, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.2789919092346322, |
| "grad_norm": 1.043942928314209, |
| "learning_rate": 2.592290830948507e-05, |
| "loss": 1.3254, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2789919092346322, |
| "eval_loss": 1.3463472127914429, |
| "eval_runtime": 60.4768, |
| "eval_samples_per_second": 165.353, |
| "eval_steps_per_second": 5.176, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2808518552961964, |
| "grad_norm": 1.074987530708313, |
| "learning_rate": 2.579516020301484e-05, |
| "loss": 1.3848, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.28271180135776064, |
| "grad_norm": 1.0375388860702515, |
| "learning_rate": 2.5666848297373133e-05, |
| "loss": 1.4132, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.28457174741932484, |
| "grad_norm": 0.9844212532043457, |
| "learning_rate": 2.553798136250826e-05, |
| "loss": 1.3713, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.28643169348088904, |
| "grad_norm": 1.021017074584961, |
| "learning_rate": 2.540856820630404e-05, |
| "loss": 1.3627, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.2882916395424533, |
| "grad_norm": 1.0512734651565552, |
| "learning_rate": 2.5278617673977793e-05, |
| "loss": 1.3959, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.2901515856040175, |
| "grad_norm": 1.0537786483764648, |
| "learning_rate": 2.514813864747578e-05, |
| "loss": 1.3661, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.2920115316655817, |
| "grad_norm": 1.0074138641357422, |
| "learning_rate": 2.5017140044866143e-05, |
| "loss": 1.3438, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.2938714777271459, |
| "grad_norm": 0.9245646595954895, |
| "learning_rate": 2.488563081972936e-05, |
| "loss": 1.3232, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.29573142378871015, |
| "grad_norm": 1.0348320007324219, |
| "learning_rate": 2.4753619960546277e-05, |
| "loss": 1.3259, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.29759136985027435, |
| "grad_norm": 1.0290751457214355, |
| "learning_rate": 2.4621116490083764e-05, |
| "loss": 1.3328, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.29759136985027435, |
| "eval_loss": 1.3428822755813599, |
| "eval_runtime": 60.4759, |
| "eval_samples_per_second": 165.355, |
| "eval_steps_per_second": 5.176, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.29945131591183854, |
| "grad_norm": 0.9886801838874817, |
| "learning_rate": 2.4488129464778016e-05, |
| "loss": 1.3618, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.30131126197340274, |
| "grad_norm": 1.0141544342041016, |
| "learning_rate": 2.4354667974115556e-05, |
| "loss": 1.2996, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.303171208034967, |
| "grad_norm": 1.0118813514709473, |
| "learning_rate": 2.4220741140011997e-05, |
| "loss": 1.3632, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.3050311540965312, |
| "grad_norm": 1.0116174221038818, |
| "learning_rate": 2.4086358116188535e-05, |
| "loss": 1.3208, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.3068911001580954, |
| "grad_norm": 0.9581089019775391, |
| "learning_rate": 2.395152808754635e-05, |
| "loss": 1.3489, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.30875104621965965, |
| "grad_norm": 1.0313812494277954, |
| "learning_rate": 2.3816260269538798e-05, |
| "loss": 1.3304, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.31061099228122385, |
| "grad_norm": 0.9710265398025513, |
| "learning_rate": 2.368056390754155e-05, |
| "loss": 1.32, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.31247093834278805, |
| "grad_norm": 0.9445378184318542, |
| "learning_rate": 2.35444482762207e-05, |
| "loss": 1.3755, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.31433088440435225, |
| "grad_norm": 0.9779049158096313, |
| "learning_rate": 2.340792267889885e-05, |
| "loss": 1.3321, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.3161908304659165, |
| "grad_norm": 1.0178042650222778, |
| "learning_rate": 2.3270996446919208e-05, |
| "loss": 1.3845, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.3161908304659165, |
| "eval_loss": 1.3410464525222778, |
| "eval_runtime": 60.4741, |
| "eval_samples_per_second": 165.36, |
| "eval_steps_per_second": 5.176, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.3180507765274807, |
| "grad_norm": 0.938162624835968, |
| "learning_rate": 2.313367893900785e-05, |
| "loss": 1.3115, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.3199107225890449, |
| "grad_norm": 0.9968370199203491, |
| "learning_rate": 2.2995979540634033e-05, |
| "loss": 1.3254, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.32177066865060916, |
| "grad_norm": 1.0070589780807495, |
| "learning_rate": 2.2857907663368726e-05, |
| "loss": 1.4028, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.32363061471217336, |
| "grad_norm": 1.0026954412460327, |
| "learning_rate": 2.2719472744241337e-05, |
| "loss": 1.3736, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.32549056077373756, |
| "grad_norm": 0.923169732093811, |
| "learning_rate": 2.258068424509469e-05, |
| "loss": 1.3654, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.32735050683530176, |
| "grad_norm": 0.9882062077522278, |
| "learning_rate": 2.244155165193835e-05, |
| "loss": 1.3451, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.329210452896866, |
| "grad_norm": 1.023770809173584, |
| "learning_rate": 2.2302084474300236e-05, |
| "loss": 1.3042, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.3310703989584302, |
| "grad_norm": 0.9432891011238098, |
| "learning_rate": 2.2162292244576682e-05, |
| "loss": 1.3351, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.3329303450199944, |
| "grad_norm": 1.0601178407669067, |
| "learning_rate": 2.202218451738089e-05, |
| "loss": 1.4111, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.3347902910815586, |
| "grad_norm": 1.7586909532546997, |
| "learning_rate": 2.1881770868889913e-05, |
| "loss": 1.3708, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.3347902910815586, |
| "eval_loss": 1.336167812347412, |
| "eval_runtime": 60.4785, |
| "eval_samples_per_second": 165.348, |
| "eval_steps_per_second": 5.175, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.33665023714312287, |
| "grad_norm": 1.055015206336975, |
| "learning_rate": 2.1741060896190096e-05, |
| "loss": 1.3273, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.33851018320468707, |
| "grad_norm": 1.1004860401153564, |
| "learning_rate": 2.160006421662117e-05, |
| "loss": 1.3965, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.34037012926625126, |
| "grad_norm": 0.9952622056007385, |
| "learning_rate": 2.1458790467118895e-05, |
| "loss": 1.3419, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.3422300753278155, |
| "grad_norm": 1.0129551887512207, |
| "learning_rate": 2.131724930355637e-05, |
| "loss": 1.3443, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.3440900213893797, |
| "grad_norm": 1.0409715175628662, |
| "learning_rate": 2.117545040008412e-05, |
| "loss": 1.3601, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.3459499674509439, |
| "grad_norm": 1.0580016374588013, |
| "learning_rate": 2.1033403448468844e-05, |
| "loss": 1.3321, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.3478099135125081, |
| "grad_norm": 1.0712617635726929, |
| "learning_rate": 2.089111815743099e-05, |
| "loss": 1.3727, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.3496698595740724, |
| "grad_norm": 0.9753930568695068, |
| "learning_rate": 2.074860425198119e-05, |
| "loss": 1.3014, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.35152980563563657, |
| "grad_norm": 0.9435998201370239, |
| "learning_rate": 2.0605871472755586e-05, |
| "loss": 1.3518, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.35338975169720077, |
| "grad_norm": 1.0743812322616577, |
| "learning_rate": 2.046292957535004e-05, |
| "loss": 1.3012, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.35338975169720077, |
| "eval_loss": 1.3357957601547241, |
| "eval_runtime": 60.4881, |
| "eval_samples_per_second": 165.322, |
| "eval_steps_per_second": 5.175, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.35524969775876497, |
| "grad_norm": 0.9649530649185181, |
| "learning_rate": 2.0319788329653343e-05, |
| "loss": 1.3176, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.3571096438203292, |
| "grad_norm": 1.0349267721176147, |
| "learning_rate": 2.0176457519179516e-05, |
| "loss": 1.3752, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.3589695898818934, |
| "grad_norm": 1.021322250366211, |
| "learning_rate": 2.0032946940399056e-05, |
| "loss": 1.3736, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.3608295359434576, |
| "grad_norm": 1.0290499925613403, |
| "learning_rate": 1.9889266402069386e-05, |
| "loss": 1.3687, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.3626894820050219, |
| "grad_norm": 1.0723743438720703, |
| "learning_rate": 1.974542572456445e-05, |
| "loss": 1.3196, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.3645494280665861, |
| "grad_norm": 0.9752843379974365, |
| "learning_rate": 1.9601434739203483e-05, |
| "loss": 1.3734, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.3664093741281503, |
| "grad_norm": 1.0339841842651367, |
| "learning_rate": 1.945730328757906e-05, |
| "loss": 1.3315, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.3682693201897145, |
| "grad_norm": 1.0285077095031738, |
| "learning_rate": 1.9313041220884443e-05, |
| "loss": 1.3427, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.37012926625127873, |
| "grad_norm": 1.0121210813522339, |
| "learning_rate": 1.9168658399240265e-05, |
| "loss": 1.3379, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.37198921231284293, |
| "grad_norm": 0.9975342154502869, |
| "learning_rate": 1.9024164691020593e-05, |
| "loss": 1.3238, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.37198921231284293, |
| "eval_loss": 1.330836296081543, |
| "eval_runtime": 60.528, |
| "eval_samples_per_second": 165.213, |
| "eval_steps_per_second": 5.171, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.37384915837440713, |
| "grad_norm": 1.1997116804122925, |
| "learning_rate": 1.8879569972178443e-05, |
| "loss": 1.3633, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.37570910443597133, |
| "grad_norm": 0.9928303360939026, |
| "learning_rate": 1.8734884125570776e-05, |
| "loss": 1.367, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.3775690504975356, |
| "grad_norm": 1.1708087921142578, |
| "learning_rate": 1.859011704028302e-05, |
| "loss": 1.3382, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.3794289965590998, |
| "grad_norm": 1.06816565990448, |
| "learning_rate": 1.8445278610953146e-05, |
| "loss": 1.3605, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.381288942620664, |
| "grad_norm": 1.0174709558486938, |
| "learning_rate": 1.8300378737095408e-05, |
| "loss": 1.3561, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.38314888868222824, |
| "grad_norm": 1.0002470016479492, |
| "learning_rate": 1.8155427322423704e-05, |
| "loss": 1.3546, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.38500883474379244, |
| "grad_norm": 1.0101186037063599, |
| "learning_rate": 1.8010434274174678e-05, |
| "loss": 1.3915, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.38686878080535664, |
| "grad_norm": 1.0798920392990112, |
| "learning_rate": 1.786540950243058e-05, |
| "loss": 1.3456, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.38872872686692084, |
| "grad_norm": 1.0295590162277222, |
| "learning_rate": 1.772036291944191e-05, |
| "loss": 1.2886, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.3905886729284851, |
| "grad_norm": 1.3735235929489136, |
| "learning_rate": 1.7575304438949958e-05, |
| "loss": 1.3724, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3905886729284851, |
| "eval_loss": 1.3277840614318848, |
| "eval_runtime": 60.4768, |
| "eval_samples_per_second": 165.353, |
| "eval_steps_per_second": 5.176, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3924486189900493, |
| "grad_norm": 1.078456997871399, |
| "learning_rate": 1.743024397550916e-05, |
| "loss": 1.3861, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.3943085650516135, |
| "grad_norm": 1.0125305652618408, |
| "learning_rate": 1.7285191443809507e-05, |
| "loss": 1.3671, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.3961685111131777, |
| "grad_norm": 0.9944830536842346, |
| "learning_rate": 1.714015675799886e-05, |
| "loss": 1.3546, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.39802845717474195, |
| "grad_norm": 1.0055299997329712, |
| "learning_rate": 1.699514983100534e-05, |
| "loss": 1.318, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.39988840323630614, |
| "grad_norm": 0.9434425830841064, |
| "learning_rate": 1.6850180573859786e-05, |
| "loss": 1.3336, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.40174834929787034, |
| "grad_norm": 1.0011420249938965, |
| "learning_rate": 1.6705258895018352e-05, |
| "loss": 1.3028, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.4036082953594346, |
| "grad_norm": 1.0826334953308105, |
| "learning_rate": 1.6560394699685283e-05, |
| "loss": 1.3033, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.4054682414209988, |
| "grad_norm": 0.9963647723197937, |
| "learning_rate": 1.6415597889135897e-05, |
| "loss": 1.3299, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.407328187482563, |
| "grad_norm": 0.9412882924079895, |
| "learning_rate": 1.6270878360039855e-05, |
| "loss": 1.3982, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.4091881335441272, |
| "grad_norm": 1.145696759223938, |
| "learning_rate": 1.6126246003784744e-05, |
| "loss": 1.3216, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.4091881335441272, |
| "eval_loss": 1.3285400867462158, |
| "eval_runtime": 60.8136, |
| "eval_samples_per_second": 164.437, |
| "eval_steps_per_second": 5.147, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.41104807960569145, |
| "grad_norm": 0.9899356961250305, |
| "learning_rate": 1.598171070579997e-05, |
| "loss": 1.3185, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.41290802566725565, |
| "grad_norm": 1.0239601135253906, |
| "learning_rate": 1.583728234488117e-05, |
| "loss": 1.3312, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.41476797172881985, |
| "grad_norm": 1.0883080959320068, |
| "learning_rate": 1.569297079251496e-05, |
| "loss": 1.251, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.4166279177903841, |
| "grad_norm": 0.9925746321678162, |
| "learning_rate": 1.5548785912204247e-05, |
| "loss": 1.3591, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.4184878638519483, |
| "grad_norm": 1.1945092678070068, |
| "learning_rate": 1.5404737558794072e-05, |
| "loss": 1.3444, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.4203478099135125, |
| "grad_norm": 1.1621458530426025, |
| "learning_rate": 1.526083557779805e-05, |
| "loss": 1.3419, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.4222077559750767, |
| "grad_norm": 1.0666788816452026, |
| "learning_rate": 1.511708980472542e-05, |
| "loss": 1.3351, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.42406770203664096, |
| "grad_norm": 0.9614555835723877, |
| "learning_rate": 1.4973510064408831e-05, |
| "loss": 1.2855, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.42592764809820516, |
| "grad_norm": 1.0012832880020142, |
| "learning_rate": 1.4830106170332813e-05, |
| "loss": 1.4136, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.42778759415976936, |
| "grad_norm": 1.2083728313446045, |
| "learning_rate": 1.4686887923963032e-05, |
| "loss": 1.3486, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.42778759415976936, |
| "eval_loss": 1.3236939907073975, |
| "eval_runtime": 60.4717, |
| "eval_samples_per_second": 165.367, |
| "eval_steps_per_second": 5.176, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.42964754022133356, |
| "grad_norm": 0.9932472705841064, |
| "learning_rate": 1.4543865114076387e-05, |
| "loss": 1.2785, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.4315074862828978, |
| "grad_norm": 1.0794252157211304, |
| "learning_rate": 1.4401047516091949e-05, |
| "loss": 1.3629, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.433367432344462, |
| "grad_norm": 1.0356013774871826, |
| "learning_rate": 1.4258444891402823e-05, |
| "loss": 1.3503, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.4352273784060262, |
| "grad_norm": 1.0389708280563354, |
| "learning_rate": 1.4116066986708994e-05, |
| "loss": 1.3919, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.43708732446759047, |
| "grad_norm": 1.0182055234909058, |
| "learning_rate": 1.3973923533351102e-05, |
| "loss": 1.3258, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.43894727052915467, |
| "grad_norm": 0.9431582689285278, |
| "learning_rate": 1.3832024246645377e-05, |
| "loss": 1.3329, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.44080721659071886, |
| "grad_norm": 1.014979600906372, |
| "learning_rate": 1.3690378825219572e-05, |
| "loss": 1.3719, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.44266716265228306, |
| "grad_norm": 1.0422216653823853, |
| "learning_rate": 1.354899695035009e-05, |
| "loss": 1.3107, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.4445271087138473, |
| "grad_norm": 1.0085375308990479, |
| "learning_rate": 1.340788828530027e-05, |
| "loss": 1.3542, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.4463870547754115, |
| "grad_norm": 2.4849801063537598, |
| "learning_rate": 1.326706247465993e-05, |
| "loss": 1.3224, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.4463870547754115, |
| "eval_loss": 1.3226845264434814, |
| "eval_runtime": 60.4716, |
| "eval_samples_per_second": 165.367, |
| "eval_steps_per_second": 5.176, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.4482470008369757, |
| "grad_norm": 1.0490351915359497, |
| "learning_rate": 1.3126529143686158e-05, |
| "loss": 1.3868, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.4501069468985399, |
| "grad_norm": 0.9706118702888489, |
| "learning_rate": 1.2986297897645448e-05, |
| "loss": 1.2929, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.45196689296010417, |
| "grad_norm": 0.9651924967765808, |
| "learning_rate": 1.2846378321157197e-05, |
| "loss": 1.3361, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.45382683902166837, |
| "grad_norm": 0.9976927638053894, |
| "learning_rate": 1.270677997753859e-05, |
| "loss": 1.3169, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.45568678508323257, |
| "grad_norm": 1.0347989797592163, |
| "learning_rate": 1.256751240815098e-05, |
| "loss": 1.3577, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.4575467311447968, |
| "grad_norm": 1.0190398693084717, |
| "learning_rate": 1.242858513174774e-05, |
| "loss": 1.3067, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.459406677206361, |
| "grad_norm": 1.1658672094345093, |
| "learning_rate": 1.2290007643823672e-05, |
| "loss": 1.3642, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.4612666232679252, |
| "grad_norm": 0.9859827756881714, |
| "learning_rate": 1.2151789415965982e-05, |
| "loss": 1.3451, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.4631265693294894, |
| "grad_norm": 1.0628533363342285, |
| "learning_rate": 1.2013939895206955e-05, |
| "loss": 1.3878, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.4649865153910537, |
| "grad_norm": 1.0261117219924927, |
| "learning_rate": 1.187646850337822e-05, |
| "loss": 1.3263, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.4649865153910537, |
| "eval_loss": 1.319972276687622, |
| "eval_runtime": 60.4945, |
| "eval_samples_per_second": 165.304, |
| "eval_steps_per_second": 5.174, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.4668464614526179, |
| "grad_norm": 0.9808030128479004, |
| "learning_rate": 1.1739384636466793e-05, |
| "loss": 1.334, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.4687064075141821, |
| "grad_norm": 0.9631521105766296, |
| "learning_rate": 1.160269766397289e-05, |
| "loss": 1.393, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.4705663535757463, |
| "grad_norm": 1.0527286529541016, |
| "learning_rate": 1.146641692826951e-05, |
| "loss": 1.3541, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.47242629963731053, |
| "grad_norm": 0.9961625933647156, |
| "learning_rate": 1.1330551743963907e-05, |
| "loss": 1.3919, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.47428624569887473, |
| "grad_norm": 1.033630132675171, |
| "learning_rate": 1.1195111397260953e-05, |
| "loss": 1.324, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.47614619176043893, |
| "grad_norm": 0.9427865147590637, |
| "learning_rate": 1.1060105145328438e-05, |
| "loss": 1.3016, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.4780061378220032, |
| "grad_norm": 1.0001837015151978, |
| "learning_rate": 1.0925542215664338e-05, |
| "loss": 1.3122, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.4798660838835674, |
| "grad_norm": 0.9850189089775085, |
| "learning_rate": 1.0791431805466157e-05, |
| "loss": 1.3301, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.4817260299451316, |
| "grad_norm": 1.0945813655853271, |
| "learning_rate": 1.065778308100228e-05, |
| "loss": 1.3268, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.4835859760066958, |
| "grad_norm": 1.0691360235214233, |
| "learning_rate": 1.0524605176985496e-05, |
| "loss": 1.3294, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.4835859760066958, |
| "eval_loss": 1.3182451725006104, |
| "eval_runtime": 60.5291, |
| "eval_samples_per_second": 165.21, |
| "eval_steps_per_second": 5.171, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.48544592206826004, |
| "grad_norm": 1.0112754106521606, |
| "learning_rate": 1.0391907195948643e-05, |
| "loss": 1.2971, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.48730586812982424, |
| "grad_norm": 1.0204293727874756, |
| "learning_rate": 1.0259698207622443e-05, |
| "loss": 1.3557, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.48916581419138844, |
| "grad_norm": 0.9734729528427124, |
| "learning_rate": 1.0127987248315628e-05, |
| "loss": 1.2949, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.49102576025295264, |
| "grad_norm": 1.0576294660568237, |
| "learning_rate": 9.996783320297322e-06, |
| "loss": 1.3116, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.4928857063145169, |
| "grad_norm": 1.0640610456466675, |
| "learning_rate": 9.866095391181714e-06, |
| "loss": 1.3427, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.4947456523760811, |
| "grad_norm": 1.0691922903060913, |
| "learning_rate": 9.735932393315157e-06, |
| "loss": 1.3017, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.4966055984376453, |
| "grad_norm": 1.0166995525360107, |
| "learning_rate": 9.606303223165656e-06, |
| "loss": 1.3303, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.49846554449920955, |
| "grad_norm": 1.110524296760559, |
| "learning_rate": 9.477216740714798e-06, |
| "loss": 1.321, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.5003254905607737, |
| "grad_norm": 1.0372258424758911, |
| "learning_rate": 9.348681768852186e-06, |
| "loss": 1.3231, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.5021854366223379, |
| "grad_norm": 1.021887183189392, |
| "learning_rate": 9.220707092772407e-06, |
| "loss": 1.3761, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.5021854366223379, |
| "eval_loss": 1.3183324337005615, |
| "eval_runtime": 60.5168, |
| "eval_samples_per_second": 165.243, |
| "eval_steps_per_second": 5.172, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.5040453826839022, |
| "grad_norm": 1.0530022382736206, |
| "learning_rate": 9.093301459374576e-06, |
| "loss": 1.3057, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.5059053287454663, |
| "grad_norm": 1.042121171951294, |
| "learning_rate": 8.966473576664499e-06, |
| "loss": 1.2922, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.5077652748070306, |
| "grad_norm": 1.0158429145812988, |
| "learning_rate": 8.840232113159481e-06, |
| "loss": 1.3242, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.5096252208685949, |
| "grad_norm": 1.0740450620651245, |
| "learning_rate": 8.714585697295876e-06, |
| "loss": 1.3677, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.511485166930159, |
| "grad_norm": 1.0379616022109985, |
| "learning_rate": 8.589542916839287e-06, |
| "loss": 1.3317, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.5133451129917233, |
| "grad_norm": 1.1563701629638672, |
| "learning_rate": 8.465112318297662e-06, |
| "loss": 1.3267, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.5152050590532875, |
| "grad_norm": 0.9955624938011169, |
| "learning_rate": 8.34130240633713e-06, |
| "loss": 1.3064, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.5170650051148517, |
| "grad_norm": 0.9720199108123779, |
| "learning_rate": 8.218121643200707e-06, |
| "loss": 1.2534, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.5189249511764159, |
| "grad_norm": 1.0836621522903442, |
| "learning_rate": 8.095578448129925e-06, |
| "loss": 1.3382, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.52078489723798, |
| "grad_norm": 1.0937702655792236, |
| "learning_rate": 7.973681196789392e-06, |
| "loss": 1.379, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.52078489723798, |
| "eval_loss": 1.3146955966949463, |
| "eval_runtime": 60.6087, |
| "eval_samples_per_second": 164.993, |
| "eval_steps_per_second": 5.164, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.5226448432995443, |
| "grad_norm": 1.0201658010482788, |
| "learning_rate": 7.85243822069431e-06, |
| "loss": 1.3525, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.5245047893611086, |
| "grad_norm": 1.0252045392990112, |
| "learning_rate": 7.731857806641046e-06, |
| "loss": 1.3257, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.5263647354226727, |
| "grad_norm": 0.9879313707351685, |
| "learning_rate": 7.611948196140724e-06, |
| "loss": 1.3926, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.528224681484237, |
| "grad_norm": 1.0046051740646362, |
| "learning_rate": 7.492717584855942e-06, |
| "loss": 1.352, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.5300846275458012, |
| "grad_norm": 1.0227553844451904, |
| "learning_rate": 7.3741741220406e-06, |
| "loss": 1.3079, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.5319445736073654, |
| "grad_norm": 1.0626540184020996, |
| "learning_rate": 7.2563259099829175e-06, |
| "loss": 1.3412, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.5338045196689296, |
| "grad_norm": 1.0018831491470337, |
| "learning_rate": 7.1391810034516405e-06, |
| "loss": 1.3367, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.5356644657304939, |
| "grad_norm": 0.9997648000717163, |
| "learning_rate": 7.022747409145532e-06, |
| "loss": 1.314, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.537524411792058, |
| "grad_norm": 1.0522111654281616, |
| "learning_rate": 6.907033085146082e-06, |
| "loss": 1.3755, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.5393843578536223, |
| "grad_norm": 0.9937166571617126, |
| "learning_rate": 6.792045940373635e-06, |
| "loss": 1.328, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.5393843578536223, |
| "eval_loss": 1.3144030570983887, |
| "eval_runtime": 60.5569, |
| "eval_samples_per_second": 165.134, |
| "eval_steps_per_second": 5.169, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.5412443039151864, |
| "grad_norm": 1.0255683660507202, |
| "learning_rate": 6.677793834046793e-06, |
| "loss": 1.3236, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.5431042499767507, |
| "grad_norm": 0.9852287769317627, |
| "learning_rate": 6.564284575145255e-06, |
| "loss": 1.3339, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.5449641960383149, |
| "grad_norm": 1.0695271492004395, |
| "learning_rate": 6.451525921876091e-06, |
| "loss": 1.3292, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.5468241420998791, |
| "grad_norm": 1.1841660737991333, |
| "learning_rate": 6.339525581143464e-06, |
| "loss": 1.2773, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.5486840881614433, |
| "grad_norm": 1.0714248418807983, |
| "learning_rate": 6.2282912080218895e-06, |
| "loss": 1.3186, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.5505440342230076, |
| "grad_norm": 1.1358453035354614, |
| "learning_rate": 6.1178304052330156e-06, |
| "loss": 1.3164, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.5524039802845717, |
| "grad_norm": 1.0477732419967651, |
| "learning_rate": 6.008150722625978e-06, |
| "loss": 1.3006, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.554263926346136, |
| "grad_norm": 0.9401105642318726, |
| "learning_rate": 5.899259656661391e-06, |
| "loss": 1.3668, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.5561238724077002, |
| "grad_norm": 1.0811374187469482, |
| "learning_rate": 5.791164649898969e-06, |
| "loss": 1.2621, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.5579838184692644, |
| "grad_norm": 1.046583890914917, |
| "learning_rate": 5.683873090488836e-06, |
| "loss": 1.3526, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5579838184692644, |
| "eval_loss": 1.310549020767212, |
| "eval_runtime": 60.5321, |
| "eval_samples_per_second": 165.202, |
| "eval_steps_per_second": 5.171, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5598437645308286, |
| "grad_norm": 1.042725920677185, |
| "learning_rate": 5.577392311666558e-06, |
| "loss": 1.3255, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.5617037105923928, |
| "grad_norm": 1.0929292440414429, |
| "learning_rate": 5.471729591251926e-06, |
| "loss": 1.3171, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.563563656653957, |
| "grad_norm": 1.0681570768356323, |
| "learning_rate": 5.366892151151515e-06, |
| "loss": 1.3551, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.5654236027155213, |
| "grad_norm": 0.9610005617141724, |
| "learning_rate": 5.262887156865101e-06, |
| "loss": 1.3463, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.5672835487770854, |
| "grad_norm": 1.028685450553894, |
| "learning_rate": 5.159721716995887e-06, |
| "loss": 1.331, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.5691434948386497, |
| "grad_norm": 1.0659209489822388, |
| "learning_rate": 5.0574028827646464e-06, |
| "loss": 1.3453, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.5710034409002139, |
| "grad_norm": 1.0256808996200562, |
| "learning_rate": 4.955937647527789e-06, |
| "loss": 1.3523, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.5728633869617781, |
| "grad_norm": 1.0253880023956299, |
| "learning_rate": 4.855332946299358e-06, |
| "loss": 1.3594, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.5747233330233423, |
| "grad_norm": 0.9797042608261108, |
| "learning_rate": 4.755595655277047e-06, |
| "loss": 1.3301, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.5765832790849066, |
| "grad_norm": 1.0288350582122803, |
| "learning_rate": 4.656732591372208e-06, |
| "loss": 1.2953, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.5765832790849066, |
| "eval_loss": 1.3121490478515625, |
| "eval_runtime": 60.5529, |
| "eval_samples_per_second": 165.145, |
| "eval_steps_per_second": 5.169, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.5784432251464707, |
| "grad_norm": 1.0119401216506958, |
| "learning_rate": 4.558750511743937e-06, |
| "loss": 1.348, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.580303171208035, |
| "grad_norm": 1.0771074295043945, |
| "learning_rate": 4.461656113337223e-06, |
| "loss": 1.3214, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.5821631172695991, |
| "grad_norm": 1.0131157636642456, |
| "learning_rate": 4.365456032425219e-06, |
| "loss": 1.2896, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.5840230633311634, |
| "grad_norm": 0.9793763160705566, |
| "learning_rate": 4.270156844155667e-06, |
| "loss": 1.3405, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.5858830093927276, |
| "grad_norm": 1.0757414102554321, |
| "learning_rate": 4.175765062101498e-06, |
| "loss": 1.3704, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.5877429554542918, |
| "grad_norm": 1.182327151298523, |
| "learning_rate": 4.082287137815629e-06, |
| "loss": 1.3274, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.589602901515856, |
| "grad_norm": 1.0652798414230347, |
| "learning_rate": 3.989729460390014e-06, |
| "loss": 1.3884, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.5914628475774203, |
| "grad_norm": 1.0110604763031006, |
| "learning_rate": 3.8980983560189544e-06, |
| "loss": 1.2986, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.5933227936389844, |
| "grad_norm": 1.0287728309631348, |
| "learning_rate": 3.8074000875667173e-06, |
| "loss": 1.3227, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.5951827397005487, |
| "grad_norm": 1.0629034042358398, |
| "learning_rate": 3.7176408541394724e-06, |
| "loss": 1.3454, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.5951827397005487, |
| "eval_loss": 1.3092882633209229, |
| "eval_runtime": 60.6125, |
| "eval_samples_per_second": 164.983, |
| "eval_steps_per_second": 5.164, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.597042685762113, |
| "grad_norm": 1.0497337579727173, |
| "learning_rate": 3.6288267906615927e-06, |
| "loss": 1.2941, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.5989026318236771, |
| "grad_norm": 1.0062696933746338, |
| "learning_rate": 3.5409639674563414e-06, |
| "loss": 1.3119, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.6007625778852413, |
| "grad_norm": 0.9828023314476013, |
| "learning_rate": 3.4540583898309718e-06, |
| "loss": 1.299, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.6026225239468055, |
| "grad_norm": 0.9954901337623596, |
| "learning_rate": 3.3681159976662705e-06, |
| "loss": 1.3478, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.6044824700083697, |
| "grad_norm": 1.0962938070297241, |
| "learning_rate": 3.2831426650105854e-06, |
| "loss": 1.3267, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.606342416069934, |
| "grad_norm": 1.0761761665344238, |
| "learning_rate": 3.199144199678326e-06, |
| "loss": 1.3335, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.6082023621314981, |
| "grad_norm": 1.0441513061523438, |
| "learning_rate": 3.11612634285302e-06, |
| "loss": 1.3116, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.6100623081930624, |
| "grad_norm": 1.0409022569656372, |
| "learning_rate": 3.034094768694904e-06, |
| "loss": 1.3156, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.6119222542546267, |
| "grad_norm": 0.9353120923042297, |
| "learning_rate": 2.95305508395311e-06, |
| "loss": 1.2779, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.6137822003161908, |
| "grad_norm": 1.8940584659576416, |
| "learning_rate": 2.8730128275824325e-06, |
| "loss": 1.2904, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.6137822003161908, |
| "eval_loss": 1.3097484111785889, |
| "eval_runtime": 60.6389, |
| "eval_samples_per_second": 164.911, |
| "eval_steps_per_second": 5.162, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.615642146377755, |
| "grad_norm": 1.0142489671707153, |
| "learning_rate": 2.7939734703647734e-06, |
| "loss": 1.3105, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.6175020924393193, |
| "grad_norm": 0.9673045873641968, |
| "learning_rate": 2.7159424145352063e-06, |
| "loss": 1.3624, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.6193620385008834, |
| "grad_norm": 1.3291678428649902, |
| "learning_rate": 2.6389249934127475e-06, |
| "loss": 1.3605, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.6212219845624477, |
| "grad_norm": 1.0622340440750122, |
| "learning_rate": 2.5629264710358236e-06, |
| "loss": 1.3195, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.6230819306240118, |
| "grad_norm": 1.07737135887146, |
| "learning_rate": 2.4879520418024855e-06, |
| "loss": 1.3621, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.6249418766855761, |
| "grad_norm": 1.076094627380371, |
| "learning_rate": 2.4140068301153783e-06, |
| "loss": 1.3352, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.6268018227471404, |
| "grad_norm": 1.1228044033050537, |
| "learning_rate": 2.3410958900314987e-06, |
| "loss": 1.3171, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.6286617688087045, |
| "grad_norm": 1.0142266750335693, |
| "learning_rate": 2.2692242049167475e-06, |
| "loss": 1.3062, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.6305217148702688, |
| "grad_norm": 1.1721749305725098, |
| "learning_rate": 2.1983966871053323e-06, |
| "loss": 1.3482, |
| "step": 1695 |
| }, |
| { |
| "epoch": 0.632381660931833, |
| "grad_norm": 1.1963858604431152, |
| "learning_rate": 2.1286181775640126e-06, |
| "loss": 1.3803, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.632381660931833, |
| "eval_loss": 1.3107084035873413, |
| "eval_runtime": 60.7317, |
| "eval_samples_per_second": 164.659, |
| "eval_steps_per_second": 5.154, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.6342416069933972, |
| "grad_norm": 1.0231841802597046, |
| "learning_rate": 2.059893445561226e-06, |
| "loss": 1.313, |
| "step": 1705 |
| }, |
| { |
| "epoch": 0.6361015530549614, |
| "grad_norm": 1.0954822301864624, |
| "learning_rate": 1.9922271883411143e-06, |
| "loss": 1.3398, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.6379614991165257, |
| "grad_norm": 1.1705200672149658, |
| "learning_rate": 1.925624030802471e-06, |
| "loss": 1.3195, |
| "step": 1715 |
| }, |
| { |
| "epoch": 0.6398214451780898, |
| "grad_norm": 1.1075305938720703, |
| "learning_rate": 1.8600885251826436e-06, |
| "loss": 1.3313, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.6416813912396541, |
| "grad_norm": 1.020974040031433, |
| "learning_rate": 1.7956251507463883e-06, |
| "loss": 1.3352, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.6435413373012183, |
| "grad_norm": 1.1549155712127686, |
| "learning_rate": 1.7322383134797149e-06, |
| "loss": 1.361, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.6454012833627825, |
| "grad_norm": 1.1022095680236816, |
| "learning_rate": 1.6699323457887554e-06, |
| "loss": 1.3802, |
| "step": 1735 |
| }, |
| { |
| "epoch": 0.6472612294243467, |
| "grad_norm": 1.0258790254592896, |
| "learning_rate": 1.6087115062036328e-06, |
| "loss": 1.2892, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.6491211754859109, |
| "grad_norm": 1.0042780637741089, |
| "learning_rate": 1.5485799790874115e-06, |
| "loss": 1.35, |
| "step": 1745 |
| }, |
| { |
| "epoch": 0.6509811215474751, |
| "grad_norm": 1.0090981721878052, |
| "learning_rate": 1.4895418743500954e-06, |
| "loss": 1.3185, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.6509811215474751, |
| "eval_loss": 1.308115839958191, |
| "eval_runtime": 60.6449, |
| "eval_samples_per_second": 164.894, |
| "eval_steps_per_second": 5.161, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.6528410676090394, |
| "grad_norm": 1.059403657913208, |
| "learning_rate": 1.431601227167719e-06, |
| "loss": 1.3153, |
| "step": 1755 |
| }, |
| { |
| "epoch": 0.6547010136706035, |
| "grad_norm": 1.0298171043395996, |
| "learning_rate": 1.3747619977065534e-06, |
| "loss": 1.3642, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.6565609597321678, |
| "grad_norm": 1.0219703912734985, |
| "learning_rate": 1.3190280708524274e-06, |
| "loss": 1.356, |
| "step": 1765 |
| }, |
| { |
| "epoch": 0.658420905793732, |
| "grad_norm": 1.0099432468414307, |
| "learning_rate": 1.2644032559452095e-06, |
| "loss": 1.2847, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.6602808518552962, |
| "grad_norm": 1.0368894338607788, |
| "learning_rate": 1.2108912865184372e-06, |
| "loss": 1.3282, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.6621407979168604, |
| "grad_norm": 1.1284433603286743, |
| "learning_rate": 1.1584958200441366e-06, |
| "loss": 1.3546, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.6640007439784247, |
| "grad_norm": 1.024755597114563, |
| "learning_rate": 1.107220437682845e-06, |
| "loss": 1.3606, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.6658606900399888, |
| "grad_norm": 0.9858983159065247, |
| "learning_rate": 1.0570686440388318e-06, |
| "loss": 1.2959, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.6677206361015531, |
| "grad_norm": 1.0345178842544556, |
| "learning_rate": 1.0080438669205757e-06, |
| "loss": 1.3119, |
| "step": 1795 |
| }, |
| { |
| "epoch": 0.6695805821631172, |
| "grad_norm": 1.125440001487732, |
| "learning_rate": 9.601494571064706e-07, |
| "loss": 1.3397, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.6695805821631172, |
| "eval_loss": 1.308713674545288, |
| "eval_runtime": 60.5955, |
| "eval_samples_per_second": 165.029, |
| "eval_steps_per_second": 5.165, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.6714405282246815, |
| "grad_norm": 0.9926828742027283, |
| "learning_rate": 9.133886881158041e-07, |
| "loss": 1.3156, |
| "step": 1805 |
| }, |
| { |
| "epoch": 0.6733004742862457, |
| "grad_norm": 2.1787917613983154, |
| "learning_rate": 8.677647559850251e-07, |
| "loss": 1.3884, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.6751604203478099, |
| "grad_norm": 1.1709134578704834, |
| "learning_rate": 8.232807790492901e-07, |
| "loss": 1.3408, |
| "step": 1815 |
| }, |
| { |
| "epoch": 0.6770203664093741, |
| "grad_norm": 1.049757480621338, |
| "learning_rate": 7.799397977293321e-07, |
| "loss": 1.3641, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.6788803124709384, |
| "grad_norm": 1.0234988927841187, |
| "learning_rate": 7.377447743236496e-07, |
| "loss": 1.3226, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.6807402585325025, |
| "grad_norm": 0.9746132493019104, |
| "learning_rate": 6.966985928060477e-07, |
| "loss": 1.3427, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.6826002045940668, |
| "grad_norm": 3.015550374984741, |
| "learning_rate": 6.568040586285049e-07, |
| "loss": 1.3353, |
| "step": 1835 |
| }, |
| { |
| "epoch": 0.684460150655631, |
| "grad_norm": 1.0778768062591553, |
| "learning_rate": 6.180638985294406e-07, |
| "loss": 1.3186, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.6863200967171952, |
| "grad_norm": 1.1300712823867798, |
| "learning_rate": 5.804807603473371e-07, |
| "loss": 1.3452, |
| "step": 1845 |
| }, |
| { |
| "epoch": 0.6881800427787594, |
| "grad_norm": 1.057513952255249, |
| "learning_rate": 5.44057212839764e-07, |
| "loss": 1.2665, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.6881800427787594, |
| "eval_loss": 1.3100022077560425, |
| "eval_runtime": 60.5956, |
| "eval_samples_per_second": 165.028, |
| "eval_steps_per_second": 5.165, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.6900399888403236, |
| "grad_norm": 1.076949954032898, |
| "learning_rate": 5.08795745507812e-07, |
| "loss": 1.3476, |
| "step": 1855 |
| }, |
| { |
| "epoch": 0.6918999349018878, |
| "grad_norm": 0.9384456276893616, |
| "learning_rate": 4.746987684259339e-07, |
| "loss": 1.2809, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.6937598809634521, |
| "grad_norm": 0.9707076549530029, |
| "learning_rate": 4.417686120772182e-07, |
| "loss": 1.3307, |
| "step": 1865 |
| }, |
| { |
| "epoch": 0.6956198270250162, |
| "grad_norm": 1.1225857734680176, |
| "learning_rate": 4.100075271941094e-07, |
| "loss": 1.3795, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.6974797730865805, |
| "grad_norm": 1.0616652965545654, |
| "learning_rate": 3.794176846045729e-07, |
| "loss": 1.3137, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.6993397191481447, |
| "grad_norm": 1.0068029165267944, |
| "learning_rate": 3.500011750837112e-07, |
| "loss": 1.3503, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.7011996652097089, |
| "grad_norm": 1.0520777702331543, |
| "learning_rate": 3.21760009210876e-07, |
| "loss": 1.3875, |
| "step": 1885 |
| }, |
| { |
| "epoch": 0.7030596112712731, |
| "grad_norm": 1.0416043996810913, |
| "learning_rate": 2.946961172322425e-07, |
| "loss": 1.3565, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.7049195573328374, |
| "grad_norm": 1.0348520278930664, |
| "learning_rate": 2.6881134892887327e-07, |
| "loss": 1.307, |
| "step": 1895 |
| }, |
| { |
| "epoch": 0.7067795033944015, |
| "grad_norm": 1.0482661724090576, |
| "learning_rate": 2.441074734903027e-07, |
| "loss": 1.4315, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.7067795033944015, |
| "eval_loss": 1.3073337078094482, |
| "eval_runtime": 60.5911, |
| "eval_samples_per_second": 165.041, |
| "eval_steps_per_second": 5.166, |
| "step": 1900 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 2000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 3, |
| "early_stopping_threshold": 0.0001 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.700258141162177e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
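
The records above are machine-written Trainer state, so there is nothing to edit in them directly; what follows is only an illustrative note on how such a file is typically consumed. Below is a minimal sketch, assuming the JSON above is saved on disk as `trainer_state.json` (the conventional filename inside a Hugging Face checkpoint directory), that separates the evaluation records in `log_history` from the training-loss records and reports the lowest `eval_loss` seen. The path is an assumption, not something recorded in this file.

```python
import json

# Assumption: the state shown above has been written to this path.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training records (with "loss") and evaluation records
# (with "eval_loss"); keep only the evaluation entries.
evals = [entry for entry in state["log_history"] if "eval_loss" in entry]

for entry in evals:
    print(f"step {entry['step']:>5}  eval_loss {entry['eval_loss']:.4f}")

# Lowest eval_loss in the log; for the run above this lands at step 1900.
best = min(evals, key=lambda entry: entry["eval_loss"])
print(f"best eval_loss {best['eval_loss']:.4f} at step {best['step']}")
```

The trailer also records an `EarlyStoppingCallback` with `early_stopping_patience: 3` and `early_stopping_threshold: 0.0001`, alongside `logging_steps: 5`, `save_steps: 100`, `max_steps: 2000`, and a train batch size of 8, with evaluations appearing every 50 steps. A hypothetical sketch of the kind of configuration that would produce that callback state is shown below; everything not present in the file (the output directory, the model, the datasets) is a placeholder, and the exact spelling of the evaluation-strategy argument depends on the installed `transformers` version.

```python
from transformers import TrainingArguments, EarlyStoppingCallback

# Values below marked "matches ..." come from the state file; the rest are
# placeholders for illustration only.
args = TrainingArguments(
    output_dir="out",                  # placeholder: not recorded in the state file
    max_steps=2000,                    # matches "max_steps"
    per_device_train_batch_size=8,     # matches "train_batch_size"
    logging_steps=5,                   # matches "logging_steps"
    eval_strategy="steps",             # older transformers versions spell this evaluation_strategy
    eval_steps=50,                     # evaluations above occur every 50 steps
    save_steps=100,                    # matches "save_steps"
    load_best_model_at_end=True,       # required for early stopping on an eval metric
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Matches the recorded EarlyStoppingCallback args; passed to Trainer via callbacks=[...].
early_stop = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0001,
)
```

With `early_stopping_patience_counter` at 0 and `should_training_stop` still false at step 1900, the state above shows the run continuing toward its 2000-step limit rather than being halted early.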
|
|