{ "best_metric": 0.17754687368869781, "best_model_checkpoint": "results/checkpoint-25000", "epoch": 9.998720081914758, "eval_steps": 500, "global_step": 26040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.038397542557276336, "grad_norm": 1.0079567432403564, "learning_rate": 9.999643338380885e-06, "loss": 5.5723, "step": 100 }, { "epoch": 0.07679508511455267, "grad_norm": 0.6461474299430847, "learning_rate": 9.998558958654982e-06, "loss": 2.2782, "step": 200 }, { "epoch": 0.115192627671829, "grad_norm": 0.4909125566482544, "learning_rate": 9.996746982275233e-06, "loss": 1.8047, "step": 300 }, { "epoch": 0.15359017022910534, "grad_norm": 0.47547289729118347, "learning_rate": 9.994207672995245e-06, "loss": 1.5821, "step": 400 }, { "epoch": 0.19198771278638166, "grad_norm": 0.41358837485313416, "learning_rate": 9.99094140044013e-06, "loss": 1.4754, "step": 500 }, { "epoch": 0.19198771278638166, "eval_valid_loss": 1.4288749694824219, "eval_valid_runtime": 4.7117, "eval_valid_samples_per_second": 212.238, "eval_valid_steps_per_second": 6.792, "step": 500 }, { "epoch": 0.19198771278638166, "eval_valid_target_loss": 1.4590624570846558, "eval_valid_target_runtime": 4.684, "eval_valid_target_samples_per_second": 213.493, "eval_valid_target_steps_per_second": 6.832, "step": 500 }, { "epoch": 0.230385255343658, "grad_norm": 0.43229448795318604, "learning_rate": 9.986948640052719e-06, "loss": 1.4087, "step": 600 }, { "epoch": 0.26878279790093434, "grad_norm": 0.528977632522583, "learning_rate": 9.982229973024328e-06, "loss": 1.3245, "step": 700 }, { "epoch": 0.3071803404582107, "grad_norm": 0.5489594340324402, "learning_rate": 9.976786086210186e-06, "loss": 1.0455, "step": 800 }, { "epoch": 0.34557788301548703, "grad_norm": 0.5119125843048096, "learning_rate": 9.970617772029439e-06, "loss": 0.7605, "step": 900 }, { "epoch": 0.3839754255727633, "grad_norm": 0.5092576146125793, "learning_rate": 9.963725928349814e-06, "loss": 0.6005, "step": 1000 }, { "epoch": 0.3839754255727633, "eval_valid_loss": 0.49165624380111694, "eval_valid_runtime": 4.674, "eval_valid_samples_per_second": 213.951, "eval_valid_steps_per_second": 6.846, "step": 1000 }, { "epoch": 0.3839754255727633, "eval_valid_target_loss": 0.5142187476158142, "eval_valid_target_runtime": 4.6758, "eval_valid_target_samples_per_second": 213.869, "eval_valid_target_steps_per_second": 6.844, "step": 1000 }, { "epoch": 0.42237296813003966, "grad_norm": 0.43798330426216125, "learning_rate": 9.956111558356915e-06, "loss": 0.4887, "step": 1100 }, { "epoch": 0.460770510687316, "grad_norm": 0.3737218379974365, "learning_rate": 9.947775770408207e-06, "loss": 0.4307, "step": 1200 }, { "epoch": 0.49916805324459235, "grad_norm": 0.4303857386112213, "learning_rate": 9.938719777871674e-06, "loss": 0.4027, "step": 1300 }, { "epoch": 0.5375655958018687, "grad_norm": 0.3709202706813812, "learning_rate": 9.92894489894921e-06, "loss": 0.3799, "step": 1400 }, { "epoch": 0.575963138359145, "grad_norm": 0.3918135464191437, "learning_rate": 9.918452556484728e-06, "loss": 0.3633, "step": 1500 }, { "epoch": 0.575963138359145, "eval_valid_loss": 0.33228906989097595, "eval_valid_runtime": 4.7244, "eval_valid_samples_per_second": 211.669, "eval_valid_steps_per_second": 6.773, "step": 1500 }, { "epoch": 0.575963138359145, "eval_valid_target_loss": 0.3428671956062317, "eval_valid_target_runtime": 4.6595, "eval_valid_target_samples_per_second": 214.617, "eval_valid_target_steps_per_second": 6.868, "step": 1500 }, { "epoch": 0.6143606809164214, "grad_norm": 0.3580816686153412, "learning_rate": 9.907244277757053e-06, "loss": 0.3565, "step": 1600 }, { "epoch": 0.6527582234736977, "grad_norm": 0.34893009066581726, "learning_rate": 9.895321694257617e-06, "loss": 0.3443, "step": 1700 }, { "epoch": 0.6911557660309741, "grad_norm": 0.3050221800804138, "learning_rate": 9.882686541452967e-06, "loss": 0.3339, "step": 1800 }, { "epoch": 0.7295533085882504, "grad_norm": 0.3123306632041931, "learning_rate": 9.869340658532151e-06, "loss": 0.3278, "step": 1900 }, { "epoch": 0.7679508511455266, "grad_norm": 0.31590646505355835, "learning_rate": 9.85528598813901e-06, "loss": 0.32, "step": 2000 }, { "epoch": 0.7679508511455266, "eval_valid_loss": 0.29783594608306885, "eval_valid_runtime": 4.6776, "eval_valid_samples_per_second": 213.785, "eval_valid_steps_per_second": 6.841, "step": 2000 }, { "epoch": 0.7679508511455266, "eval_valid_target_loss": 0.3129218816757202, "eval_valid_target_runtime": 4.6598, "eval_valid_target_samples_per_second": 214.601, "eval_valid_target_steps_per_second": 6.867, "step": 2000 }, { "epoch": 0.806348393702803, "grad_norm": 0.2684693932533264, "learning_rate": 9.840524576089392e-06, "loss": 0.3194, "step": 2100 }, { "epoch": 0.8447459362600793, "grad_norm": 0.30888476967811584, "learning_rate": 9.82505857107337e-06, "loss": 0.3108, "step": 2200 }, { "epoch": 0.8831434788173557, "grad_norm": 0.2777215242385864, "learning_rate": 9.808890224342476e-06, "loss": 0.3105, "step": 2300 }, { "epoch": 0.921541021374632, "grad_norm": 0.30601397156715393, "learning_rate": 9.792021889381995e-06, "loss": 0.3055, "step": 2400 }, { "epoch": 0.9599385639319084, "grad_norm": 0.2972748875617981, "learning_rate": 9.774456021568404e-06, "loss": 0.3008, "step": 2500 }, { "epoch": 0.9599385639319084, "eval_valid_loss": 0.27842968702316284, "eval_valid_runtime": 4.68, "eval_valid_samples_per_second": 213.675, "eval_valid_steps_per_second": 6.838, "step": 2500 }, { "epoch": 0.9599385639319084, "eval_valid_target_loss": 0.2943359315395355, "eval_valid_target_runtime": 4.6764, "eval_valid_target_samples_per_second": 213.838, "eval_valid_target_steps_per_second": 6.843, "step": 2500 }, { "epoch": 0.9983361064891847, "grad_norm": 0.2891447842121124, "learning_rate": 9.756195177811953e-06, "loss": 0.2969, "step": 2600 }, { "epoch": 1.036733649046461, "grad_norm": 0.30049994587898254, "learning_rate": 9.737242016184486e-06, "loss": 0.2913, "step": 2700 }, { "epoch": 1.0751311916037374, "grad_norm": 0.25728341937065125, "learning_rate": 9.717599295532518e-06, "loss": 0.2911, "step": 2800 }, { "epoch": 1.1135287341610136, "grad_norm": 0.31619054079055786, "learning_rate": 9.697269875075667e-06, "loss": 0.2879, "step": 2900 }, { "epoch": 1.15192627671829, "grad_norm": 0.3005208671092987, "learning_rate": 9.676256713990448e-06, "loss": 0.2839, "step": 3000 }, { "epoch": 1.15192627671829, "eval_valid_loss": 0.2648593783378601, "eval_valid_runtime": 4.6888, "eval_valid_samples_per_second": 213.274, "eval_valid_steps_per_second": 6.825, "step": 3000 }, { "epoch": 1.15192627671829, "eval_valid_target_loss": 0.2817968726158142, "eval_valid_target_runtime": 4.6695, "eval_valid_target_samples_per_second": 214.155, "eval_valid_target_steps_per_second": 6.853, "step": 3000 }, { "epoch": 1.1903238192755663, "grad_norm": 0.24846772849559784, "learning_rate": 9.654562870979545e-06, "loss": 0.2803, "step": 3100 }, { "epoch": 1.2287213618328428, "grad_norm": 0.23501113057136536, "learning_rate": 9.632191503826574e-06, "loss": 0.278, "step": 3200 }, { "epoch": 1.267118904390119, "grad_norm": 0.27793240547180176, "learning_rate": 9.609145868936434e-06, "loss": 0.2776, "step": 3300 }, { "epoch": 1.3055164469473954, "grad_norm": 0.2599338889122009, "learning_rate": 9.5854293208613e-06, "loss": 0.275, "step": 3400 }, { "epoch": 1.3439139895046717, "grad_norm": 0.2431340515613556, "learning_rate": 9.561045311812335e-06, "loss": 0.2722, "step": 3500 }, { "epoch": 1.3439139895046717, "eval_valid_loss": 0.2545468807220459, "eval_valid_runtime": 4.7131, "eval_valid_samples_per_second": 212.177, "eval_valid_steps_per_second": 6.79, "step": 3500 }, { "epoch": 1.3439139895046717, "eval_valid_target_loss": 0.27326563000679016, "eval_valid_target_runtime": 4.6635, "eval_valid_target_samples_per_second": 214.433, "eval_valid_target_steps_per_second": 6.862, "step": 3500 }, { "epoch": 1.382311532061948, "grad_norm": 0.28658226132392883, "learning_rate": 9.535997391157174e-06, "loss": 0.2693, "step": 3600 }, { "epoch": 1.4207090746192244, "grad_norm": 0.26118528842926025, "learning_rate": 9.510289204903273e-06, "loss": 0.2667, "step": 3700 }, { "epoch": 1.4591066171765008, "grad_norm": 0.2761940062046051, "learning_rate": 9.483924495167204e-06, "loss": 0.2654, "step": 3800 }, { "epoch": 1.497504159733777, "grad_norm": 0.2712952792644501, "learning_rate": 9.456907099629933e-06, "loss": 0.2642, "step": 3900 }, { "epoch": 1.5359017022910533, "grad_norm": 0.23100949823856354, "learning_rate": 9.429240950978212e-06, "loss": 0.2622, "step": 4000 }, { "epoch": 1.5359017022910533, "eval_valid_loss": 0.24485936760902405, "eval_valid_runtime": 4.6751, "eval_valid_samples_per_second": 213.9, "eval_valid_steps_per_second": 6.845, "step": 4000 }, { "epoch": 1.5359017022910533, "eval_valid_target_loss": 0.2641640603542328, "eval_valid_target_runtime": 4.6656, "eval_valid_target_samples_per_second": 214.335, "eval_valid_target_steps_per_second": 6.859, "step": 4000 }, { "epoch": 1.5742992448483297, "grad_norm": 0.2676081359386444, "learning_rate": 9.400930076332126e-06, "loss": 0.2602, "step": 4100 }, { "epoch": 1.6126967874056062, "grad_norm": 0.24242335557937622, "learning_rate": 9.371978596658904e-06, "loss": 0.2573, "step": 4200 }, { "epoch": 1.6510943299628824, "grad_norm": 0.27868130803108215, "learning_rate": 9.342390726173065e-06, "loss": 0.2574, "step": 4300 }, { "epoch": 1.6894918725201586, "grad_norm": 0.2644180655479431, "learning_rate": 9.31217077172299e-06, "loss": 0.255, "step": 4400 }, { "epoch": 1.727889415077435, "grad_norm": 0.2352069914340973, "learning_rate": 9.281323132164013e-06, "loss": 0.2538, "step": 4500 }, { "epoch": 1.727889415077435, "eval_valid_loss": 0.2354765683412552, "eval_valid_runtime": 4.7068, "eval_valid_samples_per_second": 212.461, "eval_valid_steps_per_second": 6.799, "step": 4500 }, { "epoch": 1.727889415077435, "eval_valid_target_loss": 0.25502344965934753, "eval_valid_target_runtime": 4.6612, "eval_valid_target_samples_per_second": 214.536, "eval_valid_target_steps_per_second": 6.865, "step": 4500 }, { "epoch": 1.7662869576347113, "grad_norm": 0.28041261434555054, "learning_rate": 9.249852297718116e-06, "loss": 0.2507, "step": 4600 }, { "epoch": 1.8046845001919878, "grad_norm": 0.2735440135002136, "learning_rate": 9.217762849320334e-06, "loss": 0.2496, "step": 4700 }, { "epoch": 1.843082042749264, "grad_norm": 0.26316097378730774, "learning_rate": 9.185059457951933e-06, "loss": 0.2479, "step": 4800 }, { "epoch": 1.8814795853065402, "grad_norm": 0.23891638219356537, "learning_rate": 9.151746883960512e-06, "loss": 0.2457, "step": 4900 }, { "epoch": 1.9198771278638167, "grad_norm": 0.22432874143123627, "learning_rate": 9.117829976367072e-06, "loss": 0.2446, "step": 5000 }, { "epoch": 1.9198771278638167, "eval_valid_loss": 0.2283046841621399, "eval_valid_runtime": 4.6829, "eval_valid_samples_per_second": 213.544, "eval_valid_steps_per_second": 6.833, "step": 5000 }, { "epoch": 1.9198771278638167, "eval_valid_target_loss": 0.24809375405311584, "eval_valid_target_runtime": 4.6694, "eval_valid_target_samples_per_second": 214.162, "eval_valid_target_steps_per_second": 6.853, "step": 5000 }, { "epoch": 1.9582746704210932, "grad_norm": 0.27488961815834045, "learning_rate": 9.08331367216019e-06, "loss": 0.2434, "step": 5100 }, { "epoch": 1.9966722129783694, "grad_norm": 0.2284267097711563, "learning_rate": 9.048202995577383e-06, "loss": 0.24, "step": 5200 }, { "epoch": 2.0350697555356456, "grad_norm": 0.2710357904434204, "learning_rate": 9.012503057373769e-06, "loss": 0.2399, "step": 5300 }, { "epoch": 2.073467298092922, "grad_norm": 0.24398750066757202, "learning_rate": 8.976219054078147e-06, "loss": 0.2391, "step": 5400 }, { "epoch": 2.1118648406501985, "grad_norm": 0.24732039868831635, "learning_rate": 8.939356267236582e-06, "loss": 0.2374, "step": 5500 }, { "epoch": 2.1118648406501985, "eval_valid_loss": 0.22253906726837158, "eval_valid_runtime": 4.6969, "eval_valid_samples_per_second": 212.904, "eval_valid_steps_per_second": 6.813, "step": 5500 }, { "epoch": 2.1118648406501985, "eval_valid_target_loss": 0.24240624904632568, "eval_valid_target_runtime": 4.6761, "eval_valid_target_samples_per_second": 213.853, "eval_valid_target_steps_per_second": 6.843, "step": 5500 }, { "epoch": 2.1502623832074748, "grad_norm": 0.23949123919010162, "learning_rate": 8.901920062643607e-06, "loss": 0.2368, "step": 5600 }, { "epoch": 2.188659925764751, "grad_norm": 0.26010605692863464, "learning_rate": 8.863915889561188e-06, "loss": 0.2351, "step": 5700 }, { "epoch": 2.2270574683220272, "grad_norm": 0.2524034380912781, "learning_rate": 8.825349279925506e-06, "loss": 0.2333, "step": 5800 }, { "epoch": 2.265455010879304, "grad_norm": 0.24745632708072662, "learning_rate": 8.78622584754173e-06, "loss": 0.2323, "step": 5900 }, { "epoch": 2.30385255343658, "grad_norm": 0.2586907148361206, "learning_rate": 8.746551287266863e-06, "loss": 0.2312, "step": 6000 }, { "epoch": 2.30385255343658, "eval_valid_loss": 0.216859370470047, "eval_valid_runtime": 4.6709, "eval_valid_samples_per_second": 214.092, "eval_valid_steps_per_second": 6.851, "step": 6000 }, { "epoch": 2.30385255343658, "eval_valid_target_loss": 0.23771093785762787, "eval_valid_target_runtime": 4.6848, "eval_valid_target_samples_per_second": 213.455, "eval_valid_target_steps_per_second": 6.831, "step": 6000 }, { "epoch": 2.3422500959938564, "grad_norm": 0.24499697983264923, "learning_rate": 8.706331374180792e-06, "loss": 0.2301, "step": 6100 }, { "epoch": 2.3806476385511326, "grad_norm": 0.24237163364887238, "learning_rate": 8.665571962745655e-06, "loss": 0.2304, "step": 6200 }, { "epoch": 2.419045181108409, "grad_norm": 0.27395910024642944, "learning_rate": 8.624278985953665e-06, "loss": 0.2287, "step": 6300 }, { "epoch": 2.4574427236656855, "grad_norm": 0.2500033378601074, "learning_rate": 8.582458454463493e-06, "loss": 0.2279, "step": 6400 }, { "epoch": 2.4958402662229617, "grad_norm": 0.2605977952480316, "learning_rate": 8.540116455725346e-06, "loss": 0.2277, "step": 6500 }, { "epoch": 2.4958402662229617, "eval_valid_loss": 0.21196874976158142, "eval_valid_runtime": 4.6941, "eval_valid_samples_per_second": 213.035, "eval_valid_steps_per_second": 6.817, "step": 6500 }, { "epoch": 2.4958402662229617, "eval_valid_target_loss": 0.23328906297683716, "eval_valid_target_runtime": 4.6792, "eval_valid_target_samples_per_second": 213.712, "eval_valid_target_steps_per_second": 6.839, "step": 6500 }, { "epoch": 2.534237808780238, "grad_norm": 0.2220095992088318, "learning_rate": 8.497259153094875e-06, "loss": 0.2254, "step": 6600 }, { "epoch": 2.5726353513375146, "grad_norm": 0.24707047641277313, "learning_rate": 8.453892784936022e-06, "loss": 0.2239, "step": 6700 }, { "epoch": 2.611032893894791, "grad_norm": 0.23103290796279907, "learning_rate": 8.41002366371297e-06, "loss": 0.224, "step": 6800 }, { "epoch": 2.649430436452067, "grad_norm": 0.2249547839164734, "learning_rate": 8.36565817507127e-06, "loss": 0.2227, "step": 6900 }, { "epoch": 2.6878279790093433, "grad_norm": 0.24457262456417084, "learning_rate": 8.32080277690836e-06, "loss": 0.2209, "step": 7000 }, { "epoch": 2.6878279790093433, "eval_valid_loss": 0.20793749392032623, "eval_valid_runtime": 4.6727, "eval_valid_samples_per_second": 214.01, "eval_valid_steps_per_second": 6.848, "step": 7000 }, { "epoch": 2.6878279790093433, "eval_valid_target_loss": 0.22950781881809235, "eval_valid_target_runtime": 4.6848, "eval_valid_target_samples_per_second": 213.456, "eval_valid_target_steps_per_second": 6.831, "step": 7000 }, { "epoch": 2.7262255215666196, "grad_norm": 0.23176012933254242, "learning_rate": 8.275463998433537e-06, "loss": 0.2206, "step": 7100 }, { "epoch": 2.764623064123896, "grad_norm": 0.21723733842372894, "learning_rate": 8.229648439217552e-06, "loss": 0.2203, "step": 7200 }, { "epoch": 2.8030206066811725, "grad_norm": 0.2428179383277893, "learning_rate": 8.183362768231971e-06, "loss": 0.2192, "step": 7300 }, { "epoch": 2.8414181492384487, "grad_norm": 0.2162482738494873, "learning_rate": 8.136613722878437e-06, "loss": 0.2183, "step": 7400 }, { "epoch": 2.879815691795725, "grad_norm": 0.22231200337409973, "learning_rate": 8.08940810800796e-06, "loss": 0.2177, "step": 7500 }, { "epoch": 2.879815691795725, "eval_valid_loss": 0.20469531416893005, "eval_valid_runtime": 4.6819, "eval_valid_samples_per_second": 213.587, "eval_valid_steps_per_second": 6.835, "step": 7500 }, { "epoch": 2.879815691795725, "eval_valid_target_loss": 0.2264062464237213, "eval_valid_target_runtime": 4.6647, "eval_valid_target_samples_per_second": 214.376, "eval_valid_target_steps_per_second": 6.86, "step": 7500 }, { "epoch": 2.9182132343530016, "grad_norm": 0.2663327157497406, "learning_rate": 8.041752794930389e-06, "loss": 0.2172, "step": 7600 }, { "epoch": 2.956610776910278, "grad_norm": 0.2545444369316101, "learning_rate": 7.993654720414227e-06, "loss": 0.216, "step": 7700 }, { "epoch": 2.995008319467554, "grad_norm": 0.2252371460199356, "learning_rate": 7.9451208856769e-06, "loss": 0.2154, "step": 7800 }, { "epoch": 3.0334058620248303, "grad_norm": 0.2507840394973755, "learning_rate": 7.896158355365643e-06, "loss": 0.2151, "step": 7900 }, { "epoch": 3.0718034045821065, "grad_norm": 0.22570189833641052, "learning_rate": 7.846774256529178e-06, "loss": 0.2131, "step": 8000 }, { "epoch": 3.0718034045821065, "eval_valid_loss": 0.2014453113079071, "eval_valid_runtime": 4.6924, "eval_valid_samples_per_second": 213.111, "eval_valid_steps_per_second": 6.82, "step": 8000 }, { "epoch": 3.0718034045821065, "eval_valid_target_loss": 0.22346094250679016, "eval_valid_target_runtime": 4.6655, "eval_valid_target_samples_per_second": 214.339, "eval_valid_target_steps_per_second": 6.859, "step": 8000 }, { "epoch": 3.110200947139383, "grad_norm": 0.24750301241874695, "learning_rate": 7.796975777580276e-06, "loss": 0.2133, "step": 8100 }, { "epoch": 3.1485984896966595, "grad_norm": 0.2118765264749527, "learning_rate": 7.746770167249413e-06, "loss": 0.2124, "step": 8200 }, { "epoch": 3.1869960322539357, "grad_norm": 0.22295965254306793, "learning_rate": 7.696164733529628e-06, "loss": 0.2123, "step": 8300 }, { "epoch": 3.225393574811212, "grad_norm": 0.2226712554693222, "learning_rate": 7.645166842612766e-06, "loss": 0.2115, "step": 8400 }, { "epoch": 3.2637911173684886, "grad_norm": 0.22712872922420502, "learning_rate": 7.593783917817248e-06, "loss": 0.211, "step": 8500 }, { "epoch": 3.2637911173684886, "eval_valid_loss": 0.19893750548362732, "eval_valid_runtime": 4.6876, "eval_valid_samples_per_second": 213.327, "eval_valid_steps_per_second": 6.826, "step": 8500 }, { "epoch": 3.2637911173684886, "eval_valid_target_loss": 0.22138281166553497, "eval_valid_target_runtime": 4.6684, "eval_valid_target_samples_per_second": 214.206, "eval_valid_target_steps_per_second": 6.855, "step": 8500 }, { "epoch": 3.302188659925765, "grad_norm": 0.20663662254810333, "learning_rate": 7.5420234385075155e-06, "loss": 0.211, "step": 8600 }, { "epoch": 3.340586202483041, "grad_norm": 0.24639233946800232, "learning_rate": 7.489892939005333e-06, "loss": 0.2099, "step": 8700 }, { "epoch": 3.3789837450403173, "grad_norm": 0.21435491740703583, "learning_rate": 7.437400007493079e-06, "loss": 0.209, "step": 8800 }, { "epoch": 3.4173812875975935, "grad_norm": 0.21131959557533264, "learning_rate": 7.384552284909195e-06, "loss": 0.2081, "step": 8900 }, { "epoch": 3.45577883015487, "grad_norm": 0.2295517921447754, "learning_rate": 7.3313574638359734e-06, "loss": 0.2084, "step": 9000 }, { "epoch": 3.45577883015487, "eval_valid_loss": 0.19658593833446503, "eval_valid_runtime": 4.6935, "eval_valid_samples_per_second": 213.059, "eval_valid_steps_per_second": 6.818, "step": 9000 }, { "epoch": 3.45577883015487, "eval_valid_target_loss": 0.2188750058412552, "eval_valid_target_runtime": 4.6686, "eval_valid_target_samples_per_second": 214.199, "eval_valid_target_steps_per_second": 6.854, "step": 9000 }, { "epoch": 3.4941763727121464, "grad_norm": 0.2244088351726532, "learning_rate": 7.277823287379801e-06, "loss": 0.2084, "step": 9100 }, { "epoch": 3.5325739152694227, "grad_norm": 0.2267696112394333, "learning_rate": 7.2239575480440774e-06, "loss": 0.2085, "step": 9200 }, { "epoch": 3.5709714578266993, "grad_norm": 0.20846766233444214, "learning_rate": 7.169768086594913e-06, "loss": 0.2063, "step": 9300 }, { "epoch": 3.6093690003839756, "grad_norm": 0.23632733523845673, "learning_rate": 7.115262790919827e-06, "loss": 0.2068, "step": 9400 }, { "epoch": 3.647766542941252, "grad_norm": 0.20877471566200256, "learning_rate": 7.060449594879573e-06, "loss": 0.2059, "step": 9500 }, { "epoch": 3.647766542941252, "eval_valid_loss": 0.19441406428813934, "eval_valid_runtime": 4.6671, "eval_valid_samples_per_second": 214.264, "eval_valid_steps_per_second": 6.856, "step": 9500 }, { "epoch": 3.647766542941252, "eval_valid_target_loss": 0.21704687178134918, "eval_valid_target_runtime": 4.6648, "eval_valid_target_samples_per_second": 214.371, "eval_valid_target_steps_per_second": 6.86, "step": 9500 }, { "epoch": 3.686164085498528, "grad_norm": 0.20587915182113647, "learning_rate": 7.0053364771532805e-06, "loss": 0.2058, "step": 9600 }, { "epoch": 3.7245616280558043, "grad_norm": 0.208708256483078, "learning_rate": 6.949931460077058e-06, "loss": 0.2052, "step": 9700 }, { "epoch": 3.7629591706130805, "grad_norm": 0.21517980098724365, "learning_rate": 6.894242608476263e-06, "loss": 0.2049, "step": 9800 }, { "epoch": 3.801356713170357, "grad_norm": 0.22570070624351501, "learning_rate": 6.8382780284915685e-06, "loss": 0.2047, "step": 9900 }, { "epoch": 3.8397542557276334, "grad_norm": 0.22346258163452148, "learning_rate": 6.782045866399023e-06, "loss": 0.2037, "step": 10000 }, { "epoch": 3.8397542557276334, "eval_valid_loss": 0.1928359419107437, "eval_valid_runtime": 4.6748, "eval_valid_samples_per_second": 213.912, "eval_valid_steps_per_second": 6.845, "step": 10000 }, { "epoch": 3.8397542557276334, "eval_valid_target_loss": 0.21531249582767487, "eval_valid_target_runtime": 4.6773, "eval_valid_target_samples_per_second": 213.8, "eval_valid_target_steps_per_second": 6.842, "step": 10000 }, { "epoch": 3.8781517982849096, "grad_norm": 0.2544507086277008, "learning_rate": 6.725554307424274e-06, "loss": 0.2036, "step": 10100 }, { "epoch": 3.9165493408421863, "grad_norm": 0.27723318338394165, "learning_rate": 6.668811574551106e-06, "loss": 0.2039, "step": 10200 }, { "epoch": 3.9549468833994625, "grad_norm": 0.22496485710144043, "learning_rate": 6.6118259273245065e-06, "loss": 0.2032, "step": 10300 }, { "epoch": 3.9933444259567388, "grad_norm": 0.22093619406223297, "learning_rate": 6.55460566064838e-06, "loss": 0.2027, "step": 10400 }, { "epoch": 4.031741968514015, "grad_norm": 0.2137976437807083, "learning_rate": 6.497159103578143e-06, "loss": 0.2016, "step": 10500 }, { "epoch": 4.031741968514015, "eval_valid_loss": 0.19111718237400055, "eval_valid_runtime": 4.6833, "eval_valid_samples_per_second": 213.523, "eval_valid_steps_per_second": 6.833, "step": 10500 }, { "epoch": 4.031741968514015, "eval_valid_target_loss": 0.2142656296491623, "eval_valid_target_runtime": 4.6587, "eval_valid_target_samples_per_second": 214.65, "eval_valid_target_steps_per_second": 6.869, "step": 10500 }, { "epoch": 4.070139511071291, "grad_norm": 0.20360158383846283, "learning_rate": 6.439494618108332e-06, "loss": 0.2013, "step": 10600 }, { "epoch": 4.1085370536285675, "grad_norm": 0.21878282725811005, "learning_rate": 6.38162059795542e-06, "loss": 0.2006, "step": 10700 }, { "epoch": 4.146934596185844, "grad_norm": 0.2319776862859726, "learning_rate": 6.323545467336017e-06, "loss": 0.2012, "step": 10800 }, { "epoch": 4.185332138743121, "grad_norm": 0.20898312330245972, "learning_rate": 6.26527767974063e-06, "loss": 0.2005, "step": 10900 }, { "epoch": 4.223729681300397, "grad_norm": 0.21366915106773376, "learning_rate": 6.206825716703166e-06, "loss": 0.2, "step": 11000 }, { "epoch": 4.223729681300397, "eval_valid_loss": 0.18977344036102295, "eval_valid_runtime": 4.7328, "eval_valid_samples_per_second": 211.293, "eval_valid_steps_per_second": 6.761, "step": 11000 }, { "epoch": 4.223729681300397, "eval_valid_target_loss": 0.21274219453334808, "eval_valid_target_runtime": 4.6506, "eval_valid_target_samples_per_second": 215.026, "eval_valid_target_steps_per_second": 6.881, "step": 11000 }, { "epoch": 4.262127223857673, "grad_norm": 0.20968745648860931, "learning_rate": 6.1481980865663405e-06, "loss": 0.1993, "step": 11100 }, { "epoch": 4.3005247664149495, "grad_norm": 0.20683012902736664, "learning_rate": 6.089403323243203e-06, "loss": 0.1992, "step": 11200 }, { "epoch": 4.338922308972226, "grad_norm": 0.20785097777843475, "learning_rate": 6.030449984974916e-06, "loss": 0.199, "step": 11300 }, { "epoch": 4.377319851529502, "grad_norm": 0.20532238483428955, "learning_rate": 5.971346653085025e-06, "loss": 0.199, "step": 11400 }, { "epoch": 4.415717394086778, "grad_norm": 0.21589842438697815, "learning_rate": 5.912101930730329e-06, "loss": 0.1992, "step": 11500 }, { "epoch": 4.415717394086778, "eval_valid_loss": 0.18833594024181366, "eval_valid_runtime": 4.6904, "eval_valid_samples_per_second": 213.203, "eval_valid_steps_per_second": 6.823, "step": 11500 }, { "epoch": 4.415717394086778, "eval_valid_target_loss": 0.211976557970047, "eval_valid_target_runtime": 4.658, "eval_valid_target_samples_per_second": 214.686, "eval_valid_target_steps_per_second": 6.87, "step": 11500 }, { "epoch": 4.4541149366440544, "grad_norm": 0.2021540254354477, "learning_rate": 5.852724441648614e-06, "loss": 0.1987, "step": 11600 }, { "epoch": 4.492512479201331, "grad_norm": 0.24406403303146362, "learning_rate": 5.7932228289033506e-06, "loss": 0.1984, "step": 11700 }, { "epoch": 4.530910021758608, "grad_norm": 0.20519228279590607, "learning_rate": 5.7336057536256216e-06, "loss": 0.1984, "step": 11800 }, { "epoch": 4.569307564315884, "grad_norm": 0.21227143704891205, "learning_rate": 5.67388189375337e-06, "loss": 0.1976, "step": 11900 }, { "epoch": 4.60770510687316, "grad_norm": 0.2325662076473236, "learning_rate": 5.614059942768254e-06, "loss": 0.1977, "step": 12000 }, { "epoch": 4.60770510687316, "eval_valid_loss": 0.18742187321186066, "eval_valid_runtime": 4.6831, "eval_valid_samples_per_second": 213.535, "eval_valid_steps_per_second": 6.833, "step": 12000 }, { "epoch": 4.60770510687316, "eval_valid_target_loss": 0.21108593046665192, "eval_valid_target_runtime": 4.6502, "eval_valid_target_samples_per_second": 215.046, "eval_valid_target_steps_per_second": 6.881, "step": 12000 }, { "epoch": 4.6461026494304365, "grad_norm": 0.2245544046163559, "learning_rate": 5.554148608430192e-06, "loss": 0.1965, "step": 12100 }, { "epoch": 4.684500191987713, "grad_norm": 0.22662824392318726, "learning_rate": 5.4941566115098614e-06, "loss": 0.1971, "step": 12200 }, { "epoch": 4.722897734544989, "grad_norm": 0.19245535135269165, "learning_rate": 5.4340926845192874e-06, "loss": 0.1974, "step": 12300 }, { "epoch": 4.761295277102265, "grad_norm": 0.18942756950855255, "learning_rate": 5.373965570440729e-06, "loss": 0.1966, "step": 12400 }, { "epoch": 4.799692819659541, "grad_norm": 0.1962059736251831, "learning_rate": 5.3137840214540395e-06, "loss": 0.1958, "step": 12500 }, { "epoch": 4.799692819659541, "eval_valid_loss": 0.18663281202316284, "eval_valid_runtime": 4.6972, "eval_valid_samples_per_second": 212.895, "eval_valid_steps_per_second": 6.813, "step": 12500 }, { "epoch": 4.799692819659541, "eval_valid_target_loss": 0.21009375154972076, "eval_valid_target_runtime": 4.6708, "eval_valid_target_samples_per_second": 214.096, "eval_valid_target_steps_per_second": 6.851, "step": 12500 }, { "epoch": 4.838090362216818, "grad_norm": 0.2151457667350769, "learning_rate": 5.2535567976626846e-06, "loss": 0.1963, "step": 12600 }, { "epoch": 4.876487904774095, "grad_norm": 0.18380814790725708, "learning_rate": 5.1932926658186166e-06, "loss": 0.1959, "step": 12700 }, { "epoch": 4.914885447331371, "grad_norm": 0.19516663253307343, "learning_rate": 5.133000398046168e-06, "loss": 0.1953, "step": 12800 }, { "epoch": 4.953282989888647, "grad_norm": 0.24182352423667908, "learning_rate": 5.072688770565177e-06, "loss": 0.1953, "step": 12900 }, { "epoch": 4.9916805324459235, "grad_norm": 0.23720215260982513, "learning_rate": 5.012366562413501e-06, "loss": 0.1955, "step": 13000 }, { "epoch": 4.9916805324459235, "eval_valid_loss": 0.18524999916553497, "eval_valid_runtime": 4.6908, "eval_valid_samples_per_second": 213.184, "eval_valid_steps_per_second": 6.822, "step": 13000 }, { "epoch": 4.9916805324459235, "eval_valid_target_loss": 0.20893749594688416, "eval_valid_target_runtime": 4.6667, "eval_valid_target_samples_per_second": 214.285, "eval_valid_target_steps_per_second": 6.857, "step": 13000 }, { "epoch": 5.0300780750032, "grad_norm": 0.20271484553813934, "learning_rate": 4.952042554169138e-06, "loss": 0.1948, "step": 13100 }, { "epoch": 5.068475617560476, "grad_norm": 0.2053770273923874, "learning_rate": 4.891725526672107e-06, "loss": 0.1947, "step": 13200 }, { "epoch": 5.106873160117752, "grad_norm": 0.20811979472637177, "learning_rate": 4.8314242597463e-06, "loss": 0.1939, "step": 13300 }, { "epoch": 5.145270702675028, "grad_norm": 0.19889037311077118, "learning_rate": 4.771147530921483e-06, "loss": 0.1943, "step": 13400 }, { "epoch": 5.1836682452323055, "grad_norm": 0.2038932591676712, "learning_rate": 4.710904114155621e-06, "loss": 0.1938, "step": 13500 }, { "epoch": 5.1836682452323055, "eval_valid_loss": 0.1847265660762787, "eval_valid_runtime": 4.698, "eval_valid_samples_per_second": 212.854, "eval_valid_steps_per_second": 6.811, "step": 13500 }, { "epoch": 5.1836682452323055, "eval_valid_target_loss": 0.20839843153953552, "eval_valid_target_runtime": 4.6593, "eval_valid_target_samples_per_second": 214.626, "eval_valid_target_steps_per_second": 6.868, "step": 13500 }, { "epoch": 5.222065787789582, "grad_norm": 0.19585560262203217, "learning_rate": 4.650702778557736e-06, "loss": 0.1932, "step": 13600 }, { "epoch": 5.260463330346858, "grad_norm": 0.23953603208065033, "learning_rate": 4.59055228711146e-06, "loss": 0.1933, "step": 13700 }, { "epoch": 5.298860872904134, "grad_norm": 0.21477288007736206, "learning_rate": 4.530461395399485e-06, "loss": 0.1929, "step": 13800 }, { "epoch": 5.33725841546141, "grad_norm": 0.22662727534770966, "learning_rate": 4.470438850329089e-06, "loss": 0.1935, "step": 13900 }, { "epoch": 5.375655958018687, "grad_norm": 0.18912354111671448, "learning_rate": 4.410493388858925e-06, "loss": 0.1931, "step": 14000 }, { "epoch": 5.375655958018687, "eval_valid_loss": 0.18379686772823334, "eval_valid_runtime": 4.6729, "eval_valid_samples_per_second": 214.001, "eval_valid_steps_per_second": 6.848, "step": 14000 }, { "epoch": 5.375655958018687, "eval_valid_target_loss": 0.20746874809265137, "eval_valid_target_runtime": 4.6581, "eval_valid_target_samples_per_second": 214.682, "eval_valid_target_steps_per_second": 6.87, "step": 14000 }, { "epoch": 5.414053500575963, "grad_norm": 0.21155835688114166, "learning_rate": 4.350633736727259e-06, "loss": 0.193, "step": 14100 }, { "epoch": 5.452451043133239, "grad_norm": 0.2160138338804245, "learning_rate": 4.29086860718184e-06, "loss": 0.1931, "step": 14200 }, { "epoch": 5.490848585690516, "grad_norm": 0.19270409643650055, "learning_rate": 4.231206699711587e-06, "loss": 0.1925, "step": 14300 }, { "epoch": 5.5292461282477925, "grad_norm": 0.18501386046409607, "learning_rate": 4.171656698780281e-06, "loss": 0.1925, "step": 14400 }, { "epoch": 5.567643670805069, "grad_norm": 0.20564299821853638, "learning_rate": 4.112227272562447e-06, "loss": 0.1918, "step": 14500 }, { "epoch": 5.567643670805069, "eval_valid_loss": 0.18317969143390656, "eval_valid_runtime": 4.679, "eval_valid_samples_per_second": 213.72, "eval_valid_steps_per_second": 6.839, "step": 14500 }, { "epoch": 5.567643670805069, "eval_valid_target_loss": 0.20700781047344208, "eval_valid_target_runtime": 4.674, "eval_valid_target_samples_per_second": 213.95, "eval_valid_target_steps_per_second": 6.846, "step": 14500 }, { "epoch": 5.606041213362345, "grad_norm": 0.21509169042110443, "learning_rate": 4.052927071681593e-06, "loss": 0.1919, "step": 14600 }, { "epoch": 5.644438755919621, "grad_norm": 0.18730491399765015, "learning_rate": 3.99376472795103e-06, "loss": 0.1921, "step": 14700 }, { "epoch": 5.682836298476897, "grad_norm": 0.21269969642162323, "learning_rate": 3.934748853117398e-06, "loss": 0.1918, "step": 14800 }, { "epoch": 5.721233841034174, "grad_norm": 0.18910899758338928, "learning_rate": 3.8758880376071415e-06, "loss": 0.1914, "step": 14900 }, { "epoch": 5.75963138359145, "grad_norm": 0.22251802682876587, "learning_rate": 3.8171908492760665e-06, "loss": 0.1916, "step": 15000 }, { "epoch": 5.75963138359145, "eval_valid_loss": 0.18259374797344208, "eval_valid_runtime": 4.67, "eval_valid_samples_per_second": 214.134, "eval_valid_steps_per_second": 6.852, "step": 15000 }, { "epoch": 5.75963138359145, "eval_valid_target_loss": 0.20646093785762787, "eval_valid_target_runtime": 4.67, "eval_valid_target_samples_per_second": 214.131, "eval_valid_target_steps_per_second": 6.852, "step": 15000 }, { "epoch": 5.798028926148726, "grad_norm": 0.17328619956970215, "learning_rate": 3.758665832162203e-06, "loss": 0.1911, "step": 15100 }, { "epoch": 5.836426468706003, "grad_norm": 0.20850612223148346, "learning_rate": 3.7003215052421116e-06, "loss": 0.1915, "step": 15200 }, { "epoch": 5.8748240112632795, "grad_norm": 0.1912785917520523, "learning_rate": 3.642166361190859e-06, "loss": 0.1908, "step": 15300 }, { "epoch": 5.913221553820556, "grad_norm": 0.2138790339231491, "learning_rate": 3.584208865145812e-06, "loss": 0.1907, "step": 15400 }, { "epoch": 5.951619096377832, "grad_norm": 0.19723013043403625, "learning_rate": 3.5264574534744373e-06, "loss": 0.1913, "step": 15500 }, { "epoch": 5.951619096377832, "eval_valid_loss": 0.1817968785762787, "eval_valid_runtime": 4.6726, "eval_valid_samples_per_second": 214.016, "eval_valid_steps_per_second": 6.849, "step": 15500 }, { "epoch": 5.951619096377832, "eval_valid_target_loss": 0.20574218034744263, "eval_valid_target_runtime": 4.6817, "eval_valid_target_samples_per_second": 213.599, "eval_valid_target_steps_per_second": 6.835, "step": 15500 }, { "epoch": 5.990016638935108, "grad_norm": 0.19212548434734344, "learning_rate": 3.4689205325462997e-06, "loss": 0.1907, "step": 15600 }, { "epoch": 6.028414181492384, "grad_norm": 0.19529464840888977, "learning_rate": 3.4116064775094126e-06, "loss": 0.1901, "step": 15700 }, { "epoch": 6.066811724049661, "grad_norm": 0.2088070809841156, "learning_rate": 3.354523631071147e-06, "loss": 0.1902, "step": 15800 }, { "epoch": 6.105209266606937, "grad_norm": 0.19294045865535736, "learning_rate": 3.2976803022838514e-06, "loss": 0.1903, "step": 15900 }, { "epoch": 6.143606809164213, "grad_norm": 0.20844899117946625, "learning_rate": 3.2410847653353805e-06, "loss": 0.1897, "step": 16000 }, { "epoch": 6.143606809164213, "eval_valid_loss": 0.1809999942779541, "eval_valid_runtime": 4.6789, "eval_valid_samples_per_second": 213.724, "eval_valid_steps_per_second": 6.839, "step": 16000 }, { "epoch": 6.143606809164213, "eval_valid_target_loss": 0.20546874403953552, "eval_valid_target_runtime": 4.6614, "eval_valid_target_samples_per_second": 214.53, "eval_valid_target_steps_per_second": 6.865, "step": 16000 }, { "epoch": 6.18200435172149, "grad_norm": 0.19932307302951813, "learning_rate": 3.184745258344688e-06, "loss": 0.1894, "step": 16100 }, { "epoch": 6.220401894278766, "grad_norm": 0.19776058197021484, "learning_rate": 3.128669982162681e-06, "loss": 0.1899, "step": 16200 }, { "epoch": 6.258799436836043, "grad_norm": 0.20467509329319, "learning_rate": 3.07286709917849e-06, "loss": 0.1898, "step": 16300 }, { "epoch": 6.297196979393319, "grad_norm": 0.19593088328838348, "learning_rate": 3.017344732131342e-06, "loss": 0.1895, "step": 16400 }, { "epoch": 6.335594521950595, "grad_norm": 0.20078891515731812, "learning_rate": 2.9621109629282064e-06, "loss": 0.1897, "step": 16500 }, { "epoch": 6.335594521950595, "eval_valid_loss": 0.1807578057050705, "eval_valid_runtime": 4.7017, "eval_valid_samples_per_second": 212.687, "eval_valid_steps_per_second": 6.806, "step": 16500 }, { "epoch": 6.335594521950595, "eval_valid_target_loss": 0.2052578181028366, "eval_valid_target_runtime": 4.6726, "eval_valid_target_samples_per_second": 214.013, "eval_valid_target_steps_per_second": 6.848, "step": 16500 }, { "epoch": 6.373992064507871, "grad_norm": 0.17822235822677612, "learning_rate": 2.9071738314673758e-06, "loss": 0.1889, "step": 16600 }, { "epoch": 6.412389607065148, "grad_norm": 0.21160703897476196, "learning_rate": 2.8525413344681797e-06, "loss": 0.1889, "step": 16700 }, { "epoch": 6.450787149622424, "grad_norm": 0.19472962617874146, "learning_rate": 2.798221424306953e-06, "loss": 0.1894, "step": 16800 }, { "epoch": 6.4891846921797, "grad_norm": 0.17923222482204437, "learning_rate": 2.744222007859506e-06, "loss": 0.1891, "step": 16900 }, { "epoch": 6.527582234736977, "grad_norm": 0.18077126145362854, "learning_rate": 2.690550945350157e-06, "loss": 0.1886, "step": 17000 }, { "epoch": 6.527582234736977, "eval_valid_loss": 0.18031249940395355, "eval_valid_runtime": 4.6828, "eval_valid_samples_per_second": 213.548, "eval_valid_steps_per_second": 6.834, "step": 17000 }, { "epoch": 6.527582234736977, "eval_valid_target_loss": 0.20450781285762787, "eval_valid_target_runtime": 4.6685, "eval_valid_target_samples_per_second": 214.203, "eval_valid_target_steps_per_second": 6.854, "step": 17000 }, { "epoch": 6.565979777294253, "grad_norm": 0.19065329432487488, "learning_rate": 2.637216049207615e-06, "loss": 0.188, "step": 17100 }, { "epoch": 6.60437731985153, "grad_norm": 0.20368430018424988, "learning_rate": 2.5842250829277724e-06, "loss": 0.189, "step": 17200 }, { "epoch": 6.642774862408806, "grad_norm": 0.21131780743598938, "learning_rate": 2.5315857599436575e-06, "loss": 0.1887, "step": 17300 }, { "epoch": 6.681172404966082, "grad_norm": 0.2033446729183197, "learning_rate": 2.4793057425026467e-06, "loss": 0.1887, "step": 17400 }, { "epoch": 6.719569947523358, "grad_norm": 0.19689294695854187, "learning_rate": 2.427392640551137e-06, "loss": 0.1887, "step": 17500 }, { "epoch": 6.719569947523358, "eval_valid_loss": 0.17996874451637268, "eval_valid_runtime": 4.7043, "eval_valid_samples_per_second": 212.57, "eval_valid_steps_per_second": 6.802, "step": 17500 }, { "epoch": 6.719569947523358, "eval_valid_target_loss": 0.20432811975479126, "eval_valid_target_runtime": 4.6638, "eval_valid_target_samples_per_second": 214.416, "eval_valid_target_steps_per_second": 6.861, "step": 17500 }, { "epoch": 6.757967490080635, "grad_norm": 0.1994999349117279, "learning_rate": 2.3758540106268406e-06, "loss": 0.1881, "step": 17600 }, { "epoch": 6.796365032637911, "grad_norm": 0.19650602340698242, "learning_rate": 2.32469735475884e-06, "loss": 0.1881, "step": 17700 }, { "epoch": 6.834762575195187, "grad_norm": 0.21248474717140198, "learning_rate": 2.273930119375586e-06, "loss": 0.1882, "step": 17800 }, { "epoch": 6.873160117752464, "grad_norm": 0.19042810797691345, "learning_rate": 2.2235596942209776e-06, "loss": 0.188, "step": 17900 }, { "epoch": 6.91155766030974, "grad_norm": 0.23096908628940582, "learning_rate": 2.173593411278714e-06, "loss": 0.1886, "step": 18000 }, { "epoch": 6.91155766030974, "eval_valid_loss": 0.17952343821525574, "eval_valid_runtime": 4.6878, "eval_valid_samples_per_second": 213.321, "eval_valid_steps_per_second": 6.826, "step": 18000 }, { "epoch": 6.91155766030974, "eval_valid_target_loss": 0.20391406118869781, "eval_valid_target_runtime": 4.6595, "eval_valid_target_samples_per_second": 214.617, "eval_valid_target_steps_per_second": 6.868, "step": 18000 }, { "epoch": 6.949955202867017, "grad_norm": 0.21275204420089722, "learning_rate": 2.124038543705034e-06, "loss": 0.1878, "step": 18100 }, { "epoch": 6.988352745424293, "grad_norm": 0.20453621447086334, "learning_rate": 2.0749023047700285e-06, "loss": 0.188, "step": 18200 }, { "epoch": 7.026750287981569, "grad_norm": 0.20724526047706604, "learning_rate": 2.026191846807663e-06, "loss": 0.1883, "step": 18300 }, { "epoch": 7.065147830538845, "grad_norm": 0.1886543333530426, "learning_rate": 1.9779142601746825e-06, "loss": 0.1874, "step": 18400 }, { "epoch": 7.1035453730961216, "grad_norm": 0.20411571860313416, "learning_rate": 1.9300765722185265e-06, "loss": 0.187, "step": 18500 }, { "epoch": 7.1035453730961216, "eval_valid_loss": 0.17924219369888306, "eval_valid_runtime": 4.6825, "eval_valid_samples_per_second": 213.561, "eval_valid_steps_per_second": 6.834, "step": 18500 }, { "epoch": 7.1035453730961216, "eval_valid_target_loss": 0.20393750071525574, "eval_valid_target_runtime": 4.6736, "eval_valid_target_samples_per_second": 213.97, "eval_valid_target_steps_per_second": 6.847, "step": 18500 }, { "epoch": 7.141942915653398, "grad_norm": 0.18996645510196686, "learning_rate": 1.8826857462544129e-06, "loss": 0.1871, "step": 18600 }, { "epoch": 7.180340458210675, "grad_norm": 0.21018381416797638, "learning_rate": 1.8357486805517615e-06, "loss": 0.1874, "step": 18700 }, { "epoch": 7.218738000767951, "grad_norm": 0.19617675244808197, "learning_rate": 1.7892722073300627e-06, "loss": 0.1869, "step": 18800 }, { "epoch": 7.257135543325227, "grad_norm": 0.2340448796749115, "learning_rate": 1.743263091764379e-06, "loss": 0.187, "step": 18900 }, { "epoch": 7.295533085882504, "grad_norm": 0.22970305383205414, "learning_rate": 1.6977280310005845e-06, "loss": 0.1873, "step": 19000 }, { "epoch": 7.295533085882504, "eval_valid_loss": 0.1788671910762787, "eval_valid_runtime": 4.6706, "eval_valid_samples_per_second": 214.105, "eval_valid_steps_per_second": 6.851, "step": 19000 }, { "epoch": 7.295533085882504, "eval_valid_target_loss": 0.20334374904632568, "eval_valid_target_runtime": 4.6842, "eval_valid_target_samples_per_second": 213.484, "eval_valid_target_steps_per_second": 6.831, "step": 19000 }, { "epoch": 7.33393062843978, "grad_norm": 0.20527499914169312, "learning_rate": 1.6526736531805354e-06, "loss": 0.1873, "step": 19100 }, { "epoch": 7.372328170997056, "grad_norm": 0.1835908442735672, "learning_rate": 1.6081065164772624e-06, "loss": 0.187, "step": 19200 }, { "epoch": 7.410725713554332, "grad_norm": 0.18936371803283691, "learning_rate": 1.564033108140348e-06, "loss": 0.1865, "step": 19300 }, { "epoch": 7.4491232561116085, "grad_norm": 0.19136998057365417, "learning_rate": 1.520459843551646e-06, "loss": 0.1872, "step": 19400 }, { "epoch": 7.487520798668886, "grad_norm": 0.19691316783428192, "learning_rate": 1.4773930652914426e-06, "loss": 0.187, "step": 19500 }, { "epoch": 7.487520798668886, "eval_valid_loss": 0.17878125607967377, "eval_valid_runtime": 4.6602, "eval_valid_samples_per_second": 214.581, "eval_valid_steps_per_second": 6.867, "step": 19500 }, { "epoch": 7.487520798668886, "eval_valid_target_loss": 0.20325781404972076, "eval_valid_target_runtime": 4.6796, "eval_valid_target_samples_per_second": 213.695, "eval_valid_target_steps_per_second": 6.838, "step": 19500 }, { "epoch": 7.525918341226162, "grad_norm": 0.18792080879211426, "learning_rate": 1.434839042215227e-06, "loss": 0.1868, "step": 19600 }, { "epoch": 7.564315883783438, "grad_norm": 0.1945939064025879, "learning_rate": 1.3928039685411793e-06, "loss": 0.1869, "step": 19700 }, { "epoch": 7.602713426340714, "grad_norm": 0.17974095046520233, "learning_rate": 1.3512939629485456e-06, "loss": 0.187, "step": 19800 }, { "epoch": 7.641110968897991, "grad_norm": 0.22416825592517853, "learning_rate": 1.3103150676869864e-06, "loss": 0.1871, "step": 19900 }, { "epoch": 7.679508511455267, "grad_norm": 0.19613422453403473, "learning_rate": 1.2698732476970627e-06, "loss": 0.1869, "step": 20000 }, { "epoch": 7.679508511455267, "eval_valid_loss": 0.1783437430858612, "eval_valid_runtime": 4.6716, "eval_valid_samples_per_second": 214.058, "eval_valid_steps_per_second": 6.85, "step": 20000 }, { "epoch": 7.679508511455267, "eval_valid_target_loss": 0.2031562477350235, "eval_valid_target_runtime": 4.6803, "eval_valid_target_samples_per_second": 213.661, "eval_valid_target_steps_per_second": 6.837, "step": 20000 }, { "epoch": 7.717906054012543, "grad_norm": 0.20145875215530396, "learning_rate": 1.229974389741964e-06, "loss": 0.187, "step": 20100 }, { "epoch": 7.756303596569819, "grad_norm": 0.18396620452404022, "learning_rate": 1.1906243015506375e-06, "loss": 0.1867, "step": 20200 }, { "epoch": 7.7947011391270955, "grad_norm": 0.18105918169021606, "learning_rate": 1.1518287109723958e-06, "loss": 0.1862, "step": 20300 }, { "epoch": 7.833098681684373, "grad_norm": 0.20986780524253845, "learning_rate": 1.1135932651431651e-06, "loss": 0.1863, "step": 20400 }, { "epoch": 7.871496224241649, "grad_norm": 0.21804456412792206, "learning_rate": 1.075923529663489e-06, "loss": 0.1869, "step": 20500 }, { "epoch": 7.871496224241649, "eval_valid_loss": 0.17836718261241913, "eval_valid_runtime": 4.6832, "eval_valid_samples_per_second": 213.531, "eval_valid_steps_per_second": 6.833, "step": 20500 }, { "epoch": 7.871496224241649, "eval_valid_target_loss": 0.20322656631469727, "eval_valid_target_runtime": 4.6763, "eval_valid_target_samples_per_second": 213.843, "eval_valid_target_steps_per_second": 6.843, "step": 20500 }, { "epoch": 7.909893766798925, "grad_norm": 0.22019818425178528, "learning_rate": 1.0388249877883827e-06, "loss": 0.1858, "step": 20600 }, { "epoch": 7.948291309356201, "grad_norm": 0.1965310275554657, "learning_rate": 1.0023030396291916e-06, "loss": 0.1866, "step": 20700 }, { "epoch": 7.9866888519134775, "grad_norm": 0.18218408524990082, "learning_rate": 9.66363001367534e-07, "loss": 0.1869, "step": 20800 }, { "epoch": 8.025086394470755, "grad_norm": 0.1850380003452301, "learning_rate": 9.310101044814835e-07, "loss": 0.1861, "step": 20900 }, { "epoch": 8.06348393702803, "grad_norm": 0.18823818862438202, "learning_rate": 8.962494949840577e-07, "loss": 0.186, "step": 21000 }, { "epoch": 8.06348393702803, "eval_valid_loss": 0.17808593809604645, "eval_valid_runtime": 4.6916, "eval_valid_samples_per_second": 213.147, "eval_valid_steps_per_second": 6.821, "step": 21000 }, { "epoch": 8.06348393702803, "eval_valid_target_loss": 0.20311719179153442, "eval_valid_target_runtime": 4.6653, "eval_valid_target_samples_per_second": 214.347, "eval_valid_target_steps_per_second": 6.859, "step": 21000 }, { "epoch": 8.101881479585307, "grad_norm": 0.20501789450645447, "learning_rate": 8.620862326741658e-07, "loss": 0.1862, "step": 21100 }, { "epoch": 8.140279022142582, "grad_norm": 0.19500133395195007, "learning_rate": 8.285252904000906e-07, "loss": 0.1862, "step": 21200 }, { "epoch": 8.17867656469986, "grad_norm": 0.18742544949054718, "learning_rate": 7.955715533356367e-07, "loss": 0.1863, "step": 21300 }, { "epoch": 8.217074107257135, "grad_norm": 0.20386624336242676, "learning_rate": 7.632298182690473e-07, "loss": 0.186, "step": 21400 }, { "epoch": 8.255471649814412, "grad_norm": 0.17727358639240265, "learning_rate": 7.315047929047608e-07, "loss": 0.1861, "step": 21500 }, { "epoch": 8.255471649814412, "eval_valid_loss": 0.17788280546665192, "eval_valid_runtime": 4.679, "eval_valid_samples_per_second": 213.72, "eval_valid_steps_per_second": 6.839, "step": 21500 }, { "epoch": 8.255471649814412, "eval_valid_target_loss": 0.2026640623807907, "eval_valid_target_runtime": 4.6709, "eval_valid_target_samples_per_second": 214.093, "eval_valid_target_steps_per_second": 6.851, "step": 21500 }, { "epoch": 8.293869192371687, "grad_norm": 0.19971401989459991, "learning_rate": 7.004010951781648e-07, "loss": 0.1858, "step": 21600 }, { "epoch": 8.332266734928965, "grad_norm": 0.17827193439006805, "learning_rate": 6.699232525833987e-07, "loss": 0.1868, "step": 21700 }, { "epoch": 8.370664277486242, "grad_norm": 0.18275295197963715, "learning_rate": 6.400757015143266e-07, "loss": 0.1858, "step": 21800 }, { "epoch": 8.409061820043517, "grad_norm": 0.19496768712997437, "learning_rate": 6.108627866187661e-07, "loss": 0.1854, "step": 21900 }, { "epoch": 8.447459362600794, "grad_norm": 0.19046269357204437, "learning_rate": 5.822887601660832e-07, "loss": 0.1862, "step": 22000 }, { "epoch": 8.447459362600794, "eval_valid_loss": 0.17781250178813934, "eval_valid_runtime": 4.6746, "eval_valid_samples_per_second": 213.921, "eval_valid_steps_per_second": 6.845, "step": 22000 }, { "epoch": 8.447459362600794, "eval_valid_target_loss": 0.2026640623807907, "eval_valid_target_runtime": 4.6755, "eval_valid_target_samples_per_second": 213.88, "eval_valid_target_steps_per_second": 6.844, "step": 22000 }, { "epoch": 8.48585690515807, "grad_norm": 0.20896296203136444, "learning_rate": 5.543577814282219e-07, "loss": 0.1856, "step": 22100 }, { "epoch": 8.524254447715347, "grad_norm": 0.19562530517578125, "learning_rate": 5.270739160742738e-07, "loss": 0.1857, "step": 22200 }, { "epoch": 8.562651990272622, "grad_norm": 0.1972120851278305, "learning_rate": 5.004411355786792e-07, "loss": 0.1863, "step": 22300 }, { "epoch": 8.601049532829899, "grad_norm": 0.19712330400943756, "learning_rate": 4.7446331664312786e-07, "loss": 0.1855, "step": 22400 }, { "epoch": 8.639447075387174, "grad_norm": 0.20409992337226868, "learning_rate": 4.4914424063226937e-07, "loss": 0.1857, "step": 22500 }, { "epoch": 8.639447075387174, "eval_valid_loss": 0.17765624821186066, "eval_valid_runtime": 4.6769, "eval_valid_samples_per_second": 213.818, "eval_valid_steps_per_second": 6.842, "step": 22500 }, { "epoch": 8.639447075387174, "eval_valid_target_loss": 0.2025781273841858, "eval_valid_target_runtime": 4.6696, "eval_valid_target_samples_per_second": 214.151, "eval_valid_target_steps_per_second": 6.853, "step": 22500 }, { "epoch": 8.677844617944451, "grad_norm": 0.21083636581897736, "learning_rate": 4.2448759302328336e-07, "loss": 0.1861, "step": 22600 }, { "epoch": 8.716242160501729, "grad_norm": 0.18778979778289795, "learning_rate": 4.0049696286942496e-07, "loss": 0.1862, "step": 22700 }, { "epoch": 8.754639703059004, "grad_norm": 0.18586015701293945, "learning_rate": 3.7717584227759117e-07, "loss": 0.1857, "step": 22800 }, { "epoch": 8.793037245616281, "grad_norm": 0.1977422684431076, "learning_rate": 3.54527625900013e-07, "loss": 0.1856, "step": 22900 }, { "epoch": 8.831434788173556, "grad_norm": 0.18881608545780182, "learning_rate": 3.3255561044011564e-07, "loss": 0.1857, "step": 23000 }, { "epoch": 8.831434788173556, "eval_valid_loss": 0.17771874368190765, "eval_valid_runtime": 4.6727, "eval_valid_samples_per_second": 214.01, "eval_valid_steps_per_second": 6.848, "step": 23000 }, { "epoch": 8.831434788173556, "eval_valid_target_loss": 0.20250000059604645, "eval_valid_target_runtime": 4.6666, "eval_valid_target_samples_per_second": 214.287, "eval_valid_target_steps_per_second": 6.857, "step": 23000 }, { "epoch": 8.869832330730834, "grad_norm": 0.2037239372730255, "learning_rate": 3.112629941726547e-07, "loss": 0.1856, "step": 23100 }, { "epoch": 8.908229873288109, "grad_norm": 0.18967826664447784, "learning_rate": 2.9065287647816744e-07, "loss": 0.1855, "step": 23200 }, { "epoch": 8.946627415845386, "grad_norm": 0.17752571403980255, "learning_rate": 2.707282573918213e-07, "loss": 0.1858, "step": 23300 }, { "epoch": 8.985024958402661, "grad_norm": 0.18709731101989746, "learning_rate": 2.514920371667301e-07, "loss": 0.1854, "step": 23400 }, { "epoch": 9.023422500959938, "grad_norm": 0.21643956005573273, "learning_rate": 2.3294701585178213e-07, "loss": 0.1858, "step": 23500 }, { "epoch": 9.023422500959938, "eval_valid_loss": 0.17762500047683716, "eval_valid_runtime": 4.6791, "eval_valid_samples_per_second": 213.717, "eval_valid_steps_per_second": 6.839, "step": 23500 }, { "epoch": 9.023422500959938, "eval_valid_target_loss": 0.20255468785762787, "eval_valid_target_runtime": 4.7103, "eval_valid_target_samples_per_second": 212.301, "eval_valid_target_steps_per_second": 6.794, "step": 23500 }, { "epoch": 9.061820043517216, "grad_norm": 0.18775244057178497, "learning_rate": 2.1509589288407183e-07, "loss": 0.1855, "step": 23600 }, { "epoch": 9.100217586074491, "grad_norm": 0.17277489602565765, "learning_rate": 1.9794126669595403e-07, "loss": 0.1859, "step": 23700 }, { "epoch": 9.138615128631768, "grad_norm": 0.18996348977088928, "learning_rate": 1.8148563433682264e-07, "loss": 0.1852, "step": 23800 }, { "epoch": 9.177012671189043, "grad_norm": 0.1894453912973404, "learning_rate": 1.6573139110963087e-07, "loss": 0.1854, "step": 23900 }, { "epoch": 9.21541021374632, "grad_norm": 0.2011975795030594, "learning_rate": 1.5068083022223346e-07, "loss": 0.1855, "step": 24000 }, { "epoch": 9.21541021374632, "eval_valid_loss": 0.17754687368869781, "eval_valid_runtime": 4.6668, "eval_valid_samples_per_second": 214.279, "eval_valid_steps_per_second": 6.857, "step": 24000 }, { "epoch": 9.21541021374632, "eval_valid_target_loss": 0.20250000059604645, "eval_valid_target_runtime": 4.6766, "eval_valid_target_samples_per_second": 213.829, "eval_valid_target_steps_per_second": 6.843, "step": 24000 }, { "epoch": 9.253807756303596, "grad_norm": 0.2087700515985489, "learning_rate": 1.3633614245357807e-07, "loss": 0.1858, "step": 24100 }, { "epoch": 9.292205298860873, "grad_norm": 0.18402153253555298, "learning_rate": 1.2269941583481548e-07, "loss": 0.1859, "step": 24200 }, { "epoch": 9.330602841418148, "grad_norm": 0.17724697291851044, "learning_rate": 1.0977263534536597e-07, "loss": 0.1856, "step": 24300 }, { "epoch": 9.369000383975425, "grad_norm": 0.1847800761461258, "learning_rate": 9.755768262397936e-08, "loss": 0.1858, "step": 24400 }, { "epoch": 9.407397926532703, "grad_norm": 0.1905263364315033, "learning_rate": 8.605633569484184e-08, "loss": 0.1856, "step": 24500 }, { "epoch": 9.407397926532703, "eval_valid_loss": 0.1775546818971634, "eval_valid_runtime": 4.6591, "eval_valid_samples_per_second": 214.636, "eval_valid_steps_per_second": 6.868, "step": 24500 }, { "epoch": 9.407397926532703, "eval_valid_target_loss": 0.2024531215429306, "eval_valid_target_runtime": 4.6763, "eval_valid_target_samples_per_second": 213.844, "eval_valid_target_steps_per_second": 6.843, "step": 24500 }, { "epoch": 9.445795469089978, "grad_norm": 0.17600856721401215, "learning_rate": 7.52702687087653e-08, "loss": 0.1855, "step": 24600 }, { "epoch": 9.484193011647255, "grad_norm": 0.19071801006793976, "learning_rate": 6.520105169949609e-08, "loss": 0.1856, "step": 24700 }, { "epoch": 9.52259055420453, "grad_norm": 0.20268982648849487, "learning_rate": 5.5850150355178936e-08, "loss": 0.1855, "step": 24800 }, { "epoch": 9.560988096761807, "grad_norm": 0.18069659173488617, "learning_rate": 4.721892580500709e-08, "loss": 0.1852, "step": 24900 }, { "epoch": 9.599385639319083, "grad_norm": 0.19809788465499878, "learning_rate": 3.9308634421098e-08, "loss": 0.1853, "step": 25000 }, { "epoch": 9.599385639319083, "eval_valid_loss": 0.17754687368869781, "eval_valid_runtime": 4.6689, "eval_valid_samples_per_second": 214.182, "eval_valid_steps_per_second": 6.854, "step": 25000 }, { "epoch": 9.599385639319083, "eval_valid_target_loss": 0.20237499475479126, "eval_valid_target_runtime": 4.688, "eval_valid_target_samples_per_second": 213.313, "eval_valid_target_steps_per_second": 6.826, "step": 25000 }, { "epoch": 9.63778318187636, "grad_norm": 0.1990041732788086, "learning_rate": 3.2120427635613517e-08, "loss": 0.1852, "step": 25100 }, { "epoch": 9.676180724433635, "grad_norm": 0.20578785240650177, "learning_rate": 2.565535177315226e-08, "loss": 0.185, "step": 25200 }, { "epoch": 9.714578266990912, "grad_norm": 0.19831426441669464, "learning_rate": 1.991434789845037e-08, "loss": 0.1858, "step": 25300 }, { "epoch": 9.75297580954819, "grad_norm": 0.18692290782928467, "learning_rate": 1.489825167939607e-08, "loss": 0.1848, "step": 25400 }, { "epoch": 9.791373352105465, "grad_norm": 0.20175856351852417, "learning_rate": 1.0607793265389742e-08, "loss": 0.1854, "step": 25500 }, { "epoch": 9.791373352105465, "eval_valid_loss": 0.17751562595367432, "eval_valid_runtime": 4.667, "eval_valid_samples_per_second": 214.272, "eval_valid_steps_per_second": 6.857, "step": 25500 }, { "epoch": 9.791373352105465, "eval_valid_target_loss": 0.20240625739097595, "eval_valid_target_runtime": 4.6781, "eval_valid_target_samples_per_second": 213.763, "eval_valid_target_steps_per_second": 6.84, "step": 25500 }, { "epoch": 9.829770894662742, "grad_norm": 0.20650416612625122, "learning_rate": 7.0435971810606244e-09, "loss": 0.1859, "step": 25600 }, { "epoch": 9.868168437220017, "grad_norm": 0.1880464404821396, "learning_rate": 4.206182235363399e-09, "loss": 0.1857, "step": 25700 }, { "epoch": 9.906565979777294, "grad_norm": 0.19517436623573303, "learning_rate": 2.095961446056949e-09, "loss": 0.1851, "step": 25800 }, { "epoch": 9.94496352233457, "grad_norm": 0.21848323941230774, "learning_rate": 7.132419795868872e-10, "loss": 0.1858, "step": 25900 }, { "epoch": 9.983361064891847, "grad_norm": 0.20499403774738312, "learning_rate": 5.82251063713235e-11, "loss": 0.1851, "step": 26000 }, { "epoch": 9.983361064891847, "eval_valid_loss": 0.1775234341621399, "eval_valid_runtime": 4.6706, "eval_valid_samples_per_second": 214.106, "eval_valid_steps_per_second": 6.851, "step": 26000 }, { "epoch": 9.983361064891847, "eval_valid_target_loss": 0.20242968201637268, "eval_valid_target_runtime": 4.679, "eval_valid_target_samples_per_second": 213.721, "eval_valid_target_steps_per_second": 6.839, "step": 26000 } ], "logging_steps": 100, "max_steps": 26040, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.475781022436819e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }