diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.20012108536518244, + "epoch": 0.3001816280477737, "eval_steps": 500, - "global_step": 3636, + "global_step": 5454, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -25459,6 +25459,12732 @@ "learning_rate": 9.766905396590851e-06, "loss": 0.8658, "step": 3636 + }, + { + "epoch": 0.2001761241675381, + "grad_norm": 0.8647506237030029, + "learning_rate": 9.766774571898516e-06, + "loss": 0.84, + "step": 3637 + }, + { + "epoch": 0.20023116296989377, + "grad_norm": 0.8545078635215759, + "learning_rate": 9.766643711380378e-06, + "loss": 0.8455, + "step": 3638 + }, + { + "epoch": 0.20028620177224943, + "grad_norm": 0.924404501914978, + "learning_rate": 9.766512815037424e-06, + "loss": 0.6954, + "step": 3639 + }, + { + "epoch": 0.2003412405746051, + "grad_norm": 0.8077614903450012, + "learning_rate": 9.766381882870635e-06, + "loss": 0.7724, + "step": 3640 + }, + { + "epoch": 0.20039627937696075, + "grad_norm": 0.8886739015579224, + "learning_rate": 9.766250914880994e-06, + "loss": 0.8318, + "step": 3641 + }, + { + "epoch": 0.2004513181793164, + "grad_norm": 0.8086267113685608, + "learning_rate": 9.76611991106949e-06, + "loss": 0.8494, + "step": 3642 + }, + { + "epoch": 0.20050635698167207, + "grad_norm": 0.8606873750686646, + "learning_rate": 9.765988871437101e-06, + "loss": 0.8488, + "step": 3643 + }, + { + "epoch": 0.20056139578402774, + "grad_norm": 0.6966355443000793, + "learning_rate": 9.76585779598482e-06, + "loss": 0.7361, + "step": 3644 + }, + { + "epoch": 0.2006164345863834, + "grad_norm": 0.8474385738372803, + "learning_rate": 9.765726684713623e-06, + "loss": 0.8354, + "step": 3645 + }, + { + "epoch": 0.20067147338873906, + "grad_norm": 0.7609736919403076, + "learning_rate": 9.765595537624502e-06, + "loss": 0.7297, + "step": 3646 + }, + { + "epoch": 0.20072651219109472, + "grad_norm": 1.08648681640625, + "learning_rate": 9.76546435471844e-06, + "loss": 0.7534, + "step": 3647 + }, + { + "epoch": 0.20078155099345038, + "grad_norm": 0.7437332272529602, + "learning_rate": 9.765333135996425e-06, + "loss": 0.8532, + "step": 3648 + }, + { + "epoch": 0.20083658979580604, + "grad_norm": 0.9016552567481995, + "learning_rate": 9.76520188145944e-06, + "loss": 0.7968, + "step": 3649 + }, + { + "epoch": 0.2008916285981617, + "grad_norm": 0.8916428089141846, + "learning_rate": 9.765070591108473e-06, + "loss": 0.9601, + "step": 3650 + }, + { + "epoch": 0.20094666740051736, + "grad_norm": 0.7679058313369751, + "learning_rate": 9.764939264944512e-06, + "loss": 0.816, + "step": 3651 + }, + { + "epoch": 0.20100170620287303, + "grad_norm": 0.7716549634933472, + "learning_rate": 9.764807902968543e-06, + "loss": 0.876, + "step": 3652 + }, + { + "epoch": 0.2010567450052287, + "grad_norm": 0.8288074731826782, + "learning_rate": 9.764676505181554e-06, + "loss": 0.8054, + "step": 3653 + }, + { + "epoch": 0.20111178380758435, + "grad_norm": 0.7906842827796936, + "learning_rate": 9.76454507158453e-06, + "loss": 0.8026, + "step": 3654 + }, + { + "epoch": 0.20116682260994, + "grad_norm": 0.8093311190605164, + "learning_rate": 9.764413602178461e-06, + "loss": 0.8093, + "step": 3655 + }, + { + "epoch": 0.20122186141229567, + "grad_norm": 0.7234730124473572, + "learning_rate": 9.764282096964335e-06, + "loss": 0.7194, + "step": 3656 + }, + { + "epoch": 0.20127690021465133, + "grad_norm": 0.9048555493354797, + "learning_rate": 9.76415055594314e-06, + "loss": 0.8996, + "step": 3657 + }, + { + "epoch": 0.201331939017007, + "grad_norm": 0.7630691528320312, + "learning_rate": 9.764018979115864e-06, + "loss": 0.7876, + "step": 3658 + }, + { + "epoch": 0.20138697781936266, + "grad_norm": 0.9551032781600952, + "learning_rate": 9.763887366483498e-06, + "loss": 0.8249, + "step": 3659 + }, + { + "epoch": 0.20144201662171832, + "grad_norm": 0.6988314986228943, + "learning_rate": 9.76375571804703e-06, + "loss": 0.8011, + "step": 3660 + }, + { + "epoch": 0.20149705542407398, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.763624033807448e-06, + "loss": 0.8287, + "step": 3661 + }, + { + "epoch": 0.20155209422642964, + "grad_norm": 0.7201293706893921, + "learning_rate": 9.763492313765743e-06, + "loss": 0.7854, + "step": 3662 + }, + { + "epoch": 0.2016071330287853, + "grad_norm": 0.8691730499267578, + "learning_rate": 9.763360557922905e-06, + "loss": 0.8348, + "step": 3663 + }, + { + "epoch": 0.20166217183114096, + "grad_norm": 0.7660881876945496, + "learning_rate": 9.763228766279924e-06, + "loss": 0.7686, + "step": 3664 + }, + { + "epoch": 0.20171721063349662, + "grad_norm": 1.083796501159668, + "learning_rate": 9.76309693883779e-06, + "loss": 0.8848, + "step": 3665 + }, + { + "epoch": 0.20177224943585229, + "grad_norm": 0.7892678380012512, + "learning_rate": 9.762965075597496e-06, + "loss": 0.7804, + "step": 3666 + }, + { + "epoch": 0.20182728823820795, + "grad_norm": 0.7166122198104858, + "learning_rate": 9.762833176560031e-06, + "loss": 0.761, + "step": 3667 + }, + { + "epoch": 0.2018823270405636, + "grad_norm": 0.8187084794044495, + "learning_rate": 9.762701241726386e-06, + "loss": 0.8251, + "step": 3668 + }, + { + "epoch": 0.20193736584291927, + "grad_norm": 0.6930577158927917, + "learning_rate": 9.762569271097556e-06, + "loss": 0.6795, + "step": 3669 + }, + { + "epoch": 0.20199240464527493, + "grad_norm": 0.8085465431213379, + "learning_rate": 9.762437264674527e-06, + "loss": 0.8415, + "step": 3670 + }, + { + "epoch": 0.2020474434476306, + "grad_norm": 0.8111084699630737, + "learning_rate": 9.762305222458294e-06, + "loss": 0.792, + "step": 3671 + }, + { + "epoch": 0.20210248224998625, + "grad_norm": 0.8200401067733765, + "learning_rate": 9.762173144449852e-06, + "loss": 0.8224, + "step": 3672 + }, + { + "epoch": 0.2021575210523419, + "grad_norm": 0.8460109233856201, + "learning_rate": 9.762041030650192e-06, + "loss": 0.9025, + "step": 3673 + }, + { + "epoch": 0.20221255985469755, + "grad_norm": 0.8152671456336975, + "learning_rate": 9.761908881060303e-06, + "loss": 0.9002, + "step": 3674 + }, + { + "epoch": 0.2022675986570532, + "grad_norm": 0.8204773664474487, + "learning_rate": 9.761776695681185e-06, + "loss": 0.8324, + "step": 3675 + }, + { + "epoch": 0.20232263745940887, + "grad_norm": 0.8121044039726257, + "learning_rate": 9.761644474513825e-06, + "loss": 0.855, + "step": 3676 + }, + { + "epoch": 0.20237767626176453, + "grad_norm": 0.79920494556427, + "learning_rate": 9.76151221755922e-06, + "loss": 0.7837, + "step": 3677 + }, + { + "epoch": 0.2024327150641202, + "grad_norm": 0.862808346748352, + "learning_rate": 9.761379924818367e-06, + "loss": 0.8714, + "step": 3678 + }, + { + "epoch": 0.20248775386647586, + "grad_norm": 0.7135004997253418, + "learning_rate": 9.761247596292254e-06, + "loss": 0.774, + "step": 3679 + }, + { + "epoch": 0.20254279266883152, + "grad_norm": 0.7967603802680969, + "learning_rate": 9.761115231981878e-06, + "loss": 0.919, + "step": 3680 + }, + { + "epoch": 0.20259783147118718, + "grad_norm": 0.7425099611282349, + "learning_rate": 9.760982831888236e-06, + "loss": 0.819, + "step": 3681 + }, + { + "epoch": 0.20265287027354284, + "grad_norm": 0.7631763815879822, + "learning_rate": 9.760850396012323e-06, + "loss": 0.816, + "step": 3682 + }, + { + "epoch": 0.2027079090758985, + "grad_norm": 0.7931755185127258, + "learning_rate": 9.76071792435513e-06, + "loss": 0.8299, + "step": 3683 + }, + { + "epoch": 0.20276294787825416, + "grad_norm": 0.8409438729286194, + "learning_rate": 9.760585416917657e-06, + "loss": 0.8503, + "step": 3684 + }, + { + "epoch": 0.20281798668060982, + "grad_norm": 0.7632728815078735, + "learning_rate": 9.760452873700898e-06, + "loss": 0.8394, + "step": 3685 + }, + { + "epoch": 0.20287302548296549, + "grad_norm": 0.7765083312988281, + "learning_rate": 9.76032029470585e-06, + "loss": 0.8879, + "step": 3686 + }, + { + "epoch": 0.20292806428532115, + "grad_norm": 0.7736936807632446, + "learning_rate": 9.760187679933507e-06, + "loss": 0.7987, + "step": 3687 + }, + { + "epoch": 0.2029831030876768, + "grad_norm": 0.8270270824432373, + "learning_rate": 9.760055029384869e-06, + "loss": 0.8267, + "step": 3688 + }, + { + "epoch": 0.20303814189003247, + "grad_norm": 0.7742369174957275, + "learning_rate": 9.759922343060932e-06, + "loss": 0.8447, + "step": 3689 + }, + { + "epoch": 0.20309318069238813, + "grad_norm": 0.7543869018554688, + "learning_rate": 9.759789620962692e-06, + "loss": 0.7325, + "step": 3690 + }, + { + "epoch": 0.2031482194947438, + "grad_norm": 0.7913174033164978, + "learning_rate": 9.759656863091147e-06, + "loss": 0.8622, + "step": 3691 + }, + { + "epoch": 0.20320325829709945, + "grad_norm": 0.7445376515388489, + "learning_rate": 9.759524069447296e-06, + "loss": 0.7115, + "step": 3692 + }, + { + "epoch": 0.20325829709945512, + "grad_norm": 0.7744696140289307, + "learning_rate": 9.759391240032136e-06, + "loss": 0.8437, + "step": 3693 + }, + { + "epoch": 0.20331333590181078, + "grad_norm": 0.6984724998474121, + "learning_rate": 9.759258374846665e-06, + "loss": 0.7415, + "step": 3694 + }, + { + "epoch": 0.20336837470416644, + "grad_norm": 0.7453249096870422, + "learning_rate": 9.759125473891882e-06, + "loss": 0.7708, + "step": 3695 + }, + { + "epoch": 0.2034234135065221, + "grad_norm": 0.7459438443183899, + "learning_rate": 9.758992537168787e-06, + "loss": 0.7961, + "step": 3696 + }, + { + "epoch": 0.20347845230887776, + "grad_norm": 0.808944582939148, + "learning_rate": 9.758859564678377e-06, + "loss": 0.8875, + "step": 3697 + }, + { + "epoch": 0.20353349111123342, + "grad_norm": 0.7202889323234558, + "learning_rate": 9.758726556421652e-06, + "loss": 0.8064, + "step": 3698 + }, + { + "epoch": 0.20358852991358908, + "grad_norm": 0.7874952554702759, + "learning_rate": 9.758593512399613e-06, + "loss": 0.7881, + "step": 3699 + }, + { + "epoch": 0.20364356871594474, + "grad_norm": 0.771300733089447, + "learning_rate": 9.758460432613259e-06, + "loss": 0.8938, + "step": 3700 + }, + { + "epoch": 0.2036986075183004, + "grad_norm": 0.7332000136375427, + "learning_rate": 9.758327317063589e-06, + "loss": 0.7369, + "step": 3701 + }, + { + "epoch": 0.20375364632065607, + "grad_norm": 0.8206236958503723, + "learning_rate": 9.758194165751604e-06, + "loss": 0.8727, + "step": 3702 + }, + { + "epoch": 0.20380868512301173, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.758060978678308e-06, + "loss": 0.8013, + "step": 3703 + }, + { + "epoch": 0.2038637239253674, + "grad_norm": 0.7213704586029053, + "learning_rate": 9.757927755844698e-06, + "loss": 0.7413, + "step": 3704 + }, + { + "epoch": 0.20391876272772305, + "grad_norm": 0.8982640504837036, + "learning_rate": 9.757794497251776e-06, + "loss": 0.9077, + "step": 3705 + }, + { + "epoch": 0.2039738015300787, + "grad_norm": 0.8439363241195679, + "learning_rate": 9.757661202900544e-06, + "loss": 0.7887, + "step": 3706 + }, + { + "epoch": 0.20402884033243437, + "grad_norm": 0.7700560688972473, + "learning_rate": 9.757527872792005e-06, + "loss": 0.8677, + "step": 3707 + }, + { + "epoch": 0.20408387913479004, + "grad_norm": 0.7462438941001892, + "learning_rate": 9.75739450692716e-06, + "loss": 0.7937, + "step": 3708 + }, + { + "epoch": 0.2041389179371457, + "grad_norm": 0.9125999808311462, + "learning_rate": 9.75726110530701e-06, + "loss": 0.9374, + "step": 3709 + }, + { + "epoch": 0.20419395673950136, + "grad_norm": 0.8949875831604004, + "learning_rate": 9.75712766793256e-06, + "loss": 0.8586, + "step": 3710 + }, + { + "epoch": 0.20424899554185702, + "grad_norm": 0.9042442440986633, + "learning_rate": 9.756994194804812e-06, + "loss": 0.9411, + "step": 3711 + }, + { + "epoch": 0.20430403434421268, + "grad_norm": 0.7646238207817078, + "learning_rate": 9.756860685924769e-06, + "loss": 0.8353, + "step": 3712 + }, + { + "epoch": 0.20435907314656834, + "grad_norm": 0.7551934123039246, + "learning_rate": 9.756727141293434e-06, + "loss": 0.8109, + "step": 3713 + }, + { + "epoch": 0.204414111948924, + "grad_norm": 0.7526532411575317, + "learning_rate": 9.756593560911811e-06, + "loss": 0.8509, + "step": 3714 + }, + { + "epoch": 0.20446915075127967, + "grad_norm": 0.8423319458961487, + "learning_rate": 9.756459944780903e-06, + "loss": 0.9003, + "step": 3715 + }, + { + "epoch": 0.2045241895536353, + "grad_norm": 0.7966015934944153, + "learning_rate": 9.756326292901716e-06, + "loss": 0.7606, + "step": 3716 + }, + { + "epoch": 0.20457922835599096, + "grad_norm": 0.7642805576324463, + "learning_rate": 9.756192605275256e-06, + "loss": 0.8321, + "step": 3717 + }, + { + "epoch": 0.20463426715834662, + "grad_norm": 0.7285729646682739, + "learning_rate": 9.756058881902524e-06, + "loss": 0.7375, + "step": 3718 + }, + { + "epoch": 0.20468930596070228, + "grad_norm": 0.852020263671875, + "learning_rate": 9.755925122784525e-06, + "loss": 0.8207, + "step": 3719 + }, + { + "epoch": 0.20474434476305794, + "grad_norm": 0.8227072358131409, + "learning_rate": 9.755791327922268e-06, + "loss": 0.872, + "step": 3720 + }, + { + "epoch": 0.2047993835654136, + "grad_norm": 1.0128127336502075, + "learning_rate": 9.755657497316755e-06, + "loss": 0.9186, + "step": 3721 + }, + { + "epoch": 0.20485442236776927, + "grad_norm": 0.8208017349243164, + "learning_rate": 9.755523630968994e-06, + "loss": 0.6968, + "step": 3722 + }, + { + "epoch": 0.20490946117012493, + "grad_norm": 0.7716407179832458, + "learning_rate": 9.75538972887999e-06, + "loss": 0.8068, + "step": 3723 + }, + { + "epoch": 0.2049644999724806, + "grad_norm": 0.779608964920044, + "learning_rate": 9.75525579105075e-06, + "loss": 0.6968, + "step": 3724 + }, + { + "epoch": 0.20501953877483625, + "grad_norm": 0.7463479042053223, + "learning_rate": 9.75512181748228e-06, + "loss": 0.7581, + "step": 3725 + }, + { + "epoch": 0.2050745775771919, + "grad_norm": 0.8104956150054932, + "learning_rate": 9.754987808175587e-06, + "loss": 0.7838, + "step": 3726 + }, + { + "epoch": 0.20512961637954757, + "grad_norm": 0.7911564707756042, + "learning_rate": 9.75485376313168e-06, + "loss": 0.848, + "step": 3727 + }, + { + "epoch": 0.20518465518190324, + "grad_norm": 0.8340871334075928, + "learning_rate": 9.754719682351564e-06, + "loss": 0.7879, + "step": 3728 + }, + { + "epoch": 0.2052396939842589, + "grad_norm": 1.5543067455291748, + "learning_rate": 9.754585565836247e-06, + "loss": 0.8091, + "step": 3729 + }, + { + "epoch": 0.20529473278661456, + "grad_norm": 0.8262580633163452, + "learning_rate": 9.754451413586739e-06, + "loss": 0.9076, + "step": 3730 + }, + { + "epoch": 0.20534977158897022, + "grad_norm": 0.7558280825614929, + "learning_rate": 9.754317225604045e-06, + "loss": 0.7781, + "step": 3731 + }, + { + "epoch": 0.20540481039132588, + "grad_norm": 0.7197710275650024, + "learning_rate": 9.754183001889177e-06, + "loss": 0.765, + "step": 3732 + }, + { + "epoch": 0.20545984919368154, + "grad_norm": 0.8053440451622009, + "learning_rate": 9.754048742443141e-06, + "loss": 0.7986, + "step": 3733 + }, + { + "epoch": 0.2055148879960372, + "grad_norm": 0.9183983206748962, + "learning_rate": 9.753914447266947e-06, + "loss": 0.8522, + "step": 3734 + }, + { + "epoch": 0.20556992679839287, + "grad_norm": 0.8095504641532898, + "learning_rate": 9.753780116361607e-06, + "loss": 0.7243, + "step": 3735 + }, + { + "epoch": 0.20562496560074853, + "grad_norm": 0.816818356513977, + "learning_rate": 9.753645749728127e-06, + "loss": 0.8262, + "step": 3736 + }, + { + "epoch": 0.2056800044031042, + "grad_norm": 0.8425988554954529, + "learning_rate": 9.753511347367516e-06, + "loss": 0.8142, + "step": 3737 + }, + { + "epoch": 0.20573504320545985, + "grad_norm": 0.7719724178314209, + "learning_rate": 9.753376909280789e-06, + "loss": 0.8444, + "step": 3738 + }, + { + "epoch": 0.2057900820078155, + "grad_norm": 0.877646803855896, + "learning_rate": 9.753242435468952e-06, + "loss": 0.8515, + "step": 3739 + }, + { + "epoch": 0.20584512081017117, + "grad_norm": 0.9261211156845093, + "learning_rate": 9.753107925933017e-06, + "loss": 0.7605, + "step": 3740 + }, + { + "epoch": 0.20590015961252683, + "grad_norm": 0.7790889739990234, + "learning_rate": 9.752973380673995e-06, + "loss": 0.7911, + "step": 3741 + }, + { + "epoch": 0.2059551984148825, + "grad_norm": 0.7112367153167725, + "learning_rate": 9.752838799692899e-06, + "loss": 0.8212, + "step": 3742 + }, + { + "epoch": 0.20601023721723816, + "grad_norm": 0.7568365335464478, + "learning_rate": 9.752704182990736e-06, + "loss": 0.8505, + "step": 3743 + }, + { + "epoch": 0.20606527601959382, + "grad_norm": 0.7501981258392334, + "learning_rate": 9.752569530568523e-06, + "loss": 0.8191, + "step": 3744 + }, + { + "epoch": 0.20612031482194948, + "grad_norm": 0.7822220325469971, + "learning_rate": 9.752434842427268e-06, + "loss": 0.8032, + "step": 3745 + }, + { + "epoch": 0.20617535362430514, + "grad_norm": 0.810197114944458, + "learning_rate": 9.752300118567987e-06, + "loss": 0.7789, + "step": 3746 + }, + { + "epoch": 0.2062303924266608, + "grad_norm": 0.7386943101882935, + "learning_rate": 9.752165358991688e-06, + "loss": 0.7733, + "step": 3747 + }, + { + "epoch": 0.20628543122901646, + "grad_norm": 0.7086807489395142, + "learning_rate": 9.75203056369939e-06, + "loss": 0.6328, + "step": 3748 + }, + { + "epoch": 0.20634047003137213, + "grad_norm": 0.9881154894828796, + "learning_rate": 9.751895732692099e-06, + "loss": 0.8515, + "step": 3749 + }, + { + "epoch": 0.2063955088337278, + "grad_norm": 0.813521683216095, + "learning_rate": 9.751760865970831e-06, + "loss": 0.8438, + "step": 3750 + }, + { + "epoch": 0.20645054763608345, + "grad_norm": 0.8357470631599426, + "learning_rate": 9.751625963536602e-06, + "loss": 0.7635, + "step": 3751 + }, + { + "epoch": 0.2065055864384391, + "grad_norm": 0.8629693388938904, + "learning_rate": 9.751491025390423e-06, + "loss": 0.888, + "step": 3752 + }, + { + "epoch": 0.20656062524079477, + "grad_norm": 0.8844664096832275, + "learning_rate": 9.751356051533311e-06, + "loss": 0.7654, + "step": 3753 + }, + { + "epoch": 0.20661566404315043, + "grad_norm": 0.7006319165229797, + "learning_rate": 9.751221041966276e-06, + "loss": 0.7618, + "step": 3754 + }, + { + "epoch": 0.2066707028455061, + "grad_norm": 0.9291046261787415, + "learning_rate": 9.75108599669034e-06, + "loss": 0.8485, + "step": 3755 + }, + { + "epoch": 0.20672574164786175, + "grad_norm": 0.7670828700065613, + "learning_rate": 9.75095091570651e-06, + "loss": 0.7856, + "step": 3756 + }, + { + "epoch": 0.20678078045021742, + "grad_norm": 0.8709883689880371, + "learning_rate": 9.750815799015804e-06, + "loss": 0.7983, + "step": 3757 + }, + { + "epoch": 0.20683581925257308, + "grad_norm": 0.7688055634498596, + "learning_rate": 9.750680646619241e-06, + "loss": 0.8064, + "step": 3758 + }, + { + "epoch": 0.2068908580549287, + "grad_norm": 0.9492738246917725, + "learning_rate": 9.750545458517832e-06, + "loss": 0.8256, + "step": 3759 + }, + { + "epoch": 0.20694589685728437, + "grad_norm": 0.9685352444648743, + "learning_rate": 9.750410234712596e-06, + "loss": 0.839, + "step": 3760 + }, + { + "epoch": 0.20700093565964003, + "grad_norm": 0.788577139377594, + "learning_rate": 9.750274975204547e-06, + "loss": 0.8743, + "step": 3761 + }, + { + "epoch": 0.2070559744619957, + "grad_norm": 0.8496370315551758, + "learning_rate": 9.750139679994703e-06, + "loss": 0.9286, + "step": 3762 + }, + { + "epoch": 0.20711101326435136, + "grad_norm": 0.9539788961410522, + "learning_rate": 9.750004349084083e-06, + "loss": 0.7568, + "step": 3763 + }, + { + "epoch": 0.20716605206670702, + "grad_norm": 0.8825643062591553, + "learning_rate": 9.7498689824737e-06, + "loss": 0.9339, + "step": 3764 + }, + { + "epoch": 0.20722109086906268, + "grad_norm": 0.7771373391151428, + "learning_rate": 9.749733580164573e-06, + "loss": 0.851, + "step": 3765 + }, + { + "epoch": 0.20727612967141834, + "grad_norm": 0.7460281252861023, + "learning_rate": 9.749598142157721e-06, + "loss": 0.8208, + "step": 3766 + }, + { + "epoch": 0.207331168473774, + "grad_norm": 0.8370739817619324, + "learning_rate": 9.74946266845416e-06, + "loss": 0.8634, + "step": 3767 + }, + { + "epoch": 0.20738620727612966, + "grad_norm": 0.7770463228225708, + "learning_rate": 9.749327159054907e-06, + "loss": 0.7955, + "step": 3768 + }, + { + "epoch": 0.20744124607848533, + "grad_norm": 0.8048208355903625, + "learning_rate": 9.749191613960985e-06, + "loss": 0.7736, + "step": 3769 + }, + { + "epoch": 0.207496284880841, + "grad_norm": 0.9187547564506531, + "learning_rate": 9.74905603317341e-06, + "loss": 0.8534, + "step": 3770 + }, + { + "epoch": 0.20755132368319665, + "grad_norm": 0.7304024696350098, + "learning_rate": 9.7489204166932e-06, + "loss": 0.72, + "step": 3771 + }, + { + "epoch": 0.2076063624855523, + "grad_norm": 0.86177659034729, + "learning_rate": 9.748784764521376e-06, + "loss": 0.7838, + "step": 3772 + }, + { + "epoch": 0.20766140128790797, + "grad_norm": 0.7988011837005615, + "learning_rate": 9.748649076658956e-06, + "loss": 0.7776, + "step": 3773 + }, + { + "epoch": 0.20771644009026363, + "grad_norm": 0.706099808216095, + "learning_rate": 9.74851335310696e-06, + "loss": 0.759, + "step": 3774 + }, + { + "epoch": 0.2077714788926193, + "grad_norm": 0.8125914931297302, + "learning_rate": 9.748377593866412e-06, + "loss": 0.8155, + "step": 3775 + }, + { + "epoch": 0.20782651769497495, + "grad_norm": 0.8603429794311523, + "learning_rate": 9.748241798938326e-06, + "loss": 0.8018, + "step": 3776 + }, + { + "epoch": 0.20788155649733062, + "grad_norm": 0.7735254764556885, + "learning_rate": 9.748105968323726e-06, + "loss": 0.7788, + "step": 3777 + }, + { + "epoch": 0.20793659529968628, + "grad_norm": 0.9037501811981201, + "learning_rate": 9.747970102023635e-06, + "loss": 0.8907, + "step": 3778 + }, + { + "epoch": 0.20799163410204194, + "grad_norm": 0.8781846761703491, + "learning_rate": 9.74783420003907e-06, + "loss": 0.867, + "step": 3779 + }, + { + "epoch": 0.2080466729043976, + "grad_norm": 0.8486423492431641, + "learning_rate": 9.747698262371052e-06, + "loss": 0.817, + "step": 3780 + }, + { + "epoch": 0.20810171170675326, + "grad_norm": 0.8242751359939575, + "learning_rate": 9.747562289020607e-06, + "loss": 0.7385, + "step": 3781 + }, + { + "epoch": 0.20815675050910892, + "grad_norm": 0.8776529431343079, + "learning_rate": 9.747426279988754e-06, + "loss": 0.8222, + "step": 3782 + }, + { + "epoch": 0.20821178931146458, + "grad_norm": 0.7428975105285645, + "learning_rate": 9.747290235276517e-06, + "loss": 0.6954, + "step": 3783 + }, + { + "epoch": 0.20826682811382025, + "grad_norm": 0.8631997108459473, + "learning_rate": 9.747154154884917e-06, + "loss": 0.7956, + "step": 3784 + }, + { + "epoch": 0.2083218669161759, + "grad_norm": 0.7819229364395142, + "learning_rate": 9.747018038814976e-06, + "loss": 0.778, + "step": 3785 + }, + { + "epoch": 0.20837690571853157, + "grad_norm": 0.7770963311195374, + "learning_rate": 9.746881887067718e-06, + "loss": 0.8055, + "step": 3786 + }, + { + "epoch": 0.20843194452088723, + "grad_norm": 0.7168729305267334, + "learning_rate": 9.746745699644169e-06, + "loss": 0.7476, + "step": 3787 + }, + { + "epoch": 0.2084869833232429, + "grad_norm": 0.7963632941246033, + "learning_rate": 9.746609476545348e-06, + "loss": 0.8083, + "step": 3788 + }, + { + "epoch": 0.20854202212559855, + "grad_norm": 0.6689679026603699, + "learning_rate": 9.746473217772281e-06, + "loss": 0.6687, + "step": 3789 + }, + { + "epoch": 0.20859706092795421, + "grad_norm": 0.8085560202598572, + "learning_rate": 9.746336923325991e-06, + "loss": 0.8221, + "step": 3790 + }, + { + "epoch": 0.20865209973030988, + "grad_norm": 0.7215744256973267, + "learning_rate": 9.746200593207505e-06, + "loss": 0.7261, + "step": 3791 + }, + { + "epoch": 0.20870713853266554, + "grad_norm": 0.7821729779243469, + "learning_rate": 9.746064227417844e-06, + "loss": 0.7683, + "step": 3792 + }, + { + "epoch": 0.2087621773350212, + "grad_norm": 1.0014925003051758, + "learning_rate": 9.745927825958036e-06, + "loss": 0.7485, + "step": 3793 + }, + { + "epoch": 0.20881721613737686, + "grad_norm": 0.9447367787361145, + "learning_rate": 9.745791388829102e-06, + "loss": 0.835, + "step": 3794 + }, + { + "epoch": 0.20887225493973252, + "grad_norm": 0.7333751916885376, + "learning_rate": 9.745654916032073e-06, + "loss": 0.811, + "step": 3795 + }, + { + "epoch": 0.20892729374208818, + "grad_norm": 0.7516912221908569, + "learning_rate": 9.745518407567973e-06, + "loss": 0.7669, + "step": 3796 + }, + { + "epoch": 0.20898233254444384, + "grad_norm": 0.7826053500175476, + "learning_rate": 9.745381863437824e-06, + "loss": 0.7963, + "step": 3797 + }, + { + "epoch": 0.2090373713467995, + "grad_norm": 0.8258751630783081, + "learning_rate": 9.745245283642658e-06, + "loss": 0.7929, + "step": 3798 + }, + { + "epoch": 0.20909241014915517, + "grad_norm": 0.7990522980690002, + "learning_rate": 9.745108668183497e-06, + "loss": 0.8518, + "step": 3799 + }, + { + "epoch": 0.20914744895151083, + "grad_norm": 1.3855403661727905, + "learning_rate": 9.744972017061369e-06, + "loss": 0.7768, + "step": 3800 + }, + { + "epoch": 0.2092024877538665, + "grad_norm": 0.8456707000732422, + "learning_rate": 9.744835330277302e-06, + "loss": 0.7629, + "step": 3801 + }, + { + "epoch": 0.20925752655622212, + "grad_norm": 0.8992564678192139, + "learning_rate": 9.744698607832323e-06, + "loss": 0.8991, + "step": 3802 + }, + { + "epoch": 0.20931256535857778, + "grad_norm": 0.8533509969711304, + "learning_rate": 9.744561849727459e-06, + "loss": 0.8883, + "step": 3803 + }, + { + "epoch": 0.20936760416093345, + "grad_norm": 0.8363122940063477, + "learning_rate": 9.744425055963739e-06, + "loss": 0.8537, + "step": 3804 + }, + { + "epoch": 0.2094226429632891, + "grad_norm": 0.7462213039398193, + "learning_rate": 9.744288226542189e-06, + "loss": 0.7713, + "step": 3805 + }, + { + "epoch": 0.20947768176564477, + "grad_norm": 0.8148539066314697, + "learning_rate": 9.744151361463841e-06, + "loss": 0.7887, + "step": 3806 + }, + { + "epoch": 0.20953272056800043, + "grad_norm": 0.7504319548606873, + "learning_rate": 9.744014460729718e-06, + "loss": 0.7385, + "step": 3807 + }, + { + "epoch": 0.2095877593703561, + "grad_norm": 0.9291114807128906, + "learning_rate": 9.743877524340854e-06, + "loss": 0.9886, + "step": 3808 + }, + { + "epoch": 0.20964279817271175, + "grad_norm": 0.7747925519943237, + "learning_rate": 9.743740552298276e-06, + "loss": 0.8772, + "step": 3809 + }, + { + "epoch": 0.20969783697506741, + "grad_norm": 0.7283097505569458, + "learning_rate": 9.743603544603016e-06, + "loss": 0.7403, + "step": 3810 + }, + { + "epoch": 0.20975287577742308, + "grad_norm": 0.8403457999229431, + "learning_rate": 9.743466501256098e-06, + "loss": 0.7998, + "step": 3811 + }, + { + "epoch": 0.20980791457977874, + "grad_norm": 0.8218665719032288, + "learning_rate": 9.743329422258557e-06, + "loss": 0.8019, + "step": 3812 + }, + { + "epoch": 0.2098629533821344, + "grad_norm": 0.6991317868232727, + "learning_rate": 9.743192307611423e-06, + "loss": 0.743, + "step": 3813 + }, + { + "epoch": 0.20991799218449006, + "grad_norm": 0.767295241355896, + "learning_rate": 9.743055157315725e-06, + "loss": 0.8003, + "step": 3814 + }, + { + "epoch": 0.20997303098684572, + "grad_norm": 0.9457303285598755, + "learning_rate": 9.742917971372492e-06, + "loss": 0.8448, + "step": 3815 + }, + { + "epoch": 0.21002806978920138, + "grad_norm": 0.7839058637619019, + "learning_rate": 9.742780749782758e-06, + "loss": 0.8828, + "step": 3816 + }, + { + "epoch": 0.21008310859155704, + "grad_norm": 0.7831344604492188, + "learning_rate": 9.742643492547553e-06, + "loss": 0.7714, + "step": 3817 + }, + { + "epoch": 0.2101381473939127, + "grad_norm": 0.7637175917625427, + "learning_rate": 9.74250619966791e-06, + "loss": 0.7508, + "step": 3818 + }, + { + "epoch": 0.21019318619626837, + "grad_norm": 0.8830221891403198, + "learning_rate": 9.74236887114486e-06, + "loss": 0.8508, + "step": 3819 + }, + { + "epoch": 0.21024822499862403, + "grad_norm": 0.7803365588188171, + "learning_rate": 9.742231506979434e-06, + "loss": 0.8094, + "step": 3820 + }, + { + "epoch": 0.2103032638009797, + "grad_norm": 0.7701493501663208, + "learning_rate": 9.742094107172666e-06, + "loss": 0.8851, + "step": 3821 + }, + { + "epoch": 0.21035830260333535, + "grad_norm": 0.6434544324874878, + "learning_rate": 9.741956671725588e-06, + "loss": 0.7015, + "step": 3822 + }, + { + "epoch": 0.210413341405691, + "grad_norm": 0.7294684052467346, + "learning_rate": 9.741819200639233e-06, + "loss": 0.7357, + "step": 3823 + }, + { + "epoch": 0.21046838020804667, + "grad_norm": 0.702367901802063, + "learning_rate": 9.741681693914635e-06, + "loss": 0.7518, + "step": 3824 + }, + { + "epoch": 0.21052341901040234, + "grad_norm": 0.7567246556282043, + "learning_rate": 9.741544151552826e-06, + "loss": 0.8259, + "step": 3825 + }, + { + "epoch": 0.210578457812758, + "grad_norm": 0.7478607892990112, + "learning_rate": 9.741406573554841e-06, + "loss": 0.81, + "step": 3826 + }, + { + "epoch": 0.21063349661511366, + "grad_norm": 0.7270129323005676, + "learning_rate": 9.741268959921712e-06, + "loss": 0.8201, + "step": 3827 + }, + { + "epoch": 0.21068853541746932, + "grad_norm": 0.8108176589012146, + "learning_rate": 9.741131310654475e-06, + "loss": 0.8425, + "step": 3828 + }, + { + "epoch": 0.21074357421982498, + "grad_norm": 0.7773691415786743, + "learning_rate": 9.740993625754165e-06, + "loss": 0.8372, + "step": 3829 + }, + { + "epoch": 0.21079861302218064, + "grad_norm": 0.8988421559333801, + "learning_rate": 9.740855905221816e-06, + "loss": 0.8285, + "step": 3830 + }, + { + "epoch": 0.2108536518245363, + "grad_norm": 0.7339534759521484, + "learning_rate": 9.740718149058462e-06, + "loss": 0.7567, + "step": 3831 + }, + { + "epoch": 0.21090869062689196, + "grad_norm": 0.8465108275413513, + "learning_rate": 9.740580357265141e-06, + "loss": 0.8747, + "step": 3832 + }, + { + "epoch": 0.21096372942924763, + "grad_norm": 0.7956714034080505, + "learning_rate": 9.740442529842885e-06, + "loss": 0.7665, + "step": 3833 + }, + { + "epoch": 0.2110187682316033, + "grad_norm": 0.96270751953125, + "learning_rate": 9.740304666792733e-06, + "loss": 0.8338, + "step": 3834 + }, + { + "epoch": 0.21107380703395895, + "grad_norm": 0.812329113483429, + "learning_rate": 9.74016676811572e-06, + "loss": 0.8407, + "step": 3835 + }, + { + "epoch": 0.2111288458363146, + "grad_norm": 0.7975192070007324, + "learning_rate": 9.740028833812882e-06, + "loss": 0.7836, + "step": 3836 + }, + { + "epoch": 0.21118388463867027, + "grad_norm": 0.826621949672699, + "learning_rate": 9.739890863885258e-06, + "loss": 0.732, + "step": 3837 + }, + { + "epoch": 0.21123892344102593, + "grad_norm": 0.9015662670135498, + "learning_rate": 9.73975285833388e-06, + "loss": 0.8837, + "step": 3838 + }, + { + "epoch": 0.2112939622433816, + "grad_norm": 0.7641518712043762, + "learning_rate": 9.73961481715979e-06, + "loss": 0.7334, + "step": 3839 + }, + { + "epoch": 0.21134900104573726, + "grad_norm": 0.8062206506729126, + "learning_rate": 9.739476740364023e-06, + "loss": 0.8381, + "step": 3840 + }, + { + "epoch": 0.21140403984809292, + "grad_norm": 0.7301875352859497, + "learning_rate": 9.739338627947619e-06, + "loss": 0.7389, + "step": 3841 + }, + { + "epoch": 0.21145907865044858, + "grad_norm": 0.8995181322097778, + "learning_rate": 9.739200479911612e-06, + "loss": 0.8111, + "step": 3842 + }, + { + "epoch": 0.21151411745280424, + "grad_norm": 0.7154433131217957, + "learning_rate": 9.739062296257045e-06, + "loss": 0.7501, + "step": 3843 + }, + { + "epoch": 0.2115691562551599, + "grad_norm": 0.8403087854385376, + "learning_rate": 9.738924076984954e-06, + "loss": 0.8212, + "step": 3844 + }, + { + "epoch": 0.21162419505751554, + "grad_norm": 0.7616639137268066, + "learning_rate": 9.738785822096377e-06, + "loss": 0.82, + "step": 3845 + }, + { + "epoch": 0.2116792338598712, + "grad_norm": 0.7897970080375671, + "learning_rate": 9.738647531592356e-06, + "loss": 0.7972, + "step": 3846 + }, + { + "epoch": 0.21173427266222686, + "grad_norm": 0.7909015417098999, + "learning_rate": 9.738509205473928e-06, + "loss": 0.7939, + "step": 3847 + }, + { + "epoch": 0.21178931146458252, + "grad_norm": 0.9553212523460388, + "learning_rate": 9.73837084374213e-06, + "loss": 0.8672, + "step": 3848 + }, + { + "epoch": 0.21184435026693818, + "grad_norm": 0.9558283686637878, + "learning_rate": 9.73823244639801e-06, + "loss": 0.897, + "step": 3849 + }, + { + "epoch": 0.21189938906929384, + "grad_norm": 0.819530725479126, + "learning_rate": 9.7380940134426e-06, + "loss": 0.86, + "step": 3850 + }, + { + "epoch": 0.2119544278716495, + "grad_norm": 0.7301751971244812, + "learning_rate": 9.737955544876945e-06, + "loss": 0.8265, + "step": 3851 + }, + { + "epoch": 0.21200946667400516, + "grad_norm": 0.8564972281455994, + "learning_rate": 9.737817040702085e-06, + "loss": 0.8253, + "step": 3852 + }, + { + "epoch": 0.21206450547636083, + "grad_norm": 0.7715204358100891, + "learning_rate": 9.737678500919059e-06, + "loss": 0.7779, + "step": 3853 + }, + { + "epoch": 0.2121195442787165, + "grad_norm": 0.7296929955482483, + "learning_rate": 9.73753992552891e-06, + "loss": 0.787, + "step": 3854 + }, + { + "epoch": 0.21217458308107215, + "grad_norm": 0.8574217557907104, + "learning_rate": 9.73740131453268e-06, + "loss": 0.797, + "step": 3855 + }, + { + "epoch": 0.2122296218834278, + "grad_norm": 0.8320643901824951, + "learning_rate": 9.737262667931409e-06, + "loss": 0.876, + "step": 3856 + }, + { + "epoch": 0.21228466068578347, + "grad_norm": 0.7313587069511414, + "learning_rate": 9.73712398572614e-06, + "loss": 0.7151, + "step": 3857 + }, + { + "epoch": 0.21233969948813913, + "grad_norm": 0.7039312720298767, + "learning_rate": 9.736985267917916e-06, + "loss": 0.7353, + "step": 3858 + }, + { + "epoch": 0.2123947382904948, + "grad_norm": 0.7893409132957458, + "learning_rate": 9.736846514507776e-06, + "loss": 0.8383, + "step": 3859 + }, + { + "epoch": 0.21244977709285046, + "grad_norm": 0.8771371245384216, + "learning_rate": 9.736707725496767e-06, + "loss": 0.7543, + "step": 3860 + }, + { + "epoch": 0.21250481589520612, + "grad_norm": 1.0067707300186157, + "learning_rate": 9.736568900885932e-06, + "loss": 0.796, + "step": 3861 + }, + { + "epoch": 0.21255985469756178, + "grad_norm": 0.9171931743621826, + "learning_rate": 9.736430040676312e-06, + "loss": 0.8174, + "step": 3862 + }, + { + "epoch": 0.21261489349991744, + "grad_norm": 0.7616068720817566, + "learning_rate": 9.736291144868952e-06, + "loss": 0.7762, + "step": 3863 + }, + { + "epoch": 0.2126699323022731, + "grad_norm": 0.789010226726532, + "learning_rate": 9.736152213464895e-06, + "loss": 0.7749, + "step": 3864 + }, + { + "epoch": 0.21272497110462876, + "grad_norm": 0.7943348288536072, + "learning_rate": 9.736013246465187e-06, + "loss": 0.6687, + "step": 3865 + }, + { + "epoch": 0.21278000990698442, + "grad_norm": 0.8351758718490601, + "learning_rate": 9.73587424387087e-06, + "loss": 0.9201, + "step": 3866 + }, + { + "epoch": 0.21283504870934009, + "grad_norm": 0.7710975408554077, + "learning_rate": 9.735735205682991e-06, + "loss": 0.8357, + "step": 3867 + }, + { + "epoch": 0.21289008751169575, + "grad_norm": 0.8955768942832947, + "learning_rate": 9.73559613190259e-06, + "loss": 0.8396, + "step": 3868 + }, + { + "epoch": 0.2129451263140514, + "grad_norm": 0.8664666414260864, + "learning_rate": 9.735457022530722e-06, + "loss": 0.8176, + "step": 3869 + }, + { + "epoch": 0.21300016511640707, + "grad_norm": 0.7955949902534485, + "learning_rate": 9.735317877568424e-06, + "loss": 0.8421, + "step": 3870 + }, + { + "epoch": 0.21305520391876273, + "grad_norm": 0.849866509437561, + "learning_rate": 9.735178697016742e-06, + "loss": 0.7677, + "step": 3871 + }, + { + "epoch": 0.2131102427211184, + "grad_norm": 0.7782625555992126, + "learning_rate": 9.735039480876727e-06, + "loss": 0.7838, + "step": 3872 + }, + { + "epoch": 0.21316528152347405, + "grad_norm": 0.7734919190406799, + "learning_rate": 9.734900229149423e-06, + "loss": 0.757, + "step": 3873 + }, + { + "epoch": 0.21322032032582972, + "grad_norm": 0.8462040424346924, + "learning_rate": 9.734760941835876e-06, + "loss": 0.8841, + "step": 3874 + }, + { + "epoch": 0.21327535912818538, + "grad_norm": 0.7219869494438171, + "learning_rate": 9.734621618937133e-06, + "loss": 0.7651, + "step": 3875 + }, + { + "epoch": 0.21333039793054104, + "grad_norm": 0.7550874352455139, + "learning_rate": 9.734482260454241e-06, + "loss": 0.8032, + "step": 3876 + }, + { + "epoch": 0.2133854367328967, + "grad_norm": 0.7504588961601257, + "learning_rate": 9.734342866388247e-06, + "loss": 0.7923, + "step": 3877 + }, + { + "epoch": 0.21344047553525236, + "grad_norm": 0.7407390475273132, + "learning_rate": 9.7342034367402e-06, + "loss": 0.7569, + "step": 3878 + }, + { + "epoch": 0.21349551433760802, + "grad_norm": 0.7911562323570251, + "learning_rate": 9.734063971511147e-06, + "loss": 0.8726, + "step": 3879 + }, + { + "epoch": 0.21355055313996368, + "grad_norm": 0.9132450819015503, + "learning_rate": 9.733924470702139e-06, + "loss": 0.9445, + "step": 3880 + }, + { + "epoch": 0.21360559194231934, + "grad_norm": 0.9639442563056946, + "learning_rate": 9.733784934314218e-06, + "loss": 0.7307, + "step": 3881 + }, + { + "epoch": 0.213660630744675, + "grad_norm": 0.7724352478981018, + "learning_rate": 9.73364536234844e-06, + "loss": 0.8337, + "step": 3882 + }, + { + "epoch": 0.21371566954703067, + "grad_norm": 0.9643296599388123, + "learning_rate": 9.733505754805848e-06, + "loss": 0.8277, + "step": 3883 + }, + { + "epoch": 0.21377070834938633, + "grad_norm": 0.8135218620300293, + "learning_rate": 9.733366111687494e-06, + "loss": 0.7933, + "step": 3884 + }, + { + "epoch": 0.213825747151742, + "grad_norm": 0.7527105212211609, + "learning_rate": 9.733226432994426e-06, + "loss": 0.7302, + "step": 3885 + }, + { + "epoch": 0.21388078595409765, + "grad_norm": 1.090550184249878, + "learning_rate": 9.733086718727698e-06, + "loss": 0.8646, + "step": 3886 + }, + { + "epoch": 0.2139358247564533, + "grad_norm": 0.9227491617202759, + "learning_rate": 9.732946968888358e-06, + "loss": 0.8525, + "step": 3887 + }, + { + "epoch": 0.21399086355880895, + "grad_norm": 0.7781830430030823, + "learning_rate": 9.732807183477454e-06, + "loss": 0.8757, + "step": 3888 + }, + { + "epoch": 0.2140459023611646, + "grad_norm": 0.7740090489387512, + "learning_rate": 9.732667362496036e-06, + "loss": 0.7557, + "step": 3889 + }, + { + "epoch": 0.21410094116352027, + "grad_norm": 0.7341694831848145, + "learning_rate": 9.732527505945159e-06, + "loss": 0.7481, + "step": 3890 + }, + { + "epoch": 0.21415597996587593, + "grad_norm": 0.8691402673721313, + "learning_rate": 9.732387613825872e-06, + "loss": 0.8395, + "step": 3891 + }, + { + "epoch": 0.2142110187682316, + "grad_norm": 0.7845497131347656, + "learning_rate": 9.732247686139227e-06, + "loss": 0.6999, + "step": 3892 + }, + { + "epoch": 0.21426605757058725, + "grad_norm": 0.7944281697273254, + "learning_rate": 9.732107722886275e-06, + "loss": 0.7677, + "step": 3893 + }, + { + "epoch": 0.21432109637294292, + "grad_norm": 0.904195249080658, + "learning_rate": 9.731967724068065e-06, + "loss": 0.8429, + "step": 3894 + }, + { + "epoch": 0.21437613517529858, + "grad_norm": 0.7968988418579102, + "learning_rate": 9.731827689685655e-06, + "loss": 0.8224, + "step": 3895 + }, + { + "epoch": 0.21443117397765424, + "grad_norm": 0.773674726486206, + "learning_rate": 9.731687619740095e-06, + "loss": 0.7743, + "step": 3896 + }, + { + "epoch": 0.2144862127800099, + "grad_norm": 0.7873631715774536, + "learning_rate": 9.731547514232439e-06, + "loss": 0.8581, + "step": 3897 + }, + { + "epoch": 0.21454125158236556, + "grad_norm": 0.7989653944969177, + "learning_rate": 9.731407373163735e-06, + "loss": 0.8447, + "step": 3898 + }, + { + "epoch": 0.21459629038472122, + "grad_norm": 0.74820876121521, + "learning_rate": 9.73126719653504e-06, + "loss": 0.8745, + "step": 3899 + }, + { + "epoch": 0.21465132918707688, + "grad_norm": 0.7191246747970581, + "learning_rate": 9.731126984347408e-06, + "loss": 0.7533, + "step": 3900 + }, + { + "epoch": 0.21470636798943254, + "grad_norm": 0.7718465328216553, + "learning_rate": 9.730986736601893e-06, + "loss": 0.8184, + "step": 3901 + }, + { + "epoch": 0.2147614067917882, + "grad_norm": 0.7055066823959351, + "learning_rate": 9.730846453299547e-06, + "loss": 0.7352, + "step": 3902 + }, + { + "epoch": 0.21481644559414387, + "grad_norm": 0.7500855326652527, + "learning_rate": 9.730706134441425e-06, + "loss": 0.8111, + "step": 3903 + }, + { + "epoch": 0.21487148439649953, + "grad_norm": 0.7568232417106628, + "learning_rate": 9.730565780028583e-06, + "loss": 0.8126, + "step": 3904 + }, + { + "epoch": 0.2149265231988552, + "grad_norm": 0.7418738007545471, + "learning_rate": 9.730425390062075e-06, + "loss": 0.8014, + "step": 3905 + }, + { + "epoch": 0.21498156200121085, + "grad_norm": 0.7967441082000732, + "learning_rate": 9.730284964542955e-06, + "loss": 0.7965, + "step": 3906 + }, + { + "epoch": 0.2150366008035665, + "grad_norm": 0.7444791197776794, + "learning_rate": 9.730144503472281e-06, + "loss": 0.7113, + "step": 3907 + }, + { + "epoch": 0.21509163960592217, + "grad_norm": 0.8372869491577148, + "learning_rate": 9.730004006851107e-06, + "loss": 0.838, + "step": 3908 + }, + { + "epoch": 0.21514667840827784, + "grad_norm": 0.7984300851821899, + "learning_rate": 9.729863474680488e-06, + "loss": 0.856, + "step": 3909 + }, + { + "epoch": 0.2152017172106335, + "grad_norm": 0.7508612871170044, + "learning_rate": 9.72972290696148e-06, + "loss": 0.7947, + "step": 3910 + }, + { + "epoch": 0.21525675601298916, + "grad_norm": 0.7559992074966431, + "learning_rate": 9.729582303695142e-06, + "loss": 0.785, + "step": 3911 + }, + { + "epoch": 0.21531179481534482, + "grad_norm": 0.7764164209365845, + "learning_rate": 9.729441664882531e-06, + "loss": 0.8297, + "step": 3912 + }, + { + "epoch": 0.21536683361770048, + "grad_norm": 0.8112726211547852, + "learning_rate": 9.7293009905247e-06, + "loss": 0.8073, + "step": 3913 + }, + { + "epoch": 0.21542187242005614, + "grad_norm": 0.9748952388763428, + "learning_rate": 9.729160280622709e-06, + "loss": 0.7584, + "step": 3914 + }, + { + "epoch": 0.2154769112224118, + "grad_norm": 0.789191484451294, + "learning_rate": 9.729019535177617e-06, + "loss": 0.7568, + "step": 3915 + }, + { + "epoch": 0.21553195002476747, + "grad_norm": 0.7300963401794434, + "learning_rate": 9.728878754190478e-06, + "loss": 0.8029, + "step": 3916 + }, + { + "epoch": 0.21558698882712313, + "grad_norm": 0.9201067686080933, + "learning_rate": 9.728737937662354e-06, + "loss": 0.8665, + "step": 3917 + }, + { + "epoch": 0.2156420276294788, + "grad_norm": 0.8820425271987915, + "learning_rate": 9.728597085594301e-06, + "loss": 0.8378, + "step": 3918 + }, + { + "epoch": 0.21569706643183445, + "grad_norm": 0.7762684226036072, + "learning_rate": 9.728456197987376e-06, + "loss": 0.8005, + "step": 3919 + }, + { + "epoch": 0.2157521052341901, + "grad_norm": 0.8429732918739319, + "learning_rate": 9.728315274842641e-06, + "loss": 0.8337, + "step": 3920 + }, + { + "epoch": 0.21580714403654577, + "grad_norm": 0.7820748090744019, + "learning_rate": 9.728174316161156e-06, + "loss": 0.8085, + "step": 3921 + }, + { + "epoch": 0.21586218283890143, + "grad_norm": 0.8748064637184143, + "learning_rate": 9.728033321943977e-06, + "loss": 0.7734, + "step": 3922 + }, + { + "epoch": 0.2159172216412571, + "grad_norm": 0.8878050446510315, + "learning_rate": 9.727892292192166e-06, + "loss": 0.9226, + "step": 3923 + }, + { + "epoch": 0.21597226044361276, + "grad_norm": 0.8156047463417053, + "learning_rate": 9.72775122690678e-06, + "loss": 0.8111, + "step": 3924 + }, + { + "epoch": 0.21602729924596842, + "grad_norm": 0.7392945885658264, + "learning_rate": 9.727610126088883e-06, + "loss": 0.758, + "step": 3925 + }, + { + "epoch": 0.21608233804832408, + "grad_norm": 0.7573148608207703, + "learning_rate": 9.727468989739532e-06, + "loss": 0.8142, + "step": 3926 + }, + { + "epoch": 0.21613737685067974, + "grad_norm": 0.831847608089447, + "learning_rate": 9.727327817859792e-06, + "loss": 0.7337, + "step": 3927 + }, + { + "epoch": 0.2161924156530354, + "grad_norm": 0.8012371063232422, + "learning_rate": 9.72718661045072e-06, + "loss": 0.8128, + "step": 3928 + }, + { + "epoch": 0.21624745445539106, + "grad_norm": 0.7985890507698059, + "learning_rate": 9.72704536751338e-06, + "loss": 0.8549, + "step": 3929 + }, + { + "epoch": 0.21630249325774673, + "grad_norm": 0.7194695472717285, + "learning_rate": 9.726904089048832e-06, + "loss": 0.775, + "step": 3930 + }, + { + "epoch": 0.21635753206010236, + "grad_norm": 0.8029330968856812, + "learning_rate": 9.726762775058138e-06, + "loss": 0.9167, + "step": 3931 + }, + { + "epoch": 0.21641257086245802, + "grad_norm": 0.7388954162597656, + "learning_rate": 9.72662142554236e-06, + "loss": 0.7295, + "step": 3932 + }, + { + "epoch": 0.21646760966481368, + "grad_norm": 0.798796534538269, + "learning_rate": 9.726480040502559e-06, + "loss": 0.8686, + "step": 3933 + }, + { + "epoch": 0.21652264846716934, + "grad_norm": 0.9977202415466309, + "learning_rate": 9.726338619939802e-06, + "loss": 0.8387, + "step": 3934 + }, + { + "epoch": 0.216577687269525, + "grad_norm": 0.8173295855522156, + "learning_rate": 9.726197163855148e-06, + "loss": 0.7773, + "step": 3935 + }, + { + "epoch": 0.21663272607188067, + "grad_norm": 0.6519538760185242, + "learning_rate": 9.72605567224966e-06, + "loss": 0.6319, + "step": 3936 + }, + { + "epoch": 0.21668776487423633, + "grad_norm": 0.8004894852638245, + "learning_rate": 9.725914145124404e-06, + "loss": 0.8281, + "step": 3937 + }, + { + "epoch": 0.216742803676592, + "grad_norm": 0.7327558398246765, + "learning_rate": 9.725772582480442e-06, + "loss": 0.7105, + "step": 3938 + }, + { + "epoch": 0.21679784247894765, + "grad_norm": 0.7624199986457825, + "learning_rate": 9.725630984318839e-06, + "loss": 0.7823, + "step": 3939 + }, + { + "epoch": 0.2168528812813033, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.725489350640658e-06, + "loss": 0.8147, + "step": 3940 + }, + { + "epoch": 0.21690792008365897, + "grad_norm": 0.6886566877365112, + "learning_rate": 9.725347681446964e-06, + "loss": 0.7263, + "step": 3941 + }, + { + "epoch": 0.21696295888601463, + "grad_norm": 0.882060170173645, + "learning_rate": 9.725205976738821e-06, + "loss": 0.8931, + "step": 3942 + }, + { + "epoch": 0.2170179976883703, + "grad_norm": 0.7946881055831909, + "learning_rate": 9.725064236517297e-06, + "loss": 0.8036, + "step": 3943 + }, + { + "epoch": 0.21707303649072596, + "grad_norm": 0.7062187194824219, + "learning_rate": 9.724922460783453e-06, + "loss": 0.6915, + "step": 3944 + }, + { + "epoch": 0.21712807529308162, + "grad_norm": 0.7978640794754028, + "learning_rate": 9.724780649538356e-06, + "loss": 0.8873, + "step": 3945 + }, + { + "epoch": 0.21718311409543728, + "grad_norm": 0.8828096389770508, + "learning_rate": 9.724638802783073e-06, + "loss": 0.7114, + "step": 3946 + }, + { + "epoch": 0.21723815289779294, + "grad_norm": 0.7301073670387268, + "learning_rate": 9.724496920518672e-06, + "loss": 0.8107, + "step": 3947 + }, + { + "epoch": 0.2172931917001486, + "grad_norm": 0.7944212555885315, + "learning_rate": 9.724355002746213e-06, + "loss": 0.8135, + "step": 3948 + }, + { + "epoch": 0.21734823050250426, + "grad_norm": 0.7988898754119873, + "learning_rate": 9.724213049466768e-06, + "loss": 0.7173, + "step": 3949 + }, + { + "epoch": 0.21740326930485993, + "grad_norm": 0.7734915018081665, + "learning_rate": 9.724071060681401e-06, + "loss": 0.8131, + "step": 3950 + }, + { + "epoch": 0.2174583081072156, + "grad_norm": 0.6856646537780762, + "learning_rate": 9.723929036391183e-06, + "loss": 0.6873, + "step": 3951 + }, + { + "epoch": 0.21751334690957125, + "grad_norm": 0.8652976751327515, + "learning_rate": 9.723786976597179e-06, + "loss": 0.7908, + "step": 3952 + }, + { + "epoch": 0.2175683857119269, + "grad_norm": 0.7325445413589478, + "learning_rate": 9.723644881300453e-06, + "loss": 0.7389, + "step": 3953 + }, + { + "epoch": 0.21762342451428257, + "grad_norm": 0.8596270084381104, + "learning_rate": 9.723502750502079e-06, + "loss": 0.7785, + "step": 3954 + }, + { + "epoch": 0.21767846331663823, + "grad_norm": 0.739248514175415, + "learning_rate": 9.723360584203123e-06, + "loss": 0.8125, + "step": 3955 + }, + { + "epoch": 0.2177335021189939, + "grad_norm": 0.815617561340332, + "learning_rate": 9.723218382404652e-06, + "loss": 0.8682, + "step": 3956 + }, + { + "epoch": 0.21778854092134955, + "grad_norm": 0.758756160736084, + "learning_rate": 9.723076145107738e-06, + "loss": 0.7717, + "step": 3957 + }, + { + "epoch": 0.21784357972370522, + "grad_norm": 0.9007643461227417, + "learning_rate": 9.722933872313445e-06, + "loss": 0.7901, + "step": 3958 + }, + { + "epoch": 0.21789861852606088, + "grad_norm": 0.781548023223877, + "learning_rate": 9.722791564022846e-06, + "loss": 0.8338, + "step": 3959 + }, + { + "epoch": 0.21795365732841654, + "grad_norm": 0.7730190753936768, + "learning_rate": 9.722649220237011e-06, + "loss": 0.8032, + "step": 3960 + }, + { + "epoch": 0.2180086961307722, + "grad_norm": 0.8737791776657104, + "learning_rate": 9.722506840957009e-06, + "loss": 0.8436, + "step": 3961 + }, + { + "epoch": 0.21806373493312786, + "grad_norm": 0.8151329159736633, + "learning_rate": 9.722364426183908e-06, + "loss": 0.8115, + "step": 3962 + }, + { + "epoch": 0.21811877373548352, + "grad_norm": 0.7852860689163208, + "learning_rate": 9.722221975918782e-06, + "loss": 0.7977, + "step": 3963 + }, + { + "epoch": 0.21817381253783918, + "grad_norm": 0.9064140319824219, + "learning_rate": 9.722079490162698e-06, + "loss": 0.8799, + "step": 3964 + }, + { + "epoch": 0.21822885134019485, + "grad_norm": 0.8579906821250916, + "learning_rate": 9.72193696891673e-06, + "loss": 0.7825, + "step": 3965 + }, + { + "epoch": 0.2182838901425505, + "grad_norm": 0.8005900382995605, + "learning_rate": 9.721794412181946e-06, + "loss": 0.8601, + "step": 3966 + }, + { + "epoch": 0.21833892894490617, + "grad_norm": 0.7661529183387756, + "learning_rate": 9.721651819959421e-06, + "loss": 0.7446, + "step": 3967 + }, + { + "epoch": 0.21839396774726183, + "grad_norm": 0.7558436989784241, + "learning_rate": 9.721509192250224e-06, + "loss": 0.7484, + "step": 3968 + }, + { + "epoch": 0.2184490065496175, + "grad_norm": 0.765446126461029, + "learning_rate": 9.721366529055427e-06, + "loss": 0.7727, + "step": 3969 + }, + { + "epoch": 0.21850404535197315, + "grad_norm": 0.7329973578453064, + "learning_rate": 9.721223830376103e-06, + "loss": 0.797, + "step": 3970 + }, + { + "epoch": 0.21855908415432881, + "grad_norm": 0.8881974220275879, + "learning_rate": 9.721081096213324e-06, + "loss": 0.9199, + "step": 3971 + }, + { + "epoch": 0.21861412295668448, + "grad_norm": 0.8246786594390869, + "learning_rate": 9.720938326568165e-06, + "loss": 0.9108, + "step": 3972 + }, + { + "epoch": 0.21866916175904014, + "grad_norm": 0.7187291979789734, + "learning_rate": 9.720795521441697e-06, + "loss": 0.7756, + "step": 3973 + }, + { + "epoch": 0.21872420056139577, + "grad_norm": 0.7880695462226868, + "learning_rate": 9.720652680834995e-06, + "loss": 0.8548, + "step": 3974 + }, + { + "epoch": 0.21877923936375143, + "grad_norm": 0.8841108679771423, + "learning_rate": 9.720509804749128e-06, + "loss": 0.8477, + "step": 3975 + }, + { + "epoch": 0.2188342781661071, + "grad_norm": 0.9061402678489685, + "learning_rate": 9.720366893185173e-06, + "loss": 0.8235, + "step": 3976 + }, + { + "epoch": 0.21888931696846275, + "grad_norm": 0.8342392444610596, + "learning_rate": 9.720223946144206e-06, + "loss": 0.7777, + "step": 3977 + }, + { + "epoch": 0.21894435577081842, + "grad_norm": 0.7933762073516846, + "learning_rate": 9.720080963627299e-06, + "loss": 0.7943, + "step": 3978 + }, + { + "epoch": 0.21899939457317408, + "grad_norm": 0.8358896374702454, + "learning_rate": 9.719937945635527e-06, + "loss": 0.8932, + "step": 3979 + }, + { + "epoch": 0.21905443337552974, + "grad_norm": 0.7479808926582336, + "learning_rate": 9.719794892169964e-06, + "loss": 0.7446, + "step": 3980 + }, + { + "epoch": 0.2191094721778854, + "grad_norm": 0.7920958399772644, + "learning_rate": 9.719651803231685e-06, + "loss": 0.7489, + "step": 3981 + }, + { + "epoch": 0.21916451098024106, + "grad_norm": 0.7098824977874756, + "learning_rate": 9.719508678821768e-06, + "loss": 0.7763, + "step": 3982 + }, + { + "epoch": 0.21921954978259672, + "grad_norm": 0.8733491897583008, + "learning_rate": 9.719365518941288e-06, + "loss": 0.7325, + "step": 3983 + }, + { + "epoch": 0.21927458858495238, + "grad_norm": 0.8328796029090881, + "learning_rate": 9.719222323591318e-06, + "loss": 0.9097, + "step": 3984 + }, + { + "epoch": 0.21932962738730805, + "grad_norm": 0.7869352698326111, + "learning_rate": 9.719079092772936e-06, + "loss": 0.759, + "step": 3985 + }, + { + "epoch": 0.2193846661896637, + "grad_norm": 0.8278539180755615, + "learning_rate": 9.718935826487221e-06, + "loss": 0.8545, + "step": 3986 + }, + { + "epoch": 0.21943970499201937, + "grad_norm": 0.8122449517250061, + "learning_rate": 9.718792524735246e-06, + "loss": 0.7646, + "step": 3987 + }, + { + "epoch": 0.21949474379437503, + "grad_norm": 1.072253942489624, + "learning_rate": 9.71864918751809e-06, + "loss": 0.915, + "step": 3988 + }, + { + "epoch": 0.2195497825967307, + "grad_norm": 0.7770013213157654, + "learning_rate": 9.718505814836829e-06, + "loss": 0.7561, + "step": 3989 + }, + { + "epoch": 0.21960482139908635, + "grad_norm": 0.9011678695678711, + "learning_rate": 9.718362406692544e-06, + "loss": 0.7532, + "step": 3990 + }, + { + "epoch": 0.21965986020144201, + "grad_norm": 0.8867584466934204, + "learning_rate": 9.718218963086307e-06, + "loss": 0.8732, + "step": 3991 + }, + { + "epoch": 0.21971489900379768, + "grad_norm": 0.8884773850440979, + "learning_rate": 9.718075484019201e-06, + "loss": 0.7403, + "step": 3992 + }, + { + "epoch": 0.21976993780615334, + "grad_norm": 0.8995673060417175, + "learning_rate": 9.7179319694923e-06, + "loss": 0.9283, + "step": 3993 + }, + { + "epoch": 0.219824976608509, + "grad_norm": 0.7875818014144897, + "learning_rate": 9.717788419506688e-06, + "loss": 0.8633, + "step": 3994 + }, + { + "epoch": 0.21988001541086466, + "grad_norm": 0.7693219184875488, + "learning_rate": 9.71764483406344e-06, + "loss": 0.8073, + "step": 3995 + }, + { + "epoch": 0.21993505421322032, + "grad_norm": 0.7932817339897156, + "learning_rate": 9.717501213163636e-06, + "loss": 0.7537, + "step": 3996 + }, + { + "epoch": 0.21999009301557598, + "grad_norm": 0.8274912238121033, + "learning_rate": 9.717357556808358e-06, + "loss": 0.7715, + "step": 3997 + }, + { + "epoch": 0.22004513181793164, + "grad_norm": 0.7533993124961853, + "learning_rate": 9.71721386499868e-06, + "loss": 0.7482, + "step": 3998 + }, + { + "epoch": 0.2201001706202873, + "grad_norm": 1.028228759765625, + "learning_rate": 9.717070137735687e-06, + "loss": 0.9897, + "step": 3999 + }, + { + "epoch": 0.22015520942264297, + "grad_norm": 1.1093978881835938, + "learning_rate": 9.716926375020457e-06, + "loss": 0.8701, + "step": 4000 + }, + { + "epoch": 0.22021024822499863, + "grad_norm": 0.7891124486923218, + "learning_rate": 9.716782576854073e-06, + "loss": 0.8533, + "step": 4001 + }, + { + "epoch": 0.2202652870273543, + "grad_norm": 1.1783788204193115, + "learning_rate": 9.716638743237611e-06, + "loss": 0.8088, + "step": 4002 + }, + { + "epoch": 0.22032032582970995, + "grad_norm": 0.8713383078575134, + "learning_rate": 9.716494874172157e-06, + "loss": 0.8382, + "step": 4003 + }, + { + "epoch": 0.2203753646320656, + "grad_norm": 0.7821565270423889, + "learning_rate": 9.716350969658787e-06, + "loss": 0.8168, + "step": 4004 + }, + { + "epoch": 0.22043040343442127, + "grad_norm": 0.7642589211463928, + "learning_rate": 9.716207029698589e-06, + "loss": 0.7209, + "step": 4005 + }, + { + "epoch": 0.22048544223677694, + "grad_norm": 0.935625433921814, + "learning_rate": 9.716063054292639e-06, + "loss": 0.8436, + "step": 4006 + }, + { + "epoch": 0.2205404810391326, + "grad_norm": 0.7064627408981323, + "learning_rate": 9.715919043442024e-06, + "loss": 0.7651, + "step": 4007 + }, + { + "epoch": 0.22059551984148826, + "grad_norm": 0.6980876326560974, + "learning_rate": 9.715774997147823e-06, + "loss": 0.7842, + "step": 4008 + }, + { + "epoch": 0.22065055864384392, + "grad_norm": 0.7691119313240051, + "learning_rate": 9.715630915411118e-06, + "loss": 0.7345, + "step": 4009 + }, + { + "epoch": 0.22070559744619958, + "grad_norm": 0.8870186805725098, + "learning_rate": 9.715486798232994e-06, + "loss": 0.7531, + "step": 4010 + }, + { + "epoch": 0.22076063624855524, + "grad_norm": 0.7225383520126343, + "learning_rate": 9.715342645614533e-06, + "loss": 0.8543, + "step": 4011 + }, + { + "epoch": 0.2208156750509109, + "grad_norm": 0.7517428994178772, + "learning_rate": 9.71519845755682e-06, + "loss": 0.84, + "step": 4012 + }, + { + "epoch": 0.22087071385326656, + "grad_norm": 0.8115549087524414, + "learning_rate": 9.715054234060937e-06, + "loss": 0.7823, + "step": 4013 + }, + { + "epoch": 0.22092575265562223, + "grad_norm": 1.6656148433685303, + "learning_rate": 9.714909975127968e-06, + "loss": 0.8951, + "step": 4014 + }, + { + "epoch": 0.2209807914579779, + "grad_norm": 0.906508207321167, + "learning_rate": 9.714765680758997e-06, + "loss": 0.8599, + "step": 4015 + }, + { + "epoch": 0.22103583026033355, + "grad_norm": 0.8274093866348267, + "learning_rate": 9.71462135095511e-06, + "loss": 0.9568, + "step": 4016 + }, + { + "epoch": 0.22109086906268918, + "grad_norm": 0.7745386958122253, + "learning_rate": 9.714476985717393e-06, + "loss": 0.8641, + "step": 4017 + }, + { + "epoch": 0.22114590786504484, + "grad_norm": 0.8112689256668091, + "learning_rate": 9.714332585046928e-06, + "loss": 0.834, + "step": 4018 + }, + { + "epoch": 0.2212009466674005, + "grad_norm": 0.916847825050354, + "learning_rate": 9.714188148944799e-06, + "loss": 0.8546, + "step": 4019 + }, + { + "epoch": 0.22125598546975617, + "grad_norm": 0.8595414161682129, + "learning_rate": 9.714043677412096e-06, + "loss": 0.9388, + "step": 4020 + }, + { + "epoch": 0.22131102427211183, + "grad_norm": 0.8672438263893127, + "learning_rate": 9.713899170449901e-06, + "loss": 0.8151, + "step": 4021 + }, + { + "epoch": 0.2213660630744675, + "grad_norm": 0.699749767780304, + "learning_rate": 9.713754628059304e-06, + "loss": 0.7433, + "step": 4022 + }, + { + "epoch": 0.22142110187682315, + "grad_norm": 0.8071898818016052, + "learning_rate": 9.713610050241387e-06, + "loss": 0.7663, + "step": 4023 + }, + { + "epoch": 0.2214761406791788, + "grad_norm": 0.745030403137207, + "learning_rate": 9.713465436997239e-06, + "loss": 0.7733, + "step": 4024 + }, + { + "epoch": 0.22153117948153447, + "grad_norm": 0.8034930229187012, + "learning_rate": 9.713320788327947e-06, + "loss": 0.9015, + "step": 4025 + }, + { + "epoch": 0.22158621828389014, + "grad_norm": 0.8549708724021912, + "learning_rate": 9.713176104234597e-06, + "loss": 0.7127, + "step": 4026 + }, + { + "epoch": 0.2216412570862458, + "grad_norm": 0.8432256579399109, + "learning_rate": 9.713031384718277e-06, + "loss": 0.8163, + "step": 4027 + }, + { + "epoch": 0.22169629588860146, + "grad_norm": 0.7623703479766846, + "learning_rate": 9.712886629780075e-06, + "loss": 0.8272, + "step": 4028 + }, + { + "epoch": 0.22175133469095712, + "grad_norm": 0.8425806760787964, + "learning_rate": 9.712741839421079e-06, + "loss": 0.7907, + "step": 4029 + }, + { + "epoch": 0.22180637349331278, + "grad_norm": 0.7477750778198242, + "learning_rate": 9.712597013642376e-06, + "loss": 0.7662, + "step": 4030 + }, + { + "epoch": 0.22186141229566844, + "grad_norm": 0.7761805057525635, + "learning_rate": 9.712452152445056e-06, + "loss": 0.7999, + "step": 4031 + }, + { + "epoch": 0.2219164510980241, + "grad_norm": 0.8604531288146973, + "learning_rate": 9.712307255830207e-06, + "loss": 0.812, + "step": 4032 + }, + { + "epoch": 0.22197148990037976, + "grad_norm": 0.8113332986831665, + "learning_rate": 9.712162323798918e-06, + "loss": 0.8092, + "step": 4033 + }, + { + "epoch": 0.22202652870273543, + "grad_norm": 0.7980128526687622, + "learning_rate": 9.71201735635228e-06, + "loss": 0.6934, + "step": 4034 + }, + { + "epoch": 0.2220815675050911, + "grad_norm": 0.7819470763206482, + "learning_rate": 9.711872353491377e-06, + "loss": 0.8531, + "step": 4035 + }, + { + "epoch": 0.22213660630744675, + "grad_norm": 0.8283445835113525, + "learning_rate": 9.711727315217305e-06, + "loss": 0.8594, + "step": 4036 + }, + { + "epoch": 0.2221916451098024, + "grad_norm": 0.7282612919807434, + "learning_rate": 9.711582241531153e-06, + "loss": 0.7374, + "step": 4037 + }, + { + "epoch": 0.22224668391215807, + "grad_norm": 0.9564353823661804, + "learning_rate": 9.711437132434007e-06, + "loss": 0.7996, + "step": 4038 + }, + { + "epoch": 0.22230172271451373, + "grad_norm": 0.8559701442718506, + "learning_rate": 9.711291987926963e-06, + "loss": 0.949, + "step": 4039 + }, + { + "epoch": 0.2223567615168694, + "grad_norm": 0.7515334486961365, + "learning_rate": 9.71114680801111e-06, + "loss": 0.7188, + "step": 4040 + }, + { + "epoch": 0.22241180031922506, + "grad_norm": 0.7685608863830566, + "learning_rate": 9.711001592687537e-06, + "loss": 0.7679, + "step": 4041 + }, + { + "epoch": 0.22246683912158072, + "grad_norm": 0.6848913431167603, + "learning_rate": 9.710856341957337e-06, + "loss": 0.7666, + "step": 4042 + }, + { + "epoch": 0.22252187792393638, + "grad_norm": 0.7270542979240417, + "learning_rate": 9.710711055821602e-06, + "loss": 0.7563, + "step": 4043 + }, + { + "epoch": 0.22257691672629204, + "grad_norm": 0.7965164184570312, + "learning_rate": 9.710565734281424e-06, + "loss": 0.7586, + "step": 4044 + }, + { + "epoch": 0.2226319555286477, + "grad_norm": 0.7872949242591858, + "learning_rate": 9.710420377337895e-06, + "loss": 0.8423, + "step": 4045 + }, + { + "epoch": 0.22268699433100336, + "grad_norm": 0.7466526627540588, + "learning_rate": 9.710274984992107e-06, + "loss": 0.7578, + "step": 4046 + }, + { + "epoch": 0.22274203313335902, + "grad_norm": 0.7208731770515442, + "learning_rate": 9.710129557245154e-06, + "loss": 0.7019, + "step": 4047 + }, + { + "epoch": 0.22279707193571469, + "grad_norm": 0.6953400373458862, + "learning_rate": 9.709984094098127e-06, + "loss": 0.7234, + "step": 4048 + }, + { + "epoch": 0.22285211073807035, + "grad_norm": 0.7866283059120178, + "learning_rate": 9.709838595552122e-06, + "loss": 0.785, + "step": 4049 + }, + { + "epoch": 0.222907149540426, + "grad_norm": 0.7404114007949829, + "learning_rate": 9.709693061608227e-06, + "loss": 0.7706, + "step": 4050 + }, + { + "epoch": 0.22296218834278167, + "grad_norm": 0.8788254857063293, + "learning_rate": 9.709547492267544e-06, + "loss": 0.8392, + "step": 4051 + }, + { + "epoch": 0.22301722714513733, + "grad_norm": 0.7493161559104919, + "learning_rate": 9.70940188753116e-06, + "loss": 0.8346, + "step": 4052 + }, + { + "epoch": 0.223072265947493, + "grad_norm": 0.7340379357337952, + "learning_rate": 9.709256247400174e-06, + "loss": 0.7715, + "step": 4053 + }, + { + "epoch": 0.22312730474984865, + "grad_norm": 0.7291178107261658, + "learning_rate": 9.709110571875677e-06, + "loss": 0.866, + "step": 4054 + }, + { + "epoch": 0.22318234355220432, + "grad_norm": 0.8046013712882996, + "learning_rate": 9.708964860958765e-06, + "loss": 0.7885, + "step": 4055 + }, + { + "epoch": 0.22323738235455998, + "grad_norm": 0.832941472530365, + "learning_rate": 9.708819114650535e-06, + "loss": 0.873, + "step": 4056 + }, + { + "epoch": 0.22329242115691564, + "grad_norm": 0.6933377981185913, + "learning_rate": 9.70867333295208e-06, + "loss": 0.7944, + "step": 4057 + }, + { + "epoch": 0.2233474599592713, + "grad_norm": 0.7976044416427612, + "learning_rate": 9.708527515864499e-06, + "loss": 0.72, + "step": 4058 + }, + { + "epoch": 0.22340249876162696, + "grad_norm": 0.7698904871940613, + "learning_rate": 9.708381663388884e-06, + "loss": 0.7603, + "step": 4059 + }, + { + "epoch": 0.2234575375639826, + "grad_norm": 0.7554401159286499, + "learning_rate": 9.708235775526331e-06, + "loss": 0.7488, + "step": 4060 + }, + { + "epoch": 0.22351257636633826, + "grad_norm": 0.7382954359054565, + "learning_rate": 9.70808985227794e-06, + "loss": 0.7418, + "step": 4061 + }, + { + "epoch": 0.22356761516869392, + "grad_norm": 0.7220499515533447, + "learning_rate": 9.707943893644806e-06, + "loss": 0.7691, + "step": 4062 + }, + { + "epoch": 0.22362265397104958, + "grad_norm": 0.727542519569397, + "learning_rate": 9.707797899628027e-06, + "loss": 0.7603, + "step": 4063 + }, + { + "epoch": 0.22367769277340524, + "grad_norm": 0.7857500910758972, + "learning_rate": 9.707651870228697e-06, + "loss": 0.8633, + "step": 4064 + }, + { + "epoch": 0.2237327315757609, + "grad_norm": 0.7975600361824036, + "learning_rate": 9.707505805447917e-06, + "loss": 0.8591, + "step": 4065 + }, + { + "epoch": 0.22378777037811656, + "grad_norm": 1.0063475370407104, + "learning_rate": 9.707359705286784e-06, + "loss": 0.7935, + "step": 4066 + }, + { + "epoch": 0.22384280918047222, + "grad_norm": 0.7307062745094299, + "learning_rate": 9.707213569746393e-06, + "loss": 0.797, + "step": 4067 + }, + { + "epoch": 0.22389784798282789, + "grad_norm": 0.7891914248466492, + "learning_rate": 9.707067398827847e-06, + "loss": 0.853, + "step": 4068 + }, + { + "epoch": 0.22395288678518355, + "grad_norm": 0.7479422092437744, + "learning_rate": 9.706921192532242e-06, + "loss": 0.7359, + "step": 4069 + }, + { + "epoch": 0.2240079255875392, + "grad_norm": 0.8436065912246704, + "learning_rate": 9.706774950860676e-06, + "loss": 0.7916, + "step": 4070 + }, + { + "epoch": 0.22406296438989487, + "grad_norm": 0.7586960196495056, + "learning_rate": 9.706628673814252e-06, + "loss": 0.7871, + "step": 4071 + }, + { + "epoch": 0.22411800319225053, + "grad_norm": 0.8181111812591553, + "learning_rate": 9.706482361394064e-06, + "loss": 0.7782, + "step": 4072 + }, + { + "epoch": 0.2241730419946062, + "grad_norm": 0.7205253839492798, + "learning_rate": 9.706336013601217e-06, + "loss": 0.7912, + "step": 4073 + }, + { + "epoch": 0.22422808079696185, + "grad_norm": 0.9823397397994995, + "learning_rate": 9.706189630436806e-06, + "loss": 0.8393, + "step": 4074 + }, + { + "epoch": 0.22428311959931752, + "grad_norm": 0.7360854148864746, + "learning_rate": 9.706043211901935e-06, + "loss": 0.8239, + "step": 4075 + }, + { + "epoch": 0.22433815840167318, + "grad_norm": 0.7590144872665405, + "learning_rate": 9.705896757997701e-06, + "loss": 0.7177, + "step": 4076 + }, + { + "epoch": 0.22439319720402884, + "grad_norm": 0.7691343426704407, + "learning_rate": 9.70575026872521e-06, + "loss": 0.7731, + "step": 4077 + }, + { + "epoch": 0.2244482360063845, + "grad_norm": 0.7057286500930786, + "learning_rate": 9.705603744085556e-06, + "loss": 0.7746, + "step": 4078 + }, + { + "epoch": 0.22450327480874016, + "grad_norm": 0.7954769134521484, + "learning_rate": 9.705457184079847e-06, + "loss": 0.8215, + "step": 4079 + }, + { + "epoch": 0.22455831361109582, + "grad_norm": 0.7089072465896606, + "learning_rate": 9.70531058870918e-06, + "loss": 0.7263, + "step": 4080 + }, + { + "epoch": 0.22461335241345148, + "grad_norm": 0.9847552180290222, + "learning_rate": 9.705163957974657e-06, + "loss": 0.8948, + "step": 4081 + }, + { + "epoch": 0.22466839121580715, + "grad_norm": 0.7977012395858765, + "learning_rate": 9.705017291877383e-06, + "loss": 0.7518, + "step": 4082 + }, + { + "epoch": 0.2247234300181628, + "grad_norm": 0.8084518909454346, + "learning_rate": 9.704870590418458e-06, + "loss": 0.8711, + "step": 4083 + }, + { + "epoch": 0.22477846882051847, + "grad_norm": 0.9151536822319031, + "learning_rate": 9.704723853598986e-06, + "loss": 0.8217, + "step": 4084 + }, + { + "epoch": 0.22483350762287413, + "grad_norm": 0.908136248588562, + "learning_rate": 9.704577081420065e-06, + "loss": 0.6961, + "step": 4085 + }, + { + "epoch": 0.2248885464252298, + "grad_norm": 0.8569996953010559, + "learning_rate": 9.704430273882806e-06, + "loss": 0.8405, + "step": 4086 + }, + { + "epoch": 0.22494358522758545, + "grad_norm": 0.7687774300575256, + "learning_rate": 9.704283430988307e-06, + "loss": 0.6903, + "step": 4087 + }, + { + "epoch": 0.2249986240299411, + "grad_norm": 0.863203763961792, + "learning_rate": 9.704136552737673e-06, + "loss": 0.8927, + "step": 4088 + }, + { + "epoch": 0.22505366283229677, + "grad_norm": 1.252581238746643, + "learning_rate": 9.703989639132008e-06, + "loss": 0.8792, + "step": 4089 + }, + { + "epoch": 0.22510870163465244, + "grad_norm": 0.7844160795211792, + "learning_rate": 9.703842690172415e-06, + "loss": 0.844, + "step": 4090 + }, + { + "epoch": 0.2251637404370081, + "grad_norm": 0.8669766187667847, + "learning_rate": 9.703695705860002e-06, + "loss": 0.7008, + "step": 4091 + }, + { + "epoch": 0.22521877923936376, + "grad_norm": 0.7180137634277344, + "learning_rate": 9.703548686195869e-06, + "loss": 0.8242, + "step": 4092 + }, + { + "epoch": 0.22527381804171942, + "grad_norm": 0.7225000858306885, + "learning_rate": 9.703401631181124e-06, + "loss": 0.724, + "step": 4093 + }, + { + "epoch": 0.22532885684407508, + "grad_norm": 0.8348065614700317, + "learning_rate": 9.70325454081687e-06, + "loss": 0.7996, + "step": 4094 + }, + { + "epoch": 0.22538389564643074, + "grad_norm": 0.8099488019943237, + "learning_rate": 9.703107415104216e-06, + "loss": 0.7498, + "step": 4095 + }, + { + "epoch": 0.2254389344487864, + "grad_norm": 0.7051188945770264, + "learning_rate": 9.702960254044264e-06, + "loss": 0.7322, + "step": 4096 + }, + { + "epoch": 0.22549397325114207, + "grad_norm": 0.742859423160553, + "learning_rate": 9.702813057638122e-06, + "loss": 0.746, + "step": 4097 + }, + { + "epoch": 0.22554901205349773, + "grad_norm": 0.7981536984443665, + "learning_rate": 9.702665825886897e-06, + "loss": 0.8705, + "step": 4098 + }, + { + "epoch": 0.2256040508558534, + "grad_norm": 1.0317178964614868, + "learning_rate": 9.702518558791693e-06, + "loss": 0.8261, + "step": 4099 + }, + { + "epoch": 0.22565908965820905, + "grad_norm": 0.7811983823776245, + "learning_rate": 9.702371256353618e-06, + "loss": 0.7633, + "step": 4100 + }, + { + "epoch": 0.2257141284605647, + "grad_norm": 0.8288078308105469, + "learning_rate": 9.702223918573782e-06, + "loss": 0.7974, + "step": 4101 + }, + { + "epoch": 0.22576916726292034, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.702076545453286e-06, + "loss": 0.7517, + "step": 4102 + }, + { + "epoch": 0.225824206065276, + "grad_norm": 0.8342248201370239, + "learning_rate": 9.701929136993243e-06, + "loss": 0.8634, + "step": 4103 + }, + { + "epoch": 0.22587924486763167, + "grad_norm": 0.790392279624939, + "learning_rate": 9.701781693194761e-06, + "loss": 0.7705, + "step": 4104 + }, + { + "epoch": 0.22593428366998733, + "grad_norm": 0.824691891670227, + "learning_rate": 9.701634214058944e-06, + "loss": 0.877, + "step": 4105 + }, + { + "epoch": 0.225989322472343, + "grad_norm": 0.9237051010131836, + "learning_rate": 9.701486699586904e-06, + "loss": 0.842, + "step": 4106 + }, + { + "epoch": 0.22604436127469865, + "grad_norm": 0.7453535199165344, + "learning_rate": 9.701339149779747e-06, + "loss": 0.8217, + "step": 4107 + }, + { + "epoch": 0.2260994000770543, + "grad_norm": 0.727872371673584, + "learning_rate": 9.701191564638586e-06, + "loss": 0.849, + "step": 4108 + }, + { + "epoch": 0.22615443887940997, + "grad_norm": 0.966585636138916, + "learning_rate": 9.701043944164526e-06, + "loss": 0.7742, + "step": 4109 + }, + { + "epoch": 0.22620947768176564, + "grad_norm": 0.7556117177009583, + "learning_rate": 9.700896288358678e-06, + "loss": 0.7498, + "step": 4110 + }, + { + "epoch": 0.2262645164841213, + "grad_norm": 0.848143458366394, + "learning_rate": 9.700748597222151e-06, + "loss": 0.7237, + "step": 4111 + }, + { + "epoch": 0.22631955528647696, + "grad_norm": 0.9046787619590759, + "learning_rate": 9.700600870756056e-06, + "loss": 0.8066, + "step": 4112 + }, + { + "epoch": 0.22637459408883262, + "grad_norm": 0.923159658908844, + "learning_rate": 9.700453108961505e-06, + "loss": 0.8404, + "step": 4113 + }, + { + "epoch": 0.22642963289118828, + "grad_norm": 0.8697664737701416, + "learning_rate": 9.700305311839606e-06, + "loss": 0.7269, + "step": 4114 + }, + { + "epoch": 0.22648467169354394, + "grad_norm": 0.8179994821548462, + "learning_rate": 9.70015747939147e-06, + "loss": 0.8083, + "step": 4115 + }, + { + "epoch": 0.2265397104958996, + "grad_norm": 0.7961694002151489, + "learning_rate": 9.700009611618208e-06, + "loss": 0.7327, + "step": 4116 + }, + { + "epoch": 0.22659474929825527, + "grad_norm": 0.7317802309989929, + "learning_rate": 9.699861708520934e-06, + "loss": 0.8273, + "step": 4117 + }, + { + "epoch": 0.22664978810061093, + "grad_norm": 0.9190557599067688, + "learning_rate": 9.699713770100757e-06, + "loss": 0.8027, + "step": 4118 + }, + { + "epoch": 0.2267048269029666, + "grad_norm": 0.7618072628974915, + "learning_rate": 9.699565796358788e-06, + "loss": 0.7669, + "step": 4119 + }, + { + "epoch": 0.22675986570532225, + "grad_norm": 1.0236154794692993, + "learning_rate": 9.699417787296139e-06, + "loss": 0.7511, + "step": 4120 + }, + { + "epoch": 0.2268149045076779, + "grad_norm": 0.8011670708656311, + "learning_rate": 9.699269742913927e-06, + "loss": 0.7644, + "step": 4121 + }, + { + "epoch": 0.22686994331003357, + "grad_norm": 0.7808024287223816, + "learning_rate": 9.69912166321326e-06, + "loss": 0.7894, + "step": 4122 + }, + { + "epoch": 0.22692498211238923, + "grad_norm": 0.8645655512809753, + "learning_rate": 9.698973548195252e-06, + "loss": 0.7989, + "step": 4123 + }, + { + "epoch": 0.2269800209147449, + "grad_norm": 0.7478770613670349, + "learning_rate": 9.698825397861017e-06, + "loss": 0.7758, + "step": 4124 + }, + { + "epoch": 0.22703505971710056, + "grad_norm": 0.8988361954689026, + "learning_rate": 9.698677212211668e-06, + "loss": 0.8312, + "step": 4125 + }, + { + "epoch": 0.22709009851945622, + "grad_norm": 0.773028552532196, + "learning_rate": 9.69852899124832e-06, + "loss": 0.7415, + "step": 4126 + }, + { + "epoch": 0.22714513732181188, + "grad_norm": 0.8173778653144836, + "learning_rate": 9.698380734972085e-06, + "loss": 0.8241, + "step": 4127 + }, + { + "epoch": 0.22720017612416754, + "grad_norm": 0.7868672013282776, + "learning_rate": 9.698232443384078e-06, + "loss": 0.7294, + "step": 4128 + }, + { + "epoch": 0.2272552149265232, + "grad_norm": 0.8662189841270447, + "learning_rate": 9.698084116485413e-06, + "loss": 0.9307, + "step": 4129 + }, + { + "epoch": 0.22731025372887886, + "grad_norm": 0.7571321129798889, + "learning_rate": 9.697935754277207e-06, + "loss": 0.7756, + "step": 4130 + }, + { + "epoch": 0.22736529253123453, + "grad_norm": 0.8222649097442627, + "learning_rate": 9.697787356760574e-06, + "loss": 0.8689, + "step": 4131 + }, + { + "epoch": 0.2274203313335902, + "grad_norm": 0.8302241563796997, + "learning_rate": 9.697638923936626e-06, + "loss": 0.8139, + "step": 4132 + }, + { + "epoch": 0.22747537013594585, + "grad_norm": 0.779951274394989, + "learning_rate": 9.697490455806482e-06, + "loss": 0.7493, + "step": 4133 + }, + { + "epoch": 0.2275304089383015, + "grad_norm": 0.8409813046455383, + "learning_rate": 9.697341952371257e-06, + "loss": 0.777, + "step": 4134 + }, + { + "epoch": 0.22758544774065717, + "grad_norm": 0.8599729537963867, + "learning_rate": 9.697193413632068e-06, + "loss": 0.7678, + "step": 4135 + }, + { + "epoch": 0.22764048654301283, + "grad_norm": 0.7505115270614624, + "learning_rate": 9.69704483959003e-06, + "loss": 0.787, + "step": 4136 + }, + { + "epoch": 0.2276955253453685, + "grad_norm": 0.7326868176460266, + "learning_rate": 9.696896230246262e-06, + "loss": 0.7066, + "step": 4137 + }, + { + "epoch": 0.22775056414772415, + "grad_norm": 0.8269753456115723, + "learning_rate": 9.696747585601878e-06, + "loss": 0.7379, + "step": 4138 + }, + { + "epoch": 0.22780560295007982, + "grad_norm": 0.7841970324516296, + "learning_rate": 9.696598905657997e-06, + "loss": 0.764, + "step": 4139 + }, + { + "epoch": 0.22786064175243548, + "grad_norm": 0.7131417989730835, + "learning_rate": 9.696450190415735e-06, + "loss": 0.7629, + "step": 4140 + }, + { + "epoch": 0.22791568055479114, + "grad_norm": 0.7922703623771667, + "learning_rate": 9.69630143987621e-06, + "loss": 0.8354, + "step": 4141 + }, + { + "epoch": 0.2279707193571468, + "grad_norm": 0.9628629684448242, + "learning_rate": 9.696152654040543e-06, + "loss": 0.8077, + "step": 4142 + }, + { + "epoch": 0.22802575815950246, + "grad_norm": 0.8566663265228271, + "learning_rate": 9.696003832909847e-06, + "loss": 0.685, + "step": 4143 + }, + { + "epoch": 0.22808079696185812, + "grad_norm": 0.7181339859962463, + "learning_rate": 9.695854976485244e-06, + "loss": 0.8135, + "step": 4144 + }, + { + "epoch": 0.22813583576421376, + "grad_norm": 0.9119813442230225, + "learning_rate": 9.695706084767853e-06, + "loss": 0.7276, + "step": 4145 + }, + { + "epoch": 0.22819087456656942, + "grad_norm": 0.8547400832176208, + "learning_rate": 9.69555715775879e-06, + "loss": 0.8656, + "step": 4146 + }, + { + "epoch": 0.22824591336892508, + "grad_norm": 0.77585768699646, + "learning_rate": 9.695408195459179e-06, + "loss": 0.8218, + "step": 4147 + }, + { + "epoch": 0.22830095217128074, + "grad_norm": 0.7832447290420532, + "learning_rate": 9.695259197870135e-06, + "loss": 0.8002, + "step": 4148 + }, + { + "epoch": 0.2283559909736364, + "grad_norm": 0.9184865355491638, + "learning_rate": 9.69511016499278e-06, + "loss": 0.8651, + "step": 4149 + }, + { + "epoch": 0.22841102977599206, + "grad_norm": 0.8663797974586487, + "learning_rate": 9.694961096828235e-06, + "loss": 0.7381, + "step": 4150 + }, + { + "epoch": 0.22846606857834773, + "grad_norm": 0.843265950679779, + "learning_rate": 9.694811993377617e-06, + "loss": 0.8546, + "step": 4151 + }, + { + "epoch": 0.2285211073807034, + "grad_norm": 0.8021818399429321, + "learning_rate": 9.694662854642049e-06, + "loss": 0.9166, + "step": 4152 + }, + { + "epoch": 0.22857614618305905, + "grad_norm": 0.7762879729270935, + "learning_rate": 9.694513680622653e-06, + "loss": 0.7055, + "step": 4153 + }, + { + "epoch": 0.2286311849854147, + "grad_norm": 0.809352457523346, + "learning_rate": 9.694364471320548e-06, + "loss": 0.7988, + "step": 4154 + }, + { + "epoch": 0.22868622378777037, + "grad_norm": 0.7239902019500732, + "learning_rate": 9.694215226736858e-06, + "loss": 0.7783, + "step": 4155 + }, + { + "epoch": 0.22874126259012603, + "grad_norm": 0.7072625160217285, + "learning_rate": 9.694065946872702e-06, + "loss": 0.7607, + "step": 4156 + }, + { + "epoch": 0.2287963013924817, + "grad_norm": 0.7696169018745422, + "learning_rate": 9.693916631729201e-06, + "loss": 0.7519, + "step": 4157 + }, + { + "epoch": 0.22885134019483735, + "grad_norm": 0.9198557734489441, + "learning_rate": 9.69376728130748e-06, + "loss": 0.7754, + "step": 4158 + }, + { + "epoch": 0.22890637899719302, + "grad_norm": 0.7589097619056702, + "learning_rate": 9.693617895608662e-06, + "loss": 0.7258, + "step": 4159 + }, + { + "epoch": 0.22896141779954868, + "grad_norm": 0.8351333141326904, + "learning_rate": 9.693468474633867e-06, + "loss": 0.8633, + "step": 4160 + }, + { + "epoch": 0.22901645660190434, + "grad_norm": 0.8331828713417053, + "learning_rate": 9.69331901838422e-06, + "loss": 0.7361, + "step": 4161 + }, + { + "epoch": 0.22907149540426, + "grad_norm": 0.8810774087905884, + "learning_rate": 9.693169526860843e-06, + "loss": 0.7651, + "step": 4162 + }, + { + "epoch": 0.22912653420661566, + "grad_norm": 0.8151684999465942, + "learning_rate": 9.69302000006486e-06, + "loss": 0.8533, + "step": 4163 + }, + { + "epoch": 0.22918157300897132, + "grad_norm": 0.8683320879936218, + "learning_rate": 9.692870437997394e-06, + "loss": 0.8323, + "step": 4164 + }, + { + "epoch": 0.22923661181132698, + "grad_norm": 0.7488875389099121, + "learning_rate": 9.692720840659572e-06, + "loss": 0.8414, + "step": 4165 + }, + { + "epoch": 0.22929165061368265, + "grad_norm": 0.7916452288627625, + "learning_rate": 9.692571208052515e-06, + "loss": 0.7058, + "step": 4166 + }, + { + "epoch": 0.2293466894160383, + "grad_norm": 0.8228384256362915, + "learning_rate": 9.69242154017735e-06, + "loss": 0.7667, + "step": 4167 + }, + { + "epoch": 0.22940172821839397, + "grad_norm": 0.7395613789558411, + "learning_rate": 9.692271837035202e-06, + "loss": 0.7649, + "step": 4168 + }, + { + "epoch": 0.22945676702074963, + "grad_norm": 0.7187666893005371, + "learning_rate": 9.692122098627192e-06, + "loss": 0.7575, + "step": 4169 + }, + { + "epoch": 0.2295118058231053, + "grad_norm": 0.7060030102729797, + "learning_rate": 9.691972324954449e-06, + "loss": 0.8309, + "step": 4170 + }, + { + "epoch": 0.22956684462546095, + "grad_norm": 0.7807210087776184, + "learning_rate": 9.691822516018099e-06, + "loss": 0.8185, + "step": 4171 + }, + { + "epoch": 0.22962188342781661, + "grad_norm": 0.6918593645095825, + "learning_rate": 9.691672671819265e-06, + "loss": 0.6983, + "step": 4172 + }, + { + "epoch": 0.22967692223017228, + "grad_norm": 0.7947858572006226, + "learning_rate": 9.691522792359077e-06, + "loss": 0.8098, + "step": 4173 + }, + { + "epoch": 0.22973196103252794, + "grad_norm": 0.7907306551933289, + "learning_rate": 9.691372877638658e-06, + "loss": 0.8, + "step": 4174 + }, + { + "epoch": 0.2297869998348836, + "grad_norm": 0.7669435739517212, + "learning_rate": 9.691222927659137e-06, + "loss": 0.8121, + "step": 4175 + }, + { + "epoch": 0.22984203863723926, + "grad_norm": 0.8128299117088318, + "learning_rate": 9.691072942421642e-06, + "loss": 0.7554, + "step": 4176 + }, + { + "epoch": 0.22989707743959492, + "grad_norm": 0.9043960571289062, + "learning_rate": 9.690922921927295e-06, + "loss": 0.8601, + "step": 4177 + }, + { + "epoch": 0.22995211624195058, + "grad_norm": 0.835445761680603, + "learning_rate": 9.690772866177229e-06, + "loss": 0.8185, + "step": 4178 + }, + { + "epoch": 0.23000715504430624, + "grad_norm": 0.734601616859436, + "learning_rate": 9.69062277517257e-06, + "loss": 0.6486, + "step": 4179 + }, + { + "epoch": 0.2300621938466619, + "grad_norm": 0.8252671957015991, + "learning_rate": 9.690472648914445e-06, + "loss": 0.8455, + "step": 4180 + }, + { + "epoch": 0.23011723264901757, + "grad_norm": 0.8266329169273376, + "learning_rate": 9.690322487403984e-06, + "loss": 0.7348, + "step": 4181 + }, + { + "epoch": 0.23017227145137323, + "grad_norm": 0.8280256390571594, + "learning_rate": 9.690172290642314e-06, + "loss": 0.8191, + "step": 4182 + }, + { + "epoch": 0.2302273102537289, + "grad_norm": 0.8854276537895203, + "learning_rate": 9.690022058630564e-06, + "loss": 0.9327, + "step": 4183 + }, + { + "epoch": 0.23028234905608455, + "grad_norm": 0.7308807969093323, + "learning_rate": 9.689871791369865e-06, + "loss": 0.8144, + "step": 4184 + }, + { + "epoch": 0.2303373878584402, + "grad_norm": 0.7171719670295715, + "learning_rate": 9.689721488861344e-06, + "loss": 0.8265, + "step": 4185 + }, + { + "epoch": 0.23039242666079587, + "grad_norm": 0.7955548763275146, + "learning_rate": 9.689571151106131e-06, + "loss": 0.7313, + "step": 4186 + }, + { + "epoch": 0.23044746546315154, + "grad_norm": 0.8218876123428345, + "learning_rate": 9.689420778105359e-06, + "loss": 0.883, + "step": 4187 + }, + { + "epoch": 0.23050250426550717, + "grad_norm": 0.79570072889328, + "learning_rate": 9.689270369860154e-06, + "loss": 0.8898, + "step": 4188 + }, + { + "epoch": 0.23055754306786283, + "grad_norm": 0.8163344264030457, + "learning_rate": 9.689119926371649e-06, + "loss": 0.8638, + "step": 4189 + }, + { + "epoch": 0.2306125818702185, + "grad_norm": 0.7767764329910278, + "learning_rate": 9.688969447640972e-06, + "loss": 0.7822, + "step": 4190 + }, + { + "epoch": 0.23066762067257415, + "grad_norm": 0.9357114434242249, + "learning_rate": 9.688818933669258e-06, + "loss": 0.8031, + "step": 4191 + }, + { + "epoch": 0.23072265947492981, + "grad_norm": 0.8340080380439758, + "learning_rate": 9.688668384457635e-06, + "loss": 0.8947, + "step": 4192 + }, + { + "epoch": 0.23077769827728548, + "grad_norm": 0.8187471628189087, + "learning_rate": 9.688517800007235e-06, + "loss": 0.7989, + "step": 4193 + }, + { + "epoch": 0.23083273707964114, + "grad_norm": 0.8131871819496155, + "learning_rate": 9.688367180319191e-06, + "loss": 0.8377, + "step": 4194 + }, + { + "epoch": 0.2308877758819968, + "grad_norm": 0.7933448553085327, + "learning_rate": 9.688216525394634e-06, + "loss": 0.8723, + "step": 4195 + }, + { + "epoch": 0.23094281468435246, + "grad_norm": 0.7262325286865234, + "learning_rate": 9.688065835234695e-06, + "loss": 0.7802, + "step": 4196 + }, + { + "epoch": 0.23099785348670812, + "grad_norm": 0.8289293050765991, + "learning_rate": 9.68791510984051e-06, + "loss": 0.642, + "step": 4197 + }, + { + "epoch": 0.23105289228906378, + "grad_norm": 0.8835988640785217, + "learning_rate": 9.687764349213211e-06, + "loss": 0.9002, + "step": 4198 + }, + { + "epoch": 0.23110793109141944, + "grad_norm": 0.9478649497032166, + "learning_rate": 9.687613553353927e-06, + "loss": 0.8668, + "step": 4199 + }, + { + "epoch": 0.2311629698937751, + "grad_norm": 0.872936487197876, + "learning_rate": 9.687462722263796e-06, + "loss": 0.8312, + "step": 4200 + }, + { + "epoch": 0.23121800869613077, + "grad_norm": 0.7073879241943359, + "learning_rate": 9.68731185594395e-06, + "loss": 0.776, + "step": 4201 + }, + { + "epoch": 0.23127304749848643, + "grad_norm": 0.8265218734741211, + "learning_rate": 9.687160954395522e-06, + "loss": 0.8152, + "step": 4202 + }, + { + "epoch": 0.2313280863008421, + "grad_norm": 0.8027207255363464, + "learning_rate": 9.687010017619649e-06, + "loss": 0.9514, + "step": 4203 + }, + { + "epoch": 0.23138312510319775, + "grad_norm": 0.7416790127754211, + "learning_rate": 9.68685904561746e-06, + "loss": 0.7708, + "step": 4204 + }, + { + "epoch": 0.2314381639055534, + "grad_norm": 0.7916150689125061, + "learning_rate": 9.686708038390096e-06, + "loss": 0.7753, + "step": 4205 + }, + { + "epoch": 0.23149320270790907, + "grad_norm": 0.7213300466537476, + "learning_rate": 9.686556995938688e-06, + "loss": 0.83, + "step": 4206 + }, + { + "epoch": 0.23154824151026474, + "grad_norm": 0.7595892548561096, + "learning_rate": 9.68640591826437e-06, + "loss": 0.8186, + "step": 4207 + }, + { + "epoch": 0.2316032803126204, + "grad_norm": 0.7042104601860046, + "learning_rate": 9.686254805368282e-06, + "loss": 0.7126, + "step": 4208 + }, + { + "epoch": 0.23165831911497606, + "grad_norm": 0.7416805028915405, + "learning_rate": 9.686103657251558e-06, + "loss": 0.7791, + "step": 4209 + }, + { + "epoch": 0.23171335791733172, + "grad_norm": 0.9868568181991577, + "learning_rate": 9.685952473915333e-06, + "loss": 0.8453, + "step": 4210 + }, + { + "epoch": 0.23176839671968738, + "grad_norm": 0.7133191823959351, + "learning_rate": 9.68580125536074e-06, + "loss": 0.6061, + "step": 4211 + }, + { + "epoch": 0.23182343552204304, + "grad_norm": 0.8307366967201233, + "learning_rate": 9.685650001588921e-06, + "loss": 0.8403, + "step": 4212 + }, + { + "epoch": 0.2318784743243987, + "grad_norm": 0.8395226001739502, + "learning_rate": 9.685498712601014e-06, + "loss": 0.7945, + "step": 4213 + }, + { + "epoch": 0.23193351312675436, + "grad_norm": 0.7557219862937927, + "learning_rate": 9.68534738839815e-06, + "loss": 0.7765, + "step": 4214 + }, + { + "epoch": 0.23198855192911003, + "grad_norm": 0.7003554105758667, + "learning_rate": 9.68519602898147e-06, + "loss": 0.7228, + "step": 4215 + }, + { + "epoch": 0.2320435907314657, + "grad_norm": 0.8422999382019043, + "learning_rate": 9.68504463435211e-06, + "loss": 0.8524, + "step": 4216 + }, + { + "epoch": 0.23209862953382135, + "grad_norm": 0.9369016289710999, + "learning_rate": 9.68489320451121e-06, + "loss": 0.7646, + "step": 4217 + }, + { + "epoch": 0.232153668336177, + "grad_norm": 0.8456607460975647, + "learning_rate": 9.684741739459905e-06, + "loss": 0.7481, + "step": 4218 + }, + { + "epoch": 0.23220870713853267, + "grad_norm": 0.9284812211990356, + "learning_rate": 9.684590239199336e-06, + "loss": 0.8192, + "step": 4219 + }, + { + "epoch": 0.23226374594088833, + "grad_norm": 0.8474242687225342, + "learning_rate": 9.68443870373064e-06, + "loss": 0.7143, + "step": 4220 + }, + { + "epoch": 0.232318784743244, + "grad_norm": 0.8259334564208984, + "learning_rate": 9.684287133054957e-06, + "loss": 0.8667, + "step": 4221 + }, + { + "epoch": 0.23237382354559966, + "grad_norm": 0.8016416430473328, + "learning_rate": 9.684135527173427e-06, + "loss": 0.8694, + "step": 4222 + }, + { + "epoch": 0.23242886234795532, + "grad_norm": 0.7575937509536743, + "learning_rate": 9.683983886087186e-06, + "loss": 0.7591, + "step": 4223 + }, + { + "epoch": 0.23248390115031098, + "grad_norm": 0.7004683613777161, + "learning_rate": 9.683832209797377e-06, + "loss": 0.739, + "step": 4224 + }, + { + "epoch": 0.23253893995266664, + "grad_norm": 0.8265832662582397, + "learning_rate": 9.68368049830514e-06, + "loss": 0.7705, + "step": 4225 + }, + { + "epoch": 0.2325939787550223, + "grad_norm": 0.7705711722373962, + "learning_rate": 9.683528751611612e-06, + "loss": 0.7896, + "step": 4226 + }, + { + "epoch": 0.23264901755737796, + "grad_norm": 0.7426978349685669, + "learning_rate": 9.683376969717937e-06, + "loss": 0.8217, + "step": 4227 + }, + { + "epoch": 0.23270405635973362, + "grad_norm": 0.7425839304924011, + "learning_rate": 9.683225152625255e-06, + "loss": 0.7426, + "step": 4228 + }, + { + "epoch": 0.23275909516208929, + "grad_norm": 1.0415440797805786, + "learning_rate": 9.683073300334705e-06, + "loss": 0.8585, + "step": 4229 + }, + { + "epoch": 0.23281413396444495, + "grad_norm": 0.7706055045127869, + "learning_rate": 9.68292141284743e-06, + "loss": 0.8349, + "step": 4230 + }, + { + "epoch": 0.23286917276680058, + "grad_norm": 0.8407607674598694, + "learning_rate": 9.682769490164572e-06, + "loss": 0.8592, + "step": 4231 + }, + { + "epoch": 0.23292421156915624, + "grad_norm": 0.6830767393112183, + "learning_rate": 9.68261753228727e-06, + "loss": 0.6773, + "step": 4232 + }, + { + "epoch": 0.2329792503715119, + "grad_norm": 1.6661429405212402, + "learning_rate": 9.68246553921667e-06, + "loss": 1.005, + "step": 4233 + }, + { + "epoch": 0.23303428917386756, + "grad_norm": 0.7677092552185059, + "learning_rate": 9.682313510953912e-06, + "loss": 0.7689, + "step": 4234 + }, + { + "epoch": 0.23308932797622323, + "grad_norm": 0.7232248187065125, + "learning_rate": 9.682161447500139e-06, + "loss": 0.7765, + "step": 4235 + }, + { + "epoch": 0.2331443667785789, + "grad_norm": 0.8667388558387756, + "learning_rate": 9.682009348856494e-06, + "loss": 0.8099, + "step": 4236 + }, + { + "epoch": 0.23319940558093455, + "grad_norm": 0.8220446705818176, + "learning_rate": 9.68185721502412e-06, + "loss": 0.8078, + "step": 4237 + }, + { + "epoch": 0.2332544443832902, + "grad_norm": 0.9670323133468628, + "learning_rate": 9.68170504600416e-06, + "loss": 0.8912, + "step": 4238 + }, + { + "epoch": 0.23330948318564587, + "grad_norm": 0.7950771450996399, + "learning_rate": 9.68155284179776e-06, + "loss": 0.8165, + "step": 4239 + }, + { + "epoch": 0.23336452198800153, + "grad_norm": 0.7606233358383179, + "learning_rate": 9.68140060240606e-06, + "loss": 0.7795, + "step": 4240 + }, + { + "epoch": 0.2334195607903572, + "grad_norm": 0.9580656886100769, + "learning_rate": 9.681248327830205e-06, + "loss": 0.7949, + "step": 4241 + }, + { + "epoch": 0.23347459959271286, + "grad_norm": 0.6878347992897034, + "learning_rate": 9.681096018071341e-06, + "loss": 0.7776, + "step": 4242 + }, + { + "epoch": 0.23352963839506852, + "grad_norm": 0.8449816107749939, + "learning_rate": 9.680943673130614e-06, + "loss": 0.8456, + "step": 4243 + }, + { + "epoch": 0.23358467719742418, + "grad_norm": 0.77314692735672, + "learning_rate": 9.680791293009167e-06, + "loss": 0.7915, + "step": 4244 + }, + { + "epoch": 0.23363971599977984, + "grad_norm": 0.8034142255783081, + "learning_rate": 9.680638877708146e-06, + "loss": 0.7377, + "step": 4245 + }, + { + "epoch": 0.2336947548021355, + "grad_norm": 0.8754952549934387, + "learning_rate": 9.680486427228695e-06, + "loss": 0.8072, + "step": 4246 + }, + { + "epoch": 0.23374979360449116, + "grad_norm": 0.8169820308685303, + "learning_rate": 9.680333941571963e-06, + "loss": 0.8253, + "step": 4247 + }, + { + "epoch": 0.23380483240684682, + "grad_norm": 0.7848341464996338, + "learning_rate": 9.680181420739092e-06, + "loss": 0.8243, + "step": 4248 + }, + { + "epoch": 0.23385987120920249, + "grad_norm": 0.7599799036979675, + "learning_rate": 9.68002886473123e-06, + "loss": 0.781, + "step": 4249 + }, + { + "epoch": 0.23391491001155815, + "grad_norm": 0.8920254707336426, + "learning_rate": 9.679876273549524e-06, + "loss": 0.8199, + "step": 4250 + }, + { + "epoch": 0.2339699488139138, + "grad_norm": 0.7813586592674255, + "learning_rate": 9.679723647195121e-06, + "loss": 0.7758, + "step": 4251 + }, + { + "epoch": 0.23402498761626947, + "grad_norm": 0.735282838344574, + "learning_rate": 9.679570985669168e-06, + "loss": 0.7651, + "step": 4252 + }, + { + "epoch": 0.23408002641862513, + "grad_norm": 0.7305853962898254, + "learning_rate": 9.679418288972813e-06, + "loss": 0.8202, + "step": 4253 + }, + { + "epoch": 0.2341350652209808, + "grad_norm": 0.8331005573272705, + "learning_rate": 9.6792655571072e-06, + "loss": 0.8784, + "step": 4254 + }, + { + "epoch": 0.23419010402333645, + "grad_norm": 0.8526305556297302, + "learning_rate": 9.679112790073481e-06, + "loss": 0.8116, + "step": 4255 + }, + { + "epoch": 0.23424514282569212, + "grad_norm": 0.741073489189148, + "learning_rate": 9.678959987872805e-06, + "loss": 0.6928, + "step": 4256 + }, + { + "epoch": 0.23430018162804778, + "grad_norm": 0.727859616279602, + "learning_rate": 9.678807150506315e-06, + "loss": 0.7571, + "step": 4257 + }, + { + "epoch": 0.23435522043040344, + "grad_norm": 0.8890698552131653, + "learning_rate": 9.678654277975165e-06, + "loss": 0.8145, + "step": 4258 + }, + { + "epoch": 0.2344102592327591, + "grad_norm": 0.7372937798500061, + "learning_rate": 9.6785013702805e-06, + "loss": 0.7104, + "step": 4259 + }, + { + "epoch": 0.23446529803511476, + "grad_norm": 0.7205008268356323, + "learning_rate": 9.678348427423472e-06, + "loss": 0.7498, + "step": 4260 + }, + { + "epoch": 0.23452033683747042, + "grad_norm": 0.7766392230987549, + "learning_rate": 9.67819544940523e-06, + "loss": 0.7814, + "step": 4261 + }, + { + "epoch": 0.23457537563982608, + "grad_norm": 0.7441498637199402, + "learning_rate": 9.678042436226922e-06, + "loss": 0.7429, + "step": 4262 + }, + { + "epoch": 0.23463041444218175, + "grad_norm": 0.8838522434234619, + "learning_rate": 9.677889387889701e-06, + "loss": 0.8719, + "step": 4263 + }, + { + "epoch": 0.2346854532445374, + "grad_norm": 1.2349655628204346, + "learning_rate": 9.677736304394716e-06, + "loss": 0.8491, + "step": 4264 + }, + { + "epoch": 0.23474049204689307, + "grad_norm": 0.8050087690353394, + "learning_rate": 9.677583185743116e-06, + "loss": 0.795, + "step": 4265 + }, + { + "epoch": 0.23479553084924873, + "grad_norm": 0.7885709404945374, + "learning_rate": 9.677430031936051e-06, + "loss": 0.8594, + "step": 4266 + }, + { + "epoch": 0.2348505696516044, + "grad_norm": 0.7753557562828064, + "learning_rate": 9.677276842974676e-06, + "loss": 0.8196, + "step": 4267 + }, + { + "epoch": 0.23490560845396005, + "grad_norm": 0.7325392961502075, + "learning_rate": 9.67712361886014e-06, + "loss": 0.7905, + "step": 4268 + }, + { + "epoch": 0.2349606472563157, + "grad_norm": 0.7925617694854736, + "learning_rate": 9.676970359593594e-06, + "loss": 0.7416, + "step": 4269 + }, + { + "epoch": 0.23501568605867137, + "grad_norm": 0.7981371283531189, + "learning_rate": 9.676817065176192e-06, + "loss": 0.81, + "step": 4270 + }, + { + "epoch": 0.23507072486102704, + "grad_norm": 0.7490524053573608, + "learning_rate": 9.676663735609084e-06, + "loss": 0.8347, + "step": 4271 + }, + { + "epoch": 0.2351257636633827, + "grad_norm": 1.000349521636963, + "learning_rate": 9.676510370893424e-06, + "loss": 0.7469, + "step": 4272 + }, + { + "epoch": 0.23518080246573836, + "grad_norm": 0.9310774207115173, + "learning_rate": 9.676356971030364e-06, + "loss": 0.8088, + "step": 4273 + }, + { + "epoch": 0.235235841268094, + "grad_norm": 0.8868544101715088, + "learning_rate": 9.676203536021055e-06, + "loss": 0.7472, + "step": 4274 + }, + { + "epoch": 0.23529088007044965, + "grad_norm": 0.7702255845069885, + "learning_rate": 9.676050065866653e-06, + "loss": 0.8395, + "step": 4275 + }, + { + "epoch": 0.23534591887280532, + "grad_norm": 0.7138833999633789, + "learning_rate": 9.675896560568311e-06, + "loss": 0.8529, + "step": 4276 + }, + { + "epoch": 0.23540095767516098, + "grad_norm": 0.8399729132652283, + "learning_rate": 9.675743020127182e-06, + "loss": 0.7844, + "step": 4277 + }, + { + "epoch": 0.23545599647751664, + "grad_norm": 0.8500726819038391, + "learning_rate": 9.67558944454442e-06, + "loss": 0.8209, + "step": 4278 + }, + { + "epoch": 0.2355110352798723, + "grad_norm": 0.766638994216919, + "learning_rate": 9.675435833821178e-06, + "loss": 0.7834, + "step": 4279 + }, + { + "epoch": 0.23556607408222796, + "grad_norm": 0.9121370315551758, + "learning_rate": 9.675282187958613e-06, + "loss": 0.8697, + "step": 4280 + }, + { + "epoch": 0.23562111288458362, + "grad_norm": 0.7862319946289062, + "learning_rate": 9.675128506957879e-06, + "loss": 0.8262, + "step": 4281 + }, + { + "epoch": 0.23567615168693928, + "grad_norm": 1.072777509689331, + "learning_rate": 9.67497479082013e-06, + "loss": 0.7963, + "step": 4282 + }, + { + "epoch": 0.23573119048929495, + "grad_norm": 0.7574695944786072, + "learning_rate": 9.67482103954652e-06, + "loss": 0.8178, + "step": 4283 + }, + { + "epoch": 0.2357862292916506, + "grad_norm": 0.7996877431869507, + "learning_rate": 9.674667253138209e-06, + "loss": 0.8465, + "step": 4284 + }, + { + "epoch": 0.23584126809400627, + "grad_norm": 0.711513340473175, + "learning_rate": 9.674513431596349e-06, + "loss": 0.7445, + "step": 4285 + }, + { + "epoch": 0.23589630689636193, + "grad_norm": 0.7431296706199646, + "learning_rate": 9.674359574922098e-06, + "loss": 0.8102, + "step": 4286 + }, + { + "epoch": 0.2359513456987176, + "grad_norm": 0.7745676040649414, + "learning_rate": 9.674205683116612e-06, + "loss": 0.8733, + "step": 4287 + }, + { + "epoch": 0.23600638450107325, + "grad_norm": 1.0117937326431274, + "learning_rate": 9.674051756181046e-06, + "loss": 0.9035, + "step": 4288 + }, + { + "epoch": 0.2360614233034289, + "grad_norm": 0.7848078608512878, + "learning_rate": 9.67389779411656e-06, + "loss": 0.8486, + "step": 4289 + }, + { + "epoch": 0.23611646210578457, + "grad_norm": 0.8439378142356873, + "learning_rate": 9.673743796924307e-06, + "loss": 0.8032, + "step": 4290 + }, + { + "epoch": 0.23617150090814024, + "grad_norm": 0.8268104791641235, + "learning_rate": 9.673589764605449e-06, + "loss": 0.8182, + "step": 4291 + }, + { + "epoch": 0.2362265397104959, + "grad_norm": 0.8896234631538391, + "learning_rate": 9.67343569716114e-06, + "loss": 0.8081, + "step": 4292 + }, + { + "epoch": 0.23628157851285156, + "grad_norm": 0.8515019416809082, + "learning_rate": 9.67328159459254e-06, + "loss": 0.8239, + "step": 4293 + }, + { + "epoch": 0.23633661731520722, + "grad_norm": 0.7779792547225952, + "learning_rate": 9.673127456900806e-06, + "loss": 0.8437, + "step": 4294 + }, + { + "epoch": 0.23639165611756288, + "grad_norm": 0.7782402634620667, + "learning_rate": 9.672973284087097e-06, + "loss": 0.8498, + "step": 4295 + }, + { + "epoch": 0.23644669491991854, + "grad_norm": 0.7588973641395569, + "learning_rate": 9.67281907615257e-06, + "loss": 0.7034, + "step": 4296 + }, + { + "epoch": 0.2365017337222742, + "grad_norm": 0.8426640629768372, + "learning_rate": 9.67266483309839e-06, + "loss": 0.803, + "step": 4297 + }, + { + "epoch": 0.23655677252462987, + "grad_norm": 0.8945889472961426, + "learning_rate": 9.672510554925707e-06, + "loss": 0.8971, + "step": 4298 + }, + { + "epoch": 0.23661181132698553, + "grad_norm": 0.8604227304458618, + "learning_rate": 9.672356241635688e-06, + "loss": 0.7548, + "step": 4299 + }, + { + "epoch": 0.2366668501293412, + "grad_norm": 0.7277490496635437, + "learning_rate": 9.672201893229489e-06, + "loss": 0.8083, + "step": 4300 + }, + { + "epoch": 0.23672188893169685, + "grad_norm": 0.9089379906654358, + "learning_rate": 9.672047509708273e-06, + "loss": 0.9717, + "step": 4301 + }, + { + "epoch": 0.2367769277340525, + "grad_norm": 0.7207155823707581, + "learning_rate": 9.671893091073198e-06, + "loss": 0.6794, + "step": 4302 + }, + { + "epoch": 0.23683196653640817, + "grad_norm": 0.7319806814193726, + "learning_rate": 9.671738637325425e-06, + "loss": 0.6821, + "step": 4303 + }, + { + "epoch": 0.23688700533876383, + "grad_norm": 0.7339589595794678, + "learning_rate": 9.671584148466112e-06, + "loss": 0.7895, + "step": 4304 + }, + { + "epoch": 0.2369420441411195, + "grad_norm": 0.7725476622581482, + "learning_rate": 9.671429624496428e-06, + "loss": 0.7414, + "step": 4305 + }, + { + "epoch": 0.23699708294347516, + "grad_norm": 0.7040137648582458, + "learning_rate": 9.671275065417527e-06, + "loss": 0.696, + "step": 4306 + }, + { + "epoch": 0.23705212174583082, + "grad_norm": 0.8804189562797546, + "learning_rate": 9.671120471230572e-06, + "loss": 0.8184, + "step": 4307 + }, + { + "epoch": 0.23710716054818648, + "grad_norm": 0.8062872886657715, + "learning_rate": 9.670965841936728e-06, + "loss": 0.7856, + "step": 4308 + }, + { + "epoch": 0.23716219935054214, + "grad_norm": 0.7537097930908203, + "learning_rate": 9.670811177537154e-06, + "loss": 0.7562, + "step": 4309 + }, + { + "epoch": 0.2372172381528978, + "grad_norm": 0.8168618083000183, + "learning_rate": 9.670656478033013e-06, + "loss": 0.7416, + "step": 4310 + }, + { + "epoch": 0.23727227695525346, + "grad_norm": 0.8367040157318115, + "learning_rate": 9.670501743425469e-06, + "loss": 0.7759, + "step": 4311 + }, + { + "epoch": 0.23732731575760913, + "grad_norm": 0.860418975353241, + "learning_rate": 9.670346973715683e-06, + "loss": 0.9013, + "step": 4312 + }, + { + "epoch": 0.2373823545599648, + "grad_norm": 0.8736678957939148, + "learning_rate": 9.67019216890482e-06, + "loss": 0.8677, + "step": 4313 + }, + { + "epoch": 0.23743739336232045, + "grad_norm": 0.8258964419364929, + "learning_rate": 9.670037328994044e-06, + "loss": 0.8208, + "step": 4314 + }, + { + "epoch": 0.2374924321646761, + "grad_norm": 0.7936292886734009, + "learning_rate": 9.669882453984516e-06, + "loss": 0.8643, + "step": 4315 + }, + { + "epoch": 0.23754747096703177, + "grad_norm": 0.805500864982605, + "learning_rate": 9.669727543877401e-06, + "loss": 0.779, + "step": 4316 + }, + { + "epoch": 0.2376025097693874, + "grad_norm": 0.8072311282157898, + "learning_rate": 9.669572598673866e-06, + "loss": 0.8258, + "step": 4317 + }, + { + "epoch": 0.23765754857174307, + "grad_norm": 0.8917607665061951, + "learning_rate": 9.669417618375072e-06, + "loss": 0.7528, + "step": 4318 + }, + { + "epoch": 0.23771258737409873, + "grad_norm": 0.7054246068000793, + "learning_rate": 9.669262602982186e-06, + "loss": 0.86, + "step": 4319 + }, + { + "epoch": 0.2377676261764544, + "grad_norm": 0.8600299954414368, + "learning_rate": 9.66910755249637e-06, + "loss": 0.8165, + "step": 4320 + }, + { + "epoch": 0.23782266497881005, + "grad_norm": 0.8685561418533325, + "learning_rate": 9.668952466918793e-06, + "loss": 0.8129, + "step": 4321 + }, + { + "epoch": 0.2378777037811657, + "grad_norm": 0.7859770655632019, + "learning_rate": 9.668797346250618e-06, + "loss": 0.8703, + "step": 4322 + }, + { + "epoch": 0.23793274258352137, + "grad_norm": 0.8128730058670044, + "learning_rate": 9.668642190493015e-06, + "loss": 0.7595, + "step": 4323 + }, + { + "epoch": 0.23798778138587703, + "grad_norm": 0.8223204612731934, + "learning_rate": 9.668486999647143e-06, + "loss": 0.825, + "step": 4324 + }, + { + "epoch": 0.2380428201882327, + "grad_norm": 0.859619677066803, + "learning_rate": 9.668331773714175e-06, + "loss": 0.8239, + "step": 4325 + }, + { + "epoch": 0.23809785899058836, + "grad_norm": 0.9861679673194885, + "learning_rate": 9.668176512695273e-06, + "loss": 0.8409, + "step": 4326 + }, + { + "epoch": 0.23815289779294402, + "grad_norm": 0.7178627252578735, + "learning_rate": 9.668021216591607e-06, + "loss": 0.818, + "step": 4327 + }, + { + "epoch": 0.23820793659529968, + "grad_norm": 0.9160923957824707, + "learning_rate": 9.667865885404343e-06, + "loss": 0.8703, + "step": 4328 + }, + { + "epoch": 0.23826297539765534, + "grad_norm": 0.7043942213058472, + "learning_rate": 9.667710519134648e-06, + "loss": 0.6884, + "step": 4329 + }, + { + "epoch": 0.238318014200011, + "grad_norm": 1.213121771812439, + "learning_rate": 9.667555117783691e-06, + "loss": 0.7843, + "step": 4330 + }, + { + "epoch": 0.23837305300236666, + "grad_norm": 0.8008033037185669, + "learning_rate": 9.66739968135264e-06, + "loss": 0.9312, + "step": 4331 + }, + { + "epoch": 0.23842809180472233, + "grad_norm": 0.7862009406089783, + "learning_rate": 9.667244209842662e-06, + "loss": 0.6965, + "step": 4332 + }, + { + "epoch": 0.238483130607078, + "grad_norm": 1.081398844718933, + "learning_rate": 9.667088703254923e-06, + "loss": 0.9793, + "step": 4333 + }, + { + "epoch": 0.23853816940943365, + "grad_norm": 0.7672395706176758, + "learning_rate": 9.666933161590597e-06, + "loss": 0.813, + "step": 4334 + }, + { + "epoch": 0.2385932082117893, + "grad_norm": 0.6955493092536926, + "learning_rate": 9.66677758485085e-06, + "loss": 0.7778, + "step": 4335 + }, + { + "epoch": 0.23864824701414497, + "grad_norm": 0.8609682321548462, + "learning_rate": 9.666621973036854e-06, + "loss": 0.7817, + "step": 4336 + }, + { + "epoch": 0.23870328581650063, + "grad_norm": 0.7312196493148804, + "learning_rate": 9.666466326149774e-06, + "loss": 0.7368, + "step": 4337 + }, + { + "epoch": 0.2387583246188563, + "grad_norm": 0.7964538931846619, + "learning_rate": 9.666310644190782e-06, + "loss": 0.8124, + "step": 4338 + }, + { + "epoch": 0.23881336342121195, + "grad_norm": 1.1138910055160522, + "learning_rate": 9.66615492716105e-06, + "loss": 0.8886, + "step": 4339 + }, + { + "epoch": 0.23886840222356762, + "grad_norm": 0.8789949417114258, + "learning_rate": 9.665999175061747e-06, + "loss": 0.7854, + "step": 4340 + }, + { + "epoch": 0.23892344102592328, + "grad_norm": 0.7761380076408386, + "learning_rate": 9.665843387894041e-06, + "loss": 0.7915, + "step": 4341 + }, + { + "epoch": 0.23897847982827894, + "grad_norm": 0.888482928276062, + "learning_rate": 9.665687565659106e-06, + "loss": 0.8799, + "step": 4342 + }, + { + "epoch": 0.2390335186306346, + "grad_norm": 0.7799200415611267, + "learning_rate": 9.665531708358111e-06, + "loss": 0.8519, + "step": 4343 + }, + { + "epoch": 0.23908855743299026, + "grad_norm": 0.7407697439193726, + "learning_rate": 9.665375815992231e-06, + "loss": 0.7637, + "step": 4344 + }, + { + "epoch": 0.23914359623534592, + "grad_norm": 0.8098278045654297, + "learning_rate": 9.665219888562634e-06, + "loss": 0.7991, + "step": 4345 + }, + { + "epoch": 0.23919863503770158, + "grad_norm": 0.7585136294364929, + "learning_rate": 9.665063926070493e-06, + "loss": 0.8478, + "step": 4346 + }, + { + "epoch": 0.23925367384005725, + "grad_norm": 0.7294817566871643, + "learning_rate": 9.66490792851698e-06, + "loss": 0.8312, + "step": 4347 + }, + { + "epoch": 0.2393087126424129, + "grad_norm": 0.8325762748718262, + "learning_rate": 9.664751895903269e-06, + "loss": 0.9365, + "step": 4348 + }, + { + "epoch": 0.23936375144476857, + "grad_norm": 0.9992470741271973, + "learning_rate": 9.66459582823053e-06, + "loss": 0.8649, + "step": 4349 + }, + { + "epoch": 0.23941879024712423, + "grad_norm": 0.7206875681877136, + "learning_rate": 9.664439725499938e-06, + "loss": 0.7013, + "step": 4350 + }, + { + "epoch": 0.2394738290494799, + "grad_norm": 0.946657657623291, + "learning_rate": 9.664283587712665e-06, + "loss": 0.7953, + "step": 4351 + }, + { + "epoch": 0.23952886785183555, + "grad_norm": 0.7684911489486694, + "learning_rate": 9.664127414869887e-06, + "loss": 0.8403, + "step": 4352 + }, + { + "epoch": 0.23958390665419121, + "grad_norm": 0.7875770926475525, + "learning_rate": 9.663971206972773e-06, + "loss": 0.7961, + "step": 4353 + }, + { + "epoch": 0.23963894545654688, + "grad_norm": 0.7387273907661438, + "learning_rate": 9.663814964022502e-06, + "loss": 0.8265, + "step": 4354 + }, + { + "epoch": 0.23969398425890254, + "grad_norm": 0.7413492202758789, + "learning_rate": 9.663658686020245e-06, + "loss": 0.8458, + "step": 4355 + }, + { + "epoch": 0.2397490230612582, + "grad_norm": 0.7563235759735107, + "learning_rate": 9.663502372967177e-06, + "loss": 0.8498, + "step": 4356 + }, + { + "epoch": 0.23980406186361386, + "grad_norm": 0.7529472708702087, + "learning_rate": 9.663346024864475e-06, + "loss": 0.7597, + "step": 4357 + }, + { + "epoch": 0.23985910066596952, + "grad_norm": 0.7582191824913025, + "learning_rate": 9.663189641713314e-06, + "loss": 0.804, + "step": 4358 + }, + { + "epoch": 0.23991413946832518, + "grad_norm": 0.8394485712051392, + "learning_rate": 9.663033223514865e-06, + "loss": 0.8329, + "step": 4359 + }, + { + "epoch": 0.23996917827068082, + "grad_norm": 0.7088292241096497, + "learning_rate": 9.662876770270308e-06, + "loss": 0.7131, + "step": 4360 + }, + { + "epoch": 0.24002421707303648, + "grad_norm": 0.8548080325126648, + "learning_rate": 9.662720281980817e-06, + "loss": 0.8925, + "step": 4361 + }, + { + "epoch": 0.24007925587539214, + "grad_norm": 0.8027567267417908, + "learning_rate": 9.662563758647568e-06, + "loss": 0.8652, + "step": 4362 + }, + { + "epoch": 0.2401342946777478, + "grad_norm": 0.7471736669540405, + "learning_rate": 9.662407200271738e-06, + "loss": 0.7722, + "step": 4363 + }, + { + "epoch": 0.24018933348010346, + "grad_norm": 0.7358804941177368, + "learning_rate": 9.662250606854504e-06, + "loss": 0.767, + "step": 4364 + }, + { + "epoch": 0.24024437228245912, + "grad_norm": 0.7948476672172546, + "learning_rate": 9.662093978397042e-06, + "loss": 0.961, + "step": 4365 + }, + { + "epoch": 0.24029941108481478, + "grad_norm": 0.7030961513519287, + "learning_rate": 9.66193731490053e-06, + "loss": 0.7826, + "step": 4366 + }, + { + "epoch": 0.24035444988717045, + "grad_norm": 0.8376098871231079, + "learning_rate": 9.661780616366145e-06, + "loss": 0.7697, + "step": 4367 + }, + { + "epoch": 0.2404094886895261, + "grad_norm": 0.7449594140052795, + "learning_rate": 9.661623882795065e-06, + "loss": 0.7944, + "step": 4368 + }, + { + "epoch": 0.24046452749188177, + "grad_norm": 0.7317184805870056, + "learning_rate": 9.661467114188468e-06, + "loss": 0.7059, + "step": 4369 + }, + { + "epoch": 0.24051956629423743, + "grad_norm": 0.843912661075592, + "learning_rate": 9.661310310547531e-06, + "loss": 0.7889, + "step": 4370 + }, + { + "epoch": 0.2405746050965931, + "grad_norm": 0.8673211336135864, + "learning_rate": 9.661153471873435e-06, + "loss": 0.7234, + "step": 4371 + }, + { + "epoch": 0.24062964389894875, + "grad_norm": 0.8179688453674316, + "learning_rate": 9.660996598167354e-06, + "loss": 0.8937, + "step": 4372 + }, + { + "epoch": 0.24068468270130441, + "grad_norm": 0.7800211906433105, + "learning_rate": 9.660839689430473e-06, + "loss": 0.8596, + "step": 4373 + }, + { + "epoch": 0.24073972150366008, + "grad_norm": 0.8781671524047852, + "learning_rate": 9.660682745663967e-06, + "loss": 0.8507, + "step": 4374 + }, + { + "epoch": 0.24079476030601574, + "grad_norm": 0.7701708674430847, + "learning_rate": 9.660525766869019e-06, + "loss": 0.8212, + "step": 4375 + }, + { + "epoch": 0.2408497991083714, + "grad_norm": 0.7721084356307983, + "learning_rate": 9.660368753046806e-06, + "loss": 0.7493, + "step": 4376 + }, + { + "epoch": 0.24090483791072706, + "grad_norm": 0.8126489520072937, + "learning_rate": 9.660211704198508e-06, + "loss": 0.8527, + "step": 4377 + }, + { + "epoch": 0.24095987671308272, + "grad_norm": 0.8172717690467834, + "learning_rate": 9.660054620325307e-06, + "loss": 0.8448, + "step": 4378 + }, + { + "epoch": 0.24101491551543838, + "grad_norm": 0.8293611407279968, + "learning_rate": 9.659897501428384e-06, + "loss": 0.9318, + "step": 4379 + }, + { + "epoch": 0.24106995431779404, + "grad_norm": 0.7445098161697388, + "learning_rate": 9.659740347508917e-06, + "loss": 0.7358, + "step": 4380 + }, + { + "epoch": 0.2411249931201497, + "grad_norm": 0.7778907418251038, + "learning_rate": 9.659583158568088e-06, + "loss": 0.7671, + "step": 4381 + }, + { + "epoch": 0.24118003192250537, + "grad_norm": 0.7828608751296997, + "learning_rate": 9.659425934607082e-06, + "loss": 0.8141, + "step": 4382 + }, + { + "epoch": 0.24123507072486103, + "grad_norm": 0.9433113932609558, + "learning_rate": 9.659268675627075e-06, + "loss": 0.7904, + "step": 4383 + }, + { + "epoch": 0.2412901095272167, + "grad_norm": 0.7097491025924683, + "learning_rate": 9.659111381629255e-06, + "loss": 0.7445, + "step": 4384 + }, + { + "epoch": 0.24134514832957235, + "grad_norm": 0.7450230717658997, + "learning_rate": 9.6589540526148e-06, + "loss": 0.6869, + "step": 4385 + }, + { + "epoch": 0.241400187131928, + "grad_norm": 0.7429760694503784, + "learning_rate": 9.658796688584893e-06, + "loss": 0.7367, + "step": 4386 + }, + { + "epoch": 0.24145522593428367, + "grad_norm": 0.7250030040740967, + "learning_rate": 9.658639289540716e-06, + "loss": 0.7502, + "step": 4387 + }, + { + "epoch": 0.24151026473663934, + "grad_norm": 0.6577159762382507, + "learning_rate": 9.658481855483455e-06, + "loss": 0.5785, + "step": 4388 + }, + { + "epoch": 0.241565303538995, + "grad_norm": 0.7846524119377136, + "learning_rate": 9.65832438641429e-06, + "loss": 0.7435, + "step": 4389 + }, + { + "epoch": 0.24162034234135066, + "grad_norm": 0.8370404839515686, + "learning_rate": 9.658166882334408e-06, + "loss": 0.8536, + "step": 4390 + }, + { + "epoch": 0.24167538114370632, + "grad_norm": 0.7451018691062927, + "learning_rate": 9.658009343244987e-06, + "loss": 0.8443, + "step": 4391 + }, + { + "epoch": 0.24173041994606198, + "grad_norm": 0.7629074454307556, + "learning_rate": 9.657851769147218e-06, + "loss": 0.7394, + "step": 4392 + }, + { + "epoch": 0.24178545874841764, + "grad_norm": 0.7767705321311951, + "learning_rate": 9.657694160042282e-06, + "loss": 0.8497, + "step": 4393 + }, + { + "epoch": 0.2418404975507733, + "grad_norm": 0.8635357022285461, + "learning_rate": 9.65753651593136e-06, + "loss": 0.8495, + "step": 4394 + }, + { + "epoch": 0.24189553635312896, + "grad_norm": 0.7652365565299988, + "learning_rate": 9.657378836815643e-06, + "loss": 0.7967, + "step": 4395 + }, + { + "epoch": 0.24195057515548463, + "grad_norm": 0.7721680402755737, + "learning_rate": 9.657221122696313e-06, + "loss": 0.8227, + "step": 4396 + }, + { + "epoch": 0.2420056139578403, + "grad_norm": 1.016366720199585, + "learning_rate": 9.657063373574555e-06, + "loss": 0.8291, + "step": 4397 + }, + { + "epoch": 0.24206065276019595, + "grad_norm": 0.7770145535469055, + "learning_rate": 9.656905589451555e-06, + "loss": 0.8335, + "step": 4398 + }, + { + "epoch": 0.2421156915625516, + "grad_norm": 0.812882125377655, + "learning_rate": 9.6567477703285e-06, + "loss": 0.8189, + "step": 4399 + }, + { + "epoch": 0.24217073036490727, + "grad_norm": 0.7253247499465942, + "learning_rate": 9.656589916206576e-06, + "loss": 0.8418, + "step": 4400 + }, + { + "epoch": 0.24222576916726293, + "grad_norm": 0.7784958481788635, + "learning_rate": 9.656432027086969e-06, + "loss": 0.8541, + "step": 4401 + }, + { + "epoch": 0.2422808079696186, + "grad_norm": 0.8001978397369385, + "learning_rate": 9.656274102970865e-06, + "loss": 0.8888, + "step": 4402 + }, + { + "epoch": 0.24233584677197423, + "grad_norm": 0.7535765767097473, + "learning_rate": 9.656116143859448e-06, + "loss": 0.7691, + "step": 4403 + }, + { + "epoch": 0.2423908855743299, + "grad_norm": 0.6554346680641174, + "learning_rate": 9.655958149753913e-06, + "loss": 0.7592, + "step": 4404 + }, + { + "epoch": 0.24244592437668555, + "grad_norm": 0.8599995374679565, + "learning_rate": 9.655800120655439e-06, + "loss": 0.8396, + "step": 4405 + }, + { + "epoch": 0.2425009631790412, + "grad_norm": 0.8172232508659363, + "learning_rate": 9.65564205656522e-06, + "loss": 0.6931, + "step": 4406 + }, + { + "epoch": 0.24255600198139687, + "grad_norm": 0.8005852699279785, + "learning_rate": 9.65548395748444e-06, + "loss": 0.8344, + "step": 4407 + }, + { + "epoch": 0.24261104078375254, + "grad_norm": 0.7823762893676758, + "learning_rate": 9.65532582341429e-06, + "loss": 0.7991, + "step": 4408 + }, + { + "epoch": 0.2426660795861082, + "grad_norm": 0.7743250727653503, + "learning_rate": 9.655167654355957e-06, + "loss": 0.9048, + "step": 4409 + }, + { + "epoch": 0.24272111838846386, + "grad_norm": 0.9825221300125122, + "learning_rate": 9.655009450310629e-06, + "loss": 0.7491, + "step": 4410 + }, + { + "epoch": 0.24277615719081952, + "grad_norm": 1.2921068668365479, + "learning_rate": 9.654851211279496e-06, + "loss": 0.8175, + "step": 4411 + }, + { + "epoch": 0.24283119599317518, + "grad_norm": 0.8267684578895569, + "learning_rate": 9.65469293726375e-06, + "loss": 0.8896, + "step": 4412 + }, + { + "epoch": 0.24288623479553084, + "grad_norm": 0.8020186424255371, + "learning_rate": 9.654534628264576e-06, + "loss": 0.7145, + "step": 4413 + }, + { + "epoch": 0.2429412735978865, + "grad_norm": 0.8192574977874756, + "learning_rate": 9.654376284283166e-06, + "loss": 0.7451, + "step": 4414 + }, + { + "epoch": 0.24299631240024216, + "grad_norm": 0.7733662128448486, + "learning_rate": 9.65421790532071e-06, + "loss": 0.768, + "step": 4415 + }, + { + "epoch": 0.24305135120259783, + "grad_norm": 0.8342406153678894, + "learning_rate": 9.654059491378396e-06, + "loss": 0.8137, + "step": 4416 + }, + { + "epoch": 0.2431063900049535, + "grad_norm": 1.014755368232727, + "learning_rate": 9.653901042457418e-06, + "loss": 0.8922, + "step": 4417 + }, + { + "epoch": 0.24316142880730915, + "grad_norm": 0.864608645439148, + "learning_rate": 9.653742558558967e-06, + "loss": 0.9412, + "step": 4418 + }, + { + "epoch": 0.2432164676096648, + "grad_norm": 0.7383908033370972, + "learning_rate": 9.65358403968423e-06, + "loss": 0.8261, + "step": 4419 + }, + { + "epoch": 0.24327150641202047, + "grad_norm": 0.7464672923088074, + "learning_rate": 9.653425485834403e-06, + "loss": 0.7074, + "step": 4420 + }, + { + "epoch": 0.24332654521437613, + "grad_norm": 0.7010141611099243, + "learning_rate": 9.653266897010676e-06, + "loss": 0.6849, + "step": 4421 + }, + { + "epoch": 0.2433815840167318, + "grad_norm": 0.7135268449783325, + "learning_rate": 9.653108273214239e-06, + "loss": 0.8228, + "step": 4422 + }, + { + "epoch": 0.24343662281908746, + "grad_norm": 0.8061006665229797, + "learning_rate": 9.652949614446287e-06, + "loss": 0.8345, + "step": 4423 + }, + { + "epoch": 0.24349166162144312, + "grad_norm": 0.6954759955406189, + "learning_rate": 9.652790920708011e-06, + "loss": 0.7189, + "step": 4424 + }, + { + "epoch": 0.24354670042379878, + "grad_norm": 0.8669333457946777, + "learning_rate": 9.652632192000603e-06, + "loss": 0.8872, + "step": 4425 + }, + { + "epoch": 0.24360173922615444, + "grad_norm": 0.7445051670074463, + "learning_rate": 9.652473428325258e-06, + "loss": 0.826, + "step": 4426 + }, + { + "epoch": 0.2436567780285101, + "grad_norm": 0.7444632649421692, + "learning_rate": 9.652314629683165e-06, + "loss": 0.8568, + "step": 4427 + }, + { + "epoch": 0.24371181683086576, + "grad_norm": 0.7160165309906006, + "learning_rate": 9.652155796075524e-06, + "loss": 0.799, + "step": 4428 + }, + { + "epoch": 0.24376685563322142, + "grad_norm": 0.7098904252052307, + "learning_rate": 9.651996927503526e-06, + "loss": 0.8148, + "step": 4429 + }, + { + "epoch": 0.24382189443557709, + "grad_norm": 0.7911115288734436, + "learning_rate": 9.651838023968363e-06, + "loss": 0.8279, + "step": 4430 + }, + { + "epoch": 0.24387693323793275, + "grad_norm": 0.8887501955032349, + "learning_rate": 9.651679085471229e-06, + "loss": 0.8464, + "step": 4431 + }, + { + "epoch": 0.2439319720402884, + "grad_norm": 0.8343196511268616, + "learning_rate": 9.651520112013321e-06, + "loss": 0.7364, + "step": 4432 + }, + { + "epoch": 0.24398701084264407, + "grad_norm": 0.7279361486434937, + "learning_rate": 9.651361103595835e-06, + "loss": 0.7958, + "step": 4433 + }, + { + "epoch": 0.24404204964499973, + "grad_norm": 0.8221089243888855, + "learning_rate": 9.651202060219962e-06, + "loss": 0.7753, + "step": 4434 + }, + { + "epoch": 0.2440970884473554, + "grad_norm": 0.7205086350440979, + "learning_rate": 9.6510429818869e-06, + "loss": 0.7411, + "step": 4435 + }, + { + "epoch": 0.24415212724971105, + "grad_norm": 0.854967474937439, + "learning_rate": 9.650883868597845e-06, + "loss": 0.8192, + "step": 4436 + }, + { + "epoch": 0.24420716605206672, + "grad_norm": 0.7622473835945129, + "learning_rate": 9.65072472035399e-06, + "loss": 0.7645, + "step": 4437 + }, + { + "epoch": 0.24426220485442238, + "grad_norm": 0.7430302500724792, + "learning_rate": 9.650565537156533e-06, + "loss": 0.7817, + "step": 4438 + }, + { + "epoch": 0.24431724365677804, + "grad_norm": 0.8022677898406982, + "learning_rate": 9.650406319006672e-06, + "loss": 0.8035, + "step": 4439 + }, + { + "epoch": 0.2443722824591337, + "grad_norm": 0.7346476912498474, + "learning_rate": 9.6502470659056e-06, + "loss": 0.826, + "step": 4440 + }, + { + "epoch": 0.24442732126148936, + "grad_norm": 0.8393376469612122, + "learning_rate": 9.650087777854517e-06, + "loss": 0.8073, + "step": 4441 + }, + { + "epoch": 0.24448236006384502, + "grad_norm": 0.7920215129852295, + "learning_rate": 9.649928454854618e-06, + "loss": 0.7774, + "step": 4442 + }, + { + "epoch": 0.24453739886620068, + "grad_norm": 0.8192804455757141, + "learning_rate": 9.649769096907102e-06, + "loss": 0.7817, + "step": 4443 + }, + { + "epoch": 0.24459243766855635, + "grad_norm": 0.7727654576301575, + "learning_rate": 9.649609704013167e-06, + "loss": 0.8201, + "step": 4444 + }, + { + "epoch": 0.244647476470912, + "grad_norm": 0.8005746603012085, + "learning_rate": 9.649450276174008e-06, + "loss": 0.8893, + "step": 4445 + }, + { + "epoch": 0.24470251527326764, + "grad_norm": 0.9029125571250916, + "learning_rate": 9.649290813390828e-06, + "loss": 0.7735, + "step": 4446 + }, + { + "epoch": 0.2447575540756233, + "grad_norm": 0.8336170315742493, + "learning_rate": 9.64913131566482e-06, + "loss": 0.7505, + "step": 4447 + }, + { + "epoch": 0.24481259287797896, + "grad_norm": 1.0272265672683716, + "learning_rate": 9.648971782997188e-06, + "loss": 0.8371, + "step": 4448 + }, + { + "epoch": 0.24486763168033462, + "grad_norm": 0.8095843195915222, + "learning_rate": 9.648812215389128e-06, + "loss": 0.7599, + "step": 4449 + }, + { + "epoch": 0.24492267048269029, + "grad_norm": 0.7690166234970093, + "learning_rate": 9.648652612841837e-06, + "loss": 0.8172, + "step": 4450 + }, + { + "epoch": 0.24497770928504595, + "grad_norm": 0.8282617926597595, + "learning_rate": 9.64849297535652e-06, + "loss": 0.8477, + "step": 4451 + }, + { + "epoch": 0.2450327480874016, + "grad_norm": 0.8307822346687317, + "learning_rate": 9.648333302934373e-06, + "loss": 0.7744, + "step": 4452 + }, + { + "epoch": 0.24508778688975727, + "grad_norm": 0.7619080543518066, + "learning_rate": 9.6481735955766e-06, + "loss": 0.8417, + "step": 4453 + }, + { + "epoch": 0.24514282569211293, + "grad_norm": 0.7879447937011719, + "learning_rate": 9.648013853284396e-06, + "loss": 0.7799, + "step": 4454 + }, + { + "epoch": 0.2451978644944686, + "grad_norm": 0.7352256774902344, + "learning_rate": 9.647854076058965e-06, + "loss": 0.8386, + "step": 4455 + }, + { + "epoch": 0.24525290329682425, + "grad_norm": 0.8318933248519897, + "learning_rate": 9.647694263901507e-06, + "loss": 0.7631, + "step": 4456 + }, + { + "epoch": 0.24530794209917992, + "grad_norm": 0.8609912395477295, + "learning_rate": 9.647534416813221e-06, + "loss": 0.7479, + "step": 4457 + }, + { + "epoch": 0.24536298090153558, + "grad_norm": 0.9590480327606201, + "learning_rate": 9.647374534795311e-06, + "loss": 0.8543, + "step": 4458 + }, + { + "epoch": 0.24541801970389124, + "grad_norm": 0.7902723550796509, + "learning_rate": 9.647214617848979e-06, + "loss": 0.6796, + "step": 4459 + }, + { + "epoch": 0.2454730585062469, + "grad_norm": 0.7725642919540405, + "learning_rate": 9.647054665975427e-06, + "loss": 0.7563, + "step": 4460 + }, + { + "epoch": 0.24552809730860256, + "grad_norm": 0.8387014269828796, + "learning_rate": 9.646894679175853e-06, + "loss": 0.8184, + "step": 4461 + }, + { + "epoch": 0.24558313611095822, + "grad_norm": 0.9200852513313293, + "learning_rate": 9.646734657451464e-06, + "loss": 0.8436, + "step": 4462 + }, + { + "epoch": 0.24563817491331388, + "grad_norm": 0.7565840482711792, + "learning_rate": 9.646574600803462e-06, + "loss": 0.7393, + "step": 4463 + }, + { + "epoch": 0.24569321371566955, + "grad_norm": 0.7685559988021851, + "learning_rate": 9.646414509233048e-06, + "loss": 0.7836, + "step": 4464 + }, + { + "epoch": 0.2457482525180252, + "grad_norm": 0.8172003030776978, + "learning_rate": 9.646254382741428e-06, + "loss": 0.787, + "step": 4465 + }, + { + "epoch": 0.24580329132038087, + "grad_norm": 0.902632474899292, + "learning_rate": 9.646094221329802e-06, + "loss": 0.7139, + "step": 4466 + }, + { + "epoch": 0.24585833012273653, + "grad_norm": 0.7810692191123962, + "learning_rate": 9.645934024999374e-06, + "loss": 0.6904, + "step": 4467 + }, + { + "epoch": 0.2459133689250922, + "grad_norm": 0.7242134213447571, + "learning_rate": 9.645773793751352e-06, + "loss": 0.7035, + "step": 4468 + }, + { + "epoch": 0.24596840772744785, + "grad_norm": 0.7192920446395874, + "learning_rate": 9.645613527586938e-06, + "loss": 0.7081, + "step": 4469 + }, + { + "epoch": 0.2460234465298035, + "grad_norm": 0.7613840103149414, + "learning_rate": 9.645453226507336e-06, + "loss": 0.8066, + "step": 4470 + }, + { + "epoch": 0.24607848533215917, + "grad_norm": 0.8154922127723694, + "learning_rate": 9.64529289051375e-06, + "loss": 0.812, + "step": 4471 + }, + { + "epoch": 0.24613352413451484, + "grad_norm": 0.9521573185920715, + "learning_rate": 9.645132519607387e-06, + "loss": 0.7456, + "step": 4472 + }, + { + "epoch": 0.2461885629368705, + "grad_norm": 0.785943329334259, + "learning_rate": 9.64497211378945e-06, + "loss": 0.832, + "step": 4473 + }, + { + "epoch": 0.24624360173922616, + "grad_norm": 0.7675127983093262, + "learning_rate": 9.644811673061148e-06, + "loss": 0.7984, + "step": 4474 + }, + { + "epoch": 0.24629864054158182, + "grad_norm": 0.7317580580711365, + "learning_rate": 9.644651197423683e-06, + "loss": 0.7634, + "step": 4475 + }, + { + "epoch": 0.24635367934393748, + "grad_norm": 0.744937539100647, + "learning_rate": 9.644490686878265e-06, + "loss": 0.729, + "step": 4476 + }, + { + "epoch": 0.24640871814629314, + "grad_norm": 0.7472458481788635, + "learning_rate": 9.644330141426097e-06, + "loss": 0.7517, + "step": 4477 + }, + { + "epoch": 0.2464637569486488, + "grad_norm": 0.8379414677619934, + "learning_rate": 9.644169561068387e-06, + "loss": 0.8008, + "step": 4478 + }, + { + "epoch": 0.24651879575100447, + "grad_norm": 0.8845154047012329, + "learning_rate": 9.64400894580634e-06, + "loss": 0.8135, + "step": 4479 + }, + { + "epoch": 0.24657383455336013, + "grad_norm": 0.7394443154335022, + "learning_rate": 9.643848295641167e-06, + "loss": 0.7697, + "step": 4480 + }, + { + "epoch": 0.2466288733557158, + "grad_norm": 0.8840840458869934, + "learning_rate": 9.643687610574073e-06, + "loss": 0.825, + "step": 4481 + }, + { + "epoch": 0.24668391215807145, + "grad_norm": 0.7924874424934387, + "learning_rate": 9.643526890606265e-06, + "loss": 0.793, + "step": 4482 + }, + { + "epoch": 0.2467389509604271, + "grad_norm": 0.7966769933700562, + "learning_rate": 9.643366135738951e-06, + "loss": 0.8042, + "step": 4483 + }, + { + "epoch": 0.24679398976278277, + "grad_norm": 0.911756694316864, + "learning_rate": 9.643205345973343e-06, + "loss": 0.7801, + "step": 4484 + }, + { + "epoch": 0.24684902856513843, + "grad_norm": 0.903378963470459, + "learning_rate": 9.643044521310645e-06, + "loss": 0.7863, + "step": 4485 + }, + { + "epoch": 0.2469040673674941, + "grad_norm": 0.9021226167678833, + "learning_rate": 9.642883661752067e-06, + "loss": 0.8005, + "step": 4486 + }, + { + "epoch": 0.24695910616984976, + "grad_norm": 0.8853413462638855, + "learning_rate": 9.64272276729882e-06, + "loss": 0.8371, + "step": 4487 + }, + { + "epoch": 0.24701414497220542, + "grad_norm": 1.0654630661010742, + "learning_rate": 9.642561837952108e-06, + "loss": 0.92, + "step": 4488 + }, + { + "epoch": 0.24706918377456105, + "grad_norm": 0.8663573265075684, + "learning_rate": 9.642400873713146e-06, + "loss": 0.8066, + "step": 4489 + }, + { + "epoch": 0.2471242225769167, + "grad_norm": 0.7483134269714355, + "learning_rate": 9.642239874583143e-06, + "loss": 0.9013, + "step": 4490 + }, + { + "epoch": 0.24717926137927237, + "grad_norm": 0.7582293748855591, + "learning_rate": 9.642078840563306e-06, + "loss": 0.7795, + "step": 4491 + }, + { + "epoch": 0.24723430018162804, + "grad_norm": 0.8276637196540833, + "learning_rate": 9.641917771654848e-06, + "loss": 0.7756, + "step": 4492 + }, + { + "epoch": 0.2472893389839837, + "grad_norm": 0.697088360786438, + "learning_rate": 9.641756667858976e-06, + "loss": 0.7092, + "step": 4493 + }, + { + "epoch": 0.24734437778633936, + "grad_norm": 0.8960816860198975, + "learning_rate": 9.641595529176907e-06, + "loss": 0.8835, + "step": 4494 + }, + { + "epoch": 0.24739941658869502, + "grad_norm": 0.9210898280143738, + "learning_rate": 9.641434355609846e-06, + "loss": 0.7881, + "step": 4495 + }, + { + "epoch": 0.24745445539105068, + "grad_norm": 0.7205467820167542, + "learning_rate": 9.64127314715901e-06, + "loss": 0.7204, + "step": 4496 + }, + { + "epoch": 0.24750949419340634, + "grad_norm": 0.7313701510429382, + "learning_rate": 9.641111903825603e-06, + "loss": 0.8296, + "step": 4497 + }, + { + "epoch": 0.247564532995762, + "grad_norm": 0.771159827709198, + "learning_rate": 9.640950625610845e-06, + "loss": 0.7974, + "step": 4498 + }, + { + "epoch": 0.24761957179811767, + "grad_norm": 0.9227705597877502, + "learning_rate": 9.64078931251594e-06, + "loss": 0.9215, + "step": 4499 + }, + { + "epoch": 0.24767461060047333, + "grad_norm": 0.7569915652275085, + "learning_rate": 9.64062796454211e-06, + "loss": 0.83, + "step": 4500 + }, + { + "epoch": 0.247729649402829, + "grad_norm": 0.7453131675720215, + "learning_rate": 9.64046658169056e-06, + "loss": 0.6747, + "step": 4501 + }, + { + "epoch": 0.24778468820518465, + "grad_norm": 0.7228132486343384, + "learning_rate": 9.640305163962504e-06, + "loss": 0.7535, + "step": 4502 + }, + { + "epoch": 0.2478397270075403, + "grad_norm": 0.8160690069198608, + "learning_rate": 9.640143711359159e-06, + "loss": 0.8655, + "step": 4503 + }, + { + "epoch": 0.24789476580989597, + "grad_norm": 0.7641691565513611, + "learning_rate": 9.639982223881735e-06, + "loss": 0.8353, + "step": 4504 + }, + { + "epoch": 0.24794980461225163, + "grad_norm": 0.8669107556343079, + "learning_rate": 9.639820701531445e-06, + "loss": 0.8614, + "step": 4505 + }, + { + "epoch": 0.2480048434146073, + "grad_norm": 0.7433111667633057, + "learning_rate": 9.639659144309508e-06, + "loss": 0.6891, + "step": 4506 + }, + { + "epoch": 0.24805988221696296, + "grad_norm": 1.4303346872329712, + "learning_rate": 9.639497552217131e-06, + "loss": 0.8016, + "step": 4507 + }, + { + "epoch": 0.24811492101931862, + "grad_norm": 0.8684772253036499, + "learning_rate": 9.639335925255535e-06, + "loss": 0.8324, + "step": 4508 + }, + { + "epoch": 0.24816995982167428, + "grad_norm": 0.9222162365913391, + "learning_rate": 9.639174263425932e-06, + "loss": 0.8715, + "step": 4509 + }, + { + "epoch": 0.24822499862402994, + "grad_norm": 0.9789180755615234, + "learning_rate": 9.639012566729535e-06, + "loss": 0.823, + "step": 4510 + }, + { + "epoch": 0.2482800374263856, + "grad_norm": 0.8475140333175659, + "learning_rate": 9.638850835167564e-06, + "loss": 0.768, + "step": 4511 + }, + { + "epoch": 0.24833507622874126, + "grad_norm": 0.7943722605705261, + "learning_rate": 9.63868906874123e-06, + "loss": 0.788, + "step": 4512 + }, + { + "epoch": 0.24839011503109693, + "grad_norm": 0.8723915815353394, + "learning_rate": 9.63852726745175e-06, + "loss": 0.7865, + "step": 4513 + }, + { + "epoch": 0.2484451538334526, + "grad_norm": 0.837001383304596, + "learning_rate": 9.638365431300342e-06, + "loss": 0.7799, + "step": 4514 + }, + { + "epoch": 0.24850019263580825, + "grad_norm": 0.7992665767669678, + "learning_rate": 9.638203560288222e-06, + "loss": 0.8951, + "step": 4515 + }, + { + "epoch": 0.2485552314381639, + "grad_norm": 0.8712993264198303, + "learning_rate": 9.638041654416603e-06, + "loss": 0.8157, + "step": 4516 + }, + { + "epoch": 0.24861027024051957, + "grad_norm": 0.7176356911659241, + "learning_rate": 9.637879713686706e-06, + "loss": 0.8197, + "step": 4517 + }, + { + "epoch": 0.24866530904287523, + "grad_norm": 0.7624368071556091, + "learning_rate": 9.637717738099747e-06, + "loss": 0.7545, + "step": 4518 + }, + { + "epoch": 0.2487203478452309, + "grad_norm": 0.857222318649292, + "learning_rate": 9.637555727656943e-06, + "loss": 0.8146, + "step": 4519 + }, + { + "epoch": 0.24877538664758655, + "grad_norm": 0.7461313605308533, + "learning_rate": 9.637393682359511e-06, + "loss": 0.8569, + "step": 4520 + }, + { + "epoch": 0.24883042544994222, + "grad_norm": 0.8491896986961365, + "learning_rate": 9.637231602208668e-06, + "loss": 0.863, + "step": 4521 + }, + { + "epoch": 0.24888546425229788, + "grad_norm": 0.8139386177062988, + "learning_rate": 9.637069487205635e-06, + "loss": 0.7105, + "step": 4522 + }, + { + "epoch": 0.24894050305465354, + "grad_norm": 0.7782894968986511, + "learning_rate": 9.636907337351629e-06, + "loss": 0.8044, + "step": 4523 + }, + { + "epoch": 0.2489955418570092, + "grad_norm": 0.8225486874580383, + "learning_rate": 9.636745152647868e-06, + "loss": 0.7877, + "step": 4524 + }, + { + "epoch": 0.24905058065936486, + "grad_norm": 0.9087927341461182, + "learning_rate": 9.636582933095573e-06, + "loss": 0.8017, + "step": 4525 + }, + { + "epoch": 0.24910561946172052, + "grad_norm": 0.7392508387565613, + "learning_rate": 9.636420678695962e-06, + "loss": 0.7953, + "step": 4526 + }, + { + "epoch": 0.24916065826407618, + "grad_norm": 0.7906273007392883, + "learning_rate": 9.636258389450253e-06, + "loss": 0.9491, + "step": 4527 + }, + { + "epoch": 0.24921569706643185, + "grad_norm": 0.840394139289856, + "learning_rate": 9.636096065359666e-06, + "loss": 0.8621, + "step": 4528 + }, + { + "epoch": 0.2492707358687875, + "grad_norm": 0.7923862934112549, + "learning_rate": 9.635933706425424e-06, + "loss": 0.8215, + "step": 4529 + }, + { + "epoch": 0.24932577467114317, + "grad_norm": 0.8372805714607239, + "learning_rate": 9.635771312648744e-06, + "loss": 0.8845, + "step": 4530 + }, + { + "epoch": 0.24938081347349883, + "grad_norm": 0.7569165229797363, + "learning_rate": 9.635608884030848e-06, + "loss": 0.8406, + "step": 4531 + }, + { + "epoch": 0.24943585227585446, + "grad_norm": 0.8260865807533264, + "learning_rate": 9.635446420572956e-06, + "loss": 0.8418, + "step": 4532 + }, + { + "epoch": 0.24949089107821013, + "grad_norm": 0.6841318607330322, + "learning_rate": 9.635283922276291e-06, + "loss": 0.6732, + "step": 4533 + }, + { + "epoch": 0.2495459298805658, + "grad_norm": 0.7055326104164124, + "learning_rate": 9.635121389142072e-06, + "loss": 0.7702, + "step": 4534 + }, + { + "epoch": 0.24960096868292145, + "grad_norm": 0.7293457388877869, + "learning_rate": 9.63495882117152e-06, + "loss": 0.6836, + "step": 4535 + }, + { + "epoch": 0.2496560074852771, + "grad_norm": 0.7411924004554749, + "learning_rate": 9.63479621836586e-06, + "loss": 0.8686, + "step": 4536 + }, + { + "epoch": 0.24971104628763277, + "grad_norm": 0.7864643931388855, + "learning_rate": 9.634633580726313e-06, + "loss": 0.7801, + "step": 4537 + }, + { + "epoch": 0.24976608508998843, + "grad_norm": 0.9730797410011292, + "learning_rate": 9.634470908254099e-06, + "loss": 0.8362, + "step": 4538 + }, + { + "epoch": 0.2498211238923441, + "grad_norm": 0.8390370011329651, + "learning_rate": 9.634308200950442e-06, + "loss": 0.8079, + "step": 4539 + }, + { + "epoch": 0.24987616269469975, + "grad_norm": 0.8951246738433838, + "learning_rate": 9.634145458816566e-06, + "loss": 0.7662, + "step": 4540 + }, + { + "epoch": 0.24993120149705542, + "grad_norm": 0.7654157280921936, + "learning_rate": 9.633982681853693e-06, + "loss": 0.8699, + "step": 4541 + }, + { + "epoch": 0.24998624029941108, + "grad_norm": 0.8152109980583191, + "learning_rate": 9.633819870063046e-06, + "loss": 0.7875, + "step": 4542 + }, + { + "epoch": 0.25004127910176677, + "grad_norm": 0.9407321214675903, + "learning_rate": 9.63365702344585e-06, + "loss": 0.7708, + "step": 4543 + }, + { + "epoch": 0.2500963179041224, + "grad_norm": 0.8169927597045898, + "learning_rate": 9.633494142003327e-06, + "loss": 0.8078, + "step": 4544 + }, + { + "epoch": 0.2501513567064781, + "grad_norm": 0.7380755543708801, + "learning_rate": 9.633331225736704e-06, + "loss": 0.7818, + "step": 4545 + }, + { + "epoch": 0.2502063955088337, + "grad_norm": 0.8124812841415405, + "learning_rate": 9.633168274647203e-06, + "loss": 0.8133, + "step": 4546 + }, + { + "epoch": 0.2502614343111894, + "grad_norm": 0.8511367440223694, + "learning_rate": 9.63300528873605e-06, + "loss": 0.7747, + "step": 4547 + }, + { + "epoch": 0.25031647311354505, + "grad_norm": 0.7305121421813965, + "learning_rate": 9.632842268004469e-06, + "loss": 0.8479, + "step": 4548 + }, + { + "epoch": 0.25037151191590074, + "grad_norm": 0.7127692103385925, + "learning_rate": 9.632679212453686e-06, + "loss": 0.8514, + "step": 4549 + }, + { + "epoch": 0.25042655071825637, + "grad_norm": 0.8251872062683105, + "learning_rate": 9.632516122084926e-06, + "loss": 0.7686, + "step": 4550 + }, + { + "epoch": 0.25048158952061206, + "grad_norm": 0.6756613850593567, + "learning_rate": 9.632352996899413e-06, + "loss": 0.5959, + "step": 4551 + }, + { + "epoch": 0.2505366283229677, + "grad_norm": 0.9266120791435242, + "learning_rate": 9.632189836898377e-06, + "loss": 0.7889, + "step": 4552 + }, + { + "epoch": 0.2505916671253233, + "grad_norm": 0.769890546798706, + "learning_rate": 9.63202664208304e-06, + "loss": 0.7864, + "step": 4553 + }, + { + "epoch": 0.250646705927679, + "grad_norm": 0.7314025163650513, + "learning_rate": 9.631863412454634e-06, + "loss": 0.8088, + "step": 4554 + }, + { + "epoch": 0.25070174473003465, + "grad_norm": 0.818317711353302, + "learning_rate": 9.63170014801438e-06, + "loss": 0.7096, + "step": 4555 + }, + { + "epoch": 0.25075678353239034, + "grad_norm": 0.7538807392120361, + "learning_rate": 9.631536848763508e-06, + "loss": 0.7779, + "step": 4556 + }, + { + "epoch": 0.25081182233474597, + "grad_norm": 0.7658100128173828, + "learning_rate": 9.631373514703247e-06, + "loss": 0.8535, + "step": 4557 + }, + { + "epoch": 0.25086686113710166, + "grad_norm": 0.8019290566444397, + "learning_rate": 9.631210145834819e-06, + "loss": 0.8141, + "step": 4558 + }, + { + "epoch": 0.2509218999394573, + "grad_norm": 0.7257653474807739, + "learning_rate": 9.631046742159456e-06, + "loss": 0.7451, + "step": 4559 + }, + { + "epoch": 0.250976938741813, + "grad_norm": 0.7546024918556213, + "learning_rate": 9.630883303678386e-06, + "loss": 0.7707, + "step": 4560 + }, + { + "epoch": 0.2510319775441686, + "grad_norm": 0.7288938760757446, + "learning_rate": 9.630719830392835e-06, + "loss": 0.7362, + "step": 4561 + }, + { + "epoch": 0.2510870163465243, + "grad_norm": 0.7814223170280457, + "learning_rate": 9.630556322304036e-06, + "loss": 0.8514, + "step": 4562 + }, + { + "epoch": 0.25114205514887994, + "grad_norm": 0.7561381459236145, + "learning_rate": 9.630392779413214e-06, + "loss": 0.7659, + "step": 4563 + }, + { + "epoch": 0.25119709395123563, + "grad_norm": 0.750641942024231, + "learning_rate": 9.6302292017216e-06, + "loss": 0.8496, + "step": 4564 + }, + { + "epoch": 0.25125213275359126, + "grad_norm": 0.832155704498291, + "learning_rate": 9.630065589230422e-06, + "loss": 0.7778, + "step": 4565 + }, + { + "epoch": 0.25130717155594695, + "grad_norm": 0.8202440142631531, + "learning_rate": 9.62990194194091e-06, + "loss": 0.8962, + "step": 4566 + }, + { + "epoch": 0.2513622103583026, + "grad_norm": 0.8777977824211121, + "learning_rate": 9.629738259854295e-06, + "loss": 0.7215, + "step": 4567 + }, + { + "epoch": 0.2514172491606583, + "grad_norm": 1.1868599653244019, + "learning_rate": 9.629574542971806e-06, + "loss": 0.8238, + "step": 4568 + }, + { + "epoch": 0.2514722879630139, + "grad_norm": 0.9128753542900085, + "learning_rate": 9.629410791294675e-06, + "loss": 0.7638, + "step": 4569 + }, + { + "epoch": 0.2515273267653696, + "grad_norm": 0.7350082993507385, + "learning_rate": 9.629247004824132e-06, + "loss": 0.8041, + "step": 4570 + }, + { + "epoch": 0.25158236556772523, + "grad_norm": 0.7279660701751709, + "learning_rate": 9.629083183561407e-06, + "loss": 0.7377, + "step": 4571 + }, + { + "epoch": 0.2516374043700809, + "grad_norm": 0.8570461273193359, + "learning_rate": 9.628919327507732e-06, + "loss": 0.8106, + "step": 4572 + }, + { + "epoch": 0.25169244317243655, + "grad_norm": 0.8998312950134277, + "learning_rate": 9.62875543666434e-06, + "loss": 0.8171, + "step": 4573 + }, + { + "epoch": 0.25174748197479224, + "grad_norm": 0.7631624937057495, + "learning_rate": 9.628591511032456e-06, + "loss": 0.7871, + "step": 4574 + }, + { + "epoch": 0.2518025207771479, + "grad_norm": 0.7752320766448975, + "learning_rate": 9.628427550613322e-06, + "loss": 0.8241, + "step": 4575 + }, + { + "epoch": 0.25185755957950356, + "grad_norm": 0.8741563558578491, + "learning_rate": 9.628263555408163e-06, + "loss": 0.7312, + "step": 4576 + }, + { + "epoch": 0.2519125983818592, + "grad_norm": 0.8615008592605591, + "learning_rate": 9.628099525418216e-06, + "loss": 0.8586, + "step": 4577 + }, + { + "epoch": 0.2519676371842149, + "grad_norm": 0.8273662328720093, + "learning_rate": 9.62793546064471e-06, + "loss": 0.7838, + "step": 4578 + }, + { + "epoch": 0.2520226759865705, + "grad_norm": 0.7454090118408203, + "learning_rate": 9.627771361088882e-06, + "loss": 0.8461, + "step": 4579 + }, + { + "epoch": 0.2520777147889262, + "grad_norm": 0.8225379586219788, + "learning_rate": 9.627607226751962e-06, + "loss": 0.7792, + "step": 4580 + }, + { + "epoch": 0.25213275359128184, + "grad_norm": 0.8655416369438171, + "learning_rate": 9.627443057635184e-06, + "loss": 0.8165, + "step": 4581 + }, + { + "epoch": 0.25218779239363753, + "grad_norm": 0.7735984921455383, + "learning_rate": 9.627278853739783e-06, + "loss": 0.8208, + "step": 4582 + }, + { + "epoch": 0.25224283119599317, + "grad_norm": 0.8293350338935852, + "learning_rate": 9.627114615066994e-06, + "loss": 0.7394, + "step": 4583 + }, + { + "epoch": 0.25229786999834886, + "grad_norm": 0.7840214371681213, + "learning_rate": 9.626950341618048e-06, + "loss": 0.8522, + "step": 4584 + }, + { + "epoch": 0.2523529088007045, + "grad_norm": 0.7724186182022095, + "learning_rate": 9.626786033394185e-06, + "loss": 0.8175, + "step": 4585 + }, + { + "epoch": 0.2524079476030602, + "grad_norm": 1.0751588344573975, + "learning_rate": 9.626621690396634e-06, + "loss": 0.9229, + "step": 4586 + }, + { + "epoch": 0.2524629864054158, + "grad_norm": 0.7016913294792175, + "learning_rate": 9.626457312626634e-06, + "loss": 0.6883, + "step": 4587 + }, + { + "epoch": 0.2525180252077715, + "grad_norm": 0.918377697467804, + "learning_rate": 9.626292900085419e-06, + "loss": 0.7889, + "step": 4588 + }, + { + "epoch": 0.25257306401012714, + "grad_norm": 1.006564736366272, + "learning_rate": 9.626128452774226e-06, + "loss": 0.7888, + "step": 4589 + }, + { + "epoch": 0.2526281028124828, + "grad_norm": 1.0214998722076416, + "learning_rate": 9.625963970694287e-06, + "loss": 0.768, + "step": 4590 + }, + { + "epoch": 0.25268314161483846, + "grad_norm": 0.7980843186378479, + "learning_rate": 9.625799453846844e-06, + "loss": 0.8662, + "step": 4591 + }, + { + "epoch": 0.25273818041719415, + "grad_norm": 0.734582245349884, + "learning_rate": 9.625634902233128e-06, + "loss": 0.759, + "step": 4592 + }, + { + "epoch": 0.2527932192195498, + "grad_norm": 0.7185904383659363, + "learning_rate": 9.62547031585438e-06, + "loss": 0.774, + "step": 4593 + }, + { + "epoch": 0.25284825802190547, + "grad_norm": 0.7356622219085693, + "learning_rate": 9.625305694711835e-06, + "loss": 0.7435, + "step": 4594 + }, + { + "epoch": 0.2529032968242611, + "grad_norm": 0.7589355707168579, + "learning_rate": 9.62514103880673e-06, + "loss": 0.807, + "step": 4595 + }, + { + "epoch": 0.25295833562661674, + "grad_norm": 0.889228880405426, + "learning_rate": 9.624976348140305e-06, + "loss": 0.8609, + "step": 4596 + }, + { + "epoch": 0.2530133744289724, + "grad_norm": 0.7546125650405884, + "learning_rate": 9.624811622713793e-06, + "loss": 0.8379, + "step": 4597 + }, + { + "epoch": 0.25306841323132806, + "grad_norm": 0.8262770175933838, + "learning_rate": 9.624646862528436e-06, + "loss": 0.7611, + "step": 4598 + }, + { + "epoch": 0.25312345203368375, + "grad_norm": 0.8876076936721802, + "learning_rate": 9.624482067585472e-06, + "loss": 0.8106, + "step": 4599 + }, + { + "epoch": 0.2531784908360394, + "grad_norm": 0.7045544981956482, + "learning_rate": 9.624317237886137e-06, + "loss": 0.7121, + "step": 4600 + }, + { + "epoch": 0.25323352963839507, + "grad_norm": 0.7693355083465576, + "learning_rate": 9.624152373431672e-06, + "loss": 0.8052, + "step": 4601 + }, + { + "epoch": 0.2532885684407507, + "grad_norm": 0.8072683811187744, + "learning_rate": 9.623987474223316e-06, + "loss": 0.8543, + "step": 4602 + }, + { + "epoch": 0.2533436072431064, + "grad_norm": 0.8158687949180603, + "learning_rate": 9.62382254026231e-06, + "loss": 0.6922, + "step": 4603 + }, + { + "epoch": 0.25339864604546203, + "grad_norm": 0.7688641548156738, + "learning_rate": 9.623657571549887e-06, + "loss": 0.7198, + "step": 4604 + }, + { + "epoch": 0.2534536848478177, + "grad_norm": 0.7806578278541565, + "learning_rate": 9.623492568087293e-06, + "loss": 0.8539, + "step": 4605 + }, + { + "epoch": 0.25350872365017335, + "grad_norm": 0.9557347893714905, + "learning_rate": 9.623327529875769e-06, + "loss": 0.6996, + "step": 4606 + }, + { + "epoch": 0.25356376245252904, + "grad_norm": 0.9465067386627197, + "learning_rate": 9.62316245691655e-06, + "loss": 0.8756, + "step": 4607 + }, + { + "epoch": 0.2536188012548847, + "grad_norm": 0.8029165863990784, + "learning_rate": 9.62299734921088e-06, + "loss": 0.8573, + "step": 4608 + }, + { + "epoch": 0.25367384005724036, + "grad_norm": 0.7530128955841064, + "learning_rate": 9.62283220676e-06, + "loss": 0.7466, + "step": 4609 + }, + { + "epoch": 0.253728878859596, + "grad_norm": 0.6704453825950623, + "learning_rate": 9.622667029565151e-06, + "loss": 0.6512, + "step": 4610 + }, + { + "epoch": 0.2537839176619517, + "grad_norm": 0.7162728309631348, + "learning_rate": 9.622501817627574e-06, + "loss": 0.7615, + "step": 4611 + }, + { + "epoch": 0.2538389564643073, + "grad_norm": 0.7599188089370728, + "learning_rate": 9.622336570948509e-06, + "loss": 0.8463, + "step": 4612 + }, + { + "epoch": 0.253893995266663, + "grad_norm": 0.7922326922416687, + "learning_rate": 9.6221712895292e-06, + "loss": 0.9221, + "step": 4613 + }, + { + "epoch": 0.25394903406901864, + "grad_norm": 1.4635218381881714, + "learning_rate": 9.622005973370892e-06, + "loss": 0.9159, + "step": 4614 + }, + { + "epoch": 0.25400407287137433, + "grad_norm": 0.8695057034492493, + "learning_rate": 9.62184062247482e-06, + "loss": 0.6792, + "step": 4615 + }, + { + "epoch": 0.25405911167372996, + "grad_norm": 0.8070930242538452, + "learning_rate": 9.621675236842235e-06, + "loss": 0.8257, + "step": 4616 + }, + { + "epoch": 0.25411415047608565, + "grad_norm": 0.8642075061798096, + "learning_rate": 9.621509816474372e-06, + "loss": 0.8223, + "step": 4617 + }, + { + "epoch": 0.2541691892784413, + "grad_norm": 0.7131080031394958, + "learning_rate": 9.621344361372483e-06, + "loss": 0.6831, + "step": 4618 + }, + { + "epoch": 0.254224228080797, + "grad_norm": 0.7582216262817383, + "learning_rate": 9.621178871537804e-06, + "loss": 0.8091, + "step": 4619 + }, + { + "epoch": 0.2542792668831526, + "grad_norm": 0.7705016732215881, + "learning_rate": 9.62101334697158e-06, + "loss": 0.7537, + "step": 4620 + }, + { + "epoch": 0.2543343056855083, + "grad_norm": 0.7638342976570129, + "learning_rate": 9.62084778767506e-06, + "loss": 0.7661, + "step": 4621 + }, + { + "epoch": 0.25438934448786393, + "grad_norm": 0.9296607971191406, + "learning_rate": 9.620682193649482e-06, + "loss": 0.8875, + "step": 4622 + }, + { + "epoch": 0.2544443832902196, + "grad_norm": 0.795394778251648, + "learning_rate": 9.620516564896096e-06, + "loss": 0.6884, + "step": 4623 + }, + { + "epoch": 0.25449942209257526, + "grad_norm": 0.9164957404136658, + "learning_rate": 9.620350901416142e-06, + "loss": 0.8693, + "step": 4624 + }, + { + "epoch": 0.25455446089493095, + "grad_norm": 0.8306281566619873, + "learning_rate": 9.62018520321087e-06, + "loss": 0.8972, + "step": 4625 + }, + { + "epoch": 0.2546094996972866, + "grad_norm": 0.778831422328949, + "learning_rate": 9.620019470281521e-06, + "loss": 0.7574, + "step": 4626 + }, + { + "epoch": 0.25466453849964227, + "grad_norm": 0.9326225519180298, + "learning_rate": 9.619853702629343e-06, + "loss": 0.7712, + "step": 4627 + }, + { + "epoch": 0.2547195773019979, + "grad_norm": 0.8772255182266235, + "learning_rate": 9.619687900255581e-06, + "loss": 0.8241, + "step": 4628 + }, + { + "epoch": 0.2547746161043536, + "grad_norm": 0.8777550458908081, + "learning_rate": 9.619522063161482e-06, + "loss": 0.8724, + "step": 4629 + }, + { + "epoch": 0.2548296549067092, + "grad_norm": 0.8332602381706238, + "learning_rate": 9.61935619134829e-06, + "loss": 0.8716, + "step": 4630 + }, + { + "epoch": 0.2548846937090649, + "grad_norm": 0.8246355056762695, + "learning_rate": 9.619190284817255e-06, + "loss": 0.7789, + "step": 4631 + }, + { + "epoch": 0.25493973251142055, + "grad_norm": 0.7200644612312317, + "learning_rate": 9.61902434356962e-06, + "loss": 0.7956, + "step": 4632 + }, + { + "epoch": 0.25499477131377624, + "grad_norm": 0.827756404876709, + "learning_rate": 9.618858367606638e-06, + "loss": 0.7925, + "step": 4633 + }, + { + "epoch": 0.25504981011613187, + "grad_norm": 0.7749341726303101, + "learning_rate": 9.618692356929551e-06, + "loss": 0.8706, + "step": 4634 + }, + { + "epoch": 0.25510484891848756, + "grad_norm": 0.7233432531356812, + "learning_rate": 9.618526311539608e-06, + "loss": 0.7725, + "step": 4635 + }, + { + "epoch": 0.2551598877208432, + "grad_norm": 0.846340537071228, + "learning_rate": 9.618360231438058e-06, + "loss": 0.8758, + "step": 4636 + }, + { + "epoch": 0.2552149265231989, + "grad_norm": 0.8262908458709717, + "learning_rate": 9.61819411662615e-06, + "loss": 0.7758, + "step": 4637 + }, + { + "epoch": 0.2552699653255545, + "grad_norm": 0.7829110026359558, + "learning_rate": 9.61802796710513e-06, + "loss": 0.8494, + "step": 4638 + }, + { + "epoch": 0.25532500412791015, + "grad_norm": 0.7480815649032593, + "learning_rate": 9.617861782876247e-06, + "loss": 0.7639, + "step": 4639 + }, + { + "epoch": 0.25538004293026584, + "grad_norm": 0.8782994747161865, + "learning_rate": 9.617695563940752e-06, + "loss": 0.9651, + "step": 4640 + }, + { + "epoch": 0.25543508173262147, + "grad_norm": 0.7215868234634399, + "learning_rate": 9.617529310299895e-06, + "loss": 0.7833, + "step": 4641 + }, + { + "epoch": 0.25549012053497716, + "grad_norm": 0.8287535905838013, + "learning_rate": 9.617363021954922e-06, + "loss": 0.901, + "step": 4642 + }, + { + "epoch": 0.2555451593373328, + "grad_norm": 0.7679935097694397, + "learning_rate": 9.617196698907084e-06, + "loss": 0.761, + "step": 4643 + }, + { + "epoch": 0.2556001981396885, + "grad_norm": 0.7765942811965942, + "learning_rate": 9.617030341157632e-06, + "loss": 0.7356, + "step": 4644 + }, + { + "epoch": 0.2556552369420441, + "grad_norm": 0.6964583396911621, + "learning_rate": 9.616863948707816e-06, + "loss": 0.7683, + "step": 4645 + }, + { + "epoch": 0.2557102757443998, + "grad_norm": 0.8031953573226929, + "learning_rate": 9.616697521558886e-06, + "loss": 0.7875, + "step": 4646 + }, + { + "epoch": 0.25576531454675544, + "grad_norm": 0.7155965566635132, + "learning_rate": 9.616531059712094e-06, + "loss": 0.6516, + "step": 4647 + }, + { + "epoch": 0.25582035334911113, + "grad_norm": 0.6870070099830627, + "learning_rate": 9.61636456316869e-06, + "loss": 0.7217, + "step": 4648 + }, + { + "epoch": 0.25587539215146676, + "grad_norm": 0.7686315774917603, + "learning_rate": 9.616198031929926e-06, + "loss": 0.8136, + "step": 4649 + }, + { + "epoch": 0.25593043095382245, + "grad_norm": 0.7532772421836853, + "learning_rate": 9.616031465997054e-06, + "loss": 0.696, + "step": 4650 + }, + { + "epoch": 0.2559854697561781, + "grad_norm": 0.8111574053764343, + "learning_rate": 9.615864865371323e-06, + "loss": 0.8501, + "step": 4651 + }, + { + "epoch": 0.2560405085585338, + "grad_norm": 0.771065890789032, + "learning_rate": 9.615698230053989e-06, + "loss": 0.7417, + "step": 4652 + }, + { + "epoch": 0.2560955473608894, + "grad_norm": 0.7468003034591675, + "learning_rate": 9.6155315600463e-06, + "loss": 0.7303, + "step": 4653 + }, + { + "epoch": 0.2561505861632451, + "grad_norm": 0.8041057586669922, + "learning_rate": 9.615364855349514e-06, + "loss": 0.8689, + "step": 4654 + }, + { + "epoch": 0.25620562496560073, + "grad_norm": 0.8439033627510071, + "learning_rate": 9.61519811596488e-06, + "loss": 0.8654, + "step": 4655 + }, + { + "epoch": 0.2562606637679564, + "grad_norm": 0.7768430113792419, + "learning_rate": 9.615031341893653e-06, + "loss": 0.8789, + "step": 4656 + }, + { + "epoch": 0.25631570257031205, + "grad_norm": 0.712876558303833, + "learning_rate": 9.614864533137086e-06, + "loss": 0.7497, + "step": 4657 + }, + { + "epoch": 0.25637074137266774, + "grad_norm": 0.7586949467658997, + "learning_rate": 9.614697689696431e-06, + "loss": 0.81, + "step": 4658 + }, + { + "epoch": 0.2564257801750234, + "grad_norm": 0.717078447341919, + "learning_rate": 9.614530811572946e-06, + "loss": 0.8023, + "step": 4659 + }, + { + "epoch": 0.25648081897737907, + "grad_norm": 0.7369407415390015, + "learning_rate": 9.61436389876788e-06, + "loss": 0.784, + "step": 4660 + }, + { + "epoch": 0.2565358577797347, + "grad_norm": 0.7536265850067139, + "learning_rate": 9.61419695128249e-06, + "loss": 0.7687, + "step": 4661 + }, + { + "epoch": 0.2565908965820904, + "grad_norm": 0.9718124866485596, + "learning_rate": 9.614029969118033e-06, + "loss": 0.8495, + "step": 4662 + }, + { + "epoch": 0.256645935384446, + "grad_norm": 1.1578630208969116, + "learning_rate": 9.613862952275762e-06, + "loss": 0.9189, + "step": 4663 + }, + { + "epoch": 0.2567009741868017, + "grad_norm": 0.7752498984336853, + "learning_rate": 9.613695900756929e-06, + "loss": 0.7677, + "step": 4664 + }, + { + "epoch": 0.25675601298915735, + "grad_norm": 0.9640393257141113, + "learning_rate": 9.613528814562795e-06, + "loss": 0.719, + "step": 4665 + }, + { + "epoch": 0.25681105179151303, + "grad_norm": 0.7690972089767456, + "learning_rate": 9.613361693694614e-06, + "loss": 0.7977, + "step": 4666 + }, + { + "epoch": 0.25686609059386867, + "grad_norm": 0.8390190601348877, + "learning_rate": 9.61319453815364e-06, + "loss": 0.8032, + "step": 4667 + }, + { + "epoch": 0.25692112939622436, + "grad_norm": 0.8293220400810242, + "learning_rate": 9.613027347941131e-06, + "loss": 0.8645, + "step": 4668 + }, + { + "epoch": 0.25697616819858, + "grad_norm": 0.8020731210708618, + "learning_rate": 9.612860123058344e-06, + "loss": 0.8374, + "step": 4669 + }, + { + "epoch": 0.2570312070009357, + "grad_norm": 0.7756736278533936, + "learning_rate": 9.612692863506534e-06, + "loss": 0.7318, + "step": 4670 + }, + { + "epoch": 0.2570862458032913, + "grad_norm": 0.895416259765625, + "learning_rate": 9.61252556928696e-06, + "loss": 0.9654, + "step": 4671 + }, + { + "epoch": 0.257141284605647, + "grad_norm": 0.8647375106811523, + "learning_rate": 9.61235824040088e-06, + "loss": 0.7411, + "step": 4672 + }, + { + "epoch": 0.25719632340800264, + "grad_norm": 0.6927250623703003, + "learning_rate": 9.612190876849546e-06, + "loss": 0.7558, + "step": 4673 + }, + { + "epoch": 0.2572513622103583, + "grad_norm": 0.7614898085594177, + "learning_rate": 9.612023478634222e-06, + "loss": 0.7696, + "step": 4674 + }, + { + "epoch": 0.25730640101271396, + "grad_norm": 0.7910586595535278, + "learning_rate": 9.611856045756166e-06, + "loss": 0.8207, + "step": 4675 + }, + { + "epoch": 0.25736143981506965, + "grad_norm": 0.7330125570297241, + "learning_rate": 9.611688578216632e-06, + "loss": 0.8615, + "step": 4676 + }, + { + "epoch": 0.2574164786174253, + "grad_norm": 0.7703417539596558, + "learning_rate": 9.611521076016882e-06, + "loss": 0.8321, + "step": 4677 + }, + { + "epoch": 0.25747151741978097, + "grad_norm": 0.7121796607971191, + "learning_rate": 9.611353539158174e-06, + "loss": 0.8228, + "step": 4678 + }, + { + "epoch": 0.2575265562221366, + "grad_norm": 0.8313117027282715, + "learning_rate": 9.611185967641768e-06, + "loss": 0.9012, + "step": 4679 + }, + { + "epoch": 0.2575815950244923, + "grad_norm": 0.806776225566864, + "learning_rate": 9.61101836146892e-06, + "loss": 0.769, + "step": 4680 + }, + { + "epoch": 0.2576366338268479, + "grad_norm": 0.7049515843391418, + "learning_rate": 9.610850720640894e-06, + "loss": 0.7938, + "step": 4681 + }, + { + "epoch": 0.25769167262920356, + "grad_norm": 0.7286638021469116, + "learning_rate": 9.610683045158948e-06, + "loss": 0.8168, + "step": 4682 + }, + { + "epoch": 0.25774671143155925, + "grad_norm": 0.7916898727416992, + "learning_rate": 9.610515335024345e-06, + "loss": 0.7681, + "step": 4683 + }, + { + "epoch": 0.2578017502339149, + "grad_norm": 0.7649673819541931, + "learning_rate": 9.61034759023834e-06, + "loss": 0.7273, + "step": 4684 + }, + { + "epoch": 0.2578567890362706, + "grad_norm": 0.8280686736106873, + "learning_rate": 9.610179810802196e-06, + "loss": 0.7968, + "step": 4685 + }, + { + "epoch": 0.2579118278386262, + "grad_norm": 0.7206569910049438, + "learning_rate": 9.610011996717175e-06, + "loss": 0.7359, + "step": 4686 + }, + { + "epoch": 0.2579668666409819, + "grad_norm": 0.7365424036979675, + "learning_rate": 9.60984414798454e-06, + "loss": 0.7962, + "step": 4687 + }, + { + "epoch": 0.25802190544333753, + "grad_norm": 0.8030344247817993, + "learning_rate": 9.609676264605549e-06, + "loss": 0.7931, + "step": 4688 + }, + { + "epoch": 0.2580769442456932, + "grad_norm": 0.8812693357467651, + "learning_rate": 9.609508346581464e-06, + "loss": 0.8493, + "step": 4689 + }, + { + "epoch": 0.25813198304804885, + "grad_norm": 0.8026734590530396, + "learning_rate": 9.60934039391355e-06, + "loss": 0.8368, + "step": 4690 + }, + { + "epoch": 0.25818702185040454, + "grad_norm": 0.8270768523216248, + "learning_rate": 9.609172406603067e-06, + "loss": 0.9077, + "step": 4691 + }, + { + "epoch": 0.2582420606527602, + "grad_norm": 0.7362856864929199, + "learning_rate": 9.609004384651276e-06, + "loss": 0.7384, + "step": 4692 + }, + { + "epoch": 0.25829709945511586, + "grad_norm": 0.7195929288864136, + "learning_rate": 9.608836328059444e-06, + "loss": 0.8475, + "step": 4693 + }, + { + "epoch": 0.2583521382574715, + "grad_norm": 0.7653167843818665, + "learning_rate": 9.60866823682883e-06, + "loss": 0.7704, + "step": 4694 + }, + { + "epoch": 0.2584071770598272, + "grad_norm": 0.7056792974472046, + "learning_rate": 9.6085001109607e-06, + "loss": 0.7835, + "step": 4695 + }, + { + "epoch": 0.2584622158621828, + "grad_norm": 0.7299804091453552, + "learning_rate": 9.60833195045632e-06, + "loss": 0.7894, + "step": 4696 + }, + { + "epoch": 0.2585172546645385, + "grad_norm": 0.7235645055770874, + "learning_rate": 9.608163755316948e-06, + "loss": 0.8113, + "step": 4697 + }, + { + "epoch": 0.25857229346689414, + "grad_norm": 0.7066782116889954, + "learning_rate": 9.60799552554385e-06, + "loss": 0.739, + "step": 4698 + }, + { + "epoch": 0.25862733226924983, + "grad_norm": 0.769930362701416, + "learning_rate": 9.607827261138291e-06, + "loss": 0.7565, + "step": 4699 + }, + { + "epoch": 0.25868237107160547, + "grad_norm": 0.8875935077667236, + "learning_rate": 9.607658962101538e-06, + "loss": 0.849, + "step": 4700 + }, + { + "epoch": 0.25873740987396115, + "grad_norm": 0.7887380123138428, + "learning_rate": 9.60749062843485e-06, + "loss": 0.8795, + "step": 4701 + }, + { + "epoch": 0.2587924486763168, + "grad_norm": 0.7600420117378235, + "learning_rate": 9.607322260139499e-06, + "loss": 0.7581, + "step": 4702 + }, + { + "epoch": 0.2588474874786725, + "grad_norm": 0.7431491017341614, + "learning_rate": 9.607153857216746e-06, + "loss": 0.7119, + "step": 4703 + }, + { + "epoch": 0.2589025262810281, + "grad_norm": 0.7444193363189697, + "learning_rate": 9.606985419667858e-06, + "loss": 0.7492, + "step": 4704 + }, + { + "epoch": 0.2589575650833838, + "grad_norm": 0.8348917365074158, + "learning_rate": 9.6068169474941e-06, + "loss": 0.7656, + "step": 4705 + }, + { + "epoch": 0.25901260388573943, + "grad_norm": 0.6790240406990051, + "learning_rate": 9.60664844069674e-06, + "loss": 0.6354, + "step": 4706 + }, + { + "epoch": 0.2590676426880951, + "grad_norm": 0.8425769805908203, + "learning_rate": 9.606479899277044e-06, + "loss": 0.7927, + "step": 4707 + }, + { + "epoch": 0.25912268149045076, + "grad_norm": 0.7234740853309631, + "learning_rate": 9.606311323236277e-06, + "loss": 0.8122, + "step": 4708 + }, + { + "epoch": 0.25917772029280645, + "grad_norm": 0.839507520198822, + "learning_rate": 9.606142712575707e-06, + "loss": 0.8807, + "step": 4709 + }, + { + "epoch": 0.2592327590951621, + "grad_norm": 0.7155291438102722, + "learning_rate": 9.605974067296601e-06, + "loss": 0.7852, + "step": 4710 + }, + { + "epoch": 0.25928779789751777, + "grad_norm": 0.7222152352333069, + "learning_rate": 9.605805387400228e-06, + "loss": 0.7362, + "step": 4711 + }, + { + "epoch": 0.2593428366998734, + "grad_norm": 0.8350114226341248, + "learning_rate": 9.605636672887854e-06, + "loss": 0.7201, + "step": 4712 + }, + { + "epoch": 0.2593978755022291, + "grad_norm": 0.6805943250656128, + "learning_rate": 9.605467923760747e-06, + "loss": 0.6936, + "step": 4713 + }, + { + "epoch": 0.2594529143045847, + "grad_norm": 0.7863980531692505, + "learning_rate": 9.605299140020177e-06, + "loss": 0.9079, + "step": 4714 + }, + { + "epoch": 0.2595079531069404, + "grad_norm": 0.838843584060669, + "learning_rate": 9.60513032166741e-06, + "loss": 0.839, + "step": 4715 + }, + { + "epoch": 0.25956299190929605, + "grad_norm": 0.7872797250747681, + "learning_rate": 9.60496146870372e-06, + "loss": 0.9164, + "step": 4716 + }, + { + "epoch": 0.25961803071165174, + "grad_norm": 0.7300794720649719, + "learning_rate": 9.604792581130369e-06, + "loss": 0.8227, + "step": 4717 + }, + { + "epoch": 0.25967306951400737, + "grad_norm": 0.8420879244804382, + "learning_rate": 9.60462365894863e-06, + "loss": 0.7865, + "step": 4718 + }, + { + "epoch": 0.25972810831636306, + "grad_norm": 0.807697057723999, + "learning_rate": 9.604454702159771e-06, + "loss": 0.9081, + "step": 4719 + }, + { + "epoch": 0.2597831471187187, + "grad_norm": 0.9041245579719543, + "learning_rate": 9.604285710765064e-06, + "loss": 0.8102, + "step": 4720 + }, + { + "epoch": 0.2598381859210744, + "grad_norm": 0.7061690092086792, + "learning_rate": 9.604116684765779e-06, + "loss": 0.762, + "step": 4721 + }, + { + "epoch": 0.25989322472343, + "grad_norm": 0.7790346741676331, + "learning_rate": 9.603947624163186e-06, + "loss": 0.8038, + "step": 4722 + }, + { + "epoch": 0.2599482635257857, + "grad_norm": 0.8109704256057739, + "learning_rate": 9.603778528958553e-06, + "loss": 0.9105, + "step": 4723 + }, + { + "epoch": 0.26000330232814134, + "grad_norm": 0.7396997213363647, + "learning_rate": 9.603609399153153e-06, + "loss": 0.8384, + "step": 4724 + }, + { + "epoch": 0.260058341130497, + "grad_norm": 0.8594317436218262, + "learning_rate": 9.603440234748257e-06, + "loss": 0.8301, + "step": 4725 + }, + { + "epoch": 0.26011337993285266, + "grad_norm": 0.7087241411209106, + "learning_rate": 9.603271035745138e-06, + "loss": 0.6652, + "step": 4726 + }, + { + "epoch": 0.2601684187352083, + "grad_norm": 0.7405440211296082, + "learning_rate": 9.603101802145065e-06, + "loss": 0.7804, + "step": 4727 + }, + { + "epoch": 0.260223457537564, + "grad_norm": 0.8637508749961853, + "learning_rate": 9.602932533949312e-06, + "loss": 0.8509, + "step": 4728 + }, + { + "epoch": 0.2602784963399196, + "grad_norm": 0.7040451765060425, + "learning_rate": 9.60276323115915e-06, + "loss": 0.7842, + "step": 4729 + }, + { + "epoch": 0.2603335351422753, + "grad_norm": 0.7743955254554749, + "learning_rate": 9.602593893775852e-06, + "loss": 0.8492, + "step": 4730 + }, + { + "epoch": 0.26038857394463094, + "grad_norm": 0.7110480070114136, + "learning_rate": 9.602424521800688e-06, + "loss": 0.7227, + "step": 4731 + }, + { + "epoch": 0.26044361274698663, + "grad_norm": 1.0066583156585693, + "learning_rate": 9.602255115234936e-06, + "loss": 0.8825, + "step": 4732 + }, + { + "epoch": 0.26049865154934226, + "grad_norm": 0.7746492624282837, + "learning_rate": 9.602085674079864e-06, + "loss": 0.8316, + "step": 4733 + }, + { + "epoch": 0.26055369035169795, + "grad_norm": 0.7394356727600098, + "learning_rate": 9.60191619833675e-06, + "loss": 0.746, + "step": 4734 + }, + { + "epoch": 0.2606087291540536, + "grad_norm": 0.7140582203865051, + "learning_rate": 9.601746688006866e-06, + "loss": 0.7204, + "step": 4735 + }, + { + "epoch": 0.2606637679564093, + "grad_norm": 0.753399133682251, + "learning_rate": 9.601577143091483e-06, + "loss": 0.8157, + "step": 4736 + }, + { + "epoch": 0.2607188067587649, + "grad_norm": 0.674320638179779, + "learning_rate": 9.601407563591881e-06, + "loss": 0.7279, + "step": 4737 + }, + { + "epoch": 0.2607738455611206, + "grad_norm": 0.855944037437439, + "learning_rate": 9.60123794950933e-06, + "loss": 0.804, + "step": 4738 + }, + { + "epoch": 0.26082888436347623, + "grad_norm": 0.6833948493003845, + "learning_rate": 9.601068300845106e-06, + "loss": 0.701, + "step": 4739 + }, + { + "epoch": 0.2608839231658319, + "grad_norm": 0.8085536360740662, + "learning_rate": 9.600898617600485e-06, + "loss": 0.8435, + "step": 4740 + }, + { + "epoch": 0.26093896196818755, + "grad_norm": 0.752849817276001, + "learning_rate": 9.600728899776741e-06, + "loss": 0.7205, + "step": 4741 + }, + { + "epoch": 0.26099400077054324, + "grad_norm": 0.7320554852485657, + "learning_rate": 9.600559147375151e-06, + "loss": 0.7556, + "step": 4742 + }, + { + "epoch": 0.2610490395728989, + "grad_norm": 0.7789202928543091, + "learning_rate": 9.600389360396988e-06, + "loss": 0.8467, + "step": 4743 + }, + { + "epoch": 0.26110407837525457, + "grad_norm": 0.8480898141860962, + "learning_rate": 9.600219538843532e-06, + "loss": 0.7762, + "step": 4744 + }, + { + "epoch": 0.2611591171776102, + "grad_norm": 0.8382542133331299, + "learning_rate": 9.600049682716055e-06, + "loss": 0.9051, + "step": 4745 + }, + { + "epoch": 0.2612141559799659, + "grad_norm": 0.8319274187088013, + "learning_rate": 9.599879792015838e-06, + "loss": 0.8221, + "step": 4746 + }, + { + "epoch": 0.2612691947823215, + "grad_norm": 0.7325875163078308, + "learning_rate": 9.599709866744156e-06, + "loss": 0.7968, + "step": 4747 + }, + { + "epoch": 0.2613242335846772, + "grad_norm": 0.7053360342979431, + "learning_rate": 9.599539906902285e-06, + "loss": 0.7073, + "step": 4748 + }, + { + "epoch": 0.26137927238703285, + "grad_norm": 0.763017475605011, + "learning_rate": 9.599369912491503e-06, + "loss": 0.7031, + "step": 4749 + }, + { + "epoch": 0.26143431118938854, + "grad_norm": 0.6816151738166809, + "learning_rate": 9.599199883513088e-06, + "loss": 0.7295, + "step": 4750 + }, + { + "epoch": 0.26148934999174417, + "grad_norm": 0.8143941164016724, + "learning_rate": 9.599029819968319e-06, + "loss": 0.8449, + "step": 4751 + }, + { + "epoch": 0.26154438879409986, + "grad_norm": 0.8093858361244202, + "learning_rate": 9.598859721858471e-06, + "loss": 0.8397, + "step": 4752 + }, + { + "epoch": 0.2615994275964555, + "grad_norm": 0.7431835532188416, + "learning_rate": 9.598689589184827e-06, + "loss": 0.7299, + "step": 4753 + }, + { + "epoch": 0.2616544663988112, + "grad_norm": 0.9871510863304138, + "learning_rate": 9.59851942194866e-06, + "loss": 0.7992, + "step": 4754 + }, + { + "epoch": 0.2617095052011668, + "grad_norm": 0.9304273724555969, + "learning_rate": 9.598349220151254e-06, + "loss": 0.7519, + "step": 4755 + }, + { + "epoch": 0.2617645440035225, + "grad_norm": 0.9361812472343445, + "learning_rate": 9.598178983793886e-06, + "loss": 0.8131, + "step": 4756 + }, + { + "epoch": 0.26181958280587814, + "grad_norm": 0.7783429622650146, + "learning_rate": 9.598008712877835e-06, + "loss": 0.7351, + "step": 4757 + }, + { + "epoch": 0.2618746216082338, + "grad_norm": 0.8739376068115234, + "learning_rate": 9.597838407404381e-06, + "loss": 0.9458, + "step": 4758 + }, + { + "epoch": 0.26192966041058946, + "grad_norm": 0.7076277732849121, + "learning_rate": 9.597668067374805e-06, + "loss": 0.7632, + "step": 4759 + }, + { + "epoch": 0.26198469921294515, + "grad_norm": 0.7652345299720764, + "learning_rate": 9.597497692790386e-06, + "loss": 0.8018, + "step": 4760 + }, + { + "epoch": 0.2620397380153008, + "grad_norm": 0.7332149147987366, + "learning_rate": 9.597327283652405e-06, + "loss": 0.8223, + "step": 4761 + }, + { + "epoch": 0.26209477681765647, + "grad_norm": 0.8361638784408569, + "learning_rate": 9.597156839962145e-06, + "loss": 0.8784, + "step": 4762 + }, + { + "epoch": 0.2621498156200121, + "grad_norm": 1.183772325515747, + "learning_rate": 9.596986361720882e-06, + "loss": 0.8768, + "step": 4763 + }, + { + "epoch": 0.2622048544223678, + "grad_norm": 0.9895418882369995, + "learning_rate": 9.596815848929902e-06, + "loss": 0.714, + "step": 4764 + }, + { + "epoch": 0.26225989322472343, + "grad_norm": 0.8210558295249939, + "learning_rate": 9.59664530159048e-06, + "loss": 0.7246, + "step": 4765 + }, + { + "epoch": 0.2623149320270791, + "grad_norm": 0.8003455996513367, + "learning_rate": 9.596474719703908e-06, + "loss": 0.8385, + "step": 4766 + }, + { + "epoch": 0.26236997082943475, + "grad_norm": 0.7555826306343079, + "learning_rate": 9.59630410327146e-06, + "loss": 0.7243, + "step": 4767 + }, + { + "epoch": 0.2624250096317904, + "grad_norm": 0.7746273279190063, + "learning_rate": 9.596133452294421e-06, + "loss": 0.8763, + "step": 4768 + }, + { + "epoch": 0.2624800484341461, + "grad_norm": 0.7238507866859436, + "learning_rate": 9.595962766774074e-06, + "loss": 0.8302, + "step": 4769 + }, + { + "epoch": 0.2625350872365017, + "grad_norm": 0.7874132394790649, + "learning_rate": 9.595792046711699e-06, + "loss": 0.7979, + "step": 4770 + }, + { + "epoch": 0.2625901260388574, + "grad_norm": 0.8792033791542053, + "learning_rate": 9.595621292108583e-06, + "loss": 0.8555, + "step": 4771 + }, + { + "epoch": 0.26264516484121303, + "grad_norm": 0.7026945948600769, + "learning_rate": 9.595450502966006e-06, + "loss": 0.718, + "step": 4772 + }, + { + "epoch": 0.2627002036435687, + "grad_norm": 0.7747959494590759, + "learning_rate": 9.595279679285254e-06, + "loss": 0.8329, + "step": 4773 + }, + { + "epoch": 0.26275524244592435, + "grad_norm": 0.697979748249054, + "learning_rate": 9.59510882106761e-06, + "loss": 0.7456, + "step": 4774 + }, + { + "epoch": 0.26281028124828004, + "grad_norm": 0.7600447535514832, + "learning_rate": 9.594937928314359e-06, + "loss": 0.875, + "step": 4775 + }, + { + "epoch": 0.2628653200506357, + "grad_norm": 0.7591384649276733, + "learning_rate": 9.594767001026783e-06, + "loss": 0.7607, + "step": 4776 + }, + { + "epoch": 0.26292035885299136, + "grad_norm": 0.9267380833625793, + "learning_rate": 9.59459603920617e-06, + "loss": 0.8926, + "step": 4777 + }, + { + "epoch": 0.262975397655347, + "grad_norm": 0.7751328349113464, + "learning_rate": 9.594425042853802e-06, + "loss": 0.7449, + "step": 4778 + }, + { + "epoch": 0.2630304364577027, + "grad_norm": 0.7066012620925903, + "learning_rate": 9.594254011970966e-06, + "loss": 0.8374, + "step": 4779 + }, + { + "epoch": 0.2630854752600583, + "grad_norm": 0.7564317584037781, + "learning_rate": 9.594082946558945e-06, + "loss": 0.735, + "step": 4780 + }, + { + "epoch": 0.263140514062414, + "grad_norm": 0.8151416182518005, + "learning_rate": 9.593911846619027e-06, + "loss": 0.8575, + "step": 4781 + }, + { + "epoch": 0.26319555286476964, + "grad_norm": 0.719261646270752, + "learning_rate": 9.593740712152497e-06, + "loss": 0.7981, + "step": 4782 + }, + { + "epoch": 0.26325059166712533, + "grad_norm": 0.8627344369888306, + "learning_rate": 9.593569543160642e-06, + "loss": 0.895, + "step": 4783 + }, + { + "epoch": 0.26330563046948097, + "grad_norm": 1.293272614479065, + "learning_rate": 9.593398339644748e-06, + "loss": 0.7531, + "step": 4784 + }, + { + "epoch": 0.26336066927183666, + "grad_norm": 0.8475207686424255, + "learning_rate": 9.593227101606102e-06, + "loss": 0.9091, + "step": 4785 + }, + { + "epoch": 0.2634157080741923, + "grad_norm": 0.78054279088974, + "learning_rate": 9.593055829045989e-06, + "loss": 0.7692, + "step": 4786 + }, + { + "epoch": 0.263470746876548, + "grad_norm": 0.7677399516105652, + "learning_rate": 9.592884521965699e-06, + "loss": 0.6232, + "step": 4787 + }, + { + "epoch": 0.2635257856789036, + "grad_norm": 0.7232677340507507, + "learning_rate": 9.59271318036652e-06, + "loss": 0.8087, + "step": 4788 + }, + { + "epoch": 0.2635808244812593, + "grad_norm": 0.8728463649749756, + "learning_rate": 9.592541804249735e-06, + "loss": 0.7824, + "step": 4789 + }, + { + "epoch": 0.26363586328361494, + "grad_norm": 0.7569910883903503, + "learning_rate": 9.592370393616637e-06, + "loss": 0.7418, + "step": 4790 + }, + { + "epoch": 0.2636909020859706, + "grad_norm": 0.7631934285163879, + "learning_rate": 9.592198948468511e-06, + "loss": 0.7929, + "step": 4791 + }, + { + "epoch": 0.26374594088832626, + "grad_norm": 0.8021631240844727, + "learning_rate": 9.592027468806649e-06, + "loss": 0.8111, + "step": 4792 + }, + { + "epoch": 0.26380097969068195, + "grad_norm": 0.9454651474952698, + "learning_rate": 9.591855954632336e-06, + "loss": 0.8239, + "step": 4793 + }, + { + "epoch": 0.2638560184930376, + "grad_norm": 0.672924280166626, + "learning_rate": 9.591684405946863e-06, + "loss": 0.6877, + "step": 4794 + }, + { + "epoch": 0.26391105729539327, + "grad_norm": 0.7942802906036377, + "learning_rate": 9.59151282275152e-06, + "loss": 0.9002, + "step": 4795 + }, + { + "epoch": 0.2639660960977489, + "grad_norm": 0.7131155133247375, + "learning_rate": 9.591341205047596e-06, + "loss": 0.7692, + "step": 4796 + }, + { + "epoch": 0.2640211349001046, + "grad_norm": 1.0395869016647339, + "learning_rate": 9.59116955283638e-06, + "loss": 0.8352, + "step": 4797 + }, + { + "epoch": 0.2640761737024602, + "grad_norm": 0.9503256678581238, + "learning_rate": 9.590997866119163e-06, + "loss": 1.0287, + "step": 4798 + }, + { + "epoch": 0.2641312125048159, + "grad_norm": 0.7539612054824829, + "learning_rate": 9.590826144897235e-06, + "loss": 0.872, + "step": 4799 + }, + { + "epoch": 0.26418625130717155, + "grad_norm": 0.7067893743515015, + "learning_rate": 9.590654389171885e-06, + "loss": 0.7636, + "step": 4800 + }, + { + "epoch": 0.26424129010952724, + "grad_norm": 0.7355281710624695, + "learning_rate": 9.590482598944407e-06, + "loss": 0.7715, + "step": 4801 + }, + { + "epoch": 0.26429632891188287, + "grad_norm": 0.7589674592018127, + "learning_rate": 9.590310774216089e-06, + "loss": 0.7451, + "step": 4802 + }, + { + "epoch": 0.26435136771423856, + "grad_norm": 0.701386034488678, + "learning_rate": 9.590138914988226e-06, + "loss": 0.7317, + "step": 4803 + }, + { + "epoch": 0.2644064065165942, + "grad_norm": 0.7663118243217468, + "learning_rate": 9.589967021262105e-06, + "loss": 0.8227, + "step": 4804 + }, + { + "epoch": 0.2644614453189499, + "grad_norm": 0.7059655785560608, + "learning_rate": 9.589795093039023e-06, + "loss": 0.7829, + "step": 4805 + }, + { + "epoch": 0.2645164841213055, + "grad_norm": 0.7377020120620728, + "learning_rate": 9.58962313032027e-06, + "loss": 0.8308, + "step": 4806 + }, + { + "epoch": 0.2645715229236612, + "grad_norm": 0.8635388612747192, + "learning_rate": 9.589451133107134e-06, + "loss": 0.7882, + "step": 4807 + }, + { + "epoch": 0.26462656172601684, + "grad_norm": 0.8282824754714966, + "learning_rate": 9.589279101400915e-06, + "loss": 0.8055, + "step": 4808 + }, + { + "epoch": 0.26468160052837253, + "grad_norm": 0.7026814818382263, + "learning_rate": 9.589107035202903e-06, + "loss": 0.7567, + "step": 4809 + }, + { + "epoch": 0.26473663933072816, + "grad_norm": 0.7575708031654358, + "learning_rate": 9.588934934514392e-06, + "loss": 0.7456, + "step": 4810 + }, + { + "epoch": 0.2647916781330838, + "grad_norm": 0.9732069969177246, + "learning_rate": 9.588762799336671e-06, + "loss": 0.8217, + "step": 4811 + }, + { + "epoch": 0.2648467169354395, + "grad_norm": 0.786803126335144, + "learning_rate": 9.58859062967104e-06, + "loss": 0.729, + "step": 4812 + }, + { + "epoch": 0.2649017557377951, + "grad_norm": 0.8068973422050476, + "learning_rate": 9.588418425518789e-06, + "loss": 0.8204, + "step": 4813 + }, + { + "epoch": 0.2649567945401508, + "grad_norm": 0.8222702145576477, + "learning_rate": 9.588246186881213e-06, + "loss": 0.8349, + "step": 4814 + }, + { + "epoch": 0.26501183334250644, + "grad_norm": 0.7560802698135376, + "learning_rate": 9.588073913759608e-06, + "loss": 0.7601, + "step": 4815 + }, + { + "epoch": 0.26506687214486213, + "grad_norm": 0.9221365451812744, + "learning_rate": 9.587901606155266e-06, + "loss": 0.7725, + "step": 4816 + }, + { + "epoch": 0.26512191094721776, + "grad_norm": 0.8092262744903564, + "learning_rate": 9.587729264069485e-06, + "loss": 0.9074, + "step": 4817 + }, + { + "epoch": 0.26517694974957345, + "grad_norm": 0.8183920979499817, + "learning_rate": 9.587556887503557e-06, + "loss": 0.8321, + "step": 4818 + }, + { + "epoch": 0.2652319885519291, + "grad_norm": 0.7023420929908752, + "learning_rate": 9.587384476458781e-06, + "loss": 0.7842, + "step": 4819 + }, + { + "epoch": 0.2652870273542848, + "grad_norm": 1.2864880561828613, + "learning_rate": 9.58721203093645e-06, + "loss": 0.7519, + "step": 4820 + }, + { + "epoch": 0.2653420661566404, + "grad_norm": 0.8133784532546997, + "learning_rate": 9.587039550937864e-06, + "loss": 0.8208, + "step": 4821 + }, + { + "epoch": 0.2653971049589961, + "grad_norm": 0.739732027053833, + "learning_rate": 9.586867036464314e-06, + "loss": 0.8553, + "step": 4822 + }, + { + "epoch": 0.26545214376135173, + "grad_norm": 0.7539162635803223, + "learning_rate": 9.5866944875171e-06, + "loss": 0.7385, + "step": 4823 + }, + { + "epoch": 0.2655071825637074, + "grad_norm": 0.8012336492538452, + "learning_rate": 9.58652190409752e-06, + "loss": 0.8343, + "step": 4824 + }, + { + "epoch": 0.26556222136606306, + "grad_norm": 0.7972521185874939, + "learning_rate": 9.586349286206865e-06, + "loss": 0.8481, + "step": 4825 + }, + { + "epoch": 0.26561726016841875, + "grad_norm": 0.7772900462150574, + "learning_rate": 9.58617663384644e-06, + "loss": 0.7655, + "step": 4826 + }, + { + "epoch": 0.2656722989707744, + "grad_norm": 0.677916944026947, + "learning_rate": 9.586003947017537e-06, + "loss": 0.696, + "step": 4827 + }, + { + "epoch": 0.26572733777313007, + "grad_norm": 0.8254117369651794, + "learning_rate": 9.585831225721455e-06, + "loss": 0.7841, + "step": 4828 + }, + { + "epoch": 0.2657823765754857, + "grad_norm": 0.7256904244422913, + "learning_rate": 9.585658469959496e-06, + "loss": 0.8057, + "step": 4829 + }, + { + "epoch": 0.2658374153778414, + "grad_norm": 0.7651757001876831, + "learning_rate": 9.585485679732953e-06, + "loss": 0.7918, + "step": 4830 + }, + { + "epoch": 0.265892454180197, + "grad_norm": 0.7581052184104919, + "learning_rate": 9.58531285504313e-06, + "loss": 0.759, + "step": 4831 + }, + { + "epoch": 0.2659474929825527, + "grad_norm": 0.7190486192703247, + "learning_rate": 9.58513999589132e-06, + "loss": 0.7403, + "step": 4832 + }, + { + "epoch": 0.26600253178490835, + "grad_norm": 0.8603141903877258, + "learning_rate": 9.584967102278825e-06, + "loss": 0.8944, + "step": 4833 + }, + { + "epoch": 0.26605757058726404, + "grad_norm": 0.806297779083252, + "learning_rate": 9.584794174206947e-06, + "loss": 0.7039, + "step": 4834 + }, + { + "epoch": 0.26611260938961967, + "grad_norm": 0.7604451775550842, + "learning_rate": 9.584621211676981e-06, + "loss": 0.8076, + "step": 4835 + }, + { + "epoch": 0.26616764819197536, + "grad_norm": 0.7276773452758789, + "learning_rate": 9.584448214690232e-06, + "loss": 0.786, + "step": 4836 + }, + { + "epoch": 0.266222686994331, + "grad_norm": 0.8737080693244934, + "learning_rate": 9.584275183247994e-06, + "loss": 0.8071, + "step": 4837 + }, + { + "epoch": 0.2662777257966867, + "grad_norm": 0.8447219133377075, + "learning_rate": 9.584102117351574e-06, + "loss": 0.7682, + "step": 4838 + }, + { + "epoch": 0.2663327645990423, + "grad_norm": 0.7001703381538391, + "learning_rate": 9.583929017002268e-06, + "loss": 0.7077, + "step": 4839 + }, + { + "epoch": 0.266387803401398, + "grad_norm": 0.7935730218887329, + "learning_rate": 9.583755882201377e-06, + "loss": 0.8122, + "step": 4840 + }, + { + "epoch": 0.26644284220375364, + "grad_norm": 0.8763312697410583, + "learning_rate": 9.583582712950207e-06, + "loss": 0.8241, + "step": 4841 + }, + { + "epoch": 0.2664978810061093, + "grad_norm": 0.7910245656967163, + "learning_rate": 9.583409509250055e-06, + "loss": 0.7717, + "step": 4842 + }, + { + "epoch": 0.26655291980846496, + "grad_norm": 0.7975226640701294, + "learning_rate": 9.583236271102222e-06, + "loss": 0.7165, + "step": 4843 + }, + { + "epoch": 0.26660795861082065, + "grad_norm": 0.8060342073440552, + "learning_rate": 9.583062998508014e-06, + "loss": 0.7659, + "step": 4844 + }, + { + "epoch": 0.2666629974131763, + "grad_norm": 0.8779375553131104, + "learning_rate": 9.582889691468732e-06, + "loss": 0.8207, + "step": 4845 + }, + { + "epoch": 0.266718036215532, + "grad_norm": 0.7409310936927795, + "learning_rate": 9.582716349985677e-06, + "loss": 0.8439, + "step": 4846 + }, + { + "epoch": 0.2667730750178876, + "grad_norm": 0.8871899843215942, + "learning_rate": 9.582542974060152e-06, + "loss": 0.8305, + "step": 4847 + }, + { + "epoch": 0.2668281138202433, + "grad_norm": 0.9003115296363831, + "learning_rate": 9.58236956369346e-06, + "loss": 0.8334, + "step": 4848 + }, + { + "epoch": 0.26688315262259893, + "grad_norm": 1.0149577856063843, + "learning_rate": 9.582196118886909e-06, + "loss": 0.7962, + "step": 4849 + }, + { + "epoch": 0.2669381914249546, + "grad_norm": 0.785214900970459, + "learning_rate": 9.582022639641795e-06, + "loss": 0.7806, + "step": 4850 + }, + { + "epoch": 0.26699323022731025, + "grad_norm": 0.9833952188491821, + "learning_rate": 9.581849125959426e-06, + "loss": 0.7607, + "step": 4851 + }, + { + "epoch": 0.26704826902966594, + "grad_norm": 1.404751181602478, + "learning_rate": 9.581675577841104e-06, + "loss": 0.9046, + "step": 4852 + }, + { + "epoch": 0.2671033078320216, + "grad_norm": 0.791159451007843, + "learning_rate": 9.581501995288137e-06, + "loss": 0.6582, + "step": 4853 + }, + { + "epoch": 0.2671583466343772, + "grad_norm": 0.8507272005081177, + "learning_rate": 9.581328378301827e-06, + "loss": 0.8946, + "step": 4854 + }, + { + "epoch": 0.2672133854367329, + "grad_norm": 0.7372786998748779, + "learning_rate": 9.58115472688348e-06, + "loss": 0.7865, + "step": 4855 + }, + { + "epoch": 0.26726842423908853, + "grad_norm": 0.8293853998184204, + "learning_rate": 9.580981041034398e-06, + "loss": 0.9113, + "step": 4856 + }, + { + "epoch": 0.2673234630414442, + "grad_norm": 0.7212402820587158, + "learning_rate": 9.580807320755889e-06, + "loss": 0.7149, + "step": 4857 + }, + { + "epoch": 0.26737850184379985, + "grad_norm": 0.7885197401046753, + "learning_rate": 9.58063356604926e-06, + "loss": 0.8651, + "step": 4858 + }, + { + "epoch": 0.26743354064615554, + "grad_norm": 0.8444308042526245, + "learning_rate": 9.580459776915814e-06, + "loss": 0.7968, + "step": 4859 + }, + { + "epoch": 0.2674885794485112, + "grad_norm": 0.7974254488945007, + "learning_rate": 9.58028595335686e-06, + "loss": 0.8499, + "step": 4860 + }, + { + "epoch": 0.26754361825086687, + "grad_norm": 0.7491242289543152, + "learning_rate": 9.580112095373702e-06, + "loss": 0.8278, + "step": 4861 + }, + { + "epoch": 0.2675986570532225, + "grad_norm": 0.6856499314308167, + "learning_rate": 9.579938202967646e-06, + "loss": 0.7466, + "step": 4862 + }, + { + "epoch": 0.2676536958555782, + "grad_norm": 0.7347447872161865, + "learning_rate": 9.579764276140002e-06, + "loss": 0.8046, + "step": 4863 + }, + { + "epoch": 0.2677087346579338, + "grad_norm": 0.6797083020210266, + "learning_rate": 9.579590314892077e-06, + "loss": 0.7012, + "step": 4864 + }, + { + "epoch": 0.2677637734602895, + "grad_norm": 0.8219562768936157, + "learning_rate": 9.579416319225175e-06, + "loss": 0.7592, + "step": 4865 + }, + { + "epoch": 0.26781881226264515, + "grad_norm": 0.7388357520103455, + "learning_rate": 9.579242289140607e-06, + "loss": 0.8179, + "step": 4866 + }, + { + "epoch": 0.26787385106500083, + "grad_norm": 0.7394490838050842, + "learning_rate": 9.579068224639679e-06, + "loss": 0.694, + "step": 4867 + }, + { + "epoch": 0.26792888986735647, + "grad_norm": 0.7309017181396484, + "learning_rate": 9.578894125723699e-06, + "loss": 0.7882, + "step": 4868 + }, + { + "epoch": 0.26798392866971216, + "grad_norm": 0.7785035967826843, + "learning_rate": 9.578719992393978e-06, + "loss": 0.8142, + "step": 4869 + }, + { + "epoch": 0.2680389674720678, + "grad_norm": 0.8983079195022583, + "learning_rate": 9.57854582465182e-06, + "loss": 0.7809, + "step": 4870 + }, + { + "epoch": 0.2680940062744235, + "grad_norm": 0.7433765530586243, + "learning_rate": 9.578371622498542e-06, + "loss": 0.8937, + "step": 4871 + }, + { + "epoch": 0.2681490450767791, + "grad_norm": 0.8808667659759521, + "learning_rate": 9.578197385935446e-06, + "loss": 0.7821, + "step": 4872 + }, + { + "epoch": 0.2682040838791348, + "grad_norm": 0.825794517993927, + "learning_rate": 9.578023114963843e-06, + "loss": 0.8228, + "step": 4873 + }, + { + "epoch": 0.26825912268149044, + "grad_norm": 1.0165129899978638, + "learning_rate": 9.577848809585046e-06, + "loss": 0.7964, + "step": 4874 + }, + { + "epoch": 0.2683141614838461, + "grad_norm": 0.742028534412384, + "learning_rate": 9.577674469800362e-06, + "loss": 0.9126, + "step": 4875 + }, + { + "epoch": 0.26836920028620176, + "grad_norm": 0.7571890354156494, + "learning_rate": 9.577500095611101e-06, + "loss": 0.879, + "step": 4876 + }, + { + "epoch": 0.26842423908855745, + "grad_norm": 0.7577160596847534, + "learning_rate": 9.577325687018575e-06, + "loss": 0.8048, + "step": 4877 + }, + { + "epoch": 0.2684792778909131, + "grad_norm": 0.7704411745071411, + "learning_rate": 9.577151244024095e-06, + "loss": 0.7451, + "step": 4878 + }, + { + "epoch": 0.26853431669326877, + "grad_norm": 0.8323166966438293, + "learning_rate": 9.57697676662897e-06, + "loss": 0.7591, + "step": 4879 + }, + { + "epoch": 0.2685893554956244, + "grad_norm": 0.7257028222084045, + "learning_rate": 9.576802254834516e-06, + "loss": 0.7941, + "step": 4880 + }, + { + "epoch": 0.2686443942979801, + "grad_norm": 0.8170442581176758, + "learning_rate": 9.57662770864204e-06, + "loss": 0.8617, + "step": 4881 + }, + { + "epoch": 0.2686994331003357, + "grad_norm": 0.7435339689254761, + "learning_rate": 9.576453128052852e-06, + "loss": 0.7683, + "step": 4882 + }, + { + "epoch": 0.2687544719026914, + "grad_norm": 0.7932955026626587, + "learning_rate": 9.576278513068271e-06, + "loss": 0.7103, + "step": 4883 + }, + { + "epoch": 0.26880951070504705, + "grad_norm": 0.8008469939231873, + "learning_rate": 9.576103863689604e-06, + "loss": 0.8144, + "step": 4884 + }, + { + "epoch": 0.26886454950740274, + "grad_norm": 0.8573774695396423, + "learning_rate": 9.575929179918167e-06, + "loss": 0.8992, + "step": 4885 + }, + { + "epoch": 0.2689195883097584, + "grad_norm": 0.7326993942260742, + "learning_rate": 9.57575446175527e-06, + "loss": 0.699, + "step": 4886 + }, + { + "epoch": 0.26897462711211406, + "grad_norm": 0.8249791264533997, + "learning_rate": 9.575579709202228e-06, + "loss": 0.7445, + "step": 4887 + }, + { + "epoch": 0.2690296659144697, + "grad_norm": 0.7136644124984741, + "learning_rate": 9.575404922260351e-06, + "loss": 0.779, + "step": 4888 + }, + { + "epoch": 0.2690847047168254, + "grad_norm": 1.0130438804626465, + "learning_rate": 9.575230100930958e-06, + "loss": 0.8535, + "step": 4889 + }, + { + "epoch": 0.269139743519181, + "grad_norm": 0.6784926652908325, + "learning_rate": 9.575055245215358e-06, + "loss": 0.6745, + "step": 4890 + }, + { + "epoch": 0.2691947823215367, + "grad_norm": 0.7492508888244629, + "learning_rate": 9.57488035511487e-06, + "loss": 0.6748, + "step": 4891 + }, + { + "epoch": 0.26924982112389234, + "grad_norm": 0.7951217889785767, + "learning_rate": 9.574705430630807e-06, + "loss": 0.8119, + "step": 4892 + }, + { + "epoch": 0.26930485992624803, + "grad_norm": 0.9756677746772766, + "learning_rate": 9.574530471764478e-06, + "loss": 0.855, + "step": 4893 + }, + { + "epoch": 0.26935989872860366, + "grad_norm": 0.7806811928749084, + "learning_rate": 9.574355478517206e-06, + "loss": 0.8432, + "step": 4894 + }, + { + "epoch": 0.26941493753095935, + "grad_norm": 0.7814774513244629, + "learning_rate": 9.574180450890301e-06, + "loss": 0.8226, + "step": 4895 + }, + { + "epoch": 0.269469976333315, + "grad_norm": 0.7745325565338135, + "learning_rate": 9.574005388885081e-06, + "loss": 0.7722, + "step": 4896 + }, + { + "epoch": 0.2695250151356706, + "grad_norm": 0.7805666327476501, + "learning_rate": 9.573830292502862e-06, + "loss": 0.8357, + "step": 4897 + }, + { + "epoch": 0.2695800539380263, + "grad_norm": 0.8428031802177429, + "learning_rate": 9.573655161744958e-06, + "loss": 0.8056, + "step": 4898 + }, + { + "epoch": 0.26963509274038194, + "grad_norm": 0.7896600961685181, + "learning_rate": 9.573479996612684e-06, + "loss": 0.7984, + "step": 4899 + }, + { + "epoch": 0.26969013154273763, + "grad_norm": 0.7718683481216431, + "learning_rate": 9.57330479710736e-06, + "loss": 0.7527, + "step": 4900 + }, + { + "epoch": 0.26974517034509327, + "grad_norm": 0.7868129014968872, + "learning_rate": 9.573129563230302e-06, + "loss": 0.7876, + "step": 4901 + }, + { + "epoch": 0.26980020914744895, + "grad_norm": 0.8493777513504028, + "learning_rate": 9.572954294982826e-06, + "loss": 0.864, + "step": 4902 + }, + { + "epoch": 0.2698552479498046, + "grad_norm": 0.7492502331733704, + "learning_rate": 9.57277899236625e-06, + "loss": 0.8236, + "step": 4903 + }, + { + "epoch": 0.2699102867521603, + "grad_norm": 1.0534250736236572, + "learning_rate": 9.57260365538189e-06, + "loss": 0.8012, + "step": 4904 + }, + { + "epoch": 0.2699653255545159, + "grad_norm": 0.7557470202445984, + "learning_rate": 9.572428284031065e-06, + "loss": 0.9084, + "step": 4905 + }, + { + "epoch": 0.2700203643568716, + "grad_norm": 0.8055123686790466, + "learning_rate": 9.572252878315094e-06, + "loss": 0.7468, + "step": 4906 + }, + { + "epoch": 0.27007540315922723, + "grad_norm": 0.8399039506912231, + "learning_rate": 9.572077438235294e-06, + "loss": 0.9293, + "step": 4907 + }, + { + "epoch": 0.2701304419615829, + "grad_norm": 0.9800041317939758, + "learning_rate": 9.571901963792983e-06, + "loss": 0.8664, + "step": 4908 + }, + { + "epoch": 0.27018548076393856, + "grad_norm": 0.7732129096984863, + "learning_rate": 9.571726454989482e-06, + "loss": 0.7227, + "step": 4909 + }, + { + "epoch": 0.27024051956629425, + "grad_norm": 0.730754017829895, + "learning_rate": 9.571550911826109e-06, + "loss": 0.6467, + "step": 4910 + }, + { + "epoch": 0.2702955583686499, + "grad_norm": 0.8245325684547424, + "learning_rate": 9.57137533430418e-06, + "loss": 0.7847, + "step": 4911 + }, + { + "epoch": 0.27035059717100557, + "grad_norm": 0.8606786131858826, + "learning_rate": 9.57119972242502e-06, + "loss": 0.9556, + "step": 4912 + }, + { + "epoch": 0.2704056359733612, + "grad_norm": 0.7480195164680481, + "learning_rate": 9.571024076189947e-06, + "loss": 0.8504, + "step": 4913 + }, + { + "epoch": 0.2704606747757169, + "grad_norm": 0.718913197517395, + "learning_rate": 9.57084839560028e-06, + "loss": 0.7869, + "step": 4914 + }, + { + "epoch": 0.2705157135780725, + "grad_norm": 0.9778180122375488, + "learning_rate": 9.57067268065734e-06, + "loss": 0.8514, + "step": 4915 + }, + { + "epoch": 0.2705707523804282, + "grad_norm": 0.7394844889640808, + "learning_rate": 9.570496931362448e-06, + "loss": 0.7906, + "step": 4916 + }, + { + "epoch": 0.27062579118278385, + "grad_norm": 0.7648600339889526, + "learning_rate": 9.570321147716923e-06, + "loss": 0.8194, + "step": 4917 + }, + { + "epoch": 0.27068082998513954, + "grad_norm": 0.8002632260322571, + "learning_rate": 9.57014532972209e-06, + "loss": 0.8079, + "step": 4918 + }, + { + "epoch": 0.27073586878749517, + "grad_norm": 0.8668341040611267, + "learning_rate": 9.569969477379267e-06, + "loss": 0.8954, + "step": 4919 + }, + { + "epoch": 0.27079090758985086, + "grad_norm": 0.7403327226638794, + "learning_rate": 9.569793590689775e-06, + "loss": 0.7755, + "step": 4920 + }, + { + "epoch": 0.2708459463922065, + "grad_norm": 0.7399682998657227, + "learning_rate": 9.569617669654938e-06, + "loss": 0.8203, + "step": 4921 + }, + { + "epoch": 0.2709009851945622, + "grad_norm": 0.788600504398346, + "learning_rate": 9.56944171427608e-06, + "loss": 0.7565, + "step": 4922 + }, + { + "epoch": 0.2709560239969178, + "grad_norm": 0.7044861912727356, + "learning_rate": 9.56926572455452e-06, + "loss": 0.7073, + "step": 4923 + }, + { + "epoch": 0.2710110627992735, + "grad_norm": 0.8195114135742188, + "learning_rate": 9.569089700491581e-06, + "loss": 0.8658, + "step": 4924 + }, + { + "epoch": 0.27106610160162914, + "grad_norm": 0.7792258858680725, + "learning_rate": 9.568913642088589e-06, + "loss": 0.8628, + "step": 4925 + }, + { + "epoch": 0.27112114040398483, + "grad_norm": 0.764930248260498, + "learning_rate": 9.568737549346862e-06, + "loss": 0.7761, + "step": 4926 + }, + { + "epoch": 0.27117617920634046, + "grad_norm": 0.7226328253746033, + "learning_rate": 9.56856142226773e-06, + "loss": 0.7208, + "step": 4927 + }, + { + "epoch": 0.27123121800869615, + "grad_norm": 0.8726598620414734, + "learning_rate": 9.568385260852512e-06, + "loss": 0.8599, + "step": 4928 + }, + { + "epoch": 0.2712862568110518, + "grad_norm": 1.0126571655273438, + "learning_rate": 9.568209065102533e-06, + "loss": 0.8145, + "step": 4929 + }, + { + "epoch": 0.2713412956134075, + "grad_norm": 0.7764692306518555, + "learning_rate": 9.568032835019116e-06, + "loss": 0.6758, + "step": 4930 + }, + { + "epoch": 0.2713963344157631, + "grad_norm": 0.6955474019050598, + "learning_rate": 9.567856570603589e-06, + "loss": 0.7461, + "step": 4931 + }, + { + "epoch": 0.2714513732181188, + "grad_norm": 0.7136832475662231, + "learning_rate": 9.567680271857274e-06, + "loss": 0.7692, + "step": 4932 + }, + { + "epoch": 0.27150641202047443, + "grad_norm": 1.2288198471069336, + "learning_rate": 9.567503938781497e-06, + "loss": 0.7815, + "step": 4933 + }, + { + "epoch": 0.2715614508228301, + "grad_norm": 0.9182234406471252, + "learning_rate": 9.567327571377584e-06, + "loss": 0.8822, + "step": 4934 + }, + { + "epoch": 0.27161648962518575, + "grad_norm": 0.7684763669967651, + "learning_rate": 9.567151169646859e-06, + "loss": 0.7618, + "step": 4935 + }, + { + "epoch": 0.27167152842754144, + "grad_norm": 0.872360348701477, + "learning_rate": 9.566974733590647e-06, + "loss": 0.7975, + "step": 4936 + }, + { + "epoch": 0.2717265672298971, + "grad_norm": 0.9010463356971741, + "learning_rate": 9.566798263210277e-06, + "loss": 0.7159, + "step": 4937 + }, + { + "epoch": 0.27178160603225276, + "grad_norm": 0.7254281044006348, + "learning_rate": 9.566621758507072e-06, + "loss": 0.6724, + "step": 4938 + }, + { + "epoch": 0.2718366448346084, + "grad_norm": 0.8478212356567383, + "learning_rate": 9.566445219482363e-06, + "loss": 0.659, + "step": 4939 + }, + { + "epoch": 0.27189168363696403, + "grad_norm": 0.9038714170455933, + "learning_rate": 9.56626864613747e-06, + "loss": 0.8766, + "step": 4940 + }, + { + "epoch": 0.2719467224393197, + "grad_norm": 0.9704582691192627, + "learning_rate": 9.566092038473728e-06, + "loss": 0.8972, + "step": 4941 + }, + { + "epoch": 0.27200176124167535, + "grad_norm": 0.7069430947303772, + "learning_rate": 9.565915396492459e-06, + "loss": 0.8116, + "step": 4942 + }, + { + "epoch": 0.27205680004403104, + "grad_norm": 0.7432642579078674, + "learning_rate": 9.565738720194993e-06, + "loss": 0.847, + "step": 4943 + }, + { + "epoch": 0.2721118388463867, + "grad_norm": 0.6813814043998718, + "learning_rate": 9.565562009582655e-06, + "loss": 0.7146, + "step": 4944 + }, + { + "epoch": 0.27216687764874237, + "grad_norm": 0.7447707056999207, + "learning_rate": 9.565385264656776e-06, + "loss": 0.7696, + "step": 4945 + }, + { + "epoch": 0.272221916451098, + "grad_norm": 0.875073254108429, + "learning_rate": 9.565208485418685e-06, + "loss": 0.8714, + "step": 4946 + }, + { + "epoch": 0.2722769552534537, + "grad_norm": 0.7753880620002747, + "learning_rate": 9.565031671869707e-06, + "loss": 0.739, + "step": 4947 + }, + { + "epoch": 0.2723319940558093, + "grad_norm": 0.749264121055603, + "learning_rate": 9.564854824011172e-06, + "loss": 0.7957, + "step": 4948 + }, + { + "epoch": 0.272387032858165, + "grad_norm": 0.6733991503715515, + "learning_rate": 9.564677941844412e-06, + "loss": 0.7402, + "step": 4949 + }, + { + "epoch": 0.27244207166052065, + "grad_norm": 0.7426447868347168, + "learning_rate": 9.564501025370753e-06, + "loss": 0.7977, + "step": 4950 + }, + { + "epoch": 0.27249711046287634, + "grad_norm": 0.7930514812469482, + "learning_rate": 9.564324074591529e-06, + "loss": 0.8485, + "step": 4951 + }, + { + "epoch": 0.27255214926523197, + "grad_norm": 0.8087072968482971, + "learning_rate": 9.564147089508064e-06, + "loss": 0.9215, + "step": 4952 + }, + { + "epoch": 0.27260718806758766, + "grad_norm": 0.7560327053070068, + "learning_rate": 9.563970070121694e-06, + "loss": 0.7966, + "step": 4953 + }, + { + "epoch": 0.2726622268699433, + "grad_norm": 0.735573947429657, + "learning_rate": 9.563793016433744e-06, + "loss": 0.7737, + "step": 4954 + }, + { + "epoch": 0.272717265672299, + "grad_norm": 0.7603545784950256, + "learning_rate": 9.563615928445548e-06, + "loss": 0.7717, + "step": 4955 + }, + { + "epoch": 0.2727723044746546, + "grad_norm": 0.7185375094413757, + "learning_rate": 9.563438806158437e-06, + "loss": 0.8057, + "step": 4956 + }, + { + "epoch": 0.2728273432770103, + "grad_norm": 0.7619272470474243, + "learning_rate": 9.56326164957374e-06, + "loss": 0.8173, + "step": 4957 + }, + { + "epoch": 0.27288238207936594, + "grad_norm": 0.7868000864982605, + "learning_rate": 9.563084458692793e-06, + "loss": 0.6855, + "step": 4958 + }, + { + "epoch": 0.2729374208817216, + "grad_norm": 0.7949535846710205, + "learning_rate": 9.562907233516923e-06, + "loss": 0.7754, + "step": 4959 + }, + { + "epoch": 0.27299245968407726, + "grad_norm": 0.7037919163703918, + "learning_rate": 9.562729974047462e-06, + "loss": 0.7419, + "step": 4960 + }, + { + "epoch": 0.27304749848643295, + "grad_norm": 0.7236568927764893, + "learning_rate": 9.562552680285746e-06, + "loss": 0.7135, + "step": 4961 + }, + { + "epoch": 0.2731025372887886, + "grad_norm": 0.8410467505455017, + "learning_rate": 9.562375352233105e-06, + "loss": 0.8507, + "step": 4962 + }, + { + "epoch": 0.27315757609114427, + "grad_norm": 0.8043560981750488, + "learning_rate": 9.562197989890871e-06, + "loss": 0.8484, + "step": 4963 + }, + { + "epoch": 0.2732126148934999, + "grad_norm": 0.6926127672195435, + "learning_rate": 9.56202059326038e-06, + "loss": 0.8087, + "step": 4964 + }, + { + "epoch": 0.2732676536958556, + "grad_norm": 0.7149024605751038, + "learning_rate": 9.561843162342961e-06, + "loss": 0.7349, + "step": 4965 + }, + { + "epoch": 0.27332269249821123, + "grad_norm": 0.7165781855583191, + "learning_rate": 9.561665697139952e-06, + "loss": 0.8139, + "step": 4966 + }, + { + "epoch": 0.2733777313005669, + "grad_norm": 0.7481133341789246, + "learning_rate": 9.561488197652684e-06, + "loss": 0.7712, + "step": 4967 + }, + { + "epoch": 0.27343277010292255, + "grad_norm": 0.6928209066390991, + "learning_rate": 9.561310663882491e-06, + "loss": 0.7524, + "step": 4968 + }, + { + "epoch": 0.27348780890527824, + "grad_norm": 0.7397856116294861, + "learning_rate": 9.561133095830708e-06, + "loss": 0.718, + "step": 4969 + }, + { + "epoch": 0.2735428477076339, + "grad_norm": 0.7712383270263672, + "learning_rate": 9.560955493498672e-06, + "loss": 0.8201, + "step": 4970 + }, + { + "epoch": 0.27359788650998956, + "grad_norm": 0.96076899766922, + "learning_rate": 9.560777856887714e-06, + "loss": 0.8555, + "step": 4971 + }, + { + "epoch": 0.2736529253123452, + "grad_norm": 0.7331019639968872, + "learning_rate": 9.56060018599917e-06, + "loss": 0.8315, + "step": 4972 + }, + { + "epoch": 0.2737079641147009, + "grad_norm": 0.7157140970230103, + "learning_rate": 9.560422480834374e-06, + "loss": 0.7177, + "step": 4973 + }, + { + "epoch": 0.2737630029170565, + "grad_norm": 0.807614266872406, + "learning_rate": 9.560244741394666e-06, + "loss": 0.8413, + "step": 4974 + }, + { + "epoch": 0.2738180417194122, + "grad_norm": 0.7618574500083923, + "learning_rate": 9.560066967681378e-06, + "loss": 0.8248, + "step": 4975 + }, + { + "epoch": 0.27387308052176784, + "grad_norm": 0.7886885404586792, + "learning_rate": 9.559889159695848e-06, + "loss": 0.8793, + "step": 4976 + }, + { + "epoch": 0.27392811932412353, + "grad_norm": 1.0090755224227905, + "learning_rate": 9.559711317439411e-06, + "loss": 0.9255, + "step": 4977 + }, + { + "epoch": 0.27398315812647916, + "grad_norm": 0.7855443358421326, + "learning_rate": 9.559533440913405e-06, + "loss": 0.8001, + "step": 4978 + }, + { + "epoch": 0.27403819692883485, + "grad_norm": 0.768741250038147, + "learning_rate": 9.559355530119165e-06, + "loss": 0.8109, + "step": 4979 + }, + { + "epoch": 0.2740932357311905, + "grad_norm": 0.759589672088623, + "learning_rate": 9.55917758505803e-06, + "loss": 0.8001, + "step": 4980 + }, + { + "epoch": 0.2741482745335462, + "grad_norm": 0.7937445640563965, + "learning_rate": 9.558999605731338e-06, + "loss": 0.8924, + "step": 4981 + }, + { + "epoch": 0.2742033133359018, + "grad_norm": 0.9041592478752136, + "learning_rate": 9.558821592140423e-06, + "loss": 0.9167, + "step": 4982 + }, + { + "epoch": 0.27425835213825744, + "grad_norm": 0.6971380710601807, + "learning_rate": 9.558643544286627e-06, + "loss": 0.7589, + "step": 4983 + }, + { + "epoch": 0.27431339094061313, + "grad_norm": 0.9292929172515869, + "learning_rate": 9.558465462171287e-06, + "loss": 0.9566, + "step": 4984 + }, + { + "epoch": 0.27436842974296877, + "grad_norm": 0.8320629000663757, + "learning_rate": 9.558287345795738e-06, + "loss": 0.8854, + "step": 4985 + }, + { + "epoch": 0.27442346854532446, + "grad_norm": 0.797272801399231, + "learning_rate": 9.558109195161325e-06, + "loss": 0.7838, + "step": 4986 + }, + { + "epoch": 0.2744785073476801, + "grad_norm": 0.9702700972557068, + "learning_rate": 9.557931010269382e-06, + "loss": 0.8593, + "step": 4987 + }, + { + "epoch": 0.2745335461500358, + "grad_norm": 0.8309103846549988, + "learning_rate": 9.557752791121248e-06, + "loss": 0.8902, + "step": 4988 + }, + { + "epoch": 0.2745885849523914, + "grad_norm": 0.706667959690094, + "learning_rate": 9.557574537718265e-06, + "loss": 0.7259, + "step": 4989 + }, + { + "epoch": 0.2746436237547471, + "grad_norm": 0.770239531993866, + "learning_rate": 9.557396250061771e-06, + "loss": 0.8644, + "step": 4990 + }, + { + "epoch": 0.27469866255710274, + "grad_norm": 0.8695803880691528, + "learning_rate": 9.557217928153108e-06, + "loss": 0.895, + "step": 4991 + }, + { + "epoch": 0.2747537013594584, + "grad_norm": 0.7525948286056519, + "learning_rate": 9.557039571993614e-06, + "loss": 0.7029, + "step": 4992 + }, + { + "epoch": 0.27480874016181406, + "grad_norm": 0.7616680264472961, + "learning_rate": 9.556861181584631e-06, + "loss": 0.8025, + "step": 4993 + }, + { + "epoch": 0.27486377896416975, + "grad_norm": 0.7216167449951172, + "learning_rate": 9.5566827569275e-06, + "loss": 0.8314, + "step": 4994 + }, + { + "epoch": 0.2749188177665254, + "grad_norm": 0.7412614226341248, + "learning_rate": 9.55650429802356e-06, + "loss": 0.7877, + "step": 4995 + }, + { + "epoch": 0.27497385656888107, + "grad_norm": 0.7176525592803955, + "learning_rate": 9.556325804874154e-06, + "loss": 0.7615, + "step": 4996 + }, + { + "epoch": 0.2750288953712367, + "grad_norm": 0.7544515132904053, + "learning_rate": 9.556147277480623e-06, + "loss": 0.8352, + "step": 4997 + }, + { + "epoch": 0.2750839341735924, + "grad_norm": 0.7318205833435059, + "learning_rate": 9.555968715844309e-06, + "loss": 0.7403, + "step": 4998 + }, + { + "epoch": 0.275138972975948, + "grad_norm": 0.7495027780532837, + "learning_rate": 9.555790119966552e-06, + "loss": 0.7611, + "step": 4999 + }, + { + "epoch": 0.2751940117783037, + "grad_norm": 0.7544401288032532, + "learning_rate": 9.555611489848697e-06, + "loss": 0.8594, + "step": 5000 + }, + { + "epoch": 0.27524905058065935, + "grad_norm": 0.7698250412940979, + "learning_rate": 9.555432825492084e-06, + "loss": 0.8438, + "step": 5001 + }, + { + "epoch": 0.27530408938301504, + "grad_norm": 0.7668892741203308, + "learning_rate": 9.555254126898059e-06, + "loss": 0.8082, + "step": 5002 + }, + { + "epoch": 0.27535912818537067, + "grad_norm": 0.9170669317245483, + "learning_rate": 9.555075394067963e-06, + "loss": 0.7443, + "step": 5003 + }, + { + "epoch": 0.27541416698772636, + "grad_norm": 0.7890255451202393, + "learning_rate": 9.55489662700314e-06, + "loss": 0.8269, + "step": 5004 + }, + { + "epoch": 0.275469205790082, + "grad_norm": 0.6740512847900391, + "learning_rate": 9.554717825704932e-06, + "loss": 0.6906, + "step": 5005 + }, + { + "epoch": 0.2755242445924377, + "grad_norm": 0.8032376170158386, + "learning_rate": 9.554538990174685e-06, + "loss": 0.812, + "step": 5006 + }, + { + "epoch": 0.2755792833947933, + "grad_norm": 0.6932135224342346, + "learning_rate": 9.554360120413741e-06, + "loss": 0.7823, + "step": 5007 + }, + { + "epoch": 0.275634322197149, + "grad_norm": 0.7447643876075745, + "learning_rate": 9.554181216423447e-06, + "loss": 0.8753, + "step": 5008 + }, + { + "epoch": 0.27568936099950464, + "grad_norm": 0.8035081624984741, + "learning_rate": 9.554002278205145e-06, + "loss": 0.7135, + "step": 5009 + }, + { + "epoch": 0.27574439980186033, + "grad_norm": 0.7544171214103699, + "learning_rate": 9.553823305760182e-06, + "loss": 0.7574, + "step": 5010 + }, + { + "epoch": 0.27579943860421596, + "grad_norm": 0.6648419499397278, + "learning_rate": 9.553644299089902e-06, + "loss": 0.7566, + "step": 5011 + }, + { + "epoch": 0.27585447740657165, + "grad_norm": 0.7481752038002014, + "learning_rate": 9.55346525819565e-06, + "loss": 0.7862, + "step": 5012 + }, + { + "epoch": 0.2759095162089273, + "grad_norm": 0.7000668048858643, + "learning_rate": 9.55328618307877e-06, + "loss": 0.7767, + "step": 5013 + }, + { + "epoch": 0.275964555011283, + "grad_norm": 0.7435166239738464, + "learning_rate": 9.553107073740612e-06, + "loss": 0.6888, + "step": 5014 + }, + { + "epoch": 0.2760195938136386, + "grad_norm": 0.7593170404434204, + "learning_rate": 9.552927930182521e-06, + "loss": 0.7272, + "step": 5015 + }, + { + "epoch": 0.2760746326159943, + "grad_norm": 0.870079755783081, + "learning_rate": 9.55274875240584e-06, + "loss": 0.8692, + "step": 5016 + }, + { + "epoch": 0.27612967141834993, + "grad_norm": 0.8550307750701904, + "learning_rate": 9.55256954041192e-06, + "loss": 0.8729, + "step": 5017 + }, + { + "epoch": 0.2761847102207056, + "grad_norm": 0.888830304145813, + "learning_rate": 9.552390294202105e-06, + "loss": 0.8607, + "step": 5018 + }, + { + "epoch": 0.27623974902306125, + "grad_norm": 0.8295729160308838, + "learning_rate": 9.552211013777743e-06, + "loss": 0.8722, + "step": 5019 + }, + { + "epoch": 0.27629478782541694, + "grad_norm": 0.7732356190681458, + "learning_rate": 9.552031699140182e-06, + "loss": 0.8332, + "step": 5020 + }, + { + "epoch": 0.2763498266277726, + "grad_norm": 0.9132987856864929, + "learning_rate": 9.55185235029077e-06, + "loss": 0.769, + "step": 5021 + }, + { + "epoch": 0.27640486543012827, + "grad_norm": 0.7221076488494873, + "learning_rate": 9.551672967230851e-06, + "loss": 0.8505, + "step": 5022 + }, + { + "epoch": 0.2764599042324839, + "grad_norm": 0.8526949882507324, + "learning_rate": 9.551493549961778e-06, + "loss": 0.8002, + "step": 5023 + }, + { + "epoch": 0.2765149430348396, + "grad_norm": 0.9513188004493713, + "learning_rate": 9.551314098484901e-06, + "loss": 0.8558, + "step": 5024 + }, + { + "epoch": 0.2765699818371952, + "grad_norm": 0.7543003559112549, + "learning_rate": 9.551134612801563e-06, + "loss": 0.8292, + "step": 5025 + }, + { + "epoch": 0.27662502063955086, + "grad_norm": 0.7531017065048218, + "learning_rate": 9.550955092913115e-06, + "loss": 0.7837, + "step": 5026 + }, + { + "epoch": 0.27668005944190655, + "grad_norm": 0.8725717663764954, + "learning_rate": 9.550775538820907e-06, + "loss": 0.8362, + "step": 5027 + }, + { + "epoch": 0.2767350982442622, + "grad_norm": 0.8122721910476685, + "learning_rate": 9.550595950526288e-06, + "loss": 0.8539, + "step": 5028 + }, + { + "epoch": 0.27679013704661787, + "grad_norm": 0.7756829261779785, + "learning_rate": 9.550416328030608e-06, + "loss": 0.787, + "step": 5029 + }, + { + "epoch": 0.2768451758489735, + "grad_norm": 0.9086001515388489, + "learning_rate": 9.550236671335218e-06, + "loss": 0.7972, + "step": 5030 + }, + { + "epoch": 0.2769002146513292, + "grad_norm": 0.7857060432434082, + "learning_rate": 9.550056980441466e-06, + "loss": 0.7577, + "step": 5031 + }, + { + "epoch": 0.2769552534536848, + "grad_norm": 0.8190392851829529, + "learning_rate": 9.549877255350703e-06, + "loss": 0.81, + "step": 5032 + }, + { + "epoch": 0.2770102922560405, + "grad_norm": 0.7714588642120361, + "learning_rate": 9.549697496064283e-06, + "loss": 0.7916, + "step": 5033 + }, + { + "epoch": 0.27706533105839615, + "grad_norm": 0.7178533673286438, + "learning_rate": 9.549517702583552e-06, + "loss": 0.8001, + "step": 5034 + }, + { + "epoch": 0.27712036986075184, + "grad_norm": 0.7552955150604248, + "learning_rate": 9.549337874909865e-06, + "loss": 0.8361, + "step": 5035 + }, + { + "epoch": 0.27717540866310747, + "grad_norm": 0.7823992371559143, + "learning_rate": 9.549158013044573e-06, + "loss": 0.7033, + "step": 5036 + }, + { + "epoch": 0.27723044746546316, + "grad_norm": 0.731504499912262, + "learning_rate": 9.548978116989026e-06, + "loss": 0.73, + "step": 5037 + }, + { + "epoch": 0.2772854862678188, + "grad_norm": 0.7455994486808777, + "learning_rate": 9.548798186744578e-06, + "loss": 0.8005, + "step": 5038 + }, + { + "epoch": 0.2773405250701745, + "grad_norm": 0.7020164728164673, + "learning_rate": 9.54861822231258e-06, + "loss": 0.6707, + "step": 5039 + }, + { + "epoch": 0.2773955638725301, + "grad_norm": 0.7526360750198364, + "learning_rate": 9.548438223694385e-06, + "loss": 0.7686, + "step": 5040 + }, + { + "epoch": 0.2774506026748858, + "grad_norm": 0.7268579006195068, + "learning_rate": 9.548258190891344e-06, + "loss": 0.7039, + "step": 5041 + }, + { + "epoch": 0.27750564147724144, + "grad_norm": 0.9361631274223328, + "learning_rate": 9.548078123904815e-06, + "loss": 0.8023, + "step": 5042 + }, + { + "epoch": 0.2775606802795971, + "grad_norm": 0.7786710262298584, + "learning_rate": 9.547898022736147e-06, + "loss": 0.6866, + "step": 5043 + }, + { + "epoch": 0.27761571908195276, + "grad_norm": 0.7175624370574951, + "learning_rate": 9.547717887386695e-06, + "loss": 0.7554, + "step": 5044 + }, + { + "epoch": 0.27767075788430845, + "grad_norm": 0.9157657623291016, + "learning_rate": 9.547537717857813e-06, + "loss": 0.7936, + "step": 5045 + }, + { + "epoch": 0.2777257966866641, + "grad_norm": 0.7881377935409546, + "learning_rate": 9.547357514150854e-06, + "loss": 0.8198, + "step": 5046 + }, + { + "epoch": 0.2777808354890198, + "grad_norm": 1.0444039106369019, + "learning_rate": 9.547177276267173e-06, + "loss": 0.7954, + "step": 5047 + }, + { + "epoch": 0.2778358742913754, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.546997004208124e-06, + "loss": 0.7697, + "step": 5048 + }, + { + "epoch": 0.2778909130937311, + "grad_norm": 0.7304134368896484, + "learning_rate": 9.546816697975066e-06, + "loss": 0.7034, + "step": 5049 + }, + { + "epoch": 0.27794595189608673, + "grad_norm": 0.7783082723617554, + "learning_rate": 9.546636357569347e-06, + "loss": 0.8185, + "step": 5050 + }, + { + "epoch": 0.2780009906984424, + "grad_norm": 0.750712513923645, + "learning_rate": 9.54645598299233e-06, + "loss": 0.7336, + "step": 5051 + }, + { + "epoch": 0.27805602950079805, + "grad_norm": 0.7849590182304382, + "learning_rate": 9.546275574245364e-06, + "loss": 0.8088, + "step": 5052 + }, + { + "epoch": 0.27811106830315374, + "grad_norm": 0.8490208983421326, + "learning_rate": 9.546095131329809e-06, + "loss": 0.8507, + "step": 5053 + }, + { + "epoch": 0.2781661071055094, + "grad_norm": 0.8107250928878784, + "learning_rate": 9.54591465424702e-06, + "loss": 0.7787, + "step": 5054 + }, + { + "epoch": 0.27822114590786506, + "grad_norm": 0.8278594613075256, + "learning_rate": 9.54573414299835e-06, + "loss": 0.7836, + "step": 5055 + }, + { + "epoch": 0.2782761847102207, + "grad_norm": 0.7982015013694763, + "learning_rate": 9.545553597585163e-06, + "loss": 0.7672, + "step": 5056 + }, + { + "epoch": 0.2783312235125764, + "grad_norm": 0.7311522364616394, + "learning_rate": 9.54537301800881e-06, + "loss": 0.7571, + "step": 5057 + }, + { + "epoch": 0.278386262314932, + "grad_norm": 0.8039999604225159, + "learning_rate": 9.545192404270651e-06, + "loss": 0.764, + "step": 5058 + }, + { + "epoch": 0.2784413011172877, + "grad_norm": 0.7810946702957153, + "learning_rate": 9.545011756372042e-06, + "loss": 0.9217, + "step": 5059 + }, + { + "epoch": 0.27849633991964334, + "grad_norm": 0.7092248797416687, + "learning_rate": 9.544831074314343e-06, + "loss": 0.7599, + "step": 5060 + }, + { + "epoch": 0.27855137872199903, + "grad_norm": 0.831550657749176, + "learning_rate": 9.544650358098908e-06, + "loss": 0.7278, + "step": 5061 + }, + { + "epoch": 0.27860641752435467, + "grad_norm": 0.7645474076271057, + "learning_rate": 9.544469607727098e-06, + "loss": 0.7945, + "step": 5062 + }, + { + "epoch": 0.27866145632671036, + "grad_norm": 0.6956788301467896, + "learning_rate": 9.544288823200273e-06, + "loss": 0.749, + "step": 5063 + }, + { + "epoch": 0.278716495129066, + "grad_norm": 0.7262974381446838, + "learning_rate": 9.544108004519786e-06, + "loss": 0.8074, + "step": 5064 + }, + { + "epoch": 0.2787715339314217, + "grad_norm": 0.7439202666282654, + "learning_rate": 9.543927151687001e-06, + "loss": 0.9403, + "step": 5065 + }, + { + "epoch": 0.2788265727337773, + "grad_norm": 0.8468778133392334, + "learning_rate": 9.543746264703277e-06, + "loss": 0.8182, + "step": 5066 + }, + { + "epoch": 0.278881611536133, + "grad_norm": 0.8396204113960266, + "learning_rate": 9.54356534356997e-06, + "loss": 0.8067, + "step": 5067 + }, + { + "epoch": 0.27893665033848863, + "grad_norm": 0.718758225440979, + "learning_rate": 9.543384388288445e-06, + "loss": 0.8172, + "step": 5068 + }, + { + "epoch": 0.27899168914084427, + "grad_norm": 0.7562685012817383, + "learning_rate": 9.543203398860056e-06, + "loss": 0.9053, + "step": 5069 + }, + { + "epoch": 0.27904672794319996, + "grad_norm": 0.9592792987823486, + "learning_rate": 9.543022375286169e-06, + "loss": 0.9375, + "step": 5070 + }, + { + "epoch": 0.2791017667455556, + "grad_norm": 0.7162739634513855, + "learning_rate": 9.54284131756814e-06, + "loss": 0.7297, + "step": 5071 + }, + { + "epoch": 0.2791568055479113, + "grad_norm": 0.7703517079353333, + "learning_rate": 9.542660225707335e-06, + "loss": 0.8863, + "step": 5072 + }, + { + "epoch": 0.2792118443502669, + "grad_norm": 0.7860418558120728, + "learning_rate": 9.542479099705109e-06, + "loss": 0.8335, + "step": 5073 + }, + { + "epoch": 0.2792668831526226, + "grad_norm": 0.8880825042724609, + "learning_rate": 9.542297939562825e-06, + "loss": 0.8344, + "step": 5074 + }, + { + "epoch": 0.27932192195497824, + "grad_norm": 0.7900505661964417, + "learning_rate": 9.542116745281849e-06, + "loss": 0.7613, + "step": 5075 + }, + { + "epoch": 0.2793769607573339, + "grad_norm": 0.7446081042289734, + "learning_rate": 9.541935516863536e-06, + "loss": 0.6615, + "step": 5076 + }, + { + "epoch": 0.27943199955968956, + "grad_norm": 0.7831308245658875, + "learning_rate": 9.541754254309254e-06, + "loss": 0.779, + "step": 5077 + }, + { + "epoch": 0.27948703836204525, + "grad_norm": 0.9007606506347656, + "learning_rate": 9.541572957620361e-06, + "loss": 0.8883, + "step": 5078 + }, + { + "epoch": 0.2795420771644009, + "grad_norm": 0.8033407330513, + "learning_rate": 9.541391626798222e-06, + "loss": 0.7354, + "step": 5079 + }, + { + "epoch": 0.27959711596675657, + "grad_norm": 0.9259470105171204, + "learning_rate": 9.5412102618442e-06, + "loss": 0.7602, + "step": 5080 + }, + { + "epoch": 0.2796521547691122, + "grad_norm": 0.786523163318634, + "learning_rate": 9.541028862759656e-06, + "loss": 0.7402, + "step": 5081 + }, + { + "epoch": 0.2797071935714679, + "grad_norm": 0.8053372502326965, + "learning_rate": 9.540847429545954e-06, + "loss": 0.825, + "step": 5082 + }, + { + "epoch": 0.2797622323738235, + "grad_norm": 0.8578022122383118, + "learning_rate": 9.54066596220446e-06, + "loss": 0.7866, + "step": 5083 + }, + { + "epoch": 0.2798172711761792, + "grad_norm": 0.916161835193634, + "learning_rate": 9.540484460736535e-06, + "loss": 0.5961, + "step": 5084 + }, + { + "epoch": 0.27987230997853485, + "grad_norm": 0.7843562960624695, + "learning_rate": 9.540302925143545e-06, + "loss": 0.764, + "step": 5085 + }, + { + "epoch": 0.27992734878089054, + "grad_norm": 0.7392510771751404, + "learning_rate": 9.540121355426852e-06, + "loss": 0.8038, + "step": 5086 + }, + { + "epoch": 0.2799823875832462, + "grad_norm": 0.7406296133995056, + "learning_rate": 9.539939751587825e-06, + "loss": 0.8202, + "step": 5087 + }, + { + "epoch": 0.28003742638560186, + "grad_norm": 0.7274924516677856, + "learning_rate": 9.539758113627823e-06, + "loss": 0.7691, + "step": 5088 + }, + { + "epoch": 0.2800924651879575, + "grad_norm": 0.8563184142112732, + "learning_rate": 9.539576441548218e-06, + "loss": 0.8341, + "step": 5089 + }, + { + "epoch": 0.2801475039903132, + "grad_norm": 0.7708351016044617, + "learning_rate": 9.539394735350366e-06, + "loss": 0.7126, + "step": 5090 + }, + { + "epoch": 0.2802025427926688, + "grad_norm": 0.7314836382865906, + "learning_rate": 9.539212995035642e-06, + "loss": 0.7465, + "step": 5091 + }, + { + "epoch": 0.2802575815950245, + "grad_norm": 0.7594754695892334, + "learning_rate": 9.539031220605409e-06, + "loss": 0.7563, + "step": 5092 + }, + { + "epoch": 0.28031262039738014, + "grad_norm": 0.699414074420929, + "learning_rate": 9.53884941206103e-06, + "loss": 0.7847, + "step": 5093 + }, + { + "epoch": 0.28036765919973583, + "grad_norm": 0.8013063073158264, + "learning_rate": 9.538667569403877e-06, + "loss": 0.7769, + "step": 5094 + }, + { + "epoch": 0.28042269800209146, + "grad_norm": 0.7778805494308472, + "learning_rate": 9.538485692635312e-06, + "loss": 0.7646, + "step": 5095 + }, + { + "epoch": 0.28047773680444715, + "grad_norm": 0.785649299621582, + "learning_rate": 9.538303781756702e-06, + "loss": 0.8162, + "step": 5096 + }, + { + "epoch": 0.2805327756068028, + "grad_norm": 0.7073212265968323, + "learning_rate": 9.538121836769417e-06, + "loss": 0.7208, + "step": 5097 + }, + { + "epoch": 0.2805878144091585, + "grad_norm": 0.7545642852783203, + "learning_rate": 9.53793985767482e-06, + "loss": 0.8673, + "step": 5098 + }, + { + "epoch": 0.2806428532115141, + "grad_norm": 0.6818416118621826, + "learning_rate": 9.537757844474285e-06, + "loss": 0.7576, + "step": 5099 + }, + { + "epoch": 0.2806978920138698, + "grad_norm": 0.6718038320541382, + "learning_rate": 9.537575797169176e-06, + "loss": 0.6683, + "step": 5100 + }, + { + "epoch": 0.28075293081622543, + "grad_norm": 0.7851004600524902, + "learning_rate": 9.53739371576086e-06, + "loss": 0.8871, + "step": 5101 + }, + { + "epoch": 0.2808079696185811, + "grad_norm": 0.7565650343894958, + "learning_rate": 9.53721160025071e-06, + "loss": 0.8799, + "step": 5102 + }, + { + "epoch": 0.28086300842093676, + "grad_norm": 0.7522932887077332, + "learning_rate": 9.537029450640091e-06, + "loss": 0.838, + "step": 5103 + }, + { + "epoch": 0.28091804722329244, + "grad_norm": 0.929634690284729, + "learning_rate": 9.536847266930375e-06, + "loss": 0.7997, + "step": 5104 + }, + { + "epoch": 0.2809730860256481, + "grad_norm": 0.8050084710121155, + "learning_rate": 9.536665049122928e-06, + "loss": 0.7652, + "step": 5105 + }, + { + "epoch": 0.28102812482800377, + "grad_norm": 0.7401233315467834, + "learning_rate": 9.53648279721912e-06, + "loss": 0.7904, + "step": 5106 + }, + { + "epoch": 0.2810831636303594, + "grad_norm": 0.7125453948974609, + "learning_rate": 9.536300511220322e-06, + "loss": 0.7349, + "step": 5107 + }, + { + "epoch": 0.2811382024327151, + "grad_norm": 0.7165758609771729, + "learning_rate": 9.536118191127905e-06, + "loss": 0.7314, + "step": 5108 + }, + { + "epoch": 0.2811932412350707, + "grad_norm": 0.7507439851760864, + "learning_rate": 9.535935836943237e-06, + "loss": 0.7603, + "step": 5109 + }, + { + "epoch": 0.2812482800374264, + "grad_norm": 0.7832109332084656, + "learning_rate": 9.535753448667688e-06, + "loss": 0.7279, + "step": 5110 + }, + { + "epoch": 0.28130331883978205, + "grad_norm": 0.7346609234809875, + "learning_rate": 9.535571026302633e-06, + "loss": 0.6882, + "step": 5111 + }, + { + "epoch": 0.2813583576421377, + "grad_norm": 0.7569608688354492, + "learning_rate": 9.535388569849437e-06, + "loss": 0.8451, + "step": 5112 + }, + { + "epoch": 0.28141339644449337, + "grad_norm": 0.7319865822792053, + "learning_rate": 9.535206079309478e-06, + "loss": 0.8161, + "step": 5113 + }, + { + "epoch": 0.281468435246849, + "grad_norm": 0.7744631171226501, + "learning_rate": 9.535023554684122e-06, + "loss": 0.8025, + "step": 5114 + }, + { + "epoch": 0.2815234740492047, + "grad_norm": 0.6867525577545166, + "learning_rate": 9.534840995974743e-06, + "loss": 0.7693, + "step": 5115 + }, + { + "epoch": 0.2815785128515603, + "grad_norm": 0.7625848054885864, + "learning_rate": 9.534658403182715e-06, + "loss": 0.8034, + "step": 5116 + }, + { + "epoch": 0.281633551653916, + "grad_norm": 0.7369832992553711, + "learning_rate": 9.534475776309406e-06, + "loss": 0.873, + "step": 5117 + }, + { + "epoch": 0.28168859045627165, + "grad_norm": 0.7267127633094788, + "learning_rate": 9.534293115356191e-06, + "loss": 0.7954, + "step": 5118 + }, + { + "epoch": 0.28174362925862734, + "grad_norm": 0.7244247794151306, + "learning_rate": 9.534110420324443e-06, + "loss": 0.7784, + "step": 5119 + }, + { + "epoch": 0.28179866806098297, + "grad_norm": 0.8207812905311584, + "learning_rate": 9.533927691215534e-06, + "loss": 0.8696, + "step": 5120 + }, + { + "epoch": 0.28185370686333866, + "grad_norm": 0.8669891357421875, + "learning_rate": 9.53374492803084e-06, + "loss": 0.8203, + "step": 5121 + }, + { + "epoch": 0.2819087456656943, + "grad_norm": 0.7650816440582275, + "learning_rate": 9.533562130771732e-06, + "loss": 0.77, + "step": 5122 + }, + { + "epoch": 0.28196378446805, + "grad_norm": 0.7664972543716431, + "learning_rate": 9.533379299439584e-06, + "loss": 0.7187, + "step": 5123 + }, + { + "epoch": 0.2820188232704056, + "grad_norm": 0.7921896576881409, + "learning_rate": 9.533196434035772e-06, + "loss": 0.8669, + "step": 5124 + }, + { + "epoch": 0.2820738620727613, + "grad_norm": 0.7714456915855408, + "learning_rate": 9.533013534561669e-06, + "loss": 0.8783, + "step": 5125 + }, + { + "epoch": 0.28212890087511694, + "grad_norm": 0.7222065329551697, + "learning_rate": 9.532830601018648e-06, + "loss": 0.7449, + "step": 5126 + }, + { + "epoch": 0.28218393967747263, + "grad_norm": 0.718142569065094, + "learning_rate": 9.532647633408085e-06, + "loss": 0.8226, + "step": 5127 + }, + { + "epoch": 0.28223897847982826, + "grad_norm": 0.730592668056488, + "learning_rate": 9.532464631731357e-06, + "loss": 0.7878, + "step": 5128 + }, + { + "epoch": 0.28229401728218395, + "grad_norm": 0.7841802835464478, + "learning_rate": 9.532281595989839e-06, + "loss": 0.8262, + "step": 5129 + }, + { + "epoch": 0.2823490560845396, + "grad_norm": 0.8617212772369385, + "learning_rate": 9.532098526184904e-06, + "loss": 0.8368, + "step": 5130 + }, + { + "epoch": 0.2824040948868953, + "grad_norm": 0.6968556642532349, + "learning_rate": 9.53191542231793e-06, + "loss": 0.6848, + "step": 5131 + }, + { + "epoch": 0.2824591336892509, + "grad_norm": 0.7872157096862793, + "learning_rate": 9.531732284390294e-06, + "loss": 0.7898, + "step": 5132 + }, + { + "epoch": 0.2825141724916066, + "grad_norm": 0.7727276086807251, + "learning_rate": 9.53154911240337e-06, + "loss": 0.8506, + "step": 5133 + }, + { + "epoch": 0.28256921129396223, + "grad_norm": 0.7279896140098572, + "learning_rate": 9.531365906358536e-06, + "loss": 0.7415, + "step": 5134 + }, + { + "epoch": 0.2826242500963179, + "grad_norm": 0.7457457780838013, + "learning_rate": 9.53118266625717e-06, + "loss": 0.7652, + "step": 5135 + }, + { + "epoch": 0.28267928889867355, + "grad_norm": 0.8989270329475403, + "learning_rate": 9.530999392100646e-06, + "loss": 0.9085, + "step": 5136 + }, + { + "epoch": 0.28273432770102924, + "grad_norm": 0.9622626304626465, + "learning_rate": 9.530816083890347e-06, + "loss": 0.8726, + "step": 5137 + }, + { + "epoch": 0.2827893665033849, + "grad_norm": 0.7712846994400024, + "learning_rate": 9.530632741627643e-06, + "loss": 0.765, + "step": 5138 + }, + { + "epoch": 0.28284440530574056, + "grad_norm": 0.8320727348327637, + "learning_rate": 9.530449365313918e-06, + "loss": 0.7828, + "step": 5139 + }, + { + "epoch": 0.2828994441080962, + "grad_norm": 0.9310963153839111, + "learning_rate": 9.530265954950549e-06, + "loss": 0.8482, + "step": 5140 + }, + { + "epoch": 0.2829544829104519, + "grad_norm": 0.9984502792358398, + "learning_rate": 9.530082510538914e-06, + "loss": 0.8673, + "step": 5141 + }, + { + "epoch": 0.2830095217128075, + "grad_norm": 0.8300992250442505, + "learning_rate": 9.52989903208039e-06, + "loss": 0.8232, + "step": 5142 + }, + { + "epoch": 0.2830645605151632, + "grad_norm": 0.930052638053894, + "learning_rate": 9.529715519576356e-06, + "loss": 0.7766, + "step": 5143 + }, + { + "epoch": 0.28311959931751884, + "grad_norm": 0.8038359880447388, + "learning_rate": 9.529531973028194e-06, + "loss": 0.712, + "step": 5144 + }, + { + "epoch": 0.28317463811987453, + "grad_norm": 0.856250524520874, + "learning_rate": 9.529348392437283e-06, + "loss": 0.8578, + "step": 5145 + }, + { + "epoch": 0.28322967692223017, + "grad_norm": 0.7602483630180359, + "learning_rate": 9.529164777805002e-06, + "loss": 0.749, + "step": 5146 + }, + { + "epoch": 0.28328471572458586, + "grad_norm": 0.8946549892425537, + "learning_rate": 9.52898112913273e-06, + "loss": 0.8101, + "step": 5147 + }, + { + "epoch": 0.2833397545269415, + "grad_norm": 0.8015615344047546, + "learning_rate": 9.52879744642185e-06, + "loss": 0.8203, + "step": 5148 + }, + { + "epoch": 0.2833947933292972, + "grad_norm": 0.7767183780670166, + "learning_rate": 9.528613729673738e-06, + "loss": 0.8409, + "step": 5149 + }, + { + "epoch": 0.2834498321316528, + "grad_norm": 0.7604000568389893, + "learning_rate": 9.52842997888978e-06, + "loss": 0.8853, + "step": 5150 + }, + { + "epoch": 0.2835048709340085, + "grad_norm": 0.7079401016235352, + "learning_rate": 9.528246194071353e-06, + "loss": 0.6855, + "step": 5151 + }, + { + "epoch": 0.28355990973636414, + "grad_norm": 0.7616782188415527, + "learning_rate": 9.52806237521984e-06, + "loss": 0.785, + "step": 5152 + }, + { + "epoch": 0.2836149485387198, + "grad_norm": 0.7408583760261536, + "learning_rate": 9.527878522336622e-06, + "loss": 0.7105, + "step": 5153 + }, + { + "epoch": 0.28366998734107546, + "grad_norm": 0.694821834564209, + "learning_rate": 9.52769463542308e-06, + "loss": 0.6552, + "step": 5154 + }, + { + "epoch": 0.2837250261434311, + "grad_norm": 0.796925961971283, + "learning_rate": 9.5275107144806e-06, + "loss": 0.7122, + "step": 5155 + }, + { + "epoch": 0.2837800649457868, + "grad_norm": 0.8001971244812012, + "learning_rate": 9.527326759510558e-06, + "loss": 0.8528, + "step": 5156 + }, + { + "epoch": 0.2838351037481424, + "grad_norm": 0.8605831265449524, + "learning_rate": 9.527142770514341e-06, + "loss": 0.7948, + "step": 5157 + }, + { + "epoch": 0.2838901425504981, + "grad_norm": 0.8380078077316284, + "learning_rate": 9.526958747493334e-06, + "loss": 0.8184, + "step": 5158 + }, + { + "epoch": 0.28394518135285374, + "grad_norm": 0.8758485317230225, + "learning_rate": 9.526774690448913e-06, + "loss": 0.7625, + "step": 5159 + }, + { + "epoch": 0.2840002201552094, + "grad_norm": 0.7078989744186401, + "learning_rate": 9.526590599382466e-06, + "loss": 0.8179, + "step": 5160 + }, + { + "epoch": 0.28405525895756506, + "grad_norm": 0.6668990850448608, + "learning_rate": 9.526406474295376e-06, + "loss": 0.7169, + "step": 5161 + }, + { + "epoch": 0.28411029775992075, + "grad_norm": 0.7666084170341492, + "learning_rate": 9.526222315189026e-06, + "loss": 0.8511, + "step": 5162 + }, + { + "epoch": 0.2841653365622764, + "grad_norm": 0.7390545606613159, + "learning_rate": 9.526038122064802e-06, + "loss": 0.7926, + "step": 5163 + }, + { + "epoch": 0.28422037536463207, + "grad_norm": 0.7972092032432556, + "learning_rate": 9.525853894924086e-06, + "loss": 0.9166, + "step": 5164 + }, + { + "epoch": 0.2842754141669877, + "grad_norm": 0.8988455533981323, + "learning_rate": 9.525669633768265e-06, + "loss": 0.9497, + "step": 5165 + }, + { + "epoch": 0.2843304529693434, + "grad_norm": 0.7092710137367249, + "learning_rate": 9.525485338598722e-06, + "loss": 0.7241, + "step": 5166 + }, + { + "epoch": 0.28438549177169903, + "grad_norm": 0.8630063533782959, + "learning_rate": 9.525301009416843e-06, + "loss": 0.8318, + "step": 5167 + }, + { + "epoch": 0.2844405305740547, + "grad_norm": 0.7336890697479248, + "learning_rate": 9.52511664622401e-06, + "loss": 0.7077, + "step": 5168 + }, + { + "epoch": 0.28449556937641035, + "grad_norm": 0.8156722784042358, + "learning_rate": 9.524932249021615e-06, + "loss": 0.8573, + "step": 5169 + }, + { + "epoch": 0.28455060817876604, + "grad_norm": 0.7061388492584229, + "learning_rate": 9.524747817811038e-06, + "loss": 0.7432, + "step": 5170 + }, + { + "epoch": 0.2846056469811217, + "grad_norm": 0.7948413491249084, + "learning_rate": 9.52456335259367e-06, + "loss": 0.8082, + "step": 5171 + }, + { + "epoch": 0.28466068578347736, + "grad_norm": 0.7208091020584106, + "learning_rate": 9.524378853370893e-06, + "loss": 0.7027, + "step": 5172 + }, + { + "epoch": 0.284715724585833, + "grad_norm": 0.8377540111541748, + "learning_rate": 9.524194320144096e-06, + "loss": 0.7093, + "step": 5173 + }, + { + "epoch": 0.2847707633881887, + "grad_norm": 0.8734563589096069, + "learning_rate": 9.524009752914666e-06, + "loss": 0.8422, + "step": 5174 + }, + { + "epoch": 0.2848258021905443, + "grad_norm": 0.7303940653800964, + "learning_rate": 9.523825151683989e-06, + "loss": 0.811, + "step": 5175 + }, + { + "epoch": 0.2848808409929, + "grad_norm": 0.7653842568397522, + "learning_rate": 9.523640516453455e-06, + "loss": 0.8595, + "step": 5176 + }, + { + "epoch": 0.28493587979525564, + "grad_norm": 0.7366930246353149, + "learning_rate": 9.523455847224448e-06, + "loss": 0.7832, + "step": 5177 + }, + { + "epoch": 0.28499091859761133, + "grad_norm": 0.7908505797386169, + "learning_rate": 9.523271143998357e-06, + "loss": 0.8115, + "step": 5178 + }, + { + "epoch": 0.28504595739996696, + "grad_norm": 0.8176048398017883, + "learning_rate": 9.523086406776572e-06, + "loss": 0.8377, + "step": 5179 + }, + { + "epoch": 0.28510099620232265, + "grad_norm": 0.724086344242096, + "learning_rate": 9.52290163556048e-06, + "loss": 0.7804, + "step": 5180 + }, + { + "epoch": 0.2851560350046783, + "grad_norm": 0.6461299657821655, + "learning_rate": 9.52271683035147e-06, + "loss": 0.5727, + "step": 5181 + }, + { + "epoch": 0.285211073807034, + "grad_norm": 0.7275353074073792, + "learning_rate": 9.522531991150932e-06, + "loss": 0.8345, + "step": 5182 + }, + { + "epoch": 0.2852661126093896, + "grad_norm": 0.7321951985359192, + "learning_rate": 9.522347117960253e-06, + "loss": 0.8832, + "step": 5183 + }, + { + "epoch": 0.2853211514117453, + "grad_norm": 0.7526552677154541, + "learning_rate": 9.522162210780825e-06, + "loss": 0.831, + "step": 5184 + }, + { + "epoch": 0.28537619021410093, + "grad_norm": 0.7592381238937378, + "learning_rate": 9.521977269614036e-06, + "loss": 0.7293, + "step": 5185 + }, + { + "epoch": 0.2854312290164566, + "grad_norm": 0.8060448169708252, + "learning_rate": 9.521792294461274e-06, + "loss": 0.819, + "step": 5186 + }, + { + "epoch": 0.28548626781881226, + "grad_norm": 0.7178553342819214, + "learning_rate": 9.521607285323932e-06, + "loss": 0.7526, + "step": 5187 + }, + { + "epoch": 0.28554130662116795, + "grad_norm": 0.8186969757080078, + "learning_rate": 9.521422242203401e-06, + "loss": 0.8526, + "step": 5188 + }, + { + "epoch": 0.2855963454235236, + "grad_norm": 0.8480883240699768, + "learning_rate": 9.521237165101071e-06, + "loss": 0.8088, + "step": 5189 + }, + { + "epoch": 0.28565138422587927, + "grad_norm": 0.8053719401359558, + "learning_rate": 9.521052054018333e-06, + "loss": 0.928, + "step": 5190 + }, + { + "epoch": 0.2857064230282349, + "grad_norm": 0.6937163472175598, + "learning_rate": 9.52086690895658e-06, + "loss": 0.7418, + "step": 5191 + }, + { + "epoch": 0.2857614618305906, + "grad_norm": 1.0616179704666138, + "learning_rate": 9.520681729917196e-06, + "loss": 0.8726, + "step": 5192 + }, + { + "epoch": 0.2858165006329462, + "grad_norm": 0.7504106163978577, + "learning_rate": 9.520496516901582e-06, + "loss": 0.844, + "step": 5193 + }, + { + "epoch": 0.2858715394353019, + "grad_norm": 0.7634509205818176, + "learning_rate": 9.520311269911127e-06, + "loss": 0.7595, + "step": 5194 + }, + { + "epoch": 0.28592657823765755, + "grad_norm": 0.7069799900054932, + "learning_rate": 9.52012598894722e-06, + "loss": 0.7566, + "step": 5195 + }, + { + "epoch": 0.28598161704001324, + "grad_norm": 0.695737361907959, + "learning_rate": 9.519940674011256e-06, + "loss": 0.7534, + "step": 5196 + }, + { + "epoch": 0.28603665584236887, + "grad_norm": 0.7212124466896057, + "learning_rate": 9.51975532510463e-06, + "loss": 0.8237, + "step": 5197 + }, + { + "epoch": 0.2860916946447245, + "grad_norm": 0.7274062633514404, + "learning_rate": 9.519569942228732e-06, + "loss": 0.756, + "step": 5198 + }, + { + "epoch": 0.2861467334470802, + "grad_norm": 0.7038697600364685, + "learning_rate": 9.519384525384956e-06, + "loss": 0.7308, + "step": 5199 + }, + { + "epoch": 0.2862017722494358, + "grad_norm": 0.6897109150886536, + "learning_rate": 9.519199074574694e-06, + "loss": 0.7858, + "step": 5200 + }, + { + "epoch": 0.2862568110517915, + "grad_norm": 0.8471527099609375, + "learning_rate": 9.519013589799343e-06, + "loss": 0.8198, + "step": 5201 + }, + { + "epoch": 0.28631184985414715, + "grad_norm": 0.6828129291534424, + "learning_rate": 9.518828071060295e-06, + "loss": 0.7734, + "step": 5202 + }, + { + "epoch": 0.28636688865650284, + "grad_norm": 0.7437755465507507, + "learning_rate": 9.518642518358946e-06, + "loss": 0.7669, + "step": 5203 + }, + { + "epoch": 0.28642192745885847, + "grad_norm": 0.8841923475265503, + "learning_rate": 9.518456931696689e-06, + "loss": 0.8201, + "step": 5204 + }, + { + "epoch": 0.28647696626121416, + "grad_norm": 0.9514154195785522, + "learning_rate": 9.518271311074917e-06, + "loss": 0.7864, + "step": 5205 + }, + { + "epoch": 0.2865320050635698, + "grad_norm": 0.830795407295227, + "learning_rate": 9.51808565649503e-06, + "loss": 0.8024, + "step": 5206 + }, + { + "epoch": 0.2865870438659255, + "grad_norm": 0.7274934649467468, + "learning_rate": 9.51789996795842e-06, + "loss": 0.7631, + "step": 5207 + }, + { + "epoch": 0.2866420826682811, + "grad_norm": 0.7004290223121643, + "learning_rate": 9.517714245466482e-06, + "loss": 0.7344, + "step": 5208 + }, + { + "epoch": 0.2866971214706368, + "grad_norm": 0.8559010624885559, + "learning_rate": 9.517528489020614e-06, + "loss": 0.7502, + "step": 5209 + }, + { + "epoch": 0.28675216027299244, + "grad_norm": 0.8913494348526001, + "learning_rate": 9.517342698622212e-06, + "loss": 0.8908, + "step": 5210 + }, + { + "epoch": 0.28680719907534813, + "grad_norm": 0.8375207781791687, + "learning_rate": 9.51715687427267e-06, + "loss": 0.7701, + "step": 5211 + }, + { + "epoch": 0.28686223787770376, + "grad_norm": 1.1804776191711426, + "learning_rate": 9.516971015973386e-06, + "loss": 0.8449, + "step": 5212 + }, + { + "epoch": 0.28691727668005945, + "grad_norm": 0.7260473370552063, + "learning_rate": 9.516785123725758e-06, + "loss": 0.7978, + "step": 5213 + }, + { + "epoch": 0.2869723154824151, + "grad_norm": 0.8159041404724121, + "learning_rate": 9.516599197531182e-06, + "loss": 0.7454, + "step": 5214 + }, + { + "epoch": 0.2870273542847708, + "grad_norm": 0.7850227952003479, + "learning_rate": 9.516413237391056e-06, + "loss": 0.8082, + "step": 5215 + }, + { + "epoch": 0.2870823930871264, + "grad_norm": 0.7596960067749023, + "learning_rate": 9.516227243306774e-06, + "loss": 0.7286, + "step": 5216 + }, + { + "epoch": 0.2871374318894821, + "grad_norm": 0.8763321042060852, + "learning_rate": 9.516041215279741e-06, + "loss": 0.8685, + "step": 5217 + }, + { + "epoch": 0.28719247069183773, + "grad_norm": 1.2130110263824463, + "learning_rate": 9.515855153311349e-06, + "loss": 0.8374, + "step": 5218 + }, + { + "epoch": 0.2872475094941934, + "grad_norm": 0.7578628063201904, + "learning_rate": 9.515669057402999e-06, + "loss": 0.793, + "step": 5219 + }, + { + "epoch": 0.28730254829654905, + "grad_norm": 0.9085225462913513, + "learning_rate": 9.515482927556088e-06, + "loss": 0.8366, + "step": 5220 + }, + { + "epoch": 0.28735758709890474, + "grad_norm": 0.7107900977134705, + "learning_rate": 9.515296763772017e-06, + "loss": 0.6571, + "step": 5221 + }, + { + "epoch": 0.2874126259012604, + "grad_norm": 0.7742018699645996, + "learning_rate": 9.515110566052183e-06, + "loss": 0.8387, + "step": 5222 + }, + { + "epoch": 0.28746766470361607, + "grad_norm": 0.8934319615364075, + "learning_rate": 9.514924334397987e-06, + "loss": 0.8546, + "step": 5223 + }, + { + "epoch": 0.2875227035059717, + "grad_norm": 0.720245897769928, + "learning_rate": 9.51473806881083e-06, + "loss": 0.7459, + "step": 5224 + }, + { + "epoch": 0.2875777423083274, + "grad_norm": 0.7074370384216309, + "learning_rate": 9.514551769292109e-06, + "loss": 0.8598, + "step": 5225 + }, + { + "epoch": 0.287632781110683, + "grad_norm": 0.7608621120452881, + "learning_rate": 9.514365435843226e-06, + "loss": 0.7263, + "step": 5226 + }, + { + "epoch": 0.2876878199130387, + "grad_norm": 0.7581011652946472, + "learning_rate": 9.51417906846558e-06, + "loss": 0.7498, + "step": 5227 + }, + { + "epoch": 0.28774285871539435, + "grad_norm": 0.8184412121772766, + "learning_rate": 9.513992667160572e-06, + "loss": 0.6889, + "step": 5228 + }, + { + "epoch": 0.28779789751775003, + "grad_norm": 0.6835145354270935, + "learning_rate": 9.513806231929605e-06, + "loss": 0.7399, + "step": 5229 + }, + { + "epoch": 0.28785293632010567, + "grad_norm": 0.7601536512374878, + "learning_rate": 9.513619762774077e-06, + "loss": 0.846, + "step": 5230 + }, + { + "epoch": 0.28790797512246136, + "grad_norm": 0.781491219997406, + "learning_rate": 9.513433259695392e-06, + "loss": 0.8326, + "step": 5231 + }, + { + "epoch": 0.287963013924817, + "grad_norm": 0.7978106141090393, + "learning_rate": 9.513246722694951e-06, + "loss": 0.7917, + "step": 5232 + }, + { + "epoch": 0.2880180527271727, + "grad_norm": 0.8071381449699402, + "learning_rate": 9.513060151774156e-06, + "loss": 0.8054, + "step": 5233 + }, + { + "epoch": 0.2880730915295283, + "grad_norm": 0.815567135810852, + "learning_rate": 9.512873546934406e-06, + "loss": 0.8647, + "step": 5234 + }, + { + "epoch": 0.288128130331884, + "grad_norm": 0.8255048990249634, + "learning_rate": 9.512686908177111e-06, + "loss": 0.9011, + "step": 5235 + }, + { + "epoch": 0.28818316913423964, + "grad_norm": 0.8392062187194824, + "learning_rate": 9.512500235503666e-06, + "loss": 0.8778, + "step": 5236 + }, + { + "epoch": 0.2882382079365953, + "grad_norm": 0.7256191372871399, + "learning_rate": 9.512313528915478e-06, + "loss": 0.7231, + "step": 5237 + }, + { + "epoch": 0.28829324673895096, + "grad_norm": 0.9041032195091248, + "learning_rate": 9.51212678841395e-06, + "loss": 0.8469, + "step": 5238 + }, + { + "epoch": 0.28834828554130665, + "grad_norm": 0.7857525944709778, + "learning_rate": 9.511940014000485e-06, + "loss": 0.7447, + "step": 5239 + }, + { + "epoch": 0.2884033243436623, + "grad_norm": 0.6925225257873535, + "learning_rate": 9.511753205676485e-06, + "loss": 0.8302, + "step": 5240 + }, + { + "epoch": 0.2884583631460179, + "grad_norm": 0.7253623008728027, + "learning_rate": 9.511566363443356e-06, + "loss": 0.8373, + "step": 5241 + }, + { + "epoch": 0.2885134019483736, + "grad_norm": 0.7198607921600342, + "learning_rate": 9.511379487302504e-06, + "loss": 0.79, + "step": 5242 + }, + { + "epoch": 0.28856844075072924, + "grad_norm": 0.7966421246528625, + "learning_rate": 9.511192577255328e-06, + "loss": 0.7933, + "step": 5243 + }, + { + "epoch": 0.2886234795530849, + "grad_norm": 0.9159359931945801, + "learning_rate": 9.511005633303239e-06, + "loss": 0.7254, + "step": 5244 + }, + { + "epoch": 0.28867851835544056, + "grad_norm": 0.9514481425285339, + "learning_rate": 9.510818655447638e-06, + "loss": 0.8916, + "step": 5245 + }, + { + "epoch": 0.28873355715779625, + "grad_norm": 0.7505099773406982, + "learning_rate": 9.510631643689932e-06, + "loss": 0.765, + "step": 5246 + }, + { + "epoch": 0.2887885959601519, + "grad_norm": 0.7824658751487732, + "learning_rate": 9.510444598031526e-06, + "loss": 0.6972, + "step": 5247 + }, + { + "epoch": 0.2888436347625076, + "grad_norm": 0.7778681516647339, + "learning_rate": 9.510257518473824e-06, + "loss": 0.8705, + "step": 5248 + }, + { + "epoch": 0.2888986735648632, + "grad_norm": 0.6785199642181396, + "learning_rate": 9.510070405018235e-06, + "loss": 0.6889, + "step": 5249 + }, + { + "epoch": 0.2889537123672189, + "grad_norm": 0.7045316100120544, + "learning_rate": 9.509883257666164e-06, + "loss": 0.7979, + "step": 5250 + }, + { + "epoch": 0.28900875116957453, + "grad_norm": 1.3174562454223633, + "learning_rate": 9.509696076419018e-06, + "loss": 0.8802, + "step": 5251 + }, + { + "epoch": 0.2890637899719302, + "grad_norm": 1.1800767183303833, + "learning_rate": 9.509508861278205e-06, + "loss": 0.9246, + "step": 5252 + }, + { + "epoch": 0.28911882877428585, + "grad_norm": 0.7057580947875977, + "learning_rate": 9.509321612245128e-06, + "loss": 0.7565, + "step": 5253 + }, + { + "epoch": 0.28917386757664154, + "grad_norm": 0.7681905031204224, + "learning_rate": 9.509134329321197e-06, + "loss": 0.8678, + "step": 5254 + }, + { + "epoch": 0.2892289063789972, + "grad_norm": 0.96025550365448, + "learning_rate": 9.50894701250782e-06, + "loss": 0.9108, + "step": 5255 + }, + { + "epoch": 0.28928394518135286, + "grad_norm": 0.7786841988563538, + "learning_rate": 9.508759661806405e-06, + "loss": 0.7747, + "step": 5256 + }, + { + "epoch": 0.2893389839837085, + "grad_norm": 0.7073540091514587, + "learning_rate": 9.508572277218358e-06, + "loss": 0.7573, + "step": 5257 + }, + { + "epoch": 0.2893940227860642, + "grad_norm": 0.6648856401443481, + "learning_rate": 9.50838485874509e-06, + "loss": 0.7294, + "step": 5258 + }, + { + "epoch": 0.2894490615884198, + "grad_norm": 0.6794270873069763, + "learning_rate": 9.508197406388007e-06, + "loss": 0.7001, + "step": 5259 + }, + { + "epoch": 0.2895041003907755, + "grad_norm": 0.6819350123405457, + "learning_rate": 9.50800992014852e-06, + "loss": 0.7114, + "step": 5260 + }, + { + "epoch": 0.28955913919313114, + "grad_norm": 0.6616997122764587, + "learning_rate": 9.507822400028036e-06, + "loss": 0.7108, + "step": 5261 + }, + { + "epoch": 0.28961417799548683, + "grad_norm": 0.7447230219841003, + "learning_rate": 9.507634846027966e-06, + "loss": 0.7865, + "step": 5262 + }, + { + "epoch": 0.28966921679784247, + "grad_norm": 0.7826278209686279, + "learning_rate": 9.50744725814972e-06, + "loss": 0.7922, + "step": 5263 + }, + { + "epoch": 0.28972425560019816, + "grad_norm": 0.8054459095001221, + "learning_rate": 9.507259636394706e-06, + "loss": 0.795, + "step": 5264 + }, + { + "epoch": 0.2897792944025538, + "grad_norm": 0.9539191722869873, + "learning_rate": 9.507071980764335e-06, + "loss": 0.9495, + "step": 5265 + }, + { + "epoch": 0.2898343332049095, + "grad_norm": 0.8877993226051331, + "learning_rate": 9.506884291260017e-06, + "loss": 0.8418, + "step": 5266 + }, + { + "epoch": 0.2898893720072651, + "grad_norm": 0.6620327234268188, + "learning_rate": 9.506696567883164e-06, + "loss": 0.6285, + "step": 5267 + }, + { + "epoch": 0.2899444108096208, + "grad_norm": 0.7604434490203857, + "learning_rate": 9.506508810635187e-06, + "loss": 0.8562, + "step": 5268 + }, + { + "epoch": 0.28999944961197643, + "grad_norm": 0.8181812763214111, + "learning_rate": 9.506321019517494e-06, + "loss": 0.905, + "step": 5269 + }, + { + "epoch": 0.2900544884143321, + "grad_norm": 0.7776391506195068, + "learning_rate": 9.5061331945315e-06, + "loss": 0.8871, + "step": 5270 + }, + { + "epoch": 0.29010952721668776, + "grad_norm": 0.8125039339065552, + "learning_rate": 9.505945335678613e-06, + "loss": 0.7254, + "step": 5271 + }, + { + "epoch": 0.29016456601904345, + "grad_norm": 0.7229846715927124, + "learning_rate": 9.50575744296025e-06, + "loss": 0.8192, + "step": 5272 + }, + { + "epoch": 0.2902196048213991, + "grad_norm": 0.72443026304245, + "learning_rate": 9.505569516377817e-06, + "loss": 0.7813, + "step": 5273 + }, + { + "epoch": 0.29027464362375477, + "grad_norm": 0.6798073053359985, + "learning_rate": 9.505381555932731e-06, + "loss": 0.7655, + "step": 5274 + }, + { + "epoch": 0.2903296824261104, + "grad_norm": 1.0805624723434448, + "learning_rate": 9.505193561626404e-06, + "loss": 0.9035, + "step": 5275 + }, + { + "epoch": 0.2903847212284661, + "grad_norm": 0.7579694986343384, + "learning_rate": 9.505005533460247e-06, + "loss": 0.8612, + "step": 5276 + }, + { + "epoch": 0.2904397600308217, + "grad_norm": 1.2496099472045898, + "learning_rate": 9.504817471435676e-06, + "loss": 0.813, + "step": 5277 + }, + { + "epoch": 0.2904947988331774, + "grad_norm": 0.6915673017501831, + "learning_rate": 9.504629375554102e-06, + "loss": 0.6891, + "step": 5278 + }, + { + "epoch": 0.29054983763553305, + "grad_norm": 0.8581767082214355, + "learning_rate": 9.504441245816937e-06, + "loss": 0.7137, + "step": 5279 + }, + { + "epoch": 0.29060487643788874, + "grad_norm": 0.7469545006752014, + "learning_rate": 9.504253082225601e-06, + "loss": 0.7621, + "step": 5280 + }, + { + "epoch": 0.29065991524024437, + "grad_norm": 0.7725615501403809, + "learning_rate": 9.504064884781503e-06, + "loss": 0.7988, + "step": 5281 + }, + { + "epoch": 0.29071495404260006, + "grad_norm": 1.0187722444534302, + "learning_rate": 9.503876653486058e-06, + "loss": 0.7772, + "step": 5282 + }, + { + "epoch": 0.2907699928449557, + "grad_norm": 0.675574779510498, + "learning_rate": 9.503688388340683e-06, + "loss": 0.7096, + "step": 5283 + }, + { + "epoch": 0.2908250316473113, + "grad_norm": 0.7980207800865173, + "learning_rate": 9.503500089346792e-06, + "loss": 0.8291, + "step": 5284 + }, + { + "epoch": 0.290880070449667, + "grad_norm": 0.6891655325889587, + "learning_rate": 9.503311756505797e-06, + "loss": 0.7186, + "step": 5285 + }, + { + "epoch": 0.29093510925202265, + "grad_norm": 0.7273408770561218, + "learning_rate": 9.50312338981912e-06, + "loss": 0.7483, + "step": 5286 + }, + { + "epoch": 0.29099014805437834, + "grad_norm": 0.7346869111061096, + "learning_rate": 9.50293498928817e-06, + "loss": 0.766, + "step": 5287 + }, + { + "epoch": 0.291045186856734, + "grad_norm": 0.7627394795417786, + "learning_rate": 9.502746554914368e-06, + "loss": 0.867, + "step": 5288 + }, + { + "epoch": 0.29110022565908966, + "grad_norm": 0.8477200865745544, + "learning_rate": 9.502558086699128e-06, + "loss": 0.8317, + "step": 5289 + }, + { + "epoch": 0.2911552644614453, + "grad_norm": 0.7696006894111633, + "learning_rate": 9.502369584643867e-06, + "loss": 0.7814, + "step": 5290 + }, + { + "epoch": 0.291210303263801, + "grad_norm": 0.7614455819129944, + "learning_rate": 9.502181048749999e-06, + "loss": 0.7398, + "step": 5291 + }, + { + "epoch": 0.2912653420661566, + "grad_norm": 0.7877628207206726, + "learning_rate": 9.501992479018946e-06, + "loss": 0.8731, + "step": 5292 + }, + { + "epoch": 0.2913203808685123, + "grad_norm": 0.7455846667289734, + "learning_rate": 9.50180387545212e-06, + "loss": 0.7059, + "step": 5293 + }, + { + "epoch": 0.29137541967086794, + "grad_norm": 1.145520567893982, + "learning_rate": 9.501615238050944e-06, + "loss": 0.6968, + "step": 5294 + }, + { + "epoch": 0.29143045847322363, + "grad_norm": 0.8100234866142273, + "learning_rate": 9.501426566816831e-06, + "loss": 0.8122, + "step": 5295 + }, + { + "epoch": 0.29148549727557926, + "grad_norm": 0.6813066005706787, + "learning_rate": 9.501237861751203e-06, + "loss": 0.6718, + "step": 5296 + }, + { + "epoch": 0.29154053607793495, + "grad_norm": 0.7400195002555847, + "learning_rate": 9.501049122855473e-06, + "loss": 0.802, + "step": 5297 + }, + { + "epoch": 0.2915955748802906, + "grad_norm": 0.7948681712150574, + "learning_rate": 9.500860350131065e-06, + "loss": 0.8237, + "step": 5298 + }, + { + "epoch": 0.2916506136826463, + "grad_norm": 0.772093653678894, + "learning_rate": 9.500671543579394e-06, + "loss": 0.7687, + "step": 5299 + }, + { + "epoch": 0.2917056524850019, + "grad_norm": 0.7468486428260803, + "learning_rate": 9.500482703201881e-06, + "loss": 0.7827, + "step": 5300 + }, + { + "epoch": 0.2917606912873576, + "grad_norm": 0.7284440398216248, + "learning_rate": 9.500293828999945e-06, + "loss": 0.8086, + "step": 5301 + }, + { + "epoch": 0.29181573008971323, + "grad_norm": 0.8014211654663086, + "learning_rate": 9.500104920975005e-06, + "loss": 0.8409, + "step": 5302 + }, + { + "epoch": 0.2918707688920689, + "grad_norm": 0.7588346004486084, + "learning_rate": 9.49991597912848e-06, + "loss": 0.7149, + "step": 5303 + }, + { + "epoch": 0.29192580769442456, + "grad_norm": 0.8098518252372742, + "learning_rate": 9.499727003461794e-06, + "loss": 0.8375, + "step": 5304 + }, + { + "epoch": 0.29198084649678024, + "grad_norm": 0.8502426743507385, + "learning_rate": 9.499537993976363e-06, + "loss": 0.8177, + "step": 5305 + }, + { + "epoch": 0.2920358852991359, + "grad_norm": 0.8010903596878052, + "learning_rate": 9.499348950673607e-06, + "loss": 0.8457, + "step": 5306 + }, + { + "epoch": 0.29209092410149157, + "grad_norm": 0.6628156304359436, + "learning_rate": 9.49915987355495e-06, + "loss": 0.7327, + "step": 5307 + }, + { + "epoch": 0.2921459629038472, + "grad_norm": 0.7414939999580383, + "learning_rate": 9.49897076262181e-06, + "loss": 0.8271, + "step": 5308 + }, + { + "epoch": 0.2922010017062029, + "grad_norm": 0.7490847706794739, + "learning_rate": 9.498781617875613e-06, + "loss": 0.7689, + "step": 5309 + }, + { + "epoch": 0.2922560405085585, + "grad_norm": 0.7913424968719482, + "learning_rate": 9.498592439317777e-06, + "loss": 0.8571, + "step": 5310 + }, + { + "epoch": 0.2923110793109142, + "grad_norm": 0.6903867125511169, + "learning_rate": 9.498403226949724e-06, + "loss": 0.7325, + "step": 5311 + }, + { + "epoch": 0.29236611811326985, + "grad_norm": 0.8087130188941956, + "learning_rate": 9.498213980772875e-06, + "loss": 0.8167, + "step": 5312 + }, + { + "epoch": 0.29242115691562554, + "grad_norm": 1.1316752433776855, + "learning_rate": 9.498024700788655e-06, + "loss": 0.912, + "step": 5313 + }, + { + "epoch": 0.29247619571798117, + "grad_norm": 0.8701719045639038, + "learning_rate": 9.497835386998486e-06, + "loss": 0.8728, + "step": 5314 + }, + { + "epoch": 0.29253123452033686, + "grad_norm": 0.6688953638076782, + "learning_rate": 9.49764603940379e-06, + "loss": 0.6561, + "step": 5315 + }, + { + "epoch": 0.2925862733226925, + "grad_norm": 0.8067505359649658, + "learning_rate": 9.49745665800599e-06, + "loss": 0.8419, + "step": 5316 + }, + { + "epoch": 0.2926413121250482, + "grad_norm": 0.7157390117645264, + "learning_rate": 9.49726724280651e-06, + "loss": 0.7964, + "step": 5317 + }, + { + "epoch": 0.2926963509274038, + "grad_norm": 0.7038627862930298, + "learning_rate": 9.497077793806772e-06, + "loss": 0.7343, + "step": 5318 + }, + { + "epoch": 0.2927513897297595, + "grad_norm": 0.7674478888511658, + "learning_rate": 9.4968883110082e-06, + "loss": 0.7624, + "step": 5319 + }, + { + "epoch": 0.29280642853211514, + "grad_norm": 0.6708847284317017, + "learning_rate": 9.496698794412223e-06, + "loss": 0.6554, + "step": 5320 + }, + { + "epoch": 0.2928614673344708, + "grad_norm": 0.8332329392433167, + "learning_rate": 9.49650924402026e-06, + "loss": 0.9357, + "step": 5321 + }, + { + "epoch": 0.29291650613682646, + "grad_norm": 0.7601341605186462, + "learning_rate": 9.496319659833737e-06, + "loss": 0.8208, + "step": 5322 + }, + { + "epoch": 0.29297154493918215, + "grad_norm": 0.8320396542549133, + "learning_rate": 9.496130041854077e-06, + "loss": 0.8423, + "step": 5323 + }, + { + "epoch": 0.2930265837415378, + "grad_norm": 0.8242839574813843, + "learning_rate": 9.49594039008271e-06, + "loss": 0.9101, + "step": 5324 + }, + { + "epoch": 0.29308162254389347, + "grad_norm": 0.8906320333480835, + "learning_rate": 9.495750704521058e-06, + "loss": 0.7343, + "step": 5325 + }, + { + "epoch": 0.2931366613462491, + "grad_norm": 0.7964318990707397, + "learning_rate": 9.495560985170546e-06, + "loss": 0.7789, + "step": 5326 + }, + { + "epoch": 0.29319170014860474, + "grad_norm": 0.8267771601676941, + "learning_rate": 9.495371232032602e-06, + "loss": 0.7447, + "step": 5327 + }, + { + "epoch": 0.29324673895096043, + "grad_norm": 0.8120046257972717, + "learning_rate": 9.49518144510865e-06, + "loss": 0.7803, + "step": 5328 + }, + { + "epoch": 0.29330177775331606, + "grad_norm": 0.7314801812171936, + "learning_rate": 9.494991624400119e-06, + "loss": 0.6758, + "step": 5329 + }, + { + "epoch": 0.29335681655567175, + "grad_norm": 0.6989930272102356, + "learning_rate": 9.494801769908433e-06, + "loss": 0.7945, + "step": 5330 + }, + { + "epoch": 0.2934118553580274, + "grad_norm": 0.7804785966873169, + "learning_rate": 9.494611881635021e-06, + "loss": 0.7977, + "step": 5331 + }, + { + "epoch": 0.2934668941603831, + "grad_norm": 0.8377045392990112, + "learning_rate": 9.494421959581308e-06, + "loss": 0.8077, + "step": 5332 + }, + { + "epoch": 0.2935219329627387, + "grad_norm": 0.7463418245315552, + "learning_rate": 9.494232003748724e-06, + "loss": 0.783, + "step": 5333 + }, + { + "epoch": 0.2935769717650944, + "grad_norm": 0.7598912715911865, + "learning_rate": 9.494042014138695e-06, + "loss": 0.7869, + "step": 5334 + }, + { + "epoch": 0.29363201056745003, + "grad_norm": 0.7634113430976868, + "learning_rate": 9.493851990752648e-06, + "loss": 0.8108, + "step": 5335 + }, + { + "epoch": 0.2936870493698057, + "grad_norm": 0.8056474328041077, + "learning_rate": 9.493661933592013e-06, + "loss": 0.7921, + "step": 5336 + }, + { + "epoch": 0.29374208817216135, + "grad_norm": 0.8699371218681335, + "learning_rate": 9.493471842658219e-06, + "loss": 0.8833, + "step": 5337 + }, + { + "epoch": 0.29379712697451704, + "grad_norm": 0.8803261518478394, + "learning_rate": 9.493281717952691e-06, + "loss": 0.7848, + "step": 5338 + }, + { + "epoch": 0.2938521657768727, + "grad_norm": 0.7678453922271729, + "learning_rate": 9.493091559476864e-06, + "loss": 0.836, + "step": 5339 + }, + { + "epoch": 0.29390720457922836, + "grad_norm": 0.7653701305389404, + "learning_rate": 9.49290136723216e-06, + "loss": 0.8215, + "step": 5340 + }, + { + "epoch": 0.293962243381584, + "grad_norm": 0.768120527267456, + "learning_rate": 9.492711141220013e-06, + "loss": 0.7498, + "step": 5341 + }, + { + "epoch": 0.2940172821839397, + "grad_norm": 0.7665749788284302, + "learning_rate": 9.492520881441854e-06, + "loss": 0.7883, + "step": 5342 + }, + { + "epoch": 0.2940723209862953, + "grad_norm": 0.7405015230178833, + "learning_rate": 9.492330587899108e-06, + "loss": 0.8112, + "step": 5343 + }, + { + "epoch": 0.294127359788651, + "grad_norm": 0.7183459997177124, + "learning_rate": 9.492140260593208e-06, + "loss": 0.8227, + "step": 5344 + }, + { + "epoch": 0.29418239859100664, + "grad_norm": 0.7453572154045105, + "learning_rate": 9.491949899525585e-06, + "loss": 0.8148, + "step": 5345 + }, + { + "epoch": 0.29423743739336233, + "grad_norm": 0.8963750600814819, + "learning_rate": 9.491759504697669e-06, + "loss": 0.9261, + "step": 5346 + }, + { + "epoch": 0.29429247619571797, + "grad_norm": 0.7631667256355286, + "learning_rate": 9.49156907611089e-06, + "loss": 0.7708, + "step": 5347 + }, + { + "epoch": 0.29434751499807366, + "grad_norm": 0.6324381232261658, + "learning_rate": 9.49137861376668e-06, + "loss": 0.6688, + "step": 5348 + }, + { + "epoch": 0.2944025538004293, + "grad_norm": 0.6969807147979736, + "learning_rate": 9.491188117666472e-06, + "loss": 0.7516, + "step": 5349 + }, + { + "epoch": 0.294457592602785, + "grad_norm": 1.633340835571289, + "learning_rate": 9.490997587811697e-06, + "loss": 0.8111, + "step": 5350 + }, + { + "epoch": 0.2945126314051406, + "grad_norm": 0.7084371447563171, + "learning_rate": 9.490807024203785e-06, + "loss": 0.8375, + "step": 5351 + }, + { + "epoch": 0.2945676702074963, + "grad_norm": 0.7335958480834961, + "learning_rate": 9.490616426844169e-06, + "loss": 0.7884, + "step": 5352 + }, + { + "epoch": 0.29462270900985194, + "grad_norm": 0.7560276985168457, + "learning_rate": 9.490425795734282e-06, + "loss": 0.8918, + "step": 5353 + }, + { + "epoch": 0.2946777478122076, + "grad_norm": 0.9185894727706909, + "learning_rate": 9.490235130875557e-06, + "loss": 0.7976, + "step": 5354 + }, + { + "epoch": 0.29473278661456326, + "grad_norm": 0.7871553897857666, + "learning_rate": 9.490044432269427e-06, + "loss": 0.8564, + "step": 5355 + }, + { + "epoch": 0.29478782541691895, + "grad_norm": 0.8736812472343445, + "learning_rate": 9.489853699917326e-06, + "loss": 0.8114, + "step": 5356 + }, + { + "epoch": 0.2948428642192746, + "grad_norm": 0.8068968653678894, + "learning_rate": 9.489662933820684e-06, + "loss": 0.9198, + "step": 5357 + }, + { + "epoch": 0.29489790302163027, + "grad_norm": 0.7816325426101685, + "learning_rate": 9.489472133980939e-06, + "loss": 0.8012, + "step": 5358 + }, + { + "epoch": 0.2949529418239859, + "grad_norm": 0.7248200178146362, + "learning_rate": 9.489281300399522e-06, + "loss": 0.8099, + "step": 5359 + }, + { + "epoch": 0.2950079806263416, + "grad_norm": 0.7887724041938782, + "learning_rate": 9.48909043307787e-06, + "loss": 0.884, + "step": 5360 + }, + { + "epoch": 0.2950630194286972, + "grad_norm": 0.765163004398346, + "learning_rate": 9.488899532017415e-06, + "loss": 0.8563, + "step": 5361 + }, + { + "epoch": 0.2951180582310529, + "grad_norm": 0.7658557295799255, + "learning_rate": 9.488708597219592e-06, + "loss": 0.8897, + "step": 5362 + }, + { + "epoch": 0.29517309703340855, + "grad_norm": 0.6653227806091309, + "learning_rate": 9.488517628685838e-06, + "loss": 0.7107, + "step": 5363 + }, + { + "epoch": 0.29522813583576424, + "grad_norm": 0.787739098072052, + "learning_rate": 9.488326626417586e-06, + "loss": 0.8181, + "step": 5364 + }, + { + "epoch": 0.29528317463811987, + "grad_norm": 0.7822532057762146, + "learning_rate": 9.488135590416275e-06, + "loss": 0.8238, + "step": 5365 + }, + { + "epoch": 0.29533821344047556, + "grad_norm": 0.7797419428825378, + "learning_rate": 9.487944520683334e-06, + "loss": 0.8484, + "step": 5366 + }, + { + "epoch": 0.2953932522428312, + "grad_norm": 0.7230222225189209, + "learning_rate": 9.487753417220207e-06, + "loss": 0.8193, + "step": 5367 + }, + { + "epoch": 0.2954482910451869, + "grad_norm": 0.8256810307502747, + "learning_rate": 9.487562280028325e-06, + "loss": 0.7691, + "step": 5368 + }, + { + "epoch": 0.2955033298475425, + "grad_norm": 0.7704648375511169, + "learning_rate": 9.487371109109127e-06, + "loss": 0.8235, + "step": 5369 + }, + { + "epoch": 0.29555836864989815, + "grad_norm": 0.7580391764640808, + "learning_rate": 9.487179904464048e-06, + "loss": 0.7911, + "step": 5370 + }, + { + "epoch": 0.29561340745225384, + "grad_norm": 0.7211806774139404, + "learning_rate": 9.486988666094526e-06, + "loss": 0.7188, + "step": 5371 + }, + { + "epoch": 0.2956684462546095, + "grad_norm": 0.8375828862190247, + "learning_rate": 9.486797394001999e-06, + "loss": 0.881, + "step": 5372 + }, + { + "epoch": 0.29572348505696516, + "grad_norm": 0.8500093221664429, + "learning_rate": 9.486606088187903e-06, + "loss": 0.8632, + "step": 5373 + }, + { + "epoch": 0.2957785238593208, + "grad_norm": 0.7754727005958557, + "learning_rate": 9.486414748653677e-06, + "loss": 0.8124, + "step": 5374 + }, + { + "epoch": 0.2958335626616765, + "grad_norm": 0.9395208954811096, + "learning_rate": 9.486223375400759e-06, + "loss": 0.8046, + "step": 5375 + }, + { + "epoch": 0.2958886014640321, + "grad_norm": 0.7587517499923706, + "learning_rate": 9.486031968430587e-06, + "loss": 0.7852, + "step": 5376 + }, + { + "epoch": 0.2959436402663878, + "grad_norm": 0.6921781301498413, + "learning_rate": 9.485840527744599e-06, + "loss": 0.7392, + "step": 5377 + }, + { + "epoch": 0.29599867906874344, + "grad_norm": 0.8768522143363953, + "learning_rate": 9.485649053344233e-06, + "loss": 0.7819, + "step": 5378 + }, + { + "epoch": 0.29605371787109913, + "grad_norm": 0.7565680146217346, + "learning_rate": 9.485457545230932e-06, + "loss": 0.7489, + "step": 5379 + }, + { + "epoch": 0.29610875667345476, + "grad_norm": 0.7760992050170898, + "learning_rate": 9.485266003406132e-06, + "loss": 0.8129, + "step": 5380 + }, + { + "epoch": 0.29616379547581045, + "grad_norm": 0.7726097106933594, + "learning_rate": 9.485074427871272e-06, + "loss": 0.725, + "step": 5381 + }, + { + "epoch": 0.2962188342781661, + "grad_norm": 0.6885473728179932, + "learning_rate": 9.484882818627796e-06, + "loss": 0.685, + "step": 5382 + }, + { + "epoch": 0.2962738730805218, + "grad_norm": 0.776509702205658, + "learning_rate": 9.484691175677138e-06, + "loss": 0.8077, + "step": 5383 + }, + { + "epoch": 0.2963289118828774, + "grad_norm": 0.7436297535896301, + "learning_rate": 9.484499499020744e-06, + "loss": 0.8161, + "step": 5384 + }, + { + "epoch": 0.2963839506852331, + "grad_norm": 0.7604314088821411, + "learning_rate": 9.484307788660052e-06, + "loss": 0.825, + "step": 5385 + }, + { + "epoch": 0.29643898948758873, + "grad_norm": 0.7230789065361023, + "learning_rate": 9.484116044596501e-06, + "loss": 0.8005, + "step": 5386 + }, + { + "epoch": 0.2964940282899444, + "grad_norm": 0.820442259311676, + "learning_rate": 9.483924266831536e-06, + "loss": 0.789, + "step": 5387 + }, + { + "epoch": 0.29654906709230006, + "grad_norm": 0.7514582276344299, + "learning_rate": 9.483732455366596e-06, + "loss": 0.8531, + "step": 5388 + }, + { + "epoch": 0.29660410589465575, + "grad_norm": 0.6671503782272339, + "learning_rate": 9.483540610203124e-06, + "loss": 0.7627, + "step": 5389 + }, + { + "epoch": 0.2966591446970114, + "grad_norm": 0.6955942511558533, + "learning_rate": 9.483348731342559e-06, + "loss": 0.726, + "step": 5390 + }, + { + "epoch": 0.29671418349936707, + "grad_norm": 0.769781768321991, + "learning_rate": 9.483156818786347e-06, + "loss": 0.8064, + "step": 5391 + }, + { + "epoch": 0.2967692223017227, + "grad_norm": 1.0764707326889038, + "learning_rate": 9.482964872535927e-06, + "loss": 0.8249, + "step": 5392 + }, + { + "epoch": 0.2968242611040784, + "grad_norm": 1.0508921146392822, + "learning_rate": 9.482772892592744e-06, + "loss": 0.706, + "step": 5393 + }, + { + "epoch": 0.296879299906434, + "grad_norm": 0.6442564129829407, + "learning_rate": 9.482580878958239e-06, + "loss": 0.6025, + "step": 5394 + }, + { + "epoch": 0.2969343387087897, + "grad_norm": 0.7622735500335693, + "learning_rate": 9.482388831633856e-06, + "loss": 0.7639, + "step": 5395 + }, + { + "epoch": 0.29698937751114535, + "grad_norm": 0.8179057240486145, + "learning_rate": 9.482196750621038e-06, + "loss": 0.7641, + "step": 5396 + }, + { + "epoch": 0.29704441631350104, + "grad_norm": 0.7955192923545837, + "learning_rate": 9.48200463592123e-06, + "loss": 0.8407, + "step": 5397 + }, + { + "epoch": 0.29709945511585667, + "grad_norm": 0.7909773588180542, + "learning_rate": 9.481812487535875e-06, + "loss": 0.7833, + "step": 5398 + }, + { + "epoch": 0.29715449391821236, + "grad_norm": 0.8409042954444885, + "learning_rate": 9.481620305466417e-06, + "loss": 0.7788, + "step": 5399 + }, + { + "epoch": 0.297209532720568, + "grad_norm": 0.7521414160728455, + "learning_rate": 9.4814280897143e-06, + "loss": 0.7192, + "step": 5400 + }, + { + "epoch": 0.2972645715229237, + "grad_norm": 0.7016280889511108, + "learning_rate": 9.481235840280969e-06, + "loss": 0.7181, + "step": 5401 + }, + { + "epoch": 0.2973196103252793, + "grad_norm": 0.7257362604141235, + "learning_rate": 9.48104355716787e-06, + "loss": 0.7845, + "step": 5402 + }, + { + "epoch": 0.297374649127635, + "grad_norm": 0.8048765659332275, + "learning_rate": 9.480851240376445e-06, + "loss": 0.7921, + "step": 5403 + }, + { + "epoch": 0.29742968792999064, + "grad_norm": 0.8715546131134033, + "learning_rate": 9.480658889908143e-06, + "loss": 0.856, + "step": 5404 + }, + { + "epoch": 0.2974847267323463, + "grad_norm": 0.7211160063743591, + "learning_rate": 9.480466505764408e-06, + "loss": 0.7687, + "step": 5405 + }, + { + "epoch": 0.29753976553470196, + "grad_norm": 0.8749645352363586, + "learning_rate": 9.480274087946686e-06, + "loss": 0.8419, + "step": 5406 + }, + { + "epoch": 0.29759480433705765, + "grad_norm": 0.7986398935317993, + "learning_rate": 9.480081636456424e-06, + "loss": 0.8309, + "step": 5407 + }, + { + "epoch": 0.2976498431394133, + "grad_norm": 0.8435508012771606, + "learning_rate": 9.479889151295067e-06, + "loss": 0.7457, + "step": 5408 + }, + { + "epoch": 0.297704881941769, + "grad_norm": 0.8725010752677917, + "learning_rate": 9.479696632464063e-06, + "loss": 0.8069, + "step": 5409 + }, + { + "epoch": 0.2977599207441246, + "grad_norm": 0.7364320158958435, + "learning_rate": 9.479504079964856e-06, + "loss": 0.8316, + "step": 5410 + }, + { + "epoch": 0.2978149595464803, + "grad_norm": 0.7967824935913086, + "learning_rate": 9.479311493798898e-06, + "loss": 0.7689, + "step": 5411 + }, + { + "epoch": 0.29786999834883593, + "grad_norm": 0.8415414094924927, + "learning_rate": 9.479118873967632e-06, + "loss": 0.8288, + "step": 5412 + }, + { + "epoch": 0.29792503715119156, + "grad_norm": 0.9723265767097473, + "learning_rate": 9.478926220472508e-06, + "loss": 0.7422, + "step": 5413 + }, + { + "epoch": 0.29798007595354725, + "grad_norm": 0.7203155159950256, + "learning_rate": 9.478733533314974e-06, + "loss": 0.707, + "step": 5414 + }, + { + "epoch": 0.2980351147559029, + "grad_norm": 0.7643926739692688, + "learning_rate": 9.478540812496478e-06, + "loss": 0.7793, + "step": 5415 + }, + { + "epoch": 0.2980901535582586, + "grad_norm": 0.9177087545394897, + "learning_rate": 9.478348058018467e-06, + "loss": 0.865, + "step": 5416 + }, + { + "epoch": 0.2981451923606142, + "grad_norm": 0.678931713104248, + "learning_rate": 9.478155269882392e-06, + "loss": 0.7716, + "step": 5417 + }, + { + "epoch": 0.2982002311629699, + "grad_norm": 0.8440513610839844, + "learning_rate": 9.4779624480897e-06, + "loss": 0.8904, + "step": 5418 + }, + { + "epoch": 0.29825526996532553, + "grad_norm": 0.8508756756782532, + "learning_rate": 9.47776959264184e-06, + "loss": 0.7994, + "step": 5419 + }, + { + "epoch": 0.2983103087676812, + "grad_norm": 0.8736951947212219, + "learning_rate": 9.477576703540265e-06, + "loss": 0.8374, + "step": 5420 + }, + { + "epoch": 0.29836534757003685, + "grad_norm": 0.8063240051269531, + "learning_rate": 9.47738378078642e-06, + "loss": 0.7217, + "step": 5421 + }, + { + "epoch": 0.29842038637239254, + "grad_norm": 1.1495088338851929, + "learning_rate": 9.477190824381757e-06, + "loss": 0.8902, + "step": 5422 + }, + { + "epoch": 0.2984754251747482, + "grad_norm": 1.0241554975509644, + "learning_rate": 9.476997834327725e-06, + "loss": 0.9354, + "step": 5423 + }, + { + "epoch": 0.29853046397710387, + "grad_norm": 0.939950168132782, + "learning_rate": 9.476804810625779e-06, + "loss": 0.8714, + "step": 5424 + }, + { + "epoch": 0.2985855027794595, + "grad_norm": 0.7592660188674927, + "learning_rate": 9.476611753277364e-06, + "loss": 0.7513, + "step": 5425 + }, + { + "epoch": 0.2986405415818152, + "grad_norm": 0.776153028011322, + "learning_rate": 9.476418662283935e-06, + "loss": 0.7828, + "step": 5426 + }, + { + "epoch": 0.2986955803841708, + "grad_norm": 0.9317814707756042, + "learning_rate": 9.47622553764694e-06, + "loss": 0.865, + "step": 5427 + }, + { + "epoch": 0.2987506191865265, + "grad_norm": 0.7770501971244812, + "learning_rate": 9.476032379367832e-06, + "loss": 0.7281, + "step": 5428 + }, + { + "epoch": 0.29880565798888215, + "grad_norm": 0.7815201282501221, + "learning_rate": 9.475839187448064e-06, + "loss": 0.7565, + "step": 5429 + }, + { + "epoch": 0.29886069679123783, + "grad_norm": 0.7992607951164246, + "learning_rate": 9.475645961889086e-06, + "loss": 0.8109, + "step": 5430 + }, + { + "epoch": 0.29891573559359347, + "grad_norm": 0.7780614495277405, + "learning_rate": 9.475452702692351e-06, + "loss": 0.7814, + "step": 5431 + }, + { + "epoch": 0.29897077439594916, + "grad_norm": 0.7409062385559082, + "learning_rate": 9.475259409859313e-06, + "loss": 0.7712, + "step": 5432 + }, + { + "epoch": 0.2990258131983048, + "grad_norm": 0.7935584187507629, + "learning_rate": 9.47506608339142e-06, + "loss": 0.8301, + "step": 5433 + }, + { + "epoch": 0.2990808520006605, + "grad_norm": 0.6931030750274658, + "learning_rate": 9.474872723290132e-06, + "loss": 0.7471, + "step": 5434 + }, + { + "epoch": 0.2991358908030161, + "grad_norm": 0.7622918486595154, + "learning_rate": 9.474679329556894e-06, + "loss": 0.7727, + "step": 5435 + }, + { + "epoch": 0.2991909296053718, + "grad_norm": 0.7957701086997986, + "learning_rate": 9.474485902193169e-06, + "loss": 0.7663, + "step": 5436 + }, + { + "epoch": 0.29924596840772744, + "grad_norm": 1.0600612163543701, + "learning_rate": 9.474292441200404e-06, + "loss": 0.7861, + "step": 5437 + }, + { + "epoch": 0.2993010072100831, + "grad_norm": 0.7343600392341614, + "learning_rate": 9.474098946580053e-06, + "loss": 0.8609, + "step": 5438 + }, + { + "epoch": 0.29935604601243876, + "grad_norm": 0.7477726340293884, + "learning_rate": 9.473905418333573e-06, + "loss": 0.7683, + "step": 5439 + }, + { + "epoch": 0.29941108481479445, + "grad_norm": 0.7955546379089355, + "learning_rate": 9.473711856462417e-06, + "loss": 0.8406, + "step": 5440 + }, + { + "epoch": 0.2994661236171501, + "grad_norm": 0.8291183114051819, + "learning_rate": 9.47351826096804e-06, + "loss": 0.6919, + "step": 5441 + }, + { + "epoch": 0.29952116241950577, + "grad_norm": 0.8899849057197571, + "learning_rate": 9.473324631851898e-06, + "loss": 0.9403, + "step": 5442 + }, + { + "epoch": 0.2995762012218614, + "grad_norm": 0.837066650390625, + "learning_rate": 9.473130969115445e-06, + "loss": 0.8676, + "step": 5443 + }, + { + "epoch": 0.2996312400242171, + "grad_norm": 0.8385708928108215, + "learning_rate": 9.472937272760138e-06, + "loss": 0.7588, + "step": 5444 + }, + { + "epoch": 0.2996862788265727, + "grad_norm": 0.6990595459938049, + "learning_rate": 9.472743542787431e-06, + "loss": 0.6769, + "step": 5445 + }, + { + "epoch": 0.2997413176289284, + "grad_norm": 0.789165735244751, + "learning_rate": 9.472549779198781e-06, + "loss": 0.8084, + "step": 5446 + }, + { + "epoch": 0.29979635643128405, + "grad_norm": 0.8820298314094543, + "learning_rate": 9.472355981995643e-06, + "loss": 0.8262, + "step": 5447 + }, + { + "epoch": 0.29985139523363974, + "grad_norm": 0.8928382992744446, + "learning_rate": 9.472162151179475e-06, + "loss": 0.8123, + "step": 5448 + }, + { + "epoch": 0.2999064340359954, + "grad_norm": 0.7688086032867432, + "learning_rate": 9.471968286751735e-06, + "loss": 0.6846, + "step": 5449 + }, + { + "epoch": 0.29996147283835106, + "grad_norm": 0.6962918043136597, + "learning_rate": 9.471774388713877e-06, + "loss": 0.7872, + "step": 5450 + }, + { + "epoch": 0.3000165116407067, + "grad_norm": 0.7467569708824158, + "learning_rate": 9.47158045706736e-06, + "loss": 0.8201, + "step": 5451 + }, + { + "epoch": 0.3000715504430624, + "grad_norm": 0.7651814222335815, + "learning_rate": 9.471386491813642e-06, + "loss": 0.7734, + "step": 5452 + }, + { + "epoch": 0.300126589245418, + "grad_norm": 0.8001144528388977, + "learning_rate": 9.47119249295418e-06, + "loss": 0.8266, + "step": 5453 + }, + { + "epoch": 0.3001816280477737, + "grad_norm": 0.7937704920768738, + "learning_rate": 9.47099846049043e-06, + "loss": 0.8025, + "step": 5454 } ], "logging_steps": 1, @@ -25478,7 +38204,7 @@ "attributes": {} } }, - "total_flos": 1.0730058856487977e+19, + "total_flos": 1.6095088284731965e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null