{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7979688066739209, "eval_steps": 500, "global_step": 2200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003627130939426913, "grad_norm": 0.64821457862854, "learning_rate": 0.0, "loss": 0.4482, "step": 1 }, { "epoch": 0.0007254261878853826, "grad_norm": 0.6469861268997192, "learning_rate": 2e-05, "loss": 0.4874, "step": 2 }, { "epoch": 0.001088139281828074, "grad_norm": 0.45289790630340576, "learning_rate": 4e-05, "loss": 0.4732, "step": 3 }, { "epoch": 0.0014508523757707653, "grad_norm": 0.38072678446769714, "learning_rate": 6e-05, "loss": 0.4503, "step": 4 }, { "epoch": 0.0018135654697134566, "grad_norm": 0.4032226800918579, "learning_rate": 8e-05, "loss": 0.4312, "step": 5 }, { "epoch": 0.002176278563656148, "grad_norm": 0.3684772551059723, "learning_rate": 0.0001, "loss": 0.4055, "step": 6 }, { "epoch": 0.0025389916575988395, "grad_norm": 0.3409311771392822, "learning_rate": 0.00012, "loss": 0.4201, "step": 7 }, { "epoch": 0.0029017047515415306, "grad_norm": 0.3772580921649933, "learning_rate": 0.00014, "loss": 0.4086, "step": 8 }, { "epoch": 0.003264417845484222, "grad_norm": 0.30869755148887634, "learning_rate": 0.00016, "loss": 0.3954, "step": 9 }, { "epoch": 0.003627130939426913, "grad_norm": 0.23723824322223663, "learning_rate": 0.00018, "loss": 0.3992, "step": 10 }, { "epoch": 0.003989844033369605, "grad_norm": 0.18091322481632233, "learning_rate": 0.0002, "loss": 0.368, "step": 11 }, { "epoch": 0.004352557127312296, "grad_norm": 0.20436523854732513, "learning_rate": 0.00019999993460381957, "loss": 0.3711, "step": 12 }, { "epoch": 0.004715270221254987, "grad_norm": 0.19207683205604553, "learning_rate": 0.00019999973841536373, "loss": 0.3788, "step": 13 }, { "epoch": 0.005077983315197679, "grad_norm": 0.1436341255903244, "learning_rate": 0.00019999941143488914, "loss": 0.3936, "step": 14 }, { "epoch": 0.00544069640914037, "grad_norm": 0.13892005383968353, "learning_rate": 0.0001999989536628234, "loss": 0.4062, "step": 15 }, { "epoch": 0.005803409503083061, "grad_norm": 0.12910069525241852, "learning_rate": 0.00019999836509976534, "loss": 0.3863, "step": 16 }, { "epoch": 0.006166122597025753, "grad_norm": 0.10763731598854065, "learning_rate": 0.00019999764574648465, "loss": 0.3757, "step": 17 }, { "epoch": 0.006528835690968444, "grad_norm": 0.1078948974609375, "learning_rate": 0.00019999679560392226, "loss": 0.3342, "step": 18 }, { "epoch": 0.006891548784911135, "grad_norm": 0.10403122007846832, "learning_rate": 0.00019999581467319006, "loss": 0.3371, "step": 19 }, { "epoch": 0.007254261878853826, "grad_norm": 0.09776491671800613, "learning_rate": 0.00019999470295557105, "loss": 0.3263, "step": 20 }, { "epoch": 0.007616974972796518, "grad_norm": 0.10792049765586853, "learning_rate": 0.00019999346045251925, "loss": 0.3447, "step": 21 }, { "epoch": 0.00797968806673921, "grad_norm": 0.1174544170498848, "learning_rate": 0.00019999208716565977, "loss": 0.336, "step": 22 }, { "epoch": 0.008342401160681901, "grad_norm": 0.09458769857883453, "learning_rate": 0.0001999905830967888, "loss": 0.3262, "step": 23 }, { "epoch": 0.008705114254624592, "grad_norm": 0.09644383937120438, "learning_rate": 0.0001999889482478735, "loss": 0.3361, "step": 24 }, { "epoch": 0.009067827348567283, "grad_norm": 0.09843447804450989, "learning_rate": 0.0001999871826210521, "loss": 0.3485, "step": 25 }, { "epoch": 0.009430540442509974, "grad_norm": 0.10075519979000092, "learning_rate": 0.00019998528621863396, "loss": 0.3461, "step": 26 }, { "epoch": 0.009793253536452665, "grad_norm": 0.09084542095661163, "learning_rate": 0.00019998325904309946, "loss": 0.3267, "step": 27 }, { "epoch": 0.010155966630395358, "grad_norm": 0.10560671985149384, "learning_rate": 0.00019998110109709988, "loss": 0.3532, "step": 28 }, { "epoch": 0.01051867972433805, "grad_norm": 0.08736245334148407, "learning_rate": 0.00019997881238345775, "loss": 0.37, "step": 29 }, { "epoch": 0.01088139281828074, "grad_norm": 0.103543221950531, "learning_rate": 0.0001999763929051665, "loss": 0.3093, "step": 30 }, { "epoch": 0.011244105912223431, "grad_norm": 0.09106361120939255, "learning_rate": 0.0001999738426653906, "loss": 0.3231, "step": 31 }, { "epoch": 0.011606819006166122, "grad_norm": 0.09385113418102264, "learning_rate": 0.00019997116166746562, "loss": 0.3162, "step": 32 }, { "epoch": 0.011969532100108813, "grad_norm": 0.10086266696453094, "learning_rate": 0.00019996834991489805, "loss": 0.3105, "step": 33 }, { "epoch": 0.012332245194051506, "grad_norm": 0.08959592878818512, "learning_rate": 0.00019996540741136544, "loss": 0.3241, "step": 34 }, { "epoch": 0.012694958287994197, "grad_norm": 0.10446605086326599, "learning_rate": 0.00019996233416071644, "loss": 0.338, "step": 35 }, { "epoch": 0.013057671381936888, "grad_norm": 0.08997010439634323, "learning_rate": 0.00019995913016697053, "loss": 0.3089, "step": 36 }, { "epoch": 0.01342038447587958, "grad_norm": 0.09340513497591019, "learning_rate": 0.00019995579543431835, "loss": 0.3167, "step": 37 }, { "epoch": 0.01378309756982227, "grad_norm": 0.0928172841668129, "learning_rate": 0.00019995232996712146, "loss": 0.3236, "step": 38 }, { "epoch": 0.014145810663764961, "grad_norm": 0.10568640381097794, "learning_rate": 0.0001999487337699124, "loss": 0.3213, "step": 39 }, { "epoch": 0.014508523757707652, "grad_norm": 0.09213658422231674, "learning_rate": 0.0001999450068473948, "loss": 0.3308, "step": 40 }, { "epoch": 0.014871236851650345, "grad_norm": 0.09331916272640228, "learning_rate": 0.0001999411492044431, "loss": 0.3265, "step": 41 }, { "epoch": 0.015233949945593036, "grad_norm": 0.0938337966799736, "learning_rate": 0.00019993716084610284, "loss": 0.3084, "step": 42 }, { "epoch": 0.015596663039535727, "grad_norm": 0.1006985679268837, "learning_rate": 0.00019993304177759046, "loss": 0.3932, "step": 43 }, { "epoch": 0.01595937613347842, "grad_norm": 0.09978915005922318, "learning_rate": 0.00019992879200429346, "loss": 0.3147, "step": 44 }, { "epoch": 0.01632208922742111, "grad_norm": 0.095309779047966, "learning_rate": 0.00019992441153177015, "loss": 0.3271, "step": 45 }, { "epoch": 0.016684802321363802, "grad_norm": 0.09498284012079239, "learning_rate": 0.00019991990036574987, "loss": 0.3138, "step": 46 }, { "epoch": 0.017047515415306493, "grad_norm": 0.09961807727813721, "learning_rate": 0.0001999152585121329, "loss": 0.3447, "step": 47 }, { "epoch": 0.017410228509249184, "grad_norm": 0.11686038225889206, "learning_rate": 0.0001999104859769904, "loss": 0.3059, "step": 48 }, { "epoch": 0.017772941603191875, "grad_norm": 0.09790387004613876, "learning_rate": 0.0001999055827665645, "loss": 0.3241, "step": 49 }, { "epoch": 0.018135654697134566, "grad_norm": 0.0987682044506073, "learning_rate": 0.00019990054888726824, "loss": 0.3159, "step": 50 }, { "epoch": 0.018498367791077257, "grad_norm": 0.09558644145727158, "learning_rate": 0.0001998953843456855, "loss": 0.3528, "step": 51 }, { "epoch": 0.01886108088501995, "grad_norm": 0.1173083484172821, "learning_rate": 0.00019989008914857116, "loss": 0.3138, "step": 52 }, { "epoch": 0.01922379397896264, "grad_norm": 0.09404181689023972, "learning_rate": 0.0001998846633028509, "loss": 0.3262, "step": 53 }, { "epoch": 0.01958650707290533, "grad_norm": 0.09235358238220215, "learning_rate": 0.00019987910681562132, "loss": 0.3271, "step": 54 }, { "epoch": 0.01994922016684802, "grad_norm": 0.10229232162237167, "learning_rate": 0.0001998734196941499, "loss": 0.3098, "step": 55 }, { "epoch": 0.020311933260790716, "grad_norm": 0.08622050285339355, "learning_rate": 0.0001998676019458749, "loss": 0.2878, "step": 56 }, { "epoch": 0.020674646354733407, "grad_norm": 0.10718828439712524, "learning_rate": 0.00019986165357840558, "loss": 0.319, "step": 57 }, { "epoch": 0.0210373594486761, "grad_norm": 0.09529942274093628, "learning_rate": 0.00019985557459952188, "loss": 0.2974, "step": 58 }, { "epoch": 0.02140007254261879, "grad_norm": 0.09532184153795242, "learning_rate": 0.00019984936501717468, "loss": 0.3016, "step": 59 }, { "epoch": 0.02176278563656148, "grad_norm": 0.098875492811203, "learning_rate": 0.00019984302483948562, "loss": 0.3006, "step": 60 }, { "epoch": 0.02212549873050417, "grad_norm": 0.1071372851729393, "learning_rate": 0.00019983655407474719, "loss": 0.2796, "step": 61 }, { "epoch": 0.022488211824446862, "grad_norm": 0.11236250400543213, "learning_rate": 0.0001998299527314226, "loss": 0.3067, "step": 62 }, { "epoch": 0.022850924918389554, "grad_norm": 0.10537782311439514, "learning_rate": 0.00019982322081814596, "loss": 0.3415, "step": 63 }, { "epoch": 0.023213638012332245, "grad_norm": 0.09595459699630737, "learning_rate": 0.00019981635834372209, "loss": 0.3076, "step": 64 }, { "epoch": 0.023576351106274936, "grad_norm": 0.09259078651666641, "learning_rate": 0.00019980936531712652, "loss": 0.2913, "step": 65 }, { "epoch": 0.023939064200217627, "grad_norm": 0.20734301209449768, "learning_rate": 0.00019980224174750568, "loss": 0.3102, "step": 66 }, { "epoch": 0.024301777294160318, "grad_norm": 0.10769975185394287, "learning_rate": 0.0001997949876441766, "loss": 0.3336, "step": 67 }, { "epoch": 0.024664490388103012, "grad_norm": 0.1010124459862709, "learning_rate": 0.00019978760301662715, "loss": 0.3305, "step": 68 }, { "epoch": 0.025027203482045703, "grad_norm": 0.09571480005979538, "learning_rate": 0.0001997800878745158, "loss": 0.3181, "step": 69 }, { "epoch": 0.025389916575988394, "grad_norm": 0.10125493258237839, "learning_rate": 0.00019977244222767182, "loss": 0.2873, "step": 70 }, { "epoch": 0.025752629669931085, "grad_norm": 0.11057893931865692, "learning_rate": 0.0001997646660860951, "loss": 0.3125, "step": 71 }, { "epoch": 0.026115342763873776, "grad_norm": 0.1009269654750824, "learning_rate": 0.00019975675945995626, "loss": 0.3038, "step": 72 }, { "epoch": 0.026478055857816468, "grad_norm": 0.09274876117706299, "learning_rate": 0.00019974872235959654, "loss": 0.292, "step": 73 }, { "epoch": 0.02684076895175916, "grad_norm": 0.09206151217222214, "learning_rate": 0.00019974055479552791, "loss": 0.3064, "step": 74 }, { "epoch": 0.02720348204570185, "grad_norm": 0.09455125778913498, "learning_rate": 0.00019973225677843284, "loss": 0.3031, "step": 75 }, { "epoch": 0.02756619513964454, "grad_norm": 0.10313421487808228, "learning_rate": 0.00019972382831916457, "loss": 0.2975, "step": 76 }, { "epoch": 0.02792890823358723, "grad_norm": 0.08839363604784012, "learning_rate": 0.00019971526942874686, "loss": 0.2926, "step": 77 }, { "epoch": 0.028291621327529923, "grad_norm": 0.0924365371465683, "learning_rate": 0.00019970658011837404, "loss": 0.3071, "step": 78 }, { "epoch": 0.028654334421472614, "grad_norm": 0.09888923168182373, "learning_rate": 0.00019969776039941114, "loss": 0.3004, "step": 79 }, { "epoch": 0.029017047515415305, "grad_norm": 0.09569084644317627, "learning_rate": 0.00019968881028339363, "loss": 0.2923, "step": 80 }, { "epoch": 0.029379760609358, "grad_norm": 0.11503931879997253, "learning_rate": 0.0001996797297820276, "loss": 0.3117, "step": 81 }, { "epoch": 0.02974247370330069, "grad_norm": 0.09839354455471039, "learning_rate": 0.00019967051890718964, "loss": 0.2802, "step": 82 }, { "epoch": 0.03010518679724338, "grad_norm": 0.09043775498867035, "learning_rate": 0.00019966117767092686, "loss": 0.2877, "step": 83 }, { "epoch": 0.030467899891186073, "grad_norm": 0.09896934777498245, "learning_rate": 0.00019965170608545688, "loss": 0.3094, "step": 84 }, { "epoch": 0.030830612985128764, "grad_norm": 0.09892911463975906, "learning_rate": 0.00019964210416316787, "loss": 0.302, "step": 85 }, { "epoch": 0.031193326079071455, "grad_norm": 0.0898653194308281, "learning_rate": 0.00019963237191661834, "loss": 0.2982, "step": 86 }, { "epoch": 0.031556039173014146, "grad_norm": 0.10663247853517532, "learning_rate": 0.00019962250935853736, "loss": 0.2943, "step": 87 }, { "epoch": 0.03191875226695684, "grad_norm": 0.09792915731668472, "learning_rate": 0.0001996125165018244, "loss": 0.2826, "step": 88 }, { "epoch": 0.03228146536089953, "grad_norm": 0.09535045176744461, "learning_rate": 0.00019960239335954936, "loss": 0.3026, "step": 89 }, { "epoch": 0.03264417845484222, "grad_norm": 0.08838774263858795, "learning_rate": 0.0001995921399449525, "loss": 0.277, "step": 90 }, { "epoch": 0.03300689154878491, "grad_norm": 0.09616609662771225, "learning_rate": 0.00019958175627144453, "loss": 0.3015, "step": 91 }, { "epoch": 0.033369604642727604, "grad_norm": 0.0945005938410759, "learning_rate": 0.00019957124235260652, "loss": 0.288, "step": 92 }, { "epoch": 0.03373231773667029, "grad_norm": 0.10378480702638626, "learning_rate": 0.00019956059820218982, "loss": 0.3361, "step": 93 }, { "epoch": 0.034095030830612987, "grad_norm": 0.09242385625839233, "learning_rate": 0.0001995498238341162, "loss": 0.2903, "step": 94 }, { "epoch": 0.034457743924555674, "grad_norm": 0.0919501855969429, "learning_rate": 0.00019953891926247774, "loss": 0.3025, "step": 95 }, { "epoch": 0.03482045701849837, "grad_norm": 0.09978862851858139, "learning_rate": 0.00019952788450153675, "loss": 0.3335, "step": 96 }, { "epoch": 0.035183170112441056, "grad_norm": 0.10097439587116241, "learning_rate": 0.00019951671956572583, "loss": 0.3137, "step": 97 }, { "epoch": 0.03554588320638375, "grad_norm": 0.1043080985546112, "learning_rate": 0.00019950542446964793, "loss": 0.2896, "step": 98 }, { "epoch": 0.035908596300326445, "grad_norm": 0.09220679104328156, "learning_rate": 0.00019949399922807612, "loss": 0.3031, "step": 99 }, { "epoch": 0.03627130939426913, "grad_norm": 0.10692602396011353, "learning_rate": 0.00019948244385595374, "loss": 0.3057, "step": 100 }, { "epoch": 0.03663402248821183, "grad_norm": 0.10609027743339539, "learning_rate": 0.00019947075836839438, "loss": 0.3082, "step": 101 }, { "epoch": 0.036996735582154515, "grad_norm": 0.16867391765117645, "learning_rate": 0.00019945894278068172, "loss": 0.302, "step": 102 }, { "epoch": 0.03735944867609721, "grad_norm": 0.09805990755558014, "learning_rate": 0.00019944699710826966, "loss": 0.3218, "step": 103 }, { "epoch": 0.0377221617700399, "grad_norm": 0.09552697837352753, "learning_rate": 0.00019943492136678223, "loss": 0.2576, "step": 104 }, { "epoch": 0.03808487486398259, "grad_norm": 0.09718494862318039, "learning_rate": 0.0001994227155720136, "loss": 0.2882, "step": 105 }, { "epoch": 0.03844758795792528, "grad_norm": 0.0933772400021553, "learning_rate": 0.000199410379739928, "loss": 0.3069, "step": 106 }, { "epoch": 0.038810301051867974, "grad_norm": 0.09682098776102066, "learning_rate": 0.00019939791388665974, "loss": 0.3013, "step": 107 }, { "epoch": 0.03917301414581066, "grad_norm": 0.1064608246088028, "learning_rate": 0.0001993853180285132, "loss": 0.3307, "step": 108 }, { "epoch": 0.039535727239753356, "grad_norm": 0.09508496522903442, "learning_rate": 0.00019937259218196282, "loss": 0.2968, "step": 109 }, { "epoch": 0.03989844033369604, "grad_norm": 0.10839469730854034, "learning_rate": 0.00019935973636365305, "loss": 0.3017, "step": 110 }, { "epoch": 0.04026115342763874, "grad_norm": 0.10720638930797577, "learning_rate": 0.00019934675059039828, "loss": 0.2817, "step": 111 }, { "epoch": 0.04062386652158143, "grad_norm": 0.10672647505998611, "learning_rate": 0.00019933363487918294, "loss": 0.2876, "step": 112 }, { "epoch": 0.04098657961552412, "grad_norm": 0.10290908813476562, "learning_rate": 0.00019932038924716134, "loss": 0.2906, "step": 113 }, { "epoch": 0.041349292709466814, "grad_norm": 0.11226241290569305, "learning_rate": 0.0001993070137116578, "loss": 0.2816, "step": 114 }, { "epoch": 0.0417120058034095, "grad_norm": 0.09558378159999847, "learning_rate": 0.00019929350829016648, "loss": 0.3115, "step": 115 }, { "epoch": 0.0420747188973522, "grad_norm": 0.10267224162817001, "learning_rate": 0.00019927987300035147, "loss": 0.3035, "step": 116 }, { "epoch": 0.042437431991294884, "grad_norm": 0.09401127696037292, "learning_rate": 0.00019926610786004663, "loss": 0.2995, "step": 117 }, { "epoch": 0.04280014508523758, "grad_norm": 0.10615453869104385, "learning_rate": 0.00019925221288725573, "loss": 0.3062, "step": 118 }, { "epoch": 0.043162858179180266, "grad_norm": 0.11928743124008179, "learning_rate": 0.00019923818810015236, "loss": 0.317, "step": 119 }, { "epoch": 0.04352557127312296, "grad_norm": 0.10731657594442368, "learning_rate": 0.00019922403351707983, "loss": 0.3261, "step": 120 }, { "epoch": 0.04388828436706565, "grad_norm": 0.10545065253973007, "learning_rate": 0.0001992097491565513, "loss": 0.3125, "step": 121 }, { "epoch": 0.04425099746100834, "grad_norm": 0.1098426803946495, "learning_rate": 0.0001991953350372496, "loss": 0.2928, "step": 122 }, { "epoch": 0.04461371055495103, "grad_norm": 0.09736689925193787, "learning_rate": 0.00019918079117802725, "loss": 0.2736, "step": 123 }, { "epoch": 0.044976423648893725, "grad_norm": 0.11810169368982315, "learning_rate": 0.0001991661175979066, "loss": 0.2806, "step": 124 }, { "epoch": 0.04533913674283642, "grad_norm": 0.11560354381799698, "learning_rate": 0.00019915131431607952, "loss": 0.317, "step": 125 }, { "epoch": 0.04570184983677911, "grad_norm": 0.11197232455015182, "learning_rate": 0.00019913638135190756, "loss": 0.3382, "step": 126 }, { "epoch": 0.0460645629307218, "grad_norm": 0.1027117446064949, "learning_rate": 0.0001991213187249219, "loss": 0.2684, "step": 127 }, { "epoch": 0.04642727602466449, "grad_norm": 0.10549558699131012, "learning_rate": 0.00019910612645482334, "loss": 0.2939, "step": 128 }, { "epoch": 0.046789989118607184, "grad_norm": 0.09976191818714142, "learning_rate": 0.00019909080456148218, "loss": 0.2878, "step": 129 }, { "epoch": 0.04715270221254987, "grad_norm": 0.10141481459140778, "learning_rate": 0.0001990753530649383, "loss": 0.2959, "step": 130 }, { "epoch": 0.047515415306492566, "grad_norm": 0.10536810010671616, "learning_rate": 0.00019905977198540105, "loss": 0.283, "step": 131 }, { "epoch": 0.04787812840043525, "grad_norm": 0.1081426814198494, "learning_rate": 0.00019904406134324933, "loss": 0.2982, "step": 132 }, { "epoch": 0.04824084149437795, "grad_norm": 0.10106177628040314, "learning_rate": 0.00019902822115903143, "loss": 0.3301, "step": 133 }, { "epoch": 0.048603554588320635, "grad_norm": 0.09809243679046631, "learning_rate": 0.0001990122514534651, "loss": 0.2868, "step": 134 }, { "epoch": 0.04896626768226333, "grad_norm": 0.10104624181985855, "learning_rate": 0.00019899615224743753, "loss": 0.3035, "step": 135 }, { "epoch": 0.049328980776206025, "grad_norm": 0.09421058744192123, "learning_rate": 0.0001989799235620052, "loss": 0.2982, "step": 136 }, { "epoch": 0.04969169387014871, "grad_norm": 0.09937946498394012, "learning_rate": 0.00019896356541839404, "loss": 0.2988, "step": 137 }, { "epoch": 0.05005440696409141, "grad_norm": 0.10086655616760254, "learning_rate": 0.00019894707783799925, "loss": 0.2849, "step": 138 }, { "epoch": 0.050417120058034094, "grad_norm": 0.09309150278568268, "learning_rate": 0.0001989304608423853, "loss": 0.2792, "step": 139 }, { "epoch": 0.05077983315197679, "grad_norm": 0.15080593526363373, "learning_rate": 0.00019891371445328592, "loss": 0.2993, "step": 140 }, { "epoch": 0.051142546245919476, "grad_norm": 0.09852839261293411, "learning_rate": 0.0001988968386926042, "loss": 0.2887, "step": 141 }, { "epoch": 0.05150525933986217, "grad_norm": 0.13169077038764954, "learning_rate": 0.00019887983358241225, "loss": 0.2889, "step": 142 }, { "epoch": 0.05186797243380486, "grad_norm": 0.203284353017807, "learning_rate": 0.0001988626991449515, "loss": 0.2762, "step": 143 }, { "epoch": 0.05223068552774755, "grad_norm": 0.09370779246091843, "learning_rate": 0.00019884543540263247, "loss": 0.2717, "step": 144 }, { "epoch": 0.05259339862169024, "grad_norm": 0.10462846606969833, "learning_rate": 0.00019882804237803488, "loss": 0.2923, "step": 145 }, { "epoch": 0.052956111715632935, "grad_norm": 0.11297117918729782, "learning_rate": 0.00019881052009390737, "loss": 0.3037, "step": 146 }, { "epoch": 0.05331882480957562, "grad_norm": 0.11037133634090424, "learning_rate": 0.00019879286857316783, "loss": 0.2883, "step": 147 }, { "epoch": 0.05368153790351832, "grad_norm": 0.10279864072799683, "learning_rate": 0.00019877508783890306, "loss": 0.2847, "step": 148 }, { "epoch": 0.05404425099746101, "grad_norm": 0.09439583867788315, "learning_rate": 0.00019875717791436896, "loss": 0.2779, "step": 149 }, { "epoch": 0.0544069640914037, "grad_norm": 0.10622645914554596, "learning_rate": 0.00019873913882299026, "loss": 0.3099, "step": 150 }, { "epoch": 0.054769677185346394, "grad_norm": 0.10882750153541565, "learning_rate": 0.00019872097058836076, "loss": 0.2659, "step": 151 }, { "epoch": 0.05513239027928908, "grad_norm": 0.09320899844169617, "learning_rate": 0.00019870267323424313, "loss": 0.268, "step": 152 }, { "epoch": 0.055495103373231776, "grad_norm": 0.09685231000185013, "learning_rate": 0.00019868424678456888, "loss": 0.2745, "step": 153 }, { "epoch": 0.05585781646717446, "grad_norm": 0.10234569013118744, "learning_rate": 0.00019866569126343844, "loss": 0.2948, "step": 154 }, { "epoch": 0.05622052956111716, "grad_norm": 0.09876774251461029, "learning_rate": 0.00019864700669512098, "loss": 0.2808, "step": 155 }, { "epoch": 0.056583242655059846, "grad_norm": 0.10879123955965042, "learning_rate": 0.00019862819310405449, "loss": 0.2745, "step": 156 }, { "epoch": 0.05694595574900254, "grad_norm": 0.10035258531570435, "learning_rate": 0.00019860925051484572, "loss": 0.3027, "step": 157 }, { "epoch": 0.05730866884294523, "grad_norm": 0.098017618060112, "learning_rate": 0.00019859017895227014, "loss": 0.2844, "step": 158 }, { "epoch": 0.05767138193688792, "grad_norm": 0.09496638178825378, "learning_rate": 0.00019857097844127187, "loss": 0.2852, "step": 159 }, { "epoch": 0.05803409503083061, "grad_norm": 0.10773288458585739, "learning_rate": 0.00019855164900696375, "loss": 0.3112, "step": 160 }, { "epoch": 0.058396808124773304, "grad_norm": 0.09997101873159409, "learning_rate": 0.00019853219067462717, "loss": 0.2913, "step": 161 }, { "epoch": 0.058759521218716, "grad_norm": 0.09856441617012024, "learning_rate": 0.00019851260346971214, "loss": 0.2753, "step": 162 }, { "epoch": 0.059122234312658686, "grad_norm": 0.10671742260456085, "learning_rate": 0.00019849288741783728, "loss": 0.2958, "step": 163 }, { "epoch": 0.05948494740660138, "grad_norm": 0.10415424406528473, "learning_rate": 0.0001984730425447896, "loss": 0.284, "step": 164 }, { "epoch": 0.05984766050054407, "grad_norm": 0.10045934468507767, "learning_rate": 0.00019845306887652476, "loss": 0.281, "step": 165 }, { "epoch": 0.06021037359448676, "grad_norm": 0.10365572571754456, "learning_rate": 0.0001984329664391667, "loss": 0.3186, "step": 166 }, { "epoch": 0.06057308668842945, "grad_norm": 0.10675114393234253, "learning_rate": 0.00019841273525900794, "loss": 0.2774, "step": 167 }, { "epoch": 0.060935799782372145, "grad_norm": 0.100840725004673, "learning_rate": 0.0001983923753625093, "loss": 0.2723, "step": 168 }, { "epoch": 0.06129851287631483, "grad_norm": 0.09524688124656677, "learning_rate": 0.0001983718867763, "loss": 0.2679, "step": 169 }, { "epoch": 0.06166122597025753, "grad_norm": 0.10454592853784561, "learning_rate": 0.0001983512695271775, "loss": 0.2779, "step": 170 }, { "epoch": 0.062023939064200215, "grad_norm": 0.11385498940944672, "learning_rate": 0.00019833052364210757, "loss": 0.2892, "step": 171 }, { "epoch": 0.06238665215814291, "grad_norm": 0.10297231376171112, "learning_rate": 0.00019830964914822433, "loss": 0.2885, "step": 172 }, { "epoch": 0.0627493652520856, "grad_norm": 0.10694777965545654, "learning_rate": 0.00019828864607282994, "loss": 0.2951, "step": 173 }, { "epoch": 0.06311207834602829, "grad_norm": 0.10187729448080063, "learning_rate": 0.00019826751444339483, "loss": 0.267, "step": 174 }, { "epoch": 0.06347479143997098, "grad_norm": 0.10256768018007278, "learning_rate": 0.0001982462542875576, "loss": 0.2812, "step": 175 }, { "epoch": 0.06383750453391368, "grad_norm": 0.106157086789608, "learning_rate": 0.0001982248656331249, "loss": 0.2617, "step": 176 }, { "epoch": 0.06420021762785637, "grad_norm": 0.10591990500688553, "learning_rate": 0.00019820334850807143, "loss": 0.2792, "step": 177 }, { "epoch": 0.06456293072179906, "grad_norm": 0.10539959371089935, "learning_rate": 0.00019818170294053994, "loss": 0.2817, "step": 178 }, { "epoch": 0.06492564381574174, "grad_norm": 0.10033068805932999, "learning_rate": 0.00019815992895884122, "loss": 0.2917, "step": 179 }, { "epoch": 0.06528835690968444, "grad_norm": 0.11100872606039047, "learning_rate": 0.00019813802659145394, "loss": 0.276, "step": 180 }, { "epoch": 0.06565107000362713, "grad_norm": 0.10445630550384521, "learning_rate": 0.0001981159958670247, "loss": 0.3308, "step": 181 }, { "epoch": 0.06601378309756982, "grad_norm": 0.09888961911201477, "learning_rate": 0.00019809383681436809, "loss": 0.2651, "step": 182 }, { "epoch": 0.06637649619151251, "grad_norm": 0.10630346089601517, "learning_rate": 0.00019807154946246635, "loss": 0.2674, "step": 183 }, { "epoch": 0.06673920928545521, "grad_norm": 0.09556199610233307, "learning_rate": 0.00019804913384046974, "loss": 0.2988, "step": 184 }, { "epoch": 0.0671019223793979, "grad_norm": 0.10325701534748077, "learning_rate": 0.0001980265899776961, "loss": 0.2821, "step": 185 }, { "epoch": 0.06746463547334058, "grad_norm": 0.09466871619224548, "learning_rate": 0.00019800391790363112, "loss": 0.2632, "step": 186 }, { "epoch": 0.06782734856728329, "grad_norm": 0.09646070003509521, "learning_rate": 0.00019798111764792814, "loss": 0.2888, "step": 187 }, { "epoch": 0.06819006166122597, "grad_norm": 0.09636171907186508, "learning_rate": 0.00019795818924040815, "loss": 0.2766, "step": 188 }, { "epoch": 0.06855277475516866, "grad_norm": 0.10880020260810852, "learning_rate": 0.00019793513271105975, "loss": 0.3053, "step": 189 }, { "epoch": 0.06891548784911135, "grad_norm": 0.11933793127536774, "learning_rate": 0.0001979119480900391, "loss": 0.2903, "step": 190 }, { "epoch": 0.06927820094305405, "grad_norm": 0.1342136114835739, "learning_rate": 0.00019788863540766996, "loss": 0.2912, "step": 191 }, { "epoch": 0.06964091403699674, "grad_norm": 0.1037123054265976, "learning_rate": 0.0001978651946944435, "loss": 0.3044, "step": 192 }, { "epoch": 0.07000362713093942, "grad_norm": 0.11920095235109329, "learning_rate": 0.00019784162598101838, "loss": 0.2859, "step": 193 }, { "epoch": 0.07036634022488211, "grad_norm": 0.11973892152309418, "learning_rate": 0.00019781792929822068, "loss": 0.2959, "step": 194 }, { "epoch": 0.07072905331882481, "grad_norm": 0.11078456044197083, "learning_rate": 0.00019779410467704389, "loss": 0.2769, "step": 195 }, { "epoch": 0.0710917664127675, "grad_norm": 0.11091899126768112, "learning_rate": 0.00019777015214864877, "loss": 0.2832, "step": 196 }, { "epoch": 0.07145447950671019, "grad_norm": 0.09678234905004501, "learning_rate": 0.00019774607174436338, "loss": 0.2455, "step": 197 }, { "epoch": 0.07181719260065289, "grad_norm": 0.11300257593393326, "learning_rate": 0.00019772186349568304, "loss": 0.3242, "step": 198 }, { "epoch": 0.07217990569459558, "grad_norm": 0.1536862999200821, "learning_rate": 0.00019769752743427032, "loss": 0.2901, "step": 199 }, { "epoch": 0.07254261878853827, "grad_norm": 0.10081265866756439, "learning_rate": 0.00019767306359195493, "loss": 0.3059, "step": 200 }, { "epoch": 0.07290533188248095, "grad_norm": 0.10079798847436905, "learning_rate": 0.0001976484720007337, "loss": 0.2871, "step": 201 }, { "epoch": 0.07326804497642365, "grad_norm": 0.09981225430965424, "learning_rate": 0.00019762375269277054, "loss": 0.2713, "step": 202 }, { "epoch": 0.07363075807036634, "grad_norm": 0.10104259103536606, "learning_rate": 0.00019759890570039644, "loss": 0.3178, "step": 203 }, { "epoch": 0.07399347116430903, "grad_norm": 0.10694817453622818, "learning_rate": 0.00019757393105610934, "loss": 0.2725, "step": 204 }, { "epoch": 0.07435618425825172, "grad_norm": 0.10432042181491852, "learning_rate": 0.0001975488287925742, "loss": 0.2798, "step": 205 }, { "epoch": 0.07471889735219442, "grad_norm": 0.11903175711631775, "learning_rate": 0.00019752359894262283, "loss": 0.3138, "step": 206 }, { "epoch": 0.0750816104461371, "grad_norm": 0.10495443642139435, "learning_rate": 0.00019749824153925396, "loss": 0.2764, "step": 207 }, { "epoch": 0.0754443235400798, "grad_norm": 0.10551683604717255, "learning_rate": 0.00019747275661563312, "loss": 0.2884, "step": 208 }, { "epoch": 0.07580703663402248, "grad_norm": 0.12931138277053833, "learning_rate": 0.00019744714420509273, "loss": 0.2843, "step": 209 }, { "epoch": 0.07616974972796518, "grad_norm": 0.10500820726156235, "learning_rate": 0.0001974214043411317, "loss": 0.298, "step": 210 }, { "epoch": 0.07653246282190787, "grad_norm": 0.10469575226306915, "learning_rate": 0.000197395537057416, "loss": 0.2775, "step": 211 }, { "epoch": 0.07689517591585056, "grad_norm": 0.11616349220275879, "learning_rate": 0.00019736954238777792, "loss": 0.2868, "step": 212 }, { "epoch": 0.07725788900979326, "grad_norm": 0.10852184146642685, "learning_rate": 0.00019734342036621652, "loss": 0.2634, "step": 213 }, { "epoch": 0.07762060210373595, "grad_norm": 0.11353151500225067, "learning_rate": 0.00019731717102689747, "loss": 0.2988, "step": 214 }, { "epoch": 0.07798331519767863, "grad_norm": 0.10728183388710022, "learning_rate": 0.00019729079440415287, "loss": 0.273, "step": 215 }, { "epoch": 0.07834602829162132, "grad_norm": 0.11151303350925446, "learning_rate": 0.0001972642905324813, "loss": 0.282, "step": 216 }, { "epoch": 0.07870874138556402, "grad_norm": 0.1237482950091362, "learning_rate": 0.00019723765944654783, "loss": 0.2744, "step": 217 }, { "epoch": 0.07907145447950671, "grad_norm": 0.10815929621458054, "learning_rate": 0.0001972109011811839, "loss": 0.2893, "step": 218 }, { "epoch": 0.0794341675734494, "grad_norm": 0.1144891083240509, "learning_rate": 0.00019718401577138725, "loss": 0.3018, "step": 219 }, { "epoch": 0.07979688066739209, "grad_norm": 0.1146797463297844, "learning_rate": 0.00019715700325232194, "loss": 0.2759, "step": 220 }, { "epoch": 0.08015959376133479, "grad_norm": 0.1100744977593422, "learning_rate": 0.00019712986365931826, "loss": 0.2824, "step": 221 }, { "epoch": 0.08052230685527748, "grad_norm": 0.12042435258626938, "learning_rate": 0.0001971025970278728, "loss": 0.2683, "step": 222 }, { "epoch": 0.08088501994922016, "grad_norm": 0.11394108831882477, "learning_rate": 0.00019707520339364818, "loss": 0.312, "step": 223 }, { "epoch": 0.08124773304316286, "grad_norm": 0.10353437066078186, "learning_rate": 0.00019704768279247317, "loss": 0.2673, "step": 224 }, { "epoch": 0.08161044613710555, "grad_norm": 0.0966782197356224, "learning_rate": 0.00019702003526034264, "loss": 0.2995, "step": 225 }, { "epoch": 0.08197315923104824, "grad_norm": 0.11248703300952911, "learning_rate": 0.00019699226083341742, "loss": 0.2588, "step": 226 }, { "epoch": 0.08233587232499093, "grad_norm": 0.10794703662395477, "learning_rate": 0.00019696435954802438, "loss": 0.2594, "step": 227 }, { "epoch": 0.08269858541893363, "grad_norm": 0.1097991019487381, "learning_rate": 0.0001969363314406562, "loss": 0.2691, "step": 228 }, { "epoch": 0.08306129851287632, "grad_norm": 0.10738769918680191, "learning_rate": 0.00019690817654797161, "loss": 0.2811, "step": 229 }, { "epoch": 0.083424011606819, "grad_norm": 0.10677637159824371, "learning_rate": 0.00019687989490679503, "loss": 0.2864, "step": 230 }, { "epoch": 0.08378672470076169, "grad_norm": 0.11440913379192352, "learning_rate": 0.00019685148655411658, "loss": 0.2961, "step": 231 }, { "epoch": 0.0841494377947044, "grad_norm": 0.10899066925048828, "learning_rate": 0.00019682295152709234, "loss": 0.2852, "step": 232 }, { "epoch": 0.08451215088864708, "grad_norm": 0.10460548102855682, "learning_rate": 0.00019679428986304386, "loss": 0.2954, "step": 233 }, { "epoch": 0.08487486398258977, "grad_norm": 0.12301474809646606, "learning_rate": 0.00019676550159945845, "loss": 0.263, "step": 234 }, { "epoch": 0.08523757707653247, "grad_norm": 0.11282453685998917, "learning_rate": 0.000196736586773989, "loss": 0.3135, "step": 235 }, { "epoch": 0.08560029017047516, "grad_norm": 0.11679442226886749, "learning_rate": 0.0001967075454244538, "loss": 0.287, "step": 236 }, { "epoch": 0.08596300326441784, "grad_norm": 0.11096673458814621, "learning_rate": 0.0001966783775888368, "loss": 0.295, "step": 237 }, { "epoch": 0.08632571635836053, "grad_norm": 0.1101219430565834, "learning_rate": 0.00019664908330528725, "loss": 0.2694, "step": 238 }, { "epoch": 0.08668842945230323, "grad_norm": 0.10985169559717178, "learning_rate": 0.00019661966261211983, "loss": 0.2734, "step": 239 }, { "epoch": 0.08705114254624592, "grad_norm": 0.11106691509485245, "learning_rate": 0.0001965901155478146, "loss": 0.2781, "step": 240 }, { "epoch": 0.08741385564018861, "grad_norm": 0.1100887879729271, "learning_rate": 0.00019656044215101684, "loss": 0.3105, "step": 241 }, { "epoch": 0.0877765687341313, "grad_norm": 0.11487387865781784, "learning_rate": 0.00019653064246053707, "loss": 0.2824, "step": 242 }, { "epoch": 0.088139281828074, "grad_norm": 0.10977080464363098, "learning_rate": 0.00019650071651535104, "loss": 0.3309, "step": 243 }, { "epoch": 0.08850199492201669, "grad_norm": 0.11280547827482224, "learning_rate": 0.0001964706643545996, "loss": 0.2698, "step": 244 }, { "epoch": 0.08886470801595937, "grad_norm": 0.10025591403245926, "learning_rate": 0.00019644048601758865, "loss": 0.2623, "step": 245 }, { "epoch": 0.08922742110990206, "grad_norm": 0.10023844242095947, "learning_rate": 0.0001964101815437892, "loss": 0.2711, "step": 246 }, { "epoch": 0.08959013420384476, "grad_norm": 0.1235634833574295, "learning_rate": 0.0001963797509728371, "loss": 0.2884, "step": 247 }, { "epoch": 0.08995284729778745, "grad_norm": 0.10354435443878174, "learning_rate": 0.0001963491943445333, "loss": 0.2601, "step": 248 }, { "epoch": 0.09031556039173014, "grad_norm": 0.10399331152439117, "learning_rate": 0.00019631851169884352, "loss": 0.2817, "step": 249 }, { "epoch": 0.09067827348567284, "grad_norm": 0.11649379879236221, "learning_rate": 0.00019628770307589827, "loss": 0.3344, "step": 250 }, { "epoch": 0.09104098657961553, "grad_norm": 0.1313096284866333, "learning_rate": 0.00019625676851599288, "loss": 0.326, "step": 251 }, { "epoch": 0.09140369967355821, "grad_norm": 0.11555227637290955, "learning_rate": 0.00019622570805958746, "loss": 0.2687, "step": 252 }, { "epoch": 0.0917664127675009, "grad_norm": 0.1436738669872284, "learning_rate": 0.00019619452174730667, "loss": 0.2748, "step": 253 }, { "epoch": 0.0921291258614436, "grad_norm": 0.11013220995664597, "learning_rate": 0.0001961632096199398, "loss": 0.2556, "step": 254 }, { "epoch": 0.09249183895538629, "grad_norm": 0.11054322123527527, "learning_rate": 0.00019613177171844075, "loss": 0.2813, "step": 255 }, { "epoch": 0.09285455204932898, "grad_norm": 0.10872920602560043, "learning_rate": 0.00019610020808392788, "loss": 0.3022, "step": 256 }, { "epoch": 0.09321726514327167, "grad_norm": 0.12032327055931091, "learning_rate": 0.000196068518757684, "loss": 0.2836, "step": 257 }, { "epoch": 0.09357997823721437, "grad_norm": 0.10551446676254272, "learning_rate": 0.0001960367037811564, "loss": 0.281, "step": 258 }, { "epoch": 0.09394269133115705, "grad_norm": 0.11461377888917923, "learning_rate": 0.00019600476319595658, "loss": 0.2841, "step": 259 }, { "epoch": 0.09430540442509974, "grad_norm": 0.11937367916107178, "learning_rate": 0.00019597269704386036, "loss": 0.2695, "step": 260 }, { "epoch": 0.09466811751904244, "grad_norm": 0.109502412378788, "learning_rate": 0.0001959405053668079, "loss": 0.2796, "step": 261 }, { "epoch": 0.09503083061298513, "grad_norm": 0.12356701493263245, "learning_rate": 0.00019590818820690336, "loss": 0.2963, "step": 262 }, { "epoch": 0.09539354370692782, "grad_norm": 0.1127593144774437, "learning_rate": 0.00019587574560641518, "loss": 0.2646, "step": 263 }, { "epoch": 0.0957562568008705, "grad_norm": 0.13234767317771912, "learning_rate": 0.00019584317760777578, "loss": 0.2816, "step": 264 }, { "epoch": 0.09611896989481321, "grad_norm": 0.10984192788600922, "learning_rate": 0.00019581048425358158, "loss": 0.3069, "step": 265 }, { "epoch": 0.0964816829887559, "grad_norm": 0.1149398684501648, "learning_rate": 0.00019577766558659306, "loss": 0.2574, "step": 266 }, { "epoch": 0.09684439608269858, "grad_norm": 0.10994721949100494, "learning_rate": 0.00019574472164973452, "loss": 0.2705, "step": 267 }, { "epoch": 0.09720710917664127, "grad_norm": 0.10396052896976471, "learning_rate": 0.00019571165248609407, "loss": 0.2343, "step": 268 }, { "epoch": 0.09756982227058397, "grad_norm": 0.1382754147052765, "learning_rate": 0.00019567845813892368, "loss": 0.2586, "step": 269 }, { "epoch": 0.09793253536452666, "grad_norm": 0.10811847448348999, "learning_rate": 0.000195645138651639, "loss": 0.2599, "step": 270 }, { "epoch": 0.09829524845846935, "grad_norm": 0.12254346907138824, "learning_rate": 0.00019561169406781938, "loss": 0.2543, "step": 271 }, { "epoch": 0.09865796155241205, "grad_norm": 0.10719288885593414, "learning_rate": 0.00019557812443120779, "loss": 0.2788, "step": 272 }, { "epoch": 0.09902067464635474, "grad_norm": 0.11490897834300995, "learning_rate": 0.00019554442978571076, "loss": 0.3076, "step": 273 }, { "epoch": 0.09938338774029742, "grad_norm": 0.11272160708904266, "learning_rate": 0.00019551061017539828, "loss": 0.2719, "step": 274 }, { "epoch": 0.09974610083424011, "grad_norm": 0.11950589716434479, "learning_rate": 0.00019547666564450383, "loss": 0.2424, "step": 275 }, { "epoch": 0.10010881392818281, "grad_norm": 0.10737808048725128, "learning_rate": 0.00019544259623742428, "loss": 0.2628, "step": 276 }, { "epoch": 0.1004715270221255, "grad_norm": 0.10422177612781525, "learning_rate": 0.00019540840199871982, "loss": 0.2515, "step": 277 }, { "epoch": 0.10083424011606819, "grad_norm": 0.12654827535152435, "learning_rate": 0.00019537408297311384, "loss": 0.3258, "step": 278 }, { "epoch": 0.10119695321001088, "grad_norm": 0.10753121972084045, "learning_rate": 0.00019533963920549306, "loss": 0.2633, "step": 279 }, { "epoch": 0.10155966630395358, "grad_norm": 0.1134246215224266, "learning_rate": 0.0001953050707409073, "loss": 0.2777, "step": 280 }, { "epoch": 0.10192237939789626, "grad_norm": 0.11118260025978088, "learning_rate": 0.00019527037762456944, "loss": 0.2684, "step": 281 }, { "epoch": 0.10228509249183895, "grad_norm": 0.12425535172224045, "learning_rate": 0.0001952355599018554, "loss": 0.28, "step": 282 }, { "epoch": 0.10264780558578164, "grad_norm": 0.12097672373056412, "learning_rate": 0.00019520061761830424, "loss": 0.2589, "step": 283 }, { "epoch": 0.10301051867972434, "grad_norm": 0.11388805508613586, "learning_rate": 0.00019516555081961764, "loss": 0.2864, "step": 284 }, { "epoch": 0.10337323177366703, "grad_norm": 0.10794699192047119, "learning_rate": 0.00019513035955166035, "loss": 0.2754, "step": 285 }, { "epoch": 0.10373594486760972, "grad_norm": 0.10783129185438156, "learning_rate": 0.00019509504386045986, "loss": 0.252, "step": 286 }, { "epoch": 0.10409865796155242, "grad_norm": 0.12570741772651672, "learning_rate": 0.0001950596037922064, "loss": 0.2563, "step": 287 }, { "epoch": 0.1044613710554951, "grad_norm": 0.12100599706172943, "learning_rate": 0.0001950240393932529, "loss": 0.2811, "step": 288 }, { "epoch": 0.1048240841494378, "grad_norm": 0.09901045262813568, "learning_rate": 0.0001949883507101148, "loss": 0.2724, "step": 289 }, { "epoch": 0.10518679724338048, "grad_norm": 0.10405360162258148, "learning_rate": 0.00019495253778947026, "loss": 0.274, "step": 290 }, { "epoch": 0.10554951033732318, "grad_norm": 0.11303572356700897, "learning_rate": 0.0001949166006781598, "loss": 0.2669, "step": 291 }, { "epoch": 0.10591222343126587, "grad_norm": 0.1083337813615799, "learning_rate": 0.0001948805394231864, "loss": 0.2865, "step": 292 }, { "epoch": 0.10627493652520856, "grad_norm": 0.10910173505544662, "learning_rate": 0.00019484435407171545, "loss": 0.2651, "step": 293 }, { "epoch": 0.10663764961915125, "grad_norm": 0.10337372124195099, "learning_rate": 0.00019480804467107463, "loss": 0.2509, "step": 294 }, { "epoch": 0.10700036271309395, "grad_norm": 0.1112721636891365, "learning_rate": 0.00019477161126875387, "loss": 0.2666, "step": 295 }, { "epoch": 0.10736307580703663, "grad_norm": 0.11390243470668793, "learning_rate": 0.00019473505391240522, "loss": 0.278, "step": 296 }, { "epoch": 0.10772578890097932, "grad_norm": 0.11081282794475555, "learning_rate": 0.000194698372649843, "loss": 0.2725, "step": 297 }, { "epoch": 0.10808850199492202, "grad_norm": 0.12400209158658981, "learning_rate": 0.00019466156752904343, "loss": 0.2812, "step": 298 }, { "epoch": 0.10845121508886471, "grad_norm": 0.11567061394453049, "learning_rate": 0.0001946246385981448, "loss": 0.2907, "step": 299 }, { "epoch": 0.1088139281828074, "grad_norm": 0.11256127059459686, "learning_rate": 0.0001945875859054474, "loss": 0.2537, "step": 300 }, { "epoch": 0.10917664127675009, "grad_norm": 0.12261880189180374, "learning_rate": 0.0001945504094994132, "loss": 0.2726, "step": 301 }, { "epoch": 0.10953935437069279, "grad_norm": 0.10978831350803375, "learning_rate": 0.00019451310942866621, "loss": 0.2578, "step": 302 }, { "epoch": 0.10990206746463548, "grad_norm": 0.12203028053045273, "learning_rate": 0.00019447568574199202, "loss": 0.2685, "step": 303 }, { "epoch": 0.11026478055857816, "grad_norm": 0.11995328217744827, "learning_rate": 0.000194438138488338, "loss": 0.2914, "step": 304 }, { "epoch": 0.11062749365252085, "grad_norm": 0.1177087351679802, "learning_rate": 0.000194400467716813, "loss": 0.2576, "step": 305 }, { "epoch": 0.11099020674646355, "grad_norm": 0.11549436300992966, "learning_rate": 0.00019436267347668757, "loss": 0.2789, "step": 306 }, { "epoch": 0.11135291984040624, "grad_norm": 0.12319694459438324, "learning_rate": 0.0001943247558173937, "loss": 0.2676, "step": 307 }, { "epoch": 0.11171563293434893, "grad_norm": 0.13126415014266968, "learning_rate": 0.00019428671478852479, "loss": 0.2612, "step": 308 }, { "epoch": 0.11207834602829161, "grad_norm": 0.11185677349567413, "learning_rate": 0.00019424855043983556, "loss": 0.2607, "step": 309 }, { "epoch": 0.11244105912223432, "grad_norm": 0.1092672273516655, "learning_rate": 0.00019421026282124212, "loss": 0.2521, "step": 310 }, { "epoch": 0.112803772216177, "grad_norm": 0.12753579020500183, "learning_rate": 0.00019417185198282168, "loss": 0.2876, "step": 311 }, { "epoch": 0.11316648531011969, "grad_norm": 0.11622543632984161, "learning_rate": 0.00019413331797481277, "loss": 0.2656, "step": 312 }, { "epoch": 0.11352919840406239, "grad_norm": 0.11567405611276627, "learning_rate": 0.00019409466084761485, "loss": 0.2836, "step": 313 }, { "epoch": 0.11389191149800508, "grad_norm": 0.11441784352064133, "learning_rate": 0.00019405588065178852, "loss": 0.2523, "step": 314 }, { "epoch": 0.11425462459194777, "grad_norm": 0.11300231516361237, "learning_rate": 0.0001940169774380553, "loss": 0.2804, "step": 315 }, { "epoch": 0.11461733768589046, "grad_norm": 0.12194045633077621, "learning_rate": 0.00019397795125729767, "loss": 0.2867, "step": 316 }, { "epoch": 0.11498005077983316, "grad_norm": 0.12124588340520859, "learning_rate": 0.00019393880216055887, "loss": 0.2859, "step": 317 }, { "epoch": 0.11534276387377584, "grad_norm": 0.11623072624206543, "learning_rate": 0.00019389953019904285, "loss": 0.288, "step": 318 }, { "epoch": 0.11570547696771853, "grad_norm": 0.11297620832920074, "learning_rate": 0.00019386013542411449, "loss": 0.2896, "step": 319 }, { "epoch": 0.11606819006166122, "grad_norm": 0.11987963318824768, "learning_rate": 0.00019382061788729898, "loss": 0.3479, "step": 320 }, { "epoch": 0.11643090315560392, "grad_norm": 0.14857983589172363, "learning_rate": 0.00019378097764028235, "loss": 0.2519, "step": 321 }, { "epoch": 0.11679361624954661, "grad_norm": 0.10684715956449509, "learning_rate": 0.00019374121473491096, "loss": 0.3014, "step": 322 }, { "epoch": 0.1171563293434893, "grad_norm": 0.11060940474271774, "learning_rate": 0.0001937013292231917, "loss": 0.2522, "step": 323 }, { "epoch": 0.117519042437432, "grad_norm": 0.10806398838758469, "learning_rate": 0.00019366132115729173, "loss": 0.2695, "step": 324 }, { "epoch": 0.11788175553137469, "grad_norm": 0.11272536218166351, "learning_rate": 0.0001936211905895386, "loss": 0.2666, "step": 325 }, { "epoch": 0.11824446862531737, "grad_norm": 0.11766637116670609, "learning_rate": 0.00019358093757241996, "loss": 0.3007, "step": 326 }, { "epoch": 0.11860718171926006, "grad_norm": 0.1170196607708931, "learning_rate": 0.0001935405621585837, "loss": 0.2678, "step": 327 }, { "epoch": 0.11896989481320276, "grad_norm": 0.12220901250839233, "learning_rate": 0.0001935000644008378, "loss": 0.2519, "step": 328 }, { "epoch": 0.11933260790714545, "grad_norm": 0.1201847493648529, "learning_rate": 0.00019345944435215023, "loss": 0.267, "step": 329 }, { "epoch": 0.11969532100108814, "grad_norm": 0.11570829898118973, "learning_rate": 0.00019341870206564886, "loss": 0.2515, "step": 330 }, { "epoch": 0.12005803409503082, "grad_norm": 0.12002036720514297, "learning_rate": 0.0001933778375946216, "loss": 0.2767, "step": 331 }, { "epoch": 0.12042074718897353, "grad_norm": 0.12402871996164322, "learning_rate": 0.00019333685099251594, "loss": 0.2508, "step": 332 }, { "epoch": 0.12078346028291621, "grad_norm": 0.11982254683971405, "learning_rate": 0.00019329574231293926, "loss": 0.2802, "step": 333 }, { "epoch": 0.1211461733768589, "grad_norm": 0.11482241749763489, "learning_rate": 0.0001932545116096586, "loss": 0.2774, "step": 334 }, { "epoch": 0.1215088864708016, "grad_norm": 0.1279384046792984, "learning_rate": 0.00019321315893660056, "loss": 0.2718, "step": 335 }, { "epoch": 0.12187159956474429, "grad_norm": 0.11594551056623459, "learning_rate": 0.00019317168434785127, "loss": 0.2771, "step": 336 }, { "epoch": 0.12223431265868698, "grad_norm": 0.1129961609840393, "learning_rate": 0.0001931300878976563, "loss": 0.2602, "step": 337 }, { "epoch": 0.12259702575262967, "grad_norm": 0.11392521858215332, "learning_rate": 0.0001930883696404207, "loss": 0.2595, "step": 338 }, { "epoch": 0.12295973884657237, "grad_norm": 0.10742796212434769, "learning_rate": 0.0001930465296307087, "loss": 0.2473, "step": 339 }, { "epoch": 0.12332245194051505, "grad_norm": 0.11807534843683243, "learning_rate": 0.00019300456792324382, "loss": 0.2374, "step": 340 }, { "epoch": 0.12368516503445774, "grad_norm": 0.13207505643367767, "learning_rate": 0.00019296248457290882, "loss": 0.2732, "step": 341 }, { "epoch": 0.12404787812840043, "grad_norm": 0.13366468250751495, "learning_rate": 0.00019292027963474547, "loss": 0.2702, "step": 342 }, { "epoch": 0.12441059122234313, "grad_norm": 0.1288871318101883, "learning_rate": 0.00019287795316395468, "loss": 0.2667, "step": 343 }, { "epoch": 0.12477330431628582, "grad_norm": 0.11883368343114853, "learning_rate": 0.00019283550521589614, "loss": 0.2666, "step": 344 }, { "epoch": 0.1251360174102285, "grad_norm": 0.1264144480228424, "learning_rate": 0.00019279293584608856, "loss": 0.2795, "step": 345 }, { "epoch": 0.1254987305041712, "grad_norm": 0.12721741199493408, "learning_rate": 0.0001927502451102095, "loss": 0.2516, "step": 346 }, { "epoch": 0.12586144359811388, "grad_norm": 0.1189354807138443, "learning_rate": 0.00019270743306409505, "loss": 0.2489, "step": 347 }, { "epoch": 0.12622415669205658, "grad_norm": 0.12466361373662949, "learning_rate": 0.00019266449976374018, "loss": 0.2856, "step": 348 }, { "epoch": 0.12658686978599928, "grad_norm": 0.13144852221012115, "learning_rate": 0.00019262144526529832, "loss": 0.2612, "step": 349 }, { "epoch": 0.12694958287994196, "grad_norm": 0.10754833370447159, "learning_rate": 0.0001925782696250815, "loss": 0.2523, "step": 350 }, { "epoch": 0.12731229597388466, "grad_norm": 0.1237715408205986, "learning_rate": 0.0001925349728995602, "loss": 0.2526, "step": 351 }, { "epoch": 0.12767500906782736, "grad_norm": 0.1193939596414566, "learning_rate": 0.00019249155514536312, "loss": 0.2819, "step": 352 }, { "epoch": 0.12803772216177003, "grad_norm": 0.12648704648017883, "learning_rate": 0.00019244801641927746, "loss": 0.2709, "step": 353 }, { "epoch": 0.12840043525571274, "grad_norm": 0.11707579344511032, "learning_rate": 0.0001924043567782485, "loss": 0.2853, "step": 354 }, { "epoch": 0.1287631483496554, "grad_norm": 0.12175849080085754, "learning_rate": 0.00019236057627937975, "loss": 0.2702, "step": 355 }, { "epoch": 0.1291258614435981, "grad_norm": 0.1120310127735138, "learning_rate": 0.0001923166749799327, "loss": 0.2596, "step": 356 }, { "epoch": 0.1294885745375408, "grad_norm": 0.12282121926546097, "learning_rate": 0.00019227265293732693, "loss": 0.2581, "step": 357 }, { "epoch": 0.1298512876314835, "grad_norm": 0.13752269744873047, "learning_rate": 0.00019222851020913995, "loss": 0.2641, "step": 358 }, { "epoch": 0.1302140007254262, "grad_norm": 0.11744178086519241, "learning_rate": 0.00019218424685310702, "loss": 0.2462, "step": 359 }, { "epoch": 0.1305767138193689, "grad_norm": 0.11440069228410721, "learning_rate": 0.00019213986292712125, "loss": 0.2495, "step": 360 }, { "epoch": 0.13093942691331156, "grad_norm": 0.11646847426891327, "learning_rate": 0.00019209535848923343, "loss": 0.3054, "step": 361 }, { "epoch": 0.13130214000725426, "grad_norm": 0.11386696994304657, "learning_rate": 0.00019205073359765192, "loss": 0.2503, "step": 362 }, { "epoch": 0.13166485310119697, "grad_norm": 0.12510043382644653, "learning_rate": 0.00019200598831074274, "loss": 0.275, "step": 363 }, { "epoch": 0.13202756619513964, "grad_norm": 0.12363200634717941, "learning_rate": 0.00019196112268702925, "loss": 0.2746, "step": 364 }, { "epoch": 0.13239027928908234, "grad_norm": 0.11029732972383499, "learning_rate": 0.0001919161367851923, "loss": 0.3095, "step": 365 }, { "epoch": 0.13275299238302501, "grad_norm": 0.12199590355157852, "learning_rate": 0.00019187103066406998, "loss": 0.2641, "step": 366 }, { "epoch": 0.13311570547696772, "grad_norm": 0.11692757904529572, "learning_rate": 0.00019182580438265764, "loss": 0.2646, "step": 367 }, { "epoch": 0.13347841857091042, "grad_norm": 0.11142277717590332, "learning_rate": 0.00019178045800010787, "loss": 0.2495, "step": 368 }, { "epoch": 0.1338411316648531, "grad_norm": 0.11492447555065155, "learning_rate": 0.00019173499157573023, "loss": 0.2647, "step": 369 }, { "epoch": 0.1342038447587958, "grad_norm": 0.114183709025383, "learning_rate": 0.0001916894051689913, "loss": 0.2499, "step": 370 }, { "epoch": 0.1345665578527385, "grad_norm": 0.11262322962284088, "learning_rate": 0.00019164369883951468, "loss": 0.2749, "step": 371 }, { "epoch": 0.13492927094668117, "grad_norm": 0.11667259782552719, "learning_rate": 0.0001915978726470807, "loss": 0.269, "step": 372 }, { "epoch": 0.13529198404062387, "grad_norm": 0.1220724880695343, "learning_rate": 0.00019155192665162656, "loss": 0.2652, "step": 373 }, { "epoch": 0.13565469713456657, "grad_norm": 0.12185841798782349, "learning_rate": 0.0001915058609132461, "loss": 0.2754, "step": 374 }, { "epoch": 0.13601741022850924, "grad_norm": 0.11733336001634598, "learning_rate": 0.00019145967549218974, "loss": 0.2685, "step": 375 }, { "epoch": 0.13638012332245195, "grad_norm": 0.12325771152973175, "learning_rate": 0.00019141337044886457, "loss": 0.2548, "step": 376 }, { "epoch": 0.13674283641639462, "grad_norm": 0.11737928539514542, "learning_rate": 0.000191366945843834, "loss": 0.2875, "step": 377 }, { "epoch": 0.13710554951033732, "grad_norm": 0.11719442158937454, "learning_rate": 0.00019132040173781788, "loss": 0.244, "step": 378 }, { "epoch": 0.13746826260428002, "grad_norm": 0.1146400049328804, "learning_rate": 0.0001912737381916923, "loss": 0.2595, "step": 379 }, { "epoch": 0.1378309756982227, "grad_norm": 0.11577652394771576, "learning_rate": 0.00019122695526648968, "loss": 0.276, "step": 380 }, { "epoch": 0.1381936887921654, "grad_norm": 0.10648276656866074, "learning_rate": 0.00019118005302339847, "loss": 0.2444, "step": 381 }, { "epoch": 0.1385564018861081, "grad_norm": 0.10874751210212708, "learning_rate": 0.00019113303152376324, "loss": 0.2502, "step": 382 }, { "epoch": 0.13891911498005077, "grad_norm": 0.1190841868519783, "learning_rate": 0.00019108589082908453, "loss": 0.2477, "step": 383 }, { "epoch": 0.13928182807399347, "grad_norm": 0.11433839052915573, "learning_rate": 0.00019103863100101873, "loss": 0.2651, "step": 384 }, { "epoch": 0.13964454116793618, "grad_norm": 0.1088482066988945, "learning_rate": 0.00019099125210137813, "loss": 0.2452, "step": 385 }, { "epoch": 0.14000725426187885, "grad_norm": 0.115386962890625, "learning_rate": 0.00019094375419213065, "loss": 0.2579, "step": 386 }, { "epoch": 0.14036996735582155, "grad_norm": 0.1259610801935196, "learning_rate": 0.0001908961373354, "loss": 0.2712, "step": 387 }, { "epoch": 0.14073268044976422, "grad_norm": 4882568.5, "learning_rate": 0.00019084840159346532, "loss": 0.2385, "step": 388 }, { "epoch": 0.14109539354370693, "grad_norm": 0.12656670808792114, "learning_rate": 0.0001908005470287614, "loss": 0.2406, "step": 389 }, { "epoch": 0.14145810663764963, "grad_norm": 0.13908933103084564, "learning_rate": 0.00019075257370387827, "loss": 0.2433, "step": 390 }, { "epoch": 0.1418208197315923, "grad_norm": 0.14672155678272247, "learning_rate": 0.0001907044816815614, "loss": 0.2544, "step": 391 }, { "epoch": 0.142183532825535, "grad_norm": 0.15031826496124268, "learning_rate": 0.0001906562710247115, "loss": 0.2652, "step": 392 }, { "epoch": 0.1425462459194777, "grad_norm": 0.13194704055786133, "learning_rate": 0.00019060794179638445, "loss": 0.2603, "step": 393 }, { "epoch": 0.14290895901342038, "grad_norm": 0.13189998269081116, "learning_rate": 0.0001905594940597911, "loss": 0.2419, "step": 394 }, { "epoch": 0.14327167210736308, "grad_norm": 0.1245296448469162, "learning_rate": 0.00019051092787829746, "loss": 0.2816, "step": 395 }, { "epoch": 0.14363438520130578, "grad_norm": 0.14372986555099487, "learning_rate": 0.0001904622433154244, "loss": 0.261, "step": 396 }, { "epoch": 0.14399709829524845, "grad_norm": 0.13385535776615143, "learning_rate": 0.00019041344043484754, "loss": 0.2702, "step": 397 }, { "epoch": 0.14435981138919116, "grad_norm": 0.13935022056102753, "learning_rate": 0.00019036451930039738, "loss": 0.2907, "step": 398 }, { "epoch": 0.14472252448313383, "grad_norm": 0.11567000299692154, "learning_rate": 0.00019031547997605902, "loss": 0.2618, "step": 399 }, { "epoch": 0.14508523757707653, "grad_norm": 0.1412486582994461, "learning_rate": 0.0001902663225259721, "loss": 0.3055, "step": 400 }, { "epoch": 0.14544795067101923, "grad_norm": 0.13404829800128937, "learning_rate": 0.00019021704701443083, "loss": 0.2565, "step": 401 }, { "epoch": 0.1458106637649619, "grad_norm": 0.15074236690998077, "learning_rate": 0.00019016765350588389, "loss": 0.2737, "step": 402 }, { "epoch": 0.1461733768589046, "grad_norm": 0.11905822902917862, "learning_rate": 0.00019011814206493411, "loss": 0.2462, "step": 403 }, { "epoch": 0.1465360899528473, "grad_norm": 0.13609488308429718, "learning_rate": 0.00019006851275633871, "loss": 0.3008, "step": 404 }, { "epoch": 0.14689880304678998, "grad_norm": 0.13262596726417542, "learning_rate": 0.00019001876564500909, "loss": 0.2682, "step": 405 }, { "epoch": 0.14726151614073268, "grad_norm": 0.12421231716871262, "learning_rate": 0.00018996890079601059, "loss": 0.2553, "step": 406 }, { "epoch": 0.14762422923467536, "grad_norm": 0.14463739097118378, "learning_rate": 0.00018991891827456266, "loss": 0.2483, "step": 407 }, { "epoch": 0.14798694232861806, "grad_norm": 0.12037564069032669, "learning_rate": 0.00018986881814603862, "loss": 0.2807, "step": 408 }, { "epoch": 0.14834965542256076, "grad_norm": 0.1340160369873047, "learning_rate": 0.0001898186004759656, "loss": 0.248, "step": 409 }, { "epoch": 0.14871236851650343, "grad_norm": 0.13164542615413666, "learning_rate": 0.0001897682653300245, "loss": 0.2617, "step": 410 }, { "epoch": 0.14907508161044614, "grad_norm": 0.12125716358423233, "learning_rate": 0.0001897178127740498, "loss": 0.249, "step": 411 }, { "epoch": 0.14943779470438884, "grad_norm": 0.13088323175907135, "learning_rate": 0.00018966724287402964, "loss": 0.2855, "step": 412 }, { "epoch": 0.1498005077983315, "grad_norm": 0.13843600451946259, "learning_rate": 0.00018961655569610557, "loss": 0.2613, "step": 413 }, { "epoch": 0.1501632208922742, "grad_norm": 0.12319327145814896, "learning_rate": 0.00018956575130657256, "loss": 0.2675, "step": 414 }, { "epoch": 0.15052593398621691, "grad_norm": 0.12738944590091705, "learning_rate": 0.0001895148297718788, "loss": 0.2492, "step": 415 }, { "epoch": 0.1508886470801596, "grad_norm": 0.1370190680027008, "learning_rate": 0.00018946379115862585, "loss": 0.2565, "step": 416 }, { "epoch": 0.1512513601741023, "grad_norm": 0.12752386927604675, "learning_rate": 0.00018941263553356829, "loss": 0.2752, "step": 417 }, { "epoch": 0.15161407326804496, "grad_norm": 0.12467992305755615, "learning_rate": 0.00018936136296361373, "loss": 0.261, "step": 418 }, { "epoch": 0.15197678636198766, "grad_norm": 0.12830005586147308, "learning_rate": 0.00018930997351582286, "loss": 0.2579, "step": 419 }, { "epoch": 0.15233949945593037, "grad_norm": 0.1329096108675003, "learning_rate": 0.00018925846725740907, "loss": 0.2736, "step": 420 }, { "epoch": 0.15270221254987304, "grad_norm": 0.12870270013809204, "learning_rate": 0.00018920684425573865, "loss": 0.2519, "step": 421 }, { "epoch": 0.15306492564381574, "grad_norm": 0.1223597452044487, "learning_rate": 0.00018915510457833055, "loss": 0.2462, "step": 422 }, { "epoch": 0.15342763873775844, "grad_norm": 0.13859276473522186, "learning_rate": 0.0001891032482928563, "loss": 0.2546, "step": 423 }, { "epoch": 0.15379035183170112, "grad_norm": 0.12266798317432404, "learning_rate": 0.00018905127546713996, "loss": 0.2426, "step": 424 }, { "epoch": 0.15415306492564382, "grad_norm": 0.1270112842321396, "learning_rate": 0.00018899918616915802, "loss": 0.2719, "step": 425 }, { "epoch": 0.15451577801958652, "grad_norm": 0.12060489505529404, "learning_rate": 0.0001889469804670393, "loss": 0.2617, "step": 426 }, { "epoch": 0.1548784911135292, "grad_norm": 0.1132146492600441, "learning_rate": 0.00018889465842906488, "loss": 0.2464, "step": 427 }, { "epoch": 0.1552412042074719, "grad_norm": 0.12224707752466202, "learning_rate": 0.00018884222012366796, "loss": 0.2963, "step": 428 }, { "epoch": 0.15560391730141457, "grad_norm": 0.11490823328495026, "learning_rate": 0.00018878966561943386, "loss": 0.2686, "step": 429 }, { "epoch": 0.15596663039535727, "grad_norm": 0.16463352739810944, "learning_rate": 0.00018873699498509988, "loss": 0.2986, "step": 430 }, { "epoch": 0.15632934348929997, "grad_norm": 0.12075062096118927, "learning_rate": 0.00018868420828955514, "loss": 0.2968, "step": 431 }, { "epoch": 0.15669205658324264, "grad_norm": 0.1205056831240654, "learning_rate": 0.00018863130560184063, "loss": 0.2565, "step": 432 }, { "epoch": 0.15705476967718535, "grad_norm": 0.1396438032388687, "learning_rate": 0.00018857828699114904, "loss": 0.2686, "step": 433 }, { "epoch": 0.15741748277112805, "grad_norm": 0.11857564747333527, "learning_rate": 0.0001885251525268246, "loss": 0.2453, "step": 434 }, { "epoch": 0.15778019586507072, "grad_norm": 0.12120261788368225, "learning_rate": 0.0001884719022783632, "loss": 0.2363, "step": 435 }, { "epoch": 0.15814290895901342, "grad_norm": 0.1222701370716095, "learning_rate": 0.00018841853631541207, "loss": 0.2641, "step": 436 }, { "epoch": 0.15850562205295612, "grad_norm": 0.12121476233005524, "learning_rate": 0.00018836505470776983, "loss": 0.2542, "step": 437 }, { "epoch": 0.1588683351468988, "grad_norm": 0.12737686932086945, "learning_rate": 0.0001883114575253863, "loss": 0.2502, "step": 438 }, { "epoch": 0.1592310482408415, "grad_norm": 0.12551474571228027, "learning_rate": 0.00018825774483836248, "loss": 0.2676, "step": 439 }, { "epoch": 0.15959376133478417, "grad_norm": 0.12225164473056793, "learning_rate": 0.00018820391671695057, "loss": 0.2695, "step": 440 }, { "epoch": 0.15995647442872687, "grad_norm": 0.12774313986301422, "learning_rate": 0.00018814997323155357, "loss": 0.2454, "step": 441 }, { "epoch": 0.16031918752266958, "grad_norm": 0.12761445343494415, "learning_rate": 0.0001880959144527254, "loss": 0.2539, "step": 442 }, { "epoch": 0.16068190061661225, "grad_norm": 0.11978595703840256, "learning_rate": 0.00018804174045117087, "loss": 0.2301, "step": 443 }, { "epoch": 0.16104461371055495, "grad_norm": 0.12763962149620056, "learning_rate": 0.00018798745129774543, "loss": 0.2376, "step": 444 }, { "epoch": 0.16140732680449765, "grad_norm": 0.13063186407089233, "learning_rate": 0.00018793304706345515, "loss": 0.2768, "step": 445 }, { "epoch": 0.16177003989844033, "grad_norm": 0.11672946810722351, "learning_rate": 0.00018787852781945656, "loss": 0.246, "step": 446 }, { "epoch": 0.16213275299238303, "grad_norm": 0.12725545465946198, "learning_rate": 0.00018782389363705674, "loss": 0.262, "step": 447 }, { "epoch": 0.16249546608632573, "grad_norm": 0.1206207126379013, "learning_rate": 0.00018776914458771296, "loss": 0.2385, "step": 448 }, { "epoch": 0.1628581791802684, "grad_norm": 0.11878547072410583, "learning_rate": 0.00018771428074303286, "loss": 0.2666, "step": 449 }, { "epoch": 0.1632208922742111, "grad_norm": 0.12689107656478882, "learning_rate": 0.0001876593021747741, "loss": 0.2828, "step": 450 }, { "epoch": 0.16358360536815378, "grad_norm": 0.11968659609556198, "learning_rate": 0.00018760420895484446, "loss": 0.2428, "step": 451 }, { "epoch": 0.16394631846209648, "grad_norm": 0.13296844065189362, "learning_rate": 0.0001875490011553017, "loss": 0.2689, "step": 452 }, { "epoch": 0.16430903155603918, "grad_norm": 0.13149085640907288, "learning_rate": 0.00018749367884835337, "loss": 0.259, "step": 453 }, { "epoch": 0.16467174464998185, "grad_norm": 0.13679270446300507, "learning_rate": 0.00018743824210635683, "loss": 0.2604, "step": 454 }, { "epoch": 0.16503445774392456, "grad_norm": 0.12205653637647629, "learning_rate": 0.0001873826910018191, "loss": 0.2557, "step": 455 }, { "epoch": 0.16539717083786726, "grad_norm": 0.11403360217809677, "learning_rate": 0.00018732702560739678, "loss": 0.2596, "step": 456 }, { "epoch": 0.16575988393180993, "grad_norm": 0.15047647058963776, "learning_rate": 0.000187271245995896, "loss": 0.2571, "step": 457 }, { "epoch": 0.16612259702575263, "grad_norm": 0.12830372154712677, "learning_rate": 0.00018721535224027212, "loss": 0.256, "step": 458 }, { "epoch": 0.16648531011969533, "grad_norm": 0.12144992500543594, "learning_rate": 0.00018715934441363002, "loss": 0.2488, "step": 459 }, { "epoch": 0.166848023213638, "grad_norm": 0.128736212849617, "learning_rate": 0.00018710322258922357, "loss": 0.2541, "step": 460 }, { "epoch": 0.1672107363075807, "grad_norm": 0.1277531534433365, "learning_rate": 0.0001870469868404559, "loss": 0.2609, "step": 461 }, { "epoch": 0.16757344940152338, "grad_norm": 0.12313154339790344, "learning_rate": 0.00018699063724087904, "loss": 0.2547, "step": 462 }, { "epoch": 0.16793616249546608, "grad_norm": 0.12278270721435547, "learning_rate": 0.00018693417386419397, "loss": 0.2509, "step": 463 }, { "epoch": 0.1682988755894088, "grad_norm": 0.12022969871759415, "learning_rate": 0.00018687759678425044, "loss": 0.2384, "step": 464 }, { "epoch": 0.16866158868335146, "grad_norm": 0.12230958789587021, "learning_rate": 0.000186820906075047, "loss": 0.2535, "step": 465 }, { "epoch": 0.16902430177729416, "grad_norm": 0.13055519759655, "learning_rate": 0.00018676410181073073, "loss": 0.244, "step": 466 }, { "epoch": 0.16938701487123686, "grad_norm": 0.12790988385677338, "learning_rate": 0.0001867071840655973, "loss": 0.2479, "step": 467 }, { "epoch": 0.16974972796517954, "grad_norm": 0.13046807050704956, "learning_rate": 0.00018665015291409077, "loss": 0.2493, "step": 468 }, { "epoch": 0.17011244105912224, "grad_norm": 0.1160719096660614, "learning_rate": 0.00018659300843080348, "loss": 0.2274, "step": 469 }, { "epoch": 0.17047515415306494, "grad_norm": 0.1292848438024521, "learning_rate": 0.00018653575069047608, "loss": 0.258, "step": 470 }, { "epoch": 0.1708378672470076, "grad_norm": 0.1197739690542221, "learning_rate": 0.00018647837976799734, "loss": 0.2276, "step": 471 }, { "epoch": 0.17120058034095031, "grad_norm": 0.11929846554994583, "learning_rate": 0.00018642089573840402, "loss": 0.2617, "step": 472 }, { "epoch": 0.171563293434893, "grad_norm": 0.12611514329910278, "learning_rate": 0.00018636329867688085, "loss": 0.2525, "step": 473 }, { "epoch": 0.1719260065288357, "grad_norm": 0.1322082132101059, "learning_rate": 0.0001863055886587604, "loss": 0.2564, "step": 474 }, { "epoch": 0.1722887196227784, "grad_norm": 0.1298658400774002, "learning_rate": 0.0001862477657595229, "loss": 0.2451, "step": 475 }, { "epoch": 0.17265143271672106, "grad_norm": 0.1305808424949646, "learning_rate": 0.00018618983005479637, "loss": 0.2546, "step": 476 }, { "epoch": 0.17301414581066377, "grad_norm": 0.1403343826532364, "learning_rate": 0.00018613178162035624, "loss": 0.2566, "step": 477 }, { "epoch": 0.17337685890460647, "grad_norm": 0.12340683490037918, "learning_rate": 0.00018607362053212545, "loss": 0.2402, "step": 478 }, { "epoch": 0.17373957199854914, "grad_norm": 0.12032376229763031, "learning_rate": 0.00018601534686617423, "loss": 0.2524, "step": 479 }, { "epoch": 0.17410228509249184, "grad_norm": 0.14251156151294708, "learning_rate": 0.00018595696069872013, "loss": 0.2386, "step": 480 }, { "epoch": 0.17446499818643452, "grad_norm": 0.12001265585422516, "learning_rate": 0.00018589846210612776, "loss": 0.2311, "step": 481 }, { "epoch": 0.17482771128037722, "grad_norm": 0.127760112285614, "learning_rate": 0.00018583985116490877, "loss": 0.2528, "step": 482 }, { "epoch": 0.17519042437431992, "grad_norm": 0.1348508894443512, "learning_rate": 0.0001857811279517219, "loss": 0.2861, "step": 483 }, { "epoch": 0.1755531374682626, "grad_norm": 0.1362610161304474, "learning_rate": 0.00018572229254337254, "loss": 0.2606, "step": 484 }, { "epoch": 0.1759158505622053, "grad_norm": 0.12335646897554398, "learning_rate": 0.00018566334501681294, "loss": 0.2735, "step": 485 }, { "epoch": 0.176278563656148, "grad_norm": 0.2398405522108078, "learning_rate": 0.000185604285449142, "loss": 0.2686, "step": 486 }, { "epoch": 0.17664127675009067, "grad_norm": 0.12291895598173141, "learning_rate": 0.00018554511391760502, "loss": 0.251, "step": 487 }, { "epoch": 0.17700398984403337, "grad_norm": 0.1420765072107315, "learning_rate": 0.00018548583049959394, "loss": 0.3053, "step": 488 }, { "epoch": 0.17736670293797607, "grad_norm": 0.13731782138347626, "learning_rate": 0.0001854264352726469, "loss": 0.2508, "step": 489 }, { "epoch": 0.17772941603191875, "grad_norm": 0.12329670786857605, "learning_rate": 0.00018536692831444836, "loss": 0.2544, "step": 490 }, { "epoch": 0.17809212912586145, "grad_norm": 0.13219058513641357, "learning_rate": 0.0001853073097028288, "loss": 0.2933, "step": 491 }, { "epoch": 0.17845484221980412, "grad_norm": 0.13322101533412933, "learning_rate": 0.00018524757951576487, "loss": 0.2546, "step": 492 }, { "epoch": 0.17881755531374682, "grad_norm": 0.13400037586688995, "learning_rate": 0.00018518773783137907, "loss": 0.2538, "step": 493 }, { "epoch": 0.17918026840768952, "grad_norm": 0.1361285001039505, "learning_rate": 0.0001851277847279398, "loss": 0.2522, "step": 494 }, { "epoch": 0.1795429815016322, "grad_norm": 0.1310225874185562, "learning_rate": 0.00018506772028386106, "loss": 0.2667, "step": 495 }, { "epoch": 0.1799056945955749, "grad_norm": 0.12234266102313995, "learning_rate": 0.00018500754457770257, "loss": 0.2392, "step": 496 }, { "epoch": 0.1802684076895176, "grad_norm": 0.1298176795244217, "learning_rate": 0.00018494725768816958, "loss": 0.2573, "step": 497 }, { "epoch": 0.18063112078346028, "grad_norm": 0.1306108981370926, "learning_rate": 0.00018488685969411276, "loss": 0.2524, "step": 498 }, { "epoch": 0.18099383387740298, "grad_norm": 0.13212443888187408, "learning_rate": 0.00018482635067452804, "loss": 0.2577, "step": 499 }, { "epoch": 0.18135654697134568, "grad_norm": 0.12641021609306335, "learning_rate": 0.0001847657307085566, "loss": 0.2585, "step": 500 }, { "epoch": 0.18171926006528835, "grad_norm": 0.13970649242401123, "learning_rate": 0.00018470499987548473, "loss": 0.2652, "step": 501 }, { "epoch": 0.18208197315923105, "grad_norm": 0.12708009779453278, "learning_rate": 0.0001846441582547437, "loss": 0.2675, "step": 502 }, { "epoch": 0.18244468625317373, "grad_norm": 0.1252969652414322, "learning_rate": 0.00018458320592590975, "loss": 0.2622, "step": 503 }, { "epoch": 0.18280739934711643, "grad_norm": 0.13454315066337585, "learning_rate": 0.0001845221429687038, "loss": 0.2848, "step": 504 }, { "epoch": 0.18317011244105913, "grad_norm": 0.11531683802604675, "learning_rate": 0.0001844609694629916, "loss": 0.2335, "step": 505 }, { "epoch": 0.1835328255350018, "grad_norm": 0.12405534833669662, "learning_rate": 0.00018439968548878338, "loss": 0.2494, "step": 506 }, { "epoch": 0.1838955386289445, "grad_norm": 0.12868863344192505, "learning_rate": 0.00018433829112623394, "loss": 0.2551, "step": 507 }, { "epoch": 0.1842582517228872, "grad_norm": 0.12778586149215698, "learning_rate": 0.00018427678645564235, "loss": 0.2519, "step": 508 }, { "epoch": 0.18462096481682988, "grad_norm": 0.12378937751054764, "learning_rate": 0.00018421517155745208, "loss": 0.2463, "step": 509 }, { "epoch": 0.18498367791077258, "grad_norm": 0.12006038427352905, "learning_rate": 0.00018415344651225067, "loss": 0.2434, "step": 510 }, { "epoch": 0.18534639100471528, "grad_norm": 0.12323882430791855, "learning_rate": 0.0001840916114007698, "loss": 0.2495, "step": 511 }, { "epoch": 0.18570910409865796, "grad_norm": 0.12510351836681366, "learning_rate": 0.00018402966630388505, "loss": 0.2421, "step": 512 }, { "epoch": 0.18607181719260066, "grad_norm": 0.16430193185806274, "learning_rate": 0.00018396761130261586, "loss": 0.261, "step": 513 }, { "epoch": 0.18643453028654333, "grad_norm": 0.13129295408725739, "learning_rate": 0.0001839054464781255, "loss": 0.2552, "step": 514 }, { "epoch": 0.18679724338048603, "grad_norm": 0.12675730884075165, "learning_rate": 0.00018384317191172072, "loss": 0.2443, "step": 515 }, { "epoch": 0.18715995647442873, "grad_norm": 0.1283879280090332, "learning_rate": 0.00018378078768485192, "loss": 0.2453, "step": 516 }, { "epoch": 0.1875226695683714, "grad_norm": 0.12647312879562378, "learning_rate": 0.00018371829387911292, "loss": 0.2434, "step": 517 }, { "epoch": 0.1878853826623141, "grad_norm": 0.12233056873083115, "learning_rate": 0.0001836556905762409, "loss": 0.283, "step": 518 }, { "epoch": 0.1882480957562568, "grad_norm": 0.13304516673088074, "learning_rate": 0.00018359297785811612, "loss": 0.2545, "step": 519 }, { "epoch": 0.18861080885019949, "grad_norm": 0.13864544034004211, "learning_rate": 0.000183530155806762, "loss": 0.2571, "step": 520 }, { "epoch": 0.1889735219441422, "grad_norm": 0.1448136270046234, "learning_rate": 0.00018346722450434508, "loss": 0.2576, "step": 521 }, { "epoch": 0.1893362350380849, "grad_norm": 0.14094996452331543, "learning_rate": 0.00018340418403317463, "loss": 0.2568, "step": 522 }, { "epoch": 0.18969894813202756, "grad_norm": 0.13471728563308716, "learning_rate": 0.00018334103447570282, "loss": 0.2271, "step": 523 }, { "epoch": 0.19006166122597026, "grad_norm": 0.12976421415805817, "learning_rate": 0.00018327777591452436, "loss": 0.2386, "step": 524 }, { "epoch": 0.19042437431991294, "grad_norm": 0.15379559993743896, "learning_rate": 0.00018321440843237672, "loss": 0.2681, "step": 525 }, { "epoch": 0.19078708741385564, "grad_norm": 0.16950151324272156, "learning_rate": 0.00018315093211213962, "loss": 0.2526, "step": 526 }, { "epoch": 0.19114980050779834, "grad_norm": 0.13350321352481842, "learning_rate": 0.00018308734703683535, "loss": 0.2495, "step": 527 }, { "epoch": 0.191512513601741, "grad_norm": 0.14698749780654907, "learning_rate": 0.00018302365328962824, "loss": 0.2381, "step": 528 }, { "epoch": 0.19187522669568371, "grad_norm": 0.12897023558616638, "learning_rate": 0.0001829598509538249, "loss": 0.256, "step": 529 }, { "epoch": 0.19223793978962642, "grad_norm": 0.14562232792377472, "learning_rate": 0.0001828959401128739, "loss": 0.2607, "step": 530 }, { "epoch": 0.1926006528835691, "grad_norm": 0.13689380884170532, "learning_rate": 0.0001828319208503657, "loss": 0.2451, "step": 531 }, { "epoch": 0.1929633659775118, "grad_norm": 0.130660280585289, "learning_rate": 0.00018276779325003268, "loss": 0.2554, "step": 532 }, { "epoch": 0.1933260790714545, "grad_norm": 0.12638305127620697, "learning_rate": 0.00018270355739574877, "loss": 0.2496, "step": 533 }, { "epoch": 0.19368879216539717, "grad_norm": 0.14226087927818298, "learning_rate": 0.00018263921337152955, "loss": 0.2423, "step": 534 }, { "epoch": 0.19405150525933987, "grad_norm": 0.1410246342420578, "learning_rate": 0.00018257476126153218, "loss": 0.2721, "step": 535 }, { "epoch": 0.19441421835328254, "grad_norm": 0.1288328468799591, "learning_rate": 0.00018251020115005504, "loss": 0.2321, "step": 536 }, { "epoch": 0.19477693144722524, "grad_norm": 0.14098510146141052, "learning_rate": 0.0001824455331215378, "loss": 0.2467, "step": 537 }, { "epoch": 0.19513964454116794, "grad_norm": 0.13489827513694763, "learning_rate": 0.00018238075726056136, "loss": 0.2491, "step": 538 }, { "epoch": 0.19550235763511062, "grad_norm": 0.13195975124835968, "learning_rate": 0.00018231587365184754, "loss": 0.2443, "step": 539 }, { "epoch": 0.19586507072905332, "grad_norm": 0.1283298283815384, "learning_rate": 0.00018225088238025915, "loss": 0.2465, "step": 540 }, { "epoch": 0.19622778382299602, "grad_norm": 0.11871767789125443, "learning_rate": 0.00018218578353079988, "loss": 0.227, "step": 541 }, { "epoch": 0.1965904969169387, "grad_norm": 0.14271649718284607, "learning_rate": 0.00018212057718861396, "loss": 0.2734, "step": 542 }, { "epoch": 0.1969532100108814, "grad_norm": 0.14445483684539795, "learning_rate": 0.00018205526343898637, "loss": 0.2417, "step": 543 }, { "epoch": 0.1973159231048241, "grad_norm": 0.13704335689544678, "learning_rate": 0.00018198984236734246, "loss": 0.287, "step": 544 }, { "epoch": 0.19767863619876677, "grad_norm": 0.12846963107585907, "learning_rate": 0.00018192431405924804, "loss": 0.2448, "step": 545 }, { "epoch": 0.19804134929270947, "grad_norm": 0.14025187492370605, "learning_rate": 0.00018185867860040907, "loss": 0.2277, "step": 546 }, { "epoch": 0.19840406238665215, "grad_norm": 0.12117055058479309, "learning_rate": 0.00018179293607667178, "loss": 0.2434, "step": 547 }, { "epoch": 0.19876677548059485, "grad_norm": 0.1310604214668274, "learning_rate": 0.00018172708657402233, "loss": 0.2414, "step": 548 }, { "epoch": 0.19912948857453755, "grad_norm": 0.15536460280418396, "learning_rate": 0.00018166113017858683, "loss": 0.2608, "step": 549 }, { "epoch": 0.19949220166848022, "grad_norm": 0.1420615315437317, "learning_rate": 0.00018159506697663127, "loss": 0.269, "step": 550 }, { "epoch": 0.19985491476242292, "grad_norm": 0.13386112451553345, "learning_rate": 0.00018152889705456117, "loss": 0.2728, "step": 551 }, { "epoch": 0.20021762785636563, "grad_norm": 0.12435004115104675, "learning_rate": 0.00018146262049892185, "loss": 0.251, "step": 552 }, { "epoch": 0.2005803409503083, "grad_norm": 0.13267625868320465, "learning_rate": 0.00018139623739639788, "loss": 0.2844, "step": 553 }, { "epoch": 0.200943054044251, "grad_norm": 0.13061115145683289, "learning_rate": 0.00018132974783381336, "loss": 0.2287, "step": 554 }, { "epoch": 0.20130576713819368, "grad_norm": 0.13054601848125458, "learning_rate": 0.0001812631518981315, "loss": 0.237, "step": 555 }, { "epoch": 0.20166848023213638, "grad_norm": 0.1794627159833908, "learning_rate": 0.00018119644967645474, "loss": 0.2752, "step": 556 }, { "epoch": 0.20203119332607908, "grad_norm": 0.13099108636379242, "learning_rate": 0.00018112964125602447, "loss": 0.2514, "step": 557 }, { "epoch": 0.20239390642002175, "grad_norm": 0.13102415204048157, "learning_rate": 0.000181062726724221, "loss": 0.2428, "step": 558 }, { "epoch": 0.20275661951396445, "grad_norm": 0.13251091539859772, "learning_rate": 0.00018099570616856344, "loss": 0.2452, "step": 559 }, { "epoch": 0.20311933260790715, "grad_norm": 0.12863093614578247, "learning_rate": 0.00018092857967670956, "loss": 0.256, "step": 560 }, { "epoch": 0.20348204570184983, "grad_norm": 0.13334688544273376, "learning_rate": 0.00018086134733645565, "loss": 0.2608, "step": 561 }, { "epoch": 0.20384475879579253, "grad_norm": 0.15378229320049286, "learning_rate": 0.00018079400923573652, "loss": 0.2416, "step": 562 }, { "epoch": 0.20420747188973523, "grad_norm": 0.1594190150499344, "learning_rate": 0.00018072656546262524, "loss": 0.2526, "step": 563 }, { "epoch": 0.2045701849836779, "grad_norm": 0.13872471451759338, "learning_rate": 0.00018065901610533306, "loss": 0.2379, "step": 564 }, { "epoch": 0.2049328980776206, "grad_norm": 0.1253708302974701, "learning_rate": 0.0001805913612522095, "loss": 0.2352, "step": 565 }, { "epoch": 0.20529561117156328, "grad_norm": 0.13366468250751495, "learning_rate": 0.00018052360099174184, "loss": 0.2448, "step": 566 }, { "epoch": 0.20565832426550598, "grad_norm": 605528.9375, "learning_rate": 0.00018045573541255534, "loss": 0.2251, "step": 567 }, { "epoch": 0.20602103735944868, "grad_norm": 0.18479633331298828, "learning_rate": 0.00018038776460341303, "loss": 0.254, "step": 568 }, { "epoch": 0.20638375045339136, "grad_norm": 0.20463520288467407, "learning_rate": 0.0001803196886532155, "loss": 0.2328, "step": 569 }, { "epoch": 0.20674646354733406, "grad_norm": 0.1946071833372116, "learning_rate": 0.00018025150765100094, "loss": 0.2622, "step": 570 }, { "epoch": 0.20710917664127676, "grad_norm": 0.16838648915290833, "learning_rate": 0.00018018322168594485, "loss": 0.2712, "step": 571 }, { "epoch": 0.20747188973521943, "grad_norm": 0.20080481469631195, "learning_rate": 0.00018011483084736006, "loss": 0.2465, "step": 572 }, { "epoch": 0.20783460282916214, "grad_norm": 0.19547294080257416, "learning_rate": 0.00018004633522469656, "loss": 0.2829, "step": 573 }, { "epoch": 0.20819731592310484, "grad_norm": 0.14593558013439178, "learning_rate": 0.00017997773490754137, "loss": 0.2532, "step": 574 }, { "epoch": 0.2085600290170475, "grad_norm": 0.1449822634458542, "learning_rate": 0.00017990902998561855, "loss": 0.2528, "step": 575 }, { "epoch": 0.2089227421109902, "grad_norm": 0.14969614148139954, "learning_rate": 0.0001798402205487888, "loss": 0.2389, "step": 576 }, { "epoch": 0.20928545520493289, "grad_norm": 0.14283829927444458, "learning_rate": 0.00017977130668704965, "loss": 0.2337, "step": 577 }, { "epoch": 0.2096481682988756, "grad_norm": 0.1496269404888153, "learning_rate": 0.00017970228849053515, "loss": 0.259, "step": 578 }, { "epoch": 0.2100108813928183, "grad_norm": 0.13835981488227844, "learning_rate": 0.00017963316604951586, "loss": 0.2628, "step": 579 }, { "epoch": 0.21037359448676096, "grad_norm": 0.14784668385982513, "learning_rate": 0.0001795639394543986, "loss": 0.2488, "step": 580 }, { "epoch": 0.21073630758070366, "grad_norm": 0.13575692474842072, "learning_rate": 0.00017949460879572652, "loss": 0.2403, "step": 581 }, { "epoch": 0.21109902067464636, "grad_norm": 0.14234420657157898, "learning_rate": 0.00017942517416417878, "loss": 0.2649, "step": 582 }, { "epoch": 0.21146173376858904, "grad_norm": 0.13922925293445587, "learning_rate": 0.0001793556356505706, "loss": 0.2466, "step": 583 }, { "epoch": 0.21182444686253174, "grad_norm": 0.1288311779499054, "learning_rate": 0.00017928599334585306, "loss": 0.2314, "step": 584 }, { "epoch": 0.21218715995647444, "grad_norm": 0.12375061213970184, "learning_rate": 0.00017921624734111292, "loss": 0.2401, "step": 585 }, { "epoch": 0.21254987305041712, "grad_norm": 0.12890039384365082, "learning_rate": 0.0001791463977275727, "loss": 0.2416, "step": 586 }, { "epoch": 0.21291258614435982, "grad_norm": 0.13691289722919464, "learning_rate": 0.00017907644459659033, "loss": 0.2473, "step": 587 }, { "epoch": 0.2132752992383025, "grad_norm": 0.3051564693450928, "learning_rate": 0.0001790063880396591, "loss": 0.2464, "step": 588 }, { "epoch": 0.2136380123322452, "grad_norm": 0.13991987705230713, "learning_rate": 0.00017893622814840773, "loss": 0.2526, "step": 589 }, { "epoch": 0.2140007254261879, "grad_norm": 0.12774237990379333, "learning_rate": 0.00017886596501459992, "loss": 0.2375, "step": 590 }, { "epoch": 0.21436343852013057, "grad_norm": 0.13759708404541016, "learning_rate": 0.00017879559873013452, "loss": 0.2248, "step": 591 }, { "epoch": 0.21472615161407327, "grad_norm": 0.13571417331695557, "learning_rate": 0.00017872512938704523, "loss": 0.2612, "step": 592 }, { "epoch": 0.21508886470801597, "grad_norm": 0.1446496546268463, "learning_rate": 0.00017865455707750063, "loss": 0.2466, "step": 593 }, { "epoch": 0.21545157780195864, "grad_norm": 0.12743471562862396, "learning_rate": 0.00017858388189380387, "loss": 0.2681, "step": 594 }, { "epoch": 0.21581429089590135, "grad_norm": 0.1251528263092041, "learning_rate": 0.00017851310392839266, "loss": 0.246, "step": 595 }, { "epoch": 0.21617700398984405, "grad_norm": 0.12966857850551605, "learning_rate": 0.0001784422232738392, "loss": 0.2293, "step": 596 }, { "epoch": 0.21653971708378672, "grad_norm": 0.14909860491752625, "learning_rate": 0.00017837124002285, "loss": 0.2577, "step": 597 }, { "epoch": 0.21690243017772942, "grad_norm": 0.136635884642601, "learning_rate": 0.00017830015426826567, "loss": 0.262, "step": 598 }, { "epoch": 0.2172651432716721, "grad_norm": 0.13285911083221436, "learning_rate": 0.000178228966103061, "loss": 0.2598, "step": 599 }, { "epoch": 0.2176278563656148, "grad_norm": 0.13522981107234955, "learning_rate": 0.00017815767562034463, "loss": 0.2469, "step": 600 }, { "epoch": 0.2179905694595575, "grad_norm": 0.13613048195838928, "learning_rate": 0.00017808628291335912, "loss": 0.2519, "step": 601 }, { "epoch": 0.21835328255350017, "grad_norm": 0.14597558975219727, "learning_rate": 0.00017801478807548063, "loss": 0.2651, "step": 602 }, { "epoch": 0.21871599564744287, "grad_norm": 0.13757093250751495, "learning_rate": 0.00017794319120021895, "loss": 0.2593, "step": 603 }, { "epoch": 0.21907870874138557, "grad_norm": 0.13094554841518402, "learning_rate": 0.00017787149238121733, "loss": 0.2546, "step": 604 }, { "epoch": 0.21944142183532825, "grad_norm": 125.78084564208984, "learning_rate": 0.00017779969171225236, "loss": 0.2456, "step": 605 }, { "epoch": 0.21980413492927095, "grad_norm": 0.15768976509571075, "learning_rate": 0.00017772778928723383, "loss": 0.2412, "step": 606 }, { "epoch": 0.22016684802321365, "grad_norm": 0.19074760377407074, "learning_rate": 0.00017765578520020459, "loss": 0.2699, "step": 607 }, { "epoch": 0.22052956111715633, "grad_norm": 0.1577846109867096, "learning_rate": 0.0001775836795453405, "loss": 0.2737, "step": 608 }, { "epoch": 0.22089227421109903, "grad_norm": 0.153973788022995, "learning_rate": 0.00017751147241695025, "loss": 0.2336, "step": 609 }, { "epoch": 0.2212549873050417, "grad_norm": 0.16625823080539703, "learning_rate": 0.0001774391639094753, "loss": 0.248, "step": 610 }, { "epoch": 0.2216177003989844, "grad_norm": 0.17917267978191376, "learning_rate": 0.00017736675411748955, "loss": 0.2559, "step": 611 }, { "epoch": 0.2219804134929271, "grad_norm": 0.15878534317016602, "learning_rate": 0.00017729424313569955, "loss": 0.249, "step": 612 }, { "epoch": 0.22234312658686978, "grad_norm": 0.1509985774755478, "learning_rate": 0.00017722163105894412, "loss": 0.2607, "step": 613 }, { "epoch": 0.22270583968081248, "grad_norm": 0.13934160768985748, "learning_rate": 0.0001771489179821943, "loss": 0.2377, "step": 614 }, { "epoch": 0.22306855277475518, "grad_norm": 0.1717095524072647, "learning_rate": 0.00017707610400055323, "loss": 0.2554, "step": 615 }, { "epoch": 0.22343126586869785, "grad_norm": 0.13818614184856415, "learning_rate": 0.00017700318920925605, "loss": 0.2748, "step": 616 }, { "epoch": 0.22379397896264056, "grad_norm": 0.12828463315963745, "learning_rate": 0.00017693017370366972, "loss": 0.2398, "step": 617 }, { "epoch": 0.22415669205658323, "grad_norm": 0.13687558472156525, "learning_rate": 0.00017685705757929294, "loss": 0.2735, "step": 618 }, { "epoch": 0.22451940515052593, "grad_norm": 0.1353394091129303, "learning_rate": 0.00017678384093175605, "loss": 0.2428, "step": 619 }, { "epoch": 0.22488211824446863, "grad_norm": 0.1443159580230713, "learning_rate": 0.00017671052385682078, "loss": 0.2566, "step": 620 }, { "epoch": 0.2252448313384113, "grad_norm": 0.14144475758075714, "learning_rate": 0.00017663710645038035, "loss": 0.2482, "step": 621 }, { "epoch": 0.225607544432354, "grad_norm": 0.14739158749580383, "learning_rate": 0.000176563588808459, "loss": 0.253, "step": 622 }, { "epoch": 0.2259702575262967, "grad_norm": 0.14374294877052307, "learning_rate": 0.0001764899710272123, "loss": 0.2394, "step": 623 }, { "epoch": 0.22633297062023938, "grad_norm": 0.14988651871681213, "learning_rate": 0.00017641625320292663, "loss": 0.2953, "step": 624 }, { "epoch": 0.22669568371418208, "grad_norm": 0.1295817494392395, "learning_rate": 0.00017634243543201926, "loss": 0.2177, "step": 625 }, { "epoch": 0.22705839680812478, "grad_norm": 0.13908831775188446, "learning_rate": 0.0001762685178110382, "loss": 0.2348, "step": 626 }, { "epoch": 0.22742110990206746, "grad_norm": 0.12676572799682617, "learning_rate": 0.0001761945004366621, "loss": 0.2347, "step": 627 }, { "epoch": 0.22778382299601016, "grad_norm": 0.12473898380994797, "learning_rate": 0.00017612038340569997, "loss": 0.2161, "step": 628 }, { "epoch": 0.22814653608995283, "grad_norm": 0.12910184264183044, "learning_rate": 0.00017604616681509127, "loss": 0.2476, "step": 629 }, { "epoch": 0.22850924918389554, "grad_norm": 0.1438639611005783, "learning_rate": 0.0001759718507619056, "loss": 0.2464, "step": 630 }, { "epoch": 0.22887196227783824, "grad_norm": 0.1412367820739746, "learning_rate": 0.00017589743534334273, "loss": 0.2475, "step": 631 }, { "epoch": 0.2292346753717809, "grad_norm": 0.13323849439620972, "learning_rate": 0.00017582292065673226, "loss": 0.2352, "step": 632 }, { "epoch": 0.2295973884657236, "grad_norm": 0.13439258933067322, "learning_rate": 0.0001757483067995338, "loss": 0.3278, "step": 633 }, { "epoch": 0.2299601015596663, "grad_norm": 0.1343153417110443, "learning_rate": 0.0001756735938693365, "loss": 0.2419, "step": 634 }, { "epoch": 0.230322814653609, "grad_norm": 0.12620678544044495, "learning_rate": 0.0001755987819638592, "loss": 0.2428, "step": 635 }, { "epoch": 0.2306855277475517, "grad_norm": 0.1367313414812088, "learning_rate": 0.00017552387118095015, "loss": 0.2501, "step": 636 }, { "epoch": 0.2310482408414944, "grad_norm": 0.14542607963085175, "learning_rate": 0.00017544886161858695, "loss": 0.2838, "step": 637 }, { "epoch": 0.23141095393543706, "grad_norm": 0.13652457296848297, "learning_rate": 0.0001753737533748763, "loss": 0.2328, "step": 638 }, { "epoch": 0.23177366702937977, "grad_norm": 0.13839620351791382, "learning_rate": 0.00017529854654805416, "loss": 0.2479, "step": 639 }, { "epoch": 0.23213638012332244, "grad_norm": 0.1453743427991867, "learning_rate": 0.00017522324123648525, "loss": 0.2267, "step": 640 }, { "epoch": 0.23249909321726514, "grad_norm": 0.1310967206954956, "learning_rate": 0.0001751478375386632, "loss": 0.2194, "step": 641 }, { "epoch": 0.23286180631120784, "grad_norm": 0.13854770362377167, "learning_rate": 0.00017507233555321024, "loss": 0.2447, "step": 642 }, { "epoch": 0.23322451940515052, "grad_norm": 0.12980610132217407, "learning_rate": 0.00017499673537887722, "loss": 0.2391, "step": 643 }, { "epoch": 0.23358723249909322, "grad_norm": 0.12693443894386292, "learning_rate": 0.0001749210371145434, "loss": 0.2267, "step": 644 }, { "epoch": 0.23394994559303592, "grad_norm": 0.13409999012947083, "learning_rate": 0.00017484524085921633, "loss": 0.2464, "step": 645 }, { "epoch": 0.2343126586869786, "grad_norm": 0.1421654224395752, "learning_rate": 0.0001747693467120317, "loss": 0.2544, "step": 646 }, { "epoch": 0.2346753717809213, "grad_norm": 0.13795344531536102, "learning_rate": 0.00017469335477225326, "loss": 0.2368, "step": 647 }, { "epoch": 0.235038084874864, "grad_norm": 0.14090494811534882, "learning_rate": 0.0001746172651392727, "loss": 0.2414, "step": 648 }, { "epoch": 0.23540079796880667, "grad_norm": 0.13511234521865845, "learning_rate": 0.0001745410779126094, "loss": 0.2548, "step": 649 }, { "epoch": 0.23576351106274937, "grad_norm": 0.12285248935222626, "learning_rate": 0.00017446479319191047, "loss": 0.2211, "step": 650 }, { "epoch": 0.23612622415669204, "grad_norm": 0.13343022763729095, "learning_rate": 0.00017438841107695046, "loss": 0.2848, "step": 651 }, { "epoch": 0.23648893725063475, "grad_norm": 0.1315213143825531, "learning_rate": 0.00017431193166763138, "loss": 0.2493, "step": 652 }, { "epoch": 0.23685165034457745, "grad_norm": 0.13958190381526947, "learning_rate": 0.0001742353550639824, "loss": 0.3001, "step": 653 }, { "epoch": 0.23721436343852012, "grad_norm": 0.13711069524288177, "learning_rate": 0.00017415868136615994, "loss": 0.249, "step": 654 }, { "epoch": 0.23757707653246282, "grad_norm": 0.13686099648475647, "learning_rate": 0.0001740819106744473, "loss": 0.2493, "step": 655 }, { "epoch": 0.23793978962640552, "grad_norm": 0.14648962020874023, "learning_rate": 0.00017400504308925468, "loss": 0.2368, "step": 656 }, { "epoch": 0.2383025027203482, "grad_norm": 0.13652493059635162, "learning_rate": 0.000173928078711119, "loss": 0.2198, "step": 657 }, { "epoch": 0.2386652158142909, "grad_norm": 0.13376450538635254, "learning_rate": 0.00017385101764070383, "loss": 0.2388, "step": 658 }, { "epoch": 0.2390279289082336, "grad_norm": 0.13941293954849243, "learning_rate": 0.00017377385997879911, "loss": 0.2465, "step": 659 }, { "epoch": 0.23939064200217627, "grad_norm": 0.13455533981323242, "learning_rate": 0.0001736966058263212, "loss": 0.2366, "step": 660 }, { "epoch": 0.23975335509611898, "grad_norm": 0.1292707622051239, "learning_rate": 0.00017361925528431262, "loss": 0.2234, "step": 661 }, { "epoch": 0.24011606819006165, "grad_norm": 0.14742062985897064, "learning_rate": 0.00017354180845394196, "loss": 0.2498, "step": 662 }, { "epoch": 0.24047878128400435, "grad_norm": 0.14243729412555695, "learning_rate": 0.00017346426543650377, "loss": 0.249, "step": 663 }, { "epoch": 0.24084149437794705, "grad_norm": 0.12824714183807373, "learning_rate": 0.00017338662633341844, "loss": 0.2407, "step": 664 }, { "epoch": 0.24120420747188973, "grad_norm": 0.13394343852996826, "learning_rate": 0.00017330889124623187, "loss": 0.2375, "step": 665 }, { "epoch": 0.24156692056583243, "grad_norm": 0.13167209923267365, "learning_rate": 0.0001732310602766157, "loss": 0.2201, "step": 666 }, { "epoch": 0.24192963365977513, "grad_norm": 0.14167827367782593, "learning_rate": 0.0001731531335263669, "loss": 0.2351, "step": 667 }, { "epoch": 0.2422923467537178, "grad_norm": 0.13489162921905518, "learning_rate": 0.0001730751110974077, "loss": 0.2298, "step": 668 }, { "epoch": 0.2426550598476605, "grad_norm": 0.1397753804922104, "learning_rate": 0.0001729969930917854, "loss": 0.2408, "step": 669 }, { "epoch": 0.2430177729416032, "grad_norm": 0.1405513882637024, "learning_rate": 0.00017291877961167251, "loss": 0.2098, "step": 670 }, { "epoch": 0.24338048603554588, "grad_norm": 0.17330865561962128, "learning_rate": 0.00017284047075936617, "loss": 0.2655, "step": 671 }, { "epoch": 0.24374319912948858, "grad_norm": 0.1363557130098343, "learning_rate": 0.00017276206663728846, "loss": 0.2611, "step": 672 }, { "epoch": 0.24410591222343125, "grad_norm": 0.1307671070098877, "learning_rate": 0.00017268356734798595, "loss": 0.2198, "step": 673 }, { "epoch": 0.24446862531737396, "grad_norm": 0.1409989595413208, "learning_rate": 0.0001726049729941297, "loss": 0.2404, "step": 674 }, { "epoch": 0.24483133841131666, "grad_norm": 0.136042058467865, "learning_rate": 0.00017252628367851513, "loss": 0.2537, "step": 675 }, { "epoch": 0.24519405150525933, "grad_norm": 0.1308341771364212, "learning_rate": 0.00017244749950406186, "loss": 0.2296, "step": 676 }, { "epoch": 0.24555676459920203, "grad_norm": 0.14312215149402618, "learning_rate": 0.00017236862057381358, "loss": 0.2414, "step": 677 }, { "epoch": 0.24591947769314473, "grad_norm": 0.14419759809970856, "learning_rate": 0.0001722896469909379, "loss": 0.2353, "step": 678 }, { "epoch": 0.2462821907870874, "grad_norm": 0.13765071332454681, "learning_rate": 0.0001722105788587262, "loss": 0.2317, "step": 679 }, { "epoch": 0.2466449038810301, "grad_norm": 0.1362527757883072, "learning_rate": 0.0001721314162805936, "loss": 0.2201, "step": 680 }, { "epoch": 0.2470076169749728, "grad_norm": 0.13269595801830292, "learning_rate": 0.0001720521593600787, "loss": 0.2625, "step": 681 }, { "epoch": 0.24737033006891548, "grad_norm": 0.12634457647800446, "learning_rate": 0.0001719728082008435, "loss": 0.223, "step": 682 }, { "epoch": 0.24773304316285819, "grad_norm": 0.1394185721874237, "learning_rate": 0.00017189336290667325, "loss": 0.2418, "step": 683 }, { "epoch": 0.24809575625680086, "grad_norm": 0.14138251543045044, "learning_rate": 0.00017181382358147625, "loss": 0.2377, "step": 684 }, { "epoch": 0.24845846935074356, "grad_norm": 0.14079631865024567, "learning_rate": 0.00017173419032928398, "loss": 0.2207, "step": 685 }, { "epoch": 0.24882118244468626, "grad_norm": 0.1409912407398224, "learning_rate": 0.00017165446325425064, "loss": 0.2234, "step": 686 }, { "epoch": 0.24918389553862894, "grad_norm": 0.16069121658802032, "learning_rate": 0.00017157464246065306, "loss": 0.2661, "step": 687 }, { "epoch": 0.24954660863257164, "grad_norm": 0.14292632043361664, "learning_rate": 0.0001714947280528908, "loss": 0.2316, "step": 688 }, { "epoch": 0.24990932172651434, "grad_norm": 0.13920721411705017, "learning_rate": 0.0001714147201354858, "loss": 0.2432, "step": 689 }, { "epoch": 0.250272034820457, "grad_norm": 0.13971884548664093, "learning_rate": 0.0001713346188130823, "loss": 0.2281, "step": 690 }, { "epoch": 0.2506347479143997, "grad_norm": 0.15373115241527557, "learning_rate": 0.0001712544241904467, "loss": 0.2264, "step": 691 }, { "epoch": 0.2509974610083424, "grad_norm": 0.13534583151340485, "learning_rate": 0.00017117413637246748, "loss": 0.2263, "step": 692 }, { "epoch": 0.2513601741022851, "grad_norm": 0.14140291512012482, "learning_rate": 0.00017109375546415495, "loss": 0.24, "step": 693 }, { "epoch": 0.25172288719622776, "grad_norm": 0.1363680064678192, "learning_rate": 0.00017101328157064115, "loss": 0.2212, "step": 694 }, { "epoch": 0.25208560029017046, "grad_norm": 0.13761445879936218, "learning_rate": 0.00017093271479717986, "loss": 0.2368, "step": 695 }, { "epoch": 0.25244831338411317, "grad_norm": 0.13729073107242584, "learning_rate": 0.0001708520552491462, "loss": 0.2403, "step": 696 }, { "epoch": 0.25281102647805587, "grad_norm": 0.13290317356586456, "learning_rate": 0.00017077130303203673, "loss": 0.2234, "step": 697 }, { "epoch": 0.25317373957199857, "grad_norm": 0.14121422171592712, "learning_rate": 0.0001706904582514692, "loss": 0.2289, "step": 698 }, { "epoch": 0.2535364526659412, "grad_norm": 0.1334342509508133, "learning_rate": 0.0001706095210131824, "loss": 0.2333, "step": 699 }, { "epoch": 0.2538991657598839, "grad_norm": 0.13697004318237305, "learning_rate": 0.00017052849142303603, "loss": 0.2244, "step": 700 }, { "epoch": 0.2542618788538266, "grad_norm": 0.14427930116653442, "learning_rate": 0.00017044736958701058, "loss": 0.2731, "step": 701 }, { "epoch": 0.2546245919477693, "grad_norm": 0.14478136599063873, "learning_rate": 0.00017036615561120727, "loss": 0.2432, "step": 702 }, { "epoch": 0.254987305041712, "grad_norm": 0.1374034285545349, "learning_rate": 0.0001702848496018478, "loss": 0.217, "step": 703 }, { "epoch": 0.2553500181356547, "grad_norm": 0.14599081873893738, "learning_rate": 0.00017020345166527412, "loss": 0.241, "step": 704 }, { "epoch": 0.25571273122959737, "grad_norm": 0.13574494421482086, "learning_rate": 0.00017012196190794858, "loss": 0.2329, "step": 705 }, { "epoch": 0.25607544432354007, "grad_norm": 0.1376832127571106, "learning_rate": 0.00017004038043645357, "loss": 0.252, "step": 706 }, { "epoch": 0.25643815741748277, "grad_norm": 0.13819095492362976, "learning_rate": 0.00016995870735749138, "loss": 0.2547, "step": 707 }, { "epoch": 0.25680087051142547, "grad_norm": 0.12175976485013962, "learning_rate": 0.00016987694277788417, "loss": 0.2058, "step": 708 }, { "epoch": 0.2571635836053682, "grad_norm": 0.13914383947849274, "learning_rate": 0.0001697950868045738, "loss": 0.2311, "step": 709 }, { "epoch": 0.2575262966993108, "grad_norm": 0.1349351704120636, "learning_rate": 0.00016971313954462156, "loss": 0.2203, "step": 710 }, { "epoch": 0.2578890097932535, "grad_norm": 0.1311430037021637, "learning_rate": 0.00016963110110520827, "loss": 0.242, "step": 711 }, { "epoch": 0.2582517228871962, "grad_norm": 0.13092203438282013, "learning_rate": 0.0001695489715936339, "loss": 0.25, "step": 712 }, { "epoch": 0.2586144359811389, "grad_norm": 0.13544927537441254, "learning_rate": 0.00016946675111731766, "loss": 0.2263, "step": 713 }, { "epoch": 0.2589771490750816, "grad_norm": 0.13862383365631104, "learning_rate": 0.00016938443978379753, "loss": 0.2404, "step": 714 }, { "epoch": 0.2593398621690243, "grad_norm": 0.14725641906261444, "learning_rate": 0.00016930203770073053, "loss": 0.2482, "step": 715 }, { "epoch": 0.259702575262967, "grad_norm": 0.13641703128814697, "learning_rate": 0.00016921954497589226, "loss": 0.2431, "step": 716 }, { "epoch": 0.2600652883569097, "grad_norm": 0.1381891518831253, "learning_rate": 0.00016913696171717688, "loss": 0.2321, "step": 717 }, { "epoch": 0.2604280014508524, "grad_norm": 0.14194577932357788, "learning_rate": 0.000169054288032597, "loss": 0.2907, "step": 718 }, { "epoch": 0.2607907145447951, "grad_norm": 0.14137552678585052, "learning_rate": 0.00016897152403028357, "loss": 0.2205, "step": 719 }, { "epoch": 0.2611534276387378, "grad_norm": 0.12619373202323914, "learning_rate": 0.00016888866981848544, "loss": 0.2097, "step": 720 }, { "epoch": 0.2615161407326804, "grad_norm": 0.15918751060962677, "learning_rate": 0.0001688057255055697, "loss": 0.2578, "step": 721 }, { "epoch": 0.2618788538266231, "grad_norm": 0.13455507159233093, "learning_rate": 0.00016872269120002108, "loss": 0.2676, "step": 722 }, { "epoch": 0.2622415669205658, "grad_norm": 0.14259149134159088, "learning_rate": 0.0001686395670104422, "loss": 0.2176, "step": 723 }, { "epoch": 0.26260428001450853, "grad_norm": 0.13362933695316315, "learning_rate": 0.0001685563530455531, "loss": 0.2167, "step": 724 }, { "epoch": 0.26296699310845123, "grad_norm": 0.13542160391807556, "learning_rate": 0.00016847304941419128, "loss": 0.2288, "step": 725 }, { "epoch": 0.26332970620239393, "grad_norm": 0.15378214418888092, "learning_rate": 0.00016838965622531157, "loss": 0.287, "step": 726 }, { "epoch": 0.2636924192963366, "grad_norm": 0.1565556526184082, "learning_rate": 0.00016830617358798587, "loss": 0.2692, "step": 727 }, { "epoch": 0.2640551323902793, "grad_norm": 0.14884917438030243, "learning_rate": 0.0001682226016114031, "loss": 0.2368, "step": 728 }, { "epoch": 0.264417845484222, "grad_norm": 0.13870306313037872, "learning_rate": 0.000168138940404869, "loss": 0.2356, "step": 729 }, { "epoch": 0.2647805585781647, "grad_norm": 0.15050628781318665, "learning_rate": 0.00016805519007780602, "loss": 0.2524, "step": 730 }, { "epoch": 0.2651432716721074, "grad_norm": 0.1477731466293335, "learning_rate": 0.00016797135073975326, "loss": 0.2184, "step": 731 }, { "epoch": 0.26550598476605003, "grad_norm": 0.1533484160900116, "learning_rate": 0.0001678874225003661, "loss": 0.2301, "step": 732 }, { "epoch": 0.26586869785999273, "grad_norm": 0.14348532259464264, "learning_rate": 0.0001678034054694163, "loss": 0.2397, "step": 733 }, { "epoch": 0.26623141095393543, "grad_norm": 0.14960677921772003, "learning_rate": 0.0001677192997567917, "loss": 0.2244, "step": 734 }, { "epoch": 0.26659412404787813, "grad_norm": 0.15019361674785614, "learning_rate": 0.00016763510547249615, "loss": 0.2466, "step": 735 }, { "epoch": 0.26695683714182084, "grad_norm": 0.14875197410583496, "learning_rate": 0.00016755082272664937, "loss": 0.2106, "step": 736 }, { "epoch": 0.26731955023576354, "grad_norm": 0.14142164587974548, "learning_rate": 0.00016746645162948672, "loss": 0.2387, "step": 737 }, { "epoch": 0.2676822633297062, "grad_norm": 0.16096633672714233, "learning_rate": 0.0001673819922913592, "loss": 0.2346, "step": 738 }, { "epoch": 0.2680449764236489, "grad_norm": 0.15639543533325195, "learning_rate": 0.0001672974448227331, "loss": 0.2839, "step": 739 }, { "epoch": 0.2684076895175916, "grad_norm": 0.1443796008825302, "learning_rate": 0.0001672128093341901, "loss": 0.2314, "step": 740 }, { "epoch": 0.2687704026115343, "grad_norm": 0.15442712604999542, "learning_rate": 0.00016712808593642695, "loss": 0.2299, "step": 741 }, { "epoch": 0.269133115705477, "grad_norm": 0.14457674324512482, "learning_rate": 0.00016704327474025533, "loss": 0.2526, "step": 742 }, { "epoch": 0.26949582879941963, "grad_norm": 0.14981432259082794, "learning_rate": 0.00016695837585660187, "loss": 0.2288, "step": 743 }, { "epoch": 0.26985854189336234, "grad_norm": 0.1518179178237915, "learning_rate": 0.00016687338939650782, "loss": 0.2264, "step": 744 }, { "epoch": 0.27022125498730504, "grad_norm": 0.16115126013755798, "learning_rate": 0.00016678831547112895, "loss": 0.2533, "step": 745 }, { "epoch": 0.27058396808124774, "grad_norm": 0.1538068801164627, "learning_rate": 0.00016670315419173548, "loss": 0.2429, "step": 746 }, { "epoch": 0.27094668117519044, "grad_norm": 0.1365380436182022, "learning_rate": 0.00016661790566971181, "loss": 0.2222, "step": 747 }, { "epoch": 0.27130939426913314, "grad_norm": 0.14484576880931854, "learning_rate": 0.00016653257001655652, "loss": 0.2197, "step": 748 }, { "epoch": 0.2716721073630758, "grad_norm": 0.16303595900535583, "learning_rate": 0.00016644714734388217, "loss": 0.253, "step": 749 }, { "epoch": 0.2720348204570185, "grad_norm": 0.14876610040664673, "learning_rate": 0.00016636163776341504, "loss": 0.2205, "step": 750 }, { "epoch": 0.2723975335509612, "grad_norm": 0.13568569719791412, "learning_rate": 0.00016627604138699515, "loss": 0.2251, "step": 751 }, { "epoch": 0.2727602466449039, "grad_norm": 0.14528821408748627, "learning_rate": 0.00016619035832657602, "loss": 0.2346, "step": 752 }, { "epoch": 0.2731229597388466, "grad_norm": 0.13951005041599274, "learning_rate": 0.0001661045886942245, "loss": 0.2311, "step": 753 }, { "epoch": 0.27348567283278924, "grad_norm": 0.1355544924736023, "learning_rate": 0.0001660187326021208, "loss": 0.2235, "step": 754 }, { "epoch": 0.27384838592673194, "grad_norm": 0.14282123744487762, "learning_rate": 0.00016593279016255806, "loss": 0.211, "step": 755 }, { "epoch": 0.27421109902067464, "grad_norm": 0.1680796593427658, "learning_rate": 0.0001658467614879425, "loss": 0.2518, "step": 756 }, { "epoch": 0.27457381211461734, "grad_norm": 0.15991435945034027, "learning_rate": 0.00016576064669079297, "loss": 0.2419, "step": 757 }, { "epoch": 0.27493652520856005, "grad_norm": 0.1730770766735077, "learning_rate": 0.0001656744458837411, "loss": 0.257, "step": 758 }, { "epoch": 0.27529923830250275, "grad_norm": 0.1453644037246704, "learning_rate": 0.00016558815917953095, "loss": 0.2532, "step": 759 }, { "epoch": 0.2756619513964454, "grad_norm": 0.1334659457206726, "learning_rate": 0.00016550178669101891, "loss": 0.2098, "step": 760 }, { "epoch": 0.2760246644903881, "grad_norm": 0.13118910789489746, "learning_rate": 0.00016541532853117365, "loss": 0.214, "step": 761 }, { "epoch": 0.2763873775843308, "grad_norm": 0.14156754314899445, "learning_rate": 0.0001653287848130758, "loss": 0.2434, "step": 762 }, { "epoch": 0.2767500906782735, "grad_norm": 0.16743269562721252, "learning_rate": 0.0001652421556499179, "loss": 0.2692, "step": 763 }, { "epoch": 0.2771128037722162, "grad_norm": 0.16182062029838562, "learning_rate": 0.0001651554411550044, "loss": 0.2194, "step": 764 }, { "epoch": 0.27747551686615884, "grad_norm": 0.14829173684120178, "learning_rate": 0.0001650686414417511, "loss": 0.2444, "step": 765 }, { "epoch": 0.27783822996010155, "grad_norm": 0.14184747636318207, "learning_rate": 0.00016498175662368544, "loss": 0.2275, "step": 766 }, { "epoch": 0.27820094305404425, "grad_norm": 0.14175622165203094, "learning_rate": 0.00016489478681444615, "loss": 0.2368, "step": 767 }, { "epoch": 0.27856365614798695, "grad_norm": 0.14495515823364258, "learning_rate": 0.0001648077321277831, "loss": 0.2087, "step": 768 }, { "epoch": 0.27892636924192965, "grad_norm": 0.14581428468227386, "learning_rate": 0.0001647205926775571, "loss": 0.2339, "step": 769 }, { "epoch": 0.27928908233587235, "grad_norm": 0.16971313953399658, "learning_rate": 0.00016463336857773996, "loss": 0.2564, "step": 770 }, { "epoch": 0.279651795429815, "grad_norm": 0.16059347987174988, "learning_rate": 0.00016454605994241413, "loss": 0.2495, "step": 771 }, { "epoch": 0.2800145085237577, "grad_norm": 0.13135506212711334, "learning_rate": 0.00016445866688577268, "loss": 0.221, "step": 772 }, { "epoch": 0.2803772216177004, "grad_norm": 0.14712165296077728, "learning_rate": 0.00016437118952211893, "loss": 0.232, "step": 773 }, { "epoch": 0.2807399347116431, "grad_norm": 0.1340080052614212, "learning_rate": 0.00016428362796586668, "loss": 0.2134, "step": 774 }, { "epoch": 0.2811026478055858, "grad_norm": 0.1442837119102478, "learning_rate": 0.00016419598233153977, "loss": 0.2507, "step": 775 }, { "epoch": 0.28146536089952845, "grad_norm": 0.1472170352935791, "learning_rate": 0.00016410825273377192, "loss": 0.2053, "step": 776 }, { "epoch": 0.28182807399347115, "grad_norm": 0.16951750218868256, "learning_rate": 0.0001640204392873068, "loss": 0.2226, "step": 777 }, { "epoch": 0.28219078708741385, "grad_norm": 0.1475476771593094, "learning_rate": 0.00016393254210699765, "loss": 0.2255, "step": 778 }, { "epoch": 0.28255350018135655, "grad_norm": 0.1399717628955841, "learning_rate": 0.00016384456130780732, "loss": 0.2296, "step": 779 }, { "epoch": 0.28291621327529926, "grad_norm": 0.15422862768173218, "learning_rate": 0.00016375649700480792, "loss": 0.2549, "step": 780 }, { "epoch": 0.28327892636924196, "grad_norm": 0.14808495342731476, "learning_rate": 0.0001636683493131809, "loss": 0.2125, "step": 781 }, { "epoch": 0.2836416394631846, "grad_norm": 0.13389019668102264, "learning_rate": 0.00016358011834821662, "loss": 0.2216, "step": 782 }, { "epoch": 0.2840043525571273, "grad_norm": 0.14201773703098297, "learning_rate": 0.0001634918042253145, "loss": 0.2257, "step": 783 }, { "epoch": 0.28436706565107, "grad_norm": 0.16533806920051575, "learning_rate": 0.00016340340705998265, "loss": 0.2245, "step": 784 }, { "epoch": 0.2847297787450127, "grad_norm": 0.15893639624118805, "learning_rate": 0.0001633149269678378, "loss": 0.2175, "step": 785 }, { "epoch": 0.2850924918389554, "grad_norm": 0.1425047069787979, "learning_rate": 0.0001632263640646052, "loss": 0.252, "step": 786 }, { "epoch": 0.28545520493289805, "grad_norm": 0.15391702950000763, "learning_rate": 0.00016313771846611827, "loss": 0.2222, "step": 787 }, { "epoch": 0.28581791802684076, "grad_norm": 132164.875, "learning_rate": 0.00016304899028831874, "loss": 0.2179, "step": 788 }, { "epoch": 0.28618063112078346, "grad_norm": 0.1637081801891327, "learning_rate": 0.00016296017964725632, "loss": 0.2205, "step": 789 }, { "epoch": 0.28654334421472616, "grad_norm": 0.20489241182804108, "learning_rate": 0.0001628712866590885, "loss": 0.2479, "step": 790 }, { "epoch": 0.28690605730866886, "grad_norm": 0.17106997966766357, "learning_rate": 0.00016278231144008053, "loss": 0.227, "step": 791 }, { "epoch": 0.28726877040261156, "grad_norm": 0.16591399908065796, "learning_rate": 0.00016269325410660517, "loss": 0.2001, "step": 792 }, { "epoch": 0.2876314834965542, "grad_norm": 0.17908765375614166, "learning_rate": 0.00016260411477514265, "loss": 0.2311, "step": 793 }, { "epoch": 0.2879941965904969, "grad_norm": 0.2103756070137024, "learning_rate": 0.00016251489356228037, "loss": 0.251, "step": 794 }, { "epoch": 0.2883569096844396, "grad_norm": 0.1727806031703949, "learning_rate": 0.00016242559058471292, "loss": 0.2193, "step": 795 }, { "epoch": 0.2887196227783823, "grad_norm": 0.15671540796756744, "learning_rate": 0.0001623362059592417, "loss": 0.2462, "step": 796 }, { "epoch": 0.289082335872325, "grad_norm": 0.14824596047401428, "learning_rate": 0.00016224673980277503, "loss": 0.2235, "step": 797 }, { "epoch": 0.28944504896626766, "grad_norm": 0.15403501689434052, "learning_rate": 0.00016215719223232778, "loss": 0.2644, "step": 798 }, { "epoch": 0.28980776206021036, "grad_norm": 0.15009653568267822, "learning_rate": 0.0001620675633650213, "loss": 0.243, "step": 799 }, { "epoch": 0.29017047515415306, "grad_norm": 0.16066166758537292, "learning_rate": 0.0001619778533180834, "loss": 0.2171, "step": 800 }, { "epoch": 0.29053318824809576, "grad_norm": 0.15927597880363464, "learning_rate": 0.00016188806220884786, "loss": 0.217, "step": 801 }, { "epoch": 0.29089590134203847, "grad_norm": 0.14611735939979553, "learning_rate": 0.00016179819015475465, "loss": 0.2204, "step": 802 }, { "epoch": 0.2912586144359811, "grad_norm": 0.14521051943302155, "learning_rate": 0.00016170823727334956, "loss": 0.1962, "step": 803 }, { "epoch": 0.2916213275299238, "grad_norm": 0.1608162224292755, "learning_rate": 0.00016161820368228402, "loss": 0.2263, "step": 804 }, { "epoch": 0.2919840406238665, "grad_norm": 0.1577100157737732, "learning_rate": 0.00016152808949931516, "loss": 0.2208, "step": 805 }, { "epoch": 0.2923467537178092, "grad_norm": 0.15033476054668427, "learning_rate": 0.00016143789484230543, "loss": 0.215, "step": 806 }, { "epoch": 0.2927094668117519, "grad_norm": 0.14740067720413208, "learning_rate": 0.00016134761982922253, "loss": 0.2042, "step": 807 }, { "epoch": 0.2930721799056946, "grad_norm": 0.15068073570728302, "learning_rate": 0.0001612572645781393, "loss": 0.2221, "step": 808 }, { "epoch": 0.29343489299963726, "grad_norm": 0.17142775654792786, "learning_rate": 0.00016116682920723352, "loss": 0.2142, "step": 809 }, { "epoch": 0.29379760609357997, "grad_norm": 0.15067829191684723, "learning_rate": 0.0001610763138347877, "loss": 0.2225, "step": 810 }, { "epoch": 0.29416031918752267, "grad_norm": 0.1574852466583252, "learning_rate": 0.0001609857185791891, "loss": 0.2106, "step": 811 }, { "epoch": 0.29452303228146537, "grad_norm": 0.17060889303684235, "learning_rate": 0.00016089504355892931, "loss": 0.233, "step": 812 }, { "epoch": 0.29488574537540807, "grad_norm": 0.14020898938179016, "learning_rate": 0.0001608042888926044, "loss": 0.2162, "step": 813 }, { "epoch": 0.2952484584693507, "grad_norm": 0.1367609053850174, "learning_rate": 0.0001607134546989145, "loss": 0.2224, "step": 814 }, { "epoch": 0.2956111715632934, "grad_norm": 0.14028465747833252, "learning_rate": 0.0001606225410966638, "loss": 0.2237, "step": 815 }, { "epoch": 0.2959738846572361, "grad_norm": 0.13773570954799652, "learning_rate": 0.00016053154820476037, "loss": 0.224, "step": 816 }, { "epoch": 0.2963365977511788, "grad_norm": 0.14603252708911896, "learning_rate": 0.000160440476142216, "loss": 0.217, "step": 817 }, { "epoch": 0.2966993108451215, "grad_norm": 0.15531830489635468, "learning_rate": 0.00016034932502814587, "loss": 0.2137, "step": 818 }, { "epoch": 0.2970620239390642, "grad_norm": 0.15454085171222687, "learning_rate": 0.00016025809498176874, "loss": 0.2244, "step": 819 }, { "epoch": 0.29742473703300687, "grad_norm": 0.1548180729150772, "learning_rate": 0.0001601667861224066, "loss": 0.2517, "step": 820 }, { "epoch": 0.29778745012694957, "grad_norm": 0.1498357206583023, "learning_rate": 0.00016007539856948436, "loss": 0.2512, "step": 821 }, { "epoch": 0.2981501632208923, "grad_norm": 0.1419772207736969, "learning_rate": 0.00015998393244253002, "loss": 0.2067, "step": 822 }, { "epoch": 0.298512876314835, "grad_norm": 0.14814653992652893, "learning_rate": 0.0001598923878611743, "loss": 0.2293, "step": 823 }, { "epoch": 0.2988755894087777, "grad_norm": 0.15222403407096863, "learning_rate": 0.00015980076494515047, "loss": 0.2247, "step": 824 }, { "epoch": 0.2992383025027203, "grad_norm": 0.1679450124502182, "learning_rate": 0.0001597090638142943, "loss": 0.2631, "step": 825 }, { "epoch": 0.299601015596663, "grad_norm": 0.14880560338497162, "learning_rate": 0.00015961728458854397, "loss": 0.2069, "step": 826 }, { "epoch": 0.2999637286906057, "grad_norm": 0.14599819481372833, "learning_rate": 0.00015952542738793956, "loss": 0.226, "step": 827 }, { "epoch": 0.3003264417845484, "grad_norm": 0.14673501253128052, "learning_rate": 0.00015943349233262332, "loss": 0.2131, "step": 828 }, { "epoch": 0.3006891548784911, "grad_norm": 0.1625213623046875, "learning_rate": 0.00015934147954283932, "loss": 0.2289, "step": 829 }, { "epoch": 0.30105186797243383, "grad_norm": 0.15041042864322662, "learning_rate": 0.00015924938913893324, "loss": 0.2217, "step": 830 }, { "epoch": 0.3014145810663765, "grad_norm": 0.14617730677127838, "learning_rate": 0.00015915722124135227, "loss": 0.2396, "step": 831 }, { "epoch": 0.3017772941603192, "grad_norm": 0.15437570214271545, "learning_rate": 0.00015906497597064495, "loss": 0.2434, "step": 832 }, { "epoch": 0.3021400072542619, "grad_norm": 0.146324023604393, "learning_rate": 0.00015897265344746113, "loss": 0.2621, "step": 833 }, { "epoch": 0.3025027203482046, "grad_norm": 0.15348979830741882, "learning_rate": 0.00015888025379255156, "loss": 0.2198, "step": 834 }, { "epoch": 0.3028654334421473, "grad_norm": 0.14553911983966827, "learning_rate": 0.00015878777712676796, "loss": 0.2168, "step": 835 }, { "epoch": 0.3032281465360899, "grad_norm": 0.15064238011837006, "learning_rate": 0.00015869522357106272, "loss": 0.2381, "step": 836 }, { "epoch": 0.30359085963003263, "grad_norm": 0.1429353505373001, "learning_rate": 0.00015860259324648886, "loss": 0.2444, "step": 837 }, { "epoch": 0.30395357272397533, "grad_norm": 0.14742977917194366, "learning_rate": 0.00015850988627419968, "loss": 0.2112, "step": 838 }, { "epoch": 0.30431628581791803, "grad_norm": 0.14249765872955322, "learning_rate": 0.00015841710277544896, "loss": 0.2287, "step": 839 }, { "epoch": 0.30467899891186073, "grad_norm": 0.14514710009098053, "learning_rate": 0.00015832424287159027, "loss": 0.2229, "step": 840 }, { "epoch": 0.30504171200580343, "grad_norm": 0.15762075781822205, "learning_rate": 0.00015823130668407738, "loss": 0.212, "step": 841 }, { "epoch": 0.3054044250997461, "grad_norm": 0.16756275296211243, "learning_rate": 0.00015813829433446367, "loss": 0.2431, "step": 842 }, { "epoch": 0.3057671381936888, "grad_norm": 0.2156544029712677, "learning_rate": 0.00015804520594440223, "loss": 0.2045, "step": 843 }, { "epoch": 0.3061298512876315, "grad_norm": 0.18604739010334015, "learning_rate": 0.00015795204163564556, "loss": 0.2644, "step": 844 }, { "epoch": 0.3064925643815742, "grad_norm": 0.14301113784313202, "learning_rate": 0.0001578588015300454, "loss": 0.2114, "step": 845 }, { "epoch": 0.3068552774755169, "grad_norm": 0.14301526546478271, "learning_rate": 0.00015776548574955275, "loss": 0.2127, "step": 846 }, { "epoch": 0.30721799056945953, "grad_norm": 0.15024398267269135, "learning_rate": 0.0001576720944162175, "loss": 0.207, "step": 847 }, { "epoch": 0.30758070366340223, "grad_norm": 0.14672665297985077, "learning_rate": 0.00015757862765218838, "loss": 0.2112, "step": 848 }, { "epoch": 0.30794341675734493, "grad_norm": 0.177405446767807, "learning_rate": 0.00015748508557971276, "loss": 0.2248, "step": 849 }, { "epoch": 0.30830612985128764, "grad_norm": 0.16310465335845947, "learning_rate": 0.00015739146832113656, "loss": 0.2389, "step": 850 }, { "epoch": 0.30866884294523034, "grad_norm": 0.14648981392383575, "learning_rate": 0.00015729777599890395, "loss": 0.2159, "step": 851 }, { "epoch": 0.30903155603917304, "grad_norm": 0.1470453441143036, "learning_rate": 0.0001572040087355574, "loss": 0.2216, "step": 852 }, { "epoch": 0.3093942691331157, "grad_norm": 0.15409401059150696, "learning_rate": 0.00015711016665373727, "loss": 0.2497, "step": 853 }, { "epoch": 0.3097569822270584, "grad_norm": 0.16030748188495636, "learning_rate": 0.0001570162498761819, "loss": 0.2108, "step": 854 }, { "epoch": 0.3101196953210011, "grad_norm": 0.16415894031524658, "learning_rate": 0.00015692225852572715, "loss": 0.2297, "step": 855 }, { "epoch": 0.3104824084149438, "grad_norm": 0.1503467857837677, "learning_rate": 0.00015682819272530663, "loss": 0.1972, "step": 856 }, { "epoch": 0.3108451215088865, "grad_norm": 0.15261000394821167, "learning_rate": 0.00015673405259795118, "loss": 0.2296, "step": 857 }, { "epoch": 0.31120783460282914, "grad_norm": 0.15605837106704712, "learning_rate": 0.00015663983826678888, "loss": 0.2135, "step": 858 }, { "epoch": 0.31157054769677184, "grad_norm": 0.13954474031925201, "learning_rate": 0.0001565455498550449, "loss": 0.2064, "step": 859 }, { "epoch": 0.31193326079071454, "grad_norm": 0.14538753032684326, "learning_rate": 0.0001564511874860413, "loss": 0.2279, "step": 860 }, { "epoch": 0.31229597388465724, "grad_norm": 0.1461893618106842, "learning_rate": 0.00015635675128319683, "loss": 0.2203, "step": 861 }, { "epoch": 0.31265868697859994, "grad_norm": 0.14321376383304596, "learning_rate": 0.0001562622413700268, "loss": 0.2112, "step": 862 }, { "epoch": 0.31302140007254264, "grad_norm": 0.14480461180210114, "learning_rate": 0.00015616765787014302, "loss": 0.2182, "step": 863 }, { "epoch": 0.3133841131664853, "grad_norm": 0.16734722256660461, "learning_rate": 0.00015607300090725342, "loss": 0.2222, "step": 864 }, { "epoch": 0.313746826260428, "grad_norm": 0.14616838097572327, "learning_rate": 0.00015597827060516211, "loss": 0.2075, "step": 865 }, { "epoch": 0.3141095393543707, "grad_norm": 0.16457431018352509, "learning_rate": 0.00015588346708776904, "loss": 0.2271, "step": 866 }, { "epoch": 0.3144722524483134, "grad_norm": 0.16780099272727966, "learning_rate": 0.00015578859047907004, "loss": 0.2196, "step": 867 }, { "epoch": 0.3148349655422561, "grad_norm": 0.14990176260471344, "learning_rate": 0.00015569364090315646, "loss": 0.2162, "step": 868 }, { "epoch": 0.31519767863619874, "grad_norm": 0.1400328129529953, "learning_rate": 0.00015559861848421505, "loss": 0.2114, "step": 869 }, { "epoch": 0.31556039173014144, "grad_norm": 0.15837667882442474, "learning_rate": 0.00015550352334652788, "loss": 0.2755, "step": 870 }, { "epoch": 0.31592310482408414, "grad_norm": 0.14617806673049927, "learning_rate": 0.00015540835561447214, "loss": 0.2029, "step": 871 }, { "epoch": 0.31628581791802685, "grad_norm": 0.1634027361869812, "learning_rate": 0.00015531311541251995, "loss": 0.2451, "step": 872 }, { "epoch": 0.31664853101196955, "grad_norm": 0.17340759932994843, "learning_rate": 0.00015521780286523824, "loss": 0.2267, "step": 873 }, { "epoch": 0.31701124410591225, "grad_norm": 0.15501338243484497, "learning_rate": 0.0001551224180972885, "loss": 0.1988, "step": 874 }, { "epoch": 0.3173739571998549, "grad_norm": 0.15017758309841156, "learning_rate": 0.00015502696123342676, "loss": 0.211, "step": 875 }, { "epoch": 0.3177366702937976, "grad_norm": 0.15657378733158112, "learning_rate": 0.00015493143239850329, "loss": 0.2092, "step": 876 }, { "epoch": 0.3180993833877403, "grad_norm": 0.15220540761947632, "learning_rate": 0.00015483583171746248, "loss": 0.2413, "step": 877 }, { "epoch": 0.318462096481683, "grad_norm": 0.15332242846488953, "learning_rate": 0.00015474015931534276, "loss": 0.2333, "step": 878 }, { "epoch": 0.3188248095756257, "grad_norm": 0.14318165183067322, "learning_rate": 0.00015464441531727632, "loss": 0.2282, "step": 879 }, { "epoch": 0.31918752266956835, "grad_norm": 0.15234385430812836, "learning_rate": 0.00015454859984848895, "loss": 0.2092, "step": 880 }, { "epoch": 0.31955023576351105, "grad_norm": 0.15263251960277557, "learning_rate": 0.0001544527130343, "loss": 0.2142, "step": 881 }, { "epoch": 0.31991294885745375, "grad_norm": 0.1610080748796463, "learning_rate": 0.00015435675500012212, "loss": 0.2305, "step": 882 }, { "epoch": 0.32027566195139645, "grad_norm": 0.15507538616657257, "learning_rate": 0.00015426072587146106, "loss": 0.2316, "step": 883 }, { "epoch": 0.32063837504533915, "grad_norm": 0.16231822967529297, "learning_rate": 0.00015416462577391558, "loss": 0.2953, "step": 884 }, { "epoch": 0.32100108813928185, "grad_norm": 0.14619815349578857, "learning_rate": 0.00015406845483317727, "loss": 0.2335, "step": 885 }, { "epoch": 0.3213638012332245, "grad_norm": 0.15803977847099304, "learning_rate": 0.00015397221317503039, "loss": 0.212, "step": 886 }, { "epoch": 0.3217265143271672, "grad_norm": 0.148417666554451, "learning_rate": 0.00015387590092535164, "loss": 0.2063, "step": 887 }, { "epoch": 0.3220892274211099, "grad_norm": 0.1504986435174942, "learning_rate": 0.00015377951821011015, "loss": 0.2156, "step": 888 }, { "epoch": 0.3224519405150526, "grad_norm": 0.1552225649356842, "learning_rate": 0.00015368306515536708, "loss": 0.209, "step": 889 }, { "epoch": 0.3228146536089953, "grad_norm": 0.1671207845211029, "learning_rate": 0.00015358654188727568, "loss": 0.218, "step": 890 }, { "epoch": 0.32317736670293795, "grad_norm": 0.15497446060180664, "learning_rate": 0.00015348994853208104, "loss": 0.2239, "step": 891 }, { "epoch": 0.32354007979688065, "grad_norm": 0.16032548248767853, "learning_rate": 0.00015339328521611983, "loss": 0.2069, "step": 892 }, { "epoch": 0.32390279289082335, "grad_norm": 0.15629202127456665, "learning_rate": 0.00015329655206582036, "loss": 0.2262, "step": 893 }, { "epoch": 0.32426550598476606, "grad_norm": 0.15609470009803772, "learning_rate": 0.00015319974920770214, "loss": 0.2444, "step": 894 }, { "epoch": 0.32462821907870876, "grad_norm": 0.16244526207447052, "learning_rate": 0.00015310287676837593, "loss": 0.211, "step": 895 }, { "epoch": 0.32499093217265146, "grad_norm": 0.1519642472267151, "learning_rate": 0.00015300593487454348, "loss": 0.2091, "step": 896 }, { "epoch": 0.3253536452665941, "grad_norm": 0.1546807587146759, "learning_rate": 0.0001529089236529974, "loss": 0.2226, "step": 897 }, { "epoch": 0.3257163583605368, "grad_norm": 0.14414747059345245, "learning_rate": 0.00015281184323062097, "loss": 0.2259, "step": 898 }, { "epoch": 0.3260790714544795, "grad_norm": 0.1484064757823944, "learning_rate": 0.00015271469373438792, "loss": 0.2353, "step": 899 }, { "epoch": 0.3264417845484222, "grad_norm": 0.15261922776699066, "learning_rate": 0.00015261747529136236, "loss": 0.2094, "step": 900 }, { "epoch": 0.3268044976423649, "grad_norm": 0.16096492111682892, "learning_rate": 0.00015252018802869866, "loss": 0.2102, "step": 901 }, { "epoch": 0.32716721073630756, "grad_norm": 0.14988648891448975, "learning_rate": 0.00015242283207364107, "loss": 0.1933, "step": 902 }, { "epoch": 0.32752992383025026, "grad_norm": 0.16668923199176788, "learning_rate": 0.00015232540755352373, "loss": 0.2132, "step": 903 }, { "epoch": 0.32789263692419296, "grad_norm": 0.1562613993883133, "learning_rate": 0.00015222791459577051, "loss": 0.2174, "step": 904 }, { "epoch": 0.32825535001813566, "grad_norm": 0.15152856707572937, "learning_rate": 0.00015213035332789477, "loss": 0.2223, "step": 905 }, { "epoch": 0.32861806311207836, "grad_norm": 0.15007184445858002, "learning_rate": 0.00015203272387749915, "loss": 0.2184, "step": 906 }, { "epoch": 0.32898077620602106, "grad_norm": 0.1500440090894699, "learning_rate": 0.0001519350263722755, "loss": 0.2493, "step": 907 }, { "epoch": 0.3293434892999637, "grad_norm": 0.15756063163280487, "learning_rate": 0.00015183726094000476, "loss": 0.2112, "step": 908 }, { "epoch": 0.3297062023939064, "grad_norm": 0.15649868547916412, "learning_rate": 0.00015173942770855655, "loss": 0.2105, "step": 909 }, { "epoch": 0.3300689154878491, "grad_norm": 0.17396046221256256, "learning_rate": 0.00015164152680588938, "loss": 0.2092, "step": 910 }, { "epoch": 0.3304316285817918, "grad_norm": 0.15336064994335175, "learning_rate": 0.00015154355836005006, "loss": 0.2168, "step": 911 }, { "epoch": 0.3307943416757345, "grad_norm": 0.1463136523962021, "learning_rate": 0.00015144552249917386, "loss": 0.2175, "step": 912 }, { "epoch": 0.33115705476967716, "grad_norm": 0.14064238965511322, "learning_rate": 0.0001513474193514842, "loss": 0.2342, "step": 913 }, { "epoch": 0.33151976786361986, "grad_norm": 0.15353120863437653, "learning_rate": 0.00015124924904529253, "loss": 0.2269, "step": 914 }, { "epoch": 0.33188248095756256, "grad_norm": 0.1634497493505478, "learning_rate": 0.00015115101170899806, "loss": 0.2303, "step": 915 }, { "epoch": 0.33224519405150527, "grad_norm": 0.15802593529224396, "learning_rate": 0.00015105270747108778, "loss": 0.2181, "step": 916 }, { "epoch": 0.33260790714544797, "grad_norm": 0.16792048513889313, "learning_rate": 0.00015095433646013606, "loss": 0.2042, "step": 917 }, { "epoch": 0.33297062023939067, "grad_norm": 0.14907622337341309, "learning_rate": 0.0001508558988048047, "loss": 0.198, "step": 918 }, { "epoch": 0.3333333333333333, "grad_norm": 0.15107260644435883, "learning_rate": 0.00015075739463384267, "loss": 0.2103, "step": 919 }, { "epoch": 0.333696046427276, "grad_norm": 0.16222083568572998, "learning_rate": 0.00015065882407608582, "loss": 0.2267, "step": 920 }, { "epoch": 0.3340587595212187, "grad_norm": 0.14550422132015228, "learning_rate": 0.00015056018726045697, "loss": 0.2197, "step": 921 }, { "epoch": 0.3344214726151614, "grad_norm": 0.14713485538959503, "learning_rate": 0.00015046148431596554, "loss": 0.2261, "step": 922 }, { "epoch": 0.3347841857091041, "grad_norm": 0.15137678384780884, "learning_rate": 0.0001503627153717074, "loss": 0.2196, "step": 923 }, { "epoch": 0.33514689880304677, "grad_norm": 0.15455511212348938, "learning_rate": 0.00015026388055686485, "loss": 0.2111, "step": 924 }, { "epoch": 0.33550961189698947, "grad_norm": 0.15710324048995972, "learning_rate": 0.00015016498000070618, "loss": 0.2138, "step": 925 }, { "epoch": 0.33587232499093217, "grad_norm": 0.19984076917171478, "learning_rate": 0.00015006601383258584, "loss": 0.2264, "step": 926 }, { "epoch": 0.33623503808487487, "grad_norm": 0.15135234594345093, "learning_rate": 0.000149966982181944, "loss": 0.2121, "step": 927 }, { "epoch": 0.3365977511788176, "grad_norm": 0.14553037285804749, "learning_rate": 0.0001498678851783065, "loss": 0.2095, "step": 928 }, { "epoch": 0.3369604642727603, "grad_norm": 0.1508447229862213, "learning_rate": 0.00014976872295128463, "loss": 0.2377, "step": 929 }, { "epoch": 0.3373231773667029, "grad_norm": 0.15726783871650696, "learning_rate": 0.000149669495630575, "loss": 0.2453, "step": 930 }, { "epoch": 0.3376858904606456, "grad_norm": 0.1487269550561905, "learning_rate": 0.0001495702033459594, "loss": 0.1958, "step": 931 }, { "epoch": 0.3380486035545883, "grad_norm": 0.1618356555700302, "learning_rate": 0.00014947084622730453, "loss": 0.2061, "step": 932 }, { "epoch": 0.338411316648531, "grad_norm": 0.178387850522995, "learning_rate": 0.00014937142440456195, "loss": 0.2358, "step": 933 }, { "epoch": 0.3387740297424737, "grad_norm": 0.15690761804580688, "learning_rate": 0.00014927193800776776, "loss": 0.2077, "step": 934 }, { "epoch": 0.33913674283641637, "grad_norm": 0.15255998075008392, "learning_rate": 0.00014917238716704258, "loss": 0.214, "step": 935 }, { "epoch": 0.3394994559303591, "grad_norm": 0.15309607982635498, "learning_rate": 0.00014907277201259132, "loss": 0.2326, "step": 936 }, { "epoch": 0.3398621690243018, "grad_norm": 0.15764005482196808, "learning_rate": 0.00014897309267470295, "loss": 0.2096, "step": 937 }, { "epoch": 0.3402248821182445, "grad_norm": 0.15512487292289734, "learning_rate": 0.0001488733492837505, "loss": 0.2322, "step": 938 }, { "epoch": 0.3405875952121872, "grad_norm": 0.17276284098625183, "learning_rate": 0.00014877354197019064, "loss": 0.2217, "step": 939 }, { "epoch": 0.3409503083061299, "grad_norm": 0.16367502510547638, "learning_rate": 0.00014867367086456373, "loss": 0.2187, "step": 940 }, { "epoch": 0.3413130214000725, "grad_norm": 0.1784859150648117, "learning_rate": 0.0001485737360974936, "loss": 0.2339, "step": 941 }, { "epoch": 0.3416757344940152, "grad_norm": 0.15108786523342133, "learning_rate": 0.00014847373779968724, "loss": 0.207, "step": 942 }, { "epoch": 0.34203844758795793, "grad_norm": 0.15686751902103424, "learning_rate": 0.00014837367610193476, "loss": 0.2155, "step": 943 }, { "epoch": 0.34240116068190063, "grad_norm": 0.17520059645175934, "learning_rate": 0.00014827355113510927, "loss": 0.2185, "step": 944 }, { "epoch": 0.34276387377584333, "grad_norm": 0.14490067958831787, "learning_rate": 0.0001481733630301666, "loss": 0.2049, "step": 945 }, { "epoch": 0.343126586869786, "grad_norm": 0.15382413566112518, "learning_rate": 0.000148073111918145, "loss": 0.2061, "step": 946 }, { "epoch": 0.3434892999637287, "grad_norm": 0.15271534025669098, "learning_rate": 0.0001479727979301654, "loss": 0.2208, "step": 947 }, { "epoch": 0.3438520130576714, "grad_norm": 0.1692724972963333, "learning_rate": 0.0001478724211974308, "loss": 0.24, "step": 948 }, { "epoch": 0.3442147261516141, "grad_norm": 0.18430864810943604, "learning_rate": 0.0001477719818512263, "loss": 0.2347, "step": 949 }, { "epoch": 0.3445774392455568, "grad_norm": 0.16035676002502441, "learning_rate": 0.00014767148002291886, "loss": 0.229, "step": 950 }, { "epoch": 0.34494015233949943, "grad_norm": 0.14710398018360138, "learning_rate": 0.00014757091584395726, "loss": 0.2184, "step": 951 }, { "epoch": 0.34530286543344213, "grad_norm": 0.1524522453546524, "learning_rate": 0.00014747028944587167, "loss": 0.2067, "step": 952 }, { "epoch": 0.34566557852738483, "grad_norm": 0.1544627547264099, "learning_rate": 0.00014736960096027385, "loss": 0.1903, "step": 953 }, { "epoch": 0.34602829162132753, "grad_norm": 0.15999731421470642, "learning_rate": 0.00014726885051885653, "loss": 0.1956, "step": 954 }, { "epoch": 0.34639100471527023, "grad_norm": 0.16488391160964966, "learning_rate": 0.00014716803825339368, "loss": 0.227, "step": 955 }, { "epoch": 0.34675371780921294, "grad_norm": 0.1626642644405365, "learning_rate": 0.00014706716429573996, "loss": 0.2302, "step": 956 }, { "epoch": 0.3471164309031556, "grad_norm": 0.1589454710483551, "learning_rate": 0.00014696622877783088, "loss": 0.1998, "step": 957 }, { "epoch": 0.3474791439970983, "grad_norm": 0.17863640189170837, "learning_rate": 0.00014686523183168236, "loss": 0.2244, "step": 958 }, { "epoch": 0.347841857091041, "grad_norm": 0.15809310972690582, "learning_rate": 0.00014676417358939063, "loss": 0.2156, "step": 959 }, { "epoch": 0.3482045701849837, "grad_norm": 0.14684627950191498, "learning_rate": 0.00014666305418313224, "loss": 0.2037, "step": 960 }, { "epoch": 0.3485672832789264, "grad_norm": 0.14479795098304749, "learning_rate": 0.00014656187374516365, "loss": 0.1991, "step": 961 }, { "epoch": 0.34892999637286903, "grad_norm": 0.17033007740974426, "learning_rate": 0.00014646063240782105, "loss": 0.1991, "step": 962 }, { "epoch": 0.34929270946681173, "grad_norm": 0.1695454865694046, "learning_rate": 0.00014635933030352044, "loss": 0.2039, "step": 963 }, { "epoch": 0.34965542256075444, "grad_norm": 0.16838496923446655, "learning_rate": 0.00014625796756475724, "loss": 0.2111, "step": 964 }, { "epoch": 0.35001813565469714, "grad_norm": 0.16217052936553955, "learning_rate": 0.00014615654432410612, "loss": 0.2091, "step": 965 }, { "epoch": 0.35038084874863984, "grad_norm": 0.15333756804466248, "learning_rate": 0.00014605506071422103, "loss": 0.2225, "step": 966 }, { "epoch": 0.35074356184258254, "grad_norm": 0.15081751346588135, "learning_rate": 0.00014595351686783465, "loss": 0.2138, "step": 967 }, { "epoch": 0.3511062749365252, "grad_norm": 0.16661369800567627, "learning_rate": 0.00014585191291775868, "loss": 0.211, "step": 968 }, { "epoch": 0.3514689880304679, "grad_norm": 0.15592342615127563, "learning_rate": 0.00014575024899688324, "loss": 0.2069, "step": 969 }, { "epoch": 0.3518317011244106, "grad_norm": 0.15869508683681488, "learning_rate": 0.00014564852523817705, "loss": 0.1961, "step": 970 }, { "epoch": 0.3521944142183533, "grad_norm": 0.18337900936603546, "learning_rate": 0.00014554674177468695, "loss": 0.2039, "step": 971 }, { "epoch": 0.352557127312296, "grad_norm": 0.20202304422855377, "learning_rate": 0.00014544489873953803, "loss": 0.2344, "step": 972 }, { "epoch": 0.35291984040623864, "grad_norm": 0.1616135686635971, "learning_rate": 0.0001453429962659331, "loss": 0.2117, "step": 973 }, { "epoch": 0.35328255350018134, "grad_norm": 0.15346059203147888, "learning_rate": 0.00014524103448715283, "loss": 0.2235, "step": 974 }, { "epoch": 0.35364526659412404, "grad_norm": 0.148000568151474, "learning_rate": 0.00014513901353655547, "loss": 0.1944, "step": 975 }, { "epoch": 0.35400797968806674, "grad_norm": 0.15789712965488434, "learning_rate": 0.00014503693354757667, "loss": 0.2139, "step": 976 }, { "epoch": 0.35437069278200944, "grad_norm": 0.16983194649219513, "learning_rate": 0.00014493479465372912, "loss": 0.2122, "step": 977 }, { "epoch": 0.35473340587595215, "grad_norm": 0.19161252677440643, "learning_rate": 0.0001448325969886028, "loss": 0.2799, "step": 978 }, { "epoch": 0.3550961189698948, "grad_norm": 0.16653449833393097, "learning_rate": 0.00014473034068586445, "loss": 0.2166, "step": 979 }, { "epoch": 0.3554588320638375, "grad_norm": 0.1566229611635208, "learning_rate": 0.00014462802587925742, "loss": 0.2104, "step": 980 }, { "epoch": 0.3558215451577802, "grad_norm": 0.15640553832054138, "learning_rate": 0.00014452565270260177, "loss": 0.1979, "step": 981 }, { "epoch": 0.3561842582517229, "grad_norm": 0.15835930407047272, "learning_rate": 0.00014442322128979372, "loss": 0.2412, "step": 982 }, { "epoch": 0.3565469713456656, "grad_norm": 0.172097310423851, "learning_rate": 0.00014432073177480576, "loss": 0.2146, "step": 983 }, { "epoch": 0.35690968443960824, "grad_norm": 0.1693415641784668, "learning_rate": 0.00014421818429168634, "loss": 0.2408, "step": 984 }, { "epoch": 0.35727239753355094, "grad_norm": 0.15985938906669617, "learning_rate": 0.00014411557897455973, "loss": 0.2167, "step": 985 }, { "epoch": 0.35763511062749365, "grad_norm": 0.16702041029930115, "learning_rate": 0.00014401291595762586, "loss": 0.2062, "step": 986 }, { "epoch": 0.35799782372143635, "grad_norm": 0.16588671505451202, "learning_rate": 0.00014391019537516006, "loss": 0.2023, "step": 987 }, { "epoch": 0.35836053681537905, "grad_norm": 0.15971873700618744, "learning_rate": 0.0001438074173615131, "loss": 0.2162, "step": 988 }, { "epoch": 0.35872324990932175, "grad_norm": 0.1973976194858551, "learning_rate": 0.0001437045820511107, "loss": 0.2135, "step": 989 }, { "epoch": 0.3590859630032644, "grad_norm": 0.1852118968963623, "learning_rate": 0.00014360168957845362, "loss": 0.2161, "step": 990 }, { "epoch": 0.3594486760972071, "grad_norm": 0.15597601234912872, "learning_rate": 0.00014349874007811735, "loss": 0.2053, "step": 991 }, { "epoch": 0.3598113891911498, "grad_norm": 0.16251103579998016, "learning_rate": 0.00014339573368475197, "loss": 0.2122, "step": 992 }, { "epoch": 0.3601741022850925, "grad_norm": 0.1582382768392563, "learning_rate": 0.00014329267053308194, "loss": 0.2175, "step": 993 }, { "epoch": 0.3605368153790352, "grad_norm": 0.15138986706733704, "learning_rate": 0.00014318955075790605, "loss": 0.201, "step": 994 }, { "epoch": 0.36089952847297785, "grad_norm": 0.16074247658252716, "learning_rate": 0.00014308637449409706, "loss": 0.2281, "step": 995 }, { "epoch": 0.36126224156692055, "grad_norm": 0.153158500790596, "learning_rate": 0.00014298314187660162, "loss": 0.1925, "step": 996 }, { "epoch": 0.36162495466086325, "grad_norm": 0.17264969646930695, "learning_rate": 0.00014287985304044015, "loss": 0.2069, "step": 997 }, { "epoch": 0.36198766775480595, "grad_norm": 0.18429549038410187, "learning_rate": 0.0001427765081207065, "loss": 0.2185, "step": 998 }, { "epoch": 0.36235038084874865, "grad_norm": 0.1758868545293808, "learning_rate": 0.000142673107252568, "loss": 0.2432, "step": 999 }, { "epoch": 0.36271309394269136, "grad_norm": 0.15705294907093048, "learning_rate": 0.00014256965057126504, "loss": 0.1986, "step": 1000 }, { "epoch": 0.363075807036634, "grad_norm": 0.1507769376039505, "learning_rate": 0.00014246613821211108, "loss": 0.1876, "step": 1001 }, { "epoch": 0.3634385201305767, "grad_norm": 0.17133677005767822, "learning_rate": 0.00014236257031049232, "loss": 0.209, "step": 1002 }, { "epoch": 0.3638012332245194, "grad_norm": 0.15936224162578583, "learning_rate": 0.00014225894700186774, "loss": 0.1974, "step": 1003 }, { "epoch": 0.3641639463184621, "grad_norm": 0.19848595559597015, "learning_rate": 0.00014215526842176868, "loss": 0.2218, "step": 1004 }, { "epoch": 0.3645266594124048, "grad_norm": 0.17126554250717163, "learning_rate": 0.00014205153470579882, "loss": 0.2229, "step": 1005 }, { "epoch": 0.36488937250634745, "grad_norm": 0.15903635323047638, "learning_rate": 0.0001419477459896339, "loss": 0.2127, "step": 1006 }, { "epoch": 0.36525208560029016, "grad_norm": 0.16994720697402954, "learning_rate": 0.00014184390240902167, "loss": 0.2289, "step": 1007 }, { "epoch": 0.36561479869423286, "grad_norm": 0.17226669192314148, "learning_rate": 0.00014174000409978156, "loss": 0.2147, "step": 1008 }, { "epoch": 0.36597751178817556, "grad_norm": 0.1492406278848648, "learning_rate": 0.00014163605119780467, "loss": 0.2087, "step": 1009 }, { "epoch": 0.36634022488211826, "grad_norm": 0.16116073727607727, "learning_rate": 0.00014153204383905344, "loss": 0.2176, "step": 1010 }, { "epoch": 0.36670293797606096, "grad_norm": 0.16366463899612427, "learning_rate": 0.00014142798215956148, "loss": 0.1925, "step": 1011 }, { "epoch": 0.3670656510700036, "grad_norm": 0.15476755797863007, "learning_rate": 0.00014132386629543364, "loss": 0.1994, "step": 1012 }, { "epoch": 0.3674283641639463, "grad_norm": 0.16290143132209778, "learning_rate": 0.00014121969638284542, "loss": 0.2131, "step": 1013 }, { "epoch": 0.367791077257889, "grad_norm": 0.15869063138961792, "learning_rate": 0.00014111547255804316, "loss": 0.1889, "step": 1014 }, { "epoch": 0.3681537903518317, "grad_norm": 0.1735077053308487, "learning_rate": 0.00014101119495734364, "loss": 0.2261, "step": 1015 }, { "epoch": 0.3685165034457744, "grad_norm": 0.16333554685115814, "learning_rate": 0.00014090686371713402, "loss": 0.2247, "step": 1016 }, { "epoch": 0.36887921653971706, "grad_norm": 0.18004798889160156, "learning_rate": 0.00014080247897387156, "loss": 0.2334, "step": 1017 }, { "epoch": 0.36924192963365976, "grad_norm": 0.16508519649505615, "learning_rate": 0.0001406980408640835, "loss": 0.1995, "step": 1018 }, { "epoch": 0.36960464272760246, "grad_norm": 0.1622190773487091, "learning_rate": 0.00014059354952436698, "loss": 0.2003, "step": 1019 }, { "epoch": 0.36996735582154516, "grad_norm": 0.16706664860248566, "learning_rate": 0.00014048900509138867, "loss": 0.219, "step": 1020 }, { "epoch": 0.37033006891548786, "grad_norm": 0.1640990823507309, "learning_rate": 0.00014038440770188467, "loss": 0.2018, "step": 1021 }, { "epoch": 0.37069278200943057, "grad_norm": 0.17155148088932037, "learning_rate": 0.0001402797574926604, "loss": 0.2234, "step": 1022 }, { "epoch": 0.3710554951033732, "grad_norm": 0.1780928671360016, "learning_rate": 0.00014017505460059036, "loss": 0.2346, "step": 1023 }, { "epoch": 0.3714182081973159, "grad_norm": 0.1557503193616867, "learning_rate": 0.0001400702991626179, "loss": 0.1969, "step": 1024 }, { "epoch": 0.3717809212912586, "grad_norm": 0.14212948083877563, "learning_rate": 0.00013996549131575515, "loss": 0.1883, "step": 1025 }, { "epoch": 0.3721436343852013, "grad_norm": 0.16952791810035706, "learning_rate": 0.00013986063119708275, "loss": 0.2157, "step": 1026 }, { "epoch": 0.372506347479144, "grad_norm": 0.16988742351531982, "learning_rate": 0.00013975571894374973, "loss": 0.2103, "step": 1027 }, { "epoch": 0.37286906057308666, "grad_norm": 0.16801071166992188, "learning_rate": 0.00013965075469297332, "loss": 0.2094, "step": 1028 }, { "epoch": 0.37323177366702937, "grad_norm": 0.19034814834594727, "learning_rate": 0.00013954573858203874, "loss": 0.2444, "step": 1029 }, { "epoch": 0.37359448676097207, "grad_norm": 0.15771815180778503, "learning_rate": 0.000139440670748299, "loss": 0.1987, "step": 1030 }, { "epoch": 0.37395719985491477, "grad_norm": 0.1528027504682541, "learning_rate": 0.00013933555132917487, "loss": 0.2138, "step": 1031 }, { "epoch": 0.37431991294885747, "grad_norm": 0.16030389070510864, "learning_rate": 0.00013923038046215446, "loss": 0.2057, "step": 1032 }, { "epoch": 0.37468262604280017, "grad_norm": 0.1645725667476654, "learning_rate": 0.0001391251582847932, "loss": 0.1957, "step": 1033 }, { "epoch": 0.3750453391367428, "grad_norm": 0.17184780538082123, "learning_rate": 0.0001390198849347138, "loss": 0.2244, "step": 1034 }, { "epoch": 0.3754080522306855, "grad_norm": 0.16507604718208313, "learning_rate": 0.00013891456054960564, "loss": 0.2126, "step": 1035 }, { "epoch": 0.3757707653246282, "grad_norm": 0.15355214476585388, "learning_rate": 0.00013880918526722497, "loss": 0.1853, "step": 1036 }, { "epoch": 0.3761334784185709, "grad_norm": 0.1596059501171112, "learning_rate": 0.00013870375922539466, "loss": 0.229, "step": 1037 }, { "epoch": 0.3764961915125136, "grad_norm": 0.16307580471038818, "learning_rate": 0.00013859828256200394, "loss": 0.2149, "step": 1038 }, { "epoch": 0.37685890460645627, "grad_norm": 0.15789788961410522, "learning_rate": 0.00013849275541500812, "loss": 0.2351, "step": 1039 }, { "epoch": 0.37722161770039897, "grad_norm": 0.1589316725730896, "learning_rate": 0.00013838717792242876, "loss": 0.2164, "step": 1040 }, { "epoch": 0.37758433079434167, "grad_norm": 0.15134315192699432, "learning_rate": 0.00013828155022235308, "loss": 0.1925, "step": 1041 }, { "epoch": 0.3779470438882844, "grad_norm": 0.14640171825885773, "learning_rate": 0.00013817587245293407, "loss": 0.2138, "step": 1042 }, { "epoch": 0.3783097569822271, "grad_norm": 0.1695149838924408, "learning_rate": 0.0001380701447523902, "loss": 0.2139, "step": 1043 }, { "epoch": 0.3786724700761698, "grad_norm": 0.1683790236711502, "learning_rate": 0.0001379643672590052, "loss": 0.1954, "step": 1044 }, { "epoch": 0.3790351831701124, "grad_norm": 0.17694401741027832, "learning_rate": 0.00013785854011112798, "loss": 0.2022, "step": 1045 }, { "epoch": 0.3793978962640551, "grad_norm": 0.17428404092788696, "learning_rate": 0.00013775266344717233, "loss": 0.1832, "step": 1046 }, { "epoch": 0.3797606093579978, "grad_norm": 0.1612454652786255, "learning_rate": 0.00013764673740561685, "loss": 0.1917, "step": 1047 }, { "epoch": 0.3801233224519405, "grad_norm": 0.16686902940273285, "learning_rate": 0.0001375407621250047, "loss": 0.1989, "step": 1048 }, { "epoch": 0.3804860355458832, "grad_norm": 0.14911605417728424, "learning_rate": 0.00013743473774394346, "loss": 0.2004, "step": 1049 }, { "epoch": 0.3808487486398259, "grad_norm": 0.15896974503993988, "learning_rate": 0.00013732866440110497, "loss": 0.2466, "step": 1050 }, { "epoch": 0.3812114617337686, "grad_norm": 0.16059251129627228, "learning_rate": 0.000137222542235225, "loss": 0.2042, "step": 1051 }, { "epoch": 0.3815741748277113, "grad_norm": 0.16174575686454773, "learning_rate": 0.0001371163713851032, "loss": 0.1979, "step": 1052 }, { "epoch": 0.381936887921654, "grad_norm": 0.1577538102865219, "learning_rate": 0.00013701015198960302, "loss": 0.213, "step": 1053 }, { "epoch": 0.3822996010155967, "grad_norm": 0.1710449755191803, "learning_rate": 0.0001369038841876513, "loss": 0.223, "step": 1054 }, { "epoch": 0.3826623141095394, "grad_norm": 0.17627973854541779, "learning_rate": 0.00013679756811823813, "loss": 0.2397, "step": 1055 }, { "epoch": 0.383025027203482, "grad_norm": 0.15820728242397308, "learning_rate": 0.0001366912039204169, "loss": 0.1959, "step": 1056 }, { "epoch": 0.38338774029742473, "grad_norm": 0.15889425575733185, "learning_rate": 0.00013658479173330384, "loss": 0.1805, "step": 1057 }, { "epoch": 0.38375045339136743, "grad_norm": 0.18348795175552368, "learning_rate": 0.00013647833169607788, "loss": 0.2061, "step": 1058 }, { "epoch": 0.38411316648531013, "grad_norm": 0.16327665746212006, "learning_rate": 0.0001363718239479807, "loss": 0.1899, "step": 1059 }, { "epoch": 0.38447587957925283, "grad_norm": 0.15636590123176575, "learning_rate": 0.00013626526862831628, "loss": 0.2161, "step": 1060 }, { "epoch": 0.3848385926731955, "grad_norm": 0.158644899725914, "learning_rate": 0.00013615866587645084, "loss": 0.1991, "step": 1061 }, { "epoch": 0.3852013057671382, "grad_norm": 0.16064795851707458, "learning_rate": 0.0001360520158318126, "loss": 0.2009, "step": 1062 }, { "epoch": 0.3855640188610809, "grad_norm": 0.18209217488765717, "learning_rate": 0.00013594531863389173, "loss": 0.2538, "step": 1063 }, { "epoch": 0.3859267319550236, "grad_norm": 0.16186301410198212, "learning_rate": 0.00013583857442223994, "loss": 0.2249, "step": 1064 }, { "epoch": 0.3862894450489663, "grad_norm": 0.16660407185554504, "learning_rate": 0.00013573178333647058, "loss": 0.2116, "step": 1065 }, { "epoch": 0.386652158142909, "grad_norm": 0.16199025511741638, "learning_rate": 0.0001356249455162582, "loss": 0.2156, "step": 1066 }, { "epoch": 0.38701487123685163, "grad_norm": 0.1578529328107834, "learning_rate": 0.0001355180611013385, "loss": 0.2066, "step": 1067 }, { "epoch": 0.38737758433079433, "grad_norm": 0.17841364443302155, "learning_rate": 0.00013541113023150816, "loss": 0.205, "step": 1068 }, { "epoch": 0.38774029742473703, "grad_norm": 0.1555965095758438, "learning_rate": 0.00013530415304662457, "loss": 0.2027, "step": 1069 }, { "epoch": 0.38810301051867974, "grad_norm": 0.15105211734771729, "learning_rate": 0.00013519712968660568, "loss": 0.1963, "step": 1070 }, { "epoch": 0.38846572361262244, "grad_norm": 0.16452065110206604, "learning_rate": 0.0001350900602914299, "loss": 0.2129, "step": 1071 }, { "epoch": 0.3888284367065651, "grad_norm": 0.16760526597499847, "learning_rate": 0.00013498294500113585, "loss": 0.2418, "step": 1072 }, { "epoch": 0.3891911498005078, "grad_norm": 0.16931942105293274, "learning_rate": 0.00013487578395582206, "loss": 0.1914, "step": 1073 }, { "epoch": 0.3895538628944505, "grad_norm": 0.1739332228899002, "learning_rate": 0.0001347685772956471, "loss": 0.2107, "step": 1074 }, { "epoch": 0.3899165759883932, "grad_norm": 0.1568581908941269, "learning_rate": 0.00013466132516082907, "loss": 0.1835, "step": 1075 }, { "epoch": 0.3902792890823359, "grad_norm": 0.16916148364543915, "learning_rate": 0.0001345540276916455, "loss": 0.2041, "step": 1076 }, { "epoch": 0.3906420021762786, "grad_norm": 0.16345995664596558, "learning_rate": 0.0001344466850284333, "loss": 0.1789, "step": 1077 }, { "epoch": 0.39100471527022124, "grad_norm": 0.16848930716514587, "learning_rate": 0.00013433929731158852, "loss": 0.1961, "step": 1078 }, { "epoch": 0.39136742836416394, "grad_norm": 0.1991538405418396, "learning_rate": 0.00013423186468156608, "loss": 0.2544, "step": 1079 }, { "epoch": 0.39173014145810664, "grad_norm": 0.17732208967208862, "learning_rate": 0.0001341243872788796, "loss": 0.258, "step": 1080 }, { "epoch": 0.39209285455204934, "grad_norm": 0.16117359697818756, "learning_rate": 0.0001340168652441014, "loss": 0.2389, "step": 1081 }, { "epoch": 0.39245556764599204, "grad_norm": 0.1693982034921646, "learning_rate": 0.00013390929871786203, "loss": 0.2022, "step": 1082 }, { "epoch": 0.3928182807399347, "grad_norm": 0.1722104698419571, "learning_rate": 0.00013380168784085027, "loss": 0.1977, "step": 1083 }, { "epoch": 0.3931809938338774, "grad_norm": 0.1871337741613388, "learning_rate": 0.000133694032753813, "loss": 0.2249, "step": 1084 }, { "epoch": 0.3935437069278201, "grad_norm": 0.17777620255947113, "learning_rate": 0.0001335863335975548, "loss": 0.1949, "step": 1085 }, { "epoch": 0.3939064200217628, "grad_norm": 0.18331852555274963, "learning_rate": 0.00013347859051293792, "loss": 0.1969, "step": 1086 }, { "epoch": 0.3942691331157055, "grad_norm": 0.158721461892128, "learning_rate": 0.0001333708036408821, "loss": 0.1919, "step": 1087 }, { "epoch": 0.3946318462096482, "grad_norm": 0.16589364409446716, "learning_rate": 0.00013326297312236439, "loss": 0.2044, "step": 1088 }, { "epoch": 0.39499455930359084, "grad_norm": 0.15952499210834503, "learning_rate": 0.0001331550990984188, "loss": 0.2005, "step": 1089 }, { "epoch": 0.39535727239753354, "grad_norm": 0.15588688850402832, "learning_rate": 0.00013304718171013632, "loss": 0.2234, "step": 1090 }, { "epoch": 0.39571998549147624, "grad_norm": 0.17283542454242706, "learning_rate": 0.0001329392210986647, "loss": 0.2001, "step": 1091 }, { "epoch": 0.39608269858541895, "grad_norm": 0.15617555379867554, "learning_rate": 0.00013283121740520812, "loss": 0.1982, "step": 1092 }, { "epoch": 0.39644541167936165, "grad_norm": 0.18503715097904205, "learning_rate": 0.0001327231707710272, "loss": 0.2315, "step": 1093 }, { "epoch": 0.3968081247733043, "grad_norm": 0.16704030334949493, "learning_rate": 0.00013261508133743865, "loss": 0.2, "step": 1094 }, { "epoch": 0.397170837867247, "grad_norm": 0.17188745737075806, "learning_rate": 0.0001325069492458152, "loss": 0.258, "step": 1095 }, { "epoch": 0.3975335509611897, "grad_norm": 0.1544748693704605, "learning_rate": 0.00013239877463758537, "loss": 0.19, "step": 1096 }, { "epoch": 0.3978962640551324, "grad_norm": 0.1821664571762085, "learning_rate": 0.0001322905576542333, "loss": 0.2071, "step": 1097 }, { "epoch": 0.3982589771490751, "grad_norm": 0.15686167776584625, "learning_rate": 0.00013218229843729856, "loss": 0.1807, "step": 1098 }, { "epoch": 0.39862169024301775, "grad_norm": 0.1645747721195221, "learning_rate": 0.00013207399712837582, "loss": 0.1941, "step": 1099 }, { "epoch": 0.39898440333696045, "grad_norm": 0.15510335564613342, "learning_rate": 0.00013196565386911505, "loss": 0.1982, "step": 1100 }, { "epoch": 0.39934711643090315, "grad_norm": 0.17434607446193695, "learning_rate": 0.0001318572688012209, "loss": 0.2012, "step": 1101 }, { "epoch": 0.39970982952484585, "grad_norm": 0.1454346626996994, "learning_rate": 0.00013174884206645278, "loss": 0.1887, "step": 1102 }, { "epoch": 0.40007254261878855, "grad_norm": 0.16709522902965546, "learning_rate": 0.00013164037380662452, "loss": 0.1914, "step": 1103 }, { "epoch": 0.40043525571273125, "grad_norm": 0.17922160029411316, "learning_rate": 0.0001315318641636044, "loss": 0.2002, "step": 1104 }, { "epoch": 0.4007979688066739, "grad_norm": 0.1769881397485733, "learning_rate": 0.00013142331327931469, "loss": 0.1993, "step": 1105 }, { "epoch": 0.4011606819006166, "grad_norm": 0.1627112329006195, "learning_rate": 0.00013131472129573166, "loss": 0.2096, "step": 1106 }, { "epoch": 0.4015233949945593, "grad_norm": 0.1649940013885498, "learning_rate": 0.00013120608835488532, "loss": 0.2032, "step": 1107 }, { "epoch": 0.401886108088502, "grad_norm": 0.18944235146045685, "learning_rate": 0.00013109741459885928, "loss": 0.2163, "step": 1108 }, { "epoch": 0.4022488211824447, "grad_norm": 0.16329450905323029, "learning_rate": 0.00013098870016979051, "loss": 0.1833, "step": 1109 }, { "epoch": 0.40261153427638735, "grad_norm": 0.20053814351558685, "learning_rate": 0.00013087994520986923, "loss": 0.2166, "step": 1110 }, { "epoch": 0.40297424737033005, "grad_norm": 0.19225598871707916, "learning_rate": 0.00013077114986133847, "loss": 0.2544, "step": 1111 }, { "epoch": 0.40333696046427275, "grad_norm": 0.17340917885303497, "learning_rate": 0.00013066231426649437, "loss": 0.2005, "step": 1112 }, { "epoch": 0.40369967355821545, "grad_norm": 0.1653253436088562, "learning_rate": 0.00013055343856768555, "loss": 0.2119, "step": 1113 }, { "epoch": 0.40406238665215816, "grad_norm": 0.16865472495555878, "learning_rate": 0.00013044452290731306, "loss": 0.1748, "step": 1114 }, { "epoch": 0.40442509974610086, "grad_norm": 0.17820391058921814, "learning_rate": 0.0001303355674278303, "loss": 0.2094, "step": 1115 }, { "epoch": 0.4047878128400435, "grad_norm": 0.17825227975845337, "learning_rate": 0.0001302265722717427, "loss": 0.2174, "step": 1116 }, { "epoch": 0.4051505259339862, "grad_norm": 0.6229606866836548, "learning_rate": 0.0001301175375816076, "loss": 0.2072, "step": 1117 }, { "epoch": 0.4055132390279289, "grad_norm": 0.21105241775512695, "learning_rate": 0.0001300084635000341, "loss": 0.2041, "step": 1118 }, { "epoch": 0.4058759521218716, "grad_norm": 0.20768609642982483, "learning_rate": 0.00012989935016968266, "loss": 0.2091, "step": 1119 }, { "epoch": 0.4062386652158143, "grad_norm": 0.1655317097902298, "learning_rate": 0.00012979019773326524, "loss": 0.2095, "step": 1120 }, { "epoch": 0.40660137830975696, "grad_norm": 0.1594689041376114, "learning_rate": 0.00012968100633354492, "loss": 0.1922, "step": 1121 }, { "epoch": 0.40696409140369966, "grad_norm": 0.1779058277606964, "learning_rate": 0.00012957177611333566, "loss": 0.1948, "step": 1122 }, { "epoch": 0.40732680449764236, "grad_norm": 0.18424735963344574, "learning_rate": 0.00012946250721550224, "loss": 0.2174, "step": 1123 }, { "epoch": 0.40768951759158506, "grad_norm": 0.19321289658546448, "learning_rate": 0.00012935319978296008, "loss": 0.2032, "step": 1124 }, { "epoch": 0.40805223068552776, "grad_norm": 0.1741238832473755, "learning_rate": 0.00012924385395867493, "loss": 0.1928, "step": 1125 }, { "epoch": 0.40841494377947046, "grad_norm": 0.16779236495494843, "learning_rate": 0.00012913446988566273, "loss": 0.2021, "step": 1126 }, { "epoch": 0.4087776568734131, "grad_norm": 0.16747735440731049, "learning_rate": 0.00012902504770698954, "loss": 0.1993, "step": 1127 }, { "epoch": 0.4091403699673558, "grad_norm": 0.18401382863521576, "learning_rate": 0.00012891558756577122, "loss": 0.2151, "step": 1128 }, { "epoch": 0.4095030830612985, "grad_norm": 0.15898433327674866, "learning_rate": 0.00012880608960517322, "loss": 0.187, "step": 1129 }, { "epoch": 0.4098657961552412, "grad_norm": 0.1666088104248047, "learning_rate": 0.0001286965539684106, "loss": 0.1849, "step": 1130 }, { "epoch": 0.4102285092491839, "grad_norm": 0.17613482475280762, "learning_rate": 0.00012858698079874748, "loss": 0.1993, "step": 1131 }, { "epoch": 0.41059122234312656, "grad_norm": 0.17263801395893097, "learning_rate": 0.0001284773702394973, "loss": 0.1947, "step": 1132 }, { "epoch": 0.41095393543706926, "grad_norm": 0.1618073433637619, "learning_rate": 0.00012836772243402224, "loss": 0.1869, "step": 1133 }, { "epoch": 0.41131664853101196, "grad_norm": 0.1828174889087677, "learning_rate": 0.00012825803752573327, "loss": 0.2207, "step": 1134 }, { "epoch": 0.41167936162495467, "grad_norm": 0.17469796538352966, "learning_rate": 0.00012814831565808986, "loss": 0.2008, "step": 1135 }, { "epoch": 0.41204207471889737, "grad_norm": 0.17154814302921295, "learning_rate": 0.00012803855697459987, "loss": 0.2098, "step": 1136 }, { "epoch": 0.41240478781284007, "grad_norm": 0.1646650731563568, "learning_rate": 0.00012792876161881925, "loss": 0.2103, "step": 1137 }, { "epoch": 0.4127675009067827, "grad_norm": 0.17539532482624054, "learning_rate": 0.00012781892973435195, "loss": 0.1966, "step": 1138 }, { "epoch": 0.4131302140007254, "grad_norm": 0.17781807482242584, "learning_rate": 0.00012770906146484964, "loss": 0.206, "step": 1139 }, { "epoch": 0.4134929270946681, "grad_norm": 0.1847347617149353, "learning_rate": 0.0001275991569540117, "loss": 0.2026, "step": 1140 }, { "epoch": 0.4138556401886108, "grad_norm": 0.17020414769649506, "learning_rate": 0.00012748921634558473, "loss": 0.1958, "step": 1141 }, { "epoch": 0.4142183532825535, "grad_norm": 0.18093371391296387, "learning_rate": 0.00012737923978336274, "loss": 0.2062, "step": 1142 }, { "epoch": 0.41458106637649617, "grad_norm": 0.1588636189699173, "learning_rate": 0.00012726922741118662, "loss": 0.1892, "step": 1143 }, { "epoch": 0.41494377947043887, "grad_norm": 0.19953924417495728, "learning_rate": 0.00012715917937294418, "loss": 0.2188, "step": 1144 }, { "epoch": 0.41530649256438157, "grad_norm": 0.16585423052310944, "learning_rate": 0.00012704909581256986, "loss": 0.2231, "step": 1145 }, { "epoch": 0.41566920565832427, "grad_norm": 0.17226840555667877, "learning_rate": 0.0001269389768740445, "loss": 0.1895, "step": 1146 }, { "epoch": 0.41603191875226697, "grad_norm": 0.2125304490327835, "learning_rate": 0.00012682882270139526, "loss": 0.2122, "step": 1147 }, { "epoch": 0.4163946318462097, "grad_norm": 0.19522660970687866, "learning_rate": 0.00012671863343869543, "loss": 0.2055, "step": 1148 }, { "epoch": 0.4167573449401523, "grad_norm": 0.19831117987632751, "learning_rate": 0.00012660840923006412, "loss": 0.189, "step": 1149 }, { "epoch": 0.417120058034095, "grad_norm": 0.16252368688583374, "learning_rate": 0.0001264981502196662, "loss": 0.2051, "step": 1150 }, { "epoch": 0.4174827711280377, "grad_norm": 0.17360906302928925, "learning_rate": 0.00012638785655171196, "loss": 0.1957, "step": 1151 }, { "epoch": 0.4178454842219804, "grad_norm": 0.1837020069360733, "learning_rate": 0.0001262775283704572, "loss": 0.2131, "step": 1152 }, { "epoch": 0.4182081973159231, "grad_norm": 0.1726016104221344, "learning_rate": 0.00012616716582020265, "loss": 0.1897, "step": 1153 }, { "epoch": 0.41857091040986577, "grad_norm": 0.16881223022937775, "learning_rate": 0.00012605676904529415, "loss": 0.1905, "step": 1154 }, { "epoch": 0.41893362350380847, "grad_norm": 0.2182941734790802, "learning_rate": 0.00012594633819012225, "loss": 0.2176, "step": 1155 }, { "epoch": 0.4192963365977512, "grad_norm": 0.1766914576292038, "learning_rate": 0.00012583587339912207, "loss": 0.2067, "step": 1156 }, { "epoch": 0.4196590496916939, "grad_norm": 0.16632500290870667, "learning_rate": 0.00012572537481677308, "loss": 0.1902, "step": 1157 }, { "epoch": 0.4200217627856366, "grad_norm": 0.16559042036533356, "learning_rate": 0.00012561484258759905, "loss": 0.1848, "step": 1158 }, { "epoch": 0.4203844758795793, "grad_norm": 0.16212663054466248, "learning_rate": 0.00012550427685616765, "loss": 0.2009, "step": 1159 }, { "epoch": 0.4207471889735219, "grad_norm": 0.16951881349086761, "learning_rate": 0.0001253936777670904, "loss": 0.1896, "step": 1160 }, { "epoch": 0.4211099020674646, "grad_norm": 0.19102217257022858, "learning_rate": 0.0001252830454650225, "loss": 0.2012, "step": 1161 }, { "epoch": 0.4214726151614073, "grad_norm": 0.1638030707836151, "learning_rate": 0.00012517238009466253, "loss": 0.1731, "step": 1162 }, { "epoch": 0.42183532825535003, "grad_norm": 0.1885092556476593, "learning_rate": 0.00012506168180075232, "loss": 0.212, "step": 1163 }, { "epoch": 0.42219804134929273, "grad_norm": 0.19661776721477509, "learning_rate": 0.00012495095072807678, "loss": 0.1969, "step": 1164 }, { "epoch": 0.4225607544432354, "grad_norm": 0.1665484458208084, "learning_rate": 0.00012484018702146375, "loss": 0.1886, "step": 1165 }, { "epoch": 0.4229234675371781, "grad_norm": 0.16225306689739227, "learning_rate": 0.00012472939082578365, "loss": 0.1869, "step": 1166 }, { "epoch": 0.4232861806311208, "grad_norm": 0.16616645455360413, "learning_rate": 0.00012461856228594947, "loss": 0.1778, "step": 1167 }, { "epoch": 0.4236488937250635, "grad_norm": 0.15914376080036163, "learning_rate": 0.00012450770154691642, "loss": 0.1809, "step": 1168 }, { "epoch": 0.4240116068190062, "grad_norm": 0.18165045976638794, "learning_rate": 0.00012439680875368192, "loss": 0.1981, "step": 1169 }, { "epoch": 0.4243743199129489, "grad_norm": 0.17815563082695007, "learning_rate": 0.00012428588405128527, "loss": 0.2462, "step": 1170 }, { "epoch": 0.42473703300689153, "grad_norm": 0.1577123999595642, "learning_rate": 0.0001241749275848075, "loss": 0.1848, "step": 1171 }, { "epoch": 0.42509974610083423, "grad_norm": 0.16714733839035034, "learning_rate": 0.0001240639394993712, "loss": 0.1878, "step": 1172 }, { "epoch": 0.42546245919477693, "grad_norm": 0.18040674924850464, "learning_rate": 0.0001239529199401403, "loss": 0.2087, "step": 1173 }, { "epoch": 0.42582517228871963, "grad_norm": 0.17369875311851501, "learning_rate": 0.0001238418690523199, "loss": 0.2198, "step": 1174 }, { "epoch": 0.42618788538266233, "grad_norm": 0.17522990703582764, "learning_rate": 0.0001237307869811561, "loss": 0.1898, "step": 1175 }, { "epoch": 0.426550598476605, "grad_norm": 0.1890110820531845, "learning_rate": 0.0001236196738719357, "loss": 0.1946, "step": 1176 }, { "epoch": 0.4269133115705477, "grad_norm": 0.19072000682353973, "learning_rate": 0.00012350852986998628, "loss": 0.1782, "step": 1177 }, { "epoch": 0.4272760246644904, "grad_norm": 0.16412675380706787, "learning_rate": 0.00012339735512067557, "loss": 0.1957, "step": 1178 }, { "epoch": 0.4276387377584331, "grad_norm": 0.16497628390789032, "learning_rate": 0.0001232861497694117, "loss": 0.1914, "step": 1179 }, { "epoch": 0.4280014508523758, "grad_norm": 0.1696443408727646, "learning_rate": 0.00012317491396164281, "loss": 0.2205, "step": 1180 }, { "epoch": 0.4283641639463185, "grad_norm": 0.1990218162536621, "learning_rate": 0.00012306364784285683, "loss": 0.221, "step": 1181 }, { "epoch": 0.42872687704026113, "grad_norm": 0.15306927263736725, "learning_rate": 0.00012295235155858128, "loss": 0.1894, "step": 1182 }, { "epoch": 0.42908959013420384, "grad_norm": 0.16716569662094116, "learning_rate": 0.00012284102525438327, "loss": 0.2124, "step": 1183 }, { "epoch": 0.42945230322814654, "grad_norm": 0.16371683776378632, "learning_rate": 0.00012272966907586906, "loss": 0.1952, "step": 1184 }, { "epoch": 0.42981501632208924, "grad_norm": 0.19524066150188446, "learning_rate": 0.00012261828316868404, "loss": 0.1967, "step": 1185 }, { "epoch": 0.43017772941603194, "grad_norm": 0.1753699630498886, "learning_rate": 0.0001225068676785125, "loss": 0.2057, "step": 1186 }, { "epoch": 0.4305404425099746, "grad_norm": 0.15853376686573029, "learning_rate": 0.00012239542275107733, "loss": 0.1852, "step": 1187 }, { "epoch": 0.4309031556039173, "grad_norm": 0.1545594483613968, "learning_rate": 0.00012228394853214, "loss": 0.1827, "step": 1188 }, { "epoch": 0.43126586869786, "grad_norm": 0.1596081703901291, "learning_rate": 0.0001221724451675003, "loss": 0.2032, "step": 1189 }, { "epoch": 0.4316285817918027, "grad_norm": 0.17133690416812897, "learning_rate": 0.00012206091280299608, "loss": 0.201, "step": 1190 }, { "epoch": 0.4319912948857454, "grad_norm": 0.18594324588775635, "learning_rate": 0.00012194935158450318, "loss": 0.1999, "step": 1191 }, { "epoch": 0.4323540079796881, "grad_norm": 0.1757342368364334, "learning_rate": 0.0001218377616579351, "loss": 0.2048, "step": 1192 }, { "epoch": 0.43271672107363074, "grad_norm": 0.15969473123550415, "learning_rate": 0.00012172614316924303, "loss": 0.1896, "step": 1193 }, { "epoch": 0.43307943416757344, "grad_norm": 0.1708168387413025, "learning_rate": 0.00012161449626441535, "loss": 0.1871, "step": 1194 }, { "epoch": 0.43344214726151614, "grad_norm": 0.16224978864192963, "learning_rate": 0.0001215028210894777, "loss": 0.1995, "step": 1195 }, { "epoch": 0.43380486035545884, "grad_norm": 0.17344152927398682, "learning_rate": 0.00012139111779049272, "loss": 0.2102, "step": 1196 }, { "epoch": 0.43416757344940154, "grad_norm": 0.1607237160205841, "learning_rate": 0.00012127938651355973, "loss": 0.198, "step": 1197 }, { "epoch": 0.4345302865433442, "grad_norm": 0.19598302245140076, "learning_rate": 0.00012116762740481473, "loss": 0.2048, "step": 1198 }, { "epoch": 0.4348929996372869, "grad_norm": 0.17380495369434357, "learning_rate": 0.00012105584061043011, "loss": 0.1998, "step": 1199 }, { "epoch": 0.4352557127312296, "grad_norm": 0.16845153272151947, "learning_rate": 0.00012094402627661447, "loss": 0.1944, "step": 1200 }, { "epoch": 0.4356184258251723, "grad_norm": 0.17525669932365417, "learning_rate": 0.00012083218454961237, "loss": 0.2262, "step": 1201 }, { "epoch": 0.435981138919115, "grad_norm": 0.182146355509758, "learning_rate": 0.00012072031557570425, "loss": 0.1899, "step": 1202 }, { "epoch": 0.4363438520130577, "grad_norm": 0.1767880618572235, "learning_rate": 0.00012060841950120623, "loss": 0.1853, "step": 1203 }, { "epoch": 0.43670656510700034, "grad_norm": 0.1868688315153122, "learning_rate": 0.00012049649647246976, "loss": 0.1884, "step": 1204 }, { "epoch": 0.43706927820094305, "grad_norm": 0.16299636662006378, "learning_rate": 0.0001203845466358817, "loss": 0.1903, "step": 1205 }, { "epoch": 0.43743199129488575, "grad_norm": 0.1743989884853363, "learning_rate": 0.00012027257013786382, "loss": 0.1741, "step": 1206 }, { "epoch": 0.43779470438882845, "grad_norm": 0.16983556747436523, "learning_rate": 0.00012016056712487281, "loss": 0.1756, "step": 1207 }, { "epoch": 0.43815741748277115, "grad_norm": 0.16869889199733734, "learning_rate": 0.0001200485377434001, "loss": 0.2091, "step": 1208 }, { "epoch": 0.4385201305767138, "grad_norm": 0.18009315431118011, "learning_rate": 0.00011993648213997155, "loss": 0.1876, "step": 1209 }, { "epoch": 0.4388828436706565, "grad_norm": 0.17261937260627747, "learning_rate": 0.00011982440046114734, "loss": 0.1888, "step": 1210 }, { "epoch": 0.4392455567645992, "grad_norm": 0.1700652837753296, "learning_rate": 0.00011971229285352173, "loss": 0.1929, "step": 1211 }, { "epoch": 0.4396082698585419, "grad_norm": 0.1701359897851944, "learning_rate": 0.0001196001594637229, "loss": 0.196, "step": 1212 }, { "epoch": 0.4399709829524846, "grad_norm": 0.17813630402088165, "learning_rate": 0.00011948800043841275, "loss": 0.2116, "step": 1213 }, { "epoch": 0.4403336960464273, "grad_norm": 0.1756308227777481, "learning_rate": 0.00011937581592428677, "loss": 0.2036, "step": 1214 }, { "epoch": 0.44069640914036995, "grad_norm": 0.17653414607048035, "learning_rate": 0.00011926360606807367, "loss": 0.186, "step": 1215 }, { "epoch": 0.44105912223431265, "grad_norm": 0.16713349521160126, "learning_rate": 0.00011915137101653539, "loss": 0.2161, "step": 1216 }, { "epoch": 0.44142183532825535, "grad_norm": 0.17466074228286743, "learning_rate": 0.00011903911091646684, "loss": 0.2025, "step": 1217 }, { "epoch": 0.44178454842219805, "grad_norm": 0.17018508911132812, "learning_rate": 0.00011892682591469562, "loss": 0.1901, "step": 1218 }, { "epoch": 0.44214726151614075, "grad_norm": 0.18613681197166443, "learning_rate": 0.00011881451615808192, "loss": 0.1994, "step": 1219 }, { "epoch": 0.4425099746100834, "grad_norm": 0.17624922096729279, "learning_rate": 0.00011870218179351838, "loss": 0.1909, "step": 1220 }, { "epoch": 0.4428726877040261, "grad_norm": 0.16530555486679077, "learning_rate": 0.00011858982296792971, "loss": 0.1925, "step": 1221 }, { "epoch": 0.4432354007979688, "grad_norm": 0.17213410139083862, "learning_rate": 0.00011847743982827269, "loss": 0.188, "step": 1222 }, { "epoch": 0.4435981138919115, "grad_norm": 0.17941850423812866, "learning_rate": 0.00011836503252153588, "loss": 0.1836, "step": 1223 }, { "epoch": 0.4439608269858542, "grad_norm": 0.211356058716774, "learning_rate": 0.00011825260119473946, "loss": 0.1958, "step": 1224 }, { "epoch": 0.4443235400797969, "grad_norm": 0.1753711849451065, "learning_rate": 0.00011814014599493502, "loss": 0.1784, "step": 1225 }, { "epoch": 0.44468625317373955, "grad_norm": 0.17775994539260864, "learning_rate": 0.00011802766706920533, "loss": 0.1984, "step": 1226 }, { "epoch": 0.44504896626768226, "grad_norm": 0.15988726913928986, "learning_rate": 0.00011791516456466429, "loss": 0.196, "step": 1227 }, { "epoch": 0.44541167936162496, "grad_norm": 0.17853982746601105, "learning_rate": 0.00011780263862845655, "loss": 0.193, "step": 1228 }, { "epoch": 0.44577439245556766, "grad_norm": 0.1804809272289276, "learning_rate": 0.00011769008940775744, "loss": 0.1995, "step": 1229 }, { "epoch": 0.44613710554951036, "grad_norm": 0.18296337127685547, "learning_rate": 0.00011757751704977275, "loss": 0.1907, "step": 1230 }, { "epoch": 0.446499818643453, "grad_norm": 0.15713930130004883, "learning_rate": 0.00011746492170173853, "loss": 0.1945, "step": 1231 }, { "epoch": 0.4468625317373957, "grad_norm": 0.18204668164253235, "learning_rate": 0.00011735230351092087, "loss": 0.2187, "step": 1232 }, { "epoch": 0.4472252448313384, "grad_norm": 0.16009126603603363, "learning_rate": 0.00011723966262461579, "loss": 0.1786, "step": 1233 }, { "epoch": 0.4475879579252811, "grad_norm": 0.20128843188285828, "learning_rate": 0.00011712699919014896, "loss": 0.1941, "step": 1234 }, { "epoch": 0.4479506710192238, "grad_norm": 0.17296966910362244, "learning_rate": 0.0001170143133548755, "loss": 0.1843, "step": 1235 }, { "epoch": 0.44831338411316646, "grad_norm": 0.18363478779792786, "learning_rate": 0.00011690160526617995, "loss": 0.197, "step": 1236 }, { "epoch": 0.44867609720710916, "grad_norm": 0.17751774191856384, "learning_rate": 0.00011678887507147582, "loss": 0.1756, "step": 1237 }, { "epoch": 0.44903881030105186, "grad_norm": 0.1821131557226181, "learning_rate": 0.00011667612291820562, "loss": 0.1911, "step": 1238 }, { "epoch": 0.44940152339499456, "grad_norm": 0.16961705684661865, "learning_rate": 0.00011656334895384053, "loss": 0.1782, "step": 1239 }, { "epoch": 0.44976423648893726, "grad_norm": 0.1650359183549881, "learning_rate": 0.00011645055332588032, "loss": 0.1849, "step": 1240 }, { "epoch": 0.45012694958287996, "grad_norm": 0.1794784963130951, "learning_rate": 0.00011633773618185302, "loss": 0.2059, "step": 1241 }, { "epoch": 0.4504896626768226, "grad_norm": 0.17137840390205383, "learning_rate": 0.00011622489766931488, "loss": 0.206, "step": 1242 }, { "epoch": 0.4508523757707653, "grad_norm": 0.1728799045085907, "learning_rate": 0.00011611203793584999, "loss": 0.1812, "step": 1243 }, { "epoch": 0.451215088864708, "grad_norm": 0.17596741020679474, "learning_rate": 0.0001159991571290703, "loss": 0.1935, "step": 1244 }, { "epoch": 0.4515778019586507, "grad_norm": 0.18633347749710083, "learning_rate": 0.00011588625539661528, "loss": 0.1908, "step": 1245 }, { "epoch": 0.4519405150525934, "grad_norm": 0.15337157249450684, "learning_rate": 0.00011577333288615175, "loss": 0.1779, "step": 1246 }, { "epoch": 0.45230322814653606, "grad_norm": 0.18902058899402618, "learning_rate": 0.00011566038974537374, "loss": 0.2063, "step": 1247 }, { "epoch": 0.45266594124047876, "grad_norm": 0.17245811223983765, "learning_rate": 0.00011554742612200229, "loss": 0.1827, "step": 1248 }, { "epoch": 0.45302865433442147, "grad_norm": 0.17236045002937317, "learning_rate": 0.00011543444216378517, "loss": 0.1944, "step": 1249 }, { "epoch": 0.45339136742836417, "grad_norm": 0.1754477322101593, "learning_rate": 0.00011532143801849668, "loss": 0.1933, "step": 1250 }, { "epoch": 0.45375408052230687, "grad_norm": 0.16361160576343536, "learning_rate": 0.00011520841383393774, "loss": 0.193, "step": 1251 }, { "epoch": 0.45411679361624957, "grad_norm": 0.17561082541942596, "learning_rate": 0.00011509536975793527, "loss": 0.2062, "step": 1252 }, { "epoch": 0.4544795067101922, "grad_norm": 0.1636163592338562, "learning_rate": 0.00011498230593834229, "loss": 0.1839, "step": 1253 }, { "epoch": 0.4548422198041349, "grad_norm": 0.16940078139305115, "learning_rate": 0.00011486922252303769, "loss": 0.18, "step": 1254 }, { "epoch": 0.4552049328980776, "grad_norm": 0.1866592913866043, "learning_rate": 0.0001147561196599259, "loss": 0.1789, "step": 1255 }, { "epoch": 0.4555676459920203, "grad_norm": 0.1689455509185791, "learning_rate": 0.00011464299749693679, "loss": 0.1775, "step": 1256 }, { "epoch": 0.455930359085963, "grad_norm": 0.17223703861236572, "learning_rate": 0.00011452985618202559, "loss": 0.1813, "step": 1257 }, { "epoch": 0.45629307217990567, "grad_norm": 0.16031506657600403, "learning_rate": 0.00011441669586317243, "loss": 0.1867, "step": 1258 }, { "epoch": 0.45665578527384837, "grad_norm": 0.17869757115840912, "learning_rate": 0.00011430351668838237, "loss": 0.1678, "step": 1259 }, { "epoch": 0.45701849836779107, "grad_norm": 0.18296487629413605, "learning_rate": 0.00011419031880568518, "loss": 0.1848, "step": 1260 }, { "epoch": 0.45738121146173377, "grad_norm": 0.19954228401184082, "learning_rate": 0.00011407710236313498, "loss": 0.1961, "step": 1261 }, { "epoch": 0.4577439245556765, "grad_norm": 0.16006030142307281, "learning_rate": 0.00011396386750881025, "loss": 0.1738, "step": 1262 }, { "epoch": 0.4581066376496192, "grad_norm": 0.17467838525772095, "learning_rate": 0.00011385061439081355, "loss": 0.2, "step": 1263 }, { "epoch": 0.4584693507435618, "grad_norm": 0.1634225696325302, "learning_rate": 0.00011373734315727125, "loss": 0.1593, "step": 1264 }, { "epoch": 0.4588320638375045, "grad_norm": 0.1675540953874588, "learning_rate": 0.00011362405395633355, "loss": 0.1761, "step": 1265 }, { "epoch": 0.4591947769314472, "grad_norm": 0.2374797910451889, "learning_rate": 0.00011351074693617398, "loss": 0.2401, "step": 1266 }, { "epoch": 0.4595574900253899, "grad_norm": 0.16424275934696198, "learning_rate": 0.00011339742224498957, "loss": 0.1822, "step": 1267 }, { "epoch": 0.4599202031193326, "grad_norm": 0.1777309626340866, "learning_rate": 0.00011328408003100031, "loss": 0.199, "step": 1268 }, { "epoch": 0.4602829162132753, "grad_norm": 0.17055995762348175, "learning_rate": 0.0001131707204424491, "loss": 0.1743, "step": 1269 }, { "epoch": 0.460645629307218, "grad_norm": 0.17005477845668793, "learning_rate": 0.0001130573436276017, "loss": 0.1767, "step": 1270 }, { "epoch": 0.4610083424011607, "grad_norm": 0.18844565749168396, "learning_rate": 0.00011294394973474631, "loss": 0.1836, "step": 1271 }, { "epoch": 0.4613710554951034, "grad_norm": 0.17676351964473724, "learning_rate": 0.00011283053891219344, "loss": 0.1806, "step": 1272 }, { "epoch": 0.4617337685890461, "grad_norm": 0.1949535459280014, "learning_rate": 0.00011271711130827584, "loss": 0.2162, "step": 1273 }, { "epoch": 0.4620964816829888, "grad_norm": 0.16555753350257874, "learning_rate": 0.0001126036670713481, "loss": 0.2051, "step": 1274 }, { "epoch": 0.4624591947769314, "grad_norm": 0.16618479788303375, "learning_rate": 0.00011249020634978664, "loss": 0.1686, "step": 1275 }, { "epoch": 0.4628219078708741, "grad_norm": 0.16579975187778473, "learning_rate": 0.00011237672929198944, "loss": 0.1887, "step": 1276 }, { "epoch": 0.46318462096481683, "grad_norm": 0.1672372817993164, "learning_rate": 0.00011226323604637577, "loss": 0.1801, "step": 1277 }, { "epoch": 0.46354733405875953, "grad_norm": 0.18061618506908417, "learning_rate": 0.00011214972676138612, "loss": 0.2006, "step": 1278 }, { "epoch": 0.46391004715270223, "grad_norm": 0.19238020479679108, "learning_rate": 0.00011203620158548205, "loss": 0.1693, "step": 1279 }, { "epoch": 0.4642727602466449, "grad_norm": 0.18483294546604156, "learning_rate": 0.00011192266066714576, "loss": 0.181, "step": 1280 }, { "epoch": 0.4646354733405876, "grad_norm": 0.1617163121700287, "learning_rate": 0.00011180910415488006, "loss": 0.1812, "step": 1281 }, { "epoch": 0.4649981864345303, "grad_norm": 0.18640659749507904, "learning_rate": 0.00011169553219720828, "loss": 0.1877, "step": 1282 }, { "epoch": 0.465360899528473, "grad_norm": 0.1695108264684677, "learning_rate": 0.00011158194494267375, "loss": 0.1848, "step": 1283 }, { "epoch": 0.4657236126224157, "grad_norm": 0.1813160479068756, "learning_rate": 0.00011146834253984006, "loss": 0.1897, "step": 1284 }, { "epoch": 0.4660863257163584, "grad_norm": 0.19932959973812103, "learning_rate": 0.00011135472513729037, "loss": 0.1924, "step": 1285 }, { "epoch": 0.46644903881030103, "grad_norm": 0.18082661926746368, "learning_rate": 0.0001112410928836276, "loss": 0.1856, "step": 1286 }, { "epoch": 0.46681175190424373, "grad_norm": 0.18553735315799713, "learning_rate": 0.00011112744592747406, "loss": 0.215, "step": 1287 }, { "epoch": 0.46717446499818643, "grad_norm": 0.1664389669895172, "learning_rate": 0.0001110137844174713, "loss": 0.181, "step": 1288 }, { "epoch": 0.46753717809212914, "grad_norm": 0.16226251423358917, "learning_rate": 0.00011090010850227987, "loss": 0.1818, "step": 1289 }, { "epoch": 0.46789989118607184, "grad_norm": 0.17768961191177368, "learning_rate": 0.00011078641833057917, "loss": 0.2087, "step": 1290 }, { "epoch": 0.4682626042800145, "grad_norm": 0.16539828479290009, "learning_rate": 0.0001106727140510673, "loss": 0.1882, "step": 1291 }, { "epoch": 0.4686253173739572, "grad_norm": 0.17121171951293945, "learning_rate": 0.00011055899581246074, "loss": 0.1664, "step": 1292 }, { "epoch": 0.4689880304678999, "grad_norm": 0.19726701080799103, "learning_rate": 0.00011044526376349427, "loss": 0.1924, "step": 1293 }, { "epoch": 0.4693507435618426, "grad_norm": 0.16600336134433746, "learning_rate": 0.0001103315180529207, "loss": 0.1775, "step": 1294 }, { "epoch": 0.4697134566557853, "grad_norm": 0.1898517608642578, "learning_rate": 0.00011021775882951078, "loss": 0.248, "step": 1295 }, { "epoch": 0.470076169749728, "grad_norm": 0.16445770859718323, "learning_rate": 0.00011010398624205285, "loss": 0.1828, "step": 1296 }, { "epoch": 0.47043888284367064, "grad_norm": 0.17627963423728943, "learning_rate": 0.00010999020043935275, "loss": 0.1736, "step": 1297 }, { "epoch": 0.47080159593761334, "grad_norm": 0.19058868288993835, "learning_rate": 0.00010987640157023367, "loss": 0.2618, "step": 1298 }, { "epoch": 0.47116430903155604, "grad_norm": 0.1651872992515564, "learning_rate": 0.00010976258978353577, "loss": 0.1975, "step": 1299 }, { "epoch": 0.47152702212549874, "grad_norm": 0.20072801411151886, "learning_rate": 0.00010964876522811623, "loss": 0.1723, "step": 1300 }, { "epoch": 0.47188973521944144, "grad_norm": 0.19362793862819672, "learning_rate": 0.00010953492805284882, "loss": 0.1682, "step": 1301 }, { "epoch": 0.4722524483133841, "grad_norm": 0.16751596331596375, "learning_rate": 0.0001094210784066239, "loss": 0.1792, "step": 1302 }, { "epoch": 0.4726151614073268, "grad_norm": 0.16240975260734558, "learning_rate": 0.00010930721643834811, "loss": 0.1805, "step": 1303 }, { "epoch": 0.4729778745012695, "grad_norm": 0.1741744726896286, "learning_rate": 0.00010919334229694424, "loss": 0.1823, "step": 1304 }, { "epoch": 0.4733405875952122, "grad_norm": 0.17905928194522858, "learning_rate": 0.00010907945613135093, "loss": 0.1873, "step": 1305 }, { "epoch": 0.4737033006891549, "grad_norm": 0.16759923100471497, "learning_rate": 0.00010896555809052255, "loss": 0.1805, "step": 1306 }, { "epoch": 0.4740660137830976, "grad_norm": 0.1604134738445282, "learning_rate": 0.00010885164832342911, "loss": 0.1817, "step": 1307 }, { "epoch": 0.47442872687704024, "grad_norm": 0.22676977515220642, "learning_rate": 0.00010873772697905584, "loss": 0.1901, "step": 1308 }, { "epoch": 0.47479143997098294, "grad_norm": 0.21346516907215118, "learning_rate": 0.00010862379420640316, "loss": 0.2146, "step": 1309 }, { "epoch": 0.47515415306492564, "grad_norm": 0.18681135773658752, "learning_rate": 0.00010850985015448644, "loss": 0.1992, "step": 1310 }, { "epoch": 0.47551686615886835, "grad_norm": 0.18223214149475098, "learning_rate": 0.00010839589497233579, "loss": 0.1937, "step": 1311 }, { "epoch": 0.47587957925281105, "grad_norm": 0.16445523500442505, "learning_rate": 0.00010828192880899588, "loss": 0.201, "step": 1312 }, { "epoch": 0.4762422923467537, "grad_norm": 0.20072023570537567, "learning_rate": 0.00010816795181352576, "loss": 0.221, "step": 1313 }, { "epoch": 0.4766050054406964, "grad_norm": 0.1709073781967163, "learning_rate": 0.00010805396413499865, "loss": 0.1726, "step": 1314 }, { "epoch": 0.4769677185346391, "grad_norm": 0.20039378106594086, "learning_rate": 0.00010793996592250166, "loss": 0.2096, "step": 1315 }, { "epoch": 0.4773304316285818, "grad_norm": 0.17664781212806702, "learning_rate": 0.00010782595732513581, "loss": 0.1763, "step": 1316 }, { "epoch": 0.4776931447225245, "grad_norm": 0.19013923406600952, "learning_rate": 0.00010771193849201561, "loss": 0.1977, "step": 1317 }, { "epoch": 0.4780558578164672, "grad_norm": 0.18075336515903473, "learning_rate": 0.00010759790957226896, "loss": 0.2101, "step": 1318 }, { "epoch": 0.47841857091040985, "grad_norm": 0.16578859090805054, "learning_rate": 0.00010748387071503703, "loss": 0.1724, "step": 1319 }, { "epoch": 0.47878128400435255, "grad_norm": 0.1706560105085373, "learning_rate": 0.00010736982206947388, "loss": 0.1826, "step": 1320 }, { "epoch": 0.47914399709829525, "grad_norm": 0.1749362051486969, "learning_rate": 0.00010725576378474647, "loss": 0.1957, "step": 1321 }, { "epoch": 0.47950671019223795, "grad_norm": 0.16315925121307373, "learning_rate": 0.0001071416960100343, "loss": 0.1675, "step": 1322 }, { "epoch": 0.47986942328618065, "grad_norm": 0.18400579690933228, "learning_rate": 0.0001070276188945293, "loss": 0.1896, "step": 1323 }, { "epoch": 0.4802321363801233, "grad_norm": 0.15948963165283203, "learning_rate": 0.00010691353258743566, "loss": 0.1816, "step": 1324 }, { "epoch": 0.480594849474066, "grad_norm": 0.17198865115642548, "learning_rate": 0.00010679943723796948, "loss": 0.176, "step": 1325 }, { "epoch": 0.4809575625680087, "grad_norm": 0.15731912851333618, "learning_rate": 0.00010668533299535885, "loss": 0.1717, "step": 1326 }, { "epoch": 0.4813202756619514, "grad_norm": 0.19525468349456787, "learning_rate": 0.00010657122000884334, "loss": 0.2036, "step": 1327 }, { "epoch": 0.4816829887558941, "grad_norm": 0.18892909586429596, "learning_rate": 0.00010645709842767404, "loss": 0.1886, "step": 1328 }, { "epoch": 0.4820457018498368, "grad_norm": 0.1732751876115799, "learning_rate": 0.00010634296840111328, "loss": 0.1654, "step": 1329 }, { "epoch": 0.48240841494377945, "grad_norm": 0.17465728521347046, "learning_rate": 0.00010622883007843439, "loss": 0.2119, "step": 1330 }, { "epoch": 0.48277112803772215, "grad_norm": 0.17398551106452942, "learning_rate": 0.00010611468360892157, "loss": 0.1833, "step": 1331 }, { "epoch": 0.48313384113166485, "grad_norm": 0.16920240223407745, "learning_rate": 0.00010600052914186971, "loss": 0.182, "step": 1332 }, { "epoch": 0.48349655422560756, "grad_norm": 0.1846507489681244, "learning_rate": 0.0001058863668265841, "loss": 0.2106, "step": 1333 }, { "epoch": 0.48385926731955026, "grad_norm": 0.2055717557668686, "learning_rate": 0.00010577219681238035, "loss": 0.1992, "step": 1334 }, { "epoch": 0.4842219804134929, "grad_norm": 0.18416710197925568, "learning_rate": 0.00010565801924858411, "loss": 0.1997, "step": 1335 }, { "epoch": 0.4845846935074356, "grad_norm": 0.1609608680009842, "learning_rate": 0.00010554383428453093, "loss": 0.2147, "step": 1336 }, { "epoch": 0.4849474066013783, "grad_norm": 0.1770334094762802, "learning_rate": 0.00010542964206956601, "loss": 0.1796, "step": 1337 }, { "epoch": 0.485310119695321, "grad_norm": 0.24041593074798584, "learning_rate": 0.00010531544275304403, "loss": 0.1828, "step": 1338 }, { "epoch": 0.4856728327892637, "grad_norm": 0.18589763343334198, "learning_rate": 0.00010520123648432896, "loss": 0.1894, "step": 1339 }, { "epoch": 0.4860355458832064, "grad_norm": 0.19816087186336517, "learning_rate": 0.00010508702341279391, "loss": 0.1849, "step": 1340 }, { "epoch": 0.48639825897714906, "grad_norm": 0.20071928203105927, "learning_rate": 0.00010497280368782083, "loss": 0.1871, "step": 1341 }, { "epoch": 0.48676097207109176, "grad_norm": 0.19222816824913025, "learning_rate": 0.00010485857745880043, "loss": 0.2114, "step": 1342 }, { "epoch": 0.48712368516503446, "grad_norm": 0.17220762372016907, "learning_rate": 0.00010474434487513183, "loss": 0.186, "step": 1343 }, { "epoch": 0.48748639825897716, "grad_norm": 0.1726873815059662, "learning_rate": 0.00010463010608622259, "loss": 0.1945, "step": 1344 }, { "epoch": 0.48784911135291986, "grad_norm": 0.1876380741596222, "learning_rate": 0.0001045158612414883, "loss": 0.214, "step": 1345 }, { "epoch": 0.4882118244468625, "grad_norm": 0.16988040506839752, "learning_rate": 0.00010440161049035242, "loss": 0.1901, "step": 1346 }, { "epoch": 0.4885745375408052, "grad_norm": 0.15666206181049347, "learning_rate": 0.00010428735398224629, "loss": 0.1788, "step": 1347 }, { "epoch": 0.4889372506347479, "grad_norm": 0.16927142441272736, "learning_rate": 0.00010417309186660869, "loss": 0.1847, "step": 1348 }, { "epoch": 0.4892999637286906, "grad_norm": 0.17525021731853485, "learning_rate": 0.00010405882429288567, "loss": 0.1826, "step": 1349 }, { "epoch": 0.4896626768226333, "grad_norm": 0.19699347019195557, "learning_rate": 0.00010394455141053056, "loss": 0.2124, "step": 1350 }, { "epoch": 0.490025389916576, "grad_norm": 0.17007745802402496, "learning_rate": 0.00010383027336900355, "loss": 0.1936, "step": 1351 }, { "epoch": 0.49038810301051866, "grad_norm": 0.19556905329227448, "learning_rate": 0.00010371599031777155, "loss": 0.197, "step": 1352 }, { "epoch": 0.49075081610446136, "grad_norm": 0.16477836668491364, "learning_rate": 0.00010360170240630808, "loss": 0.1625, "step": 1353 }, { "epoch": 0.49111352919840406, "grad_norm": 0.1732366383075714, "learning_rate": 0.00010348740978409302, "loss": 0.1971, "step": 1354 }, { "epoch": 0.49147624229234677, "grad_norm": 0.16834014654159546, "learning_rate": 0.00010337311260061233, "loss": 0.1766, "step": 1355 }, { "epoch": 0.49183895538628947, "grad_norm": 0.16868503391742706, "learning_rate": 0.00010325881100535806, "loss": 0.1708, "step": 1356 }, { "epoch": 0.4922016684802321, "grad_norm": 0.1709543913602829, "learning_rate": 0.00010314450514782792, "loss": 0.1639, "step": 1357 }, { "epoch": 0.4925643815741748, "grad_norm": 0.19506582617759705, "learning_rate": 0.0001030301951775253, "loss": 0.2164, "step": 1358 }, { "epoch": 0.4929270946681175, "grad_norm": 0.1822308748960495, "learning_rate": 0.00010291588124395881, "loss": 0.1743, "step": 1359 }, { "epoch": 0.4932898077620602, "grad_norm": 0.16552822291851044, "learning_rate": 0.00010280156349664245, "loss": 0.1826, "step": 1360 }, { "epoch": 0.4936525208560029, "grad_norm": 0.1888803243637085, "learning_rate": 0.00010268724208509504, "loss": 0.2016, "step": 1361 }, { "epoch": 0.4940152339499456, "grad_norm": 0.15892508625984192, "learning_rate": 0.00010257291715884023, "loss": 0.1666, "step": 1362 }, { "epoch": 0.49437794704388827, "grad_norm": 0.18861308693885803, "learning_rate": 0.00010245858886740636, "loss": 0.1969, "step": 1363 }, { "epoch": 0.49474066013783097, "grad_norm": 0.1808152198791504, "learning_rate": 0.00010234425736032607, "loss": 0.181, "step": 1364 }, { "epoch": 0.49510337323177367, "grad_norm": 0.18545283377170563, "learning_rate": 0.00010222992278713619, "loss": 0.1757, "step": 1365 }, { "epoch": 0.49546608632571637, "grad_norm": 0.16214706003665924, "learning_rate": 0.00010211558529737768, "loss": 0.1809, "step": 1366 }, { "epoch": 0.49582879941965907, "grad_norm": 0.16413475573062897, "learning_rate": 0.00010200124504059522, "loss": 0.1765, "step": 1367 }, { "epoch": 0.4961915125136017, "grad_norm": 0.17465685307979584, "learning_rate": 0.0001018869021663371, "loss": 0.1786, "step": 1368 }, { "epoch": 0.4965542256075444, "grad_norm": 0.16205236315727234, "learning_rate": 0.00010177255682415512, "loss": 0.1778, "step": 1369 }, { "epoch": 0.4969169387014871, "grad_norm": 0.17154665291309357, "learning_rate": 0.0001016582091636042, "loss": 0.1848, "step": 1370 }, { "epoch": 0.4972796517954298, "grad_norm": 0.19808538258075714, "learning_rate": 0.00010154385933424236, "loss": 0.1872, "step": 1371 }, { "epoch": 0.4976423648893725, "grad_norm": 0.17381629347801208, "learning_rate": 0.00010142950748563047, "loss": 0.1706, "step": 1372 }, { "epoch": 0.4980050779833152, "grad_norm": 0.18413105607032776, "learning_rate": 0.00010131515376733199, "loss": 0.2041, "step": 1373 }, { "epoch": 0.49836779107725787, "grad_norm": 0.16707438230514526, "learning_rate": 0.0001012007983289128, "loss": 0.1824, "step": 1374 }, { "epoch": 0.4987305041712006, "grad_norm": 0.18369126319885254, "learning_rate": 0.00010108644131994118, "loss": 0.1838, "step": 1375 }, { "epoch": 0.4990932172651433, "grad_norm": 0.17866884171962738, "learning_rate": 0.00010097208288998727, "loss": 0.18, "step": 1376 }, { "epoch": 0.499455930359086, "grad_norm": 0.17458714544773102, "learning_rate": 0.0001008577231886232, "loss": 0.1863, "step": 1377 }, { "epoch": 0.4998186434530287, "grad_norm": 0.16435278952121735, "learning_rate": 0.00010074336236542275, "loss": 0.1691, "step": 1378 }, { "epoch": 0.5001813565469714, "grad_norm": 0.18374355137348175, "learning_rate": 0.00010062900056996111, "loss": 0.2016, "step": 1379 }, { "epoch": 0.500544069640914, "grad_norm": 0.1715199500322342, "learning_rate": 0.00010051463795181475, "loss": 0.1775, "step": 1380 }, { "epoch": 0.5009067827348568, "grad_norm": 0.17471933364868164, "learning_rate": 0.00010040027466056128, "loss": 0.1784, "step": 1381 }, { "epoch": 0.5012694958287994, "grad_norm": 0.182729572057724, "learning_rate": 0.00010028591084577914, "loss": 0.1848, "step": 1382 }, { "epoch": 0.5016322089227421, "grad_norm": 0.1831514835357666, "learning_rate": 0.00010017154665704742, "loss": 0.1782, "step": 1383 }, { "epoch": 0.5019949220166848, "grad_norm": 0.18920493125915527, "learning_rate": 0.00010005718224394583, "loss": 0.1983, "step": 1384 }, { "epoch": 0.5023576351106275, "grad_norm": 0.18116223812103271, "learning_rate": 9.994281775605417e-05, "loss": 0.1703, "step": 1385 }, { "epoch": 0.5027203482045702, "grad_norm": 0.16635280847549438, "learning_rate": 9.982845334295257e-05, "loss": 0.1826, "step": 1386 }, { "epoch": 0.5030830612985129, "grad_norm": 0.1902262419462204, "learning_rate": 9.971408915422089e-05, "loss": 0.1821, "step": 1387 }, { "epoch": 0.5034457743924555, "grad_norm": 0.1716509759426117, "learning_rate": 9.959972533943873e-05, "loss": 0.1774, "step": 1388 }, { "epoch": 0.5038084874863983, "grad_norm": 0.1831229031085968, "learning_rate": 9.948536204818527e-05, "loss": 0.1859, "step": 1389 }, { "epoch": 0.5041712005803409, "grad_norm": 0.17858019471168518, "learning_rate": 9.937099943003894e-05, "loss": 0.1763, "step": 1390 }, { "epoch": 0.5045339136742837, "grad_norm": 0.20118439197540283, "learning_rate": 9.925663763457726e-05, "loss": 0.2055, "step": 1391 }, { "epoch": 0.5048966267682263, "grad_norm": 0.1790417581796646, "learning_rate": 9.91422768113768e-05, "loss": 0.1886, "step": 1392 }, { "epoch": 0.505259339862169, "grad_norm": 0.18328474462032318, "learning_rate": 9.902791711001274e-05, "loss": 0.2024, "step": 1393 }, { "epoch": 0.5056220529561117, "grad_norm": 0.17188413441181183, "learning_rate": 9.891355868005885e-05, "loss": 0.1822, "step": 1394 }, { "epoch": 0.5059847660500544, "grad_norm": 0.20239926874637604, "learning_rate": 9.879920167108722e-05, "loss": 0.1811, "step": 1395 }, { "epoch": 0.5063474791439971, "grad_norm": 0.1758367419242859, "learning_rate": 9.868484623266807e-05, "loss": 0.2074, "step": 1396 }, { "epoch": 0.5067101922379398, "grad_norm": 0.16482442617416382, "learning_rate": 9.857049251436957e-05, "loss": 0.1748, "step": 1397 }, { "epoch": 0.5070729053318824, "grad_norm": 0.19277919828891754, "learning_rate": 9.845614066575764e-05, "loss": 0.2831, "step": 1398 }, { "epoch": 0.5074356184258252, "grad_norm": 0.19243241846561432, "learning_rate": 9.834179083639581e-05, "loss": 0.1817, "step": 1399 }, { "epoch": 0.5077983315197678, "grad_norm": 0.19496306777000427, "learning_rate": 9.822744317584492e-05, "loss": 0.1614, "step": 1400 }, { "epoch": 0.5081610446137106, "grad_norm": 0.1891697645187378, "learning_rate": 9.811309783366291e-05, "loss": 0.1952, "step": 1401 }, { "epoch": 0.5085237577076532, "grad_norm": 0.19444064795970917, "learning_rate": 9.799875495940481e-05, "loss": 0.1942, "step": 1402 }, { "epoch": 0.508886470801596, "grad_norm": 0.18112586438655853, "learning_rate": 9.788441470262235e-05, "loss": 0.1963, "step": 1403 }, { "epoch": 0.5092491838955386, "grad_norm": 0.17287184298038483, "learning_rate": 9.777007721286381e-05, "loss": 0.1733, "step": 1404 }, { "epoch": 0.5096118969894813, "grad_norm": 0.18775591254234314, "learning_rate": 9.765574263967396e-05, "loss": 0.1965, "step": 1405 }, { "epoch": 0.509974610083424, "grad_norm": 0.17914709448814392, "learning_rate": 9.754141113259366e-05, "loss": 0.1748, "step": 1406 }, { "epoch": 0.5103373231773667, "grad_norm": 0.19423453509807587, "learning_rate": 9.74270828411598e-05, "loss": 0.1833, "step": 1407 }, { "epoch": 0.5107000362713094, "grad_norm": 0.18104608356952667, "learning_rate": 9.731275791490501e-05, "loss": 0.177, "step": 1408 }, { "epoch": 0.5110627493652521, "grad_norm": 0.17595738172531128, "learning_rate": 9.719843650335758e-05, "loss": 0.1839, "step": 1409 }, { "epoch": 0.5114254624591947, "grad_norm": 0.18560685217380524, "learning_rate": 9.708411875604118e-05, "loss": 0.1995, "step": 1410 }, { "epoch": 0.5117881755531375, "grad_norm": 0.18210434913635254, "learning_rate": 9.696980482247474e-05, "loss": 0.1963, "step": 1411 }, { "epoch": 0.5121508886470801, "grad_norm": 0.16906267404556274, "learning_rate": 9.685549485217209e-05, "loss": 0.1636, "step": 1412 }, { "epoch": 0.5125136017410229, "grad_norm": 0.19701135158538818, "learning_rate": 9.674118899464195e-05, "loss": 0.2184, "step": 1413 }, { "epoch": 0.5128763148349655, "grad_norm": 0.18875081837177277, "learning_rate": 9.662688739938769e-05, "loss": 0.2142, "step": 1414 }, { "epoch": 0.5132390279289082, "grad_norm": 0.20290465652942657, "learning_rate": 9.651259021590703e-05, "loss": 0.2041, "step": 1415 }, { "epoch": 0.5136017410228509, "grad_norm": 0.1915699690580368, "learning_rate": 9.639829759369191e-05, "loss": 0.1741, "step": 1416 }, { "epoch": 0.5139644541167936, "grad_norm": 0.1645934134721756, "learning_rate": 9.628400968222846e-05, "loss": 0.179, "step": 1417 }, { "epoch": 0.5143271672107363, "grad_norm": 0.18472225964069366, "learning_rate": 9.616972663099647e-05, "loss": 0.1876, "step": 1418 }, { "epoch": 0.514689880304679, "grad_norm": 0.17435920238494873, "learning_rate": 9.605544858946945e-05, "loss": 0.175, "step": 1419 }, { "epoch": 0.5150525933986216, "grad_norm": 0.1865229606628418, "learning_rate": 9.594117570711434e-05, "loss": 0.2141, "step": 1420 }, { "epoch": 0.5154153064925644, "grad_norm": 0.18339309096336365, "learning_rate": 9.582690813339136e-05, "loss": 0.1794, "step": 1421 }, { "epoch": 0.515778019586507, "grad_norm": 0.1994606852531433, "learning_rate": 9.571264601775369e-05, "loss": 0.1835, "step": 1422 }, { "epoch": 0.5161407326804498, "grad_norm": 0.16973696649074554, "learning_rate": 9.559838950964757e-05, "loss": 0.1587, "step": 1423 }, { "epoch": 0.5165034457743924, "grad_norm": 0.17294169962406158, "learning_rate": 9.548413875851174e-05, "loss": 0.1748, "step": 1424 }, { "epoch": 0.5168661588683352, "grad_norm": 0.19328264892101288, "learning_rate": 9.536989391377743e-05, "loss": 0.2012, "step": 1425 }, { "epoch": 0.5172288719622778, "grad_norm": 0.18262383341789246, "learning_rate": 9.52556551248682e-05, "loss": 0.1806, "step": 1426 }, { "epoch": 0.5175915850562205, "grad_norm": 0.16941824555397034, "learning_rate": 9.514142254119962e-05, "loss": 0.1739, "step": 1427 }, { "epoch": 0.5179542981501633, "grad_norm": 0.1808822602033615, "learning_rate": 9.502719631217917e-05, "loss": 0.1685, "step": 1428 }, { "epoch": 0.5183170112441059, "grad_norm": 0.213886559009552, "learning_rate": 9.49129765872061e-05, "loss": 0.1851, "step": 1429 }, { "epoch": 0.5186797243380487, "grad_norm": 0.1952863335609436, "learning_rate": 9.479876351567107e-05, "loss": 0.1691, "step": 1430 }, { "epoch": 0.5190424374319913, "grad_norm": 0.1745711863040924, "learning_rate": 9.4684557246956e-05, "loss": 0.1883, "step": 1431 }, { "epoch": 0.519405150525934, "grad_norm": 0.19590620696544647, "learning_rate": 9.457035793043401e-05, "loss": 0.1822, "step": 1432 }, { "epoch": 0.5197678636198767, "grad_norm": 0.17998209595680237, "learning_rate": 9.445616571546909e-05, "loss": 0.172, "step": 1433 }, { "epoch": 0.5201305767138193, "grad_norm": 0.1765129566192627, "learning_rate": 9.434198075141591e-05, "loss": 0.1748, "step": 1434 }, { "epoch": 0.5204932898077621, "grad_norm": 0.19922930002212524, "learning_rate": 9.422780318761965e-05, "loss": 0.1941, "step": 1435 }, { "epoch": 0.5208560029017048, "grad_norm": 0.1994534283876419, "learning_rate": 9.411363317341592e-05, "loss": 0.1838, "step": 1436 }, { "epoch": 0.5212187159956474, "grad_norm": 0.19850608706474304, "learning_rate": 9.399947085813032e-05, "loss": 0.1768, "step": 1437 }, { "epoch": 0.5215814290895902, "grad_norm": 0.16051959991455078, "learning_rate": 9.388531639107846e-05, "loss": 0.1781, "step": 1438 }, { "epoch": 0.5219441421835328, "grad_norm": 0.18641552329063416, "learning_rate": 9.377116992156566e-05, "loss": 0.1884, "step": 1439 }, { "epoch": 0.5223068552774756, "grad_norm": 0.16958610713481903, "learning_rate": 9.365703159888677e-05, "loss": 0.1768, "step": 1440 }, { "epoch": 0.5226695683714182, "grad_norm": 0.16557306051254272, "learning_rate": 9.354290157232596e-05, "loss": 0.1648, "step": 1441 }, { "epoch": 0.5230322814653608, "grad_norm": 0.18799157440662384, "learning_rate": 9.342877999115667e-05, "loss": 0.1711, "step": 1442 }, { "epoch": 0.5233949945593036, "grad_norm": 0.19848479330539703, "learning_rate": 9.331466700464117e-05, "loss": 0.1807, "step": 1443 }, { "epoch": 0.5237577076532463, "grad_norm": 0.17750594019889832, "learning_rate": 9.320056276203054e-05, "loss": 0.1907, "step": 1444 }, { "epoch": 0.524120420747189, "grad_norm": 0.16206082701683044, "learning_rate": 9.308646741256439e-05, "loss": 0.1808, "step": 1445 }, { "epoch": 0.5244831338411317, "grad_norm": 0.1657271534204483, "learning_rate": 9.297238110547074e-05, "loss": 0.177, "step": 1446 }, { "epoch": 0.5248458469350744, "grad_norm": 0.19123826920986176, "learning_rate": 9.285830398996571e-05, "loss": 0.1817, "step": 1447 }, { "epoch": 0.5252085600290171, "grad_norm": 0.16904449462890625, "learning_rate": 9.274423621525354e-05, "loss": 0.1837, "step": 1448 }, { "epoch": 0.5255712731229597, "grad_norm": 0.19816622138023376, "learning_rate": 9.263017793052615e-05, "loss": 0.1954, "step": 1449 }, { "epoch": 0.5259339862169025, "grad_norm": 0.18440890312194824, "learning_rate": 9.251612928496298e-05, "loss": 0.1708, "step": 1450 }, { "epoch": 0.5262966993108451, "grad_norm": 0.18821316957473755, "learning_rate": 9.240209042773105e-05, "loss": 0.1929, "step": 1451 }, { "epoch": 0.5266594124047879, "grad_norm": 0.18499478697776794, "learning_rate": 9.228806150798442e-05, "loss": 0.1774, "step": 1452 }, { "epoch": 0.5270221254987305, "grad_norm": 0.21519748866558075, "learning_rate": 9.21740426748642e-05, "loss": 0.1915, "step": 1453 }, { "epoch": 0.5273848385926732, "grad_norm": 0.18411661684513092, "learning_rate": 9.206003407749833e-05, "loss": 0.2101, "step": 1454 }, { "epoch": 0.5277475516866159, "grad_norm": 0.17182524502277374, "learning_rate": 9.194603586500136e-05, "loss": 0.1672, "step": 1455 }, { "epoch": 0.5281102647805586, "grad_norm": 0.18551282584667206, "learning_rate": 9.183204818647424e-05, "loss": 0.1924, "step": 1456 }, { "epoch": 0.5284729778745013, "grad_norm": 0.18289272487163544, "learning_rate": 9.171807119100413e-05, "loss": 0.1781, "step": 1457 }, { "epoch": 0.528835690968444, "grad_norm": 0.169638991355896, "learning_rate": 9.160410502766424e-05, "loss": 0.1704, "step": 1458 }, { "epoch": 0.5291984040623866, "grad_norm": 0.17855599522590637, "learning_rate": 9.149014984551357e-05, "loss": 0.1761, "step": 1459 }, { "epoch": 0.5295611171563294, "grad_norm": 0.21452195942401886, "learning_rate": 9.137620579359685e-05, "loss": 0.1778, "step": 1460 }, { "epoch": 0.529923830250272, "grad_norm": 0.20922896265983582, "learning_rate": 9.126227302094417e-05, "loss": 0.2186, "step": 1461 }, { "epoch": 0.5302865433442148, "grad_norm": 0.15859532356262207, "learning_rate": 9.114835167657091e-05, "loss": 0.1829, "step": 1462 }, { "epoch": 0.5306492564381574, "grad_norm": 0.17610323429107666, "learning_rate": 9.103444190947746e-05, "loss": 0.2027, "step": 1463 }, { "epoch": 0.5310119695321001, "grad_norm": 0.17557282745838165, "learning_rate": 9.092054386864912e-05, "loss": 0.1747, "step": 1464 }, { "epoch": 0.5313746826260428, "grad_norm": 0.19372673332691193, "learning_rate": 9.080665770305578e-05, "loss": 0.1644, "step": 1465 }, { "epoch": 0.5317373957199855, "grad_norm": 0.20970730483531952, "learning_rate": 9.069278356165187e-05, "loss": 0.2032, "step": 1466 }, { "epoch": 0.5321001088139282, "grad_norm": 0.2470318228006363, "learning_rate": 9.057892159337612e-05, "loss": 0.2121, "step": 1467 }, { "epoch": 0.5324628219078709, "grad_norm": 0.1663379967212677, "learning_rate": 9.046507194715121e-05, "loss": 0.1741, "step": 1468 }, { "epoch": 0.5328255350018135, "grad_norm": 0.1842135637998581, "learning_rate": 9.035123477188381e-05, "loss": 0.1793, "step": 1469 }, { "epoch": 0.5331882480957563, "grad_norm": 0.19390299916267395, "learning_rate": 9.023741021646427e-05, "loss": 0.2071, "step": 1470 }, { "epoch": 0.5335509611896989, "grad_norm": 0.17016194760799408, "learning_rate": 9.012359842976638e-05, "loss": 0.1718, "step": 1471 }, { "epoch": 0.5339136742836417, "grad_norm": 0.19337502121925354, "learning_rate": 9.000979956064725e-05, "loss": 0.2095, "step": 1472 }, { "epoch": 0.5342763873775843, "grad_norm": 0.21092645823955536, "learning_rate": 8.989601375794717e-05, "loss": 0.1854, "step": 1473 }, { "epoch": 0.5346391004715271, "grad_norm": 0.18103566765785217, "learning_rate": 8.978224117048925e-05, "loss": 0.1829, "step": 1474 }, { "epoch": 0.5350018135654697, "grad_norm": 0.17190292477607727, "learning_rate": 8.966848194707931e-05, "loss": 0.1733, "step": 1475 }, { "epoch": 0.5353645266594124, "grad_norm": 0.18108366429805756, "learning_rate": 8.955473623650578e-05, "loss": 0.2058, "step": 1476 }, { "epoch": 0.5357272397533551, "grad_norm": 0.16649720072746277, "learning_rate": 8.944100418753931e-05, "loss": 0.1744, "step": 1477 }, { "epoch": 0.5360899528472978, "grad_norm": 0.15770559012889862, "learning_rate": 8.932728594893271e-05, "loss": 0.1775, "step": 1478 }, { "epoch": 0.5364526659412405, "grad_norm": 0.1907668113708496, "learning_rate": 8.921358166942084e-05, "loss": 0.1766, "step": 1479 }, { "epoch": 0.5368153790351832, "grad_norm": 0.18284808099269867, "learning_rate": 8.909989149772015e-05, "loss": 0.1708, "step": 1480 }, { "epoch": 0.5371780921291258, "grad_norm": 0.20297999680042267, "learning_rate": 8.898621558252874e-05, "loss": 0.165, "step": 1481 }, { "epoch": 0.5375408052230686, "grad_norm": 0.22023969888687134, "learning_rate": 8.887255407252596e-05, "loss": 0.1668, "step": 1482 }, { "epoch": 0.5379035183170112, "grad_norm": 0.17669132351875305, "learning_rate": 8.875890711637243e-05, "loss": 0.2046, "step": 1483 }, { "epoch": 0.538266231410954, "grad_norm": 0.17783772945404053, "learning_rate": 8.864527486270964e-05, "loss": 0.1648, "step": 1484 }, { "epoch": 0.5386289445048966, "grad_norm": 0.171718031167984, "learning_rate": 8.853165746015997e-05, "loss": 0.1897, "step": 1485 }, { "epoch": 0.5389916575988393, "grad_norm": 0.16997992992401123, "learning_rate": 8.841805505732626e-05, "loss": 0.167, "step": 1486 }, { "epoch": 0.539354370692782, "grad_norm": 0.1764468550682068, "learning_rate": 8.830446780279176e-05, "loss": 0.1659, "step": 1487 }, { "epoch": 0.5397170837867247, "grad_norm": 0.18435722589492798, "learning_rate": 8.819089584511996e-05, "loss": 0.1754, "step": 1488 }, { "epoch": 0.5400797968806674, "grad_norm": 0.19305875897407532, "learning_rate": 8.807733933285429e-05, "loss": 0.1918, "step": 1489 }, { "epoch": 0.5404425099746101, "grad_norm": 0.1882489174604416, "learning_rate": 8.796379841451796e-05, "loss": 0.1906, "step": 1490 }, { "epoch": 0.5408052230685527, "grad_norm": 0.14983880519866943, "learning_rate": 8.785027323861386e-05, "loss": 0.1552, "step": 1491 }, { "epoch": 0.5411679361624955, "grad_norm": 0.16522106528282166, "learning_rate": 8.773676395362425e-05, "loss": 0.1761, "step": 1492 }, { "epoch": 0.5415306492564381, "grad_norm": 0.17727860808372498, "learning_rate": 8.76232707080106e-05, "loss": 0.1631, "step": 1493 }, { "epoch": 0.5418933623503809, "grad_norm": 0.1912899911403656, "learning_rate": 8.750979365021338e-05, "loss": 0.1804, "step": 1494 }, { "epoch": 0.5422560754443235, "grad_norm": 0.185381218791008, "learning_rate": 8.739633292865192e-05, "loss": 0.1831, "step": 1495 }, { "epoch": 0.5426187885382663, "grad_norm": 0.18887324631214142, "learning_rate": 8.728288869172421e-05, "loss": 0.178, "step": 1496 }, { "epoch": 0.5429815016322089, "grad_norm": 0.1737644374370575, "learning_rate": 8.716946108780655e-05, "loss": 0.1769, "step": 1497 }, { "epoch": 0.5433442147261516, "grad_norm": 0.18002916872501373, "learning_rate": 8.705605026525371e-05, "loss": 0.1599, "step": 1498 }, { "epoch": 0.5437069278200943, "grad_norm": 0.18868666887283325, "learning_rate": 8.694265637239831e-05, "loss": 0.1661, "step": 1499 }, { "epoch": 0.544069640914037, "grad_norm": 0.20771367847919464, "learning_rate": 8.682927955755093e-05, "loss": 0.1839, "step": 1500 }, { "epoch": 0.5444323540079797, "grad_norm": 0.1799492985010147, "learning_rate": 8.671591996899974e-05, "loss": 0.1782, "step": 1501 }, { "epoch": 0.5447950671019224, "grad_norm": 0.17485234141349792, "learning_rate": 8.660257775501045e-05, "loss": 0.1698, "step": 1502 }, { "epoch": 0.545157780195865, "grad_norm": 0.17470629513263702, "learning_rate": 8.6489253063826e-05, "loss": 0.1695, "step": 1503 }, { "epoch": 0.5455204932898078, "grad_norm": 0.17630697786808014, "learning_rate": 8.637594604366647e-05, "loss": 0.175, "step": 1504 }, { "epoch": 0.5458832063837504, "grad_norm": 0.19793953001499176, "learning_rate": 8.626265684272876e-05, "loss": 0.1798, "step": 1505 }, { "epoch": 0.5462459194776932, "grad_norm": 0.19965516030788422, "learning_rate": 8.614938560918649e-05, "loss": 0.2011, "step": 1506 }, { "epoch": 0.5466086325716358, "grad_norm": 0.18119129538536072, "learning_rate": 8.603613249118977e-05, "loss": 0.1624, "step": 1507 }, { "epoch": 0.5469713456655785, "grad_norm": 0.19433656334877014, "learning_rate": 8.592289763686505e-05, "loss": 0.1842, "step": 1508 }, { "epoch": 0.5473340587595212, "grad_norm": 0.17872895300388336, "learning_rate": 8.580968119431483e-05, "loss": 0.1628, "step": 1509 }, { "epoch": 0.5476967718534639, "grad_norm": 0.18134737014770508, "learning_rate": 8.569648331161762e-05, "loss": 0.1649, "step": 1510 }, { "epoch": 0.5480594849474066, "grad_norm": 0.19080941379070282, "learning_rate": 8.558330413682759e-05, "loss": 0.1856, "step": 1511 }, { "epoch": 0.5484221980413493, "grad_norm": 0.20772339403629303, "learning_rate": 8.547014381797445e-05, "loss": 0.1904, "step": 1512 }, { "epoch": 0.5487849111352919, "grad_norm": 0.1807977259159088, "learning_rate": 8.535700250306322e-05, "loss": 0.1719, "step": 1513 }, { "epoch": 0.5491476242292347, "grad_norm": 0.18353581428527832, "learning_rate": 8.524388034007415e-05, "loss": 0.1758, "step": 1514 }, { "epoch": 0.5495103373231773, "grad_norm": 0.22524112462997437, "learning_rate": 8.51307774769623e-05, "loss": 0.1821, "step": 1515 }, { "epoch": 0.5498730504171201, "grad_norm": 0.17495766282081604, "learning_rate": 8.501769406165769e-05, "loss": 0.2193, "step": 1516 }, { "epoch": 0.5502357635110627, "grad_norm": 0.17903603613376617, "learning_rate": 8.490463024206474e-05, "loss": 0.1687, "step": 1517 }, { "epoch": 0.5505984766050055, "grad_norm": 0.1783863753080368, "learning_rate": 8.479158616606228e-05, "loss": 0.1699, "step": 1518 }, { "epoch": 0.5509611896989481, "grad_norm": 0.17774266004562378, "learning_rate": 8.467856198150333e-05, "loss": 0.1946, "step": 1519 }, { "epoch": 0.5513239027928908, "grad_norm": 0.20432449877262115, "learning_rate": 8.45655578362149e-05, "loss": 0.193, "step": 1520 }, { "epoch": 0.5516866158868335, "grad_norm": 0.1733636111021042, "learning_rate": 8.44525738779977e-05, "loss": 0.1712, "step": 1521 }, { "epoch": 0.5520493289807762, "grad_norm": 0.19748555123806, "learning_rate": 8.433961025462624e-05, "loss": 0.1969, "step": 1522 }, { "epoch": 0.552412042074719, "grad_norm": 0.18513956665992737, "learning_rate": 8.422666711384827e-05, "loss": 0.1735, "step": 1523 }, { "epoch": 0.5527747551686616, "grad_norm": 0.22357869148254395, "learning_rate": 8.411374460338474e-05, "loss": 0.1725, "step": 1524 }, { "epoch": 0.5531374682626042, "grad_norm": 0.18229088187217712, "learning_rate": 8.400084287092973e-05, "loss": 0.1724, "step": 1525 }, { "epoch": 0.553500181356547, "grad_norm": 0.15753042697906494, "learning_rate": 8.388796206415004e-05, "loss": 0.1762, "step": 1526 }, { "epoch": 0.5538628944504896, "grad_norm": 0.18276041746139526, "learning_rate": 8.377510233068518e-05, "loss": 0.1862, "step": 1527 }, { "epoch": 0.5542256075444324, "grad_norm": 0.2091018706560135, "learning_rate": 8.366226381814697e-05, "loss": 0.1722, "step": 1528 }, { "epoch": 0.554588320638375, "grad_norm": 0.1851229965686798, "learning_rate": 8.354944667411968e-05, "loss": 0.174, "step": 1529 }, { "epoch": 0.5549510337323177, "grad_norm": 0.18812698125839233, "learning_rate": 8.343665104615948e-05, "loss": 0.192, "step": 1530 }, { "epoch": 0.5553137468262604, "grad_norm": 0.18323373794555664, "learning_rate": 8.332387708179441e-05, "loss": 0.185, "step": 1531 }, { "epoch": 0.5556764599202031, "grad_norm": 0.187171071767807, "learning_rate": 8.321112492852422e-05, "loss": 0.18, "step": 1532 }, { "epoch": 0.5560391730141458, "grad_norm": 0.18064919114112854, "learning_rate": 8.30983947338201e-05, "loss": 0.1739, "step": 1533 }, { "epoch": 0.5564018861080885, "grad_norm": 0.1815587282180786, "learning_rate": 8.29856866451245e-05, "loss": 0.1818, "step": 1534 }, { "epoch": 0.5567645992020311, "grad_norm": 0.19945740699768066, "learning_rate": 8.287300080985106e-05, "loss": 0.2014, "step": 1535 }, { "epoch": 0.5571273122959739, "grad_norm": 0.1874108761548996, "learning_rate": 8.276033737538424e-05, "loss": 0.1719, "step": 1536 }, { "epoch": 0.5574900253899165, "grad_norm": 0.173946350812912, "learning_rate": 8.264769648907915e-05, "loss": 0.1616, "step": 1537 }, { "epoch": 0.5578527384838593, "grad_norm": 0.20264151692390442, "learning_rate": 8.25350782982615e-05, "loss": 0.1815, "step": 1538 }, { "epoch": 0.558215451577802, "grad_norm": 0.17723354697227478, "learning_rate": 8.242248295022727e-05, "loss": 0.1869, "step": 1539 }, { "epoch": 0.5585781646717447, "grad_norm": 0.16882532835006714, "learning_rate": 8.230991059224257e-05, "loss": 0.1593, "step": 1540 }, { "epoch": 0.5589408777656873, "grad_norm": 0.17361445724964142, "learning_rate": 8.219736137154347e-05, "loss": 0.1696, "step": 1541 }, { "epoch": 0.55930359085963, "grad_norm": 0.1865490823984146, "learning_rate": 8.208483543533573e-05, "loss": 0.2033, "step": 1542 }, { "epoch": 0.5596663039535728, "grad_norm": 0.17689920961856842, "learning_rate": 8.197233293079468e-05, "loss": 0.1679, "step": 1543 }, { "epoch": 0.5600290170475154, "grad_norm": 0.18286365270614624, "learning_rate": 8.185985400506502e-05, "loss": 0.1654, "step": 1544 }, { "epoch": 0.5603917301414582, "grad_norm": 0.18033449351787567, "learning_rate": 8.174739880526057e-05, "loss": 0.1814, "step": 1545 }, { "epoch": 0.5607544432354008, "grad_norm": 0.17507143318653107, "learning_rate": 8.163496747846411e-05, "loss": 0.1669, "step": 1546 }, { "epoch": 0.5611171563293434, "grad_norm": 0.16485197842121124, "learning_rate": 8.152256017172732e-05, "loss": 0.1666, "step": 1547 }, { "epoch": 0.5614798694232862, "grad_norm": 0.18058069050312042, "learning_rate": 8.14101770320703e-05, "loss": 0.1626, "step": 1548 }, { "epoch": 0.5618425825172289, "grad_norm": 0.17364412546157837, "learning_rate": 8.129781820648164e-05, "loss": 0.1913, "step": 1549 }, { "epoch": 0.5622052956111716, "grad_norm": 0.18617358803749084, "learning_rate": 8.118548384191809e-05, "loss": 0.1844, "step": 1550 }, { "epoch": 0.5625680087051143, "grad_norm": 0.17379792034626007, "learning_rate": 8.107317408530441e-05, "loss": 0.1657, "step": 1551 }, { "epoch": 0.5629307217990569, "grad_norm": 0.1696668565273285, "learning_rate": 8.096088908353315e-05, "loss": 0.1663, "step": 1552 }, { "epoch": 0.5632934348929997, "grad_norm": 0.16332849860191345, "learning_rate": 8.084862898346459e-05, "loss": 0.1707, "step": 1553 }, { "epoch": 0.5636561479869423, "grad_norm": 0.20836418867111206, "learning_rate": 8.073639393192634e-05, "loss": 0.1849, "step": 1554 }, { "epoch": 0.5640188610808851, "grad_norm": 0.1766640543937683, "learning_rate": 8.062418407571326e-05, "loss": 0.1593, "step": 1555 }, { "epoch": 0.5643815741748277, "grad_norm": 0.1723148226737976, "learning_rate": 8.051199956158727e-05, "loss": 0.1753, "step": 1556 }, { "epoch": 0.5647442872687704, "grad_norm": 0.17197547852993011, "learning_rate": 8.039984053627714e-05, "loss": 0.1664, "step": 1557 }, { "epoch": 0.5651070003627131, "grad_norm": 0.17370520532131195, "learning_rate": 8.02877071464783e-05, "loss": 0.1767, "step": 1558 }, { "epoch": 0.5654697134566558, "grad_norm": 0.18708960711956024, "learning_rate": 8.017559953885267e-05, "loss": 0.1951, "step": 1559 }, { "epoch": 0.5658324265505985, "grad_norm": 0.21225912868976593, "learning_rate": 8.006351786002846e-05, "loss": 0.1752, "step": 1560 }, { "epoch": 0.5661951396445412, "grad_norm": 0.17883837223052979, "learning_rate": 7.995146225659994e-05, "loss": 0.1665, "step": 1561 }, { "epoch": 0.5665578527384839, "grad_norm": 0.16992917656898499, "learning_rate": 7.98394328751272e-05, "loss": 0.1691, "step": 1562 }, { "epoch": 0.5669205658324266, "grad_norm": 0.18541240692138672, "learning_rate": 7.972742986213623e-05, "loss": 0.1818, "step": 1563 }, { "epoch": 0.5672832789263692, "grad_norm": 0.17470984160900116, "learning_rate": 7.961545336411836e-05, "loss": 0.1715, "step": 1564 }, { "epoch": 0.567645992020312, "grad_norm": 0.21040913462638855, "learning_rate": 7.950350352753023e-05, "loss": 0.1873, "step": 1565 }, { "epoch": 0.5680087051142546, "grad_norm": 0.17107225954532623, "learning_rate": 7.93915804987938e-05, "loss": 0.1559, "step": 1566 }, { "epoch": 0.5683714182081974, "grad_norm": 0.16713112592697144, "learning_rate": 7.927968442429576e-05, "loss": 0.1734, "step": 1567 }, { "epoch": 0.56873413130214, "grad_norm": 0.18837302923202515, "learning_rate": 7.916781545038767e-05, "loss": 0.167, "step": 1568 }, { "epoch": 0.5690968443960827, "grad_norm": 0.17015686631202698, "learning_rate": 7.905597372338558e-05, "loss": 0.1703, "step": 1569 }, { "epoch": 0.5694595574900254, "grad_norm": 0.17552775144577026, "learning_rate": 7.894415938956991e-05, "loss": 0.1623, "step": 1570 }, { "epoch": 0.5698222705839681, "grad_norm": 0.1910295933485031, "learning_rate": 7.883237259518526e-05, "loss": 0.1642, "step": 1571 }, { "epoch": 0.5701849836779108, "grad_norm": 0.19286568462848663, "learning_rate": 7.872061348644028e-05, "loss": 0.1776, "step": 1572 }, { "epoch": 0.5705476967718535, "grad_norm": 0.17776118218898773, "learning_rate": 7.86088822095073e-05, "loss": 0.167, "step": 1573 }, { "epoch": 0.5709104098657961, "grad_norm": 0.1805812269449234, "learning_rate": 7.84971789105223e-05, "loss": 0.1666, "step": 1574 }, { "epoch": 0.5712731229597389, "grad_norm": 0.3048454523086548, "learning_rate": 7.838550373558469e-05, "loss": 0.2252, "step": 1575 }, { "epoch": 0.5716358360536815, "grad_norm": 0.18575210869312286, "learning_rate": 7.827385683075701e-05, "loss": 0.1673, "step": 1576 }, { "epoch": 0.5719985491476243, "grad_norm": 0.19140534102916718, "learning_rate": 7.816223834206489e-05, "loss": 0.1651, "step": 1577 }, { "epoch": 0.5723612622415669, "grad_norm": 0.15774936974048615, "learning_rate": 7.805064841549685e-05, "loss": 0.1579, "step": 1578 }, { "epoch": 0.5727239753355096, "grad_norm": 0.16118699312210083, "learning_rate": 7.793908719700396e-05, "loss": 0.1656, "step": 1579 }, { "epoch": 0.5730866884294523, "grad_norm": 0.19020985066890717, "learning_rate": 7.782755483249973e-05, "loss": 0.1775, "step": 1580 }, { "epoch": 0.573449401523395, "grad_norm": 0.1851213425397873, "learning_rate": 7.771605146786003e-05, "loss": 0.1876, "step": 1581 }, { "epoch": 0.5738121146173377, "grad_norm": 0.17101642489433289, "learning_rate": 7.760457724892272e-05, "loss": 0.1714, "step": 1582 }, { "epoch": 0.5741748277112804, "grad_norm": 0.17683084309101105, "learning_rate": 7.749313232148753e-05, "loss": 0.166, "step": 1583 }, { "epoch": 0.5745375408052231, "grad_norm": 0.1966182291507721, "learning_rate": 7.738171683131594e-05, "loss": 0.1763, "step": 1584 }, { "epoch": 0.5749002538991658, "grad_norm": 0.1787012666463852, "learning_rate": 7.727033092413094e-05, "loss": 0.1621, "step": 1585 }, { "epoch": 0.5752629669931084, "grad_norm": 0.18337036669254303, "learning_rate": 7.715897474561675e-05, "loss": 0.1568, "step": 1586 }, { "epoch": 0.5756256800870512, "grad_norm": 0.18342240154743195, "learning_rate": 7.704764844141873e-05, "loss": 0.1722, "step": 1587 }, { "epoch": 0.5759883931809938, "grad_norm": 0.17828598618507385, "learning_rate": 7.693635215714322e-05, "loss": 0.1562, "step": 1588 }, { "epoch": 0.5763511062749366, "grad_norm": 0.19080400466918945, "learning_rate": 7.682508603835722e-05, "loss": 0.1783, "step": 1589 }, { "epoch": 0.5767138193688792, "grad_norm": 0.16964450478553772, "learning_rate": 7.67138502305883e-05, "loss": 0.1726, "step": 1590 }, { "epoch": 0.5770765324628219, "grad_norm": 0.19029711186885834, "learning_rate": 7.660264487932444e-05, "loss": 0.1574, "step": 1591 }, { "epoch": 0.5774392455567646, "grad_norm": 0.21546104550361633, "learning_rate": 7.649147013001376e-05, "loss": 0.1691, "step": 1592 }, { "epoch": 0.5778019586507073, "grad_norm": 0.17420600354671478, "learning_rate": 7.63803261280643e-05, "loss": 0.1612, "step": 1593 }, { "epoch": 0.57816467174465, "grad_norm": 0.18015912175178528, "learning_rate": 7.626921301884395e-05, "loss": 0.1622, "step": 1594 }, { "epoch": 0.5785273848385927, "grad_norm": 0.16851022839546204, "learning_rate": 7.615813094768012e-05, "loss": 0.1642, "step": 1595 }, { "epoch": 0.5788900979325353, "grad_norm": 0.1783701479434967, "learning_rate": 7.604708005985971e-05, "loss": 0.1726, "step": 1596 }, { "epoch": 0.5792528110264781, "grad_norm": 0.16931217908859253, "learning_rate": 7.593606050062881e-05, "loss": 0.1608, "step": 1597 }, { "epoch": 0.5796155241204207, "grad_norm": 0.16568873822689056, "learning_rate": 7.582507241519252e-05, "loss": 0.163, "step": 1598 }, { "epoch": 0.5799782372143635, "grad_norm": 0.16731184720993042, "learning_rate": 7.571411594871474e-05, "loss": 0.2004, "step": 1599 }, { "epoch": 0.5803409503083061, "grad_norm": 0.2044878900051117, "learning_rate": 7.56031912463181e-05, "loss": 0.1608, "step": 1600 }, { "epoch": 0.5807036634022488, "grad_norm": 0.2527421712875366, "learning_rate": 7.549229845308362e-05, "loss": 0.1948, "step": 1601 }, { "epoch": 0.5810663764961915, "grad_norm": 0.16458679735660553, "learning_rate": 7.538143771405055e-05, "loss": 0.1495, "step": 1602 }, { "epoch": 0.5814290895901342, "grad_norm": 0.16658927500247955, "learning_rate": 7.527060917421635e-05, "loss": 0.1555, "step": 1603 }, { "epoch": 0.5817918026840769, "grad_norm": 0.17401687800884247, "learning_rate": 7.515981297853626e-05, "loss": 0.1702, "step": 1604 }, { "epoch": 0.5821545157780196, "grad_norm": 0.17915883660316467, "learning_rate": 7.504904927192322e-05, "loss": 0.175, "step": 1605 }, { "epoch": 0.5825172288719622, "grad_norm": 0.18019749224185944, "learning_rate": 7.493831819924772e-05, "loss": 0.1703, "step": 1606 }, { "epoch": 0.582879941965905, "grad_norm": 0.18500368297100067, "learning_rate": 7.482761990533752e-05, "loss": 0.1741, "step": 1607 }, { "epoch": 0.5832426550598476, "grad_norm": 0.18486149609088898, "learning_rate": 7.47169545349775e-05, "loss": 0.1508, "step": 1608 }, { "epoch": 0.5836053681537904, "grad_norm": 0.2042957842350006, "learning_rate": 7.46063222329096e-05, "loss": 0.173, "step": 1609 }, { "epoch": 0.583968081247733, "grad_norm": 0.19605065882205963, "learning_rate": 7.449572314383237e-05, "loss": 0.1865, "step": 1610 }, { "epoch": 0.5843307943416758, "grad_norm": 0.18277035653591156, "learning_rate": 7.438515741240097e-05, "loss": 0.195, "step": 1611 }, { "epoch": 0.5846935074356184, "grad_norm": 0.18347297608852386, "learning_rate": 7.427462518322693e-05, "loss": 0.1579, "step": 1612 }, { "epoch": 0.5850562205295611, "grad_norm": 0.1746947020292282, "learning_rate": 7.416412660087796e-05, "loss": 0.1951, "step": 1613 }, { "epoch": 0.5854189336235038, "grad_norm": 0.1751972883939743, "learning_rate": 7.405366180987775e-05, "loss": 0.1633, "step": 1614 }, { "epoch": 0.5857816467174465, "grad_norm": 0.17814141511917114, "learning_rate": 7.394323095470586e-05, "loss": 0.1845, "step": 1615 }, { "epoch": 0.5861443598113892, "grad_norm": 0.1747366487979889, "learning_rate": 7.383283417979739e-05, "loss": 0.1777, "step": 1616 }, { "epoch": 0.5865070729053319, "grad_norm": 0.177615687251091, "learning_rate": 7.372247162954282e-05, "loss": 0.1691, "step": 1617 }, { "epoch": 0.5868697859992745, "grad_norm": 0.1927955448627472, "learning_rate": 7.361214344828805e-05, "loss": 0.164, "step": 1618 }, { "epoch": 0.5872324990932173, "grad_norm": 0.17188555002212524, "learning_rate": 7.350184978033386e-05, "loss": 0.1704, "step": 1619 }, { "epoch": 0.5875952121871599, "grad_norm": 0.25001007318496704, "learning_rate": 7.339159076993592e-05, "loss": 0.2025, "step": 1620 }, { "epoch": 0.5879579252811027, "grad_norm": 0.18958470225334167, "learning_rate": 7.328136656130458e-05, "loss": 0.1793, "step": 1621 }, { "epoch": 0.5883206383750453, "grad_norm": 0.18085351586341858, "learning_rate": 7.317117729860475e-05, "loss": 0.1669, "step": 1622 }, { "epoch": 0.588683351468988, "grad_norm": 0.18232987821102142, "learning_rate": 7.306102312595553e-05, "loss": 0.1649, "step": 1623 }, { "epoch": 0.5890460645629307, "grad_norm": 0.17970141768455505, "learning_rate": 7.295090418743018e-05, "loss": 0.1757, "step": 1624 }, { "epoch": 0.5894087776568734, "grad_norm": 0.1799871325492859, "learning_rate": 7.284082062705584e-05, "loss": 0.1716, "step": 1625 }, { "epoch": 0.5897714907508161, "grad_norm": 0.1792754977941513, "learning_rate": 7.273077258881342e-05, "loss": 0.1825, "step": 1626 }, { "epoch": 0.5901342038447588, "grad_norm": 0.17742280662059784, "learning_rate": 7.262076021663727e-05, "loss": 0.159, "step": 1627 }, { "epoch": 0.5904969169387014, "grad_norm": 0.20353969931602478, "learning_rate": 7.251078365441528e-05, "loss": 0.1597, "step": 1628 }, { "epoch": 0.5908596300326442, "grad_norm": 0.18415038287639618, "learning_rate": 7.240084304598835e-05, "loss": 0.1774, "step": 1629 }, { "epoch": 0.5912223431265868, "grad_norm": 0.18927162885665894, "learning_rate": 7.229093853515038e-05, "loss": 0.1628, "step": 1630 }, { "epoch": 0.5915850562205296, "grad_norm": 0.1826174110174179, "learning_rate": 7.21810702656481e-05, "loss": 0.1604, "step": 1631 }, { "epoch": 0.5919477693144722, "grad_norm": 0.17375624179840088, "learning_rate": 7.207123838118077e-05, "loss": 0.1647, "step": 1632 }, { "epoch": 0.592310482408415, "grad_norm": 0.1889926791191101, "learning_rate": 7.196144302540014e-05, "loss": 0.1882, "step": 1633 }, { "epoch": 0.5926731955023576, "grad_norm": 0.17155472934246063, "learning_rate": 7.185168434191014e-05, "loss": 0.1552, "step": 1634 }, { "epoch": 0.5930359085963003, "grad_norm": 0.18929725885391235, "learning_rate": 7.174196247426677e-05, "loss": 0.163, "step": 1635 }, { "epoch": 0.593398621690243, "grad_norm": 0.18491095304489136, "learning_rate": 7.163227756597779e-05, "loss": 0.172, "step": 1636 }, { "epoch": 0.5937613347841857, "grad_norm": 0.19160285592079163, "learning_rate": 7.152262976050275e-05, "loss": 0.1642, "step": 1637 }, { "epoch": 0.5941240478781284, "grad_norm": 0.18393130600452423, "learning_rate": 7.141301920125256e-05, "loss": 0.1504, "step": 1638 }, { "epoch": 0.5944867609720711, "grad_norm": 0.1797264665365219, "learning_rate": 7.130344603158942e-05, "loss": 0.1607, "step": 1639 }, { "epoch": 0.5948494740660137, "grad_norm": 0.16639918088912964, "learning_rate": 7.119391039482677e-05, "loss": 0.1637, "step": 1640 }, { "epoch": 0.5952121871599565, "grad_norm": 0.17723850905895233, "learning_rate": 7.10844124342288e-05, "loss": 0.1695, "step": 1641 }, { "epoch": 0.5955749002538991, "grad_norm": 0.1672993302345276, "learning_rate": 7.097495229301048e-05, "loss": 0.1596, "step": 1642 }, { "epoch": 0.5959376133478419, "grad_norm": 0.18969713151454926, "learning_rate": 7.08655301143373e-05, "loss": 0.1658, "step": 1643 }, { "epoch": 0.5963003264417845, "grad_norm": 0.18681742250919342, "learning_rate": 7.075614604132512e-05, "loss": 0.1822, "step": 1644 }, { "epoch": 0.5966630395357272, "grad_norm": 0.22509360313415527, "learning_rate": 7.064680021703992e-05, "loss": 0.1951, "step": 1645 }, { "epoch": 0.59702575262967, "grad_norm": 0.1588478535413742, "learning_rate": 7.053749278449774e-05, "loss": 0.1643, "step": 1646 }, { "epoch": 0.5973884657236126, "grad_norm": 0.1908983290195465, "learning_rate": 7.042822388666436e-05, "loss": 0.1674, "step": 1647 }, { "epoch": 0.5977511788175554, "grad_norm": 0.19821012020111084, "learning_rate": 7.031899366645511e-05, "loss": 0.1817, "step": 1648 }, { "epoch": 0.598113891911498, "grad_norm": 0.18674594163894653, "learning_rate": 7.020980226673477e-05, "loss": 0.1547, "step": 1649 }, { "epoch": 0.5984766050054406, "grad_norm": 0.2012438029050827, "learning_rate": 7.010064983031737e-05, "loss": 0.1793, "step": 1650 }, { "epoch": 0.5988393180993834, "grad_norm": 0.18832942843437195, "learning_rate": 6.999153649996595e-05, "loss": 0.1797, "step": 1651 }, { "epoch": 0.599202031193326, "grad_norm": 0.20757931470870972, "learning_rate": 6.98824624183924e-05, "loss": 0.174, "step": 1652 }, { "epoch": 0.5995647442872688, "grad_norm": 0.1787773221731186, "learning_rate": 6.977342772825732e-05, "loss": 0.1577, "step": 1653 }, { "epoch": 0.5999274573812114, "grad_norm": 0.18228726089000702, "learning_rate": 6.966443257216971e-05, "loss": 0.1834, "step": 1654 }, { "epoch": 0.6002901704751542, "grad_norm": 10869.5341796875, "learning_rate": 6.955547709268697e-05, "loss": 0.1647, "step": 1655 }, { "epoch": 0.6006528835690969, "grad_norm": 0.18677209317684174, "learning_rate": 6.94465614323145e-05, "loss": 0.1921, "step": 1656 }, { "epoch": 0.6010155966630395, "grad_norm": 0.21163515746593475, "learning_rate": 6.933768573350567e-05, "loss": 0.171, "step": 1657 }, { "epoch": 0.6013783097569823, "grad_norm": 0.1897449642419815, "learning_rate": 6.922885013866153e-05, "loss": 0.1877, "step": 1658 }, { "epoch": 0.6017410228509249, "grad_norm": 0.20126648247241974, "learning_rate": 6.912005479013082e-05, "loss": 0.2154, "step": 1659 }, { "epoch": 0.6021037359448677, "grad_norm": 0.21092937886714935, "learning_rate": 6.901129983020948e-05, "loss": 0.1868, "step": 1660 }, { "epoch": 0.6024664490388103, "grad_norm": 0.23496972024440765, "learning_rate": 6.890258540114074e-05, "loss": 0.1784, "step": 1661 }, { "epoch": 0.602829162132753, "grad_norm": 0.21016502380371094, "learning_rate": 6.879391164511471e-05, "loss": 0.1728, "step": 1662 }, { "epoch": 0.6031918752266957, "grad_norm": 0.2230292558670044, "learning_rate": 6.86852787042684e-05, "loss": 0.1849, "step": 1663 }, { "epoch": 0.6035545883206384, "grad_norm": 0.19853949546813965, "learning_rate": 6.857668672068534e-05, "loss": 0.1782, "step": 1664 }, { "epoch": 0.6039173014145811, "grad_norm": 0.1775451898574829, "learning_rate": 6.846813583639562e-05, "loss": 0.1497, "step": 1665 }, { "epoch": 0.6042800145085238, "grad_norm": 0.1857757419347763, "learning_rate": 6.835962619337549e-05, "loss": 0.1836, "step": 1666 }, { "epoch": 0.6046427276024664, "grad_norm": 0.1867503970861435, "learning_rate": 6.825115793354726e-05, "loss": 0.1556, "step": 1667 }, { "epoch": 0.6050054406964092, "grad_norm": 0.18607592582702637, "learning_rate": 6.814273119877912e-05, "loss": 0.2011, "step": 1668 }, { "epoch": 0.6053681537903518, "grad_norm": 0.18926583230495453, "learning_rate": 6.803434613088497e-05, "loss": 0.1661, "step": 1669 }, { "epoch": 0.6057308668842946, "grad_norm": 0.18735969066619873, "learning_rate": 6.792600287162416e-05, "loss": 0.1591, "step": 1670 }, { "epoch": 0.6060935799782372, "grad_norm": 0.23324711620807648, "learning_rate": 6.781770156270149e-05, "loss": 0.1656, "step": 1671 }, { "epoch": 0.6064562930721799, "grad_norm": 0.1974279284477234, "learning_rate": 6.77094423457667e-05, "loss": 0.1585, "step": 1672 }, { "epoch": 0.6068190061661226, "grad_norm": 0.20500749349594116, "learning_rate": 6.760122536241462e-05, "loss": 0.164, "step": 1673 }, { "epoch": 0.6071817192600653, "grad_norm": 0.16157761216163635, "learning_rate": 6.749305075418482e-05, "loss": 0.171, "step": 1674 }, { "epoch": 0.607544432354008, "grad_norm": 0.19271859526634216, "learning_rate": 6.738491866256138e-05, "loss": 0.1777, "step": 1675 }, { "epoch": 0.6079071454479507, "grad_norm": 0.18441638350486755, "learning_rate": 6.727682922897282e-05, "loss": 0.1683, "step": 1676 }, { "epoch": 0.6082698585418934, "grad_norm": 0.17519617080688477, "learning_rate": 6.716878259479189e-05, "loss": 0.1739, "step": 1677 }, { "epoch": 0.6086325716358361, "grad_norm": 0.18938271701335907, "learning_rate": 6.706077890133531e-05, "loss": 0.1606, "step": 1678 }, { "epoch": 0.6089952847297787, "grad_norm": 0.20264668762683868, "learning_rate": 6.695281828986369e-05, "loss": 0.174, "step": 1679 }, { "epoch": 0.6093579978237215, "grad_norm": 0.22438956797122955, "learning_rate": 6.684490090158124e-05, "loss": 0.1594, "step": 1680 }, { "epoch": 0.6097207109176641, "grad_norm": 0.19163423776626587, "learning_rate": 6.673702687763565e-05, "loss": 0.1594, "step": 1681 }, { "epoch": 0.6100834240116069, "grad_norm": 0.1845075786113739, "learning_rate": 6.662919635911793e-05, "loss": 0.173, "step": 1682 }, { "epoch": 0.6104461371055495, "grad_norm": 0.18868669867515564, "learning_rate": 6.652140948706209e-05, "loss": 0.1786, "step": 1683 }, { "epoch": 0.6108088501994922, "grad_norm": 0.22319957613945007, "learning_rate": 6.641366640244525e-05, "loss": 0.2068, "step": 1684 }, { "epoch": 0.6111715632934349, "grad_norm": 0.18685069680213928, "learning_rate": 6.630596724618703e-05, "loss": 0.1751, "step": 1685 }, { "epoch": 0.6115342763873776, "grad_norm": 0.18427863717079163, "learning_rate": 6.619831215914974e-05, "loss": 0.1707, "step": 1686 }, { "epoch": 0.6118969894813203, "grad_norm": 0.19461330771446228, "learning_rate": 6.609070128213802e-05, "loss": 0.178, "step": 1687 }, { "epoch": 0.612259702575263, "grad_norm": 0.21272696554660797, "learning_rate": 6.598313475589863e-05, "loss": 0.1789, "step": 1688 }, { "epoch": 0.6126224156692056, "grad_norm": 0.20163173973560333, "learning_rate": 6.58756127211204e-05, "loss": 0.2014, "step": 1689 }, { "epoch": 0.6129851287631484, "grad_norm": 0.1940133273601532, "learning_rate": 6.576813531843396e-05, "loss": 0.1703, "step": 1690 }, { "epoch": 0.613347841857091, "grad_norm": 0.17384611070156097, "learning_rate": 6.566070268841152e-05, "loss": 0.1556, "step": 1691 }, { "epoch": 0.6137105549510338, "grad_norm": 0.1869945228099823, "learning_rate": 6.555331497156672e-05, "loss": 0.1548, "step": 1692 }, { "epoch": 0.6140732680449764, "grad_norm": 0.18520064651966095, "learning_rate": 6.544597230835454e-05, "loss": 0.1807, "step": 1693 }, { "epoch": 0.6144359811389191, "grad_norm": 0.17966820299625397, "learning_rate": 6.533867483917098e-05, "loss": 0.1516, "step": 1694 }, { "epoch": 0.6147986942328618, "grad_norm": 0.1705074906349182, "learning_rate": 6.523142270435288e-05, "loss": 0.1518, "step": 1695 }, { "epoch": 0.6151614073268045, "grad_norm": 0.24414807558059692, "learning_rate": 6.512421604417792e-05, "loss": 0.2026, "step": 1696 }, { "epoch": 0.6155241204207472, "grad_norm": 0.16796554625034332, "learning_rate": 6.501705499886418e-05, "loss": 0.1554, "step": 1697 }, { "epoch": 0.6158868335146899, "grad_norm": 0.19749103486537933, "learning_rate": 6.490993970857011e-05, "loss": 0.1807, "step": 1698 }, { "epoch": 0.6162495466086326, "grad_norm": 0.16789931058883667, "learning_rate": 6.480287031339436e-05, "loss": 0.1617, "step": 1699 }, { "epoch": 0.6166122597025753, "grad_norm": 0.1916869580745697, "learning_rate": 6.469584695337548e-05, "loss": 0.188, "step": 1700 }, { "epoch": 0.6169749727965179, "grad_norm": 0.19540345668792725, "learning_rate": 6.458886976849183e-05, "loss": 0.1743, "step": 1701 }, { "epoch": 0.6173376858904607, "grad_norm": 0.17193295061588287, "learning_rate": 6.448193889866149e-05, "loss": 0.1763, "step": 1702 }, { "epoch": 0.6177003989844033, "grad_norm": 0.17156308889389038, "learning_rate": 6.43750544837418e-05, "loss": 0.158, "step": 1703 }, { "epoch": 0.6180631120783461, "grad_norm": 0.1796158254146576, "learning_rate": 6.426821666352942e-05, "loss": 0.1656, "step": 1704 }, { "epoch": 0.6184258251722887, "grad_norm": 0.18700680136680603, "learning_rate": 6.416142557776006e-05, "loss": 0.174, "step": 1705 }, { "epoch": 0.6187885382662314, "grad_norm": 0.16723744571208954, "learning_rate": 6.405468136610832e-05, "loss": 0.1619, "step": 1706 }, { "epoch": 0.6191512513601741, "grad_norm": 0.17422862350940704, "learning_rate": 6.394798416818739e-05, "loss": 0.1609, "step": 1707 }, { "epoch": 0.6195139644541168, "grad_norm": 0.20079629123210907, "learning_rate": 6.384133412354918e-05, "loss": 0.1652, "step": 1708 }, { "epoch": 0.6198766775480595, "grad_norm": 0.2474866658449173, "learning_rate": 6.373473137168373e-05, "loss": 0.1663, "step": 1709 }, { "epoch": 0.6202393906420022, "grad_norm": 0.1707204282283783, "learning_rate": 6.36281760520193e-05, "loss": 0.1592, "step": 1710 }, { "epoch": 0.6206021037359448, "grad_norm": 0.17606933414936066, "learning_rate": 6.352166830392213e-05, "loss": 0.1662, "step": 1711 }, { "epoch": 0.6209648168298876, "grad_norm": 0.17025688290596008, "learning_rate": 6.341520826669621e-05, "loss": 0.1592, "step": 1712 }, { "epoch": 0.6213275299238302, "grad_norm": 0.18838566541671753, "learning_rate": 6.330879607958314e-05, "loss": 0.1816, "step": 1713 }, { "epoch": 0.621690243017773, "grad_norm": 0.2592281103134155, "learning_rate": 6.320243188176185e-05, "loss": 0.2014, "step": 1714 }, { "epoch": 0.6220529561117156, "grad_norm": 0.16398011147975922, "learning_rate": 6.309611581234872e-05, "loss": 0.1585, "step": 1715 }, { "epoch": 0.6224156692056583, "grad_norm": 0.1793876439332962, "learning_rate": 6.298984801039697e-05, "loss": 0.1532, "step": 1716 }, { "epoch": 0.622778382299601, "grad_norm": 0.1910189986228943, "learning_rate": 6.28836286148968e-05, "loss": 0.1666, "step": 1717 }, { "epoch": 0.6231410953935437, "grad_norm": 0.20349231362342834, "learning_rate": 6.277745776477506e-05, "loss": 0.2075, "step": 1718 }, { "epoch": 0.6235038084874864, "grad_norm": 0.19140169024467468, "learning_rate": 6.267133559889509e-05, "loss": 0.1574, "step": 1719 }, { "epoch": 0.6238665215814291, "grad_norm": 0.18104875087738037, "learning_rate": 6.256526225605652e-05, "loss": 0.1594, "step": 1720 }, { "epoch": 0.6242292346753718, "grad_norm": 0.18763144314289093, "learning_rate": 6.245923787499532e-05, "loss": 0.1613, "step": 1721 }, { "epoch": 0.6245919477693145, "grad_norm": 0.16338056325912476, "learning_rate": 6.235326259438317e-05, "loss": 0.1823, "step": 1722 }, { "epoch": 0.6249546608632571, "grad_norm": 0.1663455367088318, "learning_rate": 6.224733655282771e-05, "loss": 0.167, "step": 1723 }, { "epoch": 0.6253173739571999, "grad_norm": 0.17179372906684875, "learning_rate": 6.214145988887206e-05, "loss": 0.1645, "step": 1724 }, { "epoch": 0.6256800870511425, "grad_norm": 0.16161875426769257, "learning_rate": 6.203563274099481e-05, "loss": 0.1402, "step": 1725 }, { "epoch": 0.6260428001450853, "grad_norm": 0.2017858475446701, "learning_rate": 6.19298552476098e-05, "loss": 0.1667, "step": 1726 }, { "epoch": 0.6264055132390279, "grad_norm": 0.22198174893856049, "learning_rate": 6.182412754706594e-05, "loss": 0.1902, "step": 1727 }, { "epoch": 0.6267682263329706, "grad_norm": 0.1705772578716278, "learning_rate": 6.171844977764695e-05, "loss": 0.1588, "step": 1728 }, { "epoch": 0.6271309394269133, "grad_norm": 0.17019295692443848, "learning_rate": 6.161282207757126e-05, "loss": 0.1609, "step": 1729 }, { "epoch": 0.627493652520856, "grad_norm": 0.1743742674589157, "learning_rate": 6.15072445849919e-05, "loss": 0.179, "step": 1730 }, { "epoch": 0.6278563656147987, "grad_norm": 0.16775129735469818, "learning_rate": 6.140171743799611e-05, "loss": 0.1807, "step": 1731 }, { "epoch": 0.6282190787087414, "grad_norm": 0.18963152170181274, "learning_rate": 6.129624077460532e-05, "loss": 0.2007, "step": 1732 }, { "epoch": 0.628581791802684, "grad_norm": 0.182524174451828, "learning_rate": 6.119081473277501e-05, "loss": 0.1738, "step": 1733 }, { "epoch": 0.6289445048966268, "grad_norm": 0.18262414634227753, "learning_rate": 6.108543945039438e-05, "loss": 0.1897, "step": 1734 }, { "epoch": 0.6293072179905694, "grad_norm": 0.1729535162448883, "learning_rate": 6.098011506528623e-05, "loss": 0.1586, "step": 1735 }, { "epoch": 0.6296699310845122, "grad_norm": 0.1677355319261551, "learning_rate": 6.0874841715206785e-05, "loss": 0.1871, "step": 1736 }, { "epoch": 0.6300326441784548, "grad_norm": 0.17900875210762024, "learning_rate": 6.076961953784559e-05, "loss": 0.1595, "step": 1737 }, { "epoch": 0.6303953572723975, "grad_norm": 0.18250757455825806, "learning_rate": 6.066444867082515e-05, "loss": 0.1842, "step": 1738 }, { "epoch": 0.6307580703663402, "grad_norm": 0.17696964740753174, "learning_rate": 6.0559329251701005e-05, "loss": 0.1709, "step": 1739 }, { "epoch": 0.6311207834602829, "grad_norm": 0.1764724850654602, "learning_rate": 6.045426141796128e-05, "loss": 0.161, "step": 1740 }, { "epoch": 0.6314834965542256, "grad_norm": 0.17228443920612335, "learning_rate": 6.03492453070267e-05, "loss": 0.1579, "step": 1741 }, { "epoch": 0.6318462096481683, "grad_norm": 0.17399545013904572, "learning_rate": 6.024428105625028e-05, "loss": 0.1555, "step": 1742 }, { "epoch": 0.6322089227421109, "grad_norm": 0.1953967958688736, "learning_rate": 6.0139368802917284e-05, "loss": 0.2569, "step": 1743 }, { "epoch": 0.6325716358360537, "grad_norm": 0.17359597980976105, "learning_rate": 6.0034508684244875e-05, "loss": 0.1783, "step": 1744 }, { "epoch": 0.6329343489299963, "grad_norm": 0.1505521535873413, "learning_rate": 5.992970083738212e-05, "loss": 0.1567, "step": 1745 }, { "epoch": 0.6332970620239391, "grad_norm": 0.18801428377628326, "learning_rate": 5.982494539940966e-05, "loss": 0.2076, "step": 1746 }, { "epoch": 0.6336597751178817, "grad_norm": 0.16666316986083984, "learning_rate": 5.97202425073396e-05, "loss": 0.1617, "step": 1747 }, { "epoch": 0.6340224882118245, "grad_norm": 0.174256831407547, "learning_rate": 5.961559229811535e-05, "loss": 0.167, "step": 1748 }, { "epoch": 0.6343852013057671, "grad_norm": 0.16997861862182617, "learning_rate": 5.951099490861136e-05, "loss": 0.191, "step": 1749 }, { "epoch": 0.6347479143997098, "grad_norm": 0.18059667944908142, "learning_rate": 5.940645047563306e-05, "loss": 0.1769, "step": 1750 }, { "epoch": 0.6351106274936525, "grad_norm": 0.17815832793712616, "learning_rate": 5.9301959135916496e-05, "loss": 0.1406, "step": 1751 }, { "epoch": 0.6354733405875952, "grad_norm": 0.1702101081609726, "learning_rate": 5.919752102612848e-05, "loss": 0.1471, "step": 1752 }, { "epoch": 0.635836053681538, "grad_norm": 0.1625283807516098, "learning_rate": 5.909313628286601e-05, "loss": 0.1446, "step": 1753 }, { "epoch": 0.6361987667754806, "grad_norm": 0.16857244074344635, "learning_rate": 5.898880504265638e-05, "loss": 0.1561, "step": 1754 }, { "epoch": 0.6365614798694232, "grad_norm": 0.18340398371219635, "learning_rate": 5.888452744195687e-05, "loss": 0.1862, "step": 1755 }, { "epoch": 0.636924192963366, "grad_norm": 0.20158030092716217, "learning_rate": 5.878030361715461e-05, "loss": 0.1571, "step": 1756 }, { "epoch": 0.6372869060573086, "grad_norm": 0.17433685064315796, "learning_rate": 5.867613370456636e-05, "loss": 0.1629, "step": 1757 }, { "epoch": 0.6376496191512514, "grad_norm": 0.16959048807621002, "learning_rate": 5.857201784043851e-05, "loss": 0.1742, "step": 1758 }, { "epoch": 0.638012332245194, "grad_norm": 0.17399851977825165, "learning_rate": 5.8467956160946604e-05, "loss": 0.1605, "step": 1759 }, { "epoch": 0.6383750453391367, "grad_norm": 0.1925593912601471, "learning_rate": 5.8363948802195356e-05, "loss": 0.2142, "step": 1760 }, { "epoch": 0.6387377584330794, "grad_norm": 0.1870613396167755, "learning_rate": 5.8259995900218465e-05, "loss": 0.1619, "step": 1761 }, { "epoch": 0.6391004715270221, "grad_norm": 0.18008996546268463, "learning_rate": 5.815609759097837e-05, "loss": 0.1594, "step": 1762 }, { "epoch": 0.6394631846209649, "grad_norm": 0.1749439388513565, "learning_rate": 5.8052254010366105e-05, "loss": 0.1543, "step": 1763 }, { "epoch": 0.6398258977149075, "grad_norm": 0.17792417109012604, "learning_rate": 5.7948465294201194e-05, "loss": 0.1679, "step": 1764 }, { "epoch": 0.6401886108088501, "grad_norm": 0.18781551718711853, "learning_rate": 5.7844731578231334e-05, "loss": 0.1634, "step": 1765 }, { "epoch": 0.6405513239027929, "grad_norm": 0.17064349353313446, "learning_rate": 5.7741052998132285e-05, "loss": 0.1547, "step": 1766 }, { "epoch": 0.6409140369967355, "grad_norm": 0.15985310077667236, "learning_rate": 5.7637429689507713e-05, "loss": 0.1446, "step": 1767 }, { "epoch": 0.6412767500906783, "grad_norm": 0.18584533035755157, "learning_rate": 5.7533861787888995e-05, "loss": 0.1692, "step": 1768 }, { "epoch": 0.641639463184621, "grad_norm": 0.18340182304382324, "learning_rate": 5.7430349428734995e-05, "loss": 0.1698, "step": 1769 }, { "epoch": 0.6420021762785637, "grad_norm": 0.15710604190826416, "learning_rate": 5.732689274743204e-05, "loss": 0.1465, "step": 1770 }, { "epoch": 0.6423648893725064, "grad_norm": 0.17073456943035126, "learning_rate": 5.7223491879293526e-05, "loss": 0.1531, "step": 1771 }, { "epoch": 0.642727602466449, "grad_norm": 0.17552490532398224, "learning_rate": 5.712014695955991e-05, "loss": 0.1519, "step": 1772 }, { "epoch": 0.6430903155603918, "grad_norm": 0.20075669884681702, "learning_rate": 5.7016858123398434e-05, "loss": 0.167, "step": 1773 }, { "epoch": 0.6434530286543344, "grad_norm": 0.20733250677585602, "learning_rate": 5.691362550590297e-05, "loss": 0.1745, "step": 1774 }, { "epoch": 0.6438157417482772, "grad_norm": 0.16159029304981232, "learning_rate": 5.681044924209398e-05, "loss": 0.15, "step": 1775 }, { "epoch": 0.6441784548422198, "grad_norm": 0.184630885720253, "learning_rate": 5.670732946691808e-05, "loss": 0.1756, "step": 1776 }, { "epoch": 0.6445411679361625, "grad_norm": 0.16852855682373047, "learning_rate": 5.6604266315248034e-05, "loss": 0.1642, "step": 1777 }, { "epoch": 0.6449038810301052, "grad_norm": 0.16728003323078156, "learning_rate": 5.6501259921882655e-05, "loss": 0.1612, "step": 1778 }, { "epoch": 0.6452665941240479, "grad_norm": 0.17908404767513275, "learning_rate": 5.6398310421546376e-05, "loss": 0.1759, "step": 1779 }, { "epoch": 0.6456293072179906, "grad_norm": 0.16568151116371155, "learning_rate": 5.6295417948889306e-05, "loss": 0.1514, "step": 1780 }, { "epoch": 0.6459920203119333, "grad_norm": 0.2028510570526123, "learning_rate": 5.619258263848692e-05, "loss": 0.1626, "step": 1781 }, { "epoch": 0.6463547334058759, "grad_norm": 0.19075465202331543, "learning_rate": 5.608980462483991e-05, "loss": 0.1809, "step": 1782 }, { "epoch": 0.6467174464998187, "grad_norm": 0.18601737916469574, "learning_rate": 5.598708404237416e-05, "loss": 0.1606, "step": 1783 }, { "epoch": 0.6470801595937613, "grad_norm": 0.18421201407909393, "learning_rate": 5.588442102544029e-05, "loss": 0.1527, "step": 1784 }, { "epoch": 0.6474428726877041, "grad_norm": 0.20656828582286835, "learning_rate": 5.578181570831369e-05, "loss": 0.1726, "step": 1785 }, { "epoch": 0.6478055857816467, "grad_norm": 0.1901615560054779, "learning_rate": 5.567926822519427e-05, "loss": 0.1865, "step": 1786 }, { "epoch": 0.6481682988755894, "grad_norm": 0.17387042939662933, "learning_rate": 5.55767787102063e-05, "loss": 0.1643, "step": 1787 }, { "epoch": 0.6485310119695321, "grad_norm": 0.16012033820152283, "learning_rate": 5.547434729739822e-05, "loss": 0.162, "step": 1788 }, { "epoch": 0.6488937250634748, "grad_norm": 0.17737270891666412, "learning_rate": 5.537197412074257e-05, "loss": 0.1563, "step": 1789 }, { "epoch": 0.6492564381574175, "grad_norm": 0.17308826744556427, "learning_rate": 5.526965931413557e-05, "loss": 0.1596, "step": 1790 }, { "epoch": 0.6496191512513602, "grad_norm": 0.20024463534355164, "learning_rate": 5.516740301139721e-05, "loss": 0.1763, "step": 1791 }, { "epoch": 0.6499818643453029, "grad_norm": 0.17333653569221497, "learning_rate": 5.506520534627091e-05, "loss": 0.1666, "step": 1792 }, { "epoch": 0.6503445774392456, "grad_norm": 0.17827224731445312, "learning_rate": 5.496306645242339e-05, "loss": 0.1718, "step": 1793 }, { "epoch": 0.6507072905331882, "grad_norm": 0.19950279593467712, "learning_rate": 5.4860986463444506e-05, "loss": 0.2117, "step": 1794 }, { "epoch": 0.651070003627131, "grad_norm": 0.17631955444812775, "learning_rate": 5.475896551284716e-05, "loss": 0.1784, "step": 1795 }, { "epoch": 0.6514327167210736, "grad_norm": 0.18082845211029053, "learning_rate": 5.4657003734066925e-05, "loss": 0.2068, "step": 1796 }, { "epoch": 0.6517954298150164, "grad_norm": 0.17366324365139008, "learning_rate": 5.455510126046199e-05, "loss": 0.1443, "step": 1797 }, { "epoch": 0.652158142908959, "grad_norm": 0.17154483497142792, "learning_rate": 5.445325822531304e-05, "loss": 0.17, "step": 1798 }, { "epoch": 0.6525208560029017, "grad_norm": 0.18583987653255463, "learning_rate": 5.435147476182298e-05, "loss": 0.1609, "step": 1799 }, { "epoch": 0.6528835690968444, "grad_norm": 0.16991505026817322, "learning_rate": 5.424975100311676e-05, "loss": 0.1537, "step": 1800 }, { "epoch": 0.6532462821907871, "grad_norm": 0.1840389221906662, "learning_rate": 5.414808708224135e-05, "loss": 0.1628, "step": 1801 }, { "epoch": 0.6536089952847298, "grad_norm": 0.197292760014534, "learning_rate": 5.404648313216538e-05, "loss": 0.1722, "step": 1802 }, { "epoch": 0.6539717083786725, "grad_norm": 0.1785934418439865, "learning_rate": 5.394493928577903e-05, "loss": 0.1629, "step": 1803 }, { "epoch": 0.6543344214726151, "grad_norm": 0.17052417993545532, "learning_rate": 5.384345567589391e-05, "loss": 0.1639, "step": 1804 }, { "epoch": 0.6546971345665579, "grad_norm": 0.1716339886188507, "learning_rate": 5.374203243524283e-05, "loss": 0.1628, "step": 1805 }, { "epoch": 0.6550598476605005, "grad_norm": 0.16768915951251984, "learning_rate": 5.364066969647963e-05, "loss": 0.1426, "step": 1806 }, { "epoch": 0.6554225607544433, "grad_norm": 0.1639591008424759, "learning_rate": 5.353936759217899e-05, "loss": 0.1604, "step": 1807 }, { "epoch": 0.6557852738483859, "grad_norm": 0.1945423036813736, "learning_rate": 5.343812625483642e-05, "loss": 0.1562, "step": 1808 }, { "epoch": 0.6561479869423286, "grad_norm": 0.1996852457523346, "learning_rate": 5.333694581686779e-05, "loss": 0.1712, "step": 1809 }, { "epoch": 0.6565107000362713, "grad_norm": 0.18032366037368774, "learning_rate": 5.32358264106094e-05, "loss": 0.196, "step": 1810 }, { "epoch": 0.656873413130214, "grad_norm": 0.16884812712669373, "learning_rate": 5.313476816831768e-05, "loss": 0.1558, "step": 1811 }, { "epoch": 0.6572361262241567, "grad_norm": 0.1865408569574356, "learning_rate": 5.303377122216915e-05, "loss": 0.184, "step": 1812 }, { "epoch": 0.6575988393180994, "grad_norm": 0.18371020257472992, "learning_rate": 5.293283570426007e-05, "loss": 0.1672, "step": 1813 }, { "epoch": 0.6579615524120421, "grad_norm": 0.1799343377351761, "learning_rate": 5.283196174660633e-05, "loss": 0.1544, "step": 1814 }, { "epoch": 0.6583242655059848, "grad_norm": 0.17262513935565948, "learning_rate": 5.273114948114346e-05, "loss": 0.1582, "step": 1815 }, { "epoch": 0.6586869785999274, "grad_norm": 0.19773328304290771, "learning_rate": 5.263039903972618e-05, "loss": 0.1649, "step": 1816 }, { "epoch": 0.6590496916938702, "grad_norm": 0.18928907811641693, "learning_rate": 5.252971055412832e-05, "loss": 0.1853, "step": 1817 }, { "epoch": 0.6594124047878128, "grad_norm": 0.17779038846492767, "learning_rate": 5.242908415604277e-05, "loss": 0.1643, "step": 1818 }, { "epoch": 0.6597751178817556, "grad_norm": 0.2303963601589203, "learning_rate": 5.2328519977081105e-05, "loss": 0.1926, "step": 1819 }, { "epoch": 0.6601378309756982, "grad_norm": 0.16455812752246857, "learning_rate": 5.222801814877369e-05, "loss": 0.1582, "step": 1820 }, { "epoch": 0.6605005440696409, "grad_norm": 0.16079877316951752, "learning_rate": 5.21275788025692e-05, "loss": 0.149, "step": 1821 }, { "epoch": 0.6608632571635836, "grad_norm": 0.1705598533153534, "learning_rate": 5.20272020698346e-05, "loss": 0.1624, "step": 1822 }, { "epoch": 0.6612259702575263, "grad_norm": 0.16610048711299896, "learning_rate": 5.192688808185502e-05, "loss": 0.1527, "step": 1823 }, { "epoch": 0.661588683351469, "grad_norm": 0.19774171710014343, "learning_rate": 5.1826636969833475e-05, "loss": 0.1631, "step": 1824 }, { "epoch": 0.6619513964454117, "grad_norm": 0.17446525394916534, "learning_rate": 5.172644886489073e-05, "loss": 0.1621, "step": 1825 }, { "epoch": 0.6623141095393543, "grad_norm": 0.20300233364105225, "learning_rate": 5.162632389806523e-05, "loss": 0.1907, "step": 1826 }, { "epoch": 0.6626768226332971, "grad_norm": 0.204659104347229, "learning_rate": 5.152626220031278e-05, "loss": 0.1596, "step": 1827 }, { "epoch": 0.6630395357272397, "grad_norm": 0.1757912039756775, "learning_rate": 5.1426263902506414e-05, "loss": 0.1535, "step": 1828 }, { "epoch": 0.6634022488211825, "grad_norm": 0.19932380318641663, "learning_rate": 5.132632913543627e-05, "loss": 0.1705, "step": 1829 }, { "epoch": 0.6637649619151251, "grad_norm": 0.18215243518352509, "learning_rate": 5.1226458029809387e-05, "loss": 0.1636, "step": 1830 }, { "epoch": 0.6641276750090678, "grad_norm": 0.1725538820028305, "learning_rate": 5.112665071624951e-05, "loss": 0.1397, "step": 1831 }, { "epoch": 0.6644903881030105, "grad_norm": 0.18406741321086884, "learning_rate": 5.1026907325297044e-05, "loss": 0.1639, "step": 1832 }, { "epoch": 0.6648531011969532, "grad_norm": 0.17330917716026306, "learning_rate": 5.092722798740871e-05, "loss": 0.1588, "step": 1833 }, { "epoch": 0.6652158142908959, "grad_norm": 0.16775713860988617, "learning_rate": 5.082761283295745e-05, "loss": 0.1407, "step": 1834 }, { "epoch": 0.6655785273848386, "grad_norm": 0.17397847771644592, "learning_rate": 5.072806199223228e-05, "loss": 0.1767, "step": 1835 }, { "epoch": 0.6659412404787813, "grad_norm": 0.17217876017093658, "learning_rate": 5.062857559543809e-05, "loss": 0.1644, "step": 1836 }, { "epoch": 0.666303953572724, "grad_norm": 0.1916993409395218, "learning_rate": 5.0529153772695495e-05, "loss": 0.1631, "step": 1837 }, { "epoch": 0.6666666666666666, "grad_norm": 0.19008702039718628, "learning_rate": 5.0429796654040595e-05, "loss": 0.1677, "step": 1838 }, { "epoch": 0.6670293797606094, "grad_norm": 0.18073846399784088, "learning_rate": 5.033050436942501e-05, "loss": 0.1644, "step": 1839 }, { "epoch": 0.667392092854552, "grad_norm": 0.1769622266292572, "learning_rate": 5.023127704871541e-05, "loss": 0.1764, "step": 1840 }, { "epoch": 0.6677548059484948, "grad_norm": 0.17394478619098663, "learning_rate": 5.013211482169354e-05, "loss": 0.1652, "step": 1841 }, { "epoch": 0.6681175190424374, "grad_norm": 0.18357783555984497, "learning_rate": 5.003301781805604e-05, "loss": 0.1799, "step": 1842 }, { "epoch": 0.6684802321363801, "grad_norm": 0.18445712327957153, "learning_rate": 4.993398616741421e-05, "loss": 0.1731, "step": 1843 }, { "epoch": 0.6688429452303228, "grad_norm": 0.17181545495986938, "learning_rate": 4.983501999929384e-05, "loss": 0.1647, "step": 1844 }, { "epoch": 0.6692056583242655, "grad_norm": 0.1643923968076706, "learning_rate": 4.97361194431352e-05, "loss": 0.1522, "step": 1845 }, { "epoch": 0.6695683714182082, "grad_norm": 0.178927481174469, "learning_rate": 4.963728462829262e-05, "loss": 0.1693, "step": 1846 }, { "epoch": 0.6699310845121509, "grad_norm": 0.16977953910827637, "learning_rate": 4.95385156840345e-05, "loss": 0.1634, "step": 1847 }, { "epoch": 0.6702937976060935, "grad_norm": 0.19453585147857666, "learning_rate": 4.943981273954302e-05, "loss": 0.161, "step": 1848 }, { "epoch": 0.6706565107000363, "grad_norm": 0.15591104328632355, "learning_rate": 4.9341175923914184e-05, "loss": 0.1336, "step": 1849 }, { "epoch": 0.6710192237939789, "grad_norm": 0.19056206941604614, "learning_rate": 4.9242605366157356e-05, "loss": 0.1647, "step": 1850 }, { "epoch": 0.6713819368879217, "grad_norm": 0.18081988394260406, "learning_rate": 4.914410119519528e-05, "loss": 0.1679, "step": 1851 }, { "epoch": 0.6717446499818643, "grad_norm": 0.1665160208940506, "learning_rate": 4.904566353986394e-05, "loss": 0.1585, "step": 1852 }, { "epoch": 0.672107363075807, "grad_norm": 0.18015241622924805, "learning_rate": 4.894729252891224e-05, "loss": 0.1687, "step": 1853 }, { "epoch": 0.6724700761697497, "grad_norm": 0.16529425978660583, "learning_rate": 4.884898829100194e-05, "loss": 0.1569, "step": 1854 }, { "epoch": 0.6728327892636924, "grad_norm": 0.17505323886871338, "learning_rate": 4.87507509547075e-05, "loss": 0.1651, "step": 1855 }, { "epoch": 0.6731955023576351, "grad_norm": 0.18190018832683563, "learning_rate": 4.865258064851579e-05, "loss": 0.1706, "step": 1856 }, { "epoch": 0.6735582154515778, "grad_norm": 0.1668224334716797, "learning_rate": 4.855447750082615e-05, "loss": 0.1639, "step": 1857 }, { "epoch": 0.6739209285455205, "grad_norm": 0.18514534831047058, "learning_rate": 4.845644163994996e-05, "loss": 0.1912, "step": 1858 }, { "epoch": 0.6742836416394632, "grad_norm": 0.19206570088863373, "learning_rate": 4.835847319411065e-05, "loss": 0.1595, "step": 1859 }, { "epoch": 0.6746463547334058, "grad_norm": 0.19193512201309204, "learning_rate": 4.8260572291443465e-05, "loss": 0.1586, "step": 1860 }, { "epoch": 0.6750090678273486, "grad_norm": 0.1866559088230133, "learning_rate": 4.816273905999529e-05, "loss": 0.1841, "step": 1861 }, { "epoch": 0.6753717809212912, "grad_norm": 0.1677185595035553, "learning_rate": 4.80649736277245e-05, "loss": 0.1672, "step": 1862 }, { "epoch": 0.675734494015234, "grad_norm": 0.16328024864196777, "learning_rate": 4.796727612250087e-05, "loss": 0.1556, "step": 1863 }, { "epoch": 0.6760972071091766, "grad_norm": 0.1733468621969223, "learning_rate": 4.7869646672105254e-05, "loss": 0.1572, "step": 1864 }, { "epoch": 0.6764599202031193, "grad_norm": 0.19276085495948792, "learning_rate": 4.7772085404229495e-05, "loss": 0.1681, "step": 1865 }, { "epoch": 0.676822633297062, "grad_norm": 0.2415236532688141, "learning_rate": 4.767459244647629e-05, "loss": 0.2347, "step": 1866 }, { "epoch": 0.6771853463910047, "grad_norm": 0.21599356830120087, "learning_rate": 4.757716792635898e-05, "loss": 0.1985, "step": 1867 }, { "epoch": 0.6775480594849475, "grad_norm": 0.17399145662784576, "learning_rate": 4.747981197130139e-05, "loss": 0.1662, "step": 1868 }, { "epoch": 0.6779107725788901, "grad_norm": 0.1672641634941101, "learning_rate": 4.738252470863763e-05, "loss": 0.1482, "step": 1869 }, { "epoch": 0.6782734856728327, "grad_norm": 0.17770545184612274, "learning_rate": 4.7285306265612106e-05, "loss": 0.157, "step": 1870 }, { "epoch": 0.6786361987667755, "grad_norm": 0.18158309161663055, "learning_rate": 4.7188156769379063e-05, "loss": 0.1569, "step": 1871 }, { "epoch": 0.6789989118607181, "grad_norm": 0.2067386507987976, "learning_rate": 4.7091076347002613e-05, "loss": 0.1686, "step": 1872 }, { "epoch": 0.6793616249546609, "grad_norm": 0.16841058433055878, "learning_rate": 4.6994065125456546e-05, "loss": 0.1564, "step": 1873 }, { "epoch": 0.6797243380486035, "grad_norm": 0.19121627509593964, "learning_rate": 4.6897123231624105e-05, "loss": 0.1794, "step": 1874 }, { "epoch": 0.6800870511425462, "grad_norm": 0.16333697736263275, "learning_rate": 4.6800250792297885e-05, "loss": 0.1497, "step": 1875 }, { "epoch": 0.680449764236489, "grad_norm": 0.16970248520374298, "learning_rate": 4.670344793417967e-05, "loss": 0.1672, "step": 1876 }, { "epoch": 0.6808124773304316, "grad_norm": 0.1738625019788742, "learning_rate": 4.660671478388019e-05, "loss": 0.1689, "step": 1877 }, { "epoch": 0.6811751904243744, "grad_norm": 0.167289599776268, "learning_rate": 4.651005146791901e-05, "loss": 0.1446, "step": 1878 }, { "epoch": 0.681537903518317, "grad_norm": 0.1755392998456955, "learning_rate": 4.641345811272436e-05, "loss": 0.1509, "step": 1879 }, { "epoch": 0.6819006166122598, "grad_norm": 0.1769733875989914, "learning_rate": 4.631693484463299e-05, "loss": 0.1688, "step": 1880 }, { "epoch": 0.6822633297062024, "grad_norm": 0.17857052385807037, "learning_rate": 4.622048178988989e-05, "loss": 0.1799, "step": 1881 }, { "epoch": 0.682626042800145, "grad_norm": 0.17262940108776093, "learning_rate": 4.6124099074648375e-05, "loss": 0.1613, "step": 1882 }, { "epoch": 0.6829887558940878, "grad_norm": 0.17255136370658875, "learning_rate": 4.602778682496965e-05, "loss": 0.1647, "step": 1883 }, { "epoch": 0.6833514689880305, "grad_norm": 0.19975058734416962, "learning_rate": 4.593154516682276e-05, "loss": 0.1705, "step": 1884 }, { "epoch": 0.6837141820819732, "grad_norm": 0.19348910450935364, "learning_rate": 4.5835374226084424e-05, "loss": 0.1635, "step": 1885 }, { "epoch": 0.6840768951759159, "grad_norm": 0.17311705648899078, "learning_rate": 4.573927412853896e-05, "loss": 0.1764, "step": 1886 }, { "epoch": 0.6844396082698585, "grad_norm": 0.17351648211479187, "learning_rate": 4.56432449998779e-05, "loss": 0.1466, "step": 1887 }, { "epoch": 0.6848023213638013, "grad_norm": 0.16917894780635834, "learning_rate": 4.554728696570001e-05, "loss": 0.1565, "step": 1888 }, { "epoch": 0.6851650344577439, "grad_norm": 0.17121654748916626, "learning_rate": 4.545140015151105e-05, "loss": 0.1638, "step": 1889 }, { "epoch": 0.6855277475516867, "grad_norm": 0.18969422578811646, "learning_rate": 4.535558468272371e-05, "loss": 0.1533, "step": 1890 }, { "epoch": 0.6858904606456293, "grad_norm": 0.17447051405906677, "learning_rate": 4.525984068465725e-05, "loss": 0.1624, "step": 1891 }, { "epoch": 0.686253173739572, "grad_norm": 0.1624990999698639, "learning_rate": 4.5164168282537546e-05, "loss": 0.1482, "step": 1892 }, { "epoch": 0.6866158868335147, "grad_norm": 0.17492160201072693, "learning_rate": 4.506856760149671e-05, "loss": 0.1733, "step": 1893 }, { "epoch": 0.6869785999274574, "grad_norm": 0.16198083758354187, "learning_rate": 4.497303876657324e-05, "loss": 0.1433, "step": 1894 }, { "epoch": 0.6873413130214001, "grad_norm": 0.1759859323501587, "learning_rate": 4.48775819027115e-05, "loss": 0.146, "step": 1895 }, { "epoch": 0.6877040261153428, "grad_norm": 0.16649121046066284, "learning_rate": 4.478219713476178e-05, "loss": 0.1652, "step": 1896 }, { "epoch": 0.6880667392092854, "grad_norm": 0.1907196342945099, "learning_rate": 4.468688458748006e-05, "loss": 0.1731, "step": 1897 }, { "epoch": 0.6884294523032282, "grad_norm": 0.1932022124528885, "learning_rate": 4.459164438552789e-05, "loss": 0.1693, "step": 1898 }, { "epoch": 0.6887921653971708, "grad_norm": 0.1829594522714615, "learning_rate": 4.449647665347216e-05, "loss": 0.1957, "step": 1899 }, { "epoch": 0.6891548784911136, "grad_norm": 0.17210708558559418, "learning_rate": 4.4401381515784965e-05, "loss": 0.1596, "step": 1900 }, { "epoch": 0.6895175915850562, "grad_norm": 0.16382241249084473, "learning_rate": 4.430635909684356e-05, "loss": 0.1417, "step": 1901 }, { "epoch": 0.6898803046789989, "grad_norm": 0.16617849469184875, "learning_rate": 4.421140952092997e-05, "loss": 0.1459, "step": 1902 }, { "epoch": 0.6902430177729416, "grad_norm": 0.16519035398960114, "learning_rate": 4.411653291223097e-05, "loss": 0.1616, "step": 1903 }, { "epoch": 0.6906057308668843, "grad_norm": 0.17537926137447357, "learning_rate": 4.402172939483794e-05, "loss": 0.1637, "step": 1904 }, { "epoch": 0.690968443960827, "grad_norm": 0.18427397310733795, "learning_rate": 4.392699909274664e-05, "loss": 0.1876, "step": 1905 }, { "epoch": 0.6913311570547697, "grad_norm": 0.1629849672317505, "learning_rate": 4.383234212985701e-05, "loss": 0.1436, "step": 1906 }, { "epoch": 0.6916938701487124, "grad_norm": 0.1907191276550293, "learning_rate": 4.3737758629973204e-05, "loss": 0.1723, "step": 1907 }, { "epoch": 0.6920565832426551, "grad_norm": 0.18214593827724457, "learning_rate": 4.3643248716803184e-05, "loss": 0.1683, "step": 1908 }, { "epoch": 0.6924192963365977, "grad_norm": 0.18101546168327332, "learning_rate": 4.354881251395871e-05, "loss": 0.1554, "step": 1909 }, { "epoch": 0.6927820094305405, "grad_norm": 0.18527980148792267, "learning_rate": 4.3454450144955105e-05, "loss": 0.1832, "step": 1910 }, { "epoch": 0.6931447225244831, "grad_norm": 0.16371949017047882, "learning_rate": 4.3360161733211145e-05, "loss": 0.1528, "step": 1911 }, { "epoch": 0.6935074356184259, "grad_norm": 0.172775536775589, "learning_rate": 4.3265947402048834e-05, "loss": 0.1564, "step": 1912 }, { "epoch": 0.6938701487123685, "grad_norm": 0.17069590091705322, "learning_rate": 4.3171807274693386e-05, "loss": 0.1555, "step": 1913 }, { "epoch": 0.6942328618063112, "grad_norm": 0.1884002387523651, "learning_rate": 4.307774147427287e-05, "loss": 0.1611, "step": 1914 }, { "epoch": 0.6945955749002539, "grad_norm": 0.17518699169158936, "learning_rate": 4.2983750123818155e-05, "loss": 0.1651, "step": 1915 }, { "epoch": 0.6949582879941966, "grad_norm": 0.17112936079502106, "learning_rate": 4.288983334626275e-05, "loss": 0.1472, "step": 1916 }, { "epoch": 0.6953210010881393, "grad_norm": 0.1765616238117218, "learning_rate": 4.279599126444264e-05, "loss": 0.1552, "step": 1917 }, { "epoch": 0.695683714182082, "grad_norm": 0.18281279504299164, "learning_rate": 4.2702224001096045e-05, "loss": 0.1758, "step": 1918 }, { "epoch": 0.6960464272760246, "grad_norm": 0.1792001724243164, "learning_rate": 4.2608531678863475e-05, "loss": 0.1643, "step": 1919 }, { "epoch": 0.6964091403699674, "grad_norm": 0.1666647344827652, "learning_rate": 4.2514914420287266e-05, "loss": 0.146, "step": 1920 }, { "epoch": 0.69677185346391, "grad_norm": 0.2033475637435913, "learning_rate": 4.242137234781166e-05, "loss": 0.1841, "step": 1921 }, { "epoch": 0.6971345665578528, "grad_norm": 0.17656663060188293, "learning_rate": 4.23279055837825e-05, "loss": 0.1614, "step": 1922 }, { "epoch": 0.6974972796517954, "grad_norm": 0.1725003868341446, "learning_rate": 4.2234514250447255e-05, "loss": 0.155, "step": 1923 }, { "epoch": 0.6978599927457381, "grad_norm": 0.17976543307304382, "learning_rate": 4.214119846995461e-05, "loss": 0.1646, "step": 1924 }, { "epoch": 0.6982227058396808, "grad_norm": 0.16774506866931915, "learning_rate": 4.204795836435448e-05, "loss": 0.1672, "step": 1925 }, { "epoch": 0.6985854189336235, "grad_norm": 0.18107999861240387, "learning_rate": 4.1954794055597756e-05, "loss": 0.1769, "step": 1926 }, { "epoch": 0.6989481320275662, "grad_norm": 0.19499120116233826, "learning_rate": 4.1861705665536324e-05, "loss": 0.1737, "step": 1927 }, { "epoch": 0.6993108451215089, "grad_norm": 0.18403582274913788, "learning_rate": 4.1768693315922635e-05, "loss": 0.1671, "step": 1928 }, { "epoch": 0.6996735582154516, "grad_norm": 0.18355792760849, "learning_rate": 4.167575712840974e-05, "loss": 0.1587, "step": 1929 }, { "epoch": 0.7000362713093943, "grad_norm": 0.20113395154476166, "learning_rate": 4.15828972245511e-05, "loss": 0.1667, "step": 1930 }, { "epoch": 0.7003989844033369, "grad_norm": 0.1907624453306198, "learning_rate": 4.149011372580029e-05, "loss": 0.1708, "step": 1931 }, { "epoch": 0.7007616974972797, "grad_norm": 0.16733594238758087, "learning_rate": 4.139740675351116e-05, "loss": 0.1629, "step": 1932 }, { "epoch": 0.7011244105912223, "grad_norm": 0.15931111574172974, "learning_rate": 4.130477642893729e-05, "loss": 0.1534, "step": 1933 }, { "epoch": 0.7014871236851651, "grad_norm": 0.19512903690338135, "learning_rate": 4.1212222873232054e-05, "loss": 0.1814, "step": 1934 }, { "epoch": 0.7018498367791077, "grad_norm": 0.18595078587532043, "learning_rate": 4.111974620744845e-05, "loss": 0.1632, "step": 1935 }, { "epoch": 0.7022125498730504, "grad_norm": 0.17419064044952393, "learning_rate": 4.10273465525389e-05, "loss": 0.1448, "step": 1936 }, { "epoch": 0.7025752629669931, "grad_norm": 0.178279310464859, "learning_rate": 4.093502402935504e-05, "loss": 0.1578, "step": 1937 }, { "epoch": 0.7029379760609358, "grad_norm": 0.18063177168369293, "learning_rate": 4.084277875864776e-05, "loss": 0.1502, "step": 1938 }, { "epoch": 0.7033006891548785, "grad_norm": 0.20529168844223022, "learning_rate": 4.075061086106678e-05, "loss": 0.1748, "step": 1939 }, { "epoch": 0.7036634022488212, "grad_norm": 0.1844182014465332, "learning_rate": 4.065852045716069e-05, "loss": 0.1543, "step": 1940 }, { "epoch": 0.7040261153427638, "grad_norm": 0.1840999871492386, "learning_rate": 4.056650766737669e-05, "loss": 0.189, "step": 1941 }, { "epoch": 0.7043888284367066, "grad_norm": 0.1571437418460846, "learning_rate": 4.047457261206047e-05, "loss": 0.1546, "step": 1942 }, { "epoch": 0.7047515415306492, "grad_norm": 0.17258736491203308, "learning_rate": 4.038271541145604e-05, "loss": 0.1531, "step": 1943 }, { "epoch": 0.705114254624592, "grad_norm": 0.16301092505455017, "learning_rate": 4.0290936185705674e-05, "loss": 0.1554, "step": 1944 }, { "epoch": 0.7054769677185346, "grad_norm": 0.1766006052494049, "learning_rate": 4.0199235054849546e-05, "loss": 0.1484, "step": 1945 }, { "epoch": 0.7058396808124773, "grad_norm": 0.18022476136684418, "learning_rate": 4.010761213882572e-05, "loss": 0.1519, "step": 1946 }, { "epoch": 0.70620239390642, "grad_norm": 0.16101764142513275, "learning_rate": 4.001606755746999e-05, "loss": 0.1564, "step": 1947 }, { "epoch": 0.7065651070003627, "grad_norm": 0.18494002521038055, "learning_rate": 3.992460143051566e-05, "loss": 0.1549, "step": 1948 }, { "epoch": 0.7069278200943054, "grad_norm": 0.18700887262821198, "learning_rate": 3.983321387759342e-05, "loss": 0.1656, "step": 1949 }, { "epoch": 0.7072905331882481, "grad_norm": 0.18422120809555054, "learning_rate": 3.974190501823126e-05, "loss": 0.1646, "step": 1950 }, { "epoch": 0.7076532462821908, "grad_norm": 0.17262974381446838, "learning_rate": 3.965067497185416e-05, "loss": 0.1553, "step": 1951 }, { "epoch": 0.7080159593761335, "grad_norm": 0.16152386367321014, "learning_rate": 3.955952385778406e-05, "loss": 0.1341, "step": 1952 }, { "epoch": 0.7083786724700761, "grad_norm": 0.16990354657173157, "learning_rate": 3.946845179523965e-05, "loss": 0.1727, "step": 1953 }, { "epoch": 0.7087413855640189, "grad_norm": 0.1854991912841797, "learning_rate": 3.937745890333623e-05, "loss": 0.1548, "step": 1954 }, { "epoch": 0.7091040986579615, "grad_norm": 0.1773202270269394, "learning_rate": 3.928654530108552e-05, "loss": 0.1723, "step": 1955 }, { "epoch": 0.7094668117519043, "grad_norm": 0.18670934438705444, "learning_rate": 3.9195711107395624e-05, "loss": 0.1688, "step": 1956 }, { "epoch": 0.7098295248458469, "grad_norm": 0.17176151275634766, "learning_rate": 3.9104956441070715e-05, "loss": 0.1524, "step": 1957 }, { "epoch": 0.7101922379397896, "grad_norm": 0.17264217138290405, "learning_rate": 3.901428142081095e-05, "loss": 0.1568, "step": 1958 }, { "epoch": 0.7105549510337323, "grad_norm": 0.16863767802715302, "learning_rate": 3.892368616521229e-05, "loss": 0.1514, "step": 1959 }, { "epoch": 0.710917664127675, "grad_norm": 0.1810598075389862, "learning_rate": 3.883317079276649e-05, "loss": 0.1494, "step": 1960 }, { "epoch": 0.7112803772216177, "grad_norm": 0.18499146401882172, "learning_rate": 3.87427354218607e-05, "loss": 0.155, "step": 1961 }, { "epoch": 0.7116430903155604, "grad_norm": 0.16301509737968445, "learning_rate": 3.865238017077748e-05, "loss": 0.1505, "step": 1962 }, { "epoch": 0.712005803409503, "grad_norm": 0.18313588201999664, "learning_rate": 3.856210515769456e-05, "loss": 0.1696, "step": 1963 }, { "epoch": 0.7123685165034458, "grad_norm": 0.18576788902282715, "learning_rate": 3.847191050068483e-05, "loss": 0.1584, "step": 1964 }, { "epoch": 0.7127312295973884, "grad_norm": 0.16800563037395477, "learning_rate": 3.838179631771598e-05, "loss": 0.1409, "step": 1965 }, { "epoch": 0.7130939426913312, "grad_norm": 0.15716706216335297, "learning_rate": 3.829176272665047e-05, "loss": 0.1647, "step": 1966 }, { "epoch": 0.7134566557852738, "grad_norm": 0.19974446296691895, "learning_rate": 3.8201809845245364e-05, "loss": 0.2084, "step": 1967 }, { "epoch": 0.7138193688792165, "grad_norm": 0.18544046580791473, "learning_rate": 3.811193779115213e-05, "loss": 0.1579, "step": 1968 }, { "epoch": 0.7141820819731592, "grad_norm": 0.17015773057937622, "learning_rate": 3.80221466819166e-05, "loss": 0.1663, "step": 1969 }, { "epoch": 0.7145447950671019, "grad_norm": 0.1646818220615387, "learning_rate": 3.7932436634978684e-05, "loss": 0.1582, "step": 1970 }, { "epoch": 0.7149075081610446, "grad_norm": 0.16714130342006683, "learning_rate": 3.784280776767224e-05, "loss": 0.137, "step": 1971 }, { "epoch": 0.7152702212549873, "grad_norm": 0.17864611744880676, "learning_rate": 3.7753260197224995e-05, "loss": 0.1496, "step": 1972 }, { "epoch": 0.71563293434893, "grad_norm": 0.18264222145080566, "learning_rate": 3.766379404075832e-05, "loss": 0.1583, "step": 1973 }, { "epoch": 0.7159956474428727, "grad_norm": 0.1730545610189438, "learning_rate": 3.757440941528708e-05, "loss": 0.1616, "step": 1974 }, { "epoch": 0.7163583605368153, "grad_norm": 0.1775929182767868, "learning_rate": 3.748510643771962e-05, "loss": 0.1514, "step": 1975 }, { "epoch": 0.7167210736307581, "grad_norm": 0.1856832504272461, "learning_rate": 3.739588522485736e-05, "loss": 0.1558, "step": 1976 }, { "epoch": 0.7170837867247007, "grad_norm": 0.19256243109703064, "learning_rate": 3.7306745893394845e-05, "loss": 0.1966, "step": 1977 }, { "epoch": 0.7174464998186435, "grad_norm": 0.15902438759803772, "learning_rate": 3.72176885599195e-05, "loss": 0.1493, "step": 1978 }, { "epoch": 0.7178092129125861, "grad_norm": 0.16954579949378967, "learning_rate": 3.7128713340911535e-05, "loss": 0.1692, "step": 1979 }, { "epoch": 0.7181719260065288, "grad_norm": 0.17363213002681732, "learning_rate": 3.7039820352743685e-05, "loss": 0.1491, "step": 1980 }, { "epoch": 0.7185346391004716, "grad_norm": 0.18617630004882812, "learning_rate": 3.6951009711681253e-05, "loss": 0.1762, "step": 1981 }, { "epoch": 0.7188973521944142, "grad_norm": 0.15999780595302582, "learning_rate": 3.6862281533881745e-05, "loss": 0.1488, "step": 1982 }, { "epoch": 0.719260065288357, "grad_norm": 0.16866905987262726, "learning_rate": 3.677363593539485e-05, "loss": 0.1467, "step": 1983 }, { "epoch": 0.7196227783822996, "grad_norm": 0.1777690201997757, "learning_rate": 3.668507303216223e-05, "loss": 0.1525, "step": 1984 }, { "epoch": 0.7199854914762422, "grad_norm": 0.19426722824573517, "learning_rate": 3.659659294001739e-05, "loss": 0.2006, "step": 1985 }, { "epoch": 0.720348204570185, "grad_norm": 0.17638282477855682, "learning_rate": 3.6508195774685515e-05, "loss": 0.1548, "step": 1986 }, { "epoch": 0.7207109176641276, "grad_norm": 0.16942881047725677, "learning_rate": 3.641988165178339e-05, "loss": 0.1646, "step": 1987 }, { "epoch": 0.7210736307580704, "grad_norm": 0.17678217589855194, "learning_rate": 3.633165068681914e-05, "loss": 0.1342, "step": 1988 }, { "epoch": 0.721436343852013, "grad_norm": 0.15457268059253693, "learning_rate": 3.624350299519209e-05, "loss": 0.1489, "step": 1989 }, { "epoch": 0.7217990569459557, "grad_norm": 0.17524264752864838, "learning_rate": 3.615543869219271e-05, "loss": 0.1565, "step": 1990 }, { "epoch": 0.7221617700398985, "grad_norm": 0.16811302304267883, "learning_rate": 3.6067457893002376e-05, "loss": 0.1518, "step": 1991 }, { "epoch": 0.7225244831338411, "grad_norm": 0.18975135684013367, "learning_rate": 3.597956071269326e-05, "loss": 0.1605, "step": 1992 }, { "epoch": 0.7228871962277839, "grad_norm": 0.17413167655467987, "learning_rate": 3.58917472662281e-05, "loss": 0.1782, "step": 1993 }, { "epoch": 0.7232499093217265, "grad_norm": 0.17248669266700745, "learning_rate": 3.580401766846028e-05, "loss": 0.1499, "step": 1994 }, { "epoch": 0.7236126224156693, "grad_norm": 0.16712360084056854, "learning_rate": 3.571637203413334e-05, "loss": 0.1561, "step": 1995 }, { "epoch": 0.7239753355096119, "grad_norm": 0.17022311687469482, "learning_rate": 3.56288104778811e-05, "loss": 0.152, "step": 1996 }, { "epoch": 0.7243380486035546, "grad_norm": 0.17325520515441895, "learning_rate": 3.554133311422735e-05, "loss": 0.1554, "step": 1997 }, { "epoch": 0.7247007616974973, "grad_norm": 0.17560617625713348, "learning_rate": 3.5453940057585866e-05, "loss": 0.1869, "step": 1998 }, { "epoch": 0.72506347479144, "grad_norm": 0.19136746227741241, "learning_rate": 3.5366631422260045e-05, "loss": 0.1761, "step": 1999 }, { "epoch": 0.7254261878853827, "grad_norm": 0.1808745115995407, "learning_rate": 3.527940732244289e-05, "loss": 0.1558, "step": 2000 }, { "epoch": 0.7257889009793254, "grad_norm": 0.16616669297218323, "learning_rate": 3.519226787221692e-05, "loss": 0.1465, "step": 2001 }, { "epoch": 0.726151614073268, "grad_norm": 0.1782522052526474, "learning_rate": 3.5105213185553856e-05, "loss": 0.1546, "step": 2002 }, { "epoch": 0.7265143271672108, "grad_norm": 0.1684170663356781, "learning_rate": 3.5018243376314574e-05, "loss": 0.1625, "step": 2003 }, { "epoch": 0.7268770402611534, "grad_norm": 0.16710427403450012, "learning_rate": 3.493135855824894e-05, "loss": 0.155, "step": 2004 }, { "epoch": 0.7272397533550962, "grad_norm": 4411.4638671875, "learning_rate": 3.484455884499561e-05, "loss": 0.1437, "step": 2005 }, { "epoch": 0.7276024664490388, "grad_norm": 0.1757262647151947, "learning_rate": 3.475784435008208e-05, "loss": 0.1531, "step": 2006 }, { "epoch": 0.7279651795429815, "grad_norm": 0.1928826868534088, "learning_rate": 3.467121518692422e-05, "loss": 0.1655, "step": 2007 }, { "epoch": 0.7283278926369242, "grad_norm": 0.19880840182304382, "learning_rate": 3.458467146882637e-05, "loss": 0.1579, "step": 2008 }, { "epoch": 0.7286906057308669, "grad_norm": 0.23102417588233948, "learning_rate": 3.4498213308981095e-05, "loss": 0.1581, "step": 2009 }, { "epoch": 0.7290533188248096, "grad_norm": 0.1807643175125122, "learning_rate": 3.441184082046908e-05, "loss": 0.1462, "step": 2010 }, { "epoch": 0.7294160319187523, "grad_norm": 0.18923969566822052, "learning_rate": 3.4325554116258894e-05, "loss": 0.1507, "step": 2011 }, { "epoch": 0.7297787450126949, "grad_norm": 0.22489802539348602, "learning_rate": 3.423935330920702e-05, "loss": 0.1803, "step": 2012 }, { "epoch": 0.7301414581066377, "grad_norm": 0.23475851118564606, "learning_rate": 3.415323851205752e-05, "loss": 0.1649, "step": 2013 }, { "epoch": 0.7305041712005803, "grad_norm": 0.2082839459180832, "learning_rate": 3.406720983744193e-05, "loss": 0.182, "step": 2014 }, { "epoch": 0.7308668842945231, "grad_norm": 0.19769790768623352, "learning_rate": 3.3981267397879215e-05, "loss": 0.1543, "step": 2015 }, { "epoch": 0.7312295973884657, "grad_norm": 0.1755545437335968, "learning_rate": 3.38954113057755e-05, "loss": 0.1469, "step": 2016 }, { "epoch": 0.7315923104824085, "grad_norm": 0.18786299228668213, "learning_rate": 3.3809641673423985e-05, "loss": 0.1778, "step": 2017 }, { "epoch": 0.7319550235763511, "grad_norm": 0.17806515097618103, "learning_rate": 3.3723958613004855e-05, "loss": 0.1567, "step": 2018 }, { "epoch": 0.7323177366702938, "grad_norm": 0.17538048326969147, "learning_rate": 3.3638362236584965e-05, "loss": 0.1573, "step": 2019 }, { "epoch": 0.7326804497642365, "grad_norm": 0.17543213069438934, "learning_rate": 3.355285265611784e-05, "loss": 0.1651, "step": 2020 }, { "epoch": 0.7330431628581792, "grad_norm": 0.1797361820936203, "learning_rate": 3.346742998344348e-05, "loss": 0.1696, "step": 2021 }, { "epoch": 0.7334058759521219, "grad_norm": 0.20315411686897278, "learning_rate": 3.3382094330288216e-05, "loss": 0.1682, "step": 2022 }, { "epoch": 0.7337685890460646, "grad_norm": 0.17584829032421112, "learning_rate": 3.3296845808264574e-05, "loss": 0.1734, "step": 2023 }, { "epoch": 0.7341313021400072, "grad_norm": 0.192337304353714, "learning_rate": 3.321168452887106e-05, "loss": 0.185, "step": 2024 }, { "epoch": 0.73449401523395, "grad_norm": 0.1659361571073532, "learning_rate": 3.3126610603492194e-05, "loss": 0.1556, "step": 2025 }, { "epoch": 0.7348567283278926, "grad_norm": 0.16753138601779938, "learning_rate": 3.304162414339814e-05, "loss": 0.1467, "step": 2026 }, { "epoch": 0.7352194414218354, "grad_norm": 0.18743427097797394, "learning_rate": 3.295672525974469e-05, "loss": 0.1653, "step": 2027 }, { "epoch": 0.735582154515778, "grad_norm": 0.16860130429267883, "learning_rate": 3.287191406357311e-05, "loss": 0.1563, "step": 2028 }, { "epoch": 0.7359448676097207, "grad_norm": 0.16440363228321075, "learning_rate": 3.278719066580995e-05, "loss": 0.1493, "step": 2029 }, { "epoch": 0.7363075807036634, "grad_norm": 0.1813763827085495, "learning_rate": 3.270255517726691e-05, "loss": 0.1621, "step": 2030 }, { "epoch": 0.7366702937976061, "grad_norm": 0.16494570672512054, "learning_rate": 3.261800770864083e-05, "loss": 0.1381, "step": 2031 }, { "epoch": 0.7370330068915488, "grad_norm": 0.1700211763381958, "learning_rate": 3.2533548370513286e-05, "loss": 0.1508, "step": 2032 }, { "epoch": 0.7373957199854915, "grad_norm": 0.19019465148448944, "learning_rate": 3.244917727335066e-05, "loss": 0.1596, "step": 2033 }, { "epoch": 0.7377584330794341, "grad_norm": 0.1853635013103485, "learning_rate": 3.236489452750385e-05, "loss": 0.1433, "step": 2034 }, { "epoch": 0.7381211461733769, "grad_norm": 0.19163811206817627, "learning_rate": 3.228070024320833e-05, "loss": 0.1605, "step": 2035 }, { "epoch": 0.7384838592673195, "grad_norm": 0.2122446596622467, "learning_rate": 3.2196594530583735e-05, "loss": 0.1792, "step": 2036 }, { "epoch": 0.7388465723612623, "grad_norm": 0.18100525438785553, "learning_rate": 3.211257749963391e-05, "loss": 0.1703, "step": 2037 }, { "epoch": 0.7392092854552049, "grad_norm": 0.15972734987735748, "learning_rate": 3.2028649260246754e-05, "loss": 0.1691, "step": 2038 }, { "epoch": 0.7395719985491476, "grad_norm": 0.17128963768482208, "learning_rate": 3.1944809922193986e-05, "loss": 0.1611, "step": 2039 }, { "epoch": 0.7399347116430903, "grad_norm": 0.18161478638648987, "learning_rate": 3.186105959513103e-05, "loss": 0.1457, "step": 2040 }, { "epoch": 0.740297424737033, "grad_norm": 0.1911374032497406, "learning_rate": 3.177739838859694e-05, "loss": 0.1655, "step": 2041 }, { "epoch": 0.7406601378309757, "grad_norm": 0.16643930971622467, "learning_rate": 3.1693826412014114e-05, "loss": 0.1744, "step": 2042 }, { "epoch": 0.7410228509249184, "grad_norm": 0.17060095071792603, "learning_rate": 3.1610343774688414e-05, "loss": 0.1469, "step": 2043 }, { "epoch": 0.7413855640188611, "grad_norm": 0.1795426309108734, "learning_rate": 3.152695058580871e-05, "loss": 0.1487, "step": 2044 }, { "epoch": 0.7417482771128038, "grad_norm": 0.1854647696018219, "learning_rate": 3.1443646954446914e-05, "loss": 0.17, "step": 2045 }, { "epoch": 0.7421109902067464, "grad_norm": 0.1683138608932495, "learning_rate": 3.136043298955782e-05, "loss": 0.1584, "step": 2046 }, { "epoch": 0.7424737033006892, "grad_norm": 0.18557599186897278, "learning_rate": 3.127730879997895e-05, "loss": 0.1507, "step": 2047 }, { "epoch": 0.7428364163946318, "grad_norm": 0.17158469557762146, "learning_rate": 3.119427449443032e-05, "loss": 0.1512, "step": 2048 }, { "epoch": 0.7431991294885746, "grad_norm": 0.1670829951763153, "learning_rate": 3.111133018151456e-05, "loss": 0.167, "step": 2049 }, { "epoch": 0.7435618425825172, "grad_norm": 0.1642339676618576, "learning_rate": 3.102847596971646e-05, "loss": 0.144, "step": 2050 }, { "epoch": 0.7439245556764599, "grad_norm": 0.16173475980758667, "learning_rate": 3.094571196740299e-05, "loss": 0.1412, "step": 2051 }, { "epoch": 0.7442872687704026, "grad_norm": 0.16731561720371246, "learning_rate": 3.086303828282315e-05, "loss": 0.1586, "step": 2052 }, { "epoch": 0.7446499818643453, "grad_norm": 0.19204100966453552, "learning_rate": 3.078045502410779e-05, "loss": 0.2226, "step": 2053 }, { "epoch": 0.745012694958288, "grad_norm": 0.17547018826007843, "learning_rate": 3.069796229926952e-05, "loss": 0.1509, "step": 2054 }, { "epoch": 0.7453754080522307, "grad_norm": 0.1662409007549286, "learning_rate": 3.0615560216202486e-05, "loss": 0.1554, "step": 2055 }, { "epoch": 0.7457381211461733, "grad_norm": 0.18224076926708221, "learning_rate": 3.0533248882682374e-05, "loss": 0.1608, "step": 2056 }, { "epoch": 0.7461008342401161, "grad_norm": 0.2161344736814499, "learning_rate": 3.045102840636609e-05, "loss": 0.1661, "step": 2057 }, { "epoch": 0.7464635473340587, "grad_norm": 0.16624325513839722, "learning_rate": 3.0368898894791753e-05, "loss": 0.1558, "step": 2058 }, { "epoch": 0.7468262604280015, "grad_norm": 0.15912269055843353, "learning_rate": 3.0286860455378462e-05, "loss": 0.1536, "step": 2059 }, { "epoch": 0.7471889735219441, "grad_norm": 0.1618340164422989, "learning_rate": 3.0204913195426254e-05, "loss": 0.1436, "step": 2060 }, { "epoch": 0.7475516866158868, "grad_norm": 0.16747722029685974, "learning_rate": 3.0123057222115836e-05, "loss": 0.149, "step": 2061 }, { "epoch": 0.7479143997098295, "grad_norm": 0.1707213968038559, "learning_rate": 3.0041292642508644e-05, "loss": 0.1522, "step": 2062 }, { "epoch": 0.7482771128037722, "grad_norm": 0.17695897817611694, "learning_rate": 2.995961956354646e-05, "loss": 0.1573, "step": 2063 }, { "epoch": 0.7486398258977149, "grad_norm": 0.18760527670383453, "learning_rate": 2.9878038092051443e-05, "loss": 0.1551, "step": 2064 }, { "epoch": 0.7490025389916576, "grad_norm": 0.1940336525440216, "learning_rate": 2.9796548334725916e-05, "loss": 0.1531, "step": 2065 }, { "epoch": 0.7493652520856003, "grad_norm": 0.16656464338302612, "learning_rate": 2.9715150398152268e-05, "loss": 0.1474, "step": 2066 }, { "epoch": 0.749727965179543, "grad_norm": 0.16804639995098114, "learning_rate": 2.9633844388792732e-05, "loss": 0.1651, "step": 2067 }, { "epoch": 0.7500906782734856, "grad_norm": 0.16543330252170563, "learning_rate": 2.9552630412989434e-05, "loss": 0.1433, "step": 2068 }, { "epoch": 0.7504533913674284, "grad_norm": 0.17684879899024963, "learning_rate": 2.9471508576964023e-05, "loss": 0.1533, "step": 2069 }, { "epoch": 0.750816104461371, "grad_norm": 0.16878783702850342, "learning_rate": 2.939047898681765e-05, "loss": 0.1509, "step": 2070 }, { "epoch": 0.7511788175553138, "grad_norm": 0.16449496150016785, "learning_rate": 2.93095417485308e-05, "loss": 0.1628, "step": 2071 }, { "epoch": 0.7515415306492564, "grad_norm": 0.20348592102527618, "learning_rate": 2.9228696967963275e-05, "loss": 0.1695, "step": 2072 }, { "epoch": 0.7519042437431991, "grad_norm": 0.1528720259666443, "learning_rate": 2.9147944750853816e-05, "loss": 0.1396, "step": 2073 }, { "epoch": 0.7522669568371418, "grad_norm": 0.17836391925811768, "learning_rate": 2.906728520282015e-05, "loss": 0.1538, "step": 2074 }, { "epoch": 0.7526296699310845, "grad_norm": 0.16207584738731384, "learning_rate": 2.898671842935885e-05, "loss": 0.1457, "step": 2075 }, { "epoch": 0.7529923830250272, "grad_norm": 0.17391245067119598, "learning_rate": 2.8906244535845072e-05, "loss": 0.1813, "step": 2076 }, { "epoch": 0.7533550961189699, "grad_norm": 0.1827738881111145, "learning_rate": 2.8825863627532524e-05, "loss": 0.1712, "step": 2077 }, { "epoch": 0.7537178092129125, "grad_norm": 0.16939976811408997, "learning_rate": 2.8745575809553294e-05, "loss": 0.1599, "step": 2078 }, { "epoch": 0.7540805223068553, "grad_norm": 0.15600422024726868, "learning_rate": 2.8665381186917718e-05, "loss": 0.1469, "step": 2079 }, { "epoch": 0.7544432354007979, "grad_norm": 0.2160848081111908, "learning_rate": 2.858527986451419e-05, "loss": 0.1748, "step": 2080 }, { "epoch": 0.7548059484947407, "grad_norm": 0.16352678835391998, "learning_rate": 2.8505271947109203e-05, "loss": 0.1486, "step": 2081 }, { "epoch": 0.7551686615886833, "grad_norm": 0.16789479553699493, "learning_rate": 2.842535753934695e-05, "loss": 0.1765, "step": 2082 }, { "epoch": 0.755531374682626, "grad_norm": 0.16260650753974915, "learning_rate": 2.8345536745749403e-05, "loss": 0.1374, "step": 2083 }, { "epoch": 0.7558940877765687, "grad_norm": 0.16362746059894562, "learning_rate": 2.8265809670716027e-05, "loss": 0.1528, "step": 2084 }, { "epoch": 0.7562568008705114, "grad_norm": 0.1730203479528427, "learning_rate": 2.818617641852376e-05, "loss": 0.16, "step": 2085 }, { "epoch": 0.7566195139644541, "grad_norm": 0.1941351443529129, "learning_rate": 2.8106637093326782e-05, "loss": 0.1578, "step": 2086 }, { "epoch": 0.7569822270583968, "grad_norm": 0.17957964539527893, "learning_rate": 2.8027191799156514e-05, "loss": 0.1497, "step": 2087 }, { "epoch": 0.7573449401523396, "grad_norm": 0.1569589227437973, "learning_rate": 2.794784063992131e-05, "loss": 0.1377, "step": 2088 }, { "epoch": 0.7577076532462822, "grad_norm": 0.16305673122406006, "learning_rate": 2.7868583719406403e-05, "loss": 0.1471, "step": 2089 }, { "epoch": 0.7580703663402248, "grad_norm": 0.171325221657753, "learning_rate": 2.778942114127382e-05, "loss": 0.1501, "step": 2090 }, { "epoch": 0.7584330794341676, "grad_norm": 0.1620980203151703, "learning_rate": 2.771035300906215e-05, "loss": 0.1461, "step": 2091 }, { "epoch": 0.7587957925281102, "grad_norm": 0.16900931298732758, "learning_rate": 2.7631379426186434e-05, "loss": 0.143, "step": 2092 }, { "epoch": 0.759158505622053, "grad_norm": 0.1761879175901413, "learning_rate": 2.755250049593816e-05, "loss": 0.1541, "step": 2093 }, { "epoch": 0.7595212187159956, "grad_norm": 0.18240278959274292, "learning_rate": 2.74737163214849e-05, "loss": 0.1931, "step": 2094 }, { "epoch": 0.7598839318099383, "grad_norm": 0.15427257120609283, "learning_rate": 2.7395027005870343e-05, "loss": 0.1453, "step": 2095 }, { "epoch": 0.760246644903881, "grad_norm": 0.18148113787174225, "learning_rate": 2.73164326520141e-05, "loss": 0.1733, "step": 2096 }, { "epoch": 0.7606093579978237, "grad_norm": 0.1736038774251938, "learning_rate": 2.7237933362711576e-05, "loss": 0.1532, "step": 2097 }, { "epoch": 0.7609720710917665, "grad_norm": 0.18636751174926758, "learning_rate": 2.715952924063383e-05, "loss": 0.1627, "step": 2098 }, { "epoch": 0.7613347841857091, "grad_norm": 0.18383683264255524, "learning_rate": 2.7081220388327522e-05, "loss": 0.1625, "step": 2099 }, { "epoch": 0.7616974972796517, "grad_norm": 0.16700130701065063, "learning_rate": 2.70030069082146e-05, "loss": 0.1536, "step": 2100 }, { "epoch": 0.7620602103735945, "grad_norm": 0.178177148103714, "learning_rate": 2.692488890259235e-05, "loss": 0.1593, "step": 2101 }, { "epoch": 0.7624229234675372, "grad_norm": 0.16141119599342346, "learning_rate": 2.6846866473633125e-05, "loss": 0.1476, "step": 2102 }, { "epoch": 0.7627856365614799, "grad_norm": 0.16690880060195923, "learning_rate": 2.676893972338432e-05, "loss": 0.1606, "step": 2103 }, { "epoch": 0.7631483496554226, "grad_norm": 0.18088023364543915, "learning_rate": 2.6691108753768146e-05, "loss": 0.1799, "step": 2104 }, { "epoch": 0.7635110627493652, "grad_norm": 0.16774174571037292, "learning_rate": 2.661337366658161e-05, "loss": 0.1534, "step": 2105 }, { "epoch": 0.763873775843308, "grad_norm": 0.1739625185728073, "learning_rate": 2.653573456349624e-05, "loss": 0.1752, "step": 2106 }, { "epoch": 0.7642364889372506, "grad_norm": 0.1661982536315918, "learning_rate": 2.6458191546058064e-05, "loss": 0.1554, "step": 2107 }, { "epoch": 0.7645992020311934, "grad_norm": 0.15863363444805145, "learning_rate": 2.638074471568739e-05, "loss": 0.1563, "step": 2108 }, { "epoch": 0.764961915125136, "grad_norm": 0.1664765626192093, "learning_rate": 2.630339417367882e-05, "loss": 0.1613, "step": 2109 }, { "epoch": 0.7653246282190788, "grad_norm": 0.17983406782150269, "learning_rate": 2.622614002120091e-05, "loss": 0.1354, "step": 2110 }, { "epoch": 0.7656873413130214, "grad_norm": 0.18512356281280518, "learning_rate": 2.6148982359296205e-05, "loss": 0.1548, "step": 2111 }, { "epoch": 0.766050054406964, "grad_norm": 0.16237185895442963, "learning_rate": 2.6071921288880984e-05, "loss": 0.151, "step": 2112 }, { "epoch": 0.7664127675009068, "grad_norm": 0.16601556539535522, "learning_rate": 2.5994956910745326e-05, "loss": 0.1616, "step": 2113 }, { "epoch": 0.7667754805948495, "grad_norm": 0.163995161652565, "learning_rate": 2.5918089325552707e-05, "loss": 0.1485, "step": 2114 }, { "epoch": 0.7671381936887922, "grad_norm": 0.18575289845466614, "learning_rate": 2.5841318633840072e-05, "loss": 0.1577, "step": 2115 }, { "epoch": 0.7675009067827349, "grad_norm": 0.19277150928974152, "learning_rate": 2.576464493601761e-05, "loss": 0.155, "step": 2116 }, { "epoch": 0.7678636198766775, "grad_norm": 0.1656551957130432, "learning_rate": 2.5688068332368632e-05, "loss": 0.1486, "step": 2117 }, { "epoch": 0.7682263329706203, "grad_norm": 0.15799161791801453, "learning_rate": 2.5611588923049544e-05, "loss": 0.1369, "step": 2118 }, { "epoch": 0.7685890460645629, "grad_norm": 0.17702096700668335, "learning_rate": 2.5535206808089553e-05, "loss": 0.1789, "step": 2119 }, { "epoch": 0.7689517591585057, "grad_norm": 2096.28515625, "learning_rate": 2.5458922087390613e-05, "loss": 0.1436, "step": 2120 }, { "epoch": 0.7693144722524483, "grad_norm": 0.17093558609485626, "learning_rate": 2.5382734860727332e-05, "loss": 0.1518, "step": 2121 }, { "epoch": 0.769677185346391, "grad_norm": 0.1638222485780716, "learning_rate": 2.5306645227746762e-05, "loss": 0.1473, "step": 2122 }, { "epoch": 0.7700398984403337, "grad_norm": 0.1996994912624359, "learning_rate": 2.523065328796831e-05, "loss": 0.1809, "step": 2123 }, { "epoch": 0.7704026115342764, "grad_norm": 0.1753552258014679, "learning_rate": 2.515475914078369e-05, "loss": 0.1811, "step": 2124 }, { "epoch": 0.7707653246282191, "grad_norm": 0.19755405187606812, "learning_rate": 2.5078962885456612e-05, "loss": 0.1783, "step": 2125 }, { "epoch": 0.7711280377221618, "grad_norm": 0.18720857799053192, "learning_rate": 2.5003264621122802e-05, "loss": 0.1519, "step": 2126 }, { "epoch": 0.7714907508161044, "grad_norm": 0.1806974709033966, "learning_rate": 2.4927664446789788e-05, "loss": 0.1594, "step": 2127 }, { "epoch": 0.7718534639100472, "grad_norm": 0.18246807157993317, "learning_rate": 2.4852162461336835e-05, "loss": 0.1395, "step": 2128 }, { "epoch": 0.7722161770039898, "grad_norm": 0.18061847984790802, "learning_rate": 2.477675876351475e-05, "loss": 0.1709, "step": 2129 }, { "epoch": 0.7725788900979326, "grad_norm": 0.1823715716600418, "learning_rate": 2.4701453451945846e-05, "loss": 0.1488, "step": 2130 }, { "epoch": 0.7729416031918752, "grad_norm": 0.16946843266487122, "learning_rate": 2.4626246625123706e-05, "loss": 0.1498, "step": 2131 }, { "epoch": 0.773304316285818, "grad_norm": 0.17811253666877747, "learning_rate": 2.455113838141311e-05, "loss": 0.1649, "step": 2132 }, { "epoch": 0.7736670293797606, "grad_norm": 0.16584321856498718, "learning_rate": 2.4476128819049893e-05, "loss": 0.1814, "step": 2133 }, { "epoch": 0.7740297424737033, "grad_norm": 0.15835148096084595, "learning_rate": 2.4401218036140848e-05, "loss": 0.1453, "step": 2134 }, { "epoch": 0.774392455567646, "grad_norm": 0.17442336678504944, "learning_rate": 2.4326406130663527e-05, "loss": 0.1457, "step": 2135 }, { "epoch": 0.7747551686615887, "grad_norm": 0.18500109016895294, "learning_rate": 2.4251693200466242e-05, "loss": 0.1673, "step": 2136 }, { "epoch": 0.7751178817555314, "grad_norm": 0.17963416874408722, "learning_rate": 2.417707934326775e-05, "loss": 0.1522, "step": 2137 }, { "epoch": 0.7754805948494741, "grad_norm": 0.17526273429393768, "learning_rate": 2.4102564656657312e-05, "loss": 0.1485, "step": 2138 }, { "epoch": 0.7758433079434167, "grad_norm": 0.15860708057880402, "learning_rate": 2.402814923809442e-05, "loss": 0.1446, "step": 2139 }, { "epoch": 0.7762060210373595, "grad_norm": 0.1740608960390091, "learning_rate": 2.3953833184908757e-05, "loss": 0.1521, "step": 2140 }, { "epoch": 0.7765687341313021, "grad_norm": 0.1701829582452774, "learning_rate": 2.387961659430007e-05, "loss": 0.1386, "step": 2141 }, { "epoch": 0.7769314472252449, "grad_norm": 0.17111440002918243, "learning_rate": 2.380549956333793e-05, "loss": 0.1452, "step": 2142 }, { "epoch": 0.7772941603191875, "grad_norm": 0.17982304096221924, "learning_rate": 2.3731482188961818e-05, "loss": 0.163, "step": 2143 }, { "epoch": 0.7776568734131302, "grad_norm": 0.1801091730594635, "learning_rate": 2.3657564567980782e-05, "loss": 0.1423, "step": 2144 }, { "epoch": 0.7780195865070729, "grad_norm": 0.15309491753578186, "learning_rate": 2.358374679707339e-05, "loss": 0.1393, "step": 2145 }, { "epoch": 0.7783822996010156, "grad_norm": 0.15650945901870728, "learning_rate": 2.351002897278771e-05, "loss": 0.1894, "step": 2146 }, { "epoch": 0.7787450126949583, "grad_norm": 0.17866793274879456, "learning_rate": 2.343641119154101e-05, "loss": 0.1549, "step": 2147 }, { "epoch": 0.779107725788901, "grad_norm": 0.17232728004455566, "learning_rate": 2.336289354961969e-05, "loss": 0.1802, "step": 2148 }, { "epoch": 0.7794704388828436, "grad_norm": 0.18021385371685028, "learning_rate": 2.3289476143179202e-05, "loss": 0.143, "step": 2149 }, { "epoch": 0.7798331519767864, "grad_norm": 0.18300630152225494, "learning_rate": 2.3216159068243958e-05, "loss": 0.1739, "step": 2150 }, { "epoch": 0.780195865070729, "grad_norm": 0.18222151696681976, "learning_rate": 2.314294242070706e-05, "loss": 0.1653, "step": 2151 }, { "epoch": 0.7805585781646718, "grad_norm": 0.16753800213336945, "learning_rate": 2.30698262963303e-05, "loss": 0.1766, "step": 2152 }, { "epoch": 0.7809212912586144, "grad_norm": 0.16288548707962036, "learning_rate": 2.2996810790743983e-05, "loss": 0.1417, "step": 2153 }, { "epoch": 0.7812840043525572, "grad_norm": 0.14791814982891083, "learning_rate": 2.2923895999446764e-05, "loss": 0.1452, "step": 2154 }, { "epoch": 0.7816467174464998, "grad_norm": 0.17105069756507874, "learning_rate": 2.2851082017805703e-05, "loss": 0.1641, "step": 2155 }, { "epoch": 0.7820094305404425, "grad_norm": 0.17432281374931335, "learning_rate": 2.2778368941055882e-05, "loss": 0.1774, "step": 2156 }, { "epoch": 0.7823721436343852, "grad_norm": 0.19430530071258545, "learning_rate": 2.2705756864300454e-05, "loss": 0.167, "step": 2157 }, { "epoch": 0.7827348567283279, "grad_norm": 0.16627925634384155, "learning_rate": 2.2633245882510457e-05, "loss": 0.1328, "step": 2158 }, { "epoch": 0.7830975698222706, "grad_norm": 0.1691751331090927, "learning_rate": 2.256083609052474e-05, "loss": 0.1504, "step": 2159 }, { "epoch": 0.7834602829162133, "grad_norm": 0.17866089940071106, "learning_rate": 2.2488527583049736e-05, "loss": 0.1503, "step": 2160 }, { "epoch": 0.7838229960101559, "grad_norm": 0.19467145204544067, "learning_rate": 2.2416320454659512e-05, "loss": 0.1611, "step": 2161 }, { "epoch": 0.7841857091040987, "grad_norm": 0.17603172361850739, "learning_rate": 2.2344214799795438e-05, "loss": 0.1519, "step": 2162 }, { "epoch": 0.7845484221980413, "grad_norm": 0.18451876938343048, "learning_rate": 2.2272210712766205e-05, "loss": 0.1675, "step": 2163 }, { "epoch": 0.7849111352919841, "grad_norm": 0.17610016465187073, "learning_rate": 2.2200308287747673e-05, "loss": 0.1597, "step": 2164 }, { "epoch": 0.7852738483859267, "grad_norm": 0.1533452421426773, "learning_rate": 2.21285076187827e-05, "loss": 0.1381, "step": 2165 }, { "epoch": 0.7856365614798694, "grad_norm": 0.16271378099918365, "learning_rate": 2.205680879978107e-05, "loss": 0.1435, "step": 2166 }, { "epoch": 0.7859992745738121, "grad_norm": 0.15660040080547333, "learning_rate": 2.19852119245194e-05, "loss": 0.1441, "step": 2167 }, { "epoch": 0.7863619876677548, "grad_norm": 0.16608907282352448, "learning_rate": 2.1913717086640906e-05, "loss": 0.1603, "step": 2168 }, { "epoch": 0.7867247007616975, "grad_norm": 0.19811011850833893, "learning_rate": 2.1842324379655378e-05, "loss": 0.1729, "step": 2169 }, { "epoch": 0.7870874138556402, "grad_norm": 0.16923308372497559, "learning_rate": 2.177103389693903e-05, "loss": 0.1572, "step": 2170 }, { "epoch": 0.7874501269495828, "grad_norm": 0.16869297623634338, "learning_rate": 2.169984573173436e-05, "loss": 0.1523, "step": 2171 }, { "epoch": 0.7878128400435256, "grad_norm": 0.16741646826267242, "learning_rate": 2.162875997715005e-05, "loss": 0.1336, "step": 2172 }, { "epoch": 0.7881755531374682, "grad_norm": 0.17434288561344147, "learning_rate": 2.1557776726160807e-05, "loss": 0.1615, "step": 2173 }, { "epoch": 0.788538266231411, "grad_norm": 0.19176846742630005, "learning_rate": 2.1486896071607364e-05, "loss": 0.158, "step": 2174 }, { "epoch": 0.7889009793253536, "grad_norm": 0.19300417602062225, "learning_rate": 2.141611810619617e-05, "loss": 0.1618, "step": 2175 }, { "epoch": 0.7892636924192964, "grad_norm": 0.18857765197753906, "learning_rate": 2.1345442922499394e-05, "loss": 0.1552, "step": 2176 }, { "epoch": 0.789626405513239, "grad_norm": 0.16958756744861603, "learning_rate": 2.127487061295478e-05, "loss": 0.1498, "step": 2177 }, { "epoch": 0.7899891186071817, "grad_norm": 0.1617862582206726, "learning_rate": 2.1204401269865526e-05, "loss": 0.1468, "step": 2178 }, { "epoch": 0.7903518317011244, "grad_norm": 0.17696796357631683, "learning_rate": 2.113403498540011e-05, "loss": 0.158, "step": 2179 }, { "epoch": 0.7907145447950671, "grad_norm": 0.18679635226726532, "learning_rate": 2.1063771851592316e-05, "loss": 0.1725, "step": 2180 }, { "epoch": 0.7910772578890098, "grad_norm": 0.16767951846122742, "learning_rate": 2.099361196034093e-05, "loss": 0.1541, "step": 2181 }, { "epoch": 0.7914399709829525, "grad_norm": 0.17078953981399536, "learning_rate": 2.09235554034097e-05, "loss": 0.1517, "step": 2182 }, { "epoch": 0.7918026840768951, "grad_norm": 0.18054896593093872, "learning_rate": 2.085360227242731e-05, "loss": 0.1668, "step": 2183 }, { "epoch": 0.7921653971708379, "grad_norm": 0.17167535424232483, "learning_rate": 2.0783752658887066e-05, "loss": 0.1486, "step": 2184 }, { "epoch": 0.7925281102647805, "grad_norm": 0.18194803595542908, "learning_rate": 2.0714006654146955e-05, "loss": 0.1705, "step": 2185 }, { "epoch": 0.7928908233587233, "grad_norm": 0.15957947075366974, "learning_rate": 2.0644364349429378e-05, "loss": 0.1393, "step": 2186 }, { "epoch": 0.7932535364526659, "grad_norm": 0.17193473875522614, "learning_rate": 2.057482583582122e-05, "loss": 0.1549, "step": 2187 }, { "epoch": 0.7936162495466086, "grad_norm": 0.16619963943958282, "learning_rate": 2.0505391204273495e-05, "loss": 0.1526, "step": 2188 }, { "epoch": 0.7939789626405513, "grad_norm": 0.15132339298725128, "learning_rate": 2.043606054560141e-05, "loss": 0.1602, "step": 2189 }, { "epoch": 0.794341675734494, "grad_norm": 0.17620229721069336, "learning_rate": 2.0366833950484164e-05, "loss": 0.1505, "step": 2190 }, { "epoch": 0.7947043888284367, "grad_norm": 0.16328759491443634, "learning_rate": 2.0297711509464833e-05, "loss": 0.1407, "step": 2191 }, { "epoch": 0.7950671019223794, "grad_norm": 0.16912280023097992, "learning_rate": 2.0228693312950352e-05, "loss": 0.1571, "step": 2192 }, { "epoch": 0.795429815016322, "grad_norm": 0.16919687390327454, "learning_rate": 2.0159779451211204e-05, "loss": 0.1484, "step": 2193 }, { "epoch": 0.7957925281102648, "grad_norm": 0.17652738094329834, "learning_rate": 2.009097001438147e-05, "loss": 0.1388, "step": 2194 }, { "epoch": 0.7961552412042074, "grad_norm": 0.17439448833465576, "learning_rate": 2.0022265092458638e-05, "loss": 0.162, "step": 2195 }, { "epoch": 0.7965179542981502, "grad_norm": 0.16315314173698425, "learning_rate": 1.9953664775303483e-05, "loss": 0.1463, "step": 2196 }, { "epoch": 0.7968806673920928, "grad_norm": 0.15268266201019287, "learning_rate": 1.988516915263996e-05, "loss": 0.1421, "step": 2197 }, { "epoch": 0.7972433804860355, "grad_norm": 0.16543833911418915, "learning_rate": 1.981677831405516e-05, "loss": 0.1495, "step": 2198 }, { "epoch": 0.7976060935799782, "grad_norm": 0.1608053743839264, "learning_rate": 1.974849234899907e-05, "loss": 0.1383, "step": 2199 }, { "epoch": 0.7979688066739209, "grad_norm": 0.1577446609735489, "learning_rate": 1.9680311346784496e-05, "loss": 0.1418, "step": 2200 } ], "logging_steps": 1, "max_steps": 2757, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.5288134512966107e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }