diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.644412191582003, + "epoch": 2.9027576197387517, "eval_steps": 300, - "global_step": 4800, + "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -21087,12654 +21087,6 @@ "eval_samples_per_second": 4.818, "eval_steps_per_second": 0.603, "step": 3000 - }, - { - "epoch": 2.9, - "grad_norm": 1.183372974395752, - "learning_rate": 1.2506255212677231e-05, - "loss": 0.8625, - "step": 3001 - }, - { - "epoch": 2.9, - "grad_norm": 0.9380956292152405, - "learning_rate": 1.2502085070892411e-05, - "loss": 0.6166, - "step": 3002 - }, - { - "epoch": 2.91, - "grad_norm": 1.162009358406067, - "learning_rate": 1.249791492910759e-05, - "loss": 0.7709, - "step": 3003 - }, - { - "epoch": 2.91, - "grad_norm": 1.1164528131484985, - "learning_rate": 1.249374478732277e-05, - "loss": 0.7264, - "step": 3004 - }, - { - "epoch": 2.91, - "grad_norm": 1.1282567977905273, - "learning_rate": 1.248957464553795e-05, - "loss": 0.7105, - "step": 3005 - }, - { - "epoch": 2.91, - "grad_norm": 1.0823564529418945, - "learning_rate": 1.2485404503753128e-05, - "loss": 0.7055, - "step": 3006 - }, - { - "epoch": 2.91, - "grad_norm": 1.1487401723861694, - "learning_rate": 1.2481234361968308e-05, - "loss": 0.8436, - "step": 3007 - }, - { - "epoch": 2.91, - "grad_norm": 1.1463149785995483, - "learning_rate": 1.2477064220183488e-05, - "loss": 0.7143, - "step": 3008 - }, - { - "epoch": 2.91, - "grad_norm": 1.3678183555603027, - "learning_rate": 1.2472894078398665e-05, - "loss": 0.8593, - "step": 3009 - }, - { - "epoch": 2.91, - "grad_norm": 1.038821816444397, - "learning_rate": 1.2468723936613845e-05, - "loss": 0.7619, - "step": 3010 - }, - { - "epoch": 2.91, - "grad_norm": 0.9451192617416382, - "learning_rate": 1.2464553794829025e-05, - "loss": 0.7227, - "step": 3011 - }, - { - "epoch": 2.91, - "grad_norm": 1.0573679208755493, - "learning_rate": 1.2460383653044203e-05, - "loss": 0.716, - "step": 3012 - }, - { - "epoch": 2.92, - "grad_norm": 1.4460304975509644, - "learning_rate": 1.2456213511259383e-05, - "loss": 0.93, - "step": 3013 - }, - { - "epoch": 2.92, - "grad_norm": 1.2816954851150513, - "learning_rate": 1.2452043369474563e-05, - "loss": 0.8125, - "step": 3014 - }, - { - "epoch": 2.92, - "grad_norm": 1.0708842277526855, - "learning_rate": 1.2447873227689742e-05, - "loss": 0.6789, - "step": 3015 - }, - { - "epoch": 2.92, - "grad_norm": 1.2512792348861694, - "learning_rate": 1.2443703085904922e-05, - "loss": 0.7466, - "step": 3016 - }, - { - "epoch": 2.92, - "grad_norm": 1.228427529335022, - "learning_rate": 1.2439532944120102e-05, - "loss": 0.7453, - "step": 3017 - }, - { - "epoch": 2.92, - "grad_norm": 0.9310507774353027, - "learning_rate": 1.243536280233528e-05, - "loss": 0.8435, - "step": 3018 - }, - { - "epoch": 2.92, - "grad_norm": 1.0072879791259766, - "learning_rate": 1.243119266055046e-05, - "loss": 0.7963, - "step": 3019 - }, - { - "epoch": 2.92, - "grad_norm": 1.1436797380447388, - "learning_rate": 1.2427022518765639e-05, - "loss": 0.5735, - "step": 3020 - }, - { - "epoch": 2.92, - "grad_norm": 1.3681470155715942, - "learning_rate": 1.2422852376980817e-05, - "loss": 1.0026, - "step": 3021 - }, - { - "epoch": 2.92, - "grad_norm": 1.094900131225586, - "learning_rate": 1.2418682235195997e-05, - "loss": 0.9181, - "step": 3022 - }, - { - "epoch": 2.93, - "grad_norm": 1.0876237154006958, - "learning_rate": 1.2414512093411177e-05, - "loss": 0.7428, - "step": 3023 - }, - { - "epoch": 2.93, - "grad_norm": 0.9946409463882446, - "learning_rate": 1.2410341951626355e-05, - "loss": 0.8567, - "step": 3024 - }, - { - "epoch": 2.93, - "grad_norm": 1.1546865701675415, - "learning_rate": 1.2406171809841535e-05, - "loss": 0.6416, - "step": 3025 - }, - { - "epoch": 2.93, - "grad_norm": 1.0324201583862305, - "learning_rate": 1.2402001668056715e-05, - "loss": 0.8069, - "step": 3026 - }, - { - "epoch": 2.93, - "grad_norm": 1.2249302864074707, - "learning_rate": 1.2397831526271894e-05, - "loss": 0.8944, - "step": 3027 - }, - { - "epoch": 2.93, - "grad_norm": 1.0357005596160889, - "learning_rate": 1.2393661384487074e-05, - "loss": 0.874, - "step": 3028 - }, - { - "epoch": 2.93, - "grad_norm": 1.270063877105713, - "learning_rate": 1.2389491242702254e-05, - "loss": 0.7789, - "step": 3029 - }, - { - "epoch": 2.93, - "grad_norm": 1.2423442602157593, - "learning_rate": 1.2385321100917432e-05, - "loss": 0.7947, - "step": 3030 - }, - { - "epoch": 2.93, - "grad_norm": 1.1018481254577637, - "learning_rate": 1.238115095913261e-05, - "loss": 0.6639, - "step": 3031 - }, - { - "epoch": 2.93, - "grad_norm": 1.2056713104248047, - "learning_rate": 1.237698081734779e-05, - "loss": 1.0062, - "step": 3032 - }, - { - "epoch": 2.93, - "grad_norm": 1.1301981210708618, - "learning_rate": 1.2372810675562969e-05, - "loss": 0.7844, - "step": 3033 - }, - { - "epoch": 2.94, - "grad_norm": 0.863381564617157, - "learning_rate": 1.2368640533778149e-05, - "loss": 0.6276, - "step": 3034 - }, - { - "epoch": 2.94, - "grad_norm": 0.9315210580825806, - "learning_rate": 1.2364470391993329e-05, - "loss": 0.6685, - "step": 3035 - }, - { - "epoch": 2.94, - "grad_norm": 1.191622257232666, - "learning_rate": 1.2360300250208507e-05, - "loss": 0.8425, - "step": 3036 - }, - { - "epoch": 2.94, - "grad_norm": 1.063096046447754, - "learning_rate": 1.2356130108423687e-05, - "loss": 0.7942, - "step": 3037 - }, - { - "epoch": 2.94, - "grad_norm": 1.0077861547470093, - "learning_rate": 1.2351959966638867e-05, - "loss": 0.6582, - "step": 3038 - }, - { - "epoch": 2.94, - "grad_norm": 0.9887157678604126, - "learning_rate": 1.2347789824854046e-05, - "loss": 0.6519, - "step": 3039 - }, - { - "epoch": 2.94, - "grad_norm": 1.119428277015686, - "learning_rate": 1.2343619683069226e-05, - "loss": 0.6829, - "step": 3040 - }, - { - "epoch": 2.94, - "grad_norm": 1.360180377960205, - "learning_rate": 1.2339449541284406e-05, - "loss": 0.8045, - "step": 3041 - }, - { - "epoch": 2.94, - "grad_norm": 1.0610947608947754, - "learning_rate": 1.2335279399499582e-05, - "loss": 0.7798, - "step": 3042 - }, - { - "epoch": 2.94, - "grad_norm": 1.1289516687393188, - "learning_rate": 1.2331109257714762e-05, - "loss": 0.7918, - "step": 3043 - }, - { - "epoch": 2.95, - "grad_norm": 0.9875333905220032, - "learning_rate": 1.2326939115929942e-05, - "loss": 0.7375, - "step": 3044 - }, - { - "epoch": 2.95, - "grad_norm": 1.1864285469055176, - "learning_rate": 1.232276897414512e-05, - "loss": 0.7995, - "step": 3045 - }, - { - "epoch": 2.95, - "grad_norm": 1.0463160276412964, - "learning_rate": 1.23185988323603e-05, - "loss": 0.7161, - "step": 3046 - }, - { - "epoch": 2.95, - "grad_norm": 1.065520167350769, - "learning_rate": 1.231442869057548e-05, - "loss": 0.858, - "step": 3047 - }, - { - "epoch": 2.95, - "grad_norm": 1.010031819343567, - "learning_rate": 1.231025854879066e-05, - "loss": 0.7777, - "step": 3048 - }, - { - "epoch": 2.95, - "grad_norm": 0.9578259587287903, - "learning_rate": 1.230608840700584e-05, - "loss": 0.7283, - "step": 3049 - }, - { - "epoch": 2.95, - "grad_norm": 0.9351388812065125, - "learning_rate": 1.230191826522102e-05, - "loss": 0.7382, - "step": 3050 - }, - { - "epoch": 2.95, - "grad_norm": 1.047263503074646, - "learning_rate": 1.2297748123436198e-05, - "loss": 0.9232, - "step": 3051 - }, - { - "epoch": 2.95, - "grad_norm": 1.0473809242248535, - "learning_rate": 1.2293577981651378e-05, - "loss": 0.6634, - "step": 3052 - }, - { - "epoch": 2.95, - "grad_norm": 0.9520713090896606, - "learning_rate": 1.2289407839866556e-05, - "loss": 0.8747, - "step": 3053 - }, - { - "epoch": 2.96, - "grad_norm": 1.191543459892273, - "learning_rate": 1.2285237698081734e-05, - "loss": 0.9448, - "step": 3054 - }, - { - "epoch": 2.96, - "grad_norm": 1.0252786874771118, - "learning_rate": 1.2281067556296914e-05, - "loss": 0.7491, - "step": 3055 - }, - { - "epoch": 2.96, - "grad_norm": 1.3725627660751343, - "learning_rate": 1.2276897414512094e-05, - "loss": 0.9492, - "step": 3056 - }, - { - "epoch": 2.96, - "grad_norm": 1.0086253881454468, - "learning_rate": 1.2272727272727273e-05, - "loss": 0.886, - "step": 3057 - }, - { - "epoch": 2.96, - "grad_norm": 1.1088051795959473, - "learning_rate": 1.2268557130942453e-05, - "loss": 0.6875, - "step": 3058 - }, - { - "epoch": 2.96, - "grad_norm": 1.0585017204284668, - "learning_rate": 1.2264386989157633e-05, - "loss": 0.7512, - "step": 3059 - }, - { - "epoch": 2.96, - "grad_norm": 1.4700901508331299, - "learning_rate": 1.2260216847372811e-05, - "loss": 0.7477, - "step": 3060 - }, - { - "epoch": 2.96, - "grad_norm": 1.0440226793289185, - "learning_rate": 1.2256046705587991e-05, - "loss": 0.8848, - "step": 3061 - }, - { - "epoch": 2.96, - "grad_norm": 1.2762154340744019, - "learning_rate": 1.2251876563803171e-05, - "loss": 0.7058, - "step": 3062 - }, - { - "epoch": 2.96, - "grad_norm": 1.3675118684768677, - "learning_rate": 1.224770642201835e-05, - "loss": 0.7938, - "step": 3063 - }, - { - "epoch": 2.96, - "grad_norm": 1.107913613319397, - "learning_rate": 1.2243536280233528e-05, - "loss": 0.7387, - "step": 3064 - }, - { - "epoch": 2.97, - "grad_norm": 1.0426727533340454, - "learning_rate": 1.2239366138448708e-05, - "loss": 0.6806, - "step": 3065 - }, - { - "epoch": 2.97, - "grad_norm": 1.1301614046096802, - "learning_rate": 1.2235195996663886e-05, - "loss": 1.0033, - "step": 3066 - }, - { - "epoch": 2.97, - "grad_norm": 1.4832754135131836, - "learning_rate": 1.2231025854879066e-05, - "loss": 0.7841, - "step": 3067 - }, - { - "epoch": 2.97, - "grad_norm": 0.9917944669723511, - "learning_rate": 1.2226855713094246e-05, - "loss": 0.7744, - "step": 3068 - }, - { - "epoch": 2.97, - "grad_norm": 1.2754501104354858, - "learning_rate": 1.2222685571309425e-05, - "loss": 0.6216, - "step": 3069 - }, - { - "epoch": 2.97, - "grad_norm": 1.2075879573822021, - "learning_rate": 1.2218515429524605e-05, - "loss": 0.6349, - "step": 3070 - }, - { - "epoch": 2.97, - "grad_norm": 0.9997736215591431, - "learning_rate": 1.2214345287739785e-05, - "loss": 0.7959, - "step": 3071 - }, - { - "epoch": 2.97, - "grad_norm": 0.9662615656852722, - "learning_rate": 1.2210175145954963e-05, - "loss": 0.8179, - "step": 3072 - }, - { - "epoch": 2.97, - "grad_norm": 1.3078581094741821, - "learning_rate": 1.2206005004170143e-05, - "loss": 0.8373, - "step": 3073 - }, - { - "epoch": 2.97, - "grad_norm": 1.290023684501648, - "learning_rate": 1.2201834862385323e-05, - "loss": 0.8505, - "step": 3074 - }, - { - "epoch": 2.98, - "grad_norm": 0.8792833089828491, - "learning_rate": 1.21976647206005e-05, - "loss": 0.65, - "step": 3075 - }, - { - "epoch": 2.98, - "grad_norm": 0.9692551493644714, - "learning_rate": 1.219349457881568e-05, - "loss": 0.6735, - "step": 3076 - }, - { - "epoch": 2.98, - "grad_norm": 1.0831763744354248, - "learning_rate": 1.218932443703086e-05, - "loss": 0.7884, - "step": 3077 - }, - { - "epoch": 2.98, - "grad_norm": 1.2194184064865112, - "learning_rate": 1.2185154295246038e-05, - "loss": 0.9213, - "step": 3078 - }, - { - "epoch": 2.98, - "grad_norm": 1.0239665508270264, - "learning_rate": 1.2180984153461218e-05, - "loss": 0.723, - "step": 3079 - }, - { - "epoch": 2.98, - "grad_norm": 1.1858700513839722, - "learning_rate": 1.2176814011676398e-05, - "loss": 0.7594, - "step": 3080 - }, - { - "epoch": 2.98, - "grad_norm": 1.1054250001907349, - "learning_rate": 1.2172643869891577e-05, - "loss": 0.6723, - "step": 3081 - }, - { - "epoch": 2.98, - "grad_norm": 1.1830312013626099, - "learning_rate": 1.2168473728106757e-05, - "loss": 0.6662, - "step": 3082 - }, - { - "epoch": 2.98, - "grad_norm": 1.3367156982421875, - "learning_rate": 1.2164303586321937e-05, - "loss": 0.686, - "step": 3083 - }, - { - "epoch": 2.98, - "grad_norm": 1.384082317352295, - "learning_rate": 1.2160133444537115e-05, - "loss": 0.8121, - "step": 3084 - }, - { - "epoch": 2.99, - "grad_norm": 1.128053069114685, - "learning_rate": 1.2155963302752295e-05, - "loss": 0.7576, - "step": 3085 - }, - { - "epoch": 2.99, - "grad_norm": 0.936526358127594, - "learning_rate": 1.2151793160967473e-05, - "loss": 0.7828, - "step": 3086 - }, - { - "epoch": 2.99, - "grad_norm": 1.0597058534622192, - "learning_rate": 1.2147623019182652e-05, - "loss": 0.6795, - "step": 3087 - }, - { - "epoch": 2.99, - "grad_norm": 1.3651597499847412, - "learning_rate": 1.2143452877397832e-05, - "loss": 0.7318, - "step": 3088 - }, - { - "epoch": 2.99, - "grad_norm": 1.3782498836517334, - "learning_rate": 1.2139282735613012e-05, - "loss": 0.8455, - "step": 3089 - }, - { - "epoch": 2.99, - "grad_norm": 1.1808693408966064, - "learning_rate": 1.213511259382819e-05, - "loss": 0.835, - "step": 3090 - }, - { - "epoch": 2.99, - "grad_norm": 1.006447672843933, - "learning_rate": 1.213094245204337e-05, - "loss": 0.7341, - "step": 3091 - }, - { - "epoch": 2.99, - "grad_norm": 0.8787515163421631, - "learning_rate": 1.212677231025855e-05, - "loss": 0.7691, - "step": 3092 - }, - { - "epoch": 2.99, - "grad_norm": 0.9759172797203064, - "learning_rate": 1.2122602168473729e-05, - "loss": 0.8682, - "step": 3093 - }, - { - "epoch": 2.99, - "grad_norm": 0.9055489897727966, - "learning_rate": 1.2118432026688909e-05, - "loss": 0.7662, - "step": 3094 - }, - { - "epoch": 2.99, - "grad_norm": 1.107927680015564, - "learning_rate": 1.2114261884904089e-05, - "loss": 0.7179, - "step": 3095 - }, - { - "epoch": 3.0, - "grad_norm": 1.0260263681411743, - "learning_rate": 1.2110091743119267e-05, - "loss": 0.7617, - "step": 3096 - }, - { - "epoch": 3.0, - "grad_norm": 1.3138796091079712, - "learning_rate": 1.2105921601334445e-05, - "loss": 0.8302, - "step": 3097 - }, - { - "epoch": 3.0, - "grad_norm": 1.2815160751342773, - "learning_rate": 1.2101751459549625e-05, - "loss": 0.8708, - "step": 3098 - }, - { - "epoch": 3.0, - "grad_norm": 0.7362232804298401, - "learning_rate": 1.2097581317764804e-05, - "loss": 0.6188, - "step": 3099 - }, - { - "epoch": 3.0, - "grad_norm": 1.3997468948364258, - "learning_rate": 1.2093411175979984e-05, - "loss": 0.8976, - "step": 3100 - }, - { - "epoch": 3.0, - "grad_norm": 1.1959848403930664, - "learning_rate": 1.2089241034195164e-05, - "loss": 0.7688, - "step": 3101 - }, - { - "epoch": 3.0, - "grad_norm": 1.161894679069519, - "learning_rate": 1.2085070892410342e-05, - "loss": 0.7745, - "step": 3102 - }, - { - "epoch": 3.0, - "grad_norm": 1.0665125846862793, - "learning_rate": 1.2080900750625522e-05, - "loss": 0.7202, - "step": 3103 - }, - { - "epoch": 3.0, - "grad_norm": 1.1533799171447754, - "learning_rate": 1.2076730608840702e-05, - "loss": 0.9159, - "step": 3104 - }, - { - "epoch": 3.0, - "grad_norm": 1.1835631132125854, - "learning_rate": 1.207256046705588e-05, - "loss": 0.7914, - "step": 3105 - }, - { - "epoch": 3.01, - "grad_norm": 1.195507526397705, - "learning_rate": 1.206839032527106e-05, - "loss": 0.7561, - "step": 3106 - }, - { - "epoch": 3.01, - "grad_norm": 0.9535657167434692, - "learning_rate": 1.2064220183486239e-05, - "loss": 0.7956, - "step": 3107 - }, - { - "epoch": 3.01, - "grad_norm": 0.994099497795105, - "learning_rate": 1.2060050041701417e-05, - "loss": 0.7672, - "step": 3108 - }, - { - "epoch": 3.01, - "grad_norm": 1.235750675201416, - "learning_rate": 1.2055879899916597e-05, - "loss": 0.7141, - "step": 3109 - }, - { - "epoch": 3.01, - "grad_norm": 0.9944607019424438, - "learning_rate": 1.2051709758131777e-05, - "loss": 0.624, - "step": 3110 - }, - { - "epoch": 3.01, - "grad_norm": 1.014190673828125, - "learning_rate": 1.2047539616346956e-05, - "loss": 0.7399, - "step": 3111 - }, - { - "epoch": 3.01, - "grad_norm": 0.9740110635757446, - "learning_rate": 1.2043369474562136e-05, - "loss": 0.864, - "step": 3112 - }, - { - "epoch": 3.01, - "grad_norm": 1.0639153718948364, - "learning_rate": 1.2039199332777316e-05, - "loss": 0.7062, - "step": 3113 - }, - { - "epoch": 3.01, - "grad_norm": 1.0522348880767822, - "learning_rate": 1.2035029190992494e-05, - "loss": 0.6025, - "step": 3114 - }, - { - "epoch": 3.01, - "grad_norm": 1.1692399978637695, - "learning_rate": 1.2030859049207674e-05, - "loss": 0.7637, - "step": 3115 - }, - { - "epoch": 3.01, - "grad_norm": 1.0824507474899292, - "learning_rate": 1.2026688907422854e-05, - "loss": 0.7185, - "step": 3116 - }, - { - "epoch": 3.02, - "grad_norm": 1.295857548713684, - "learning_rate": 1.2022518765638033e-05, - "loss": 0.688, - "step": 3117 - }, - { - "epoch": 3.02, - "grad_norm": 1.3810677528381348, - "learning_rate": 1.2018348623853213e-05, - "loss": 0.7539, - "step": 3118 - }, - { - "epoch": 3.02, - "grad_norm": 1.003240942955017, - "learning_rate": 1.2014178482068391e-05, - "loss": 0.6637, - "step": 3119 - }, - { - "epoch": 3.02, - "grad_norm": 1.0012834072113037, - "learning_rate": 1.201000834028357e-05, - "loss": 0.6153, - "step": 3120 - }, - { - "epoch": 3.02, - "grad_norm": 1.3184245824813843, - "learning_rate": 1.200583819849875e-05, - "loss": 0.8524, - "step": 3121 - }, - { - "epoch": 3.02, - "grad_norm": 0.9816140532493591, - "learning_rate": 1.200166805671393e-05, - "loss": 0.6306, - "step": 3122 - }, - { - "epoch": 3.02, - "grad_norm": 1.8915208578109741, - "learning_rate": 1.1997497914929108e-05, - "loss": 0.6339, - "step": 3123 - }, - { - "epoch": 3.02, - "grad_norm": 1.3101775646209717, - "learning_rate": 1.1993327773144288e-05, - "loss": 0.6759, - "step": 3124 - }, - { - "epoch": 3.02, - "grad_norm": 1.115962266921997, - "learning_rate": 1.1989157631359468e-05, - "loss": 0.6816, - "step": 3125 - }, - { - "epoch": 3.02, - "grad_norm": 1.248052716255188, - "learning_rate": 1.1984987489574646e-05, - "loss": 0.7884, - "step": 3126 - }, - { - "epoch": 3.03, - "grad_norm": 1.063430905342102, - "learning_rate": 1.1980817347789826e-05, - "loss": 0.5605, - "step": 3127 - }, - { - "epoch": 3.03, - "grad_norm": 1.0415267944335938, - "learning_rate": 1.1976647206005004e-05, - "loss": 0.7655, - "step": 3128 - }, - { - "epoch": 3.03, - "grad_norm": 1.1343778371810913, - "learning_rate": 1.1972477064220184e-05, - "loss": 0.6113, - "step": 3129 - }, - { - "epoch": 3.03, - "grad_norm": 1.2976269721984863, - "learning_rate": 1.1968306922435363e-05, - "loss": 0.8483, - "step": 3130 - }, - { - "epoch": 3.03, - "grad_norm": 0.9503476619720459, - "learning_rate": 1.1964136780650543e-05, - "loss": 0.7105, - "step": 3131 - }, - { - "epoch": 3.03, - "grad_norm": 1.0131187438964844, - "learning_rate": 1.1959966638865721e-05, - "loss": 0.7041, - "step": 3132 - }, - { - "epoch": 3.03, - "grad_norm": 1.1790024042129517, - "learning_rate": 1.1955796497080901e-05, - "loss": 0.6306, - "step": 3133 - }, - { - "epoch": 3.03, - "grad_norm": 1.0622471570968628, - "learning_rate": 1.1951626355296081e-05, - "loss": 0.7699, - "step": 3134 - }, - { - "epoch": 3.03, - "grad_norm": 1.123382568359375, - "learning_rate": 1.194745621351126e-05, - "loss": 0.6603, - "step": 3135 - }, - { - "epoch": 3.03, - "grad_norm": 1.2708872556686401, - "learning_rate": 1.194328607172644e-05, - "loss": 0.5941, - "step": 3136 - }, - { - "epoch": 3.04, - "grad_norm": 1.224805235862732, - "learning_rate": 1.193911592994162e-05, - "loss": 0.9069, - "step": 3137 - }, - { - "epoch": 3.04, - "grad_norm": 1.0035231113433838, - "learning_rate": 1.1934945788156798e-05, - "loss": 0.555, - "step": 3138 - }, - { - "epoch": 3.04, - "grad_norm": 1.4581968784332275, - "learning_rate": 1.1930775646371978e-05, - "loss": 0.6956, - "step": 3139 - }, - { - "epoch": 3.04, - "grad_norm": 0.9744970202445984, - "learning_rate": 1.1926605504587156e-05, - "loss": 0.6884, - "step": 3140 - }, - { - "epoch": 3.04, - "grad_norm": 1.4142813682556152, - "learning_rate": 1.1922435362802335e-05, - "loss": 0.7292, - "step": 3141 - }, - { - "epoch": 3.04, - "grad_norm": 1.4209671020507812, - "learning_rate": 1.1918265221017515e-05, - "loss": 0.6784, - "step": 3142 - }, - { - "epoch": 3.04, - "grad_norm": 1.2037233114242554, - "learning_rate": 1.1914095079232695e-05, - "loss": 0.7394, - "step": 3143 - }, - { - "epoch": 3.04, - "grad_norm": 1.495166301727295, - "learning_rate": 1.1909924937447873e-05, - "loss": 0.8081, - "step": 3144 - }, - { - "epoch": 3.04, - "grad_norm": 1.2146245241165161, - "learning_rate": 1.1905754795663053e-05, - "loss": 0.8138, - "step": 3145 - }, - { - "epoch": 3.04, - "grad_norm": 1.210348129272461, - "learning_rate": 1.1901584653878233e-05, - "loss": 0.7309, - "step": 3146 - }, - { - "epoch": 3.04, - "grad_norm": 0.89698725938797, - "learning_rate": 1.1897414512093412e-05, - "loss": 0.6199, - "step": 3147 - }, - { - "epoch": 3.05, - "grad_norm": 1.1466608047485352, - "learning_rate": 1.1893244370308592e-05, - "loss": 0.9122, - "step": 3148 - }, - { - "epoch": 3.05, - "grad_norm": 1.2042369842529297, - "learning_rate": 1.188907422852377e-05, - "loss": 0.7966, - "step": 3149 - }, - { - "epoch": 3.05, - "grad_norm": 1.3219712972640991, - "learning_rate": 1.188490408673895e-05, - "loss": 0.6718, - "step": 3150 - }, - { - "epoch": 3.05, - "grad_norm": 1.2861446142196655, - "learning_rate": 1.188073394495413e-05, - "loss": 0.5506, - "step": 3151 - }, - { - "epoch": 3.05, - "grad_norm": 1.157288670539856, - "learning_rate": 1.1876563803169308e-05, - "loss": 0.7712, - "step": 3152 - }, - { - "epoch": 3.05, - "grad_norm": 1.078233242034912, - "learning_rate": 1.1872393661384487e-05, - "loss": 0.6353, - "step": 3153 - }, - { - "epoch": 3.05, - "grad_norm": 1.1989777088165283, - "learning_rate": 1.1868223519599667e-05, - "loss": 0.7074, - "step": 3154 - }, - { - "epoch": 3.05, - "grad_norm": 1.2606451511383057, - "learning_rate": 1.1864053377814847e-05, - "loss": 0.7956, - "step": 3155 - }, - { - "epoch": 3.05, - "grad_norm": 1.0449250936508179, - "learning_rate": 1.1859883236030025e-05, - "loss": 0.7048, - "step": 3156 - }, - { - "epoch": 3.05, - "grad_norm": 1.2699267864227295, - "learning_rate": 1.1855713094245205e-05, - "loss": 0.7136, - "step": 3157 - }, - { - "epoch": 3.06, - "grad_norm": 1.2244712114334106, - "learning_rate": 1.1851542952460385e-05, - "loss": 0.6536, - "step": 3158 - }, - { - "epoch": 3.06, - "grad_norm": 1.1166603565216064, - "learning_rate": 1.1847372810675564e-05, - "loss": 0.8708, - "step": 3159 - }, - { - "epoch": 3.06, - "grad_norm": 1.1476706266403198, - "learning_rate": 1.1843202668890744e-05, - "loss": 0.8007, - "step": 3160 - }, - { - "epoch": 3.06, - "grad_norm": 1.7055881023406982, - "learning_rate": 1.1839032527105922e-05, - "loss": 0.7564, - "step": 3161 - }, - { - "epoch": 3.06, - "grad_norm": 1.3506639003753662, - "learning_rate": 1.1834862385321102e-05, - "loss": 0.8376, - "step": 3162 - }, - { - "epoch": 3.06, - "grad_norm": 1.1865992546081543, - "learning_rate": 1.183069224353628e-05, - "loss": 1.0005, - "step": 3163 - }, - { - "epoch": 3.06, - "grad_norm": 1.372769832611084, - "learning_rate": 1.182652210175146e-05, - "loss": 0.7519, - "step": 3164 - }, - { - "epoch": 3.06, - "grad_norm": 1.202907681465149, - "learning_rate": 1.1822351959966639e-05, - "loss": 0.6703, - "step": 3165 - }, - { - "epoch": 3.06, - "grad_norm": 1.507893443107605, - "learning_rate": 1.1818181818181819e-05, - "loss": 0.6624, - "step": 3166 - }, - { - "epoch": 3.06, - "grad_norm": 1.3672096729278564, - "learning_rate": 1.1814011676396999e-05, - "loss": 0.6845, - "step": 3167 - }, - { - "epoch": 3.07, - "grad_norm": 1.0586228370666504, - "learning_rate": 1.1809841534612177e-05, - "loss": 0.6801, - "step": 3168 - }, - { - "epoch": 3.07, - "grad_norm": 0.8661351203918457, - "learning_rate": 1.1805671392827357e-05, - "loss": 0.591, - "step": 3169 - }, - { - "epoch": 3.07, - "grad_norm": 1.038102388381958, - "learning_rate": 1.1801501251042537e-05, - "loss": 0.7926, - "step": 3170 - }, - { - "epoch": 3.07, - "grad_norm": 1.1683626174926758, - "learning_rate": 1.1797331109257715e-05, - "loss": 0.6006, - "step": 3171 - }, - { - "epoch": 3.07, - "grad_norm": 1.1106253862380981, - "learning_rate": 1.1793160967472896e-05, - "loss": 0.7019, - "step": 3172 - }, - { - "epoch": 3.07, - "grad_norm": 1.1502753496170044, - "learning_rate": 1.1788990825688074e-05, - "loss": 0.6601, - "step": 3173 - }, - { - "epoch": 3.07, - "grad_norm": 1.1901448965072632, - "learning_rate": 1.1784820683903252e-05, - "loss": 0.7255, - "step": 3174 - }, - { - "epoch": 3.07, - "grad_norm": 1.3774100542068481, - "learning_rate": 1.1780650542118432e-05, - "loss": 0.731, - "step": 3175 - }, - { - "epoch": 3.07, - "grad_norm": 1.0183756351470947, - "learning_rate": 1.1776480400333612e-05, - "loss": 0.5829, - "step": 3176 - }, - { - "epoch": 3.07, - "grad_norm": 1.3123226165771484, - "learning_rate": 1.177231025854879e-05, - "loss": 0.6625, - "step": 3177 - }, - { - "epoch": 3.07, - "grad_norm": 1.1603424549102783, - "learning_rate": 1.176814011676397e-05, - "loss": 0.6779, - "step": 3178 - }, - { - "epoch": 3.08, - "grad_norm": 0.9390170574188232, - "learning_rate": 1.176396997497915e-05, - "loss": 0.7167, - "step": 3179 - }, - { - "epoch": 3.08, - "grad_norm": 0.9851157069206238, - "learning_rate": 1.1759799833194329e-05, - "loss": 0.7626, - "step": 3180 - }, - { - "epoch": 3.08, - "grad_norm": 1.2829152345657349, - "learning_rate": 1.1755629691409509e-05, - "loss": 0.6149, - "step": 3181 - }, - { - "epoch": 3.08, - "grad_norm": 1.1150360107421875, - "learning_rate": 1.1751459549624687e-05, - "loss": 0.6953, - "step": 3182 - }, - { - "epoch": 3.08, - "grad_norm": 1.5720709562301636, - "learning_rate": 1.1747289407839867e-05, - "loss": 0.7633, - "step": 3183 - }, - { - "epoch": 3.08, - "grad_norm": 1.0386683940887451, - "learning_rate": 1.1743119266055047e-05, - "loss": 0.7953, - "step": 3184 - }, - { - "epoch": 3.08, - "grad_norm": 0.9080824255943298, - "learning_rate": 1.1738949124270226e-05, - "loss": 0.7657, - "step": 3185 - }, - { - "epoch": 3.08, - "grad_norm": 1.2492918968200684, - "learning_rate": 1.1734778982485404e-05, - "loss": 0.6641, - "step": 3186 - }, - { - "epoch": 3.08, - "grad_norm": 0.8783729076385498, - "learning_rate": 1.1730608840700584e-05, - "loss": 0.8305, - "step": 3187 - }, - { - "epoch": 3.08, - "grad_norm": 1.0769270658493042, - "learning_rate": 1.1726438698915764e-05, - "loss": 0.6373, - "step": 3188 - }, - { - "epoch": 3.09, - "grad_norm": 0.9104419350624084, - "learning_rate": 1.1722268557130943e-05, - "loss": 0.7511, - "step": 3189 - }, - { - "epoch": 3.09, - "grad_norm": 1.2244678735733032, - "learning_rate": 1.1718098415346123e-05, - "loss": 0.5847, - "step": 3190 - }, - { - "epoch": 3.09, - "grad_norm": 1.139946460723877, - "learning_rate": 1.1713928273561303e-05, - "loss": 0.6609, - "step": 3191 - }, - { - "epoch": 3.09, - "grad_norm": 1.1527796983718872, - "learning_rate": 1.1709758131776481e-05, - "loss": 0.7893, - "step": 3192 - }, - { - "epoch": 3.09, - "grad_norm": 1.2792586088180542, - "learning_rate": 1.1705587989991661e-05, - "loss": 0.7077, - "step": 3193 - }, - { - "epoch": 3.09, - "grad_norm": 1.7594659328460693, - "learning_rate": 1.170141784820684e-05, - "loss": 0.7908, - "step": 3194 - }, - { - "epoch": 3.09, - "grad_norm": 0.8258441686630249, - "learning_rate": 1.169724770642202e-05, - "loss": 0.5861, - "step": 3195 - }, - { - "epoch": 3.09, - "grad_norm": 1.1048345565795898, - "learning_rate": 1.1693077564637198e-05, - "loss": 0.7191, - "step": 3196 - }, - { - "epoch": 3.09, - "grad_norm": 1.8866416215896606, - "learning_rate": 1.1688907422852378e-05, - "loss": 0.5792, - "step": 3197 - }, - { - "epoch": 3.09, - "grad_norm": 1.2641773223876953, - "learning_rate": 1.1684737281067556e-05, - "loss": 0.8419, - "step": 3198 - }, - { - "epoch": 3.1, - "grad_norm": 1.423106074333191, - "learning_rate": 1.1680567139282736e-05, - "loss": 0.9026, - "step": 3199 - }, - { - "epoch": 3.1, - "grad_norm": 1.3765684366226196, - "learning_rate": 1.1676396997497916e-05, - "loss": 0.7982, - "step": 3200 - }, - { - "epoch": 3.1, - "grad_norm": 1.383523941040039, - "learning_rate": 1.1672226855713095e-05, - "loss": 0.6969, - "step": 3201 - }, - { - "epoch": 3.1, - "grad_norm": 1.086730718612671, - "learning_rate": 1.1668056713928275e-05, - "loss": 0.6888, - "step": 3202 - }, - { - "epoch": 3.1, - "grad_norm": 1.153071403503418, - "learning_rate": 1.1663886572143453e-05, - "loss": 0.7826, - "step": 3203 - }, - { - "epoch": 3.1, - "grad_norm": 1.1363312005996704, - "learning_rate": 1.1659716430358633e-05, - "loss": 0.6431, - "step": 3204 - }, - { - "epoch": 3.1, - "grad_norm": 1.2717450857162476, - "learning_rate": 1.1655546288573813e-05, - "loss": 0.5969, - "step": 3205 - }, - { - "epoch": 3.1, - "grad_norm": 1.1417663097381592, - "learning_rate": 1.1651376146788991e-05, - "loss": 0.6747, - "step": 3206 - }, - { - "epoch": 3.1, - "grad_norm": 1.2456254959106445, - "learning_rate": 1.164720600500417e-05, - "loss": 0.6171, - "step": 3207 - }, - { - "epoch": 3.1, - "grad_norm": 1.1771053075790405, - "learning_rate": 1.164303586321935e-05, - "loss": 0.5973, - "step": 3208 - }, - { - "epoch": 3.1, - "grad_norm": 1.0299570560455322, - "learning_rate": 1.163886572143453e-05, - "loss": 0.7783, - "step": 3209 - }, - { - "epoch": 3.11, - "grad_norm": 1.4078141450881958, - "learning_rate": 1.1634695579649708e-05, - "loss": 0.7157, - "step": 3210 - }, - { - "epoch": 3.11, - "grad_norm": 1.0649255514144897, - "learning_rate": 1.1630525437864888e-05, - "loss": 0.7021, - "step": 3211 - }, - { - "epoch": 3.11, - "grad_norm": 1.2134660482406616, - "learning_rate": 1.1626355296080068e-05, - "loss": 0.7837, - "step": 3212 - }, - { - "epoch": 3.11, - "grad_norm": 1.4087175130844116, - "learning_rate": 1.1622185154295246e-05, - "loss": 0.9266, - "step": 3213 - }, - { - "epoch": 3.11, - "grad_norm": 1.3690346479415894, - "learning_rate": 1.1618015012510427e-05, - "loss": 0.6973, - "step": 3214 - }, - { - "epoch": 3.11, - "grad_norm": 1.0998339653015137, - "learning_rate": 1.1613844870725605e-05, - "loss": 0.6504, - "step": 3215 - }, - { - "epoch": 3.11, - "grad_norm": 1.1038250923156738, - "learning_rate": 1.1609674728940785e-05, - "loss": 0.6989, - "step": 3216 - }, - { - "epoch": 3.11, - "grad_norm": 1.1213037967681885, - "learning_rate": 1.1605504587155965e-05, - "loss": 0.7214, - "step": 3217 - }, - { - "epoch": 3.11, - "grad_norm": 1.2067272663116455, - "learning_rate": 1.1601334445371143e-05, - "loss": 0.8201, - "step": 3218 - }, - { - "epoch": 3.11, - "grad_norm": 1.98118257522583, - "learning_rate": 1.1597164303586322e-05, - "loss": 0.6148, - "step": 3219 - }, - { - "epoch": 3.12, - "grad_norm": 1.2058942317962646, - "learning_rate": 1.1592994161801502e-05, - "loss": 0.6153, - "step": 3220 - }, - { - "epoch": 3.12, - "grad_norm": 1.4307641983032227, - "learning_rate": 1.1588824020016682e-05, - "loss": 0.6797, - "step": 3221 - }, - { - "epoch": 3.12, - "grad_norm": 1.0934830904006958, - "learning_rate": 1.158465387823186e-05, - "loss": 0.7091, - "step": 3222 - }, - { - "epoch": 3.12, - "grad_norm": 1.8597009181976318, - "learning_rate": 1.158048373644704e-05, - "loss": 0.7104, - "step": 3223 - }, - { - "epoch": 3.12, - "grad_norm": 1.0559728145599365, - "learning_rate": 1.1576313594662218e-05, - "loss": 0.6183, - "step": 3224 - }, - { - "epoch": 3.12, - "grad_norm": 1.4455348253250122, - "learning_rate": 1.1572143452877398e-05, - "loss": 0.7366, - "step": 3225 - }, - { - "epoch": 3.12, - "grad_norm": 1.103995442390442, - "learning_rate": 1.1567973311092578e-05, - "loss": 0.5635, - "step": 3226 - }, - { - "epoch": 3.12, - "grad_norm": 1.14092218875885, - "learning_rate": 1.1563803169307757e-05, - "loss": 0.7168, - "step": 3227 - }, - { - "epoch": 3.12, - "grad_norm": 1.125809907913208, - "learning_rate": 1.1559633027522937e-05, - "loss": 0.7188, - "step": 3228 - }, - { - "epoch": 3.12, - "grad_norm": 1.1541872024536133, - "learning_rate": 1.1555462885738115e-05, - "loss": 0.8327, - "step": 3229 - }, - { - "epoch": 3.13, - "grad_norm": 1.1878880262374878, - "learning_rate": 1.1551292743953295e-05, - "loss": 0.8089, - "step": 3230 - }, - { - "epoch": 3.13, - "grad_norm": 1.2504700422286987, - "learning_rate": 1.1547122602168474e-05, - "loss": 0.7335, - "step": 3231 - }, - { - "epoch": 3.13, - "grad_norm": 1.0725075006484985, - "learning_rate": 1.1542952460383654e-05, - "loss": 0.6189, - "step": 3232 - }, - { - "epoch": 3.13, - "grad_norm": 1.3383662700653076, - "learning_rate": 1.1538782318598834e-05, - "loss": 0.6417, - "step": 3233 - }, - { - "epoch": 3.13, - "grad_norm": 1.5360132455825806, - "learning_rate": 1.1534612176814012e-05, - "loss": 0.8597, - "step": 3234 - }, - { - "epoch": 3.13, - "grad_norm": 1.3072084188461304, - "learning_rate": 1.1530442035029192e-05, - "loss": 0.7842, - "step": 3235 - }, - { - "epoch": 3.13, - "grad_norm": 1.2421491146087646, - "learning_rate": 1.152627189324437e-05, - "loss": 0.6952, - "step": 3236 - }, - { - "epoch": 3.13, - "grad_norm": 1.0458987951278687, - "learning_rate": 1.152210175145955e-05, - "loss": 0.8233, - "step": 3237 - }, - { - "epoch": 3.13, - "grad_norm": 1.2006381750106812, - "learning_rate": 1.151793160967473e-05, - "loss": 0.6739, - "step": 3238 - }, - { - "epoch": 3.13, - "grad_norm": 1.2476036548614502, - "learning_rate": 1.1513761467889909e-05, - "loss": 0.7066, - "step": 3239 - }, - { - "epoch": 3.13, - "grad_norm": 1.2618188858032227, - "learning_rate": 1.1509591326105087e-05, - "loss": 0.752, - "step": 3240 - }, - { - "epoch": 3.14, - "grad_norm": 1.302243709564209, - "learning_rate": 1.1505421184320267e-05, - "loss": 0.7866, - "step": 3241 - }, - { - "epoch": 3.14, - "grad_norm": 1.2427616119384766, - "learning_rate": 1.1501251042535447e-05, - "loss": 0.6593, - "step": 3242 - }, - { - "epoch": 3.14, - "grad_norm": 1.4073095321655273, - "learning_rate": 1.1497080900750626e-05, - "loss": 0.7185, - "step": 3243 - }, - { - "epoch": 3.14, - "grad_norm": 1.598067283630371, - "learning_rate": 1.1492910758965806e-05, - "loss": 0.8257, - "step": 3244 - }, - { - "epoch": 3.14, - "grad_norm": 1.067767858505249, - "learning_rate": 1.1488740617180984e-05, - "loss": 0.8638, - "step": 3245 - }, - { - "epoch": 3.14, - "grad_norm": 1.256085991859436, - "learning_rate": 1.1484570475396164e-05, - "loss": 0.8165, - "step": 3246 - }, - { - "epoch": 3.14, - "grad_norm": 1.1388213634490967, - "learning_rate": 1.1480400333611344e-05, - "loss": 0.6575, - "step": 3247 - }, - { - "epoch": 3.14, - "grad_norm": 1.2170623540878296, - "learning_rate": 1.1476230191826522e-05, - "loss": 0.7292, - "step": 3248 - }, - { - "epoch": 3.14, - "grad_norm": 1.07816481590271, - "learning_rate": 1.1472060050041702e-05, - "loss": 0.6152, - "step": 3249 - }, - { - "epoch": 3.14, - "grad_norm": 1.3839792013168335, - "learning_rate": 1.1467889908256882e-05, - "loss": 0.6492, - "step": 3250 - }, - { - "epoch": 3.15, - "grad_norm": 1.230368971824646, - "learning_rate": 1.146371976647206e-05, - "loss": 0.584, - "step": 3251 - }, - { - "epoch": 3.15, - "grad_norm": 1.0888391733169556, - "learning_rate": 1.1459549624687239e-05, - "loss": 0.5848, - "step": 3252 - }, - { - "epoch": 3.15, - "grad_norm": 1.0804716348648071, - "learning_rate": 1.1455379482902419e-05, - "loss": 0.8888, - "step": 3253 - }, - { - "epoch": 3.15, - "grad_norm": 1.1284668445587158, - "learning_rate": 1.1451209341117599e-05, - "loss": 0.762, - "step": 3254 - }, - { - "epoch": 3.15, - "grad_norm": 1.244312047958374, - "learning_rate": 1.1447039199332777e-05, - "loss": 0.6616, - "step": 3255 - }, - { - "epoch": 3.15, - "grad_norm": 1.1920812129974365, - "learning_rate": 1.1442869057547958e-05, - "loss": 0.5905, - "step": 3256 - }, - { - "epoch": 3.15, - "grad_norm": 1.5419564247131348, - "learning_rate": 1.1438698915763136e-05, - "loss": 0.7983, - "step": 3257 - }, - { - "epoch": 3.15, - "grad_norm": 1.2202568054199219, - "learning_rate": 1.1434528773978316e-05, - "loss": 0.757, - "step": 3258 - }, - { - "epoch": 3.15, - "grad_norm": 1.3789610862731934, - "learning_rate": 1.1430358632193496e-05, - "loss": 0.8221, - "step": 3259 - }, - { - "epoch": 3.15, - "grad_norm": 1.289023518562317, - "learning_rate": 1.1426188490408674e-05, - "loss": 0.6762, - "step": 3260 - }, - { - "epoch": 3.16, - "grad_norm": 1.1462633609771729, - "learning_rate": 1.1422018348623854e-05, - "loss": 0.6354, - "step": 3261 - }, - { - "epoch": 3.16, - "grad_norm": 1.338168978691101, - "learning_rate": 1.1417848206839033e-05, - "loss": 0.7027, - "step": 3262 - }, - { - "epoch": 3.16, - "grad_norm": 1.2175195217132568, - "learning_rate": 1.1413678065054213e-05, - "loss": 0.7992, - "step": 3263 - }, - { - "epoch": 3.16, - "grad_norm": 1.1366231441497803, - "learning_rate": 1.1409507923269391e-05, - "loss": 0.5934, - "step": 3264 - }, - { - "epoch": 3.16, - "grad_norm": 0.9199458360671997, - "learning_rate": 1.1405337781484571e-05, - "loss": 0.6479, - "step": 3265 - }, - { - "epoch": 3.16, - "grad_norm": 1.1771986484527588, - "learning_rate": 1.140116763969975e-05, - "loss": 0.7222, - "step": 3266 - }, - { - "epoch": 3.16, - "grad_norm": 1.0597327947616577, - "learning_rate": 1.139699749791493e-05, - "loss": 0.7216, - "step": 3267 - }, - { - "epoch": 3.16, - "grad_norm": 1.1762417554855347, - "learning_rate": 1.139282735613011e-05, - "loss": 0.8565, - "step": 3268 - }, - { - "epoch": 3.16, - "grad_norm": 1.5666489601135254, - "learning_rate": 1.1388657214345288e-05, - "loss": 0.7854, - "step": 3269 - }, - { - "epoch": 3.16, - "grad_norm": 1.2026633024215698, - "learning_rate": 1.1384487072560468e-05, - "loss": 0.7129, - "step": 3270 - }, - { - "epoch": 3.16, - "grad_norm": 0.9615378975868225, - "learning_rate": 1.1380316930775648e-05, - "loss": 0.7439, - "step": 3271 - }, - { - "epoch": 3.17, - "grad_norm": 0.9021217823028564, - "learning_rate": 1.1376146788990826e-05, - "loss": 0.6624, - "step": 3272 - }, - { - "epoch": 3.17, - "grad_norm": 1.459649682044983, - "learning_rate": 1.1371976647206005e-05, - "loss": 0.7181, - "step": 3273 - }, - { - "epoch": 3.17, - "grad_norm": 1.446385383605957, - "learning_rate": 1.1367806505421185e-05, - "loss": 0.7422, - "step": 3274 - }, - { - "epoch": 3.17, - "grad_norm": 1.3447153568267822, - "learning_rate": 1.1363636363636365e-05, - "loss": 0.7428, - "step": 3275 - }, - { - "epoch": 3.17, - "grad_norm": 1.2977463006973267, - "learning_rate": 1.1359466221851543e-05, - "loss": 0.5531, - "step": 3276 - }, - { - "epoch": 3.17, - "grad_norm": 1.3001067638397217, - "learning_rate": 1.1355296080066723e-05, - "loss": 0.684, - "step": 3277 - }, - { - "epoch": 3.17, - "grad_norm": 0.9697086215019226, - "learning_rate": 1.1351125938281901e-05, - "loss": 0.6882, - "step": 3278 - }, - { - "epoch": 3.17, - "grad_norm": 1.4971201419830322, - "learning_rate": 1.1346955796497081e-05, - "loss": 0.7898, - "step": 3279 - }, - { - "epoch": 3.17, - "grad_norm": 1.437363624572754, - "learning_rate": 1.1342785654712261e-05, - "loss": 0.8547, - "step": 3280 - }, - { - "epoch": 3.17, - "grad_norm": 0.9968420267105103, - "learning_rate": 1.133861551292744e-05, - "loss": 0.7263, - "step": 3281 - }, - { - "epoch": 3.18, - "grad_norm": 1.1355323791503906, - "learning_rate": 1.133444537114262e-05, - "loss": 0.8424, - "step": 3282 - }, - { - "epoch": 3.18, - "grad_norm": 1.5121216773986816, - "learning_rate": 1.13302752293578e-05, - "loss": 0.5438, - "step": 3283 - }, - { - "epoch": 3.18, - "grad_norm": 1.0721768140792847, - "learning_rate": 1.1326105087572978e-05, - "loss": 0.8685, - "step": 3284 - }, - { - "epoch": 3.18, - "grad_norm": 1.1499862670898438, - "learning_rate": 1.1321934945788157e-05, - "loss": 0.7728, - "step": 3285 - }, - { - "epoch": 3.18, - "grad_norm": 1.206310510635376, - "learning_rate": 1.1317764804003337e-05, - "loss": 0.7517, - "step": 3286 - }, - { - "epoch": 3.18, - "grad_norm": 1.5181379318237305, - "learning_rate": 1.1313594662218515e-05, - "loss": 0.7631, - "step": 3287 - }, - { - "epoch": 3.18, - "grad_norm": 1.2266368865966797, - "learning_rate": 1.1309424520433695e-05, - "loss": 0.6491, - "step": 3288 - }, - { - "epoch": 3.18, - "grad_norm": 1.2430081367492676, - "learning_rate": 1.1305254378648875e-05, - "loss": 0.7077, - "step": 3289 - }, - { - "epoch": 3.18, - "grad_norm": 1.419384241104126, - "learning_rate": 1.1301084236864053e-05, - "loss": 0.7809, - "step": 3290 - }, - { - "epoch": 3.18, - "grad_norm": 1.0880069732666016, - "learning_rate": 1.1296914095079233e-05, - "loss": 0.7008, - "step": 3291 - }, - { - "epoch": 3.19, - "grad_norm": 1.357650637626648, - "learning_rate": 1.1292743953294413e-05, - "loss": 0.6893, - "step": 3292 - }, - { - "epoch": 3.19, - "grad_norm": 1.299135446548462, - "learning_rate": 1.1288573811509592e-05, - "loss": 0.7299, - "step": 3293 - }, - { - "epoch": 3.19, - "grad_norm": 1.3011831045150757, - "learning_rate": 1.1284403669724772e-05, - "loss": 0.6976, - "step": 3294 - }, - { - "epoch": 3.19, - "grad_norm": 1.0060980319976807, - "learning_rate": 1.128023352793995e-05, - "loss": 0.7907, - "step": 3295 - }, - { - "epoch": 3.19, - "grad_norm": 1.269822359085083, - "learning_rate": 1.127606338615513e-05, - "loss": 0.6656, - "step": 3296 - }, - { - "epoch": 3.19, - "grad_norm": 1.283442735671997, - "learning_rate": 1.1271893244370308e-05, - "loss": 0.836, - "step": 3297 - }, - { - "epoch": 3.19, - "grad_norm": 1.4523227214813232, - "learning_rate": 1.1267723102585489e-05, - "loss": 0.683, - "step": 3298 - }, - { - "epoch": 3.19, - "grad_norm": 1.170319676399231, - "learning_rate": 1.1263552960800667e-05, - "loss": 0.8076, - "step": 3299 - }, - { - "epoch": 3.19, - "grad_norm": 1.1295701265335083, - "learning_rate": 1.1259382819015847e-05, - "loss": 0.7454, - "step": 3300 - }, - { - "epoch": 3.19, - "eval_loss": 0.8572828769683838, - "eval_runtime": 861.1348, - "eval_samples_per_second": 4.801, - "eval_steps_per_second": 0.6, - "step": 3300 - }, - { - "epoch": 3.19, - "grad_norm": 1.3477787971496582, - "learning_rate": 1.1255212677231027e-05, - "loss": 0.6758, - "step": 3301 - }, - { - "epoch": 3.19, - "grad_norm": 1.2770168781280518, - "learning_rate": 1.1251042535446205e-05, - "loss": 0.5763, - "step": 3302 - }, - { - "epoch": 3.2, - "grad_norm": 1.169847846031189, - "learning_rate": 1.1246872393661385e-05, - "loss": 0.7559, - "step": 3303 - }, - { - "epoch": 3.2, - "grad_norm": 1.362985372543335, - "learning_rate": 1.1242702251876565e-05, - "loss": 0.6733, - "step": 3304 - }, - { - "epoch": 3.2, - "grad_norm": 1.3402783870697021, - "learning_rate": 1.1238532110091744e-05, - "loss": 0.7126, - "step": 3305 - }, - { - "epoch": 3.2, - "grad_norm": 1.2492231130599976, - "learning_rate": 1.1234361968306922e-05, - "loss": 0.6474, - "step": 3306 - }, - { - "epoch": 3.2, - "grad_norm": 1.181850790977478, - "learning_rate": 1.1230191826522102e-05, - "loss": 0.7208, - "step": 3307 - }, - { - "epoch": 3.2, - "grad_norm": 1.5102277994155884, - "learning_rate": 1.122602168473728e-05, - "loss": 0.9829, - "step": 3308 - }, - { - "epoch": 3.2, - "grad_norm": 1.0260789394378662, - "learning_rate": 1.122185154295246e-05, - "loss": 0.6908, - "step": 3309 - }, - { - "epoch": 3.2, - "grad_norm": 1.4059216976165771, - "learning_rate": 1.121768140116764e-05, - "loss": 0.6147, - "step": 3310 - }, - { - "epoch": 3.2, - "grad_norm": 1.2179521322250366, - "learning_rate": 1.1213511259382819e-05, - "loss": 0.6273, - "step": 3311 - }, - { - "epoch": 3.2, - "grad_norm": 1.3010526895523071, - "learning_rate": 1.1209341117597999e-05, - "loss": 0.6503, - "step": 3312 - }, - { - "epoch": 3.21, - "grad_norm": 1.1008187532424927, - "learning_rate": 1.1205170975813179e-05, - "loss": 0.7652, - "step": 3313 - }, - { - "epoch": 3.21, - "grad_norm": 1.5400184392929077, - "learning_rate": 1.1201000834028357e-05, - "loss": 0.7523, - "step": 3314 - }, - { - "epoch": 3.21, - "grad_norm": 1.293662667274475, - "learning_rate": 1.1196830692243537e-05, - "loss": 0.7678, - "step": 3315 - }, - { - "epoch": 3.21, - "grad_norm": 1.223686933517456, - "learning_rate": 1.1192660550458717e-05, - "loss": 0.8951, - "step": 3316 - }, - { - "epoch": 3.21, - "grad_norm": 1.3788419961929321, - "learning_rate": 1.1188490408673896e-05, - "loss": 0.876, - "step": 3317 - }, - { - "epoch": 3.21, - "grad_norm": 0.9645219445228577, - "learning_rate": 1.1184320266889074e-05, - "loss": 0.7768, - "step": 3318 - }, - { - "epoch": 3.21, - "grad_norm": 1.4818342924118042, - "learning_rate": 1.1180150125104254e-05, - "loss": 0.8519, - "step": 3319 - }, - { - "epoch": 3.21, - "grad_norm": 1.0755265951156616, - "learning_rate": 1.1175979983319432e-05, - "loss": 0.6771, - "step": 3320 - }, - { - "epoch": 3.21, - "grad_norm": 1.3540762662887573, - "learning_rate": 1.1171809841534612e-05, - "loss": 0.6359, - "step": 3321 - }, - { - "epoch": 3.21, - "grad_norm": 1.3275072574615479, - "learning_rate": 1.1167639699749792e-05, - "loss": 0.7787, - "step": 3322 - }, - { - "epoch": 3.22, - "grad_norm": 1.2326210737228394, - "learning_rate": 1.116346955796497e-05, - "loss": 0.7172, - "step": 3323 - }, - { - "epoch": 3.22, - "grad_norm": 1.4028030633926392, - "learning_rate": 1.115929941618015e-05, - "loss": 0.7234, - "step": 3324 - }, - { - "epoch": 3.22, - "grad_norm": 1.1928136348724365, - "learning_rate": 1.1155129274395331e-05, - "loss": 0.9787, - "step": 3325 - }, - { - "epoch": 3.22, - "grad_norm": 1.1200298070907593, - "learning_rate": 1.115095913261051e-05, - "loss": 0.6041, - "step": 3326 - }, - { - "epoch": 3.22, - "grad_norm": 1.1073306798934937, - "learning_rate": 1.114678899082569e-05, - "loss": 0.7624, - "step": 3327 - }, - { - "epoch": 3.22, - "grad_norm": 1.3048022985458374, - "learning_rate": 1.1142618849040868e-05, - "loss": 0.7386, - "step": 3328 - }, - { - "epoch": 3.22, - "grad_norm": 1.2235902547836304, - "learning_rate": 1.1138448707256046e-05, - "loss": 0.6995, - "step": 3329 - }, - { - "epoch": 3.22, - "grad_norm": 1.3890070915222168, - "learning_rate": 1.1134278565471226e-05, - "loss": 0.7412, - "step": 3330 - }, - { - "epoch": 3.22, - "grad_norm": 1.0528852939605713, - "learning_rate": 1.1130108423686406e-05, - "loss": 0.7006, - "step": 3331 - }, - { - "epoch": 3.22, - "grad_norm": 1.308143138885498, - "learning_rate": 1.1125938281901584e-05, - "loss": 0.6696, - "step": 3332 - }, - { - "epoch": 3.22, - "grad_norm": 1.071150541305542, - "learning_rate": 1.1121768140116764e-05, - "loss": 0.7458, - "step": 3333 - }, - { - "epoch": 3.23, - "grad_norm": 1.224147081375122, - "learning_rate": 1.1117597998331944e-05, - "loss": 0.9626, - "step": 3334 - }, - { - "epoch": 3.23, - "grad_norm": 1.3847763538360596, - "learning_rate": 1.1113427856547123e-05, - "loss": 0.6449, - "step": 3335 - }, - { - "epoch": 3.23, - "grad_norm": 1.3238705396652222, - "learning_rate": 1.1109257714762303e-05, - "loss": 0.7557, - "step": 3336 - }, - { - "epoch": 3.23, - "grad_norm": 1.1174274682998657, - "learning_rate": 1.1105087572977483e-05, - "loss": 0.7587, - "step": 3337 - }, - { - "epoch": 3.23, - "grad_norm": 1.061917781829834, - "learning_rate": 1.1100917431192661e-05, - "loss": 0.6395, - "step": 3338 - }, - { - "epoch": 3.23, - "grad_norm": 1.3624316453933716, - "learning_rate": 1.109674728940784e-05, - "loss": 0.7386, - "step": 3339 - }, - { - "epoch": 3.23, - "grad_norm": 0.9669332504272461, - "learning_rate": 1.109257714762302e-05, - "loss": 0.7333, - "step": 3340 - }, - { - "epoch": 3.23, - "grad_norm": 1.1314890384674072, - "learning_rate": 1.1088407005838198e-05, - "loss": 0.7405, - "step": 3341 - }, - { - "epoch": 3.23, - "grad_norm": 1.1925511360168457, - "learning_rate": 1.1084236864053378e-05, - "loss": 0.8459, - "step": 3342 - }, - { - "epoch": 3.23, - "grad_norm": 1.0116933584213257, - "learning_rate": 1.1080066722268558e-05, - "loss": 0.7452, - "step": 3343 - }, - { - "epoch": 3.24, - "grad_norm": 1.3489868640899658, - "learning_rate": 1.1075896580483736e-05, - "loss": 0.659, - "step": 3344 - }, - { - "epoch": 3.24, - "grad_norm": 1.6504747867584229, - "learning_rate": 1.1071726438698916e-05, - "loss": 0.7387, - "step": 3345 - }, - { - "epoch": 3.24, - "grad_norm": 1.038535714149475, - "learning_rate": 1.1067556296914096e-05, - "loss": 0.612, - "step": 3346 - }, - { - "epoch": 3.24, - "grad_norm": 1.3701627254486084, - "learning_rate": 1.1063386155129275e-05, - "loss": 0.8176, - "step": 3347 - }, - { - "epoch": 3.24, - "grad_norm": 1.492211937904358, - "learning_rate": 1.1059216013344455e-05, - "loss": 0.7442, - "step": 3348 - }, - { - "epoch": 3.24, - "grad_norm": 1.3194350004196167, - "learning_rate": 1.1055045871559635e-05, - "loss": 0.7451, - "step": 3349 - }, - { - "epoch": 3.24, - "grad_norm": 1.4297610521316528, - "learning_rate": 1.1050875729774811e-05, - "loss": 0.7047, - "step": 3350 - }, - { - "epoch": 3.24, - "grad_norm": 1.2381166219711304, - "learning_rate": 1.1046705587989991e-05, - "loss": 0.672, - "step": 3351 - }, - { - "epoch": 3.24, - "grad_norm": 1.6374913454055786, - "learning_rate": 1.1042535446205171e-05, - "loss": 0.6271, - "step": 3352 - }, - { - "epoch": 3.24, - "grad_norm": 1.383934497833252, - "learning_rate": 1.103836530442035e-05, - "loss": 0.7244, - "step": 3353 - }, - { - "epoch": 3.25, - "grad_norm": 1.6124647855758667, - "learning_rate": 1.103419516263553e-05, - "loss": 0.6762, - "step": 3354 - }, - { - "epoch": 3.25, - "grad_norm": 1.262732744216919, - "learning_rate": 1.103002502085071e-05, - "loss": 0.5914, - "step": 3355 - }, - { - "epoch": 3.25, - "grad_norm": 1.2295849323272705, - "learning_rate": 1.1025854879065888e-05, - "loss": 0.7001, - "step": 3356 - }, - { - "epoch": 3.25, - "grad_norm": 1.1591074466705322, - "learning_rate": 1.1021684737281068e-05, - "loss": 0.7922, - "step": 3357 - }, - { - "epoch": 3.25, - "grad_norm": 1.4156862497329712, - "learning_rate": 1.1017514595496248e-05, - "loss": 0.7443, - "step": 3358 - }, - { - "epoch": 3.25, - "grad_norm": 1.3502463102340698, - "learning_rate": 1.1013344453711427e-05, - "loss": 0.686, - "step": 3359 - }, - { - "epoch": 3.25, - "grad_norm": 0.9454214572906494, - "learning_rate": 1.1009174311926607e-05, - "loss": 0.6481, - "step": 3360 - }, - { - "epoch": 3.25, - "grad_norm": 1.1023411750793457, - "learning_rate": 1.1005004170141785e-05, - "loss": 0.6559, - "step": 3361 - }, - { - "epoch": 3.25, - "grad_norm": 1.402946949005127, - "learning_rate": 1.1000834028356963e-05, - "loss": 0.6993, - "step": 3362 - }, - { - "epoch": 3.25, - "grad_norm": 1.5812368392944336, - "learning_rate": 1.0996663886572143e-05, - "loss": 0.9209, - "step": 3363 - }, - { - "epoch": 3.25, - "grad_norm": 1.5126149654388428, - "learning_rate": 1.0992493744787323e-05, - "loss": 0.7068, - "step": 3364 - }, - { - "epoch": 3.26, - "grad_norm": 1.2705059051513672, - "learning_rate": 1.0988323603002502e-05, - "loss": 0.7095, - "step": 3365 - }, - { - "epoch": 3.26, - "grad_norm": 1.0155035257339478, - "learning_rate": 1.0984153461217682e-05, - "loss": 0.7165, - "step": 3366 - }, - { - "epoch": 3.26, - "grad_norm": 1.4430103302001953, - "learning_rate": 1.0979983319432862e-05, - "loss": 0.7907, - "step": 3367 - }, - { - "epoch": 3.26, - "grad_norm": 1.3394001722335815, - "learning_rate": 1.097581317764804e-05, - "loss": 0.7652, - "step": 3368 - }, - { - "epoch": 3.26, - "grad_norm": 1.068241000175476, - "learning_rate": 1.097164303586322e-05, - "loss": 0.6892, - "step": 3369 - }, - { - "epoch": 3.26, - "grad_norm": 1.3863632678985596, - "learning_rate": 1.09674728940784e-05, - "loss": 0.7418, - "step": 3370 - }, - { - "epoch": 3.26, - "grad_norm": 1.6057028770446777, - "learning_rate": 1.0963302752293579e-05, - "loss": 0.78, - "step": 3371 - }, - { - "epoch": 3.26, - "grad_norm": 1.0776585340499878, - "learning_rate": 1.0959132610508757e-05, - "loss": 0.8162, - "step": 3372 - }, - { - "epoch": 3.26, - "grad_norm": 1.25663423538208, - "learning_rate": 1.0954962468723937e-05, - "loss": 0.7252, - "step": 3373 - }, - { - "epoch": 3.26, - "grad_norm": 1.0809129476547241, - "learning_rate": 1.0950792326939115e-05, - "loss": 0.7351, - "step": 3374 - }, - { - "epoch": 3.27, - "grad_norm": 1.4997795820236206, - "learning_rate": 1.0946622185154295e-05, - "loss": 0.6246, - "step": 3375 - }, - { - "epoch": 3.27, - "grad_norm": 2.021662712097168, - "learning_rate": 1.0942452043369475e-05, - "loss": 0.7095, - "step": 3376 - }, - { - "epoch": 3.27, - "grad_norm": 1.0122530460357666, - "learning_rate": 1.0938281901584654e-05, - "loss": 0.7469, - "step": 3377 - }, - { - "epoch": 3.27, - "grad_norm": 1.9117823839187622, - "learning_rate": 1.0934111759799834e-05, - "loss": 0.6241, - "step": 3378 - }, - { - "epoch": 3.27, - "grad_norm": 1.371226191520691, - "learning_rate": 1.0929941618015014e-05, - "loss": 0.7936, - "step": 3379 - }, - { - "epoch": 3.27, - "grad_norm": 1.0318444967269897, - "learning_rate": 1.0925771476230192e-05, - "loss": 0.778, - "step": 3380 - }, - { - "epoch": 3.27, - "grad_norm": 0.9649823307991028, - "learning_rate": 1.0921601334445372e-05, - "loss": 0.6454, - "step": 3381 - }, - { - "epoch": 3.27, - "grad_norm": 1.4121474027633667, - "learning_rate": 1.0917431192660552e-05, - "loss": 0.7538, - "step": 3382 - }, - { - "epoch": 3.27, - "grad_norm": 0.9472850561141968, - "learning_rate": 1.0913261050875729e-05, - "loss": 0.6823, - "step": 3383 - }, - { - "epoch": 3.27, - "grad_norm": 1.024497389793396, - "learning_rate": 1.0909090909090909e-05, - "loss": 0.5917, - "step": 3384 - }, - { - "epoch": 3.28, - "grad_norm": 1.088594913482666, - "learning_rate": 1.0904920767306089e-05, - "loss": 0.6154, - "step": 3385 - }, - { - "epoch": 3.28, - "grad_norm": 1.2852396965026855, - "learning_rate": 1.0900750625521267e-05, - "loss": 0.7288, - "step": 3386 - }, - { - "epoch": 3.28, - "grad_norm": 1.1869783401489258, - "learning_rate": 1.0896580483736447e-05, - "loss": 0.64, - "step": 3387 - }, - { - "epoch": 3.28, - "grad_norm": 1.390867829322815, - "learning_rate": 1.0892410341951627e-05, - "loss": 0.7947, - "step": 3388 - }, - { - "epoch": 3.28, - "grad_norm": 1.1302490234375, - "learning_rate": 1.0888240200166806e-05, - "loss": 0.6224, - "step": 3389 - }, - { - "epoch": 3.28, - "grad_norm": 1.118886113166809, - "learning_rate": 1.0884070058381986e-05, - "loss": 0.6958, - "step": 3390 - }, - { - "epoch": 3.28, - "grad_norm": 1.2561124563217163, - "learning_rate": 1.0879899916597166e-05, - "loss": 0.6514, - "step": 3391 - }, - { - "epoch": 3.28, - "grad_norm": 1.2579007148742676, - "learning_rate": 1.0875729774812344e-05, - "loss": 0.5491, - "step": 3392 - }, - { - "epoch": 3.28, - "grad_norm": 1.1266318559646606, - "learning_rate": 1.0871559633027524e-05, - "loss": 0.6375, - "step": 3393 - }, - { - "epoch": 3.28, - "grad_norm": 1.3354460000991821, - "learning_rate": 1.0867389491242702e-05, - "loss": 0.6581, - "step": 3394 - }, - { - "epoch": 3.28, - "grad_norm": 1.1436158418655396, - "learning_rate": 1.086321934945788e-05, - "loss": 0.8215, - "step": 3395 - }, - { - "epoch": 3.29, - "grad_norm": 1.3141074180603027, - "learning_rate": 1.0859049207673061e-05, - "loss": 0.7193, - "step": 3396 - }, - { - "epoch": 3.29, - "grad_norm": 1.368032693862915, - "learning_rate": 1.0854879065888241e-05, - "loss": 0.5584, - "step": 3397 - }, - { - "epoch": 3.29, - "grad_norm": 1.2383919954299927, - "learning_rate": 1.085070892410342e-05, - "loss": 0.6618, - "step": 3398 - }, - { - "epoch": 3.29, - "grad_norm": 1.2585275173187256, - "learning_rate": 1.08465387823186e-05, - "loss": 0.7238, - "step": 3399 - }, - { - "epoch": 3.29, - "grad_norm": 1.2602593898773193, - "learning_rate": 1.084236864053378e-05, - "loss": 0.6925, - "step": 3400 - }, - { - "epoch": 3.29, - "grad_norm": 1.2530949115753174, - "learning_rate": 1.0838198498748958e-05, - "loss": 0.709, - "step": 3401 - }, - { - "epoch": 3.29, - "grad_norm": 1.637156367301941, - "learning_rate": 1.0834028356964138e-05, - "loss": 0.6816, - "step": 3402 - }, - { - "epoch": 3.29, - "grad_norm": 1.5359946489334106, - "learning_rate": 1.0829858215179318e-05, - "loss": 0.6496, - "step": 3403 - }, - { - "epoch": 3.29, - "grad_norm": 1.7935192584991455, - "learning_rate": 1.0825688073394496e-05, - "loss": 0.6654, - "step": 3404 - }, - { - "epoch": 3.29, - "grad_norm": 1.1224169731140137, - "learning_rate": 1.0821517931609674e-05, - "loss": 0.717, - "step": 3405 - }, - { - "epoch": 3.3, - "grad_norm": 1.2829378843307495, - "learning_rate": 1.0817347789824854e-05, - "loss": 0.8593, - "step": 3406 - }, - { - "epoch": 3.3, - "grad_norm": 1.1093112230300903, - "learning_rate": 1.0813177648040033e-05, - "loss": 0.8091, - "step": 3407 - }, - { - "epoch": 3.3, - "grad_norm": 1.4325751066207886, - "learning_rate": 1.0809007506255213e-05, - "loss": 0.6826, - "step": 3408 - }, - { - "epoch": 3.3, - "grad_norm": 1.0864099264144897, - "learning_rate": 1.0804837364470393e-05, - "loss": 0.6138, - "step": 3409 - }, - { - "epoch": 3.3, - "grad_norm": 1.59944748878479, - "learning_rate": 1.0800667222685571e-05, - "loss": 0.9302, - "step": 3410 - }, - { - "epoch": 3.3, - "grad_norm": 1.2777928113937378, - "learning_rate": 1.0796497080900751e-05, - "loss": 0.83, - "step": 3411 - }, - { - "epoch": 3.3, - "grad_norm": 1.2557361125946045, - "learning_rate": 1.0792326939115931e-05, - "loss": 0.6753, - "step": 3412 - }, - { - "epoch": 3.3, - "grad_norm": 1.3049935102462769, - "learning_rate": 1.078815679733111e-05, - "loss": 0.7715, - "step": 3413 - }, - { - "epoch": 3.3, - "grad_norm": 1.0733563899993896, - "learning_rate": 1.078398665554629e-05, - "loss": 0.6678, - "step": 3414 - }, - { - "epoch": 3.3, - "grad_norm": 1.47896146774292, - "learning_rate": 1.077981651376147e-05, - "loss": 0.7656, - "step": 3415 - }, - { - "epoch": 3.31, - "grad_norm": 1.1716828346252441, - "learning_rate": 1.0775646371976646e-05, - "loss": 0.5947, - "step": 3416 - }, - { - "epoch": 3.31, - "grad_norm": 1.2961649894714355, - "learning_rate": 1.0771476230191826e-05, - "loss": 0.6478, - "step": 3417 - }, - { - "epoch": 3.31, - "grad_norm": 1.2759963274002075, - "learning_rate": 1.0767306088407006e-05, - "loss": 0.6047, - "step": 3418 - }, - { - "epoch": 3.31, - "grad_norm": 1.008695125579834, - "learning_rate": 1.0763135946622185e-05, - "loss": 0.6799, - "step": 3419 - }, - { - "epoch": 3.31, - "grad_norm": 1.134704828262329, - "learning_rate": 1.0758965804837365e-05, - "loss": 0.5557, - "step": 3420 - }, - { - "epoch": 3.31, - "grad_norm": 1.4728366136550903, - "learning_rate": 1.0754795663052545e-05, - "loss": 0.7292, - "step": 3421 - }, - { - "epoch": 3.31, - "grad_norm": 1.1189053058624268, - "learning_rate": 1.0750625521267723e-05, - "loss": 0.7542, - "step": 3422 - }, - { - "epoch": 3.31, - "grad_norm": 1.1999545097351074, - "learning_rate": 1.0746455379482903e-05, - "loss": 0.5912, - "step": 3423 - }, - { - "epoch": 3.31, - "grad_norm": 1.0336962938308716, - "learning_rate": 1.0742285237698083e-05, - "loss": 0.7258, - "step": 3424 - }, - { - "epoch": 3.31, - "grad_norm": 1.879111409187317, - "learning_rate": 1.0738115095913262e-05, - "loss": 1.0062, - "step": 3425 - }, - { - "epoch": 3.31, - "grad_norm": 1.4221199750900269, - "learning_rate": 1.0733944954128442e-05, - "loss": 0.5881, - "step": 3426 - }, - { - "epoch": 3.32, - "grad_norm": 1.394644856452942, - "learning_rate": 1.072977481234362e-05, - "loss": 0.8218, - "step": 3427 - }, - { - "epoch": 3.32, - "grad_norm": 2.010867118835449, - "learning_rate": 1.0725604670558798e-05, - "loss": 0.8283, - "step": 3428 - }, - { - "epoch": 3.32, - "grad_norm": 1.330498456954956, - "learning_rate": 1.0721434528773978e-05, - "loss": 0.605, - "step": 3429 - }, - { - "epoch": 3.32, - "grad_norm": 1.4259686470031738, - "learning_rate": 1.0717264386989158e-05, - "loss": 0.7469, - "step": 3430 - }, - { - "epoch": 3.32, - "grad_norm": 1.2063336372375488, - "learning_rate": 1.0713094245204337e-05, - "loss": 0.7167, - "step": 3431 - }, - { - "epoch": 3.32, - "grad_norm": 1.2673969268798828, - "learning_rate": 1.0708924103419517e-05, - "loss": 0.6076, - "step": 3432 - }, - { - "epoch": 3.32, - "grad_norm": 1.4459670782089233, - "learning_rate": 1.0704753961634697e-05, - "loss": 0.8424, - "step": 3433 - }, - { - "epoch": 3.32, - "grad_norm": 0.9861494898796082, - "learning_rate": 1.0700583819849875e-05, - "loss": 0.6108, - "step": 3434 - }, - { - "epoch": 3.32, - "grad_norm": 1.0082473754882812, - "learning_rate": 1.0696413678065055e-05, - "loss": 0.7346, - "step": 3435 - }, - { - "epoch": 3.32, - "grad_norm": 1.4336408376693726, - "learning_rate": 1.0692243536280235e-05, - "loss": 0.8671, - "step": 3436 - }, - { - "epoch": 3.33, - "grad_norm": 1.3534523248672485, - "learning_rate": 1.0688073394495414e-05, - "loss": 0.5643, - "step": 3437 - }, - { - "epoch": 3.33, - "grad_norm": 1.3873342275619507, - "learning_rate": 1.0683903252710592e-05, - "loss": 0.7677, - "step": 3438 - }, - { - "epoch": 3.33, - "grad_norm": 1.1296403408050537, - "learning_rate": 1.0679733110925772e-05, - "loss": 0.6464, - "step": 3439 - }, - { - "epoch": 3.33, - "grad_norm": 1.3493890762329102, - "learning_rate": 1.067556296914095e-05, - "loss": 0.7079, - "step": 3440 - }, - { - "epoch": 3.33, - "grad_norm": 1.0572352409362793, - "learning_rate": 1.067139282735613e-05, - "loss": 0.7055, - "step": 3441 - }, - { - "epoch": 3.33, - "grad_norm": 1.1493865251541138, - "learning_rate": 1.066722268557131e-05, - "loss": 0.9469, - "step": 3442 - }, - { - "epoch": 3.33, - "grad_norm": 1.1443610191345215, - "learning_rate": 1.0663052543786489e-05, - "loss": 0.7493, - "step": 3443 - }, - { - "epoch": 3.33, - "grad_norm": 0.9636917114257812, - "learning_rate": 1.0658882402001669e-05, - "loss": 0.6283, - "step": 3444 - }, - { - "epoch": 3.33, - "grad_norm": 1.1887658834457397, - "learning_rate": 1.0654712260216849e-05, - "loss": 0.6482, - "step": 3445 - }, - { - "epoch": 3.33, - "grad_norm": 1.5771211385726929, - "learning_rate": 1.0650542118432027e-05, - "loss": 0.5913, - "step": 3446 - }, - { - "epoch": 3.34, - "grad_norm": 1.117101788520813, - "learning_rate": 1.0646371976647207e-05, - "loss": 0.823, - "step": 3447 - }, - { - "epoch": 3.34, - "grad_norm": 1.4400957822799683, - "learning_rate": 1.0642201834862387e-05, - "loss": 0.6638, - "step": 3448 - }, - { - "epoch": 3.34, - "grad_norm": 1.2020106315612793, - "learning_rate": 1.0638031693077564e-05, - "loss": 0.5566, - "step": 3449 - }, - { - "epoch": 3.34, - "grad_norm": 1.2653436660766602, - "learning_rate": 1.0633861551292744e-05, - "loss": 0.7604, - "step": 3450 - }, - { - "epoch": 3.34, - "grad_norm": 1.4160075187683105, - "learning_rate": 1.0629691409507924e-05, - "loss": 0.7211, - "step": 3451 - }, - { - "epoch": 3.34, - "grad_norm": 0.963518500328064, - "learning_rate": 1.0625521267723102e-05, - "loss": 0.7777, - "step": 3452 - }, - { - "epoch": 3.34, - "grad_norm": 1.4676203727722168, - "learning_rate": 1.0621351125938282e-05, - "loss": 0.8751, - "step": 3453 - }, - { - "epoch": 3.34, - "grad_norm": 1.1271092891693115, - "learning_rate": 1.0617180984153462e-05, - "loss": 0.8859, - "step": 3454 - }, - { - "epoch": 3.34, - "grad_norm": 1.1082775592803955, - "learning_rate": 1.061301084236864e-05, - "loss": 0.7262, - "step": 3455 - }, - { - "epoch": 3.34, - "grad_norm": 1.2201300859451294, - "learning_rate": 1.060884070058382e-05, - "loss": 0.8005, - "step": 3456 - }, - { - "epoch": 3.34, - "grad_norm": 0.9868558645248413, - "learning_rate": 1.0604670558799e-05, - "loss": 0.5953, - "step": 3457 - }, - { - "epoch": 3.35, - "grad_norm": 1.012241005897522, - "learning_rate": 1.0600500417014179e-05, - "loss": 0.7785, - "step": 3458 - }, - { - "epoch": 3.35, - "grad_norm": 1.1775126457214355, - "learning_rate": 1.0596330275229359e-05, - "loss": 0.6593, - "step": 3459 - }, - { - "epoch": 3.35, - "grad_norm": 1.3616118431091309, - "learning_rate": 1.0592160133444537e-05, - "loss": 0.7491, - "step": 3460 - }, - { - "epoch": 3.35, - "grad_norm": 1.2633185386657715, - "learning_rate": 1.0587989991659716e-05, - "loss": 0.838, - "step": 3461 - }, - { - "epoch": 3.35, - "grad_norm": 1.2192989587783813, - "learning_rate": 1.0583819849874896e-05, - "loss": 0.7723, - "step": 3462 - }, - { - "epoch": 3.35, - "grad_norm": 1.3479501008987427, - "learning_rate": 1.0579649708090076e-05, - "loss": 0.7368, - "step": 3463 - }, - { - "epoch": 3.35, - "grad_norm": 1.292114019393921, - "learning_rate": 1.0575479566305254e-05, - "loss": 0.7949, - "step": 3464 - }, - { - "epoch": 3.35, - "grad_norm": 1.1060526371002197, - "learning_rate": 1.0571309424520434e-05, - "loss": 0.7733, - "step": 3465 - }, - { - "epoch": 3.35, - "grad_norm": 1.2905687093734741, - "learning_rate": 1.0567139282735614e-05, - "loss": 0.7951, - "step": 3466 - }, - { - "epoch": 3.35, - "grad_norm": 1.126912236213684, - "learning_rate": 1.0562969140950793e-05, - "loss": 0.7167, - "step": 3467 - }, - { - "epoch": 3.36, - "grad_norm": 1.5318437814712524, - "learning_rate": 1.0558798999165973e-05, - "loss": 0.7958, - "step": 3468 - }, - { - "epoch": 3.36, - "grad_norm": 1.1199828386306763, - "learning_rate": 1.0554628857381153e-05, - "loss": 0.6292, - "step": 3469 - }, - { - "epoch": 3.36, - "grad_norm": 1.6919323205947876, - "learning_rate": 1.0550458715596331e-05, - "loss": 0.9646, - "step": 3470 - }, - { - "epoch": 3.36, - "grad_norm": 1.2313265800476074, - "learning_rate": 1.054628857381151e-05, - "loss": 0.6957, - "step": 3471 - }, - { - "epoch": 3.36, - "grad_norm": 1.1872867345809937, - "learning_rate": 1.054211843202669e-05, - "loss": 0.6459, - "step": 3472 - }, - { - "epoch": 3.36, - "grad_norm": 1.381390929222107, - "learning_rate": 1.0537948290241868e-05, - "loss": 0.8237, - "step": 3473 - }, - { - "epoch": 3.36, - "grad_norm": 1.0828696489334106, - "learning_rate": 1.0533778148457048e-05, - "loss": 0.6105, - "step": 3474 - }, - { - "epoch": 3.36, - "grad_norm": 1.3132773637771606, - "learning_rate": 1.0529608006672228e-05, - "loss": 0.7536, - "step": 3475 - }, - { - "epoch": 3.36, - "grad_norm": 1.01531183719635, - "learning_rate": 1.0525437864887406e-05, - "loss": 0.6987, - "step": 3476 - }, - { - "epoch": 3.36, - "grad_norm": 1.4168379306793213, - "learning_rate": 1.0521267723102586e-05, - "loss": 0.8741, - "step": 3477 - }, - { - "epoch": 3.37, - "grad_norm": 1.338118314743042, - "learning_rate": 1.0517097581317766e-05, - "loss": 0.6214, - "step": 3478 - }, - { - "epoch": 3.37, - "grad_norm": 1.4483674764633179, - "learning_rate": 1.0512927439532945e-05, - "loss": 0.7309, - "step": 3479 - }, - { - "epoch": 3.37, - "grad_norm": 1.3336416482925415, - "learning_rate": 1.0508757297748125e-05, - "loss": 0.7135, - "step": 3480 - }, - { - "epoch": 3.37, - "grad_norm": 1.3744287490844727, - "learning_rate": 1.0504587155963305e-05, - "loss": 0.7492, - "step": 3481 - }, - { - "epoch": 3.37, - "grad_norm": 1.222573161125183, - "learning_rate": 1.0500417014178481e-05, - "loss": 0.7949, - "step": 3482 - }, - { - "epoch": 3.37, - "grad_norm": 1.2898274660110474, - "learning_rate": 1.0496246872393661e-05, - "loss": 0.7056, - "step": 3483 - }, - { - "epoch": 3.37, - "grad_norm": 1.5685310363769531, - "learning_rate": 1.0492076730608841e-05, - "loss": 0.8209, - "step": 3484 - }, - { - "epoch": 3.37, - "grad_norm": 1.5154378414154053, - "learning_rate": 1.048790658882402e-05, - "loss": 0.6577, - "step": 3485 - }, - { - "epoch": 3.37, - "grad_norm": 1.3797118663787842, - "learning_rate": 1.04837364470392e-05, - "loss": 0.7417, - "step": 3486 - }, - { - "epoch": 3.37, - "grad_norm": 1.2001053094863892, - "learning_rate": 1.047956630525438e-05, - "loss": 0.9088, - "step": 3487 - }, - { - "epoch": 3.37, - "grad_norm": 1.3712891340255737, - "learning_rate": 1.0475396163469558e-05, - "loss": 0.748, - "step": 3488 - }, - { - "epoch": 3.38, - "grad_norm": 1.3195301294326782, - "learning_rate": 1.0471226021684738e-05, - "loss": 0.9355, - "step": 3489 - }, - { - "epoch": 3.38, - "grad_norm": 1.404589056968689, - "learning_rate": 1.0467055879899918e-05, - "loss": 0.8267, - "step": 3490 - }, - { - "epoch": 3.38, - "grad_norm": 1.5360671281814575, - "learning_rate": 1.0462885738115096e-05, - "loss": 0.7572, - "step": 3491 - }, - { - "epoch": 3.38, - "grad_norm": 1.7579607963562012, - "learning_rate": 1.0458715596330277e-05, - "loss": 0.7259, - "step": 3492 - }, - { - "epoch": 3.38, - "grad_norm": 1.5258115530014038, - "learning_rate": 1.0454545454545455e-05, - "loss": 0.5538, - "step": 3493 - }, - { - "epoch": 3.38, - "grad_norm": 1.3881744146347046, - "learning_rate": 1.0450375312760633e-05, - "loss": 0.7602, - "step": 3494 - }, - { - "epoch": 3.38, - "grad_norm": 1.1759934425354004, - "learning_rate": 1.0446205170975813e-05, - "loss": 0.7173, - "step": 3495 - }, - { - "epoch": 3.38, - "grad_norm": 1.5940110683441162, - "learning_rate": 1.0442035029190993e-05, - "loss": 0.7985, - "step": 3496 - }, - { - "epoch": 3.38, - "grad_norm": 1.5001188516616821, - "learning_rate": 1.0437864887406172e-05, - "loss": 0.6589, - "step": 3497 - }, - { - "epoch": 3.38, - "grad_norm": 1.4686529636383057, - "learning_rate": 1.0433694745621352e-05, - "loss": 0.6081, - "step": 3498 - }, - { - "epoch": 3.39, - "grad_norm": 0.9764127135276794, - "learning_rate": 1.0429524603836532e-05, - "loss": 0.7013, - "step": 3499 - }, - { - "epoch": 3.39, - "grad_norm": 1.1044386625289917, - "learning_rate": 1.042535446205171e-05, - "loss": 0.6973, - "step": 3500 - }, - { - "epoch": 3.39, - "grad_norm": 1.2028939723968506, - "learning_rate": 1.042118432026689e-05, - "loss": 0.7388, - "step": 3501 - }, - { - "epoch": 3.39, - "grad_norm": 1.0677155256271362, - "learning_rate": 1.041701417848207e-05, - "loss": 0.6227, - "step": 3502 - }, - { - "epoch": 3.39, - "grad_norm": 1.062029242515564, - "learning_rate": 1.0412844036697248e-05, - "loss": 0.8376, - "step": 3503 - }, - { - "epoch": 3.39, - "grad_norm": 0.8803884983062744, - "learning_rate": 1.0408673894912427e-05, - "loss": 0.6413, - "step": 3504 - }, - { - "epoch": 3.39, - "grad_norm": 1.3496851921081543, - "learning_rate": 1.0404503753127607e-05, - "loss": 0.8495, - "step": 3505 - }, - { - "epoch": 3.39, - "grad_norm": 1.4136476516723633, - "learning_rate": 1.0400333611342785e-05, - "loss": 0.7462, - "step": 3506 - }, - { - "epoch": 3.39, - "grad_norm": 1.549994707107544, - "learning_rate": 1.0396163469557965e-05, - "loss": 0.7594, - "step": 3507 - }, - { - "epoch": 3.39, - "grad_norm": 1.1506428718566895, - "learning_rate": 1.0391993327773145e-05, - "loss": 0.898, - "step": 3508 - }, - { - "epoch": 3.4, - "grad_norm": 1.1922247409820557, - "learning_rate": 1.0387823185988324e-05, - "loss": 0.7385, - "step": 3509 - }, - { - "epoch": 3.4, - "grad_norm": 1.2994155883789062, - "learning_rate": 1.0383653044203504e-05, - "loss": 0.6704, - "step": 3510 - }, - { - "epoch": 3.4, - "grad_norm": 1.820528268814087, - "learning_rate": 1.0379482902418684e-05, - "loss": 0.7495, - "step": 3511 - }, - { - "epoch": 3.4, - "grad_norm": 1.6319626569747925, - "learning_rate": 1.0375312760633862e-05, - "loss": 0.8123, - "step": 3512 - }, - { - "epoch": 3.4, - "grad_norm": 1.3884319067001343, - "learning_rate": 1.0371142618849042e-05, - "loss": 0.941, - "step": 3513 - }, - { - "epoch": 3.4, - "grad_norm": 1.1962778568267822, - "learning_rate": 1.0366972477064222e-05, - "loss": 0.7931, - "step": 3514 - }, - { - "epoch": 3.4, - "grad_norm": 1.1450304985046387, - "learning_rate": 1.0362802335279399e-05, - "loss": 0.6582, - "step": 3515 - }, - { - "epoch": 3.4, - "grad_norm": 1.7188949584960938, - "learning_rate": 1.0358632193494579e-05, - "loss": 0.9605, - "step": 3516 - }, - { - "epoch": 3.4, - "grad_norm": 1.057910442352295, - "learning_rate": 1.0354462051709759e-05, - "loss": 0.775, - "step": 3517 - }, - { - "epoch": 3.4, - "grad_norm": 1.2940022945404053, - "learning_rate": 1.0350291909924937e-05, - "loss": 0.7155, - "step": 3518 - }, - { - "epoch": 3.4, - "grad_norm": 1.1312918663024902, - "learning_rate": 1.0346121768140117e-05, - "loss": 0.7355, - "step": 3519 - }, - { - "epoch": 3.41, - "grad_norm": 1.2899068593978882, - "learning_rate": 1.0341951626355297e-05, - "loss": 0.7127, - "step": 3520 - }, - { - "epoch": 3.41, - "grad_norm": 1.1840060949325562, - "learning_rate": 1.0337781484570476e-05, - "loss": 0.6164, - "step": 3521 - }, - { - "epoch": 3.41, - "grad_norm": 1.1102287769317627, - "learning_rate": 1.0333611342785656e-05, - "loss": 0.8481, - "step": 3522 - }, - { - "epoch": 3.41, - "grad_norm": 1.0823404788970947, - "learning_rate": 1.0329441201000836e-05, - "loss": 0.7277, - "step": 3523 - }, - { - "epoch": 3.41, - "grad_norm": 1.7079617977142334, - "learning_rate": 1.0325271059216014e-05, - "loss": 0.6981, - "step": 3524 - }, - { - "epoch": 3.41, - "grad_norm": 1.2939664125442505, - "learning_rate": 1.0321100917431194e-05, - "loss": 0.8583, - "step": 3525 - }, - { - "epoch": 3.41, - "grad_norm": 1.1833579540252686, - "learning_rate": 1.0316930775646372e-05, - "loss": 0.8205, - "step": 3526 - }, - { - "epoch": 3.41, - "grad_norm": 1.7491941452026367, - "learning_rate": 1.031276063386155e-05, - "loss": 0.6701, - "step": 3527 - }, - { - "epoch": 3.41, - "grad_norm": 2.246164560317993, - "learning_rate": 1.030859049207673e-05, - "loss": 0.6322, - "step": 3528 - }, - { - "epoch": 3.41, - "grad_norm": 1.0349647998809814, - "learning_rate": 1.030442035029191e-05, - "loss": 0.6437, - "step": 3529 - }, - { - "epoch": 3.42, - "grad_norm": 1.9165652990341187, - "learning_rate": 1.0300250208507089e-05, - "loss": 0.6983, - "step": 3530 - }, - { - "epoch": 3.42, - "grad_norm": 1.4408538341522217, - "learning_rate": 1.0296080066722269e-05, - "loss": 0.879, - "step": 3531 - }, - { - "epoch": 3.42, - "grad_norm": 1.6121331453323364, - "learning_rate": 1.0291909924937449e-05, - "loss": 0.7728, - "step": 3532 - }, - { - "epoch": 3.42, - "grad_norm": 1.1499223709106445, - "learning_rate": 1.0287739783152627e-05, - "loss": 0.7445, - "step": 3533 - }, - { - "epoch": 3.42, - "grad_norm": 1.153995394706726, - "learning_rate": 1.0283569641367808e-05, - "loss": 0.648, - "step": 3534 - }, - { - "epoch": 3.42, - "grad_norm": 1.144078016281128, - "learning_rate": 1.0279399499582988e-05, - "loss": 0.5877, - "step": 3535 - }, - { - "epoch": 3.42, - "grad_norm": 1.1591856479644775, - "learning_rate": 1.0275229357798166e-05, - "loss": 0.7853, - "step": 3536 - }, - { - "epoch": 3.42, - "grad_norm": 1.3697547912597656, - "learning_rate": 1.0271059216013344e-05, - "loss": 0.7107, - "step": 3537 - }, - { - "epoch": 3.42, - "grad_norm": 0.9899626970291138, - "learning_rate": 1.0266889074228524e-05, - "loss": 0.6433, - "step": 3538 - }, - { - "epoch": 3.42, - "grad_norm": 1.0768812894821167, - "learning_rate": 1.0262718932443703e-05, - "loss": 0.7091, - "step": 3539 - }, - { - "epoch": 3.43, - "grad_norm": 1.24858820438385, - "learning_rate": 1.0258548790658883e-05, - "loss": 0.8891, - "step": 3540 - }, - { - "epoch": 3.43, - "grad_norm": 1.4133400917053223, - "learning_rate": 1.0254378648874063e-05, - "loss": 0.7898, - "step": 3541 - }, - { - "epoch": 3.43, - "grad_norm": 1.4721908569335938, - "learning_rate": 1.0250208507089241e-05, - "loss": 0.7279, - "step": 3542 - }, - { - "epoch": 3.43, - "grad_norm": 1.4527523517608643, - "learning_rate": 1.0246038365304421e-05, - "loss": 0.8065, - "step": 3543 - }, - { - "epoch": 3.43, - "grad_norm": 1.46436607837677, - "learning_rate": 1.0241868223519601e-05, - "loss": 0.874, - "step": 3544 - }, - { - "epoch": 3.43, - "grad_norm": 2.378770589828491, - "learning_rate": 1.023769808173478e-05, - "loss": 0.646, - "step": 3545 - }, - { - "epoch": 3.43, - "grad_norm": 1.1264159679412842, - "learning_rate": 1.023352793994996e-05, - "loss": 0.6833, - "step": 3546 - }, - { - "epoch": 3.43, - "grad_norm": 1.3123902082443237, - "learning_rate": 1.022935779816514e-05, - "loss": 0.8684, - "step": 3547 - }, - { - "epoch": 3.43, - "grad_norm": 1.419434666633606, - "learning_rate": 1.0225187656380318e-05, - "loss": 0.7733, - "step": 3548 - }, - { - "epoch": 3.43, - "grad_norm": 1.465518832206726, - "learning_rate": 1.0221017514595496e-05, - "loss": 0.8666, - "step": 3549 - }, - { - "epoch": 3.43, - "grad_norm": 1.7353533506393433, - "learning_rate": 1.0216847372810676e-05, - "loss": 0.5738, - "step": 3550 - }, - { - "epoch": 3.44, - "grad_norm": 1.3762145042419434, - "learning_rate": 1.0212677231025855e-05, - "loss": 0.8024, - "step": 3551 - }, - { - "epoch": 3.44, - "grad_norm": 1.4958751201629639, - "learning_rate": 1.0208507089241035e-05, - "loss": 0.6455, - "step": 3552 - }, - { - "epoch": 3.44, - "grad_norm": 1.5199908018112183, - "learning_rate": 1.0204336947456215e-05, - "loss": 0.579, - "step": 3553 - }, - { - "epoch": 3.44, - "grad_norm": 1.4705257415771484, - "learning_rate": 1.0200166805671393e-05, - "loss": 0.7725, - "step": 3554 - }, - { - "epoch": 3.44, - "grad_norm": 1.429896354675293, - "learning_rate": 1.0195996663886573e-05, - "loss": 0.7724, - "step": 3555 - }, - { - "epoch": 3.44, - "grad_norm": 1.4332140684127808, - "learning_rate": 1.0191826522101753e-05, - "loss": 0.7289, - "step": 3556 - }, - { - "epoch": 3.44, - "grad_norm": 1.1396420001983643, - "learning_rate": 1.0187656380316931e-05, - "loss": 0.6976, - "step": 3557 - }, - { - "epoch": 3.44, - "grad_norm": 1.388533115386963, - "learning_rate": 1.0183486238532111e-05, - "loss": 0.6746, - "step": 3558 - }, - { - "epoch": 3.44, - "grad_norm": 1.1702568531036377, - "learning_rate": 1.017931609674729e-05, - "loss": 0.7049, - "step": 3559 - }, - { - "epoch": 3.44, - "grad_norm": 1.3851664066314697, - "learning_rate": 1.0175145954962468e-05, - "loss": 0.799, - "step": 3560 - }, - { - "epoch": 3.45, - "grad_norm": 1.3145420551300049, - "learning_rate": 1.0170975813177648e-05, - "loss": 0.7003, - "step": 3561 - }, - { - "epoch": 3.45, - "grad_norm": 1.273146152496338, - "learning_rate": 1.0166805671392828e-05, - "loss": 0.7805, - "step": 3562 - }, - { - "epoch": 3.45, - "grad_norm": 1.5097302198410034, - "learning_rate": 1.0162635529608007e-05, - "loss": 0.8458, - "step": 3563 - }, - { - "epoch": 3.45, - "grad_norm": 1.6792807579040527, - "learning_rate": 1.0158465387823187e-05, - "loss": 0.7154, - "step": 3564 - }, - { - "epoch": 3.45, - "grad_norm": 1.318212866783142, - "learning_rate": 1.0154295246038367e-05, - "loss": 0.7312, - "step": 3565 - }, - { - "epoch": 3.45, - "grad_norm": 1.5390247106552124, - "learning_rate": 1.0150125104253545e-05, - "loss": 0.6338, - "step": 3566 - }, - { - "epoch": 3.45, - "grad_norm": 1.0180580615997314, - "learning_rate": 1.0145954962468725e-05, - "loss": 0.6228, - "step": 3567 - }, - { - "epoch": 3.45, - "grad_norm": 1.420424461364746, - "learning_rate": 1.0141784820683905e-05, - "loss": 0.7449, - "step": 3568 - }, - { - "epoch": 3.45, - "grad_norm": 1.2157551050186157, - "learning_rate": 1.0137614678899083e-05, - "loss": 0.8455, - "step": 3569 - }, - { - "epoch": 3.45, - "grad_norm": 1.3864554166793823, - "learning_rate": 1.0133444537114263e-05, - "loss": 0.754, - "step": 3570 - }, - { - "epoch": 3.46, - "grad_norm": 1.347893476486206, - "learning_rate": 1.0129274395329442e-05, - "loss": 0.8759, - "step": 3571 - }, - { - "epoch": 3.46, - "grad_norm": 1.1174273490905762, - "learning_rate": 1.012510425354462e-05, - "loss": 0.6399, - "step": 3572 - }, - { - "epoch": 3.46, - "grad_norm": 1.2639257907867432, - "learning_rate": 1.01209341117598e-05, - "loss": 0.6835, - "step": 3573 - }, - { - "epoch": 3.46, - "grad_norm": 1.1181992292404175, - "learning_rate": 1.011676396997498e-05, - "loss": 0.7897, - "step": 3574 - }, - { - "epoch": 3.46, - "grad_norm": 1.1306074857711792, - "learning_rate": 1.0112593828190158e-05, - "loss": 0.5676, - "step": 3575 - }, - { - "epoch": 3.46, - "grad_norm": 1.2635434865951538, - "learning_rate": 1.0108423686405339e-05, - "loss": 0.6779, - "step": 3576 - }, - { - "epoch": 3.46, - "grad_norm": 1.540344476699829, - "learning_rate": 1.0104253544620519e-05, - "loss": 0.6709, - "step": 3577 - }, - { - "epoch": 3.46, - "grad_norm": 1.293089509010315, - "learning_rate": 1.0100083402835697e-05, - "loss": 0.66, - "step": 3578 - }, - { - "epoch": 3.46, - "grad_norm": 1.7892770767211914, - "learning_rate": 1.0095913261050877e-05, - "loss": 0.8133, - "step": 3579 - }, - { - "epoch": 3.46, - "grad_norm": 1.2047173976898193, - "learning_rate": 1.0091743119266055e-05, - "loss": 0.8122, - "step": 3580 - }, - { - "epoch": 3.46, - "grad_norm": 1.4319950342178345, - "learning_rate": 1.0087572977481235e-05, - "loss": 0.6982, - "step": 3581 - }, - { - "epoch": 3.47, - "grad_norm": 1.3874304294586182, - "learning_rate": 1.0083402835696414e-05, - "loss": 0.7791, - "step": 3582 - }, - { - "epoch": 3.47, - "grad_norm": 1.123240351676941, - "learning_rate": 1.0079232693911594e-05, - "loss": 0.7105, - "step": 3583 - }, - { - "epoch": 3.47, - "grad_norm": 1.1747161149978638, - "learning_rate": 1.0075062552126772e-05, - "loss": 0.7439, - "step": 3584 - }, - { - "epoch": 3.47, - "grad_norm": 1.8504464626312256, - "learning_rate": 1.0070892410341952e-05, - "loss": 0.6902, - "step": 3585 - }, - { - "epoch": 3.47, - "grad_norm": 1.3045039176940918, - "learning_rate": 1.0066722268557132e-05, - "loss": 0.7246, - "step": 3586 - }, - { - "epoch": 3.47, - "grad_norm": 1.3498984575271606, - "learning_rate": 1.006255212677231e-05, - "loss": 0.6498, - "step": 3587 - }, - { - "epoch": 3.47, - "grad_norm": 1.1970775127410889, - "learning_rate": 1.005838198498749e-05, - "loss": 0.6152, - "step": 3588 - }, - { - "epoch": 3.47, - "grad_norm": 1.417137861251831, - "learning_rate": 1.005421184320267e-05, - "loss": 0.7811, - "step": 3589 - }, - { - "epoch": 3.47, - "grad_norm": 1.563003659248352, - "learning_rate": 1.0050041701417849e-05, - "loss": 0.6854, - "step": 3590 - }, - { - "epoch": 3.47, - "grad_norm": 1.1362507343292236, - "learning_rate": 1.0045871559633029e-05, - "loss": 0.6716, - "step": 3591 - }, - { - "epoch": 3.48, - "grad_norm": 1.1619484424591064, - "learning_rate": 1.0041701417848207e-05, - "loss": 0.7862, - "step": 3592 - }, - { - "epoch": 3.48, - "grad_norm": 1.3124399185180664, - "learning_rate": 1.0037531276063386e-05, - "loss": 0.6504, - "step": 3593 - }, - { - "epoch": 3.48, - "grad_norm": 1.0640451908111572, - "learning_rate": 1.0033361134278566e-05, - "loss": 0.7805, - "step": 3594 - }, - { - "epoch": 3.48, - "grad_norm": 1.3605332374572754, - "learning_rate": 1.0029190992493746e-05, - "loss": 0.777, - "step": 3595 - }, - { - "epoch": 3.48, - "grad_norm": 1.501366376876831, - "learning_rate": 1.0025020850708924e-05, - "loss": 0.7642, - "step": 3596 - }, - { - "epoch": 3.48, - "grad_norm": 1.334129810333252, - "learning_rate": 1.0020850708924104e-05, - "loss": 0.7327, - "step": 3597 - }, - { - "epoch": 3.48, - "grad_norm": 1.3075995445251465, - "learning_rate": 1.0016680567139284e-05, - "loss": 0.8363, - "step": 3598 - }, - { - "epoch": 3.48, - "grad_norm": 1.2604775428771973, - "learning_rate": 1.0012510425354462e-05, - "loss": 0.8748, - "step": 3599 - }, - { - "epoch": 3.48, - "grad_norm": 1.1842291355133057, - "learning_rate": 1.0008340283569642e-05, - "loss": 0.8248, - "step": 3600 - }, - { - "epoch": 3.48, - "eval_loss": 0.8547488451004028, - "eval_runtime": 860.9138, - "eval_samples_per_second": 4.802, - "eval_steps_per_second": 0.601, - "step": 3600 - }, - { - "epoch": 3.48, - "grad_norm": 1.398797631263733, - "learning_rate": 1.000417014178482e-05, - "loss": 0.7477, - "step": 3601 - }, - { - "epoch": 3.49, - "grad_norm": 1.5675469636917114, - "learning_rate": 1e-05, - "loss": 0.6492, - "step": 3602 - }, - { - "epoch": 3.49, - "grad_norm": 1.36086905002594, - "learning_rate": 9.995829858215181e-06, - "loss": 0.8345, - "step": 3603 - }, - { - "epoch": 3.49, - "grad_norm": 1.302552342414856, - "learning_rate": 9.99165971643036e-06, - "loss": 0.6947, - "step": 3604 - }, - { - "epoch": 3.49, - "grad_norm": 1.0472469329833984, - "learning_rate": 9.987489574645538e-06, - "loss": 0.7053, - "step": 3605 - }, - { - "epoch": 3.49, - "grad_norm": 1.2574175596237183, - "learning_rate": 9.983319432860718e-06, - "loss": 0.7126, - "step": 3606 - }, - { - "epoch": 3.49, - "grad_norm": 1.4842021465301514, - "learning_rate": 9.979149291075898e-06, - "loss": 0.7707, - "step": 3607 - }, - { - "epoch": 3.49, - "grad_norm": 1.20749831199646, - "learning_rate": 9.974979149291076e-06, - "loss": 0.6888, - "step": 3608 - }, - { - "epoch": 3.49, - "grad_norm": 1.147926926612854, - "learning_rate": 9.970809007506256e-06, - "loss": 0.5688, - "step": 3609 - }, - { - "epoch": 3.49, - "grad_norm": 1.2449201345443726, - "learning_rate": 9.966638865721436e-06, - "loss": 0.7079, - "step": 3610 - }, - { - "epoch": 3.49, - "grad_norm": 1.2286193370819092, - "learning_rate": 9.962468723936614e-06, - "loss": 0.7146, - "step": 3611 - }, - { - "epoch": 3.49, - "grad_norm": 1.2378441095352173, - "learning_rate": 9.958298582151794e-06, - "loss": 0.6854, - "step": 3612 - }, - { - "epoch": 3.5, - "grad_norm": 1.1023712158203125, - "learning_rate": 9.954128440366973e-06, - "loss": 0.6053, - "step": 3613 - }, - { - "epoch": 3.5, - "grad_norm": 1.265261173248291, - "learning_rate": 9.949958298582153e-06, - "loss": 0.9018, - "step": 3614 - }, - { - "epoch": 3.5, - "grad_norm": 1.1046671867370605, - "learning_rate": 9.945788156797331e-06, - "loss": 0.6952, - "step": 3615 - }, - { - "epoch": 3.5, - "grad_norm": 1.394481897354126, - "learning_rate": 9.941618015012511e-06, - "loss": 0.7404, - "step": 3616 - }, - { - "epoch": 3.5, - "grad_norm": 1.2353272438049316, - "learning_rate": 9.93744787322769e-06, - "loss": 0.7205, - "step": 3617 - }, - { - "epoch": 3.5, - "grad_norm": 1.3304924964904785, - "learning_rate": 9.93327773144287e-06, - "loss": 0.7631, - "step": 3618 - }, - { - "epoch": 3.5, - "grad_norm": 1.1558955907821655, - "learning_rate": 9.92910758965805e-06, - "loss": 0.7562, - "step": 3619 - }, - { - "epoch": 3.5, - "grad_norm": 1.4406038522720337, - "learning_rate": 9.924937447873228e-06, - "loss": 0.8024, - "step": 3620 - }, - { - "epoch": 3.5, - "grad_norm": 1.1018840074539185, - "learning_rate": 9.920767306088408e-06, - "loss": 0.6209, - "step": 3621 - }, - { - "epoch": 3.5, - "grad_norm": 1.299937129020691, - "learning_rate": 9.916597164303586e-06, - "loss": 0.775, - "step": 3622 - }, - { - "epoch": 3.51, - "grad_norm": 1.8240668773651123, - "learning_rate": 9.912427022518766e-06, - "loss": 0.7077, - "step": 3623 - }, - { - "epoch": 3.51, - "grad_norm": 1.3343193531036377, - "learning_rate": 9.908256880733946e-06, - "loss": 0.7171, - "step": 3624 - }, - { - "epoch": 3.51, - "grad_norm": 1.530676007270813, - "learning_rate": 9.904086738949125e-06, - "loss": 0.6222, - "step": 3625 - }, - { - "epoch": 3.51, - "grad_norm": 1.612051010131836, - "learning_rate": 9.899916597164303e-06, - "loss": 0.6966, - "step": 3626 - }, - { - "epoch": 3.51, - "grad_norm": 1.2645072937011719, - "learning_rate": 9.895746455379483e-06, - "loss": 0.6714, - "step": 3627 - }, - { - "epoch": 3.51, - "grad_norm": 0.970465898513794, - "learning_rate": 9.891576313594663e-06, - "loss": 0.7006, - "step": 3628 - }, - { - "epoch": 3.51, - "grad_norm": 1.1989374160766602, - "learning_rate": 9.887406171809841e-06, - "loss": 0.6727, - "step": 3629 - }, - { - "epoch": 3.51, - "grad_norm": 1.3782232999801636, - "learning_rate": 9.883236030025021e-06, - "loss": 0.922, - "step": 3630 - }, - { - "epoch": 3.51, - "grad_norm": 1.1525388956069946, - "learning_rate": 9.879065888240202e-06, - "loss": 0.8822, - "step": 3631 - }, - { - "epoch": 3.51, - "grad_norm": 1.0232452154159546, - "learning_rate": 9.87489574645538e-06, - "loss": 0.6676, - "step": 3632 - }, - { - "epoch": 3.52, - "grad_norm": 1.1694217920303345, - "learning_rate": 9.87072560467056e-06, - "loss": 0.8334, - "step": 3633 - }, - { - "epoch": 3.52, - "grad_norm": 1.4995096921920776, - "learning_rate": 9.866555462885738e-06, - "loss": 0.6476, - "step": 3634 - }, - { - "epoch": 3.52, - "grad_norm": 1.2373857498168945, - "learning_rate": 9.862385321100918e-06, - "loss": 0.727, - "step": 3635 - }, - { - "epoch": 3.52, - "grad_norm": 1.32621169090271, - "learning_rate": 9.858215179316098e-06, - "loss": 0.6985, - "step": 3636 - }, - { - "epoch": 3.52, - "grad_norm": 1.0579652786254883, - "learning_rate": 9.854045037531277e-06, - "loss": 0.6741, - "step": 3637 - }, - { - "epoch": 3.52, - "grad_norm": 1.0375930070877075, - "learning_rate": 9.849874895746455e-06, - "loss": 0.5241, - "step": 3638 - }, - { - "epoch": 3.52, - "grad_norm": 1.08954918384552, - "learning_rate": 9.845704753961635e-06, - "loss": 0.6364, - "step": 3639 - }, - { - "epoch": 3.52, - "grad_norm": 1.3299015760421753, - "learning_rate": 9.841534612176815e-06, - "loss": 0.7207, - "step": 3640 - }, - { - "epoch": 3.52, - "grad_norm": 1.180511236190796, - "learning_rate": 9.837364470391993e-06, - "loss": 0.8255, - "step": 3641 - }, - { - "epoch": 3.52, - "grad_norm": 1.2076964378356934, - "learning_rate": 9.833194328607173e-06, - "loss": 0.6078, - "step": 3642 - }, - { - "epoch": 3.52, - "grad_norm": 1.210663080215454, - "learning_rate": 9.829024186822352e-06, - "loss": 0.7695, - "step": 3643 - }, - { - "epoch": 3.53, - "grad_norm": 1.2857632637023926, - "learning_rate": 9.824854045037532e-06, - "loss": 0.6456, - "step": 3644 - }, - { - "epoch": 3.53, - "grad_norm": 1.383831262588501, - "learning_rate": 9.820683903252712e-06, - "loss": 0.6638, - "step": 3645 - }, - { - "epoch": 3.53, - "grad_norm": 1.1777606010437012, - "learning_rate": 9.81651376146789e-06, - "loss": 0.7401, - "step": 3646 - }, - { - "epoch": 3.53, - "grad_norm": 1.6303627490997314, - "learning_rate": 9.81234361968307e-06, - "loss": 0.6611, - "step": 3647 - }, - { - "epoch": 3.53, - "grad_norm": 1.566899061203003, - "learning_rate": 9.808173477898249e-06, - "loss": 0.6503, - "step": 3648 - }, - { - "epoch": 3.53, - "grad_norm": 1.554189920425415, - "learning_rate": 9.804003336113429e-06, - "loss": 0.7261, - "step": 3649 - }, - { - "epoch": 3.53, - "grad_norm": 1.2478817701339722, - "learning_rate": 9.799833194328607e-06, - "loss": 0.786, - "step": 3650 - }, - { - "epoch": 3.53, - "grad_norm": 1.0713328123092651, - "learning_rate": 9.795663052543787e-06, - "loss": 0.6749, - "step": 3651 - }, - { - "epoch": 3.53, - "grad_norm": 1.0255318880081177, - "learning_rate": 9.791492910758967e-06, - "loss": 0.6317, - "step": 3652 - }, - { - "epoch": 3.53, - "grad_norm": 1.5766685009002686, - "learning_rate": 9.787322768974145e-06, - "loss": 0.7716, - "step": 3653 - }, - { - "epoch": 3.54, - "grad_norm": 1.2392951250076294, - "learning_rate": 9.783152627189325e-06, - "loss": 0.7043, - "step": 3654 - }, - { - "epoch": 3.54, - "grad_norm": 1.043566107749939, - "learning_rate": 9.778982485404504e-06, - "loss": 0.5966, - "step": 3655 - }, - { - "epoch": 3.54, - "grad_norm": 1.1397144794464111, - "learning_rate": 9.774812343619684e-06, - "loss": 0.7369, - "step": 3656 - }, - { - "epoch": 3.54, - "grad_norm": 1.215667486190796, - "learning_rate": 9.770642201834864e-06, - "loss": 0.7371, - "step": 3657 - }, - { - "epoch": 3.54, - "grad_norm": 1.9744904041290283, - "learning_rate": 9.766472060050042e-06, - "loss": 0.7856, - "step": 3658 - }, - { - "epoch": 3.54, - "grad_norm": 1.0514297485351562, - "learning_rate": 9.76230191826522e-06, - "loss": 0.7465, - "step": 3659 - }, - { - "epoch": 3.54, - "grad_norm": 1.0875815153121948, - "learning_rate": 9.7581317764804e-06, - "loss": 0.6965, - "step": 3660 - }, - { - "epoch": 3.54, - "grad_norm": 1.1832997798919678, - "learning_rate": 9.75396163469558e-06, - "loss": 0.6994, - "step": 3661 - }, - { - "epoch": 3.54, - "grad_norm": 1.108242392539978, - "learning_rate": 9.749791492910759e-06, - "loss": 0.685, - "step": 3662 - }, - { - "epoch": 3.54, - "grad_norm": 1.3324147462844849, - "learning_rate": 9.745621351125939e-06, - "loss": 0.6214, - "step": 3663 - }, - { - "epoch": 3.55, - "grad_norm": 1.1734163761138916, - "learning_rate": 9.741451209341117e-06, - "loss": 0.7048, - "step": 3664 - }, - { - "epoch": 3.55, - "grad_norm": 1.1073940992355347, - "learning_rate": 9.737281067556297e-06, - "loss": 0.5457, - "step": 3665 - }, - { - "epoch": 3.55, - "grad_norm": 1.361920714378357, - "learning_rate": 9.733110925771477e-06, - "loss": 0.8382, - "step": 3666 - }, - { - "epoch": 3.55, - "grad_norm": 1.220529317855835, - "learning_rate": 9.728940783986656e-06, - "loss": 0.5753, - "step": 3667 - }, - { - "epoch": 3.55, - "grad_norm": 1.112630009651184, - "learning_rate": 9.724770642201836e-06, - "loss": 0.6776, - "step": 3668 - }, - { - "epoch": 3.55, - "grad_norm": 1.415000557899475, - "learning_rate": 9.720600500417016e-06, - "loss": 0.7272, - "step": 3669 - }, - { - "epoch": 3.55, - "grad_norm": 1.3768407106399536, - "learning_rate": 9.716430358632194e-06, - "loss": 0.9133, - "step": 3670 - }, - { - "epoch": 3.55, - "grad_norm": 1.0739002227783203, - "learning_rate": 9.712260216847372e-06, - "loss": 0.6825, - "step": 3671 - }, - { - "epoch": 3.55, - "grad_norm": 1.6859723329544067, - "learning_rate": 9.708090075062552e-06, - "loss": 0.6812, - "step": 3672 - }, - { - "epoch": 3.55, - "grad_norm": 1.2905598878860474, - "learning_rate": 9.703919933277733e-06, - "loss": 0.6885, - "step": 3673 - }, - { - "epoch": 3.55, - "grad_norm": 1.417656660079956, - "learning_rate": 9.699749791492911e-06, - "loss": 0.6986, - "step": 3674 - }, - { - "epoch": 3.56, - "grad_norm": 1.1009142398834229, - "learning_rate": 9.695579649708091e-06, - "loss": 0.6903, - "step": 3675 - }, - { - "epoch": 3.56, - "grad_norm": 1.9833390712738037, - "learning_rate": 9.69140950792327e-06, - "loss": 0.7584, - "step": 3676 - }, - { - "epoch": 3.56, - "grad_norm": 1.8520958423614502, - "learning_rate": 9.68723936613845e-06, - "loss": 0.6074, - "step": 3677 - }, - { - "epoch": 3.56, - "grad_norm": 1.2613887786865234, - "learning_rate": 9.68306922435363e-06, - "loss": 0.7776, - "step": 3678 - }, - { - "epoch": 3.56, - "grad_norm": 1.8603888750076294, - "learning_rate": 9.678899082568808e-06, - "loss": 0.8068, - "step": 3679 - }, - { - "epoch": 3.56, - "grad_norm": 1.218383550643921, - "learning_rate": 9.674728940783988e-06, - "loss": 0.8195, - "step": 3680 - }, - { - "epoch": 3.56, - "grad_norm": 1.3218454122543335, - "learning_rate": 9.670558798999166e-06, - "loss": 0.7639, - "step": 3681 - }, - { - "epoch": 3.56, - "grad_norm": 1.1455937623977661, - "learning_rate": 9.666388657214346e-06, - "loss": 0.5993, - "step": 3682 - }, - { - "epoch": 3.56, - "grad_norm": 1.2614988088607788, - "learning_rate": 9.662218515429524e-06, - "loss": 0.5664, - "step": 3683 - }, - { - "epoch": 3.56, - "grad_norm": 1.880332589149475, - "learning_rate": 9.658048373644704e-06, - "loss": 0.7618, - "step": 3684 - }, - { - "epoch": 3.57, - "grad_norm": 1.4358258247375488, - "learning_rate": 9.653878231859883e-06, - "loss": 0.9282, - "step": 3685 - }, - { - "epoch": 3.57, - "grad_norm": 1.2800836563110352, - "learning_rate": 9.649708090075063e-06, - "loss": 0.8026, - "step": 3686 - }, - { - "epoch": 3.57, - "grad_norm": 1.3461788892745972, - "learning_rate": 9.645537948290243e-06, - "loss": 0.7689, - "step": 3687 - }, - { - "epoch": 3.57, - "grad_norm": 1.6501184701919556, - "learning_rate": 9.641367806505421e-06, - "loss": 0.6133, - "step": 3688 - }, - { - "epoch": 3.57, - "grad_norm": 1.2736846208572388, - "learning_rate": 9.637197664720601e-06, - "loss": 0.8256, - "step": 3689 - }, - { - "epoch": 3.57, - "grad_norm": 1.0804342031478882, - "learning_rate": 9.633027522935781e-06, - "loss": 0.645, - "step": 3690 - }, - { - "epoch": 3.57, - "grad_norm": 1.1882388591766357, - "learning_rate": 9.62885738115096e-06, - "loss": 0.738, - "step": 3691 - }, - { - "epoch": 3.57, - "grad_norm": 1.4110417366027832, - "learning_rate": 9.624687239366138e-06, - "loss": 0.6724, - "step": 3692 - }, - { - "epoch": 3.57, - "grad_norm": 1.164905071258545, - "learning_rate": 9.620517097581318e-06, - "loss": 0.6826, - "step": 3693 - }, - { - "epoch": 3.57, - "grad_norm": 1.250449776649475, - "learning_rate": 9.616346955796498e-06, - "loss": 0.7408, - "step": 3694 - }, - { - "epoch": 3.58, - "grad_norm": 1.728674054145813, - "learning_rate": 9.612176814011676e-06, - "loss": 0.9098, - "step": 3695 - }, - { - "epoch": 3.58, - "grad_norm": 1.9090059995651245, - "learning_rate": 9.608006672226856e-06, - "loss": 0.7101, - "step": 3696 - }, - { - "epoch": 3.58, - "grad_norm": 1.120461106300354, - "learning_rate": 9.603836530442035e-06, - "loss": 0.6108, - "step": 3697 - }, - { - "epoch": 3.58, - "grad_norm": 1.108109951019287, - "learning_rate": 9.599666388657215e-06, - "loss": 0.7364, - "step": 3698 - }, - { - "epoch": 3.58, - "grad_norm": 1.2246441841125488, - "learning_rate": 9.595496246872395e-06, - "loss": 0.8161, - "step": 3699 - }, - { - "epoch": 3.58, - "grad_norm": 1.3570644855499268, - "learning_rate": 9.591326105087573e-06, - "loss": 0.6529, - "step": 3700 - }, - { - "epoch": 3.58, - "grad_norm": 1.089701533317566, - "learning_rate": 9.587155963302753e-06, - "loss": 0.7329, - "step": 3701 - }, - { - "epoch": 3.58, - "grad_norm": 1.3717594146728516, - "learning_rate": 9.582985821517933e-06, - "loss": 0.7759, - "step": 3702 - }, - { - "epoch": 3.58, - "grad_norm": 1.5683867931365967, - "learning_rate": 9.578815679733112e-06, - "loss": 0.7888, - "step": 3703 - }, - { - "epoch": 3.58, - "grad_norm": 1.0901628732681274, - "learning_rate": 9.57464553794829e-06, - "loss": 0.6703, - "step": 3704 - }, - { - "epoch": 3.58, - "grad_norm": 1.1070640087127686, - "learning_rate": 9.57047539616347e-06, - "loss": 0.8831, - "step": 3705 - }, - { - "epoch": 3.59, - "grad_norm": 1.2490179538726807, - "learning_rate": 9.566305254378648e-06, - "loss": 0.6944, - "step": 3706 - }, - { - "epoch": 3.59, - "grad_norm": 1.61417818069458, - "learning_rate": 9.562135112593828e-06, - "loss": 0.8053, - "step": 3707 - }, - { - "epoch": 3.59, - "grad_norm": 1.1403989791870117, - "learning_rate": 9.557964970809008e-06, - "loss": 0.5259, - "step": 3708 - }, - { - "epoch": 3.59, - "grad_norm": 1.4693373441696167, - "learning_rate": 9.553794829024187e-06, - "loss": 0.7022, - "step": 3709 - }, - { - "epoch": 3.59, - "grad_norm": 1.109744668006897, - "learning_rate": 9.549624687239367e-06, - "loss": 0.8134, - "step": 3710 - }, - { - "epoch": 3.59, - "grad_norm": 1.321455955505371, - "learning_rate": 9.545454545454547e-06, - "loss": 0.7871, - "step": 3711 - }, - { - "epoch": 3.59, - "grad_norm": 1.0245014429092407, - "learning_rate": 9.541284403669725e-06, - "loss": 0.5754, - "step": 3712 - }, - { - "epoch": 3.59, - "grad_norm": 1.4864879846572876, - "learning_rate": 9.537114261884905e-06, - "loss": 0.7603, - "step": 3713 - }, - { - "epoch": 3.59, - "grad_norm": 1.1182165145874023, - "learning_rate": 9.532944120100083e-06, - "loss": 0.7357, - "step": 3714 - }, - { - "epoch": 3.59, - "grad_norm": 1.3495372533798218, - "learning_rate": 9.528773978315264e-06, - "loss": 0.7678, - "step": 3715 - }, - { - "epoch": 3.6, - "grad_norm": 1.2772372961044312, - "learning_rate": 9.524603836530442e-06, - "loss": 0.655, - "step": 3716 - }, - { - "epoch": 3.6, - "grad_norm": 1.4288204908370972, - "learning_rate": 9.520433694745622e-06, - "loss": 0.8159, - "step": 3717 - }, - { - "epoch": 3.6, - "grad_norm": 1.1076468229293823, - "learning_rate": 9.5162635529608e-06, - "loss": 0.7313, - "step": 3718 - }, - { - "epoch": 3.6, - "grad_norm": 1.0478272438049316, - "learning_rate": 9.51209341117598e-06, - "loss": 0.6276, - "step": 3719 - }, - { - "epoch": 3.6, - "grad_norm": 1.0887244939804077, - "learning_rate": 9.50792326939116e-06, - "loss": 0.4719, - "step": 3720 - }, - { - "epoch": 3.6, - "grad_norm": 1.290826678276062, - "learning_rate": 9.503753127606339e-06, - "loss": 0.7153, - "step": 3721 - }, - { - "epoch": 3.6, - "grad_norm": 1.2441591024398804, - "learning_rate": 9.499582985821519e-06, - "loss": 0.5785, - "step": 3722 - }, - { - "epoch": 3.6, - "grad_norm": 1.3765219449996948, - "learning_rate": 9.495412844036699e-06, - "loss": 0.7279, - "step": 3723 - }, - { - "epoch": 3.6, - "grad_norm": 1.2894562482833862, - "learning_rate": 9.491242702251877e-06, - "loss": 0.6829, - "step": 3724 - }, - { - "epoch": 3.6, - "grad_norm": 1.8220261335372925, - "learning_rate": 9.487072560467055e-06, - "loss": 0.6256, - "step": 3725 - }, - { - "epoch": 3.61, - "grad_norm": 1.4330283403396606, - "learning_rate": 9.482902418682235e-06, - "loss": 0.6358, - "step": 3726 - }, - { - "epoch": 3.61, - "grad_norm": 1.0621377229690552, - "learning_rate": 9.478732276897414e-06, - "loss": 0.6017, - "step": 3727 - }, - { - "epoch": 3.61, - "grad_norm": 1.6693440675735474, - "learning_rate": 9.474562135112594e-06, - "loss": 0.6218, - "step": 3728 - }, - { - "epoch": 3.61, - "grad_norm": 1.2444632053375244, - "learning_rate": 9.470391993327774e-06, - "loss": 0.8163, - "step": 3729 - }, - { - "epoch": 3.61, - "grad_norm": 1.4920762777328491, - "learning_rate": 9.466221851542952e-06, - "loss": 0.8798, - "step": 3730 - }, - { - "epoch": 3.61, - "grad_norm": 1.4989265203475952, - "learning_rate": 9.462051709758132e-06, - "loss": 0.6288, - "step": 3731 - }, - { - "epoch": 3.61, - "grad_norm": 1.5583096742630005, - "learning_rate": 9.457881567973312e-06, - "loss": 0.7625, - "step": 3732 - }, - { - "epoch": 3.61, - "grad_norm": 1.2095603942871094, - "learning_rate": 9.45371142618849e-06, - "loss": 0.7579, - "step": 3733 - }, - { - "epoch": 3.61, - "grad_norm": 1.6411110162734985, - "learning_rate": 9.44954128440367e-06, - "loss": 0.7544, - "step": 3734 - }, - { - "epoch": 3.61, - "grad_norm": 1.5525989532470703, - "learning_rate": 9.44537114261885e-06, - "loss": 0.6798, - "step": 3735 - }, - { - "epoch": 3.61, - "grad_norm": 1.4102756977081299, - "learning_rate": 9.441201000834029e-06, - "loss": 0.6924, - "step": 3736 - }, - { - "epoch": 3.62, - "grad_norm": 1.1991088390350342, - "learning_rate": 9.437030859049207e-06, - "loss": 0.8546, - "step": 3737 - }, - { - "epoch": 3.62, - "grad_norm": 1.2599495649337769, - "learning_rate": 9.432860717264387e-06, - "loss": 0.8464, - "step": 3738 - }, - { - "epoch": 3.62, - "grad_norm": 1.1184576749801636, - "learning_rate": 9.428690575479566e-06, - "loss": 0.6982, - "step": 3739 - }, - { - "epoch": 3.62, - "grad_norm": 1.1469248533248901, - "learning_rate": 9.424520433694746e-06, - "loss": 0.579, - "step": 3740 - }, - { - "epoch": 3.62, - "grad_norm": 1.3901900053024292, - "learning_rate": 9.420350291909926e-06, - "loss": 0.583, - "step": 3741 - }, - { - "epoch": 3.62, - "grad_norm": 1.032654881477356, - "learning_rate": 9.416180150125104e-06, - "loss": 0.7967, - "step": 3742 - }, - { - "epoch": 3.62, - "grad_norm": 1.558066487312317, - "learning_rate": 9.412010008340284e-06, - "loss": 0.8004, - "step": 3743 - }, - { - "epoch": 3.62, - "grad_norm": 1.0906705856323242, - "learning_rate": 9.407839866555464e-06, - "loss": 0.6494, - "step": 3744 - }, - { - "epoch": 3.62, - "grad_norm": 1.29024076461792, - "learning_rate": 9.403669724770643e-06, - "loss": 0.8209, - "step": 3745 - }, - { - "epoch": 3.62, - "grad_norm": 1.4281020164489746, - "learning_rate": 9.399499582985823e-06, - "loss": 0.7458, - "step": 3746 - }, - { - "epoch": 3.63, - "grad_norm": 1.579883337020874, - "learning_rate": 9.395329441201001e-06, - "loss": 0.7569, - "step": 3747 - }, - { - "epoch": 3.63, - "grad_norm": 1.3795017004013062, - "learning_rate": 9.39115929941618e-06, - "loss": 0.7465, - "step": 3748 - }, - { - "epoch": 3.63, - "grad_norm": 1.039727807044983, - "learning_rate": 9.38698915763136e-06, - "loss": 0.6091, - "step": 3749 - }, - { - "epoch": 3.63, - "grad_norm": 1.275769591331482, - "learning_rate": 9.38281901584654e-06, - "loss": 0.8901, - "step": 3750 - }, - { - "epoch": 3.63, - "grad_norm": 0.8903956413269043, - "learning_rate": 9.378648874061718e-06, - "loss": 0.6723, - "step": 3751 - }, - { - "epoch": 3.63, - "grad_norm": 1.2578043937683105, - "learning_rate": 9.374478732276898e-06, - "loss": 0.7633, - "step": 3752 - }, - { - "epoch": 3.63, - "grad_norm": 1.2086877822875977, - "learning_rate": 9.370308590492078e-06, - "loss": 0.7965, - "step": 3753 - }, - { - "epoch": 3.63, - "grad_norm": 1.6043813228607178, - "learning_rate": 9.366138448707256e-06, - "loss": 0.6879, - "step": 3754 - }, - { - "epoch": 3.63, - "grad_norm": 1.2092528343200684, - "learning_rate": 9.361968306922436e-06, - "loss": 0.7433, - "step": 3755 - }, - { - "epoch": 3.63, - "grad_norm": 1.9154289960861206, - "learning_rate": 9.357798165137616e-06, - "loss": 0.7641, - "step": 3756 - }, - { - "epoch": 3.64, - "grad_norm": 1.324974775314331, - "learning_rate": 9.353628023352795e-06, - "loss": 0.7824, - "step": 3757 - }, - { - "epoch": 3.64, - "grad_norm": 1.2821885347366333, - "learning_rate": 9.349457881567973e-06, - "loss": 0.769, - "step": 3758 - }, - { - "epoch": 3.64, - "grad_norm": 1.1568022966384888, - "learning_rate": 9.345287739783153e-06, - "loss": 0.6623, - "step": 3759 - }, - { - "epoch": 3.64, - "grad_norm": 1.1679307222366333, - "learning_rate": 9.341117597998331e-06, - "loss": 0.7793, - "step": 3760 - }, - { - "epoch": 3.64, - "grad_norm": 1.2440667152404785, - "learning_rate": 9.336947456213511e-06, - "loss": 0.8573, - "step": 3761 - }, - { - "epoch": 3.64, - "grad_norm": 1.2248564958572388, - "learning_rate": 9.332777314428691e-06, - "loss": 0.7017, - "step": 3762 - }, - { - "epoch": 3.64, - "grad_norm": 1.140850305557251, - "learning_rate": 9.32860717264387e-06, - "loss": 0.741, - "step": 3763 - }, - { - "epoch": 3.64, - "grad_norm": 1.4085427522659302, - "learning_rate": 9.32443703085905e-06, - "loss": 0.7385, - "step": 3764 - }, - { - "epoch": 3.64, - "grad_norm": 1.1393663883209229, - "learning_rate": 9.32026688907423e-06, - "loss": 0.6256, - "step": 3765 - }, - { - "epoch": 3.64, - "grad_norm": 1.5232007503509521, - "learning_rate": 9.316096747289408e-06, - "loss": 0.7036, - "step": 3766 - }, - { - "epoch": 3.64, - "grad_norm": 1.3290067911148071, - "learning_rate": 9.311926605504588e-06, - "loss": 0.6962, - "step": 3767 - }, - { - "epoch": 3.65, - "grad_norm": 1.3733572959899902, - "learning_rate": 9.307756463719768e-06, - "loss": 0.7121, - "step": 3768 - }, - { - "epoch": 3.65, - "grad_norm": 1.4415980577468872, - "learning_rate": 9.303586321934945e-06, - "loss": 0.7647, - "step": 3769 - }, - { - "epoch": 3.65, - "grad_norm": 1.3391342163085938, - "learning_rate": 9.299416180150125e-06, - "loss": 0.768, - "step": 3770 - }, - { - "epoch": 3.65, - "grad_norm": 1.2893701791763306, - "learning_rate": 9.295246038365305e-06, - "loss": 0.667, - "step": 3771 - }, - { - "epoch": 3.65, - "grad_norm": 1.296198844909668, - "learning_rate": 9.291075896580483e-06, - "loss": 0.7745, - "step": 3772 - }, - { - "epoch": 3.65, - "grad_norm": 1.2993522882461548, - "learning_rate": 9.286905754795663e-06, - "loss": 0.8202, - "step": 3773 - }, - { - "epoch": 3.65, - "grad_norm": 1.1732268333435059, - "learning_rate": 9.282735613010843e-06, - "loss": 0.9365, - "step": 3774 - }, - { - "epoch": 3.65, - "grad_norm": 1.590893268585205, - "learning_rate": 9.278565471226022e-06, - "loss": 0.7263, - "step": 3775 - }, - { - "epoch": 3.65, - "grad_norm": 1.3125766515731812, - "learning_rate": 9.274395329441202e-06, - "loss": 0.8581, - "step": 3776 - }, - { - "epoch": 3.65, - "grad_norm": 1.445427417755127, - "learning_rate": 9.270225187656382e-06, - "loss": 0.687, - "step": 3777 - }, - { - "epoch": 3.66, - "grad_norm": 1.2969833612442017, - "learning_rate": 9.26605504587156e-06, - "loss": 0.7846, - "step": 3778 - }, - { - "epoch": 3.66, - "grad_norm": 1.3622808456420898, - "learning_rate": 9.26188490408674e-06, - "loss": 0.6802, - "step": 3779 - }, - { - "epoch": 3.66, - "grad_norm": 1.2291429042816162, - "learning_rate": 9.257714762301918e-06, - "loss": 0.7613, - "step": 3780 - }, - { - "epoch": 3.66, - "grad_norm": 1.2298214435577393, - "learning_rate": 9.253544620517097e-06, - "loss": 0.8673, - "step": 3781 - }, - { - "epoch": 3.66, - "grad_norm": 1.1689229011535645, - "learning_rate": 9.249374478732277e-06, - "loss": 0.8588, - "step": 3782 - }, - { - "epoch": 3.66, - "grad_norm": 1.2919254302978516, - "learning_rate": 9.245204336947457e-06, - "loss": 0.9037, - "step": 3783 - }, - { - "epoch": 3.66, - "grad_norm": 0.9366986155509949, - "learning_rate": 9.241034195162635e-06, - "loss": 0.6829, - "step": 3784 - }, - { - "epoch": 3.66, - "grad_norm": 1.0638045072555542, - "learning_rate": 9.236864053377815e-06, - "loss": 0.6312, - "step": 3785 - }, - { - "epoch": 3.66, - "grad_norm": 1.3222174644470215, - "learning_rate": 9.232693911592995e-06, - "loss": 0.7944, - "step": 3786 - }, - { - "epoch": 3.66, - "grad_norm": 1.158786654472351, - "learning_rate": 9.228523769808174e-06, - "loss": 0.7255, - "step": 3787 - }, - { - "epoch": 3.67, - "grad_norm": 1.1780650615692139, - "learning_rate": 9.224353628023354e-06, - "loss": 0.7522, - "step": 3788 - }, - { - "epoch": 3.67, - "grad_norm": 1.5175025463104248, - "learning_rate": 9.220183486238534e-06, - "loss": 0.8647, - "step": 3789 - }, - { - "epoch": 3.67, - "grad_norm": 1.585942268371582, - "learning_rate": 9.216013344453712e-06, - "loss": 0.6549, - "step": 3790 - }, - { - "epoch": 3.67, - "grad_norm": 1.1129939556121826, - "learning_rate": 9.21184320266889e-06, - "loss": 0.7661, - "step": 3791 - }, - { - "epoch": 3.67, - "grad_norm": 1.2599798440933228, - "learning_rate": 9.20767306088407e-06, - "loss": 0.8126, - "step": 3792 - }, - { - "epoch": 3.67, - "grad_norm": 1.6986596584320068, - "learning_rate": 9.203502919099249e-06, - "loss": 0.711, - "step": 3793 - }, - { - "epoch": 3.67, - "grad_norm": 1.230600357055664, - "learning_rate": 9.199332777314429e-06, - "loss": 0.8057, - "step": 3794 - }, - { - "epoch": 3.67, - "grad_norm": 1.1111856698989868, - "learning_rate": 9.195162635529609e-06, - "loss": 0.6925, - "step": 3795 - }, - { - "epoch": 3.67, - "grad_norm": 1.2809500694274902, - "learning_rate": 9.190992493744787e-06, - "loss": 0.701, - "step": 3796 - }, - { - "epoch": 3.67, - "grad_norm": 1.0989046096801758, - "learning_rate": 9.186822351959967e-06, - "loss": 0.62, - "step": 3797 - }, - { - "epoch": 3.67, - "grad_norm": 1.066048264503479, - "learning_rate": 9.182652210175147e-06, - "loss": 0.7419, - "step": 3798 - }, - { - "epoch": 3.68, - "grad_norm": 1.0855743885040283, - "learning_rate": 9.178482068390326e-06, - "loss": 0.7767, - "step": 3799 - }, - { - "epoch": 3.68, - "grad_norm": 1.243959903717041, - "learning_rate": 9.174311926605506e-06, - "loss": 0.8384, - "step": 3800 - }, - { - "epoch": 3.68, - "grad_norm": 1.0355478525161743, - "learning_rate": 9.170141784820686e-06, - "loss": 0.7668, - "step": 3801 - }, - { - "epoch": 3.68, - "grad_norm": 1.559926152229309, - "learning_rate": 9.165971643035862e-06, - "loss": 0.7143, - "step": 3802 - }, - { - "epoch": 3.68, - "grad_norm": 1.1760704517364502, - "learning_rate": 9.161801501251042e-06, - "loss": 0.8026, - "step": 3803 - }, - { - "epoch": 3.68, - "grad_norm": 1.2300641536712646, - "learning_rate": 9.157631359466222e-06, - "loss": 0.7846, - "step": 3804 - }, - { - "epoch": 3.68, - "grad_norm": 1.4027724266052246, - "learning_rate": 9.1534612176814e-06, - "loss": 0.7137, - "step": 3805 - }, - { - "epoch": 3.68, - "grad_norm": 1.0805466175079346, - "learning_rate": 9.14929107589658e-06, - "loss": 0.6632, - "step": 3806 - }, - { - "epoch": 3.68, - "grad_norm": 1.2802335023880005, - "learning_rate": 9.14512093411176e-06, - "loss": 0.8155, - "step": 3807 - }, - { - "epoch": 3.68, - "grad_norm": 1.4101649522781372, - "learning_rate": 9.140950792326939e-06, - "loss": 0.7731, - "step": 3808 - }, - { - "epoch": 3.69, - "grad_norm": 1.4881744384765625, - "learning_rate": 9.136780650542119e-06, - "loss": 0.7518, - "step": 3809 - }, - { - "epoch": 3.69, - "grad_norm": 1.190462589263916, - "learning_rate": 9.132610508757299e-06, - "loss": 0.6099, - "step": 3810 - }, - { - "epoch": 3.69, - "grad_norm": 1.277464509010315, - "learning_rate": 9.128440366972477e-06, - "loss": 0.7867, - "step": 3811 - }, - { - "epoch": 3.69, - "grad_norm": 1.3371671438217163, - "learning_rate": 9.124270225187658e-06, - "loss": 0.7767, - "step": 3812 - }, - { - "epoch": 3.69, - "grad_norm": 1.0814464092254639, - "learning_rate": 9.120100083402836e-06, - "loss": 0.6586, - "step": 3813 - }, - { - "epoch": 3.69, - "grad_norm": 1.6695940494537354, - "learning_rate": 9.115929941618014e-06, - "loss": 0.659, - "step": 3814 - }, - { - "epoch": 3.69, - "grad_norm": 1.0612856149673462, - "learning_rate": 9.111759799833194e-06, - "loss": 0.5992, - "step": 3815 - }, - { - "epoch": 3.69, - "grad_norm": 1.6444050073623657, - "learning_rate": 9.107589658048374e-06, - "loss": 0.6657, - "step": 3816 - }, - { - "epoch": 3.69, - "grad_norm": 1.5341229438781738, - "learning_rate": 9.103419516263553e-06, - "loss": 0.6647, - "step": 3817 - }, - { - "epoch": 3.69, - "grad_norm": 1.2004408836364746, - "learning_rate": 9.099249374478733e-06, - "loss": 0.7232, - "step": 3818 - }, - { - "epoch": 3.7, - "grad_norm": 1.0692825317382812, - "learning_rate": 9.095079232693913e-06, - "loss": 0.7102, - "step": 3819 - }, - { - "epoch": 3.7, - "grad_norm": 1.3460806608200073, - "learning_rate": 9.090909090909091e-06, - "loss": 0.707, - "step": 3820 - }, - { - "epoch": 3.7, - "grad_norm": 1.273742914199829, - "learning_rate": 9.086738949124271e-06, - "loss": 0.7478, - "step": 3821 - }, - { - "epoch": 3.7, - "grad_norm": 2.021270275115967, - "learning_rate": 9.082568807339451e-06, - "loss": 0.812, - "step": 3822 - }, - { - "epoch": 3.7, - "grad_norm": 1.2000283002853394, - "learning_rate": 9.07839866555463e-06, - "loss": 0.6991, - "step": 3823 - }, - { - "epoch": 3.7, - "grad_norm": 1.7958741188049316, - "learning_rate": 9.074228523769808e-06, - "loss": 0.6708, - "step": 3824 - }, - { - "epoch": 3.7, - "grad_norm": 0.9178566336631775, - "learning_rate": 9.070058381984988e-06, - "loss": 0.7075, - "step": 3825 - }, - { - "epoch": 3.7, - "grad_norm": 1.2584969997406006, - "learning_rate": 9.065888240200166e-06, - "loss": 0.629, - "step": 3826 - }, - { - "epoch": 3.7, - "grad_norm": 1.2469345331192017, - "learning_rate": 9.061718098415346e-06, - "loss": 0.7066, - "step": 3827 - }, - { - "epoch": 3.7, - "grad_norm": 1.5744034051895142, - "learning_rate": 9.057547956630526e-06, - "loss": 0.7953, - "step": 3828 - }, - { - "epoch": 3.7, - "grad_norm": 1.2157225608825684, - "learning_rate": 9.053377814845705e-06, - "loss": 0.7811, - "step": 3829 - }, - { - "epoch": 3.71, - "grad_norm": 1.3481709957122803, - "learning_rate": 9.049207673060885e-06, - "loss": 0.7233, - "step": 3830 - }, - { - "epoch": 3.71, - "grad_norm": 1.3199782371520996, - "learning_rate": 9.045037531276065e-06, - "loss": 0.8076, - "step": 3831 - }, - { - "epoch": 3.71, - "grad_norm": 1.0716701745986938, - "learning_rate": 9.040867389491243e-06, - "loss": 0.6526, - "step": 3832 - }, - { - "epoch": 3.71, - "grad_norm": 1.422723650932312, - "learning_rate": 9.036697247706423e-06, - "loss": 0.616, - "step": 3833 - }, - { - "epoch": 3.71, - "grad_norm": 1.0690348148345947, - "learning_rate": 9.032527105921603e-06, - "loss": 0.6838, - "step": 3834 - }, - { - "epoch": 3.71, - "grad_norm": 1.8414225578308105, - "learning_rate": 9.02835696413678e-06, - "loss": 0.7595, - "step": 3835 - }, - { - "epoch": 3.71, - "grad_norm": 1.3578447103500366, - "learning_rate": 9.02418682235196e-06, - "loss": 0.6945, - "step": 3836 - }, - { - "epoch": 3.71, - "grad_norm": 1.7697968482971191, - "learning_rate": 9.02001668056714e-06, - "loss": 0.7137, - "step": 3837 - }, - { - "epoch": 3.71, - "grad_norm": 1.4124974012374878, - "learning_rate": 9.015846538782318e-06, - "loss": 0.7721, - "step": 3838 - }, - { - "epoch": 3.71, - "grad_norm": 1.0588980913162231, - "learning_rate": 9.011676396997498e-06, - "loss": 0.6732, - "step": 3839 - }, - { - "epoch": 3.72, - "grad_norm": 1.1191587448120117, - "learning_rate": 9.007506255212678e-06, - "loss": 0.6778, - "step": 3840 - }, - { - "epoch": 3.72, - "grad_norm": 1.2233281135559082, - "learning_rate": 9.003336113427857e-06, - "loss": 0.8189, - "step": 3841 - }, - { - "epoch": 3.72, - "grad_norm": 1.1825919151306152, - "learning_rate": 8.999165971643037e-06, - "loss": 0.7156, - "step": 3842 - }, - { - "epoch": 3.72, - "grad_norm": 1.2899632453918457, - "learning_rate": 8.994995829858217e-06, - "loss": 0.6679, - "step": 3843 - }, - { - "epoch": 3.72, - "grad_norm": 1.245190978050232, - "learning_rate": 8.990825688073395e-06, - "loss": 0.6828, - "step": 3844 - }, - { - "epoch": 3.72, - "grad_norm": 1.7574058771133423, - "learning_rate": 8.986655546288575e-06, - "loss": 0.9988, - "step": 3845 - }, - { - "epoch": 3.72, - "grad_norm": 1.1397372484207153, - "learning_rate": 8.982485404503753e-06, - "loss": 0.6545, - "step": 3846 - }, - { - "epoch": 3.72, - "grad_norm": 1.2123494148254395, - "learning_rate": 8.978315262718932e-06, - "loss": 0.6628, - "step": 3847 - }, - { - "epoch": 3.72, - "grad_norm": 1.0750784873962402, - "learning_rate": 8.974145120934112e-06, - "loss": 0.7138, - "step": 3848 - }, - { - "epoch": 3.72, - "grad_norm": 1.2346811294555664, - "learning_rate": 8.969974979149292e-06, - "loss": 0.6846, - "step": 3849 - }, - { - "epoch": 3.73, - "grad_norm": 1.402255892753601, - "learning_rate": 8.96580483736447e-06, - "loss": 0.787, - "step": 3850 - }, - { - "epoch": 3.73, - "grad_norm": 1.5618852376937866, - "learning_rate": 8.96163469557965e-06, - "loss": 0.6784, - "step": 3851 - }, - { - "epoch": 3.73, - "grad_norm": 1.0520151853561401, - "learning_rate": 8.95746455379483e-06, - "loss": 0.6024, - "step": 3852 - }, - { - "epoch": 3.73, - "grad_norm": 1.403063178062439, - "learning_rate": 8.953294412010008e-06, - "loss": 0.7569, - "step": 3853 - }, - { - "epoch": 3.73, - "grad_norm": 1.5110641717910767, - "learning_rate": 8.949124270225189e-06, - "loss": 0.7434, - "step": 3854 - }, - { - "epoch": 3.73, - "grad_norm": 1.1577140092849731, - "learning_rate": 8.944954128440369e-06, - "loss": 0.6905, - "step": 3855 - }, - { - "epoch": 3.73, - "grad_norm": 1.5566719770431519, - "learning_rate": 8.940783986655547e-06, - "loss": 0.8057, - "step": 3856 - }, - { - "epoch": 3.73, - "grad_norm": 1.5115667581558228, - "learning_rate": 8.936613844870725e-06, - "loss": 0.5612, - "step": 3857 - }, - { - "epoch": 3.73, - "grad_norm": 1.4218279123306274, - "learning_rate": 8.932443703085905e-06, - "loss": 0.7115, - "step": 3858 - }, - { - "epoch": 3.73, - "grad_norm": 1.1728657484054565, - "learning_rate": 8.928273561301084e-06, - "loss": 0.6655, - "step": 3859 - }, - { - "epoch": 3.73, - "grad_norm": 1.3531938791275024, - "learning_rate": 8.924103419516264e-06, - "loss": 0.8297, - "step": 3860 - }, - { - "epoch": 3.74, - "grad_norm": 1.8712323904037476, - "learning_rate": 8.919933277731444e-06, - "loss": 0.792, - "step": 3861 - }, - { - "epoch": 3.74, - "grad_norm": 1.3992277383804321, - "learning_rate": 8.915763135946622e-06, - "loss": 0.7692, - "step": 3862 - }, - { - "epoch": 3.74, - "grad_norm": 1.2772537469863892, - "learning_rate": 8.911592994161802e-06, - "loss": 0.7146, - "step": 3863 - }, - { - "epoch": 3.74, - "grad_norm": 1.083439588546753, - "learning_rate": 8.907422852376982e-06, - "loss": 0.6436, - "step": 3864 - }, - { - "epoch": 3.74, - "grad_norm": 1.0760661363601685, - "learning_rate": 8.90325271059216e-06, - "loss": 0.6675, - "step": 3865 - }, - { - "epoch": 3.74, - "grad_norm": 1.0748087167739868, - "learning_rate": 8.89908256880734e-06, - "loss": 0.8525, - "step": 3866 - }, - { - "epoch": 3.74, - "grad_norm": 1.5770659446716309, - "learning_rate": 8.89491242702252e-06, - "loss": 0.7182, - "step": 3867 - }, - { - "epoch": 3.74, - "grad_norm": 1.2480491399765015, - "learning_rate": 8.890742285237697e-06, - "loss": 0.6618, - "step": 3868 - }, - { - "epoch": 3.74, - "grad_norm": 0.9434835314750671, - "learning_rate": 8.886572143452877e-06, - "loss": 0.6799, - "step": 3869 - }, - { - "epoch": 3.74, - "grad_norm": 1.2592473030090332, - "learning_rate": 8.882402001668057e-06, - "loss": 0.5887, - "step": 3870 - }, - { - "epoch": 3.75, - "grad_norm": 0.9747094511985779, - "learning_rate": 8.878231859883236e-06, - "loss": 0.659, - "step": 3871 - }, - { - "epoch": 3.75, - "grad_norm": 1.5506322383880615, - "learning_rate": 8.874061718098416e-06, - "loss": 0.6788, - "step": 3872 - }, - { - "epoch": 3.75, - "grad_norm": 1.4279855489730835, - "learning_rate": 8.869891576313596e-06, - "loss": 0.7228, - "step": 3873 - }, - { - "epoch": 3.75, - "grad_norm": 1.0204380750656128, - "learning_rate": 8.865721434528774e-06, - "loss": 0.6567, - "step": 3874 - }, - { - "epoch": 3.75, - "grad_norm": 2.258882999420166, - "learning_rate": 8.861551292743954e-06, - "loss": 0.7533, - "step": 3875 - }, - { - "epoch": 3.75, - "grad_norm": 1.316502332687378, - "learning_rate": 8.857381150959134e-06, - "loss": 0.6603, - "step": 3876 - }, - { - "epoch": 3.75, - "grad_norm": 1.306138515472412, - "learning_rate": 8.853211009174312e-06, - "loss": 0.7205, - "step": 3877 - }, - { - "epoch": 3.75, - "grad_norm": 1.134323000907898, - "learning_rate": 8.849040867389492e-06, - "loss": 0.7431, - "step": 3878 - }, - { - "epoch": 3.75, - "grad_norm": 1.4522703886032104, - "learning_rate": 8.84487072560467e-06, - "loss": 0.7373, - "step": 3879 - }, - { - "epoch": 3.75, - "grad_norm": 1.2387404441833496, - "learning_rate": 8.840700583819849e-06, - "loss": 0.7731, - "step": 3880 - }, - { - "epoch": 3.76, - "grad_norm": 1.2427064180374146, - "learning_rate": 8.83653044203503e-06, - "loss": 0.8667, - "step": 3881 - }, - { - "epoch": 3.76, - "grad_norm": 1.855582356452942, - "learning_rate": 8.83236030025021e-06, - "loss": 0.8088, - "step": 3882 - }, - { - "epoch": 3.76, - "grad_norm": 1.252475380897522, - "learning_rate": 8.828190158465388e-06, - "loss": 0.7236, - "step": 3883 - }, - { - "epoch": 3.76, - "grad_norm": 1.447736144065857, - "learning_rate": 8.824020016680568e-06, - "loss": 0.8166, - "step": 3884 - }, - { - "epoch": 3.76, - "grad_norm": 1.1489137411117554, - "learning_rate": 8.819849874895748e-06, - "loss": 0.709, - "step": 3885 - }, - { - "epoch": 3.76, - "grad_norm": 1.1987931728363037, - "learning_rate": 8.815679733110926e-06, - "loss": 0.7412, - "step": 3886 - }, - { - "epoch": 3.76, - "grad_norm": 1.2565351724624634, - "learning_rate": 8.811509591326106e-06, - "loss": 0.6736, - "step": 3887 - }, - { - "epoch": 3.76, - "grad_norm": 1.304922103881836, - "learning_rate": 8.807339449541286e-06, - "loss": 0.6911, - "step": 3888 - }, - { - "epoch": 3.76, - "grad_norm": 1.3182557821273804, - "learning_rate": 8.803169307756464e-06, - "loss": 0.9893, - "step": 3889 - }, - { - "epoch": 3.76, - "grad_norm": 1.0852586030960083, - "learning_rate": 8.798999165971643e-06, - "loss": 0.8067, - "step": 3890 - }, - { - "epoch": 3.76, - "grad_norm": 1.0428166389465332, - "learning_rate": 8.794829024186823e-06, - "loss": 0.5376, - "step": 3891 - }, - { - "epoch": 3.77, - "grad_norm": 1.490378975868225, - "learning_rate": 8.790658882402001e-06, - "loss": 0.788, - "step": 3892 - }, - { - "epoch": 3.77, - "grad_norm": 1.7355320453643799, - "learning_rate": 8.786488740617181e-06, - "loss": 0.6874, - "step": 3893 - }, - { - "epoch": 3.77, - "grad_norm": 1.0040160417556763, - "learning_rate": 8.782318598832361e-06, - "loss": 0.6798, - "step": 3894 - }, - { - "epoch": 3.77, - "grad_norm": 1.1138432025909424, - "learning_rate": 8.77814845704754e-06, - "loss": 0.7166, - "step": 3895 - }, - { - "epoch": 3.77, - "grad_norm": 1.5286980867385864, - "learning_rate": 8.77397831526272e-06, - "loss": 0.7808, - "step": 3896 - }, - { - "epoch": 3.77, - "grad_norm": 1.0869489908218384, - "learning_rate": 8.7698081734779e-06, - "loss": 0.6303, - "step": 3897 - }, - { - "epoch": 3.77, - "grad_norm": 1.4481172561645508, - "learning_rate": 8.765638031693078e-06, - "loss": 0.8856, - "step": 3898 - }, - { - "epoch": 3.77, - "grad_norm": 1.4681679010391235, - "learning_rate": 8.761467889908258e-06, - "loss": 0.6628, - "step": 3899 - }, - { - "epoch": 3.77, - "grad_norm": 1.4028898477554321, - "learning_rate": 8.757297748123438e-06, - "loss": 0.8009, - "step": 3900 - }, - { - "epoch": 3.77, - "eval_loss": 0.8551230430603027, - "eval_runtime": 858.1586, - "eval_samples_per_second": 4.817, - "eval_steps_per_second": 0.602, - "step": 3900 - }, - { - "epoch": 3.77, - "grad_norm": 1.1668781042099, - "learning_rate": 8.753127606338615e-06, - "loss": 0.7094, - "step": 3901 - }, - { - "epoch": 3.78, - "grad_norm": 1.279202938079834, - "learning_rate": 8.748957464553795e-06, - "loss": 0.8539, - "step": 3902 - }, - { - "epoch": 3.78, - "grad_norm": 1.3793323040008545, - "learning_rate": 8.744787322768975e-06, - "loss": 0.5602, - "step": 3903 - }, - { - "epoch": 3.78, - "grad_norm": 1.2167240381240845, - "learning_rate": 8.740617180984153e-06, - "loss": 0.7458, - "step": 3904 - }, - { - "epoch": 3.78, - "grad_norm": 1.5817376375198364, - "learning_rate": 8.736447039199333e-06, - "loss": 0.6797, - "step": 3905 - }, - { - "epoch": 3.78, - "grad_norm": 1.3980896472930908, - "learning_rate": 8.732276897414513e-06, - "loss": 0.6803, - "step": 3906 - }, - { - "epoch": 3.78, - "grad_norm": 1.4637523889541626, - "learning_rate": 8.728106755629691e-06, - "loss": 0.9646, - "step": 3907 - }, - { - "epoch": 3.78, - "grad_norm": 1.1062219142913818, - "learning_rate": 8.723936613844871e-06, - "loss": 0.6811, - "step": 3908 - }, - { - "epoch": 3.78, - "grad_norm": 1.5182491540908813, - "learning_rate": 8.719766472060052e-06, - "loss": 0.8207, - "step": 3909 - }, - { - "epoch": 3.78, - "grad_norm": 1.0973973274230957, - "learning_rate": 8.71559633027523e-06, - "loss": 0.7436, - "step": 3910 - }, - { - "epoch": 3.78, - "grad_norm": 1.0992844104766846, - "learning_rate": 8.71142618849041e-06, - "loss": 0.8696, - "step": 3911 - }, - { - "epoch": 3.79, - "grad_norm": 1.2279597520828247, - "learning_rate": 8.707256046705588e-06, - "loss": 0.7107, - "step": 3912 - }, - { - "epoch": 3.79, - "grad_norm": 1.3053665161132812, - "learning_rate": 8.703085904920767e-06, - "loss": 0.7919, - "step": 3913 - }, - { - "epoch": 3.79, - "grad_norm": 1.193116545677185, - "learning_rate": 8.698915763135947e-06, - "loss": 0.7822, - "step": 3914 - }, - { - "epoch": 3.79, - "grad_norm": 1.24557363986969, - "learning_rate": 8.694745621351127e-06, - "loss": 0.6917, - "step": 3915 - }, - { - "epoch": 3.79, - "grad_norm": 1.1103416681289673, - "learning_rate": 8.690575479566305e-06, - "loss": 0.6902, - "step": 3916 - }, - { - "epoch": 3.79, - "grad_norm": 1.5970569849014282, - "learning_rate": 8.686405337781485e-06, - "loss": 0.715, - "step": 3917 - }, - { - "epoch": 3.79, - "grad_norm": 1.3219919204711914, - "learning_rate": 8.682235195996665e-06, - "loss": 0.668, - "step": 3918 - }, - { - "epoch": 3.79, - "grad_norm": 1.3247804641723633, - "learning_rate": 8.678065054211843e-06, - "loss": 0.7101, - "step": 3919 - }, - { - "epoch": 3.79, - "grad_norm": 1.8229241371154785, - "learning_rate": 8.673894912427023e-06, - "loss": 0.7737, - "step": 3920 - }, - { - "epoch": 3.79, - "grad_norm": 1.1297950744628906, - "learning_rate": 8.669724770642203e-06, - "loss": 0.7004, - "step": 3921 - }, - { - "epoch": 3.79, - "grad_norm": 1.413045048713684, - "learning_rate": 8.665554628857382e-06, - "loss": 0.7086, - "step": 3922 - }, - { - "epoch": 3.8, - "grad_norm": 1.4547609090805054, - "learning_rate": 8.66138448707256e-06, - "loss": 0.8475, - "step": 3923 - }, - { - "epoch": 3.8, - "grad_norm": 1.1815779209136963, - "learning_rate": 8.65721434528774e-06, - "loss": 0.5953, - "step": 3924 - }, - { - "epoch": 3.8, - "grad_norm": 1.0510928630828857, - "learning_rate": 8.653044203502919e-06, - "loss": 0.6837, - "step": 3925 - }, - { - "epoch": 3.8, - "grad_norm": 1.6641714572906494, - "learning_rate": 8.648874061718099e-06, - "loss": 0.7985, - "step": 3926 - }, - { - "epoch": 3.8, - "grad_norm": 1.7786951065063477, - "learning_rate": 8.644703919933279e-06, - "loss": 0.68, - "step": 3927 - }, - { - "epoch": 3.8, - "grad_norm": 1.3749067783355713, - "learning_rate": 8.640533778148457e-06, - "loss": 0.7351, - "step": 3928 - }, - { - "epoch": 3.8, - "grad_norm": 1.4634582996368408, - "learning_rate": 8.636363636363637e-06, - "loss": 0.7863, - "step": 3929 - }, - { - "epoch": 3.8, - "grad_norm": 1.6887547969818115, - "learning_rate": 8.632193494578817e-06, - "loss": 0.8711, - "step": 3930 - }, - { - "epoch": 3.8, - "grad_norm": 1.2339435815811157, - "learning_rate": 8.628023352793995e-06, - "loss": 0.6633, - "step": 3931 - }, - { - "epoch": 3.8, - "grad_norm": 1.174820899963379, - "learning_rate": 8.623853211009175e-06, - "loss": 0.633, - "step": 3932 - }, - { - "epoch": 3.81, - "grad_norm": 1.3371484279632568, - "learning_rate": 8.619683069224355e-06, - "loss": 0.6297, - "step": 3933 - }, - { - "epoch": 3.81, - "grad_norm": 1.1373400688171387, - "learning_rate": 8.615512927439532e-06, - "loss": 0.758, - "step": 3934 - }, - { - "epoch": 3.81, - "grad_norm": 1.9303170442581177, - "learning_rate": 8.611342785654712e-06, - "loss": 0.7414, - "step": 3935 - }, - { - "epoch": 3.81, - "grad_norm": 1.1726877689361572, - "learning_rate": 8.607172643869892e-06, - "loss": 0.671, - "step": 3936 - }, - { - "epoch": 3.81, - "grad_norm": 1.918506383895874, - "learning_rate": 8.60300250208507e-06, - "loss": 0.6406, - "step": 3937 - }, - { - "epoch": 3.81, - "grad_norm": 1.0149345397949219, - "learning_rate": 8.59883236030025e-06, - "loss": 0.6896, - "step": 3938 - }, - { - "epoch": 3.81, - "grad_norm": 1.8189246654510498, - "learning_rate": 8.59466221851543e-06, - "loss": 0.6587, - "step": 3939 - }, - { - "epoch": 3.81, - "grad_norm": 1.7515846490859985, - "learning_rate": 8.590492076730609e-06, - "loss": 0.9204, - "step": 3940 - }, - { - "epoch": 3.81, - "grad_norm": 1.0661392211914062, - "learning_rate": 8.586321934945789e-06, - "loss": 0.6879, - "step": 3941 - }, - { - "epoch": 3.81, - "grad_norm": 1.293162226676941, - "learning_rate": 8.582151793160969e-06, - "loss": 0.6811, - "step": 3942 - }, - { - "epoch": 3.82, - "grad_norm": 1.080329179763794, - "learning_rate": 8.577981651376147e-06, - "loss": 0.8401, - "step": 3943 - }, - { - "epoch": 3.82, - "grad_norm": 1.2835712432861328, - "learning_rate": 8.573811509591327e-06, - "loss": 0.6676, - "step": 3944 - }, - { - "epoch": 3.82, - "grad_norm": 1.0308789014816284, - "learning_rate": 8.569641367806506e-06, - "loss": 0.6094, - "step": 3945 - }, - { - "epoch": 3.82, - "grad_norm": 1.296432375907898, - "learning_rate": 8.565471226021684e-06, - "loss": 0.8018, - "step": 3946 - }, - { - "epoch": 3.82, - "grad_norm": 1.3173983097076416, - "learning_rate": 8.561301084236864e-06, - "loss": 0.751, - "step": 3947 - }, - { - "epoch": 3.82, - "grad_norm": 1.3164054155349731, - "learning_rate": 8.557130942452044e-06, - "loss": 0.7981, - "step": 3948 - }, - { - "epoch": 3.82, - "grad_norm": 1.1949673891067505, - "learning_rate": 8.552960800667222e-06, - "loss": 0.7503, - "step": 3949 - }, - { - "epoch": 3.82, - "grad_norm": 1.2497044801712036, - "learning_rate": 8.548790658882402e-06, - "loss": 0.8542, - "step": 3950 - }, - { - "epoch": 3.82, - "grad_norm": 1.3445051908493042, - "learning_rate": 8.544620517097583e-06, - "loss": 0.843, - "step": 3951 - }, - { - "epoch": 3.82, - "grad_norm": 1.0741132497787476, - "learning_rate": 8.540450375312761e-06, - "loss": 0.6759, - "step": 3952 - }, - { - "epoch": 3.82, - "grad_norm": 1.1905039548873901, - "learning_rate": 8.536280233527941e-06, - "loss": 0.6363, - "step": 3953 - }, - { - "epoch": 3.83, - "grad_norm": 1.234527826309204, - "learning_rate": 8.532110091743121e-06, - "loss": 0.7328, - "step": 3954 - }, - { - "epoch": 3.83, - "grad_norm": 1.1649458408355713, - "learning_rate": 8.5279399499583e-06, - "loss": 0.556, - "step": 3955 - }, - { - "epoch": 3.83, - "grad_norm": 1.1444815397262573, - "learning_rate": 8.523769808173478e-06, - "loss": 0.6184, - "step": 3956 - }, - { - "epoch": 3.83, - "grad_norm": 1.1928359270095825, - "learning_rate": 8.519599666388658e-06, - "loss": 0.598, - "step": 3957 - }, - { - "epoch": 3.83, - "grad_norm": 1.1508607864379883, - "learning_rate": 8.515429524603836e-06, - "loss": 0.5703, - "step": 3958 - }, - { - "epoch": 3.83, - "grad_norm": 1.59660005569458, - "learning_rate": 8.511259382819016e-06, - "loss": 0.6351, - "step": 3959 - }, - { - "epoch": 3.83, - "grad_norm": 1.2995659112930298, - "learning_rate": 8.507089241034196e-06, - "loss": 0.8417, - "step": 3960 - }, - { - "epoch": 3.83, - "grad_norm": 1.7801694869995117, - "learning_rate": 8.502919099249374e-06, - "loss": 0.9495, - "step": 3961 - }, - { - "epoch": 3.83, - "grad_norm": 1.3981291055679321, - "learning_rate": 8.498748957464554e-06, - "loss": 0.7076, - "step": 3962 - }, - { - "epoch": 3.83, - "grad_norm": 1.372262716293335, - "learning_rate": 8.494578815679734e-06, - "loss": 0.8684, - "step": 3963 - }, - { - "epoch": 3.84, - "grad_norm": 1.3083829879760742, - "learning_rate": 8.490408673894913e-06, - "loss": 0.816, - "step": 3964 - }, - { - "epoch": 3.84, - "grad_norm": 1.0977733135223389, - "learning_rate": 8.486238532110093e-06, - "loss": 0.6485, - "step": 3965 - }, - { - "epoch": 3.84, - "grad_norm": 1.4571865797042847, - "learning_rate": 8.482068390325273e-06, - "loss": 0.8095, - "step": 3966 - }, - { - "epoch": 3.84, - "grad_norm": 1.2152897119522095, - "learning_rate": 8.47789824854045e-06, - "loss": 0.8019, - "step": 3967 - }, - { - "epoch": 3.84, - "grad_norm": 1.812170147895813, - "learning_rate": 8.47372810675563e-06, - "loss": 0.7086, - "step": 3968 - }, - { - "epoch": 3.84, - "grad_norm": 1.2257304191589355, - "learning_rate": 8.46955796497081e-06, - "loss": 0.7329, - "step": 3969 - }, - { - "epoch": 3.84, - "grad_norm": 1.5328458547592163, - "learning_rate": 8.465387823185988e-06, - "loss": 0.8077, - "step": 3970 - }, - { - "epoch": 3.84, - "grad_norm": 1.4491785764694214, - "learning_rate": 8.461217681401168e-06, - "loss": 0.8759, - "step": 3971 - }, - { - "epoch": 3.84, - "grad_norm": 1.3397568464279175, - "learning_rate": 8.457047539616348e-06, - "loss": 0.6961, - "step": 3972 - }, - { - "epoch": 3.84, - "grad_norm": 1.3516210317611694, - "learning_rate": 8.452877397831526e-06, - "loss": 0.7509, - "step": 3973 - }, - { - "epoch": 3.85, - "grad_norm": 1.0660499334335327, - "learning_rate": 8.448707256046706e-06, - "loss": 0.5578, - "step": 3974 - }, - { - "epoch": 3.85, - "grad_norm": 1.2426209449768066, - "learning_rate": 8.444537114261886e-06, - "loss": 0.7431, - "step": 3975 - }, - { - "epoch": 3.85, - "grad_norm": 1.277725100517273, - "learning_rate": 8.440366972477065e-06, - "loss": 0.7016, - "step": 3976 - }, - { - "epoch": 3.85, - "grad_norm": 1.1149075031280518, - "learning_rate": 8.436196830692245e-06, - "loss": 0.6526, - "step": 3977 - }, - { - "epoch": 3.85, - "grad_norm": 1.565482258796692, - "learning_rate": 8.432026688907423e-06, - "loss": 0.8724, - "step": 3978 - }, - { - "epoch": 3.85, - "grad_norm": 1.3156458139419556, - "learning_rate": 8.427856547122602e-06, - "loss": 0.6484, - "step": 3979 - }, - { - "epoch": 3.85, - "grad_norm": 1.5746185779571533, - "learning_rate": 8.423686405337782e-06, - "loss": 0.8209, - "step": 3980 - }, - { - "epoch": 3.85, - "grad_norm": 1.2292309999465942, - "learning_rate": 8.419516263552962e-06, - "loss": 0.6328, - "step": 3981 - }, - { - "epoch": 3.85, - "grad_norm": 1.4444047212600708, - "learning_rate": 8.41534612176814e-06, - "loss": 0.7564, - "step": 3982 - }, - { - "epoch": 3.85, - "grad_norm": 1.1935038566589355, - "learning_rate": 8.41117597998332e-06, - "loss": 0.7848, - "step": 3983 - }, - { - "epoch": 3.85, - "grad_norm": 1.3232645988464355, - "learning_rate": 8.4070058381985e-06, - "loss": 0.6391, - "step": 3984 - }, - { - "epoch": 3.86, - "grad_norm": 1.1306418180465698, - "learning_rate": 8.402835696413678e-06, - "loss": 0.7451, - "step": 3985 - }, - { - "epoch": 3.86, - "grad_norm": 1.3414762020111084, - "learning_rate": 8.398665554628858e-06, - "loss": 0.5525, - "step": 3986 - }, - { - "epoch": 3.86, - "grad_norm": 1.1248849630355835, - "learning_rate": 8.394495412844038e-06, - "loss": 0.774, - "step": 3987 - }, - { - "epoch": 3.86, - "grad_norm": 1.3943297863006592, - "learning_rate": 8.390325271059217e-06, - "loss": 0.7663, - "step": 3988 - }, - { - "epoch": 3.86, - "grad_norm": 1.0571784973144531, - "learning_rate": 8.386155129274395e-06, - "loss": 0.6862, - "step": 3989 - }, - { - "epoch": 3.86, - "grad_norm": 1.528711199760437, - "learning_rate": 8.381984987489575e-06, - "loss": 0.7129, - "step": 3990 - }, - { - "epoch": 3.86, - "grad_norm": 1.2378162145614624, - "learning_rate": 8.377814845704753e-06, - "loss": 0.6722, - "step": 3991 - }, - { - "epoch": 3.86, - "grad_norm": 1.1124138832092285, - "learning_rate": 8.373644703919933e-06, - "loss": 0.7745, - "step": 3992 - }, - { - "epoch": 3.86, - "grad_norm": 1.20100998878479, - "learning_rate": 8.369474562135114e-06, - "loss": 0.7916, - "step": 3993 - }, - { - "epoch": 3.86, - "grad_norm": 1.234928011894226, - "learning_rate": 8.365304420350292e-06, - "loss": 0.8269, - "step": 3994 - }, - { - "epoch": 3.87, - "grad_norm": 1.1601148843765259, - "learning_rate": 8.361134278565472e-06, - "loss": 0.6957, - "step": 3995 - }, - { - "epoch": 3.87, - "grad_norm": 1.1746742725372314, - "learning_rate": 8.356964136780652e-06, - "loss": 0.6987, - "step": 3996 - }, - { - "epoch": 3.87, - "grad_norm": 1.366929292678833, - "learning_rate": 8.35279399499583e-06, - "loss": 0.8521, - "step": 3997 - }, - { - "epoch": 3.87, - "grad_norm": 1.2353994846343994, - "learning_rate": 8.34862385321101e-06, - "loss": 0.6475, - "step": 3998 - }, - { - "epoch": 3.87, - "grad_norm": 1.0730762481689453, - "learning_rate": 8.344453711426189e-06, - "loss": 0.6401, - "step": 3999 - }, - { - "epoch": 3.87, - "grad_norm": 1.1085494756698608, - "learning_rate": 8.340283569641367e-06, - "loss": 0.6696, - "step": 4000 - }, - { - "epoch": 3.87, - "grad_norm": 1.027856707572937, - "learning_rate": 8.336113427856547e-06, - "loss": 0.7014, - "step": 4001 - }, - { - "epoch": 3.87, - "grad_norm": 1.6614447832107544, - "learning_rate": 8.331943286071727e-06, - "loss": 0.7025, - "step": 4002 - }, - { - "epoch": 3.87, - "grad_norm": 1.4471124410629272, - "learning_rate": 8.327773144286905e-06, - "loss": 0.6179, - "step": 4003 - }, - { - "epoch": 3.87, - "grad_norm": 1.5548070669174194, - "learning_rate": 8.323603002502085e-06, - "loss": 0.831, - "step": 4004 - }, - { - "epoch": 3.88, - "grad_norm": 1.3854544162750244, - "learning_rate": 8.319432860717265e-06, - "loss": 0.7213, - "step": 4005 - }, - { - "epoch": 3.88, - "grad_norm": 1.2448898553848267, - "learning_rate": 8.315262718932444e-06, - "loss": 0.9163, - "step": 4006 - }, - { - "epoch": 3.88, - "grad_norm": 1.0090829133987427, - "learning_rate": 8.311092577147624e-06, - "loss": 0.8564, - "step": 4007 - }, - { - "epoch": 3.88, - "grad_norm": 1.4596858024597168, - "learning_rate": 8.306922435362804e-06, - "loss": 0.728, - "step": 4008 - }, - { - "epoch": 3.88, - "grad_norm": 1.7371070384979248, - "learning_rate": 8.302752293577982e-06, - "loss": 0.7635, - "step": 4009 - }, - { - "epoch": 3.88, - "grad_norm": 1.4701354503631592, - "learning_rate": 8.298582151793162e-06, - "loss": 0.8342, - "step": 4010 - }, - { - "epoch": 3.88, - "grad_norm": 1.4418973922729492, - "learning_rate": 8.29441201000834e-06, - "loss": 0.6969, - "step": 4011 - }, - { - "epoch": 3.88, - "grad_norm": 1.3696964979171753, - "learning_rate": 8.290241868223519e-06, - "loss": 0.8933, - "step": 4012 - }, - { - "epoch": 3.88, - "grad_norm": 1.378226637840271, - "learning_rate": 8.286071726438699e-06, - "loss": 0.7117, - "step": 4013 - }, - { - "epoch": 3.88, - "grad_norm": 1.2607053518295288, - "learning_rate": 8.281901584653879e-06, - "loss": 0.7669, - "step": 4014 - }, - { - "epoch": 3.88, - "grad_norm": 1.0556697845458984, - "learning_rate": 8.277731442869057e-06, - "loss": 0.6564, - "step": 4015 - }, - { - "epoch": 3.89, - "grad_norm": 1.4814832210540771, - "learning_rate": 8.273561301084237e-06, - "loss": 0.6241, - "step": 4016 - }, - { - "epoch": 3.89, - "grad_norm": 1.2367101907730103, - "learning_rate": 8.269391159299417e-06, - "loss": 0.7911, - "step": 4017 - }, - { - "epoch": 3.89, - "grad_norm": 1.853064775466919, - "learning_rate": 8.265221017514596e-06, - "loss": 0.79, - "step": 4018 - }, - { - "epoch": 3.89, - "grad_norm": 1.3849866390228271, - "learning_rate": 8.261050875729776e-06, - "loss": 0.6966, - "step": 4019 - }, - { - "epoch": 3.89, - "grad_norm": 1.6763454675674438, - "learning_rate": 8.256880733944954e-06, - "loss": 0.6965, - "step": 4020 - }, - { - "epoch": 3.89, - "grad_norm": 1.173415184020996, - "learning_rate": 8.252710592160134e-06, - "loss": 0.6555, - "step": 4021 - }, - { - "epoch": 3.89, - "grad_norm": 1.315529704093933, - "learning_rate": 8.248540450375313e-06, - "loss": 0.7649, - "step": 4022 - }, - { - "epoch": 3.89, - "grad_norm": 1.5528346300125122, - "learning_rate": 8.244370308590493e-06, - "loss": 0.834, - "step": 4023 - }, - { - "epoch": 3.89, - "grad_norm": 1.5623164176940918, - "learning_rate": 8.240200166805671e-06, - "loss": 0.6323, - "step": 4024 - }, - { - "epoch": 3.89, - "grad_norm": 1.2017834186553955, - "learning_rate": 8.236030025020851e-06, - "loss": 0.5434, - "step": 4025 - }, - { - "epoch": 3.9, - "grad_norm": 1.257459044456482, - "learning_rate": 8.231859883236031e-06, - "loss": 1.0339, - "step": 4026 - }, - { - "epoch": 3.9, - "grad_norm": 1.421531081199646, - "learning_rate": 8.22768974145121e-06, - "loss": 0.7466, - "step": 4027 - }, - { - "epoch": 3.9, - "grad_norm": 1.3751212358474731, - "learning_rate": 8.22351959966639e-06, - "loss": 0.593, - "step": 4028 - }, - { - "epoch": 3.9, - "grad_norm": 1.1514464616775513, - "learning_rate": 8.21934945788157e-06, - "loss": 0.7281, - "step": 4029 - }, - { - "epoch": 3.9, - "grad_norm": 1.050018548965454, - "learning_rate": 8.215179316096748e-06, - "loss": 0.6659, - "step": 4030 - }, - { - "epoch": 3.9, - "grad_norm": 1.6007755994796753, - "learning_rate": 8.211009174311928e-06, - "loss": 0.6255, - "step": 4031 - }, - { - "epoch": 3.9, - "grad_norm": 1.3381264209747314, - "learning_rate": 8.206839032527106e-06, - "loss": 0.7391, - "step": 4032 - }, - { - "epoch": 3.9, - "grad_norm": 1.1450636386871338, - "learning_rate": 8.202668890742284e-06, - "loss": 0.6375, - "step": 4033 - }, - { - "epoch": 3.9, - "grad_norm": 1.4435229301452637, - "learning_rate": 8.198498748957465e-06, - "loss": 0.7139, - "step": 4034 - }, - { - "epoch": 3.9, - "grad_norm": 1.1717262268066406, - "learning_rate": 8.194328607172645e-06, - "loss": 0.6299, - "step": 4035 - }, - { - "epoch": 3.91, - "grad_norm": 1.3287850618362427, - "learning_rate": 8.190158465387823e-06, - "loss": 0.8284, - "step": 4036 - }, - { - "epoch": 3.91, - "grad_norm": 1.2789373397827148, - "learning_rate": 8.185988323603003e-06, - "loss": 0.6759, - "step": 4037 - }, - { - "epoch": 3.91, - "grad_norm": 1.2459635734558105, - "learning_rate": 8.181818181818183e-06, - "loss": 0.7471, - "step": 4038 - }, - { - "epoch": 3.91, - "grad_norm": 1.3551286458969116, - "learning_rate": 8.177648040033361e-06, - "loss": 0.7156, - "step": 4039 - }, - { - "epoch": 3.91, - "grad_norm": 1.843151569366455, - "learning_rate": 8.173477898248541e-06, - "loss": 0.6114, - "step": 4040 - }, - { - "epoch": 3.91, - "grad_norm": 1.3559409379959106, - "learning_rate": 8.16930775646372e-06, - "loss": 0.8308, - "step": 4041 - }, - { - "epoch": 3.91, - "grad_norm": 1.3529644012451172, - "learning_rate": 8.1651376146789e-06, - "loss": 0.7349, - "step": 4042 - }, - { - "epoch": 3.91, - "grad_norm": 1.9761481285095215, - "learning_rate": 8.16096747289408e-06, - "loss": 0.6383, - "step": 4043 - }, - { - "epoch": 3.91, - "grad_norm": 1.4816498756408691, - "learning_rate": 8.156797331109258e-06, - "loss": 0.6025, - "step": 4044 - }, - { - "epoch": 3.91, - "grad_norm": 1.2485196590423584, - "learning_rate": 8.152627189324436e-06, - "loss": 0.598, - "step": 4045 - }, - { - "epoch": 3.91, - "grad_norm": 1.2901933193206787, - "learning_rate": 8.148457047539616e-06, - "loss": 0.6969, - "step": 4046 - }, - { - "epoch": 3.92, - "grad_norm": 1.6511998176574707, - "learning_rate": 8.144286905754796e-06, - "loss": 0.7126, - "step": 4047 - }, - { - "epoch": 3.92, - "grad_norm": 1.236632227897644, - "learning_rate": 8.140116763969975e-06, - "loss": 0.7123, - "step": 4048 - }, - { - "epoch": 3.92, - "grad_norm": 1.117444634437561, - "learning_rate": 8.135946622185155e-06, - "loss": 0.7651, - "step": 4049 - }, - { - "epoch": 3.92, - "grad_norm": 1.4278165102005005, - "learning_rate": 8.131776480400335e-06, - "loss": 0.746, - "step": 4050 - }, - { - "epoch": 3.92, - "grad_norm": 1.3722543716430664, - "learning_rate": 8.127606338615513e-06, - "loss": 0.6405, - "step": 4051 - }, - { - "epoch": 3.92, - "grad_norm": 1.6733298301696777, - "learning_rate": 8.123436196830693e-06, - "loss": 0.7232, - "step": 4052 - }, - { - "epoch": 3.92, - "grad_norm": 1.2124762535095215, - "learning_rate": 8.119266055045872e-06, - "loss": 0.649, - "step": 4053 - }, - { - "epoch": 3.92, - "grad_norm": 1.4258729219436646, - "learning_rate": 8.115095913261052e-06, - "loss": 0.7848, - "step": 4054 - }, - { - "epoch": 3.92, - "grad_norm": 1.8051178455352783, - "learning_rate": 8.11092577147623e-06, - "loss": 0.8977, - "step": 4055 - }, - { - "epoch": 3.92, - "grad_norm": 1.4249836206436157, - "learning_rate": 8.10675562969141e-06, - "loss": 0.664, - "step": 4056 - }, - { - "epoch": 3.93, - "grad_norm": 1.1285873651504517, - "learning_rate": 8.102585487906588e-06, - "loss": 0.7577, - "step": 4057 - }, - { - "epoch": 3.93, - "grad_norm": 1.891027808189392, - "learning_rate": 8.098415346121768e-06, - "loss": 0.8048, - "step": 4058 - }, - { - "epoch": 3.93, - "grad_norm": 1.2006330490112305, - "learning_rate": 8.094245204336948e-06, - "loss": 0.8765, - "step": 4059 - }, - { - "epoch": 3.93, - "grad_norm": 1.6823763847351074, - "learning_rate": 8.090075062552127e-06, - "loss": 0.8365, - "step": 4060 - }, - { - "epoch": 3.93, - "grad_norm": 1.236812710762024, - "learning_rate": 8.085904920767307e-06, - "loss": 0.6771, - "step": 4061 - }, - { - "epoch": 3.93, - "grad_norm": 1.1466010808944702, - "learning_rate": 8.081734778982485e-06, - "loss": 0.6461, - "step": 4062 - }, - { - "epoch": 3.93, - "grad_norm": 1.853227972984314, - "learning_rate": 8.077564637197665e-06, - "loss": 0.5851, - "step": 4063 - }, - { - "epoch": 3.93, - "grad_norm": 1.1324950456619263, - "learning_rate": 8.073394495412845e-06, - "loss": 0.5844, - "step": 4064 - }, - { - "epoch": 3.93, - "grad_norm": 1.4123047590255737, - "learning_rate": 8.069224353628024e-06, - "loss": 0.6301, - "step": 4065 - }, - { - "epoch": 3.93, - "grad_norm": 1.2132854461669922, - "learning_rate": 8.065054211843202e-06, - "loss": 0.7212, - "step": 4066 - }, - { - "epoch": 3.94, - "grad_norm": 1.379394292831421, - "learning_rate": 8.060884070058382e-06, - "loss": 0.7104, - "step": 4067 - }, - { - "epoch": 3.94, - "grad_norm": 1.1606208086013794, - "learning_rate": 8.056713928273562e-06, - "loss": 0.6059, - "step": 4068 - }, - { - "epoch": 3.94, - "grad_norm": 1.2338708639144897, - "learning_rate": 8.05254378648874e-06, - "loss": 0.7739, - "step": 4069 - }, - { - "epoch": 3.94, - "grad_norm": 1.2605102062225342, - "learning_rate": 8.04837364470392e-06, - "loss": 0.7205, - "step": 4070 - }, - { - "epoch": 3.94, - "grad_norm": 1.2045714855194092, - "learning_rate": 8.0442035029191e-06, - "loss": 0.7695, - "step": 4071 - }, - { - "epoch": 3.94, - "grad_norm": 1.4576596021652222, - "learning_rate": 8.040033361134279e-06, - "loss": 0.5976, - "step": 4072 - }, - { - "epoch": 3.94, - "grad_norm": 1.3542989492416382, - "learning_rate": 8.035863219349459e-06, - "loss": 0.6175, - "step": 4073 - }, - { - "epoch": 3.94, - "grad_norm": 1.817543625831604, - "learning_rate": 8.031693077564637e-06, - "loss": 0.6522, - "step": 4074 - }, - { - "epoch": 3.94, - "grad_norm": 1.2661129236221313, - "learning_rate": 8.027522935779817e-06, - "loss": 0.6654, - "step": 4075 - }, - { - "epoch": 3.94, - "grad_norm": 1.2577834129333496, - "learning_rate": 8.023352793994997e-06, - "loss": 0.7401, - "step": 4076 - }, - { - "epoch": 3.94, - "grad_norm": 1.3352525234222412, - "learning_rate": 8.019182652210176e-06, - "loss": 0.8261, - "step": 4077 - }, - { - "epoch": 3.95, - "grad_norm": 1.4374886751174927, - "learning_rate": 8.015012510425354e-06, - "loss": 0.8182, - "step": 4078 - }, - { - "epoch": 3.95, - "grad_norm": 1.1310259103775024, - "learning_rate": 8.010842368640534e-06, - "loss": 0.7272, - "step": 4079 - }, - { - "epoch": 3.95, - "grad_norm": 1.3984516859054565, - "learning_rate": 8.006672226855714e-06, - "loss": 0.8062, - "step": 4080 - }, - { - "epoch": 3.95, - "grad_norm": 1.0844422578811646, - "learning_rate": 8.002502085070892e-06, - "loss": 0.6869, - "step": 4081 - }, - { - "epoch": 3.95, - "grad_norm": 1.5735132694244385, - "learning_rate": 7.998331943286072e-06, - "loss": 0.7919, - "step": 4082 - }, - { - "epoch": 3.95, - "grad_norm": 1.1832239627838135, - "learning_rate": 7.99416180150125e-06, - "loss": 0.6256, - "step": 4083 - }, - { - "epoch": 3.95, - "grad_norm": 1.2862153053283691, - "learning_rate": 7.98999165971643e-06, - "loss": 0.6671, - "step": 4084 - }, - { - "epoch": 3.95, - "grad_norm": 1.399275779724121, - "learning_rate": 7.98582151793161e-06, - "loss": 0.6625, - "step": 4085 - }, - { - "epoch": 3.95, - "grad_norm": 1.5161666870117188, - "learning_rate": 7.981651376146789e-06, - "loss": 0.5437, - "step": 4086 - }, - { - "epoch": 3.95, - "grad_norm": 1.5804264545440674, - "learning_rate": 7.977481234361969e-06, - "loss": 0.8329, - "step": 4087 - }, - { - "epoch": 3.96, - "grad_norm": 1.2821407318115234, - "learning_rate": 7.973311092577147e-06, - "loss": 0.8995, - "step": 4088 - }, - { - "epoch": 3.96, - "grad_norm": 1.3997056484222412, - "learning_rate": 7.969140950792327e-06, - "loss": 0.6828, - "step": 4089 - }, - { - "epoch": 3.96, - "grad_norm": 1.1985435485839844, - "learning_rate": 7.964970809007506e-06, - "loss": 0.6269, - "step": 4090 - }, - { - "epoch": 3.96, - "grad_norm": 1.3204777240753174, - "learning_rate": 7.960800667222686e-06, - "loss": 0.8216, - "step": 4091 - }, - { - "epoch": 3.96, - "grad_norm": 1.1255851984024048, - "learning_rate": 7.956630525437866e-06, - "loss": 0.6771, - "step": 4092 - }, - { - "epoch": 3.96, - "grad_norm": 1.0128809213638306, - "learning_rate": 7.952460383653044e-06, - "loss": 0.5821, - "step": 4093 - }, - { - "epoch": 3.96, - "grad_norm": 1.311255693435669, - "learning_rate": 7.948290241868224e-06, - "loss": 0.7216, - "step": 4094 - }, - { - "epoch": 3.96, - "grad_norm": 1.481868863105774, - "learning_rate": 7.944120100083403e-06, - "loss": 0.8287, - "step": 4095 - }, - { - "epoch": 3.96, - "grad_norm": 1.1370958089828491, - "learning_rate": 7.939949958298583e-06, - "loss": 0.7117, - "step": 4096 - }, - { - "epoch": 3.96, - "grad_norm": 1.6860215663909912, - "learning_rate": 7.935779816513763e-06, - "loss": 0.6729, - "step": 4097 - }, - { - "epoch": 3.97, - "grad_norm": 1.8059720993041992, - "learning_rate": 7.931609674728941e-06, - "loss": 0.6416, - "step": 4098 - }, - { - "epoch": 3.97, - "grad_norm": 1.1263718605041504, - "learning_rate": 7.927439532944121e-06, - "loss": 0.7189, - "step": 4099 - }, - { - "epoch": 3.97, - "grad_norm": 1.3962024450302124, - "learning_rate": 7.9232693911593e-06, - "loss": 0.6085, - "step": 4100 - }, - { - "epoch": 3.97, - "grad_norm": 1.22867751121521, - "learning_rate": 7.91909924937448e-06, - "loss": 0.8057, - "step": 4101 - }, - { - "epoch": 3.97, - "grad_norm": 1.3705651760101318, - "learning_rate": 7.914929107589658e-06, - "loss": 0.7628, - "step": 4102 - }, - { - "epoch": 3.97, - "grad_norm": 1.2708830833435059, - "learning_rate": 7.910758965804838e-06, - "loss": 0.8526, - "step": 4103 - }, - { - "epoch": 3.97, - "grad_norm": 1.4503384828567505, - "learning_rate": 7.906588824020016e-06, - "loss": 0.7616, - "step": 4104 - }, - { - "epoch": 3.97, - "grad_norm": 1.2485220432281494, - "learning_rate": 7.902418682235196e-06, - "loss": 0.6749, - "step": 4105 - }, - { - "epoch": 3.97, - "grad_norm": 1.2287968397140503, - "learning_rate": 7.898248540450376e-06, - "loss": 0.8152, - "step": 4106 - }, - { - "epoch": 3.97, - "grad_norm": 1.4547765254974365, - "learning_rate": 7.894078398665555e-06, - "loss": 0.8658, - "step": 4107 - }, - { - "epoch": 3.97, - "grad_norm": 1.3261808156967163, - "learning_rate": 7.889908256880735e-06, - "loss": 0.6446, - "step": 4108 - }, - { - "epoch": 3.98, - "grad_norm": 1.2493927478790283, - "learning_rate": 7.885738115095915e-06, - "loss": 0.6547, - "step": 4109 - }, - { - "epoch": 3.98, - "grad_norm": 1.3406943082809448, - "learning_rate": 7.881567973311093e-06, - "loss": 0.7077, - "step": 4110 - }, - { - "epoch": 3.98, - "grad_norm": 1.1841052770614624, - "learning_rate": 7.877397831526271e-06, - "loss": 0.763, - "step": 4111 - }, - { - "epoch": 3.98, - "grad_norm": 1.1067297458648682, - "learning_rate": 7.873227689741451e-06, - "loss": 0.7358, - "step": 4112 - }, - { - "epoch": 3.98, - "grad_norm": 1.1573944091796875, - "learning_rate": 7.869057547956631e-06, - "loss": 0.5685, - "step": 4113 - }, - { - "epoch": 3.98, - "grad_norm": 1.2256602048873901, - "learning_rate": 7.86488740617181e-06, - "loss": 0.5798, - "step": 4114 - }, - { - "epoch": 3.98, - "grad_norm": 1.2226641178131104, - "learning_rate": 7.86071726438699e-06, - "loss": 0.7897, - "step": 4115 - }, - { - "epoch": 3.98, - "grad_norm": 1.1671439409255981, - "learning_rate": 7.856547122602168e-06, - "loss": 0.7308, - "step": 4116 - }, - { - "epoch": 3.98, - "grad_norm": 0.9899999499320984, - "learning_rate": 7.852376980817348e-06, - "loss": 0.6807, - "step": 4117 - }, - { - "epoch": 3.98, - "grad_norm": 1.2406564950942993, - "learning_rate": 7.848206839032528e-06, - "loss": 0.7886, - "step": 4118 - }, - { - "epoch": 3.99, - "grad_norm": 1.2072851657867432, - "learning_rate": 7.844036697247707e-06, - "loss": 0.7571, - "step": 4119 - }, - { - "epoch": 3.99, - "grad_norm": 1.3472397327423096, - "learning_rate": 7.839866555462887e-06, - "loss": 0.6397, - "step": 4120 - }, - { - "epoch": 3.99, - "grad_norm": 1.2625677585601807, - "learning_rate": 7.835696413678067e-06, - "loss": 0.6698, - "step": 4121 - }, - { - "epoch": 3.99, - "grad_norm": 1.5513256788253784, - "learning_rate": 7.831526271893245e-06, - "loss": 0.8605, - "step": 4122 - }, - { - "epoch": 3.99, - "grad_norm": 1.3021574020385742, - "learning_rate": 7.827356130108423e-06, - "loss": 0.59, - "step": 4123 - }, - { - "epoch": 3.99, - "grad_norm": 1.3085097074508667, - "learning_rate": 7.823185988323603e-06, - "loss": 0.7135, - "step": 4124 - }, - { - "epoch": 3.99, - "grad_norm": 1.7146204710006714, - "learning_rate": 7.819015846538782e-06, - "loss": 0.7301, - "step": 4125 - }, - { - "epoch": 3.99, - "grad_norm": 1.4580557346343994, - "learning_rate": 7.814845704753962e-06, - "loss": 0.5922, - "step": 4126 - }, - { - "epoch": 3.99, - "grad_norm": 1.3994919061660767, - "learning_rate": 7.810675562969142e-06, - "loss": 0.7901, - "step": 4127 - }, - { - "epoch": 3.99, - "grad_norm": 1.262799859046936, - "learning_rate": 7.80650542118432e-06, - "loss": 0.7913, - "step": 4128 - }, - { - "epoch": 4.0, - "grad_norm": 1.7014191150665283, - "learning_rate": 7.8023352793995e-06, - "loss": 0.6946, - "step": 4129 - }, - { - "epoch": 4.0, - "grad_norm": 1.3678264617919922, - "learning_rate": 7.79816513761468e-06, - "loss": 0.6873, - "step": 4130 - }, - { - "epoch": 4.0, - "grad_norm": 1.4684674739837646, - "learning_rate": 7.793994995829859e-06, - "loss": 0.7323, - "step": 4131 - }, - { - "epoch": 4.0, - "grad_norm": 1.1394392251968384, - "learning_rate": 7.789824854045039e-06, - "loss": 0.8703, - "step": 4132 - }, - { - "epoch": 4.0, - "grad_norm": 0.9769808650016785, - "learning_rate": 7.785654712260217e-06, - "loss": 0.7576, - "step": 4133 - }, - { - "epoch": 4.0, - "grad_norm": 1.3571994304656982, - "learning_rate": 7.781484570475397e-06, - "loss": 0.6476, - "step": 4134 - }, - { - "epoch": 4.0, - "grad_norm": 1.3787708282470703, - "learning_rate": 7.777314428690575e-06, - "loss": 0.5787, - "step": 4135 - }, - { - "epoch": 4.0, - "grad_norm": 1.407253384590149, - "learning_rate": 7.773144286905755e-06, - "loss": 0.5968, - "step": 4136 - }, - { - "epoch": 4.0, - "grad_norm": 1.2448159456253052, - "learning_rate": 7.768974145120934e-06, - "loss": 0.6409, - "step": 4137 - }, - { - "epoch": 4.0, - "grad_norm": 1.0467275381088257, - "learning_rate": 7.764804003336114e-06, - "loss": 0.6802, - "step": 4138 - }, - { - "epoch": 4.0, - "grad_norm": 1.128260850906372, - "learning_rate": 7.760633861551294e-06, - "loss": 0.7138, - "step": 4139 - }, - { - "epoch": 4.01, - "grad_norm": 1.056241750717163, - "learning_rate": 7.756463719766472e-06, - "loss": 0.6125, - "step": 4140 - }, - { - "epoch": 4.01, - "grad_norm": 1.4507592916488647, - "learning_rate": 7.752293577981652e-06, - "loss": 0.6628, - "step": 4141 - }, - { - "epoch": 4.01, - "grad_norm": 1.4795253276824951, - "learning_rate": 7.748123436196832e-06, - "loss": 0.6709, - "step": 4142 - }, - { - "epoch": 4.01, - "grad_norm": 1.2754091024398804, - "learning_rate": 7.74395329441201e-06, - "loss": 0.6158, - "step": 4143 - }, - { - "epoch": 4.01, - "grad_norm": 1.1661182641983032, - "learning_rate": 7.739783152627189e-06, - "loss": 0.6366, - "step": 4144 - }, - { - "epoch": 4.01, - "grad_norm": 1.0788447856903076, - "learning_rate": 7.735613010842369e-06, - "loss": 0.6567, - "step": 4145 - }, - { - "epoch": 4.01, - "grad_norm": 1.242462158203125, - "learning_rate": 7.731442869057547e-06, - "loss": 0.6916, - "step": 4146 - }, - { - "epoch": 4.01, - "grad_norm": 1.7863349914550781, - "learning_rate": 7.727272727272727e-06, - "loss": 0.8489, - "step": 4147 - }, - { - "epoch": 4.01, - "grad_norm": 1.997514247894287, - "learning_rate": 7.723102585487907e-06, - "loss": 0.6475, - "step": 4148 - }, - { - "epoch": 4.01, - "grad_norm": 1.5353741645812988, - "learning_rate": 7.718932443703086e-06, - "loss": 0.4718, - "step": 4149 - }, - { - "epoch": 4.02, - "grad_norm": 1.6328120231628418, - "learning_rate": 7.714762301918266e-06, - "loss": 0.5622, - "step": 4150 - }, - { - "epoch": 4.02, - "grad_norm": 1.7414747476577759, - "learning_rate": 7.710592160133446e-06, - "loss": 0.7286, - "step": 4151 - }, - { - "epoch": 4.02, - "grad_norm": 1.3826340436935425, - "learning_rate": 7.706422018348624e-06, - "loss": 0.6723, - "step": 4152 - }, - { - "epoch": 4.02, - "grad_norm": 1.1786415576934814, - "learning_rate": 7.702251876563804e-06, - "loss": 0.7049, - "step": 4153 - }, - { - "epoch": 4.02, - "grad_norm": 1.5943514108657837, - "learning_rate": 7.698081734778984e-06, - "loss": 0.7895, - "step": 4154 - }, - { - "epoch": 4.02, - "grad_norm": 1.3432366847991943, - "learning_rate": 7.693911592994162e-06, - "loss": 0.8594, - "step": 4155 - }, - { - "epoch": 4.02, - "grad_norm": 1.395645022392273, - "learning_rate": 7.68974145120934e-06, - "loss": 0.6955, - "step": 4156 - }, - { - "epoch": 4.02, - "grad_norm": 1.2574201822280884, - "learning_rate": 7.68557130942452e-06, - "loss": 0.6216, - "step": 4157 - }, - { - "epoch": 4.02, - "grad_norm": 1.43873929977417, - "learning_rate": 7.681401167639699e-06, - "loss": 0.7228, - "step": 4158 - }, - { - "epoch": 4.02, - "grad_norm": 1.8955302238464355, - "learning_rate": 7.67723102585488e-06, - "loss": 0.6689, - "step": 4159 - }, - { - "epoch": 4.03, - "grad_norm": 1.3884892463684082, - "learning_rate": 7.67306088407006e-06, - "loss": 0.6353, - "step": 4160 - }, - { - "epoch": 4.03, - "grad_norm": 1.4339274168014526, - "learning_rate": 7.668890742285238e-06, - "loss": 0.7154, - "step": 4161 - }, - { - "epoch": 4.03, - "grad_norm": 1.2962485551834106, - "learning_rate": 7.664720600500418e-06, - "loss": 0.6518, - "step": 4162 - }, - { - "epoch": 4.03, - "grad_norm": 1.14412522315979, - "learning_rate": 7.660550458715598e-06, - "loss": 0.6968, - "step": 4163 - }, - { - "epoch": 4.03, - "grad_norm": 1.1335933208465576, - "learning_rate": 7.656380316930776e-06, - "loss": 0.6509, - "step": 4164 - }, - { - "epoch": 4.03, - "grad_norm": 1.2859175205230713, - "learning_rate": 7.652210175145956e-06, - "loss": 0.7919, - "step": 4165 - }, - { - "epoch": 4.03, - "grad_norm": 1.3848085403442383, - "learning_rate": 7.648040033361134e-06, - "loss": 0.6549, - "step": 4166 - }, - { - "epoch": 4.03, - "grad_norm": 1.1246529817581177, - "learning_rate": 7.643869891576313e-06, - "loss": 0.605, - "step": 4167 - }, - { - "epoch": 4.03, - "grad_norm": 1.286983609199524, - "learning_rate": 7.639699749791493e-06, - "loss": 0.6526, - "step": 4168 - }, - { - "epoch": 4.03, - "grad_norm": 1.1350047588348389, - "learning_rate": 7.635529608006673e-06, - "loss": 0.5825, - "step": 4169 - }, - { - "epoch": 4.03, - "grad_norm": 1.4251058101654053, - "learning_rate": 7.631359466221851e-06, - "loss": 0.7617, - "step": 4170 - }, - { - "epoch": 4.04, - "grad_norm": 1.2032486200332642, - "learning_rate": 7.627189324437031e-06, - "loss": 0.7039, - "step": 4171 - }, - { - "epoch": 4.04, - "grad_norm": 1.334125280380249, - "learning_rate": 7.62301918265221e-06, - "loss": 0.7031, - "step": 4172 - }, - { - "epoch": 4.04, - "grad_norm": 1.3071873188018799, - "learning_rate": 7.61884904086739e-06, - "loss": 0.7319, - "step": 4173 - }, - { - "epoch": 4.04, - "grad_norm": 1.1474069356918335, - "learning_rate": 7.6146788990825695e-06, - "loss": 0.719, - "step": 4174 - }, - { - "epoch": 4.04, - "grad_norm": 1.1719615459442139, - "learning_rate": 7.610508757297749e-06, - "loss": 0.7048, - "step": 4175 - }, - { - "epoch": 4.04, - "grad_norm": 1.4841634035110474, - "learning_rate": 7.606338615512929e-06, - "loss": 0.7132, - "step": 4176 - }, - { - "epoch": 4.04, - "grad_norm": 1.193687915802002, - "learning_rate": 7.602168473728106e-06, - "loss": 0.6186, - "step": 4177 - }, - { - "epoch": 4.04, - "grad_norm": 1.4975613355636597, - "learning_rate": 7.597998331943286e-06, - "loss": 0.6453, - "step": 4178 - }, - { - "epoch": 4.04, - "grad_norm": 1.264892339706421, - "learning_rate": 7.5938281901584655e-06, - "loss": 0.6619, - "step": 4179 - }, - { - "epoch": 4.04, - "grad_norm": 1.7290844917297363, - "learning_rate": 7.589658048373645e-06, - "loss": 0.7351, - "step": 4180 - }, - { - "epoch": 4.05, - "grad_norm": 1.3005568981170654, - "learning_rate": 7.585487906588825e-06, - "loss": 0.7248, - "step": 4181 - }, - { - "epoch": 4.05, - "grad_norm": 1.3518257141113281, - "learning_rate": 7.581317764804004e-06, - "loss": 0.7722, - "step": 4182 - }, - { - "epoch": 4.05, - "grad_norm": 1.853224277496338, - "learning_rate": 7.577147623019183e-06, - "loss": 0.6742, - "step": 4183 - }, - { - "epoch": 4.05, - "grad_norm": 1.3090929985046387, - "learning_rate": 7.572977481234362e-06, - "loss": 0.7192, - "step": 4184 - }, - { - "epoch": 4.05, - "grad_norm": 1.3071833848953247, - "learning_rate": 7.568807339449542e-06, - "loss": 0.6447, - "step": 4185 - }, - { - "epoch": 4.05, - "grad_norm": 1.6736398935317993, - "learning_rate": 7.5646371976647215e-06, - "loss": 0.5652, - "step": 4186 - }, - { - "epoch": 4.05, - "grad_norm": 1.0377569198608398, - "learning_rate": 7.560467055879901e-06, - "loss": 0.5316, - "step": 4187 - }, - { - "epoch": 4.05, - "grad_norm": 1.1365035772323608, - "learning_rate": 7.556296914095079e-06, - "loss": 0.777, - "step": 4188 - }, - { - "epoch": 4.05, - "grad_norm": 1.4174660444259644, - "learning_rate": 7.552126772310258e-06, - "loss": 0.6789, - "step": 4189 - }, - { - "epoch": 4.05, - "grad_norm": 1.4120662212371826, - "learning_rate": 7.547956630525438e-06, - "loss": 0.7055, - "step": 4190 - }, - { - "epoch": 4.06, - "grad_norm": 1.2537157535552979, - "learning_rate": 7.5437864887406174e-06, - "loss": 0.6143, - "step": 4191 - }, - { - "epoch": 4.06, - "grad_norm": 1.7365707159042358, - "learning_rate": 7.539616346955797e-06, - "loss": 0.8578, - "step": 4192 - }, - { - "epoch": 4.06, - "grad_norm": 1.385790467262268, - "learning_rate": 7.535446205170976e-06, - "loss": 0.6827, - "step": 4193 - }, - { - "epoch": 4.06, - "grad_norm": 1.1795783042907715, - "learning_rate": 7.531276063386156e-06, - "loss": 0.5481, - "step": 4194 - }, - { - "epoch": 4.06, - "grad_norm": 1.4059340953826904, - "learning_rate": 7.527105921601335e-06, - "loss": 0.657, - "step": 4195 - }, - { - "epoch": 4.06, - "grad_norm": 1.4970194101333618, - "learning_rate": 7.522935779816514e-06, - "loss": 0.7697, - "step": 4196 - }, - { - "epoch": 4.06, - "grad_norm": 1.3211995363235474, - "learning_rate": 7.518765638031694e-06, - "loss": 0.7945, - "step": 4197 - }, - { - "epoch": 4.06, - "grad_norm": 1.497291922569275, - "learning_rate": 7.5145954962468735e-06, - "loss": 0.5842, - "step": 4198 - }, - { - "epoch": 4.06, - "grad_norm": 1.3536211252212524, - "learning_rate": 7.510425354462052e-06, - "loss": 0.8181, - "step": 4199 - }, - { - "epoch": 4.06, - "grad_norm": 1.2283011674880981, - "learning_rate": 7.506255212677231e-06, - "loss": 0.8476, - "step": 4200 - }, - { - "epoch": 4.06, - "eval_loss": 0.8597999811172485, - "eval_runtime": 861.3569, - "eval_samples_per_second": 4.799, - "eval_steps_per_second": 0.6, - "step": 4200 - }, - { - "epoch": 4.06, - "grad_norm": 1.6468404531478882, - "learning_rate": 7.50208507089241e-06, - "loss": 0.8944, - "step": 4201 - }, - { - "epoch": 4.07, - "grad_norm": 1.2757965326309204, - "learning_rate": 7.49791492910759e-06, - "loss": 0.5931, - "step": 4202 - }, - { - "epoch": 4.07, - "grad_norm": 1.816794991493225, - "learning_rate": 7.493744787322769e-06, - "loss": 0.7, - "step": 4203 - }, - { - "epoch": 4.07, - "grad_norm": 1.759100317955017, - "learning_rate": 7.489574645537949e-06, - "loss": 0.7633, - "step": 4204 - }, - { - "epoch": 4.07, - "grad_norm": 1.4398976564407349, - "learning_rate": 7.485404503753128e-06, - "loss": 0.813, - "step": 4205 - }, - { - "epoch": 4.07, - "grad_norm": 1.2360817193984985, - "learning_rate": 7.481234361968308e-06, - "loss": 0.6071, - "step": 4206 - }, - { - "epoch": 4.07, - "grad_norm": 1.3215487003326416, - "learning_rate": 7.477064220183487e-06, - "loss": 0.7152, - "step": 4207 - }, - { - "epoch": 4.07, - "grad_norm": 1.0810505151748657, - "learning_rate": 7.472894078398666e-06, - "loss": 0.6263, - "step": 4208 - }, - { - "epoch": 4.07, - "grad_norm": 1.23736572265625, - "learning_rate": 7.468723936613846e-06, - "loss": 0.8244, - "step": 4209 - }, - { - "epoch": 4.07, - "grad_norm": 1.5156235694885254, - "learning_rate": 7.464553794829024e-06, - "loss": 0.6425, - "step": 4210 - }, - { - "epoch": 4.07, - "grad_norm": 1.2087029218673706, - "learning_rate": 7.460383653044204e-06, - "loss": 0.7957, - "step": 4211 - }, - { - "epoch": 4.08, - "grad_norm": 1.286115288734436, - "learning_rate": 7.456213511259383e-06, - "loss": 0.5744, - "step": 4212 - }, - { - "epoch": 4.08, - "grad_norm": 1.2504583597183228, - "learning_rate": 7.452043369474562e-06, - "loss": 0.6125, - "step": 4213 - }, - { - "epoch": 4.08, - "grad_norm": 1.2574108839035034, - "learning_rate": 7.447873227689741e-06, - "loss": 0.6948, - "step": 4214 - }, - { - "epoch": 4.08, - "grad_norm": 1.2414987087249756, - "learning_rate": 7.443703085904921e-06, - "loss": 0.5922, - "step": 4215 - }, - { - "epoch": 4.08, - "grad_norm": 1.456743836402893, - "learning_rate": 7.4395329441201005e-06, - "loss": 0.6379, - "step": 4216 - }, - { - "epoch": 4.08, - "grad_norm": 1.852477788925171, - "learning_rate": 7.43536280233528e-06, - "loss": 0.6489, - "step": 4217 - }, - { - "epoch": 4.08, - "grad_norm": 2.1483681201934814, - "learning_rate": 7.43119266055046e-06, - "loss": 0.6942, - "step": 4218 - }, - { - "epoch": 4.08, - "grad_norm": 1.3294167518615723, - "learning_rate": 7.427022518765639e-06, - "loss": 0.6328, - "step": 4219 - }, - { - "epoch": 4.08, - "grad_norm": 1.067713975906372, - "learning_rate": 7.422852376980818e-06, - "loss": 0.7014, - "step": 4220 - }, - { - "epoch": 4.08, - "grad_norm": 1.8215084075927734, - "learning_rate": 7.4186822351959965e-06, - "loss": 0.7014, - "step": 4221 - }, - { - "epoch": 4.09, - "grad_norm": 1.0847300291061401, - "learning_rate": 7.414512093411176e-06, - "loss": 0.6824, - "step": 4222 - }, - { - "epoch": 4.09, - "grad_norm": 1.9889841079711914, - "learning_rate": 7.410341951626356e-06, - "loss": 0.5876, - "step": 4223 - }, - { - "epoch": 4.09, - "grad_norm": 1.2415003776550293, - "learning_rate": 7.406171809841535e-06, - "loss": 0.6037, - "step": 4224 - }, - { - "epoch": 4.09, - "grad_norm": 1.4081982374191284, - "learning_rate": 7.402001668056714e-06, - "loss": 0.6475, - "step": 4225 - }, - { - "epoch": 4.09, - "grad_norm": 2.0091793537139893, - "learning_rate": 7.397831526271893e-06, - "loss": 0.8408, - "step": 4226 - }, - { - "epoch": 4.09, - "grad_norm": 1.2206426858901978, - "learning_rate": 7.393661384487073e-06, - "loss": 0.7029, - "step": 4227 - }, - { - "epoch": 4.09, - "grad_norm": 1.1421000957489014, - "learning_rate": 7.3894912427022525e-06, - "loss": 0.7084, - "step": 4228 - }, - { - "epoch": 4.09, - "grad_norm": 1.6849780082702637, - "learning_rate": 7.385321100917432e-06, - "loss": 0.7911, - "step": 4229 - }, - { - "epoch": 4.09, - "grad_norm": 1.2689474821090698, - "learning_rate": 7.381150959132612e-06, - "loss": 0.5672, - "step": 4230 - }, - { - "epoch": 4.09, - "grad_norm": 1.1003674268722534, - "learning_rate": 7.376980817347791e-06, - "loss": 0.7208, - "step": 4231 - }, - { - "epoch": 4.09, - "grad_norm": 1.3360180854797363, - "learning_rate": 7.372810675562969e-06, - "loss": 0.6315, - "step": 4232 - }, - { - "epoch": 4.1, - "grad_norm": 2.1548523902893066, - "learning_rate": 7.3686405337781484e-06, - "loss": 0.8421, - "step": 4233 - }, - { - "epoch": 4.1, - "grad_norm": 1.4975672960281372, - "learning_rate": 7.364470391993328e-06, - "loss": 0.7763, - "step": 4234 - }, - { - "epoch": 4.1, - "grad_norm": 1.7296528816223145, - "learning_rate": 7.360300250208507e-06, - "loss": 0.5566, - "step": 4235 - }, - { - "epoch": 4.1, - "grad_norm": 1.404763102531433, - "learning_rate": 7.356130108423687e-06, - "loss": 0.6582, - "step": 4236 - }, - { - "epoch": 4.1, - "grad_norm": 1.8087592124938965, - "learning_rate": 7.351959966638866e-06, - "loss": 0.7118, - "step": 4237 - }, - { - "epoch": 4.1, - "grad_norm": 1.3908724784851074, - "learning_rate": 7.347789824854045e-06, - "loss": 0.7868, - "step": 4238 - }, - { - "epoch": 4.1, - "grad_norm": 1.4540789127349854, - "learning_rate": 7.343619683069225e-06, - "loss": 0.611, - "step": 4239 - }, - { - "epoch": 4.1, - "grad_norm": 1.3166730403900146, - "learning_rate": 7.3394495412844045e-06, - "loss": 0.7433, - "step": 4240 - }, - { - "epoch": 4.1, - "grad_norm": 1.3730759620666504, - "learning_rate": 7.335279399499584e-06, - "loss": 0.741, - "step": 4241 - }, - { - "epoch": 4.1, - "grad_norm": 1.1578608751296997, - "learning_rate": 7.331109257714764e-06, - "loss": 0.6909, - "step": 4242 - }, - { - "epoch": 4.11, - "grad_norm": 1.2952829599380493, - "learning_rate": 7.326939115929941e-06, - "loss": 0.7476, - "step": 4243 - }, - { - "epoch": 4.11, - "grad_norm": 1.559354305267334, - "learning_rate": 7.322768974145121e-06, - "loss": 0.7231, - "step": 4244 - }, - { - "epoch": 4.11, - "grad_norm": 1.7259479761123657, - "learning_rate": 7.3185988323603e-06, - "loss": 0.7192, - "step": 4245 - }, - { - "epoch": 4.11, - "grad_norm": 1.373360276222229, - "learning_rate": 7.31442869057548e-06, - "loss": 0.7247, - "step": 4246 - }, - { - "epoch": 4.11, - "grad_norm": 1.2092353105545044, - "learning_rate": 7.310258548790659e-06, - "loss": 0.6721, - "step": 4247 - }, - { - "epoch": 4.11, - "grad_norm": 1.6677278280258179, - "learning_rate": 7.306088407005839e-06, - "loss": 0.6647, - "step": 4248 - }, - { - "epoch": 4.11, - "grad_norm": 1.4248433113098145, - "learning_rate": 7.301918265221018e-06, - "loss": 0.6858, - "step": 4249 - }, - { - "epoch": 4.11, - "grad_norm": 1.6589043140411377, - "learning_rate": 7.297748123436197e-06, - "loss": 0.7623, - "step": 4250 - }, - { - "epoch": 4.11, - "grad_norm": 1.4277273416519165, - "learning_rate": 7.293577981651377e-06, - "loss": 0.8315, - "step": 4251 - }, - { - "epoch": 4.11, - "grad_norm": 1.2882044315338135, - "learning_rate": 7.289407839866556e-06, - "loss": 0.68, - "step": 4252 - }, - { - "epoch": 4.12, - "grad_norm": 1.6518248319625854, - "learning_rate": 7.285237698081736e-06, - "loss": 0.6924, - "step": 4253 - }, - { - "epoch": 4.12, - "grad_norm": 1.9300150871276855, - "learning_rate": 7.281067556296914e-06, - "loss": 0.729, - "step": 4254 - }, - { - "epoch": 4.12, - "grad_norm": 1.2841802835464478, - "learning_rate": 7.276897414512093e-06, - "loss": 0.5875, - "step": 4255 - }, - { - "epoch": 4.12, - "grad_norm": 1.8010226488113403, - "learning_rate": 7.272727272727272e-06, - "loss": 0.6811, - "step": 4256 - }, - { - "epoch": 4.12, - "grad_norm": 1.115170955657959, - "learning_rate": 7.268557130942452e-06, - "loss": 0.6961, - "step": 4257 - }, - { - "epoch": 4.12, - "grad_norm": 1.9197747707366943, - "learning_rate": 7.2643869891576315e-06, - "loss": 0.6216, - "step": 4258 - }, - { - "epoch": 4.12, - "grad_norm": 1.5886878967285156, - "learning_rate": 7.260216847372811e-06, - "loss": 0.6624, - "step": 4259 - }, - { - "epoch": 4.12, - "grad_norm": 1.5277332067489624, - "learning_rate": 7.256046705587991e-06, - "loss": 0.7778, - "step": 4260 - }, - { - "epoch": 4.12, - "grad_norm": 1.5861066579818726, - "learning_rate": 7.25187656380317e-06, - "loss": 0.7441, - "step": 4261 - }, - { - "epoch": 4.12, - "grad_norm": 1.7800103425979614, - "learning_rate": 7.247706422018349e-06, - "loss": 0.7026, - "step": 4262 - }, - { - "epoch": 4.12, - "grad_norm": 1.6363444328308105, - "learning_rate": 7.243536280233529e-06, - "loss": 0.7031, - "step": 4263 - }, - { - "epoch": 4.13, - "grad_norm": 1.7111765146255493, - "learning_rate": 7.239366138448708e-06, - "loss": 0.6233, - "step": 4264 - }, - { - "epoch": 4.13, - "grad_norm": 1.126967191696167, - "learning_rate": 7.235195996663887e-06, - "loss": 0.5954, - "step": 4265 - }, - { - "epoch": 4.13, - "grad_norm": 1.6149272918701172, - "learning_rate": 7.231025854879066e-06, - "loss": 0.7709, - "step": 4266 - }, - { - "epoch": 4.13, - "grad_norm": 1.5248738527297974, - "learning_rate": 7.226855713094245e-06, - "loss": 0.6216, - "step": 4267 - }, - { - "epoch": 4.13, - "grad_norm": 1.8891106843948364, - "learning_rate": 7.222685571309424e-06, - "loss": 0.8195, - "step": 4268 - }, - { - "epoch": 4.13, - "grad_norm": 1.798587441444397, - "learning_rate": 7.218515429524604e-06, - "loss": 0.6679, - "step": 4269 - }, - { - "epoch": 4.13, - "grad_norm": 1.6670140027999878, - "learning_rate": 7.2143452877397835e-06, - "loss": 0.6251, - "step": 4270 - }, - { - "epoch": 4.13, - "grad_norm": 1.3214702606201172, - "learning_rate": 7.210175145954963e-06, - "loss": 0.5787, - "step": 4271 - }, - { - "epoch": 4.13, - "grad_norm": 2.0496490001678467, - "learning_rate": 7.206005004170143e-06, - "loss": 0.6629, - "step": 4272 - }, - { - "epoch": 4.13, - "grad_norm": 1.1761988401412964, - "learning_rate": 7.201834862385322e-06, - "loss": 0.7753, - "step": 4273 - }, - { - "epoch": 4.14, - "grad_norm": 1.1243436336517334, - "learning_rate": 7.197664720600501e-06, - "loss": 0.602, - "step": 4274 - }, - { - "epoch": 4.14, - "grad_norm": 1.3555951118469238, - "learning_rate": 7.193494578815681e-06, - "loss": 0.5541, - "step": 4275 - }, - { - "epoch": 4.14, - "grad_norm": 1.6746872663497925, - "learning_rate": 7.189324437030859e-06, - "loss": 0.7901, - "step": 4276 - }, - { - "epoch": 4.14, - "grad_norm": 1.5441886186599731, - "learning_rate": 7.185154295246038e-06, - "loss": 0.7559, - "step": 4277 - }, - { - "epoch": 4.14, - "grad_norm": 1.4386554956436157, - "learning_rate": 7.180984153461218e-06, - "loss": 0.6865, - "step": 4278 - }, - { - "epoch": 4.14, - "grad_norm": 1.2164535522460938, - "learning_rate": 7.176814011676397e-06, - "loss": 0.7552, - "step": 4279 - }, - { - "epoch": 4.14, - "grad_norm": 1.222366213798523, - "learning_rate": 7.172643869891576e-06, - "loss": 0.506, - "step": 4280 - }, - { - "epoch": 4.14, - "grad_norm": 1.8190783262252808, - "learning_rate": 7.168473728106756e-06, - "loss": 0.8186, - "step": 4281 - }, - { - "epoch": 4.14, - "grad_norm": 1.2464604377746582, - "learning_rate": 7.1643035863219355e-06, - "loss": 0.8343, - "step": 4282 - }, - { - "epoch": 4.14, - "grad_norm": 1.315718650817871, - "learning_rate": 7.160133444537115e-06, - "loss": 0.6059, - "step": 4283 - }, - { - "epoch": 4.15, - "grad_norm": 1.3518524169921875, - "learning_rate": 7.155963302752295e-06, - "loss": 0.634, - "step": 4284 - }, - { - "epoch": 4.15, - "grad_norm": 1.4598798751831055, - "learning_rate": 7.151793160967474e-06, - "loss": 0.7169, - "step": 4285 - }, - { - "epoch": 4.15, - "grad_norm": 1.5651257038116455, - "learning_rate": 7.147623019182653e-06, - "loss": 0.7569, - "step": 4286 - }, - { - "epoch": 4.15, - "grad_norm": 1.135716199874878, - "learning_rate": 7.143452877397831e-06, - "loss": 0.5594, - "step": 4287 - }, - { - "epoch": 4.15, - "grad_norm": 1.3022222518920898, - "learning_rate": 7.139282735613011e-06, - "loss": 0.6758, - "step": 4288 - }, - { - "epoch": 4.15, - "grad_norm": 1.3710997104644775, - "learning_rate": 7.13511259382819e-06, - "loss": 0.6633, - "step": 4289 - }, - { - "epoch": 4.15, - "grad_norm": 1.335264801979065, - "learning_rate": 7.13094245204337e-06, - "loss": 0.8073, - "step": 4290 - }, - { - "epoch": 4.15, - "grad_norm": 1.6815357208251953, - "learning_rate": 7.126772310258549e-06, - "loss": 0.6771, - "step": 4291 - }, - { - "epoch": 4.15, - "grad_norm": 1.7518526315689087, - "learning_rate": 7.122602168473728e-06, - "loss": 0.6287, - "step": 4292 - }, - { - "epoch": 4.15, - "grad_norm": 1.338187336921692, - "learning_rate": 7.118432026688908e-06, - "loss": 0.5854, - "step": 4293 - }, - { - "epoch": 4.15, - "grad_norm": 1.319826364517212, - "learning_rate": 7.114261884904087e-06, - "loss": 0.5895, - "step": 4294 - }, - { - "epoch": 4.16, - "grad_norm": 1.6188857555389404, - "learning_rate": 7.110091743119267e-06, - "loss": 0.909, - "step": 4295 - }, - { - "epoch": 4.16, - "grad_norm": 1.8238894939422607, - "learning_rate": 7.105921601334447e-06, - "loss": 0.6531, - "step": 4296 - }, - { - "epoch": 4.16, - "grad_norm": 1.8153358697891235, - "learning_rate": 7.101751459549626e-06, - "loss": 0.7412, - "step": 4297 - }, - { - "epoch": 4.16, - "grad_norm": 1.539629340171814, - "learning_rate": 7.097581317764803e-06, - "loss": 0.6625, - "step": 4298 - }, - { - "epoch": 4.16, - "grad_norm": 1.680781364440918, - "learning_rate": 7.093411175979983e-06, - "loss": 0.5307, - "step": 4299 - }, - { - "epoch": 4.16, - "grad_norm": 1.341411828994751, - "learning_rate": 7.0892410341951625e-06, - "loss": 0.7209, - "step": 4300 - }, - { - "epoch": 4.16, - "grad_norm": 1.168107032775879, - "learning_rate": 7.085070892410342e-06, - "loss": 0.593, - "step": 4301 - }, - { - "epoch": 4.16, - "grad_norm": 1.2601464986801147, - "learning_rate": 7.080900750625522e-06, - "loss": 0.6123, - "step": 4302 - }, - { - "epoch": 4.16, - "grad_norm": 1.542240023612976, - "learning_rate": 7.076730608840701e-06, - "loss": 0.6779, - "step": 4303 - }, - { - "epoch": 4.16, - "grad_norm": 1.5732543468475342, - "learning_rate": 7.07256046705588e-06, - "loss": 0.713, - "step": 4304 - }, - { - "epoch": 4.17, - "grad_norm": 1.2263343334197998, - "learning_rate": 7.06839032527106e-06, - "loss": 0.6758, - "step": 4305 - }, - { - "epoch": 4.17, - "grad_norm": 1.4326815605163574, - "learning_rate": 7.064220183486239e-06, - "loss": 0.5931, - "step": 4306 - }, - { - "epoch": 4.17, - "grad_norm": 1.255967617034912, - "learning_rate": 7.0600500417014186e-06, - "loss": 0.6113, - "step": 4307 - }, - { - "epoch": 4.17, - "grad_norm": 1.4633044004440308, - "learning_rate": 7.055879899916598e-06, - "loss": 0.7899, - "step": 4308 - }, - { - "epoch": 4.17, - "grad_norm": 1.2892683744430542, - "learning_rate": 7.051709758131776e-06, - "loss": 0.7507, - "step": 4309 - }, - { - "epoch": 4.17, - "grad_norm": 2.049952268600464, - "learning_rate": 7.047539616346955e-06, - "loss": 0.6878, - "step": 4310 - }, - { - "epoch": 4.17, - "grad_norm": 2.016397714614868, - "learning_rate": 7.043369474562135e-06, - "loss": 0.832, - "step": 4311 - }, - { - "epoch": 4.17, - "grad_norm": 1.3233511447906494, - "learning_rate": 7.0391993327773145e-06, - "loss": 0.6509, - "step": 4312 - }, - { - "epoch": 4.17, - "grad_norm": 1.4653106927871704, - "learning_rate": 7.035029190992494e-06, - "loss": 0.6382, - "step": 4313 - }, - { - "epoch": 4.17, - "grad_norm": 1.0743006467819214, - "learning_rate": 7.030859049207674e-06, - "loss": 0.5874, - "step": 4314 - }, - { - "epoch": 4.18, - "grad_norm": 1.4268132448196411, - "learning_rate": 7.026688907422853e-06, - "loss": 0.7449, - "step": 4315 - }, - { - "epoch": 4.18, - "grad_norm": 1.418232798576355, - "learning_rate": 7.022518765638032e-06, - "loss": 0.7174, - "step": 4316 - }, - { - "epoch": 4.18, - "grad_norm": 1.6750450134277344, - "learning_rate": 7.018348623853212e-06, - "loss": 0.7305, - "step": 4317 - }, - { - "epoch": 4.18, - "grad_norm": 1.2780274152755737, - "learning_rate": 7.014178482068391e-06, - "loss": 0.6194, - "step": 4318 - }, - { - "epoch": 4.18, - "grad_norm": 1.248227596282959, - "learning_rate": 7.0100083402835705e-06, - "loss": 0.7412, - "step": 4319 - }, - { - "epoch": 4.18, - "grad_norm": 1.4439641237258911, - "learning_rate": 7.005838198498749e-06, - "loss": 0.6121, - "step": 4320 - }, - { - "epoch": 4.18, - "grad_norm": 1.3450074195861816, - "learning_rate": 7.001668056713928e-06, - "loss": 0.7011, - "step": 4321 - }, - { - "epoch": 4.18, - "grad_norm": 1.401066780090332, - "learning_rate": 6.997497914929107e-06, - "loss": 0.7435, - "step": 4322 - }, - { - "epoch": 4.18, - "grad_norm": 1.3789832592010498, - "learning_rate": 6.993327773144287e-06, - "loss": 0.7078, - "step": 4323 - }, - { - "epoch": 4.18, - "grad_norm": 1.165249228477478, - "learning_rate": 6.9891576313594665e-06, - "loss": 0.6393, - "step": 4324 - }, - { - "epoch": 4.18, - "grad_norm": 1.4255436658859253, - "learning_rate": 6.984987489574646e-06, - "loss": 0.7127, - "step": 4325 - }, - { - "epoch": 4.19, - "grad_norm": 1.549740195274353, - "learning_rate": 6.980817347789826e-06, - "loss": 0.5712, - "step": 4326 - }, - { - "epoch": 4.19, - "grad_norm": 1.22221839427948, - "learning_rate": 6.976647206005005e-06, - "loss": 0.5935, - "step": 4327 - }, - { - "epoch": 4.19, - "grad_norm": 1.5324398279190063, - "learning_rate": 6.972477064220184e-06, - "loss": 0.6328, - "step": 4328 - }, - { - "epoch": 4.19, - "grad_norm": 1.7590183019638062, - "learning_rate": 6.968306922435363e-06, - "loss": 0.609, - "step": 4329 - }, - { - "epoch": 4.19, - "grad_norm": 1.6595736742019653, - "learning_rate": 6.964136780650543e-06, - "loss": 0.7005, - "step": 4330 - }, - { - "epoch": 4.19, - "grad_norm": 1.2330102920532227, - "learning_rate": 6.959966638865721e-06, - "loss": 0.5565, - "step": 4331 - }, - { - "epoch": 4.19, - "grad_norm": 1.4129624366760254, - "learning_rate": 6.955796497080901e-06, - "loss": 0.7066, - "step": 4332 - }, - { - "epoch": 4.19, - "grad_norm": 1.362594723701477, - "learning_rate": 6.95162635529608e-06, - "loss": 0.7155, - "step": 4333 - }, - { - "epoch": 4.19, - "grad_norm": 1.6006284952163696, - "learning_rate": 6.947456213511259e-06, - "loss": 0.8129, - "step": 4334 - }, - { - "epoch": 4.19, - "grad_norm": 1.730517029762268, - "learning_rate": 6.943286071726439e-06, - "loss": 0.6247, - "step": 4335 - }, - { - "epoch": 4.2, - "grad_norm": 1.2271586656570435, - "learning_rate": 6.939115929941618e-06, - "loss": 0.5687, - "step": 4336 - }, - { - "epoch": 4.2, - "grad_norm": 1.6972774267196655, - "learning_rate": 6.934945788156798e-06, - "loss": 0.7686, - "step": 4337 - }, - { - "epoch": 4.2, - "grad_norm": 1.1939290761947632, - "learning_rate": 6.930775646371978e-06, - "loss": 0.6312, - "step": 4338 - }, - { - "epoch": 4.2, - "grad_norm": 1.2736120223999023, - "learning_rate": 6.926605504587157e-06, - "loss": 0.6084, - "step": 4339 - }, - { - "epoch": 4.2, - "grad_norm": 1.7349547147750854, - "learning_rate": 6.922435362802336e-06, - "loss": 0.5477, - "step": 4340 - }, - { - "epoch": 4.2, - "grad_norm": 1.8467882871627808, - "learning_rate": 6.918265221017515e-06, - "loss": 0.6469, - "step": 4341 - }, - { - "epoch": 4.2, - "grad_norm": 1.3428188562393188, - "learning_rate": 6.9140950792326936e-06, - "loss": 0.6622, - "step": 4342 - }, - { - "epoch": 4.2, - "grad_norm": 1.5804131031036377, - "learning_rate": 6.909924937447873e-06, - "loss": 0.8024, - "step": 4343 - }, - { - "epoch": 4.2, - "grad_norm": 1.5460153818130493, - "learning_rate": 6.905754795663053e-06, - "loss": 0.7976, - "step": 4344 - }, - { - "epoch": 4.2, - "grad_norm": 1.3329187631607056, - "learning_rate": 6.901584653878232e-06, - "loss": 0.6699, - "step": 4345 - }, - { - "epoch": 4.21, - "grad_norm": 1.2377336025238037, - "learning_rate": 6.897414512093411e-06, - "loss": 0.6386, - "step": 4346 - }, - { - "epoch": 4.21, - "grad_norm": 1.1283926963806152, - "learning_rate": 6.893244370308591e-06, - "loss": 0.6808, - "step": 4347 - }, - { - "epoch": 4.21, - "grad_norm": 1.1400607824325562, - "learning_rate": 6.88907422852377e-06, - "loss": 0.6433, - "step": 4348 - }, - { - "epoch": 4.21, - "grad_norm": 1.4880428314208984, - "learning_rate": 6.8849040867389496e-06, - "loss": 0.7377, - "step": 4349 - }, - { - "epoch": 4.21, - "grad_norm": 1.3610410690307617, - "learning_rate": 6.880733944954129e-06, - "loss": 0.6365, - "step": 4350 - }, - { - "epoch": 4.21, - "grad_norm": 1.3585680723190308, - "learning_rate": 6.876563803169309e-06, - "loss": 0.7175, - "step": 4351 - }, - { - "epoch": 4.21, - "grad_norm": 1.3938407897949219, - "learning_rate": 6.872393661384488e-06, - "loss": 0.7103, - "step": 4352 - }, - { - "epoch": 4.21, - "grad_norm": 1.8672634363174438, - "learning_rate": 6.868223519599666e-06, - "loss": 0.785, - "step": 4353 - }, - { - "epoch": 4.21, - "grad_norm": 1.3515386581420898, - "learning_rate": 6.8640533778148455e-06, - "loss": 0.6009, - "step": 4354 - }, - { - "epoch": 4.21, - "grad_norm": 1.4289917945861816, - "learning_rate": 6.859883236030025e-06, - "loss": 0.6041, - "step": 4355 - }, - { - "epoch": 4.21, - "grad_norm": 1.355471134185791, - "learning_rate": 6.855713094245205e-06, - "loss": 0.8069, - "step": 4356 - }, - { - "epoch": 4.22, - "grad_norm": 1.3696022033691406, - "learning_rate": 6.851542952460384e-06, - "loss": 0.8306, - "step": 4357 - }, - { - "epoch": 4.22, - "grad_norm": 1.4903416633605957, - "learning_rate": 6.847372810675563e-06, - "loss": 0.6864, - "step": 4358 - }, - { - "epoch": 4.22, - "grad_norm": 1.4738951921463013, - "learning_rate": 6.843202668890743e-06, - "loss": 0.5804, - "step": 4359 - }, - { - "epoch": 4.22, - "grad_norm": 1.2567520141601562, - "learning_rate": 6.839032527105922e-06, - "loss": 0.6282, - "step": 4360 - }, - { - "epoch": 4.22, - "grad_norm": 1.3491474390029907, - "learning_rate": 6.8348623853211015e-06, - "loss": 0.6883, - "step": 4361 - }, - { - "epoch": 4.22, - "grad_norm": 1.539475440979004, - "learning_rate": 6.830692243536281e-06, - "loss": 0.6455, - "step": 4362 - }, - { - "epoch": 4.22, - "grad_norm": 1.4578179121017456, - "learning_rate": 6.826522101751461e-06, - "loss": 0.7266, - "step": 4363 - }, - { - "epoch": 4.22, - "grad_norm": 1.6439467668533325, - "learning_rate": 6.822351959966638e-06, - "loss": 0.7019, - "step": 4364 - }, - { - "epoch": 4.22, - "grad_norm": 1.4401473999023438, - "learning_rate": 6.818181818181818e-06, - "loss": 0.7081, - "step": 4365 - }, - { - "epoch": 4.22, - "grad_norm": 1.6413389444351196, - "learning_rate": 6.8140116763969975e-06, - "loss": 0.7683, - "step": 4366 - }, - { - "epoch": 4.23, - "grad_norm": 1.5531343221664429, - "learning_rate": 6.809841534612177e-06, - "loss": 0.7655, - "step": 4367 - }, - { - "epoch": 4.23, - "grad_norm": 1.2082351446151733, - "learning_rate": 6.805671392827357e-06, - "loss": 0.6311, - "step": 4368 - }, - { - "epoch": 4.23, - "grad_norm": 1.2384322881698608, - "learning_rate": 6.801501251042536e-06, - "loss": 0.7649, - "step": 4369 - }, - { - "epoch": 4.23, - "grad_norm": 1.3738021850585938, - "learning_rate": 6.797331109257715e-06, - "loss": 0.8453, - "step": 4370 - }, - { - "epoch": 4.23, - "grad_norm": 1.4350394010543823, - "learning_rate": 6.793160967472894e-06, - "loss": 0.9358, - "step": 4371 - }, - { - "epoch": 4.23, - "grad_norm": 1.2021479606628418, - "learning_rate": 6.788990825688074e-06, - "loss": 0.5609, - "step": 4372 - }, - { - "epoch": 4.23, - "grad_norm": 1.453177571296692, - "learning_rate": 6.7848206839032535e-06, - "loss": 0.6782, - "step": 4373 - }, - { - "epoch": 4.23, - "grad_norm": 1.809272050857544, - "learning_rate": 6.780650542118433e-06, - "loss": 0.6943, - "step": 4374 - }, - { - "epoch": 4.23, - "grad_norm": 1.274755597114563, - "learning_rate": 6.776480400333611e-06, - "loss": 0.8202, - "step": 4375 - }, - { - "epoch": 4.23, - "grad_norm": 1.4974430799484253, - "learning_rate": 6.77231025854879e-06, - "loss": 0.6515, - "step": 4376 - }, - { - "epoch": 4.24, - "grad_norm": 1.6731135845184326, - "learning_rate": 6.76814011676397e-06, - "loss": 0.6141, - "step": 4377 - }, - { - "epoch": 4.24, - "grad_norm": 1.4311158657073975, - "learning_rate": 6.763969974979149e-06, - "loss": 0.5637, - "step": 4378 - }, - { - "epoch": 4.24, - "grad_norm": 1.4362891912460327, - "learning_rate": 6.759799833194329e-06, - "loss": 0.6189, - "step": 4379 - }, - { - "epoch": 4.24, - "grad_norm": 1.370519757270813, - "learning_rate": 6.755629691409509e-06, - "loss": 0.6415, - "step": 4380 - }, - { - "epoch": 4.24, - "grad_norm": 1.4324407577514648, - "learning_rate": 6.751459549624688e-06, - "loss": 0.6882, - "step": 4381 - }, - { - "epoch": 4.24, - "grad_norm": 1.5416909456253052, - "learning_rate": 6.747289407839867e-06, - "loss": 0.7743, - "step": 4382 - }, - { - "epoch": 4.24, - "grad_norm": 1.38045334815979, - "learning_rate": 6.743119266055046e-06, - "loss": 0.838, - "step": 4383 - }, - { - "epoch": 4.24, - "grad_norm": 1.2203946113586426, - "learning_rate": 6.738949124270226e-06, - "loss": 0.6709, - "step": 4384 - }, - { - "epoch": 4.24, - "grad_norm": 1.570999026298523, - "learning_rate": 6.7347789824854054e-06, - "loss": 0.8299, - "step": 4385 - }, - { - "epoch": 4.24, - "grad_norm": 1.1222865581512451, - "learning_rate": 6.730608840700584e-06, - "loss": 0.4957, - "step": 4386 - }, - { - "epoch": 4.24, - "grad_norm": 1.0991058349609375, - "learning_rate": 6.726438698915763e-06, - "loss": 0.6796, - "step": 4387 - }, - { - "epoch": 4.25, - "grad_norm": 1.5178134441375732, - "learning_rate": 6.722268557130942e-06, - "loss": 0.6211, - "step": 4388 - }, - { - "epoch": 4.25, - "grad_norm": 1.367569923400879, - "learning_rate": 6.718098415346122e-06, - "loss": 0.7229, - "step": 4389 - }, - { - "epoch": 4.25, - "grad_norm": 1.4267053604125977, - "learning_rate": 6.713928273561301e-06, - "loss": 0.7702, - "step": 4390 - }, - { - "epoch": 4.25, - "grad_norm": 1.4790523052215576, - "learning_rate": 6.7097581317764806e-06, - "loss": 0.7489, - "step": 4391 - }, - { - "epoch": 4.25, - "grad_norm": 1.9144536256790161, - "learning_rate": 6.70558798999166e-06, - "loss": 0.6801, - "step": 4392 - }, - { - "epoch": 4.25, - "grad_norm": 1.2850652933120728, - "learning_rate": 6.70141784820684e-06, - "loss": 0.7403, - "step": 4393 - }, - { - "epoch": 4.25, - "grad_norm": 1.5940477848052979, - "learning_rate": 6.697247706422019e-06, - "loss": 0.6171, - "step": 4394 - }, - { - "epoch": 4.25, - "grad_norm": 1.6070002317428589, - "learning_rate": 6.693077564637198e-06, - "loss": 0.6765, - "step": 4395 - }, - { - "epoch": 4.25, - "grad_norm": 1.9825026988983154, - "learning_rate": 6.688907422852378e-06, - "loss": 0.6512, - "step": 4396 - }, - { - "epoch": 4.25, - "grad_norm": 1.339361548423767, - "learning_rate": 6.684737281067556e-06, - "loss": 0.7988, - "step": 4397 - }, - { - "epoch": 4.26, - "grad_norm": 1.8375968933105469, - "learning_rate": 6.680567139282736e-06, - "loss": 0.7436, - "step": 4398 - }, - { - "epoch": 4.26, - "grad_norm": 1.5017917156219482, - "learning_rate": 6.676396997497915e-06, - "loss": 0.7136, - "step": 4399 - }, - { - "epoch": 4.26, - "grad_norm": 1.284619927406311, - "learning_rate": 6.672226855713094e-06, - "loss": 0.6345, - "step": 4400 - }, - { - "epoch": 4.26, - "grad_norm": 1.629049301147461, - "learning_rate": 6.668056713928274e-06, - "loss": 0.6055, - "step": 4401 - }, - { - "epoch": 4.26, - "grad_norm": 2.0935773849487305, - "learning_rate": 6.663886572143453e-06, - "loss": 0.6613, - "step": 4402 - }, - { - "epoch": 4.26, - "grad_norm": 1.2761112451553345, - "learning_rate": 6.6597164303586325e-06, - "loss": 0.9318, - "step": 4403 - }, - { - "epoch": 4.26, - "grad_norm": 1.524101972579956, - "learning_rate": 6.655546288573812e-06, - "loss": 0.6462, - "step": 4404 - }, - { - "epoch": 4.26, - "grad_norm": 1.6486200094223022, - "learning_rate": 6.651376146788992e-06, - "loss": 0.7126, - "step": 4405 - }, - { - "epoch": 4.26, - "grad_norm": 1.5298010110855103, - "learning_rate": 6.647206005004171e-06, - "loss": 0.8588, - "step": 4406 - }, - { - "epoch": 4.26, - "grad_norm": 1.5817596912384033, - "learning_rate": 6.64303586321935e-06, - "loss": 0.7953, - "step": 4407 - }, - { - "epoch": 4.27, - "grad_norm": 1.8760782480239868, - "learning_rate": 6.6388657214345285e-06, - "loss": 0.5902, - "step": 4408 - }, - { - "epoch": 4.27, - "grad_norm": 1.343523383140564, - "learning_rate": 6.634695579649708e-06, - "loss": 0.5988, - "step": 4409 - }, - { - "epoch": 4.27, - "grad_norm": 1.6053376197814941, - "learning_rate": 6.630525437864888e-06, - "loss": 0.6569, - "step": 4410 - }, - { - "epoch": 4.27, - "grad_norm": 1.8453056812286377, - "learning_rate": 6.626355296080067e-06, - "loss": 0.7616, - "step": 4411 - }, - { - "epoch": 4.27, - "grad_norm": 1.554426908493042, - "learning_rate": 6.622185154295246e-06, - "loss": 0.7056, - "step": 4412 - }, - { - "epoch": 4.27, - "grad_norm": 1.5776360034942627, - "learning_rate": 6.618015012510425e-06, - "loss": 0.7216, - "step": 4413 - }, - { - "epoch": 4.27, - "grad_norm": 1.8021432161331177, - "learning_rate": 6.613844870725605e-06, - "loss": 0.5638, - "step": 4414 - }, - { - "epoch": 4.27, - "grad_norm": 1.8947653770446777, - "learning_rate": 6.6096747289407845e-06, - "loss": 0.631, - "step": 4415 - }, - { - "epoch": 4.27, - "grad_norm": 1.4564845561981201, - "learning_rate": 6.605504587155964e-06, - "loss": 0.7454, - "step": 4416 - }, - { - "epoch": 4.27, - "grad_norm": 1.5228424072265625, - "learning_rate": 6.601334445371144e-06, - "loss": 0.5376, - "step": 4417 - }, - { - "epoch": 4.27, - "grad_norm": 1.2071088552474976, - "learning_rate": 6.597164303586323e-06, - "loss": 0.6799, - "step": 4418 - }, - { - "epoch": 4.28, - "grad_norm": 1.364287257194519, - "learning_rate": 6.592994161801501e-06, - "loss": 0.6515, - "step": 4419 - }, - { - "epoch": 4.28, - "grad_norm": 1.24534010887146, - "learning_rate": 6.5888240200166804e-06, - "loss": 0.6459, - "step": 4420 - }, - { - "epoch": 4.28, - "grad_norm": 1.8855162858963013, - "learning_rate": 6.58465387823186e-06, - "loss": 0.6589, - "step": 4421 - }, - { - "epoch": 4.28, - "grad_norm": 1.6058472394943237, - "learning_rate": 6.58048373644704e-06, - "loss": 0.7044, - "step": 4422 - }, - { - "epoch": 4.28, - "grad_norm": 1.393979787826538, - "learning_rate": 6.576313594662219e-06, - "loss": 0.7708, - "step": 4423 - }, - { - "epoch": 4.28, - "grad_norm": 1.3308145999908447, - "learning_rate": 6.572143452877398e-06, - "loss": 0.7672, - "step": 4424 - }, - { - "epoch": 4.28, - "grad_norm": 1.3571292161941528, - "learning_rate": 6.567973311092577e-06, - "loss": 0.7612, - "step": 4425 - }, - { - "epoch": 4.28, - "grad_norm": 1.1570378541946411, - "learning_rate": 6.563803169307757e-06, - "loss": 0.7035, - "step": 4426 - }, - { - "epoch": 4.28, - "grad_norm": 1.7388896942138672, - "learning_rate": 6.5596330275229364e-06, - "loss": 0.9574, - "step": 4427 - }, - { - "epoch": 4.28, - "grad_norm": 1.3100439310073853, - "learning_rate": 6.555462885738116e-06, - "loss": 0.7397, - "step": 4428 - }, - { - "epoch": 4.29, - "grad_norm": 1.8462955951690674, - "learning_rate": 6.551292743953296e-06, - "loss": 0.6368, - "step": 4429 - }, - { - "epoch": 4.29, - "grad_norm": 1.5478241443634033, - "learning_rate": 6.547122602168473e-06, - "loss": 0.4956, - "step": 4430 - }, - { - "epoch": 4.29, - "grad_norm": 1.5611926317214966, - "learning_rate": 6.542952460383653e-06, - "loss": 0.5935, - "step": 4431 - }, - { - "epoch": 4.29, - "grad_norm": 1.588344931602478, - "learning_rate": 6.538782318598832e-06, - "loss": 0.7343, - "step": 4432 - }, - { - "epoch": 4.29, - "grad_norm": 1.6632417440414429, - "learning_rate": 6.5346121768140116e-06, - "loss": 0.888, - "step": 4433 - }, - { - "epoch": 4.29, - "grad_norm": 1.861594796180725, - "learning_rate": 6.530442035029191e-06, - "loss": 0.6662, - "step": 4434 - }, - { - "epoch": 4.29, - "grad_norm": 1.4945757389068604, - "learning_rate": 6.526271893244371e-06, - "loss": 0.7169, - "step": 4435 - }, - { - "epoch": 4.29, - "grad_norm": 1.4567769765853882, - "learning_rate": 6.52210175145955e-06, - "loss": 0.856, - "step": 4436 - }, - { - "epoch": 4.29, - "grad_norm": 1.3844677209854126, - "learning_rate": 6.517931609674729e-06, - "loss": 0.5879, - "step": 4437 - }, - { - "epoch": 4.29, - "grad_norm": 1.4528354406356812, - "learning_rate": 6.513761467889909e-06, - "loss": 0.6289, - "step": 4438 - }, - { - "epoch": 4.3, - "grad_norm": 1.5714360475540161, - "learning_rate": 6.509591326105088e-06, - "loss": 0.753, - "step": 4439 - }, - { - "epoch": 4.3, - "grad_norm": 1.2027802467346191, - "learning_rate": 6.505421184320268e-06, - "loss": 0.6496, - "step": 4440 - }, - { - "epoch": 4.3, - "grad_norm": 1.3950735330581665, - "learning_rate": 6.501251042535446e-06, - "loss": 0.6583, - "step": 4441 - }, - { - "epoch": 4.3, - "grad_norm": 1.5818397998809814, - "learning_rate": 6.497080900750625e-06, - "loss": 0.6201, - "step": 4442 - }, - { - "epoch": 4.3, - "grad_norm": 1.5390669107437134, - "learning_rate": 6.492910758965805e-06, - "loss": 0.6856, - "step": 4443 - }, - { - "epoch": 4.3, - "grad_norm": 1.4448585510253906, - "learning_rate": 6.488740617180984e-06, - "loss": 0.6811, - "step": 4444 - }, - { - "epoch": 4.3, - "grad_norm": 1.9134057760238647, - "learning_rate": 6.4845704753961635e-06, - "loss": 0.6728, - "step": 4445 - }, - { - "epoch": 4.3, - "grad_norm": 1.314792513847351, - "learning_rate": 6.480400333611343e-06, - "loss": 0.6442, - "step": 4446 - }, - { - "epoch": 4.3, - "grad_norm": 1.6837114095687866, - "learning_rate": 6.476230191826523e-06, - "loss": 0.6764, - "step": 4447 - }, - { - "epoch": 4.3, - "grad_norm": 1.4650620222091675, - "learning_rate": 6.472060050041702e-06, - "loss": 0.7084, - "step": 4448 - }, - { - "epoch": 4.3, - "grad_norm": 1.4765480756759644, - "learning_rate": 6.467889908256881e-06, - "loss": 0.7154, - "step": 4449 - }, - { - "epoch": 4.31, - "grad_norm": 1.4001424312591553, - "learning_rate": 6.463719766472061e-06, - "loss": 0.7793, - "step": 4450 - }, - { - "epoch": 4.31, - "grad_norm": 1.1573011875152588, - "learning_rate": 6.45954962468724e-06, - "loss": 0.7458, - "step": 4451 - }, - { - "epoch": 4.31, - "grad_norm": 1.374753475189209, - "learning_rate": 6.455379482902419e-06, - "loss": 0.6044, - "step": 4452 - }, - { - "epoch": 4.31, - "grad_norm": 1.8408690690994263, - "learning_rate": 6.451209341117598e-06, - "loss": 0.7081, - "step": 4453 - }, - { - "epoch": 4.31, - "grad_norm": 1.4238532781600952, - "learning_rate": 6.447039199332777e-06, - "loss": 0.6827, - "step": 4454 - }, - { - "epoch": 4.31, - "grad_norm": 1.4446330070495605, - "learning_rate": 6.442869057547956e-06, - "loss": 0.5796, - "step": 4455 - }, - { - "epoch": 4.31, - "grad_norm": 1.597171664237976, - "learning_rate": 6.438698915763136e-06, - "loss": 0.7198, - "step": 4456 - }, - { - "epoch": 4.31, - "grad_norm": 1.1514348983764648, - "learning_rate": 6.4345287739783155e-06, - "loss": 0.6486, - "step": 4457 - }, - { - "epoch": 4.31, - "grad_norm": 1.5550312995910645, - "learning_rate": 6.430358632193495e-06, - "loss": 0.6427, - "step": 4458 - }, - { - "epoch": 4.31, - "grad_norm": 1.2647738456726074, - "learning_rate": 6.426188490408675e-06, - "loss": 0.9576, - "step": 4459 - }, - { - "epoch": 4.32, - "grad_norm": 1.4543766975402832, - "learning_rate": 6.422018348623854e-06, - "loss": 0.755, - "step": 4460 - }, - { - "epoch": 4.32, - "grad_norm": 1.6130422353744507, - "learning_rate": 6.417848206839033e-06, - "loss": 0.911, - "step": 4461 - }, - { - "epoch": 4.32, - "grad_norm": 1.3784844875335693, - "learning_rate": 6.413678065054213e-06, - "loss": 0.8042, - "step": 4462 - }, - { - "epoch": 4.32, - "grad_norm": 1.572872281074524, - "learning_rate": 6.409507923269391e-06, - "loss": 0.6032, - "step": 4463 - }, - { - "epoch": 4.32, - "grad_norm": 1.4436098337173462, - "learning_rate": 6.405337781484571e-06, - "loss": 0.7356, - "step": 4464 - }, - { - "epoch": 4.32, - "grad_norm": 1.6711442470550537, - "learning_rate": 6.40116763969975e-06, - "loss": 0.7637, - "step": 4465 - }, - { - "epoch": 4.32, - "grad_norm": 1.4850529432296753, - "learning_rate": 6.396997497914929e-06, - "loss": 0.8872, - "step": 4466 - }, - { - "epoch": 4.32, - "grad_norm": 1.4410306215286255, - "learning_rate": 6.392827356130108e-06, - "loss": 0.7155, - "step": 4467 - }, - { - "epoch": 4.32, - "grad_norm": 1.6716467142105103, - "learning_rate": 6.388657214345288e-06, - "loss": 0.7096, - "step": 4468 - }, - { - "epoch": 4.32, - "grad_norm": 1.4089220762252808, - "learning_rate": 6.3844870725604674e-06, - "loss": 0.7065, - "step": 4469 - }, - { - "epoch": 4.33, - "grad_norm": 1.375309705734253, - "learning_rate": 6.380316930775647e-06, - "loss": 0.6433, - "step": 4470 - }, - { - "epoch": 4.33, - "grad_norm": 1.5627754926681519, - "learning_rate": 6.376146788990827e-06, - "loss": 0.9394, - "step": 4471 - }, - { - "epoch": 4.33, - "grad_norm": 1.535332202911377, - "learning_rate": 6.371976647206006e-06, - "loss": 0.9066, - "step": 4472 - }, - { - "epoch": 4.33, - "grad_norm": 1.3067853450775146, - "learning_rate": 6.367806505421185e-06, - "loss": 0.6955, - "step": 4473 - }, - { - "epoch": 4.33, - "grad_norm": 1.353558897972107, - "learning_rate": 6.363636363636363e-06, - "loss": 0.6122, - "step": 4474 - }, - { - "epoch": 4.33, - "grad_norm": 1.277823805809021, - "learning_rate": 6.3594662218515426e-06, - "loss": 0.708, - "step": 4475 - }, - { - "epoch": 4.33, - "grad_norm": 1.5617762804031372, - "learning_rate": 6.355296080066722e-06, - "loss": 0.8191, - "step": 4476 - }, - { - "epoch": 4.33, - "grad_norm": 1.3509758710861206, - "learning_rate": 6.351125938281902e-06, - "loss": 0.7159, - "step": 4477 - }, - { - "epoch": 4.33, - "grad_norm": 1.5985627174377441, - "learning_rate": 6.346955796497081e-06, - "loss": 0.6212, - "step": 4478 - }, - { - "epoch": 4.33, - "grad_norm": 1.2670369148254395, - "learning_rate": 6.34278565471226e-06, - "loss": 0.5706, - "step": 4479 - }, - { - "epoch": 4.33, - "grad_norm": 1.6988168954849243, - "learning_rate": 6.33861551292744e-06, - "loss": 0.7364, - "step": 4480 - }, - { - "epoch": 4.34, - "grad_norm": 1.737405776977539, - "learning_rate": 6.334445371142619e-06, - "loss": 0.7654, - "step": 4481 - }, - { - "epoch": 4.34, - "grad_norm": 1.70065438747406, - "learning_rate": 6.330275229357799e-06, - "loss": 0.8737, - "step": 4482 - }, - { - "epoch": 4.34, - "grad_norm": 1.3618711233139038, - "learning_rate": 6.326105087572979e-06, - "loss": 0.609, - "step": 4483 - }, - { - "epoch": 4.34, - "grad_norm": 1.207494854927063, - "learning_rate": 6.321934945788158e-06, - "loss": 0.6795, - "step": 4484 - }, - { - "epoch": 4.34, - "grad_norm": 1.6335123777389526, - "learning_rate": 6.317764804003336e-06, - "loss": 0.5714, - "step": 4485 - }, - { - "epoch": 4.34, - "grad_norm": 1.358859658241272, - "learning_rate": 6.313594662218515e-06, - "loss": 0.6437, - "step": 4486 - }, - { - "epoch": 4.34, - "grad_norm": 1.5137168169021606, - "learning_rate": 6.3094245204336945e-06, - "loss": 0.6404, - "step": 4487 - }, - { - "epoch": 4.34, - "grad_norm": 1.8898518085479736, - "learning_rate": 6.305254378648874e-06, - "loss": 0.6857, - "step": 4488 - }, - { - "epoch": 4.34, - "grad_norm": 1.2113492488861084, - "learning_rate": 6.301084236864054e-06, - "loss": 0.6162, - "step": 4489 - }, - { - "epoch": 4.34, - "grad_norm": 1.317917823791504, - "learning_rate": 6.296914095079233e-06, - "loss": 0.6325, - "step": 4490 - }, - { - "epoch": 4.35, - "grad_norm": 1.5444402694702148, - "learning_rate": 6.292743953294412e-06, - "loss": 0.6928, - "step": 4491 - }, - { - "epoch": 4.35, - "grad_norm": 1.3579732179641724, - "learning_rate": 6.288573811509592e-06, - "loss": 0.6092, - "step": 4492 - }, - { - "epoch": 4.35, - "grad_norm": 1.6278555393218994, - "learning_rate": 6.284403669724771e-06, - "loss": 0.79, - "step": 4493 - }, - { - "epoch": 4.35, - "grad_norm": 1.4639973640441895, - "learning_rate": 6.2802335279399505e-06, - "loss": 0.8683, - "step": 4494 - }, - { - "epoch": 4.35, - "grad_norm": 1.2048999071121216, - "learning_rate": 6.276063386155131e-06, - "loss": 0.5256, - "step": 4495 - }, - { - "epoch": 4.35, - "grad_norm": 1.336359977722168, - "learning_rate": 6.271893244370308e-06, - "loss": 0.5147, - "step": 4496 - }, - { - "epoch": 4.35, - "grad_norm": 1.4179655313491821, - "learning_rate": 6.267723102585487e-06, - "loss": 0.7034, - "step": 4497 - }, - { - "epoch": 4.35, - "grad_norm": 1.2926969528198242, - "learning_rate": 6.263552960800667e-06, - "loss": 0.6835, - "step": 4498 - }, - { - "epoch": 4.35, - "grad_norm": 1.225582480430603, - "learning_rate": 6.2593828190158465e-06, - "loss": 0.7043, - "step": 4499 - }, - { - "epoch": 4.35, - "grad_norm": 1.4773812294006348, - "learning_rate": 6.255212677231026e-06, - "loss": 0.9061, - "step": 4500 - }, - { - "epoch": 4.35, - "eval_loss": 0.8581743240356445, - "eval_runtime": 856.5442, - "eval_samples_per_second": 4.826, - "eval_steps_per_second": 0.604, - "step": 4500 - }, - { - "epoch": 4.36, - "grad_norm": 1.2046254873275757, - "learning_rate": 6.251042535446206e-06, - "loss": 0.6437, - "step": 4501 - }, - { - "epoch": 4.36, - "grad_norm": 1.9686743021011353, - "learning_rate": 6.246872393661385e-06, - "loss": 0.7816, - "step": 4502 - }, - { - "epoch": 4.36, - "grad_norm": 1.4191972017288208, - "learning_rate": 6.242702251876564e-06, - "loss": 0.6423, - "step": 4503 - }, - { - "epoch": 4.36, - "grad_norm": 1.3162946701049805, - "learning_rate": 6.238532110091744e-06, - "loss": 0.7523, - "step": 4504 - }, - { - "epoch": 4.36, - "grad_norm": 1.2248839139938354, - "learning_rate": 6.2343619683069225e-06, - "loss": 0.6698, - "step": 4505 - }, - { - "epoch": 4.36, - "grad_norm": 1.5480225086212158, - "learning_rate": 6.230191826522102e-06, - "loss": 0.7132, - "step": 4506 - }, - { - "epoch": 4.36, - "grad_norm": 1.264034390449524, - "learning_rate": 6.226021684737282e-06, - "loss": 0.5244, - "step": 4507 - }, - { - "epoch": 4.36, - "grad_norm": 1.3124220371246338, - "learning_rate": 6.221851542952461e-06, - "loss": 0.846, - "step": 4508 - }, - { - "epoch": 4.36, - "grad_norm": 1.2050153017044067, - "learning_rate": 6.21768140116764e-06, - "loss": 0.7297, - "step": 4509 - }, - { - "epoch": 4.36, - "grad_norm": 1.1116827726364136, - "learning_rate": 6.213511259382819e-06, - "loss": 0.5178, - "step": 4510 - }, - { - "epoch": 4.36, - "grad_norm": 1.7788770198822021, - "learning_rate": 6.2093411175979984e-06, - "loss": 0.6892, - "step": 4511 - }, - { - "epoch": 4.37, - "grad_norm": 1.5678691864013672, - "learning_rate": 6.205170975813178e-06, - "loss": 0.7617, - "step": 4512 - }, - { - "epoch": 4.37, - "grad_norm": 1.6809147596359253, - "learning_rate": 6.201000834028358e-06, - "loss": 0.6395, - "step": 4513 - }, - { - "epoch": 4.37, - "grad_norm": 1.4206522703170776, - "learning_rate": 6.196830692243537e-06, - "loss": 0.7164, - "step": 4514 - }, - { - "epoch": 4.37, - "grad_norm": 1.2952232360839844, - "learning_rate": 6.192660550458716e-06, - "loss": 0.6791, - "step": 4515 - }, - { - "epoch": 4.37, - "grad_norm": 1.8799912929534912, - "learning_rate": 6.188490408673895e-06, - "loss": 0.6621, - "step": 4516 - }, - { - "epoch": 4.37, - "grad_norm": 1.5345474481582642, - "learning_rate": 6.1843202668890744e-06, - "loss": 0.7264, - "step": 4517 - }, - { - "epoch": 4.37, - "grad_norm": 1.6303728818893433, - "learning_rate": 6.180150125104254e-06, - "loss": 0.5539, - "step": 4518 - }, - { - "epoch": 4.37, - "grad_norm": 1.459893822669983, - "learning_rate": 6.175979983319434e-06, - "loss": 0.7534, - "step": 4519 - }, - { - "epoch": 4.37, - "grad_norm": 1.3992021083831787, - "learning_rate": 6.171809841534613e-06, - "loss": 0.5496, - "step": 4520 - }, - { - "epoch": 4.37, - "grad_norm": 1.4364956617355347, - "learning_rate": 6.167639699749791e-06, - "loss": 0.7307, - "step": 4521 - }, - { - "epoch": 4.38, - "grad_norm": 1.596460223197937, - "learning_rate": 6.163469557964971e-06, - "loss": 0.6842, - "step": 4522 - }, - { - "epoch": 4.38, - "grad_norm": 1.453177809715271, - "learning_rate": 6.15929941618015e-06, - "loss": 0.6237, - "step": 4523 - }, - { - "epoch": 4.38, - "grad_norm": 1.6033985614776611, - "learning_rate": 6.15512927439533e-06, - "loss": 0.7595, - "step": 4524 - }, - { - "epoch": 4.38, - "grad_norm": 1.551035761833191, - "learning_rate": 6.15095913261051e-06, - "loss": 0.6417, - "step": 4525 - }, - { - "epoch": 4.38, - "grad_norm": 1.6841317415237427, - "learning_rate": 6.146788990825689e-06, - "loss": 0.7573, - "step": 4526 - }, - { - "epoch": 4.38, - "grad_norm": 1.8649938106536865, - "learning_rate": 6.142618849040867e-06, - "loss": 0.5851, - "step": 4527 - }, - { - "epoch": 4.38, - "grad_norm": 1.2951748371124268, - "learning_rate": 6.138448707256047e-06, - "loss": 0.6367, - "step": 4528 - }, - { - "epoch": 4.38, - "grad_norm": 1.4893923997879028, - "learning_rate": 6.134278565471226e-06, - "loss": 0.7389, - "step": 4529 - }, - { - "epoch": 4.38, - "grad_norm": 1.517099380493164, - "learning_rate": 6.1301084236864056e-06, - "loss": 0.6981, - "step": 4530 - }, - { - "epoch": 4.38, - "grad_norm": 0.9570285677909851, - "learning_rate": 6.125938281901586e-06, - "loss": 0.5932, - "step": 4531 - }, - { - "epoch": 4.39, - "grad_norm": 1.7340022325515747, - "learning_rate": 6.121768140116764e-06, - "loss": 0.6693, - "step": 4532 - }, - { - "epoch": 4.39, - "grad_norm": 1.3229211568832397, - "learning_rate": 6.117597998331943e-06, - "loss": 0.6984, - "step": 4533 - }, - { - "epoch": 4.39, - "grad_norm": 1.7112501859664917, - "learning_rate": 6.113427856547123e-06, - "loss": 0.7863, - "step": 4534 - }, - { - "epoch": 4.39, - "grad_norm": 1.1070160865783691, - "learning_rate": 6.109257714762302e-06, - "loss": 0.6884, - "step": 4535 - }, - { - "epoch": 4.39, - "grad_norm": 1.218097448348999, - "learning_rate": 6.1050875729774816e-06, - "loss": 0.8119, - "step": 4536 - }, - { - "epoch": 4.39, - "grad_norm": 1.3918700218200684, - "learning_rate": 6.100917431192662e-06, - "loss": 0.7109, - "step": 4537 - }, - { - "epoch": 4.39, - "grad_norm": 1.3033515214920044, - "learning_rate": 6.09674728940784e-06, - "loss": 0.7304, - "step": 4538 - }, - { - "epoch": 4.39, - "grad_norm": 1.5123662948608398, - "learning_rate": 6.092577147623019e-06, - "loss": 0.8365, - "step": 4539 - }, - { - "epoch": 4.39, - "grad_norm": 1.6895707845687866, - "learning_rate": 6.088407005838199e-06, - "loss": 0.5216, - "step": 4540 - }, - { - "epoch": 4.39, - "grad_norm": 1.752480149269104, - "learning_rate": 6.084236864053378e-06, - "loss": 0.7785, - "step": 4541 - }, - { - "epoch": 4.39, - "grad_norm": 0.9798446893692017, - "learning_rate": 6.0800667222685575e-06, - "loss": 0.71, - "step": 4542 - }, - { - "epoch": 4.4, - "grad_norm": 1.2920374870300293, - "learning_rate": 6.075896580483737e-06, - "loss": 0.9092, - "step": 4543 - }, - { - "epoch": 4.4, - "grad_norm": 1.741518259048462, - "learning_rate": 6.071726438698916e-06, - "loss": 0.6685, - "step": 4544 - }, - { - "epoch": 4.4, - "grad_norm": 1.4512648582458496, - "learning_rate": 6.067556296914095e-06, - "loss": 0.6378, - "step": 4545 - }, - { - "epoch": 4.4, - "grad_norm": 1.3239641189575195, - "learning_rate": 6.063386155129275e-06, - "loss": 0.7009, - "step": 4546 - }, - { - "epoch": 4.4, - "grad_norm": 1.6167960166931152, - "learning_rate": 6.059216013344454e-06, - "loss": 0.6332, - "step": 4547 - }, - { - "epoch": 4.4, - "grad_norm": 1.438257098197937, - "learning_rate": 6.0550458715596335e-06, - "loss": 0.6658, - "step": 4548 - }, - { - "epoch": 4.4, - "grad_norm": 1.4437922239303589, - "learning_rate": 6.050875729774813e-06, - "loss": 0.6923, - "step": 4549 - }, - { - "epoch": 4.4, - "grad_norm": 1.660217523574829, - "learning_rate": 6.046705587989992e-06, - "loss": 0.641, - "step": 4550 - }, - { - "epoch": 4.4, - "grad_norm": 1.4629638195037842, - "learning_rate": 6.042535446205171e-06, - "loss": 0.7885, - "step": 4551 - }, - { - "epoch": 4.4, - "grad_norm": 1.5988290309906006, - "learning_rate": 6.038365304420351e-06, - "loss": 0.7277, - "step": 4552 - }, - { - "epoch": 4.41, - "grad_norm": 1.421746850013733, - "learning_rate": 6.03419516263553e-06, - "loss": 0.5789, - "step": 4553 - }, - { - "epoch": 4.41, - "grad_norm": 1.5449638366699219, - "learning_rate": 6.030025020850709e-06, - "loss": 0.5776, - "step": 4554 - }, - { - "epoch": 4.41, - "grad_norm": 1.4405105113983154, - "learning_rate": 6.025854879065889e-06, - "loss": 0.6907, - "step": 4555 - }, - { - "epoch": 4.41, - "grad_norm": 1.2792209386825562, - "learning_rate": 6.021684737281068e-06, - "loss": 0.5526, - "step": 4556 - }, - { - "epoch": 4.41, - "grad_norm": 1.272682547569275, - "learning_rate": 6.017514595496247e-06, - "loss": 0.6382, - "step": 4557 - }, - { - "epoch": 4.41, - "grad_norm": 1.6121176481246948, - "learning_rate": 6.013344453711427e-06, - "loss": 0.608, - "step": 4558 - }, - { - "epoch": 4.41, - "grad_norm": 1.5273442268371582, - "learning_rate": 6.009174311926606e-06, - "loss": 0.7358, - "step": 4559 - }, - { - "epoch": 4.41, - "grad_norm": 1.7222824096679688, - "learning_rate": 6.005004170141785e-06, - "loss": 0.7476, - "step": 4560 - }, - { - "epoch": 4.41, - "grad_norm": 1.130391001701355, - "learning_rate": 6.000834028356965e-06, - "loss": 0.5936, - "step": 4561 - }, - { - "epoch": 4.41, - "grad_norm": 1.3474061489105225, - "learning_rate": 5.996663886572144e-06, - "loss": 0.5635, - "step": 4562 - }, - { - "epoch": 4.42, - "grad_norm": 1.3879218101501465, - "learning_rate": 5.992493744787323e-06, - "loss": 0.7199, - "step": 4563 - }, - { - "epoch": 4.42, - "grad_norm": 1.6851611137390137, - "learning_rate": 5.988323603002502e-06, - "loss": 0.6714, - "step": 4564 - }, - { - "epoch": 4.42, - "grad_norm": 1.5315393209457397, - "learning_rate": 5.984153461217681e-06, - "loss": 0.5827, - "step": 4565 - }, - { - "epoch": 4.42, - "grad_norm": 1.298017978668213, - "learning_rate": 5.979983319432861e-06, - "loss": 0.8551, - "step": 4566 - }, - { - "epoch": 4.42, - "grad_norm": 1.3128635883331299, - "learning_rate": 5.975813177648041e-06, - "loss": 0.682, - "step": 4567 - }, - { - "epoch": 4.42, - "grad_norm": 1.4132344722747803, - "learning_rate": 5.97164303586322e-06, - "loss": 0.6471, - "step": 4568 - }, - { - "epoch": 4.42, - "grad_norm": 1.35657799243927, - "learning_rate": 5.967472894078399e-06, - "loss": 0.4753, - "step": 4569 - }, - { - "epoch": 4.42, - "grad_norm": 1.211188554763794, - "learning_rate": 5.963302752293578e-06, - "loss": 0.7416, - "step": 4570 - }, - { - "epoch": 4.42, - "grad_norm": 1.1546083688735962, - "learning_rate": 5.959132610508757e-06, - "loss": 0.7779, - "step": 4571 - }, - { - "epoch": 4.42, - "grad_norm": 1.3571324348449707, - "learning_rate": 5.9549624687239366e-06, - "loss": 0.6944, - "step": 4572 - }, - { - "epoch": 4.42, - "grad_norm": 1.4888793230056763, - "learning_rate": 5.950792326939117e-06, - "loss": 0.6427, - "step": 4573 - }, - { - "epoch": 4.43, - "grad_norm": 1.563929796218872, - "learning_rate": 5.946622185154296e-06, - "loss": 0.7502, - "step": 4574 - }, - { - "epoch": 4.43, - "grad_norm": 1.6636128425598145, - "learning_rate": 5.942452043369475e-06, - "loss": 0.6738, - "step": 4575 - }, - { - "epoch": 4.43, - "grad_norm": 1.431536078453064, - "learning_rate": 5.938281901584654e-06, - "loss": 0.751, - "step": 4576 - }, - { - "epoch": 4.43, - "grad_norm": 1.067686676979065, - "learning_rate": 5.934111759799833e-06, - "loss": 0.5663, - "step": 4577 - }, - { - "epoch": 4.43, - "grad_norm": 1.6124565601348877, - "learning_rate": 5.9299416180150126e-06, - "loss": 0.6439, - "step": 4578 - }, - { - "epoch": 4.43, - "grad_norm": 1.589220404624939, - "learning_rate": 5.925771476230193e-06, - "loss": 0.7915, - "step": 4579 - }, - { - "epoch": 4.43, - "grad_norm": 1.9252246618270874, - "learning_rate": 5.921601334445372e-06, - "loss": 0.8538, - "step": 4580 - }, - { - "epoch": 4.43, - "grad_norm": 1.3919744491577148, - "learning_rate": 5.917431192660551e-06, - "loss": 0.5826, - "step": 4581 - }, - { - "epoch": 4.43, - "grad_norm": 1.2920315265655518, - "learning_rate": 5.91326105087573e-06, - "loss": 0.6896, - "step": 4582 - }, - { - "epoch": 4.43, - "grad_norm": 1.8945282697677612, - "learning_rate": 5.909090909090909e-06, - "loss": 0.6634, - "step": 4583 - }, - { - "epoch": 4.44, - "grad_norm": 1.4104385375976562, - "learning_rate": 5.9049207673060885e-06, - "loss": 0.6774, - "step": 4584 - }, - { - "epoch": 4.44, - "grad_norm": 0.9925658106803894, - "learning_rate": 5.9007506255212686e-06, - "loss": 0.7068, - "step": 4585 - }, - { - "epoch": 4.44, - "grad_norm": 1.6425079107284546, - "learning_rate": 5.896580483736448e-06, - "loss": 0.6906, - "step": 4586 - }, - { - "epoch": 4.44, - "grad_norm": 1.734883427619934, - "learning_rate": 5.892410341951626e-06, - "loss": 0.6661, - "step": 4587 - }, - { - "epoch": 4.44, - "grad_norm": 1.314497709274292, - "learning_rate": 5.888240200166806e-06, - "loss": 0.6166, - "step": 4588 - }, - { - "epoch": 4.44, - "grad_norm": 1.7467992305755615, - "learning_rate": 5.884070058381985e-06, - "loss": 0.6317, - "step": 4589 - }, - { - "epoch": 4.44, - "grad_norm": 1.6375136375427246, - "learning_rate": 5.8798999165971645e-06, - "loss": 0.8575, - "step": 4590 - }, - { - "epoch": 4.44, - "grad_norm": 1.6072413921356201, - "learning_rate": 5.875729774812344e-06, - "loss": 0.6524, - "step": 4591 - }, - { - "epoch": 4.44, - "grad_norm": 1.4089336395263672, - "learning_rate": 5.871559633027524e-06, - "loss": 0.7012, - "step": 4592 - }, - { - "epoch": 4.44, - "grad_norm": 1.2882603406906128, - "learning_rate": 5.867389491242702e-06, - "loss": 0.6848, - "step": 4593 - }, - { - "epoch": 4.45, - "grad_norm": 1.2387679815292358, - "learning_rate": 5.863219349457882e-06, - "loss": 0.8055, - "step": 4594 - }, - { - "epoch": 4.45, - "grad_norm": 1.8764599561691284, - "learning_rate": 5.859049207673061e-06, - "loss": 0.578, - "step": 4595 - }, - { - "epoch": 4.45, - "grad_norm": 1.4414983987808228, - "learning_rate": 5.8548790658882405e-06, - "loss": 0.6162, - "step": 4596 - }, - { - "epoch": 4.45, - "grad_norm": 1.491419792175293, - "learning_rate": 5.85070892410342e-06, - "loss": 0.5989, - "step": 4597 - }, - { - "epoch": 4.45, - "grad_norm": 2.5183537006378174, - "learning_rate": 5.846538782318599e-06, - "loss": 0.7089, - "step": 4598 - }, - { - "epoch": 4.45, - "grad_norm": 1.0582003593444824, - "learning_rate": 5.842368640533778e-06, - "loss": 0.6531, - "step": 4599 - }, - { - "epoch": 4.45, - "grad_norm": 1.6829962730407715, - "learning_rate": 5.838198498748958e-06, - "loss": 0.7932, - "step": 4600 - }, - { - "epoch": 4.45, - "grad_norm": 1.9470819234848022, - "learning_rate": 5.834028356964137e-06, - "loss": 0.6358, - "step": 4601 - }, - { - "epoch": 4.45, - "grad_norm": 1.2453104257583618, - "learning_rate": 5.8298582151793165e-06, - "loss": 0.6782, - "step": 4602 - }, - { - "epoch": 4.45, - "grad_norm": 2.341174840927124, - "learning_rate": 5.825688073394496e-06, - "loss": 0.7806, - "step": 4603 - }, - { - "epoch": 4.45, - "grad_norm": 1.486384391784668, - "learning_rate": 5.821517931609675e-06, - "loss": 0.741, - "step": 4604 - }, - { - "epoch": 4.46, - "grad_norm": 1.264007329940796, - "learning_rate": 5.817347789824854e-06, - "loss": 0.6896, - "step": 4605 - }, - { - "epoch": 4.46, - "grad_norm": 1.740382432937622, - "learning_rate": 5.813177648040034e-06, - "loss": 0.6706, - "step": 4606 - }, - { - "epoch": 4.46, - "grad_norm": 1.2103074789047241, - "learning_rate": 5.809007506255213e-06, - "loss": 0.6736, - "step": 4607 - }, - { - "epoch": 4.46, - "grad_norm": 1.4383554458618164, - "learning_rate": 5.8048373644703924e-06, - "loss": 0.7079, - "step": 4608 - }, - { - "epoch": 4.46, - "grad_norm": 1.181028962135315, - "learning_rate": 5.800667222685572e-06, - "loss": 0.6232, - "step": 4609 - }, - { - "epoch": 4.46, - "grad_norm": 1.425830602645874, - "learning_rate": 5.796497080900751e-06, - "loss": 0.7142, - "step": 4610 - }, - { - "epoch": 4.46, - "grad_norm": 1.0202964544296265, - "learning_rate": 5.79232693911593e-06, - "loss": 0.6243, - "step": 4611 - }, - { - "epoch": 4.46, - "grad_norm": 1.389682650566101, - "learning_rate": 5.788156797331109e-06, - "loss": 0.7684, - "step": 4612 - }, - { - "epoch": 4.46, - "grad_norm": 2.0149354934692383, - "learning_rate": 5.783986655546289e-06, - "loss": 0.647, - "step": 4613 - }, - { - "epoch": 4.46, - "grad_norm": 1.9424692392349243, - "learning_rate": 5.7798165137614684e-06, - "loss": 0.6478, - "step": 4614 - }, - { - "epoch": 4.47, - "grad_norm": 1.4337615966796875, - "learning_rate": 5.775646371976648e-06, - "loss": 0.6507, - "step": 4615 - }, - { - "epoch": 4.47, - "grad_norm": 1.2378499507904053, - "learning_rate": 5.771476230191827e-06, - "loss": 0.6993, - "step": 4616 - }, - { - "epoch": 4.47, - "grad_norm": 1.8119548559188843, - "learning_rate": 5.767306088407006e-06, - "loss": 0.6861, - "step": 4617 - }, - { - "epoch": 4.47, - "grad_norm": 1.6021716594696045, - "learning_rate": 5.763135946622185e-06, - "loss": 0.6991, - "step": 4618 - }, - { - "epoch": 4.47, - "grad_norm": 1.6414045095443726, - "learning_rate": 5.758965804837365e-06, - "loss": 0.6302, - "step": 4619 - }, - { - "epoch": 4.47, - "grad_norm": 1.7944785356521606, - "learning_rate": 5.7547956630525436e-06, - "loss": 0.7703, - "step": 4620 - }, - { - "epoch": 4.47, - "grad_norm": 1.0491467714309692, - "learning_rate": 5.750625521267724e-06, - "loss": 0.6449, - "step": 4621 - }, - { - "epoch": 4.47, - "grad_norm": 1.6654112339019775, - "learning_rate": 5.746455379482903e-06, - "loss": 0.6433, - "step": 4622 - }, - { - "epoch": 4.47, - "grad_norm": 1.2329283952713013, - "learning_rate": 5.742285237698082e-06, - "loss": 0.6477, - "step": 4623 - }, - { - "epoch": 4.47, - "grad_norm": 1.6013000011444092, - "learning_rate": 5.738115095913261e-06, - "loss": 0.6302, - "step": 4624 - }, - { - "epoch": 4.48, - "grad_norm": 1.8500837087631226, - "learning_rate": 5.733944954128441e-06, - "loss": 0.7395, - "step": 4625 - }, - { - "epoch": 4.48, - "grad_norm": 1.3515931367874146, - "learning_rate": 5.7297748123436195e-06, - "loss": 0.6717, - "step": 4626 - }, - { - "epoch": 4.48, - "grad_norm": 1.6684017181396484, - "learning_rate": 5.7256046705587996e-06, - "loss": 0.6337, - "step": 4627 - }, - { - "epoch": 4.48, - "grad_norm": 1.1044994592666626, - "learning_rate": 5.721434528773979e-06, - "loss": 0.6495, - "step": 4628 - }, - { - "epoch": 4.48, - "grad_norm": 1.3723560571670532, - "learning_rate": 5.717264386989158e-06, - "loss": 0.7101, - "step": 4629 - }, - { - "epoch": 4.48, - "grad_norm": 1.2639974355697632, - "learning_rate": 5.713094245204337e-06, - "loss": 0.7862, - "step": 4630 - }, - { - "epoch": 4.48, - "grad_norm": 1.0320762395858765, - "learning_rate": 5.708924103419516e-06, - "loss": 0.594, - "step": 4631 - }, - { - "epoch": 4.48, - "grad_norm": 1.7549233436584473, - "learning_rate": 5.7047539616346955e-06, - "loss": 0.6192, - "step": 4632 - }, - { - "epoch": 4.48, - "grad_norm": 1.311571717262268, - "learning_rate": 5.700583819849875e-06, - "loss": 0.6548, - "step": 4633 - }, - { - "epoch": 4.48, - "grad_norm": 1.782296895980835, - "learning_rate": 5.696413678065055e-06, - "loss": 0.6356, - "step": 4634 - }, - { - "epoch": 4.48, - "grad_norm": 1.3225741386413574, - "learning_rate": 5.692243536280234e-06, - "loss": 0.8134, - "step": 4635 - }, - { - "epoch": 4.49, - "grad_norm": 1.4524954557418823, - "learning_rate": 5.688073394495413e-06, - "loss": 0.7109, - "step": 4636 - }, - { - "epoch": 4.49, - "grad_norm": 1.2796392440795898, - "learning_rate": 5.683903252710592e-06, - "loss": 0.7489, - "step": 4637 - }, - { - "epoch": 4.49, - "grad_norm": 1.9327563047409058, - "learning_rate": 5.6797331109257715e-06, - "loss": 0.6422, - "step": 4638 - }, - { - "epoch": 4.49, - "grad_norm": 2.186600923538208, - "learning_rate": 5.675562969140951e-06, - "loss": 0.5712, - "step": 4639 - }, - { - "epoch": 4.49, - "grad_norm": 1.4998987913131714, - "learning_rate": 5.671392827356131e-06, - "loss": 0.6776, - "step": 4640 - }, - { - "epoch": 4.49, - "grad_norm": 1.1831672191619873, - "learning_rate": 5.66722268557131e-06, - "loss": 0.6532, - "step": 4641 - }, - { - "epoch": 4.49, - "grad_norm": 1.3108035326004028, - "learning_rate": 5.663052543786489e-06, - "loss": 0.6943, - "step": 4642 - }, - { - "epoch": 4.49, - "grad_norm": 1.7191115617752075, - "learning_rate": 5.658882402001668e-06, - "loss": 0.7349, - "step": 4643 - }, - { - "epoch": 4.49, - "grad_norm": 1.2873550653457642, - "learning_rate": 5.6547122602168475e-06, - "loss": 0.8601, - "step": 4644 - }, - { - "epoch": 4.49, - "grad_norm": 1.3496503829956055, - "learning_rate": 5.650542118432027e-06, - "loss": 0.6711, - "step": 4645 - }, - { - "epoch": 4.5, - "grad_norm": 1.552755355834961, - "learning_rate": 5.646371976647207e-06, - "loss": 0.6595, - "step": 4646 - }, - { - "epoch": 4.5, - "grad_norm": 1.2598237991333008, - "learning_rate": 5.642201834862386e-06, - "loss": 0.8592, - "step": 4647 - }, - { - "epoch": 4.5, - "grad_norm": 1.4796096086502075, - "learning_rate": 5.638031693077565e-06, - "loss": 0.8437, - "step": 4648 - }, - { - "epoch": 4.5, - "grad_norm": 1.364597201347351, - "learning_rate": 5.633861551292744e-06, - "loss": 0.5901, - "step": 4649 - }, - { - "epoch": 4.5, - "grad_norm": 1.2805659770965576, - "learning_rate": 5.6296914095079234e-06, - "loss": 0.7921, - "step": 4650 - }, - { - "epoch": 4.5, - "grad_norm": 1.4497544765472412, - "learning_rate": 5.625521267723103e-06, - "loss": 0.6249, - "step": 4651 - }, - { - "epoch": 4.5, - "grad_norm": 1.7044733762741089, - "learning_rate": 5.621351125938283e-06, - "loss": 0.7502, - "step": 4652 - }, - { - "epoch": 4.5, - "grad_norm": 1.466091275215149, - "learning_rate": 5.617180984153461e-06, - "loss": 0.6731, - "step": 4653 - }, - { - "epoch": 4.5, - "grad_norm": 1.43474280834198, - "learning_rate": 5.61301084236864e-06, - "loss": 0.6992, - "step": 4654 - }, - { - "epoch": 4.5, - "grad_norm": 1.197882056236267, - "learning_rate": 5.60884070058382e-06, - "loss": 0.6655, - "step": 4655 - }, - { - "epoch": 4.51, - "grad_norm": 1.6035302877426147, - "learning_rate": 5.6046705587989994e-06, - "loss": 0.7569, - "step": 4656 - }, - { - "epoch": 4.51, - "grad_norm": 1.7309684753417969, - "learning_rate": 5.600500417014179e-06, - "loss": 0.7687, - "step": 4657 - }, - { - "epoch": 4.51, - "grad_norm": 1.495622158050537, - "learning_rate": 5.596330275229359e-06, - "loss": 0.6196, - "step": 4658 - }, - { - "epoch": 4.51, - "grad_norm": 1.9390276670455933, - "learning_rate": 5.592160133444537e-06, - "loss": 0.7622, - "step": 4659 - }, - { - "epoch": 4.51, - "grad_norm": 1.416953682899475, - "learning_rate": 5.587989991659716e-06, - "loss": 0.5089, - "step": 4660 - }, - { - "epoch": 4.51, - "grad_norm": 1.546241283416748, - "learning_rate": 5.583819849874896e-06, - "loss": 0.6598, - "step": 4661 - }, - { - "epoch": 4.51, - "grad_norm": 1.6533225774765015, - "learning_rate": 5.579649708090075e-06, - "loss": 0.9055, - "step": 4662 - }, - { - "epoch": 4.51, - "grad_norm": 1.3580639362335205, - "learning_rate": 5.575479566305255e-06, - "loss": 0.8042, - "step": 4663 - }, - { - "epoch": 4.51, - "grad_norm": 1.56376314163208, - "learning_rate": 5.571309424520434e-06, - "loss": 0.8632, - "step": 4664 - }, - { - "epoch": 4.51, - "grad_norm": 1.5229129791259766, - "learning_rate": 5.567139282735613e-06, - "loss": 0.6711, - "step": 4665 - }, - { - "epoch": 4.51, - "grad_norm": 1.6064221858978271, - "learning_rate": 5.562969140950792e-06, - "loss": 0.8422, - "step": 4666 - }, - { - "epoch": 4.52, - "grad_norm": 1.3323804140090942, - "learning_rate": 5.558798999165972e-06, - "loss": 0.6602, - "step": 4667 - }, - { - "epoch": 4.52, - "grad_norm": 1.4172663688659668, - "learning_rate": 5.554628857381151e-06, - "loss": 0.6033, - "step": 4668 - }, - { - "epoch": 4.52, - "grad_norm": 1.5368646383285522, - "learning_rate": 5.5504587155963306e-06, - "loss": 0.6856, - "step": 4669 - }, - { - "epoch": 4.52, - "grad_norm": 1.359999656677246, - "learning_rate": 5.54628857381151e-06, - "loss": 0.6784, - "step": 4670 - }, - { - "epoch": 4.52, - "grad_norm": 1.7071532011032104, - "learning_rate": 5.542118432026689e-06, - "loss": 0.748, - "step": 4671 - }, - { - "epoch": 4.52, - "grad_norm": 1.4840952157974243, - "learning_rate": 5.537948290241868e-06, - "loss": 0.767, - "step": 4672 - }, - { - "epoch": 4.52, - "grad_norm": 1.385696291923523, - "learning_rate": 5.533778148457048e-06, - "loss": 0.6435, - "step": 4673 - }, - { - "epoch": 4.52, - "grad_norm": 1.4366190433502197, - "learning_rate": 5.529608006672227e-06, - "loss": 0.5813, - "step": 4674 - }, - { - "epoch": 4.52, - "grad_norm": 1.3593374490737915, - "learning_rate": 5.525437864887406e-06, - "loss": 0.6451, - "step": 4675 - }, - { - "epoch": 4.52, - "grad_norm": 1.4528684616088867, - "learning_rate": 5.521267723102586e-06, - "loss": 0.7298, - "step": 4676 - }, - { - "epoch": 4.53, - "grad_norm": 1.4020411968231201, - "learning_rate": 5.517097581317765e-06, - "loss": 0.6495, - "step": 4677 - }, - { - "epoch": 4.53, - "grad_norm": 1.2791095972061157, - "learning_rate": 5.512927439532944e-06, - "loss": 0.596, - "step": 4678 - }, - { - "epoch": 4.53, - "grad_norm": 1.73211669921875, - "learning_rate": 5.508757297748124e-06, - "loss": 0.7312, - "step": 4679 - }, - { - "epoch": 4.53, - "grad_norm": 1.699086308479309, - "learning_rate": 5.504587155963303e-06, - "loss": 0.5438, - "step": 4680 - }, - { - "epoch": 4.53, - "grad_norm": 1.799531102180481, - "learning_rate": 5.500417014178482e-06, - "loss": 0.7692, - "step": 4681 - }, - { - "epoch": 4.53, - "grad_norm": 1.633374810218811, - "learning_rate": 5.496246872393662e-06, - "loss": 0.7507, - "step": 4682 - }, - { - "epoch": 4.53, - "grad_norm": 1.4437192678451538, - "learning_rate": 5.492076730608841e-06, - "loss": 0.7854, - "step": 4683 - }, - { - "epoch": 4.53, - "grad_norm": 1.8305168151855469, - "learning_rate": 5.48790658882402e-06, - "loss": 0.604, - "step": 4684 - }, - { - "epoch": 4.53, - "grad_norm": 1.6433649063110352, - "learning_rate": 5.4837364470392e-06, - "loss": 0.7629, - "step": 4685 - }, - { - "epoch": 4.53, - "grad_norm": 1.6655476093292236, - "learning_rate": 5.4795663052543785e-06, - "loss": 0.7122, - "step": 4686 - }, - { - "epoch": 4.54, - "grad_norm": 1.635684609413147, - "learning_rate": 5.475396163469558e-06, - "loss": 0.6979, - "step": 4687 - }, - { - "epoch": 4.54, - "grad_norm": 1.8280525207519531, - "learning_rate": 5.471226021684738e-06, - "loss": 0.6252, - "step": 4688 - }, - { - "epoch": 4.54, - "grad_norm": 1.7299901247024536, - "learning_rate": 5.467055879899917e-06, - "loss": 0.5505, - "step": 4689 - }, - { - "epoch": 4.54, - "grad_norm": 1.6322293281555176, - "learning_rate": 5.462885738115096e-06, - "loss": 0.7157, - "step": 4690 - }, - { - "epoch": 4.54, - "grad_norm": 2.1160528659820557, - "learning_rate": 5.458715596330276e-06, - "loss": 0.7681, - "step": 4691 - }, - { - "epoch": 4.54, - "grad_norm": 1.3451793193817139, - "learning_rate": 5.4545454545454545e-06, - "loss": 0.6274, - "step": 4692 - }, - { - "epoch": 4.54, - "grad_norm": 1.2206450700759888, - "learning_rate": 5.450375312760634e-06, - "loss": 0.6895, - "step": 4693 - }, - { - "epoch": 4.54, - "grad_norm": 1.297421932220459, - "learning_rate": 5.446205170975814e-06, - "loss": 0.7076, - "step": 4694 - }, - { - "epoch": 4.54, - "grad_norm": 1.439626693725586, - "learning_rate": 5.442035029190993e-06, - "loss": 0.7507, - "step": 4695 - }, - { - "epoch": 4.54, - "grad_norm": 1.2901383638381958, - "learning_rate": 5.437864887406172e-06, - "loss": 0.5414, - "step": 4696 - }, - { - "epoch": 4.54, - "grad_norm": 1.759224772453308, - "learning_rate": 5.433694745621351e-06, - "loss": 0.7401, - "step": 4697 - }, - { - "epoch": 4.55, - "grad_norm": 1.2545334100723267, - "learning_rate": 5.4295246038365304e-06, - "loss": 0.689, - "step": 4698 - }, - { - "epoch": 4.55, - "grad_norm": 1.627297282218933, - "learning_rate": 5.42535446205171e-06, - "loss": 0.8067, - "step": 4699 - }, - { - "epoch": 4.55, - "grad_norm": 1.56171452999115, - "learning_rate": 5.42118432026689e-06, - "loss": 0.7103, - "step": 4700 - }, - { - "epoch": 4.55, - "grad_norm": 1.5313225984573364, - "learning_rate": 5.417014178482069e-06, - "loss": 0.5791, - "step": 4701 - }, - { - "epoch": 4.55, - "grad_norm": 1.573706030845642, - "learning_rate": 5.412844036697248e-06, - "loss": 0.7821, - "step": 4702 - }, - { - "epoch": 4.55, - "grad_norm": 1.542401671409607, - "learning_rate": 5.408673894912427e-06, - "loss": 0.5661, - "step": 4703 - }, - { - "epoch": 4.55, - "grad_norm": 1.9369721412658691, - "learning_rate": 5.404503753127606e-06, - "loss": 0.7782, - "step": 4704 - }, - { - "epoch": 4.55, - "grad_norm": 1.4803828001022339, - "learning_rate": 5.400333611342786e-06, - "loss": 0.7395, - "step": 4705 - }, - { - "epoch": 4.55, - "grad_norm": 1.692859411239624, - "learning_rate": 5.396163469557966e-06, - "loss": 0.6787, - "step": 4706 - }, - { - "epoch": 4.55, - "grad_norm": 1.4623578786849976, - "learning_rate": 5.391993327773145e-06, - "loss": 0.6155, - "step": 4707 - }, - { - "epoch": 4.56, - "grad_norm": 1.4121465682983398, - "learning_rate": 5.387823185988323e-06, - "loss": 0.6422, - "step": 4708 - }, - { - "epoch": 4.56, - "grad_norm": 1.2926206588745117, - "learning_rate": 5.383653044203503e-06, - "loss": 0.7233, - "step": 4709 - }, - { - "epoch": 4.56, - "grad_norm": 1.1981099843978882, - "learning_rate": 5.379482902418682e-06, - "loss": 0.7078, - "step": 4710 - }, - { - "epoch": 4.56, - "grad_norm": 1.4239003658294678, - "learning_rate": 5.375312760633862e-06, - "loss": 0.6135, - "step": 4711 - }, - { - "epoch": 4.56, - "grad_norm": 1.8219012022018433, - "learning_rate": 5.371142618849042e-06, - "loss": 0.6569, - "step": 4712 - }, - { - "epoch": 4.56, - "grad_norm": 1.68372642993927, - "learning_rate": 5.366972477064221e-06, - "loss": 0.4832, - "step": 4713 - }, - { - "epoch": 4.56, - "grad_norm": 1.3844177722930908, - "learning_rate": 5.362802335279399e-06, - "loss": 0.7542, - "step": 4714 - }, - { - "epoch": 4.56, - "grad_norm": 1.6422113180160522, - "learning_rate": 5.358632193494579e-06, - "loss": 0.6599, - "step": 4715 - }, - { - "epoch": 4.56, - "grad_norm": 1.2676887512207031, - "learning_rate": 5.354462051709758e-06, - "loss": 0.6356, - "step": 4716 - }, - { - "epoch": 4.56, - "grad_norm": 1.4765608310699463, - "learning_rate": 5.3502919099249376e-06, - "loss": 0.5737, - "step": 4717 - }, - { - "epoch": 4.57, - "grad_norm": 1.6787502765655518, - "learning_rate": 5.346121768140118e-06, - "loss": 0.5737, - "step": 4718 - }, - { - "epoch": 4.57, - "grad_norm": 1.5113779306411743, - "learning_rate": 5.341951626355296e-06, - "loss": 0.7146, - "step": 4719 - }, - { - "epoch": 4.57, - "grad_norm": 1.2630488872528076, - "learning_rate": 5.337781484570475e-06, - "loss": 0.5984, - "step": 4720 - }, - { - "epoch": 4.57, - "grad_norm": 1.5241637229919434, - "learning_rate": 5.333611342785655e-06, - "loss": 1.0374, - "step": 4721 - }, - { - "epoch": 4.57, - "grad_norm": 1.288524866104126, - "learning_rate": 5.329441201000834e-06, - "loss": 0.6198, - "step": 4722 - }, - { - "epoch": 4.57, - "grad_norm": 1.4171669483184814, - "learning_rate": 5.3252710592160135e-06, - "loss": 0.6126, - "step": 4723 - }, - { - "epoch": 4.57, - "grad_norm": 1.2440285682678223, - "learning_rate": 5.3211009174311936e-06, - "loss": 0.7572, - "step": 4724 - }, - { - "epoch": 4.57, - "grad_norm": 1.20148503780365, - "learning_rate": 5.316930775646372e-06, - "loss": 0.6161, - "step": 4725 - }, - { - "epoch": 4.57, - "grad_norm": 1.493597388267517, - "learning_rate": 5.312760633861551e-06, - "loss": 0.6352, - "step": 4726 - }, - { - "epoch": 4.57, - "grad_norm": 1.736620306968689, - "learning_rate": 5.308590492076731e-06, - "loss": 0.7501, - "step": 4727 - }, - { - "epoch": 4.57, - "grad_norm": 1.3761045932769775, - "learning_rate": 5.30442035029191e-06, - "loss": 0.7785, - "step": 4728 - }, - { - "epoch": 4.58, - "grad_norm": 1.5715200901031494, - "learning_rate": 5.3002502085070895e-06, - "loss": 0.658, - "step": 4729 - }, - { - "epoch": 4.58, - "grad_norm": 2.1754579544067383, - "learning_rate": 5.296080066722269e-06, - "loss": 0.7418, - "step": 4730 - }, - { - "epoch": 4.58, - "grad_norm": 1.5906949043273926, - "learning_rate": 5.291909924937448e-06, - "loss": 0.641, - "step": 4731 - }, - { - "epoch": 4.58, - "grad_norm": 1.359338402748108, - "learning_rate": 5.287739783152627e-06, - "loss": 0.5666, - "step": 4732 - }, - { - "epoch": 4.58, - "grad_norm": 1.6645625829696655, - "learning_rate": 5.283569641367807e-06, - "loss": 0.8261, - "step": 4733 - }, - { - "epoch": 4.58, - "grad_norm": 1.646085262298584, - "learning_rate": 5.279399499582986e-06, - "loss": 0.7408, - "step": 4734 - }, - { - "epoch": 4.58, - "grad_norm": 1.2797610759735107, - "learning_rate": 5.2752293577981655e-06, - "loss": 0.7439, - "step": 4735 - }, - { - "epoch": 4.58, - "grad_norm": 2.1849143505096436, - "learning_rate": 5.271059216013345e-06, - "loss": 0.76, - "step": 4736 - }, - { - "epoch": 4.58, - "grad_norm": 1.5100131034851074, - "learning_rate": 5.266889074228524e-06, - "loss": 0.7065, - "step": 4737 - }, - { - "epoch": 4.58, - "grad_norm": 1.7423347234725952, - "learning_rate": 5.262718932443703e-06, - "loss": 0.6817, - "step": 4738 - }, - { - "epoch": 4.59, - "grad_norm": 1.381550669670105, - "learning_rate": 5.258548790658883e-06, - "loss": 0.6844, - "step": 4739 - }, - { - "epoch": 4.59, - "grad_norm": 1.2365972995758057, - "learning_rate": 5.254378648874062e-06, - "loss": 0.7174, - "step": 4740 - }, - { - "epoch": 4.59, - "grad_norm": 1.9256683588027954, - "learning_rate": 5.250208507089241e-06, - "loss": 0.8264, - "step": 4741 - }, - { - "epoch": 4.59, - "grad_norm": 1.5130013227462769, - "learning_rate": 5.246038365304421e-06, - "loss": 0.6881, - "step": 4742 - }, - { - "epoch": 4.59, - "grad_norm": 1.4686535596847534, - "learning_rate": 5.2418682235196e-06, - "loss": 0.7101, - "step": 4743 - }, - { - "epoch": 4.59, - "grad_norm": 1.4216227531433105, - "learning_rate": 5.237698081734779e-06, - "loss": 0.7207, - "step": 4744 - }, - { - "epoch": 4.59, - "grad_norm": 1.66473388671875, - "learning_rate": 5.233527939949959e-06, - "loss": 0.7675, - "step": 4745 - }, - { - "epoch": 4.59, - "grad_norm": 1.5849100351333618, - "learning_rate": 5.229357798165138e-06, - "loss": 0.6856, - "step": 4746 - }, - { - "epoch": 4.59, - "grad_norm": 1.4400815963745117, - "learning_rate": 5.225187656380317e-06, - "loss": 0.6889, - "step": 4747 - }, - { - "epoch": 4.59, - "grad_norm": 1.6706531047821045, - "learning_rate": 5.221017514595497e-06, - "loss": 0.7243, - "step": 4748 - }, - { - "epoch": 4.6, - "grad_norm": 1.865055799484253, - "learning_rate": 5.216847372810676e-06, - "loss": 0.7578, - "step": 4749 - }, - { - "epoch": 4.6, - "grad_norm": 1.6698907613754272, - "learning_rate": 5.212677231025855e-06, - "loss": 0.6211, - "step": 4750 - }, - { - "epoch": 4.6, - "grad_norm": 1.5708190202713013, - "learning_rate": 5.208507089241035e-06, - "loss": 0.7566, - "step": 4751 - }, - { - "epoch": 4.6, - "grad_norm": 2.0286459922790527, - "learning_rate": 5.204336947456213e-06, - "loss": 0.6628, - "step": 4752 - }, - { - "epoch": 4.6, - "grad_norm": 1.241782307624817, - "learning_rate": 5.200166805671393e-06, - "loss": 0.6194, - "step": 4753 - }, - { - "epoch": 4.6, - "grad_norm": 1.5183227062225342, - "learning_rate": 5.195996663886573e-06, - "loss": 0.7809, - "step": 4754 - }, - { - "epoch": 4.6, - "grad_norm": 1.5377471446990967, - "learning_rate": 5.191826522101752e-06, - "loss": 0.8283, - "step": 4755 - }, - { - "epoch": 4.6, - "grad_norm": 1.3237930536270142, - "learning_rate": 5.187656380316931e-06, - "loss": 0.7827, - "step": 4756 - }, - { - "epoch": 4.6, - "grad_norm": 1.519793152809143, - "learning_rate": 5.183486238532111e-06, - "loss": 0.6665, - "step": 4757 - }, - { - "epoch": 4.6, - "grad_norm": 1.4032889604568481, - "learning_rate": 5.179316096747289e-06, - "loss": 0.6367, - "step": 4758 - }, - { - "epoch": 4.6, - "grad_norm": 1.611026406288147, - "learning_rate": 5.1751459549624686e-06, - "loss": 0.6187, - "step": 4759 - }, - { - "epoch": 4.61, - "grad_norm": 1.4174565076828003, - "learning_rate": 5.170975813177649e-06, - "loss": 1.0453, - "step": 4760 - }, - { - "epoch": 4.61, - "grad_norm": 1.5549442768096924, - "learning_rate": 5.166805671392828e-06, - "loss": 0.6455, - "step": 4761 - }, - { - "epoch": 4.61, - "grad_norm": 1.5228101015090942, - "learning_rate": 5.162635529608007e-06, - "loss": 0.6295, - "step": 4762 - }, - { - "epoch": 4.61, - "grad_norm": 1.405816674232483, - "learning_rate": 5.158465387823186e-06, - "loss": 0.6324, - "step": 4763 - }, - { - "epoch": 4.61, - "grad_norm": 1.5264065265655518, - "learning_rate": 5.154295246038365e-06, - "loss": 0.6643, - "step": 4764 - }, - { - "epoch": 4.61, - "grad_norm": 1.7057868242263794, - "learning_rate": 5.1501251042535445e-06, - "loss": 0.638, - "step": 4765 - }, - { - "epoch": 4.61, - "grad_norm": 1.5288913249969482, - "learning_rate": 5.1459549624687246e-06, - "loss": 0.7412, - "step": 4766 - }, - { - "epoch": 4.61, - "grad_norm": 1.7368863821029663, - "learning_rate": 5.141784820683904e-06, - "loss": 0.6938, - "step": 4767 - }, - { - "epoch": 4.61, - "grad_norm": 2.2275304794311523, - "learning_rate": 5.137614678899083e-06, - "loss": 0.7366, - "step": 4768 - }, - { - "epoch": 4.61, - "grad_norm": 1.8191800117492676, - "learning_rate": 5.133444537114262e-06, - "loss": 0.7383, - "step": 4769 - }, - { - "epoch": 4.62, - "grad_norm": 1.4102978706359863, - "learning_rate": 5.129274395329441e-06, - "loss": 0.74, - "step": 4770 - }, - { - "epoch": 4.62, - "grad_norm": 1.8225984573364258, - "learning_rate": 5.1251042535446205e-06, - "loss": 0.721, - "step": 4771 - }, - { - "epoch": 4.62, - "grad_norm": 1.3556174039840698, - "learning_rate": 5.1209341117598006e-06, - "loss": 0.7782, - "step": 4772 - }, - { - "epoch": 4.62, - "grad_norm": 1.751905918121338, - "learning_rate": 5.11676396997498e-06, - "loss": 0.8165, - "step": 4773 - }, - { - "epoch": 4.62, - "grad_norm": 1.4879447221755981, - "learning_rate": 5.112593828190159e-06, - "loss": 0.6494, - "step": 4774 - }, - { - "epoch": 4.62, - "grad_norm": 1.6665265560150146, - "learning_rate": 5.108423686405338e-06, - "loss": 0.5807, - "step": 4775 - }, - { - "epoch": 4.62, - "grad_norm": 1.3465780019760132, - "learning_rate": 5.104253544620517e-06, - "loss": 0.7655, - "step": 4776 - }, - { - "epoch": 4.62, - "grad_norm": 1.2939397096633911, - "learning_rate": 5.1000834028356965e-06, - "loss": 0.6662, - "step": 4777 - }, - { - "epoch": 4.62, - "grad_norm": 1.625292420387268, - "learning_rate": 5.0959132610508765e-06, - "loss": 0.7895, - "step": 4778 - }, - { - "epoch": 4.62, - "grad_norm": 1.1519323587417603, - "learning_rate": 5.091743119266056e-06, - "loss": 0.748, - "step": 4779 - }, - { - "epoch": 4.63, - "grad_norm": 1.2725067138671875, - "learning_rate": 5.087572977481234e-06, - "loss": 0.8611, - "step": 4780 - }, - { - "epoch": 4.63, - "grad_norm": 1.5201002359390259, - "learning_rate": 5.083402835696414e-06, - "loss": 0.6974, - "step": 4781 - }, - { - "epoch": 4.63, - "grad_norm": 1.7186866998672485, - "learning_rate": 5.079232693911593e-06, - "loss": 0.6424, - "step": 4782 - }, - { - "epoch": 4.63, - "grad_norm": 1.6808973550796509, - "learning_rate": 5.0750625521267725e-06, - "loss": 0.6133, - "step": 4783 - }, - { - "epoch": 4.63, - "grad_norm": 1.4706043004989624, - "learning_rate": 5.0708924103419525e-06, - "loss": 0.7042, - "step": 4784 - }, - { - "epoch": 4.63, - "grad_norm": 1.3771601915359497, - "learning_rate": 5.066722268557132e-06, - "loss": 0.6956, - "step": 4785 - }, - { - "epoch": 4.63, - "grad_norm": 1.5026473999023438, - "learning_rate": 5.06255212677231e-06, - "loss": 0.6319, - "step": 4786 - }, - { - "epoch": 4.63, - "grad_norm": 1.444579839706421, - "learning_rate": 5.05838198498749e-06, - "loss": 0.7188, - "step": 4787 - }, - { - "epoch": 4.63, - "grad_norm": 1.5965029001235962, - "learning_rate": 5.054211843202669e-06, - "loss": 0.6373, - "step": 4788 - }, - { - "epoch": 4.63, - "grad_norm": 1.3108526468276978, - "learning_rate": 5.0500417014178485e-06, - "loss": 0.6486, - "step": 4789 - }, - { - "epoch": 4.63, - "grad_norm": 1.3351519107818604, - "learning_rate": 5.045871559633028e-06, - "loss": 0.61, - "step": 4790 - }, - { - "epoch": 4.64, - "grad_norm": 1.2217323780059814, - "learning_rate": 5.041701417848207e-06, - "loss": 0.6368, - "step": 4791 - }, - { - "epoch": 4.64, - "grad_norm": 1.9024354219436646, - "learning_rate": 5.037531276063386e-06, - "loss": 0.7607, - "step": 4792 - }, - { - "epoch": 4.64, - "grad_norm": 1.520367980003357, - "learning_rate": 5.033361134278566e-06, - "loss": 0.6358, - "step": 4793 - }, - { - "epoch": 4.64, - "grad_norm": 1.536771297454834, - "learning_rate": 5.029190992493745e-06, - "loss": 0.7668, - "step": 4794 - }, - { - "epoch": 4.64, - "grad_norm": 1.5564029216766357, - "learning_rate": 5.0250208507089244e-06, - "loss": 0.7745, - "step": 4795 - }, - { - "epoch": 4.64, - "grad_norm": 1.4659467935562134, - "learning_rate": 5.020850708924104e-06, - "loss": 0.6083, - "step": 4796 - }, - { - "epoch": 4.64, - "grad_norm": 1.8061070442199707, - "learning_rate": 5.016680567139283e-06, - "loss": 0.8638, - "step": 4797 - }, - { - "epoch": 4.64, - "grad_norm": 1.3351647853851318, - "learning_rate": 5.012510425354462e-06, - "loss": 0.8551, - "step": 4798 - }, - { - "epoch": 4.64, - "grad_norm": 1.2969155311584473, - "learning_rate": 5.008340283569642e-06, - "loss": 0.612, - "step": 4799 - }, - { - "epoch": 4.64, - "grad_norm": 1.4245238304138184, - "learning_rate": 5.004170141784821e-06, - "loss": 0.6597, - "step": 4800 - }, - { - "epoch": 4.64, - "eval_loss": 0.8592209815979004, - "eval_runtime": 856.5817, - "eval_samples_per_second": 4.826, - "eval_steps_per_second": 0.604, - "step": 4800 } ], "logging_steps": 1, @@ -33742,7 +21094,7 @@ "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 300, - "total_flos": 2.092122684281979e+18, + "total_flos": 1.308129860002775e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null