{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5907, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005078720162519045, "grad_norm": 8.11971664428711, "learning_rate": 7.614213197969544e-08, "loss": 0.755, "step": 10 }, { "epoch": 0.01015744032503809, "grad_norm": 7.887208938598633, "learning_rate": 1.607445008460237e-07, "loss": 0.7436, "step": 20 }, { "epoch": 0.015236160487557136, "grad_norm": 6.621643543243408, "learning_rate": 2.4534686971235194e-07, "loss": 0.7286, "step": 30 }, { "epoch": 0.02031488065007618, "grad_norm": 5.338628768920898, "learning_rate": 3.2994923857868026e-07, "loss": 0.6637, "step": 40 }, { "epoch": 0.025393600812595226, "grad_norm": 3.3166372776031494, "learning_rate": 4.1455160744500853e-07, "loss": 0.5849, "step": 50 }, { "epoch": 0.03047232097511427, "grad_norm": 2.2426822185516357, "learning_rate": 4.991539763113367e-07, "loss": 0.5074, "step": 60 }, { "epoch": 0.03555104113763331, "grad_norm": 1.4896293878555298, "learning_rate": 5.83756345177665e-07, "loss": 0.4381, "step": 70 }, { "epoch": 0.04062976130015236, "grad_norm": 1.3547300100326538, "learning_rate": 6.683587140439933e-07, "loss": 0.3912, "step": 80 }, { "epoch": 0.0457084814626714, "grad_norm": 1.2349778413772583, "learning_rate": 7.529610829103214e-07, "loss": 0.3717, "step": 90 }, { "epoch": 0.05078720162519045, "grad_norm": 1.228826880455017, "learning_rate": 8.375634517766498e-07, "loss": 0.3763, "step": 100 }, { "epoch": 0.055865921787709494, "grad_norm": 1.9437307119369507, "learning_rate": 9.22165820642978e-07, "loss": 0.3648, "step": 110 }, { "epoch": 0.06094464195022854, "grad_norm": 1.1562706232070923, "learning_rate": 1.0067681895093063e-06, "loss": 0.3593, "step": 120 }, { "epoch": 0.06602336211274759, "grad_norm": 1.2007368803024292, "learning_rate": 1.0913705583756345e-06, "loss": 0.3619, "step": 130 }, { "epoch": 0.07110208227526663, "grad_norm": 1.1364222764968872, "learning_rate": 1.1759729272419628e-06, "loss": 0.3492, "step": 140 }, { "epoch": 0.07618080243778567, "grad_norm": 1.2867300510406494, "learning_rate": 1.2605752961082913e-06, "loss": 0.359, "step": 150 }, { "epoch": 0.08125952260030472, "grad_norm": 1.230358600616455, "learning_rate": 1.3451776649746193e-06, "loss": 0.3396, "step": 160 }, { "epoch": 0.08633824276282377, "grad_norm": 1.1166342496871948, "learning_rate": 1.4297800338409476e-06, "loss": 0.3412, "step": 170 }, { "epoch": 0.0914169629253428, "grad_norm": 1.1211029291152954, "learning_rate": 1.5143824027072759e-06, "loss": 0.3436, "step": 180 }, { "epoch": 0.09649568308786186, "grad_norm": 1.0837715864181519, "learning_rate": 1.5989847715736043e-06, "loss": 0.3399, "step": 190 }, { "epoch": 0.1015744032503809, "grad_norm": 1.1223688125610352, "learning_rate": 1.6835871404399324e-06, "loss": 0.3465, "step": 200 }, { "epoch": 0.10665312341289995, "grad_norm": 1.1082217693328857, "learning_rate": 1.7681895093062607e-06, "loss": 0.3422, "step": 210 }, { "epoch": 0.11173184357541899, "grad_norm": 1.1912903785705566, "learning_rate": 1.852791878172589e-06, "loss": 0.3336, "step": 220 }, { "epoch": 0.11681056373793804, "grad_norm": 1.1215198040008545, "learning_rate": 1.937394247038917e-06, "loss": 0.3293, "step": 230 }, { "epoch": 0.12188928390045708, "grad_norm": 1.1722122430801392, "learning_rate": 2.0219966159052453e-06, "loss": 0.3376, "step": 240 }, { "epoch": 0.12696800406297612, "grad_norm": 1.1333527565002441, "learning_rate": 2.1065989847715737e-06, "loss": 0.342, "step": 250 }, { "epoch": 0.13204672422549518, "grad_norm": 1.0237016677856445, "learning_rate": 2.1912013536379022e-06, "loss": 0.3322, "step": 260 }, { "epoch": 0.13712544438801422, "grad_norm": 1.061033844947815, "learning_rate": 2.2758037225042303e-06, "loss": 0.3311, "step": 270 }, { "epoch": 0.14220416455053325, "grad_norm": 1.1101399660110474, "learning_rate": 2.3604060913705588e-06, "loss": 0.3426, "step": 280 }, { "epoch": 0.14728288471305231, "grad_norm": 1.1787445545196533, "learning_rate": 2.445008460236887e-06, "loss": 0.321, "step": 290 }, { "epoch": 0.15236160487557135, "grad_norm": 1.0249688625335693, "learning_rate": 2.5296108291032153e-06, "loss": 0.328, "step": 300 }, { "epoch": 0.1574403250380904, "grad_norm": 1.058312177658081, "learning_rate": 2.6142131979695434e-06, "loss": 0.3299, "step": 310 }, { "epoch": 0.16251904520060945, "grad_norm": 1.0902348756790161, "learning_rate": 2.698815566835872e-06, "loss": 0.3192, "step": 320 }, { "epoch": 0.16759776536312848, "grad_norm": 1.0917754173278809, "learning_rate": 2.7834179357022e-06, "loss": 0.3211, "step": 330 }, { "epoch": 0.17267648552564754, "grad_norm": 1.0080076456069946, "learning_rate": 2.8680203045685284e-06, "loss": 0.3299, "step": 340 }, { "epoch": 0.17775520568816658, "grad_norm": 1.2910374402999878, "learning_rate": 2.952622673434856e-06, "loss": 0.3286, "step": 350 }, { "epoch": 0.1828339258506856, "grad_norm": 1.014559268951416, "learning_rate": 3.0372250423011845e-06, "loss": 0.3174, "step": 360 }, { "epoch": 0.18791264601320468, "grad_norm": 1.0668742656707764, "learning_rate": 3.121827411167513e-06, "loss": 0.327, "step": 370 }, { "epoch": 0.1929913661757237, "grad_norm": 1.0306109189987183, "learning_rate": 3.206429780033841e-06, "loss": 0.3266, "step": 380 }, { "epoch": 0.19807008633824277, "grad_norm": 1.1934700012207031, "learning_rate": 3.2910321489001695e-06, "loss": 0.3288, "step": 390 }, { "epoch": 0.2031488065007618, "grad_norm": 1.1355713605880737, "learning_rate": 3.375634517766498e-06, "loss": 0.3318, "step": 400 }, { "epoch": 0.20822752666328084, "grad_norm": 1.0128185749053955, "learning_rate": 3.460236886632826e-06, "loss": 0.3153, "step": 410 }, { "epoch": 0.2133062468257999, "grad_norm": 1.1666514873504639, "learning_rate": 3.5448392554991545e-06, "loss": 0.3249, "step": 420 }, { "epoch": 0.21838496698831894, "grad_norm": 1.1061981916427612, "learning_rate": 3.629441624365482e-06, "loss": 0.3218, "step": 430 }, { "epoch": 0.22346368715083798, "grad_norm": 1.1475132703781128, "learning_rate": 3.7140439932318106e-06, "loss": 0.3338, "step": 440 }, { "epoch": 0.22854240731335704, "grad_norm": 1.1342549324035645, "learning_rate": 3.798646362098139e-06, "loss": 0.3251, "step": 450 }, { "epoch": 0.23362112747587607, "grad_norm": 1.0099126100540161, "learning_rate": 3.883248730964467e-06, "loss": 0.3251, "step": 460 }, { "epoch": 0.23869984763839514, "grad_norm": 1.112540602684021, "learning_rate": 3.967851099830796e-06, "loss": 0.3234, "step": 470 }, { "epoch": 0.24377856780091417, "grad_norm": 1.0405648946762085, "learning_rate": 4.052453468697124e-06, "loss": 0.3255, "step": 480 }, { "epoch": 0.2488572879634332, "grad_norm": 1.0701533555984497, "learning_rate": 4.137055837563453e-06, "loss": 0.3214, "step": 490 }, { "epoch": 0.25393600812595224, "grad_norm": 1.2194687128067017, "learning_rate": 4.22165820642978e-06, "loss": 0.3221, "step": 500 }, { "epoch": 0.25393600812595224, "eval_loss": 0.3276064395904541, "eval_runtime": 121.7232, "eval_samples_per_second": 41.077, "eval_steps_per_second": 2.571, "step": 500 }, { "epoch": 0.25901472828847133, "grad_norm": 1.0545361042022705, "learning_rate": 4.306260575296109e-06, "loss": 0.3102, "step": 510 }, { "epoch": 0.26409344845099036, "grad_norm": 0.9262936115264893, "learning_rate": 4.390862944162436e-06, "loss": 0.3044, "step": 520 }, { "epoch": 0.2691721686135094, "grad_norm": 1.0422135591506958, "learning_rate": 4.475465313028765e-06, "loss": 0.3178, "step": 530 }, { "epoch": 0.27425088877602843, "grad_norm": 1.108899474143982, "learning_rate": 4.560067681895093e-06, "loss": 0.3224, "step": 540 }, { "epoch": 0.27932960893854747, "grad_norm": 0.9598031044006348, "learning_rate": 4.644670050761422e-06, "loss": 0.3232, "step": 550 }, { "epoch": 0.2844083291010665, "grad_norm": 1.0063800811767578, "learning_rate": 4.72927241962775e-06, "loss": 0.3224, "step": 560 }, { "epoch": 0.2894870492635856, "grad_norm": 1.0170040130615234, "learning_rate": 4.813874788494079e-06, "loss": 0.3204, "step": 570 }, { "epoch": 0.29456576942610463, "grad_norm": 1.1103383302688599, "learning_rate": 4.898477157360406e-06, "loss": 0.3249, "step": 580 }, { "epoch": 0.29964448958862366, "grad_norm": 1.101906657218933, "learning_rate": 4.983079526226735e-06, "loss": 0.3185, "step": 590 }, { "epoch": 0.3047232097511427, "grad_norm": 0.9216155409812927, "learning_rate": 4.999972060477541e-06, "loss": 0.3197, "step": 600 }, { "epoch": 0.30980192991366173, "grad_norm": 1.0119545459747314, "learning_rate": 4.999858557237848e-06, "loss": 0.316, "step": 610 }, { "epoch": 0.3148806500761808, "grad_norm": 0.9301111698150635, "learning_rate": 4.999657748021748e-06, "loss": 0.3256, "step": 620 }, { "epoch": 0.31995937023869986, "grad_norm": 0.944416344165802, "learning_rate": 4.999369639842375e-06, "loss": 0.3253, "step": 630 }, { "epoch": 0.3250380904012189, "grad_norm": 0.856289803981781, "learning_rate": 4.998994242761724e-06, "loss": 0.3142, "step": 640 }, { "epoch": 0.33011681056373793, "grad_norm": 1.0146931409835815, "learning_rate": 4.998531569890301e-06, "loss": 0.3117, "step": 650 }, { "epoch": 0.33519553072625696, "grad_norm": 0.859649121761322, "learning_rate": 4.997981637386663e-06, "loss": 0.3089, "step": 660 }, { "epoch": 0.34027425088877605, "grad_norm": 0.9393402338027954, "learning_rate": 4.997344464456854e-06, "loss": 0.3121, "step": 670 }, { "epoch": 0.3453529710512951, "grad_norm": 0.9901228547096252, "learning_rate": 4.9966200733537345e-06, "loss": 0.3268, "step": 680 }, { "epoch": 0.3504316912138141, "grad_norm": 0.9233267307281494, "learning_rate": 4.995808489376206e-06, "loss": 0.3214, "step": 690 }, { "epoch": 0.35551041137633316, "grad_norm": 0.8530528545379639, "learning_rate": 4.9949097408683235e-06, "loss": 0.3148, "step": 700 }, { "epoch": 0.3605891315388522, "grad_norm": 0.9197198748588562, "learning_rate": 4.99392385921831e-06, "loss": 0.3108, "step": 710 }, { "epoch": 0.3656678517013712, "grad_norm": 0.9738614559173584, "learning_rate": 4.992850878857458e-06, "loss": 0.3189, "step": 720 }, { "epoch": 0.3707465718638903, "grad_norm": 0.9086189270019531, "learning_rate": 4.991690837258926e-06, "loss": 0.3157, "step": 730 }, { "epoch": 0.37582529202640935, "grad_norm": 1.1314260959625244, "learning_rate": 4.990443774936432e-06, "loss": 0.3295, "step": 740 }, { "epoch": 0.3809040121889284, "grad_norm": 0.9313626885414124, "learning_rate": 4.989109735442838e-06, "loss": 0.3223, "step": 750 }, { "epoch": 0.3859827323514474, "grad_norm": 0.894349217414856, "learning_rate": 4.987688765368628e-06, "loss": 0.3068, "step": 760 }, { "epoch": 0.39106145251396646, "grad_norm": 0.8493692874908447, "learning_rate": 4.986180914340281e-06, "loss": 0.3145, "step": 770 }, { "epoch": 0.39614017267648555, "grad_norm": 0.935171365737915, "learning_rate": 4.9845862350185405e-06, "loss": 0.3155, "step": 780 }, { "epoch": 0.4012188928390046, "grad_norm": 0.8560822010040283, "learning_rate": 4.98290478309657e-06, "loss": 0.3274, "step": 790 }, { "epoch": 0.4062976130015236, "grad_norm": 0.867138147354126, "learning_rate": 4.981136617298012e-06, "loss": 0.314, "step": 800 }, { "epoch": 0.41137633316404265, "grad_norm": 0.8818840384483337, "learning_rate": 4.97928179937494e-06, "loss": 0.3182, "step": 810 }, { "epoch": 0.4164550533265617, "grad_norm": 0.9063979387283325, "learning_rate": 4.977340394105692e-06, "loss": 0.3152, "step": 820 }, { "epoch": 0.4215337734890808, "grad_norm": 0.8890411257743835, "learning_rate": 4.975312469292618e-06, "loss": 0.3084, "step": 830 }, { "epoch": 0.4266124936515998, "grad_norm": 0.8582976460456848, "learning_rate": 4.973198095759708e-06, "loss": 0.3054, "step": 840 }, { "epoch": 0.43169121381411885, "grad_norm": 0.8314648270606995, "learning_rate": 4.970997347350117e-06, "loss": 0.3067, "step": 850 }, { "epoch": 0.4367699339766379, "grad_norm": 0.9010292887687683, "learning_rate": 4.96871030092359e-06, "loss": 0.3112, "step": 860 }, { "epoch": 0.4418486541391569, "grad_norm": 0.8275406956672668, "learning_rate": 4.966337036353775e-06, "loss": 0.3075, "step": 870 }, { "epoch": 0.44692737430167595, "grad_norm": 1.2174943685531616, "learning_rate": 4.963877636525431e-06, "loss": 0.3166, "step": 880 }, { "epoch": 0.45200609446419504, "grad_norm": 0.8684174418449402, "learning_rate": 4.961332187331541e-06, "loss": 0.3221, "step": 890 }, { "epoch": 0.4570848146267141, "grad_norm": 0.864565372467041, "learning_rate": 4.958700777670306e-06, "loss": 0.3168, "step": 900 }, { "epoch": 0.4621635347892331, "grad_norm": 0.8785614967346191, "learning_rate": 4.955983499442039e-06, "loss": 0.3168, "step": 910 }, { "epoch": 0.46724225495175215, "grad_norm": 0.7927045226097107, "learning_rate": 4.953180447545965e-06, "loss": 0.3175, "step": 920 }, { "epoch": 0.4723209751142712, "grad_norm": 0.9102469086647034, "learning_rate": 4.950291719876891e-06, "loss": 0.3217, "step": 930 }, { "epoch": 0.47739969527679027, "grad_norm": 0.8130275011062622, "learning_rate": 4.947317417321803e-06, "loss": 0.2997, "step": 940 }, { "epoch": 0.4824784154393093, "grad_norm": 0.8968609571456909, "learning_rate": 4.944257643756333e-06, "loss": 0.3148, "step": 950 }, { "epoch": 0.48755713560182834, "grad_norm": 0.9034120440483093, "learning_rate": 4.941112506041135e-06, "loss": 0.3132, "step": 960 }, { "epoch": 0.4926358557643474, "grad_norm": 0.8752701282501221, "learning_rate": 4.93788211401815e-06, "loss": 0.3141, "step": 970 }, { "epoch": 0.4977145759268664, "grad_norm": 0.9108573794364929, "learning_rate": 4.9345665805067735e-06, "loss": 0.3175, "step": 980 }, { "epoch": 0.5027932960893855, "grad_norm": 0.9702094197273254, "learning_rate": 4.931166021299914e-06, "loss": 0.3055, "step": 990 }, { "epoch": 0.5078720162519045, "grad_norm": 0.9001919031143188, "learning_rate": 4.927680555159946e-06, "loss": 0.3095, "step": 1000 }, { "epoch": 0.5078720162519045, "eval_loss": 0.3185386657714844, "eval_runtime": 121.8144, "eval_samples_per_second": 41.046, "eval_steps_per_second": 2.569, "step": 1000 }, { "epoch": 0.5129507364144236, "grad_norm": 0.9391583800315857, "learning_rate": 4.924110303814567e-06, "loss": 0.3165, "step": 1010 }, { "epoch": 0.5180294565769427, "grad_norm": 0.8731343150138855, "learning_rate": 4.920455391952543e-06, "loss": 0.303, "step": 1020 }, { "epoch": 0.5231081767394616, "grad_norm": 0.811320424079895, "learning_rate": 4.916715947219356e-06, "loss": 0.3015, "step": 1030 }, { "epoch": 0.5281868969019807, "grad_norm": 0.9213032126426697, "learning_rate": 4.912892100212744e-06, "loss": 0.3054, "step": 1040 }, { "epoch": 0.5332656170644997, "grad_norm": 0.8103631734848022, "learning_rate": 4.908983984478141e-06, "loss": 0.2985, "step": 1050 }, { "epoch": 0.5383443372270188, "grad_norm": 0.8164056539535522, "learning_rate": 4.9049917365040135e-06, "loss": 0.3138, "step": 1060 }, { "epoch": 0.5434230573895379, "grad_norm": 0.7794240713119507, "learning_rate": 4.900915495717092e-06, "loss": 0.3073, "step": 1070 }, { "epoch": 0.5485017775520569, "grad_norm": 0.8127440810203552, "learning_rate": 4.896755404477505e-06, "loss": 0.3047, "step": 1080 }, { "epoch": 0.553580497714576, "grad_norm": 0.8375741243362427, "learning_rate": 4.892511608073804e-06, "loss": 0.3178, "step": 1090 }, { "epoch": 0.5586592178770949, "grad_norm": 0.836986243724823, "learning_rate": 4.888184254717886e-06, "loss": 0.314, "step": 1100 }, { "epoch": 0.563737938039614, "grad_norm": 0.8443421125411987, "learning_rate": 4.88377349553983e-06, "loss": 0.3187, "step": 1110 }, { "epoch": 0.568816658202133, "grad_norm": 0.8671994805335999, "learning_rate": 4.879279484582603e-06, "loss": 0.3152, "step": 1120 }, { "epoch": 0.5738953783646521, "grad_norm": 0.7974778413772583, "learning_rate": 4.874702378796694e-06, "loss": 0.3064, "step": 1130 }, { "epoch": 0.5789740985271712, "grad_norm": 0.8144264221191406, "learning_rate": 4.870042338034618e-06, "loss": 0.3083, "step": 1140 }, { "epoch": 0.5840528186896902, "grad_norm": 0.8496512770652771, "learning_rate": 4.8652995250453515e-06, "loss": 0.3011, "step": 1150 }, { "epoch": 0.5891315388522093, "grad_norm": 0.8523489832878113, "learning_rate": 4.86047410546863e-06, "loss": 0.3105, "step": 1160 }, { "epoch": 0.5942102590147282, "grad_norm": 0.8697237968444824, "learning_rate": 4.855566247829177e-06, "loss": 0.3118, "step": 1170 }, { "epoch": 0.5992889791772473, "grad_norm": 0.758233368396759, "learning_rate": 4.85057612353081e-06, "loss": 0.3057, "step": 1180 }, { "epoch": 0.6043676993397664, "grad_norm": 1.0396537780761719, "learning_rate": 4.845503906850461e-06, "loss": 0.3081, "step": 1190 }, { "epoch": 0.6094464195022854, "grad_norm": 0.8582219481468201, "learning_rate": 4.840349774932081e-06, "loss": 0.3101, "step": 1200 }, { "epoch": 0.6145251396648045, "grad_norm": 0.8096279501914978, "learning_rate": 4.835113907780464e-06, "loss": 0.3162, "step": 1210 }, { "epoch": 0.6196038598273235, "grad_norm": 0.7163273692131042, "learning_rate": 4.829796488254954e-06, "loss": 0.3104, "step": 1220 }, { "epoch": 0.6246825799898426, "grad_norm": 0.8141240477561951, "learning_rate": 4.824397702063058e-06, "loss": 0.3225, "step": 1230 }, { "epoch": 0.6297613001523616, "grad_norm": 0.7636848092079163, "learning_rate": 4.8189177377539635e-06, "loss": 0.3172, "step": 1240 }, { "epoch": 0.6348400203148806, "grad_norm": 0.8381786942481995, "learning_rate": 4.8133567867119525e-06, "loss": 0.3091, "step": 1250 }, { "epoch": 0.6399187404773997, "grad_norm": 0.7415141463279724, "learning_rate": 4.8077150431497175e-06, "loss": 0.3119, "step": 1260 }, { "epoch": 0.6449974606399187, "grad_norm": 0.7708622813224792, "learning_rate": 4.801992704101578e-06, "loss": 0.3121, "step": 1270 }, { "epoch": 0.6500761808024378, "grad_norm": 0.8236015439033508, "learning_rate": 4.796189969416601e-06, "loss": 0.3042, "step": 1280 }, { "epoch": 0.6551549009649569, "grad_norm": 0.7215218544006348, "learning_rate": 4.790307041751617e-06, "loss": 0.3031, "step": 1290 }, { "epoch": 0.6602336211274759, "grad_norm": 0.7796043753623962, "learning_rate": 4.78434412656415e-06, "loss": 0.305, "step": 1300 }, { "epoch": 0.665312341289995, "grad_norm": 0.8012691140174866, "learning_rate": 4.778301432105234e-06, "loss": 0.3066, "step": 1310 }, { "epoch": 0.6703910614525139, "grad_norm": 0.8139703273773193, "learning_rate": 4.772179169412146e-06, "loss": 0.3023, "step": 1320 }, { "epoch": 0.675469781615033, "grad_norm": 0.7603716850280762, "learning_rate": 4.765977552301031e-06, "loss": 0.3093, "step": 1330 }, { "epoch": 0.6805485017775521, "grad_norm": 0.8556414842605591, "learning_rate": 4.759696797359438e-06, "loss": 0.3084, "step": 1340 }, { "epoch": 0.6856272219400711, "grad_norm": 0.7557649612426758, "learning_rate": 4.753337123938754e-06, "loss": 0.3057, "step": 1350 }, { "epoch": 0.6907059421025902, "grad_norm": 0.6622769832611084, "learning_rate": 4.746898754146545e-06, "loss": 0.3034, "step": 1360 }, { "epoch": 0.6957846622651092, "grad_norm": 0.8777470588684082, "learning_rate": 4.740381912838797e-06, "loss": 0.2972, "step": 1370 }, { "epoch": 0.7008633824276282, "grad_norm": 0.9032386541366577, "learning_rate": 4.733786827612064e-06, "loss": 0.3075, "step": 1380 }, { "epoch": 0.7059421025901473, "grad_norm": 0.7780275940895081, "learning_rate": 4.72711372879552e-06, "loss": 0.3057, "step": 1390 }, { "epoch": 0.7110208227526663, "grad_norm": 0.8000150918960571, "learning_rate": 4.720362849442912e-06, "loss": 0.3118, "step": 1400 }, { "epoch": 0.7160995429151854, "grad_norm": 0.7798091173171997, "learning_rate": 4.713534425324426e-06, "loss": 0.3049, "step": 1410 }, { "epoch": 0.7211782630777044, "grad_norm": 0.7654507160186768, "learning_rate": 4.706628694918448e-06, "loss": 0.3107, "step": 1420 }, { "epoch": 0.7262569832402235, "grad_norm": 0.846693217754364, "learning_rate": 4.699645899403238e-06, "loss": 0.3074, "step": 1430 }, { "epoch": 0.7313357034027425, "grad_norm": 0.8081271648406982, "learning_rate": 4.692586282648504e-06, "loss": 0.3082, "step": 1440 }, { "epoch": 0.7364144235652615, "grad_norm": 0.7759858965873718, "learning_rate": 4.685450091206893e-06, "loss": 0.3127, "step": 1450 }, { "epoch": 0.7414931437277806, "grad_norm": 0.8160701990127563, "learning_rate": 4.678237574305364e-06, "loss": 0.3018, "step": 1460 }, { "epoch": 0.7465718638902996, "grad_norm": 0.8192775845527649, "learning_rate": 4.670948983836505e-06, "loss": 0.3024, "step": 1470 }, { "epoch": 0.7516505840528187, "grad_norm": 0.8106697797775269, "learning_rate": 4.66358457434972e-06, "loss": 0.3167, "step": 1480 }, { "epoch": 0.7567293042153377, "grad_norm": 0.7517253160476685, "learning_rate": 4.6561446030423435e-06, "loss": 0.3064, "step": 1490 }, { "epoch": 0.7618080243778568, "grad_norm": 0.7888948321342468, "learning_rate": 4.648629329750662e-06, "loss": 0.308, "step": 1500 }, { "epoch": 0.7618080243778568, "eval_loss": 0.3137528896331787, "eval_runtime": 121.613, "eval_samples_per_second": 41.114, "eval_steps_per_second": 2.574, "step": 1500 }, { "epoch": 0.7668867445403759, "grad_norm": 0.7494853138923645, "learning_rate": 4.641039016940832e-06, "loss": 0.3086, "step": 1510 }, { "epoch": 0.7719654647028948, "grad_norm": 0.7426589131355286, "learning_rate": 4.6333739296997205e-06, "loss": 0.3, "step": 1520 }, { "epoch": 0.7770441848654139, "grad_norm": 0.7976534366607666, "learning_rate": 4.625634335725644e-06, "loss": 0.3134, "step": 1530 }, { "epoch": 0.7821229050279329, "grad_norm": 0.8214184045791626, "learning_rate": 4.617820505319018e-06, "loss": 0.3076, "step": 1540 }, { "epoch": 0.787201625190452, "grad_norm": 0.8541253209114075, "learning_rate": 4.609932711372921e-06, "loss": 0.3141, "step": 1550 }, { "epoch": 0.7922803453529711, "grad_norm": 0.75464928150177, "learning_rate": 4.601971229363558e-06, "loss": 0.3053, "step": 1560 }, { "epoch": 0.7973590655154901, "grad_norm": 0.8184385895729065, "learning_rate": 4.593936337340645e-06, "loss": 0.3123, "step": 1570 }, { "epoch": 0.8024377856780092, "grad_norm": 0.7774415612220764, "learning_rate": 4.5858283159176955e-06, "loss": 0.2999, "step": 1580 }, { "epoch": 0.8075165058405281, "grad_norm": 0.9587996602058411, "learning_rate": 4.57764744826222e-06, "loss": 0.3077, "step": 1590 }, { "epoch": 0.8125952260030472, "grad_norm": 0.9347293972969055, "learning_rate": 4.569394020085841e-06, "loss": 0.3104, "step": 1600 }, { "epoch": 0.8176739461655663, "grad_norm": 0.7690241932868958, "learning_rate": 4.561068319634307e-06, "loss": 0.2998, "step": 1610 }, { "epoch": 0.8227526663280853, "grad_norm": 0.783155083656311, "learning_rate": 4.552670637677432e-06, "loss": 0.3011, "step": 1620 }, { "epoch": 0.8278313864906044, "grad_norm": 0.8160974979400635, "learning_rate": 4.544201267498939e-06, "loss": 0.3042, "step": 1630 }, { "epoch": 0.8329101066531234, "grad_norm": 0.9004737734794617, "learning_rate": 4.535660504886215e-06, "loss": 0.3079, "step": 1640 }, { "epoch": 0.8379888268156425, "grad_norm": 0.7750893235206604, "learning_rate": 4.527048648119986e-06, "loss": 0.3002, "step": 1650 }, { "epoch": 0.8430675469781616, "grad_norm": 0.7503623962402344, "learning_rate": 4.5183659979638905e-06, "loss": 0.3117, "step": 1660 }, { "epoch": 0.8481462671406805, "grad_norm": 0.8173331618309021, "learning_rate": 4.509612857653987e-06, "loss": 0.3079, "step": 1670 }, { "epoch": 0.8532249873031996, "grad_norm": 0.7421526312828064, "learning_rate": 4.500789532888154e-06, "loss": 0.2998, "step": 1680 }, { "epoch": 0.8583037074657186, "grad_norm": 0.7622323632240295, "learning_rate": 4.49189633181542e-06, "loss": 0.3003, "step": 1690 }, { "epoch": 0.8633824276282377, "grad_norm": 0.730026364326477, "learning_rate": 4.482933565025198e-06, "loss": 0.3118, "step": 1700 }, { "epoch": 0.8684611477907568, "grad_norm": 0.7820921540260315, "learning_rate": 4.47390154553644e-06, "loss": 0.3056, "step": 1710 }, { "epoch": 0.8735398679532758, "grad_norm": 0.8582718372344971, "learning_rate": 4.4648005887867064e-06, "loss": 0.2969, "step": 1720 }, { "epoch": 0.8786185881157949, "grad_norm": 0.7791563272476196, "learning_rate": 4.455631012621143e-06, "loss": 0.3068, "step": 1730 }, { "epoch": 0.8836973082783138, "grad_norm": 0.7581758499145508, "learning_rate": 4.4463931372813914e-06, "loss": 0.304, "step": 1740 }, { "epoch": 0.8887760284408329, "grad_norm": 0.766441285610199, "learning_rate": 4.4370872853943936e-06, "loss": 0.298, "step": 1750 }, { "epoch": 0.8938547486033519, "grad_norm": 0.7395172119140625, "learning_rate": 4.427713781961132e-06, "loss": 0.2996, "step": 1760 }, { "epoch": 0.898933468765871, "grad_norm": 0.7233405709266663, "learning_rate": 4.4182729543452765e-06, "loss": 0.2929, "step": 1770 }, { "epoch": 0.9040121889283901, "grad_norm": 0.7392520904541016, "learning_rate": 4.408765132261749e-06, "loss": 0.3088, "step": 1780 }, { "epoch": 0.9090909090909091, "grad_norm": 0.7640084028244019, "learning_rate": 4.399190647765213e-06, "loss": 0.3059, "step": 1790 }, { "epoch": 0.9141696292534282, "grad_norm": 0.7984718084335327, "learning_rate": 4.389549835238473e-06, "loss": 0.3088, "step": 1800 }, { "epoch": 0.9192483494159471, "grad_norm": 0.7300999164581299, "learning_rate": 4.379843031380801e-06, "loss": 0.3025, "step": 1810 }, { "epoch": 0.9243270695784662, "grad_norm": 0.7906836867332458, "learning_rate": 4.370070575196172e-06, "loss": 0.3056, "step": 1820 }, { "epoch": 0.9294057897409853, "grad_norm": 0.7191804051399231, "learning_rate": 4.360232807981426e-06, "loss": 0.3001, "step": 1830 }, { "epoch": 0.9344845099035043, "grad_norm": 0.7592332363128662, "learning_rate": 4.350330073314351e-06, "loss": 0.3101, "step": 1840 }, { "epoch": 0.9395632300660234, "grad_norm": 0.8039364218711853, "learning_rate": 4.340362717041682e-06, "loss": 0.3055, "step": 1850 }, { "epoch": 0.9446419502285424, "grad_norm": 0.7507144212722778, "learning_rate": 4.3303310872670226e-06, "loss": 0.3018, "step": 1860 }, { "epoch": 0.9497206703910615, "grad_norm": 0.9044272303581238, "learning_rate": 4.320235534338685e-06, "loss": 0.2943, "step": 1870 }, { "epoch": 0.9547993905535805, "grad_norm": 0.7523107528686523, "learning_rate": 4.310076410837463e-06, "loss": 0.3026, "step": 1880 }, { "epoch": 0.9598781107160995, "grad_norm": 0.955448567867279, "learning_rate": 4.299854071564307e-06, "loss": 0.2926, "step": 1890 }, { "epoch": 0.9649568308786186, "grad_norm": 0.7745742797851562, "learning_rate": 4.289568873527941e-06, "loss": 0.304, "step": 1900 }, { "epoch": 0.9700355510411376, "grad_norm": 0.7563455104827881, "learning_rate": 4.279221175932389e-06, "loss": 0.297, "step": 1910 }, { "epoch": 0.9751142712036567, "grad_norm": 0.8183065056800842, "learning_rate": 4.268811340164436e-06, "loss": 0.2986, "step": 1920 }, { "epoch": 0.9801929913661758, "grad_norm": 0.7718958258628845, "learning_rate": 4.258339729781e-06, "loss": 0.2957, "step": 1930 }, { "epoch": 0.9852717115286947, "grad_norm": 0.7502573728561401, "learning_rate": 4.24780671049644e-06, "loss": 0.3023, "step": 1940 }, { "epoch": 0.9903504316912138, "grad_norm": 0.8469391465187073, "learning_rate": 4.237212650169783e-06, "loss": 0.3078, "step": 1950 }, { "epoch": 0.9954291518537328, "grad_norm": 0.8061825633049011, "learning_rate": 4.226557918791872e-06, "loss": 0.3002, "step": 1960 }, { "epoch": 1.000507872016252, "grad_norm": 0.7248758673667908, "learning_rate": 4.215842888472452e-06, "loss": 0.3125, "step": 1970 }, { "epoch": 1.005586592178771, "grad_norm": 0.7721697092056274, "learning_rate": 4.205067933427169e-06, "loss": 0.2544, "step": 1980 }, { "epoch": 1.01066531234129, "grad_norm": 0.7651776075363159, "learning_rate": 4.194233429964501e-06, "loss": 0.2535, "step": 1990 }, { "epoch": 1.015744032503809, "grad_norm": 0.7415822148323059, "learning_rate": 4.183339756472617e-06, "loss": 0.2496, "step": 2000 }, { "epoch": 1.015744032503809, "eval_loss": 0.31630080938339233, "eval_runtime": 121.7314, "eval_samples_per_second": 41.074, "eval_steps_per_second": 2.571, "step": 2000 }, { "epoch": 1.020822752666328, "grad_norm": 0.792204737663269, "learning_rate": 4.172387293406164e-06, "loss": 0.2465, "step": 2010 }, { "epoch": 1.0259014728288471, "grad_norm": 0.7987692356109619, "learning_rate": 4.161376423272974e-06, "loss": 0.2479, "step": 2020 }, { "epoch": 1.0309801929913662, "grad_norm": 0.7764890789985657, "learning_rate": 4.150307530620714e-06, "loss": 0.25, "step": 2030 }, { "epoch": 1.0360589131538853, "grad_norm": 0.7811829447746277, "learning_rate": 4.139181002023445e-06, "loss": 0.2518, "step": 2040 }, { "epoch": 1.0411376333164042, "grad_norm": 0.7891380190849304, "learning_rate": 4.1279972260681286e-06, "loss": 0.2521, "step": 2050 }, { "epoch": 1.0462163534789233, "grad_norm": 0.831947922706604, "learning_rate": 4.1167565933410575e-06, "loss": 0.2529, "step": 2060 }, { "epoch": 1.0512950736414424, "grad_norm": 0.8048622012138367, "learning_rate": 4.105459496414207e-06, "loss": 0.2508, "step": 2070 }, { "epoch": 1.0563737938039615, "grad_norm": 0.7546584606170654, "learning_rate": 4.094106329831531e-06, "loss": 0.247, "step": 2080 }, { "epoch": 1.0614525139664805, "grad_norm": 0.7589654922485352, "learning_rate": 4.08269749009518e-06, "loss": 0.2466, "step": 2090 }, { "epoch": 1.0665312341289994, "grad_norm": 0.7780548334121704, "learning_rate": 4.0712333756516535e-06, "loss": 0.2482, "step": 2100 }, { "epoch": 1.0716099542915185, "grad_norm": 0.8401651978492737, "learning_rate": 4.059714386877886e-06, "loss": 0.2578, "step": 2110 }, { "epoch": 1.0766886744540376, "grad_norm": 0.7654274702072144, "learning_rate": 4.048140926067262e-06, "loss": 0.2568, "step": 2120 }, { "epoch": 1.0817673946165567, "grad_norm": 0.8236129283905029, "learning_rate": 4.036513397415571e-06, "loss": 0.2496, "step": 2130 }, { "epoch": 1.0868461147790756, "grad_norm": 0.8232077360153198, "learning_rate": 4.024832207006883e-06, "loss": 0.2489, "step": 2140 }, { "epoch": 1.0919248349415946, "grad_norm": 0.7585301399230957, "learning_rate": 4.013097762799372e-06, "loss": 0.2503, "step": 2150 }, { "epoch": 1.0970035551041137, "grad_norm": 0.7547110319137573, "learning_rate": 4.001310474611069e-06, "loss": 0.2489, "step": 2160 }, { "epoch": 1.1020822752666328, "grad_norm": 0.8154515027999878, "learning_rate": 3.989470754105546e-06, "loss": 0.2516, "step": 2170 }, { "epoch": 1.107160995429152, "grad_norm": 0.8041960000991821, "learning_rate": 3.9775790147775425e-06, "loss": 0.2455, "step": 2180 }, { "epoch": 1.112239715591671, "grad_norm": 0.7653600573539734, "learning_rate": 3.96563567193852e-06, "loss": 0.2539, "step": 2190 }, { "epoch": 1.1173184357541899, "grad_norm": 0.8358815312385559, "learning_rate": 3.953641142702161e-06, "loss": 0.2473, "step": 2200 }, { "epoch": 1.122397155916709, "grad_norm": 0.8611774444580078, "learning_rate": 3.941595845969799e-06, "loss": 0.2522, "step": 2210 }, { "epoch": 1.127475876079228, "grad_norm": 0.734160840511322, "learning_rate": 3.929500202415793e-06, "loss": 0.2469, "step": 2220 }, { "epoch": 1.1325545962417471, "grad_norm": 0.7705450057983398, "learning_rate": 3.917354634472831e-06, "loss": 0.2475, "step": 2230 }, { "epoch": 1.137633316404266, "grad_norm": 0.7762371897697449, "learning_rate": 3.9051595663171795e-06, "loss": 0.2519, "step": 2240 }, { "epoch": 1.142712036566785, "grad_norm": 0.8073663115501404, "learning_rate": 3.892915423853866e-06, "loss": 0.2417, "step": 2250 }, { "epoch": 1.1477907567293042, "grad_norm": 0.8323131203651428, "learning_rate": 3.880622634701812e-06, "loss": 0.2489, "step": 2260 }, { "epoch": 1.1528694768918233, "grad_norm": 0.8708091974258423, "learning_rate": 3.868281628178888e-06, "loss": 0.2548, "step": 2270 }, { "epoch": 1.1579481970543424, "grad_norm": 0.7699044346809387, "learning_rate": 3.855892835286931e-06, "loss": 0.2467, "step": 2280 }, { "epoch": 1.1630269172168615, "grad_norm": 0.718021035194397, "learning_rate": 3.843456688696683e-06, "loss": 0.2486, "step": 2290 }, { "epoch": 1.1681056373793803, "grad_norm": 0.7651357054710388, "learning_rate": 3.830973622732686e-06, "loss": 0.248, "step": 2300 }, { "epoch": 1.1731843575418994, "grad_norm": 0.8027908802032471, "learning_rate": 3.818444073358108e-06, "loss": 0.2352, "step": 2310 }, { "epoch": 1.1782630777044185, "grad_norm": 0.7923874855041504, "learning_rate": 3.8058684781595277e-06, "loss": 0.2434, "step": 2320 }, { "epoch": 1.1833417978669376, "grad_norm": 0.8118336200714111, "learning_rate": 3.793247276331636e-06, "loss": 0.2488, "step": 2330 }, { "epoch": 1.1884205180294565, "grad_norm": 0.7456130385398865, "learning_rate": 3.780580908661915e-06, "loss": 0.248, "step": 2340 }, { "epoch": 1.1934992381919756, "grad_norm": 0.7925790548324585, "learning_rate": 3.7678698175152286e-06, "loss": 0.2529, "step": 2350 }, { "epoch": 1.1985779583544947, "grad_norm": 0.7958409190177917, "learning_rate": 3.7551144468183824e-06, "loss": 0.2526, "step": 2360 }, { "epoch": 1.2036566785170137, "grad_norm": 0.7938746809959412, "learning_rate": 3.7423152420446185e-06, "loss": 0.2478, "step": 2370 }, { "epoch": 1.2087353986795328, "grad_norm": 0.8479053378105164, "learning_rate": 3.729472650198054e-06, "loss": 0.2434, "step": 2380 }, { "epoch": 1.213814118842052, "grad_norm": 0.7347251176834106, "learning_rate": 3.716587119798074e-06, "loss": 0.2401, "step": 2390 }, { "epoch": 1.2188928390045708, "grad_norm": 0.7714645266532898, "learning_rate": 3.703659100863664e-06, "loss": 0.2511, "step": 2400 }, { "epoch": 1.2239715591670899, "grad_norm": 0.777991533279419, "learning_rate": 3.690689044897695e-06, "loss": 0.2509, "step": 2410 }, { "epoch": 1.229050279329609, "grad_norm": 0.7387746572494507, "learning_rate": 3.6776774048711558e-06, "loss": 0.2466, "step": 2420 }, { "epoch": 1.234128999492128, "grad_norm": 0.740714967250824, "learning_rate": 3.66462463520733e-06, "loss": 0.244, "step": 2430 }, { "epoch": 1.239207719654647, "grad_norm": 0.7498502731323242, "learning_rate": 3.6515311917659302e-06, "loss": 0.2411, "step": 2440 }, { "epoch": 1.244286439817166, "grad_norm": 0.7903018593788147, "learning_rate": 3.6383975318271724e-06, "loss": 0.2556, "step": 2450 }, { "epoch": 1.2493651599796851, "grad_norm": 0.8446292281150818, "learning_rate": 3.6252241140758103e-06, "loss": 0.243, "step": 2460 }, { "epoch": 1.2544438801422042, "grad_norm": 0.7503316402435303, "learning_rate": 3.6120113985851134e-06, "loss": 0.2522, "step": 2470 }, { "epoch": 1.2595226003047233, "grad_norm": 0.7319806218147278, "learning_rate": 3.5987598468007993e-06, "loss": 0.2475, "step": 2480 }, { "epoch": 1.2646013204672424, "grad_norm": 0.7854649424552917, "learning_rate": 3.585469921524919e-06, "loss": 0.2534, "step": 2490 }, { "epoch": 1.2696800406297613, "grad_norm": 0.7887899279594421, "learning_rate": 3.5721420868996943e-06, "loss": 0.248, "step": 2500 }, { "epoch": 1.2696800406297613, "eval_loss": 0.3149872124195099, "eval_runtime": 121.8702, "eval_samples_per_second": 41.027, "eval_steps_per_second": 2.568, "step": 2500 }, { "epoch": 1.2747587607922803, "grad_norm": 0.7797321677207947, "learning_rate": 3.5587768083913037e-06, "loss": 0.251, "step": 2510 }, { "epoch": 1.2798374809547994, "grad_norm": 0.84684157371521, "learning_rate": 3.545374552773635e-06, "loss": 0.2514, "step": 2520 }, { "epoch": 1.2849162011173183, "grad_norm": 0.8210347294807434, "learning_rate": 3.5319357881119733e-06, "loss": 0.2451, "step": 2530 }, { "epoch": 1.2899949212798374, "grad_norm": 0.8470488786697388, "learning_rate": 3.518460983746661e-06, "loss": 0.248, "step": 2540 }, { "epoch": 1.2950736414423565, "grad_norm": 0.7454930543899536, "learning_rate": 3.5049506102767037e-06, "loss": 0.2414, "step": 2550 }, { "epoch": 1.3001523616048756, "grad_norm": 0.713578462600708, "learning_rate": 3.4914051395433363e-06, "loss": 0.2482, "step": 2560 }, { "epoch": 1.3052310817673947, "grad_norm": 0.7766876816749573, "learning_rate": 3.477825044613543e-06, "loss": 0.2501, "step": 2570 }, { "epoch": 1.3103098019299138, "grad_norm": 0.7383909821510315, "learning_rate": 3.464210799763536e-06, "loss": 0.247, "step": 2580 }, { "epoch": 1.3153885220924328, "grad_norm": 0.7781063318252563, "learning_rate": 3.450562880462191e-06, "loss": 0.2479, "step": 2590 }, { "epoch": 1.3204672422549517, "grad_norm": 0.7478165030479431, "learning_rate": 3.436881763354444e-06, "loss": 0.2502, "step": 2600 }, { "epoch": 1.3255459624174708, "grad_norm": 0.7556635737419128, "learning_rate": 3.4231679262446426e-06, "loss": 0.2517, "step": 2610 }, { "epoch": 1.33062468257999, "grad_norm": 0.7706420421600342, "learning_rate": 3.4094218480798608e-06, "loss": 0.2448, "step": 2620 }, { "epoch": 1.3357034027425088, "grad_norm": 0.8588224053382874, "learning_rate": 3.3956440089331687e-06, "loss": 0.2431, "step": 2630 }, { "epoch": 1.3407821229050279, "grad_norm": 0.7747892141342163, "learning_rate": 3.3818348899868707e-06, "loss": 0.2517, "step": 2640 }, { "epoch": 1.345860843067547, "grad_norm": 0.8341949582099915, "learning_rate": 3.3679949735156974e-06, "loss": 0.2478, "step": 2650 }, { "epoch": 1.350939563230066, "grad_norm": 0.7596986889839172, "learning_rate": 3.354124742869965e-06, "loss": 0.2451, "step": 2660 }, { "epoch": 1.3560182833925851, "grad_norm": 0.8195773363113403, "learning_rate": 3.3402246824586897e-06, "loss": 0.2448, "step": 2670 }, { "epoch": 1.3610970035551042, "grad_norm": 0.7367848753929138, "learning_rate": 3.3262952777326775e-06, "loss": 0.2503, "step": 2680 }, { "epoch": 1.366175723717623, "grad_norm": 0.8999025821685791, "learning_rate": 3.3123370151675615e-06, "loss": 0.2537, "step": 2690 }, { "epoch": 1.3712544438801422, "grad_norm": 0.7917847037315369, "learning_rate": 3.2983503822468214e-06, "loss": 0.2496, "step": 2700 }, { "epoch": 1.3763331640426613, "grad_norm": 0.7561963200569153, "learning_rate": 3.28433586744475e-06, "loss": 0.2489, "step": 2710 }, { "epoch": 1.3814118842051804, "grad_norm": 0.8509207963943481, "learning_rate": 3.2702939602093988e-06, "loss": 0.2448, "step": 2720 }, { "epoch": 1.3864906043676992, "grad_norm": 0.8005545139312744, "learning_rate": 3.2562251509454813e-06, "loss": 0.2578, "step": 2730 }, { "epoch": 1.3915693245302183, "grad_norm": 0.8283247351646423, "learning_rate": 3.2421299309972485e-06, "loss": 0.2489, "step": 2740 }, { "epoch": 1.3966480446927374, "grad_norm": 0.7954180836677551, "learning_rate": 3.2280087926313288e-06, "loss": 0.248, "step": 2750 }, { "epoch": 1.4017267648552565, "grad_norm": 0.8017697930335999, "learning_rate": 3.2138622290195325e-06, "loss": 0.2535, "step": 2760 }, { "epoch": 1.4068054850177756, "grad_norm": 0.7989761829376221, "learning_rate": 3.1996907342216318e-06, "loss": 0.2434, "step": 2770 }, { "epoch": 1.4118842051802947, "grad_norm": 0.8122096061706543, "learning_rate": 3.1854948031681044e-06, "loss": 0.2518, "step": 2780 }, { "epoch": 1.4169629253428135, "grad_norm": 0.8307642936706543, "learning_rate": 3.1712749316428487e-06, "loss": 0.2433, "step": 2790 }, { "epoch": 1.4220416455053326, "grad_norm": 0.7779731154441833, "learning_rate": 3.157031616265871e-06, "loss": 0.2543, "step": 2800 }, { "epoch": 1.4271203656678517, "grad_norm": 0.7799838185310364, "learning_rate": 3.1427653544759352e-06, "loss": 0.247, "step": 2810 }, { "epoch": 1.4321990858303708, "grad_norm": 0.7921550869941711, "learning_rate": 3.1284766445131975e-06, "loss": 0.2485, "step": 2820 }, { "epoch": 1.4372778059928897, "grad_norm": 0.7986315488815308, "learning_rate": 3.114165985401801e-06, "loss": 0.2469, "step": 2830 }, { "epoch": 1.4423565261554088, "grad_norm": 0.848434329032898, "learning_rate": 3.09983387693245e-06, "loss": 0.2492, "step": 2840 }, { "epoch": 1.4474352463179279, "grad_norm": 0.7870414853096008, "learning_rate": 3.085480819644951e-06, "loss": 0.2417, "step": 2850 }, { "epoch": 1.452513966480447, "grad_norm": 0.8498828411102295, "learning_rate": 3.0711073148107395e-06, "loss": 0.2533, "step": 2860 }, { "epoch": 1.457592686642966, "grad_norm": 0.7977587580680847, "learning_rate": 3.056713864415363e-06, "loss": 0.2483, "step": 2870 }, { "epoch": 1.4626714068054851, "grad_norm": 0.7781681418418884, "learning_rate": 3.0423009711409614e-06, "loss": 0.2502, "step": 2880 }, { "epoch": 1.467750126968004, "grad_norm": 0.8061252236366272, "learning_rate": 3.0278691383486992e-06, "loss": 0.2491, "step": 2890 }, { "epoch": 1.472828847130523, "grad_norm": 0.7913114428520203, "learning_rate": 3.013418870061194e-06, "loss": 0.2461, "step": 2900 }, { "epoch": 1.4779075672930422, "grad_norm": 0.7775682806968689, "learning_rate": 2.9989506709449123e-06, "loss": 0.2498, "step": 2910 }, { "epoch": 1.4829862874555613, "grad_norm": 0.77854323387146, "learning_rate": 2.984465046292541e-06, "loss": 0.2473, "step": 2920 }, { "epoch": 1.4880650076180801, "grad_norm": 0.8510044813156128, "learning_rate": 2.9699625020053457e-06, "loss": 0.2443, "step": 2930 }, { "epoch": 1.4931437277805992, "grad_norm": 0.823668897151947, "learning_rate": 2.9554435445754976e-06, "loss": 0.2386, "step": 2940 }, { "epoch": 1.4982224479431183, "grad_norm": 0.7475908994674683, "learning_rate": 2.9409086810683858e-06, "loss": 0.2503, "step": 2950 }, { "epoch": 1.5033011681056374, "grad_norm": 0.8057491183280945, "learning_rate": 2.926358419104911e-06, "loss": 0.2421, "step": 2960 }, { "epoch": 1.5083798882681565, "grad_norm": 0.8842071890830994, "learning_rate": 2.9117932668437542e-06, "loss": 0.2515, "step": 2970 }, { "epoch": 1.5134586084306756, "grad_norm": 0.7616868615150452, "learning_rate": 2.8972137329636324e-06, "loss": 0.2486, "step": 2980 }, { "epoch": 1.5185373285931947, "grad_norm": 0.7204374074935913, "learning_rate": 2.8826203266455276e-06, "loss": 0.2406, "step": 2990 }, { "epoch": 1.5236160487557135, "grad_norm": 0.801526665687561, "learning_rate": 2.868013557554911e-06, "loss": 0.2526, "step": 3000 }, { "epoch": 1.5236160487557135, "eval_loss": 0.3125091791152954, "eval_runtime": 121.6711, "eval_samples_per_second": 41.094, "eval_steps_per_second": 2.573, "step": 3000 }, { "epoch": 1.5286947689182326, "grad_norm": 0.7821494936943054, "learning_rate": 2.8533939358239405e-06, "loss": 0.2447, "step": 3010 }, { "epoch": 1.5337734890807515, "grad_norm": 0.7913290858268738, "learning_rate": 2.838761972033643e-06, "loss": 0.2508, "step": 3020 }, { "epoch": 1.5388522092432706, "grad_norm": 1.1276675462722778, "learning_rate": 2.824118177196083e-06, "loss": 0.2481, "step": 3030 }, { "epoch": 1.5439309294057897, "grad_norm": 0.7831458449363708, "learning_rate": 2.8094630627365193e-06, "loss": 0.2525, "step": 3040 }, { "epoch": 1.5490096495683088, "grad_norm": 0.8312116861343384, "learning_rate": 2.7947971404755392e-06, "loss": 0.2503, "step": 3050 }, { "epoch": 1.5540883697308279, "grad_norm": 0.7501511573791504, "learning_rate": 2.7801209226111874e-06, "loss": 0.2508, "step": 3060 }, { "epoch": 1.559167089893347, "grad_norm": 0.8042319416999817, "learning_rate": 2.765434921701075e-06, "loss": 0.2435, "step": 3070 }, { "epoch": 1.564245810055866, "grad_norm": 0.8284822702407837, "learning_rate": 2.7507396506444805e-06, "loss": 0.2468, "step": 3080 }, { "epoch": 1.569324530218385, "grad_norm": 0.7171116471290588, "learning_rate": 2.7360356226644342e-06, "loss": 0.2473, "step": 3090 }, { "epoch": 1.574403250380904, "grad_norm": 0.775512158870697, "learning_rate": 2.721323351289799e-06, "loss": 0.2468, "step": 3100 }, { "epoch": 1.579481970543423, "grad_norm": 0.7975888252258301, "learning_rate": 2.7066033503373323e-06, "loss": 0.243, "step": 3110 }, { "epoch": 1.584560690705942, "grad_norm": 0.8064377903938293, "learning_rate": 2.6918761338937427e-06, "loss": 0.2566, "step": 3120 }, { "epoch": 1.589639410868461, "grad_norm": 0.8264314532279968, "learning_rate": 2.677142216297733e-06, "loss": 0.2518, "step": 3130 }, { "epoch": 1.5947181310309801, "grad_norm": 0.8090101480484009, "learning_rate": 2.6624021121220415e-06, "loss": 0.2537, "step": 3140 }, { "epoch": 1.5997968511934992, "grad_norm": 0.8687019944190979, "learning_rate": 2.647656336155469e-06, "loss": 0.248, "step": 3150 }, { "epoch": 1.6048755713560183, "grad_norm": 0.75754714012146, "learning_rate": 2.6329054033848994e-06, "loss": 0.2435, "step": 3160 }, { "epoch": 1.6099542915185374, "grad_norm": 0.7850366234779358, "learning_rate": 2.6181498289773145e-06, "loss": 0.2473, "step": 3170 }, { "epoch": 1.6150330116810565, "grad_norm": 0.8010424375534058, "learning_rate": 2.603390128261802e-06, "loss": 0.2466, "step": 3180 }, { "epoch": 1.6201117318435754, "grad_norm": 0.8364565968513489, "learning_rate": 2.5886268167115597e-06, "loss": 0.2463, "step": 3190 }, { "epoch": 1.6251904520060945, "grad_norm": 0.7993068099021912, "learning_rate": 2.5738604099258908e-06, "loss": 0.2414, "step": 3200 }, { "epoch": 1.6302691721686136, "grad_norm": 0.8057124018669128, "learning_rate": 2.559091423612196e-06, "loss": 0.2573, "step": 3210 }, { "epoch": 1.6353478923311324, "grad_norm": 0.810699462890625, "learning_rate": 2.5443203735679682e-06, "loss": 0.2468, "step": 3220 }, { "epoch": 1.6404266124936515, "grad_norm": 0.8280355334281921, "learning_rate": 2.52954777566277e-06, "loss": 0.2475, "step": 3230 }, { "epoch": 1.6455053326561706, "grad_norm": 0.8114942312240601, "learning_rate": 2.5147741458202266e-06, "loss": 0.2528, "step": 3240 }, { "epoch": 1.6505840528186897, "grad_norm": 0.7734003663063049, "learning_rate": 2.5e-06, "loss": 0.252, "step": 3250 }, { "epoch": 1.6556627729812088, "grad_norm": 0.7834712862968445, "learning_rate": 2.485225854179774e-06, "loss": 0.2529, "step": 3260 }, { "epoch": 1.6607414931437279, "grad_norm": 0.8254967331886292, "learning_rate": 2.47045222433723e-06, "loss": 0.2548, "step": 3270 }, { "epoch": 1.665820213306247, "grad_norm": 0.7638046145439148, "learning_rate": 2.455679626432032e-06, "loss": 0.2472, "step": 3280 }, { "epoch": 1.6708989334687658, "grad_norm": 0.7961717247962952, "learning_rate": 2.4409085763878043e-06, "loss": 0.2437, "step": 3290 }, { "epoch": 1.675977653631285, "grad_norm": 0.8181645274162292, "learning_rate": 2.426139590074111e-06, "loss": 0.2418, "step": 3300 }, { "epoch": 1.681056373793804, "grad_norm": 0.751830518245697, "learning_rate": 2.4113731832884407e-06, "loss": 0.239, "step": 3310 }, { "epoch": 1.6861350939563229, "grad_norm": 0.7490795254707336, "learning_rate": 2.396609871738199e-06, "loss": 0.2474, "step": 3320 }, { "epoch": 1.691213814118842, "grad_norm": 0.7789638042449951, "learning_rate": 2.3818501710226867e-06, "loss": 0.2502, "step": 3330 }, { "epoch": 1.696292534281361, "grad_norm": 0.7721952795982361, "learning_rate": 2.3670945966151014e-06, "loss": 0.2571, "step": 3340 }, { "epoch": 1.7013712544438802, "grad_norm": 0.8575606346130371, "learning_rate": 2.3523436638445312e-06, "loss": 0.2527, "step": 3350 }, { "epoch": 1.7064499746063992, "grad_norm": 0.814814567565918, "learning_rate": 2.3375978878779593e-06, "loss": 0.2465, "step": 3360 }, { "epoch": 1.7115286947689183, "grad_norm": 0.821048378944397, "learning_rate": 2.322857783702268e-06, "loss": 0.2515, "step": 3370 }, { "epoch": 1.7166074149314374, "grad_norm": 0.8528531193733215, "learning_rate": 2.3081238661062585e-06, "loss": 0.253, "step": 3380 }, { "epoch": 1.7216861350939563, "grad_norm": 0.7616355419158936, "learning_rate": 2.2933966496626677e-06, "loss": 0.2549, "step": 3390 }, { "epoch": 1.7267648552564754, "grad_norm": 0.7662220001220703, "learning_rate": 2.2786766487102014e-06, "loss": 0.2565, "step": 3400 }, { "epoch": 1.7318435754189943, "grad_norm": 0.730667769908905, "learning_rate": 2.2639643773355666e-06, "loss": 0.2405, "step": 3410 }, { "epoch": 1.7369222955815133, "grad_norm": 0.7821247577667236, "learning_rate": 2.2492603493555208e-06, "loss": 0.2472, "step": 3420 }, { "epoch": 1.7420010157440324, "grad_norm": 0.7954862117767334, "learning_rate": 2.234565078298925e-06, "loss": 0.2538, "step": 3430 }, { "epoch": 1.7470797359065515, "grad_norm": 0.810747504234314, "learning_rate": 2.219879077388813e-06, "loss": 0.2542, "step": 3440 }, { "epoch": 1.7521584560690706, "grad_norm": 0.8034137487411499, "learning_rate": 2.2052028595244616e-06, "loss": 0.254, "step": 3450 }, { "epoch": 1.7572371762315897, "grad_norm": 0.826793909072876, "learning_rate": 2.190536937263482e-06, "loss": 0.2431, "step": 3460 }, { "epoch": 1.7623158963941088, "grad_norm": 0.719495415687561, "learning_rate": 2.175881822803917e-06, "loss": 0.2467, "step": 3470 }, { "epoch": 1.7673946165566279, "grad_norm": 0.7717339396476746, "learning_rate": 2.1612380279663576e-06, "loss": 0.2414, "step": 3480 }, { "epoch": 1.7724733367191468, "grad_norm": 0.7720147371292114, "learning_rate": 2.14660606417606e-06, "loss": 0.2479, "step": 3490 }, { "epoch": 1.7775520568816658, "grad_norm": 0.735954999923706, "learning_rate": 2.1319864424450894e-06, "loss": 0.254, "step": 3500 }, { "epoch": 1.7775520568816658, "eval_loss": 0.30940133333206177, "eval_runtime": 121.7572, "eval_samples_per_second": 41.065, "eval_steps_per_second": 2.571, "step": 3500 }, { "epoch": 1.7826307770441847, "grad_norm": 0.8783974647521973, "learning_rate": 2.117379673354473e-06, "loss": 0.2461, "step": 3510 }, { "epoch": 1.7877094972067038, "grad_norm": 0.683204174041748, "learning_rate": 2.1027862670363685e-06, "loss": 0.2475, "step": 3520 }, { "epoch": 1.792788217369223, "grad_norm": 0.7428295612335205, "learning_rate": 2.088206733156246e-06, "loss": 0.2346, "step": 3530 }, { "epoch": 1.797866937531742, "grad_norm": 0.754948079586029, "learning_rate": 2.0736415808950898e-06, "loss": 0.255, "step": 3540 }, { "epoch": 1.802945657694261, "grad_norm": 0.754705548286438, "learning_rate": 2.059091318931615e-06, "loss": 0.2466, "step": 3550 }, { "epoch": 1.8080243778567802, "grad_norm": 0.8315344452857971, "learning_rate": 2.0445564554245033e-06, "loss": 0.2431, "step": 3560 }, { "epoch": 1.8131030980192993, "grad_norm": 0.7972525954246521, "learning_rate": 2.030037497994655e-06, "loss": 0.2556, "step": 3570 }, { "epoch": 1.8181818181818183, "grad_norm": 0.8291880488395691, "learning_rate": 2.0155349537074598e-06, "loss": 0.2376, "step": 3580 }, { "epoch": 1.8232605383443372, "grad_norm": 0.7485976815223694, "learning_rate": 2.001049329055088e-06, "loss": 0.2416, "step": 3590 }, { "epoch": 1.8283392585068563, "grad_norm": 0.8340455293655396, "learning_rate": 1.9865811299388062e-06, "loss": 0.2447, "step": 3600 }, { "epoch": 1.8334179786693752, "grad_norm": 0.8124203681945801, "learning_rate": 1.972130861651302e-06, "loss": 0.2475, "step": 3610 }, { "epoch": 1.8384966988318943, "grad_norm": 0.8198763132095337, "learning_rate": 1.95769902885904e-06, "loss": 0.25, "step": 3620 }, { "epoch": 1.8435754189944134, "grad_norm": 0.8861445188522339, "learning_rate": 1.943286135584637e-06, "loss": 0.245, "step": 3630 }, { "epoch": 1.8486541391569324, "grad_norm": 0.8078780174255371, "learning_rate": 1.9288926851892614e-06, "loss": 0.2424, "step": 3640 }, { "epoch": 1.8537328593194515, "grad_norm": 0.785547137260437, "learning_rate": 1.9145191803550493e-06, "loss": 0.2367, "step": 3650 }, { "epoch": 1.8588115794819706, "grad_norm": 0.8039028644561768, "learning_rate": 1.9001661230675516e-06, "loss": 0.2434, "step": 3660 }, { "epoch": 1.8638902996444897, "grad_norm": 0.8308824896812439, "learning_rate": 1.8858340145981994e-06, "loss": 0.2475, "step": 3670 }, { "epoch": 1.8689690198070086, "grad_norm": 0.7723254561424255, "learning_rate": 1.8715233554868035e-06, "loss": 0.2409, "step": 3680 }, { "epoch": 1.8740477399695277, "grad_norm": 0.8033102750778198, "learning_rate": 1.8572346455240656e-06, "loss": 0.2415, "step": 3690 }, { "epoch": 1.8791264601320468, "grad_norm": 0.7221494913101196, "learning_rate": 1.8429683837341306e-06, "loss": 0.2499, "step": 3700 }, { "epoch": 1.8842051802945656, "grad_norm": 0.8179773688316345, "learning_rate": 1.828725068357151e-06, "loss": 0.2499, "step": 3710 }, { "epoch": 1.8892839004570847, "grad_norm": 0.730506420135498, "learning_rate": 1.8145051968318966e-06, "loss": 0.2431, "step": 3720 }, { "epoch": 1.8943626206196038, "grad_norm": 0.8109707832336426, "learning_rate": 1.800309265778369e-06, "loss": 0.2417, "step": 3730 }, { "epoch": 1.899441340782123, "grad_norm": 0.9620540738105774, "learning_rate": 1.7861377709804687e-06, "loss": 0.2538, "step": 3740 }, { "epoch": 1.904520060944642, "grad_norm": 0.7754760980606079, "learning_rate": 1.7719912073686712e-06, "loss": 0.2414, "step": 3750 }, { "epoch": 1.909598781107161, "grad_norm": 0.79917311668396, "learning_rate": 1.7578700690027517e-06, "loss": 0.2409, "step": 3760 }, { "epoch": 1.9146775012696802, "grad_norm": 0.7494426965713501, "learning_rate": 1.7437748490545191e-06, "loss": 0.2414, "step": 3770 }, { "epoch": 1.919756221432199, "grad_norm": 0.7564458847045898, "learning_rate": 1.7297060397906023e-06, "loss": 0.2424, "step": 3780 }, { "epoch": 1.9248349415947181, "grad_norm": 0.7982223033905029, "learning_rate": 1.7156641325552503e-06, "loss": 0.2521, "step": 3790 }, { "epoch": 1.9299136617572372, "grad_norm": 0.7981540560722351, "learning_rate": 1.7016496177531792e-06, "loss": 0.2399, "step": 3800 }, { "epoch": 1.934992381919756, "grad_norm": 0.7966643571853638, "learning_rate": 1.6876629848324391e-06, "loss": 0.2336, "step": 3810 }, { "epoch": 1.9400711020822752, "grad_norm": 0.7969587445259094, "learning_rate": 1.6737047222673235e-06, "loss": 0.25, "step": 3820 }, { "epoch": 1.9451498222447943, "grad_norm": 0.7210260629653931, "learning_rate": 1.6597753175413103e-06, "loss": 0.2516, "step": 3830 }, { "epoch": 1.9502285424073134, "grad_norm": 0.7629299163818359, "learning_rate": 1.6458752571300358e-06, "loss": 0.2422, "step": 3840 }, { "epoch": 1.9553072625698324, "grad_norm": 1.0255619287490845, "learning_rate": 1.632005026484303e-06, "loss": 0.2467, "step": 3850 }, { "epoch": 1.9603859827323515, "grad_norm": 0.8044396042823792, "learning_rate": 1.6181651100131302e-06, "loss": 0.2478, "step": 3860 }, { "epoch": 1.9654647028948706, "grad_norm": 0.8026655912399292, "learning_rate": 1.6043559910668315e-06, "loss": 0.2466, "step": 3870 }, { "epoch": 1.9705434230573895, "grad_norm": 0.7410418391227722, "learning_rate": 1.5905781519201398e-06, "loss": 0.2401, "step": 3880 }, { "epoch": 1.9756221432199086, "grad_norm": 0.8294776082038879, "learning_rate": 1.576832073755358e-06, "loss": 0.2548, "step": 3890 }, { "epoch": 1.9807008633824277, "grad_norm": 0.7692272067070007, "learning_rate": 1.5631182366455566e-06, "loss": 0.2394, "step": 3900 }, { "epoch": 1.9857795835449465, "grad_norm": 0.8169528841972351, "learning_rate": 1.54943711953781e-06, "loss": 0.2396, "step": 3910 }, { "epoch": 1.9908583037074656, "grad_norm": 0.8071213960647583, "learning_rate": 1.5357892002364649e-06, "loss": 0.2435, "step": 3920 }, { "epoch": 1.9959370238699847, "grad_norm": 0.8226195573806763, "learning_rate": 1.5221749553864578e-06, "loss": 0.2416, "step": 3930 }, { "epoch": 2.001015744032504, "grad_norm": 0.7779680490493774, "learning_rate": 1.5085948604566647e-06, "loss": 0.2302, "step": 3940 }, { "epoch": 2.006094464195023, "grad_norm": 0.793228268623352, "learning_rate": 1.4950493897232967e-06, "loss": 0.1941, "step": 3950 }, { "epoch": 2.011173184357542, "grad_norm": 0.8707436919212341, "learning_rate": 1.4815390162533397e-06, "loss": 0.1885, "step": 3960 }, { "epoch": 2.016251904520061, "grad_norm": 0.887116014957428, "learning_rate": 1.4680642118880275e-06, "loss": 0.1911, "step": 3970 }, { "epoch": 2.02133062468258, "grad_norm": 0.8474360108375549, "learning_rate": 1.454625447226366e-06, "loss": 0.1874, "step": 3980 }, { "epoch": 2.026409344845099, "grad_norm": 0.8413044214248657, "learning_rate": 1.441223191608696e-06, "loss": 0.1862, "step": 3990 }, { "epoch": 2.031488065007618, "grad_norm": 0.8266652822494507, "learning_rate": 1.4278579131003067e-06, "loss": 0.1867, "step": 4000 }, { "epoch": 2.031488065007618, "eval_loss": 0.33027777075767517, "eval_runtime": 121.5959, "eval_samples_per_second": 41.12, "eval_steps_per_second": 2.574, "step": 4000 }, { "epoch": 2.036566785170137, "grad_norm": 0.8005456924438477, "learning_rate": 1.414530078475082e-06, "loss": 0.1844, "step": 4010 }, { "epoch": 2.041645505332656, "grad_norm": 0.8141132593154907, "learning_rate": 1.4012401531992013e-06, "loss": 0.1806, "step": 4020 }, { "epoch": 2.046724225495175, "grad_norm": 0.8092536330223083, "learning_rate": 1.3879886014148864e-06, "loss": 0.1834, "step": 4030 }, { "epoch": 2.0518029456576943, "grad_norm": 0.829396665096283, "learning_rate": 1.3747758859241896e-06, "loss": 0.1832, "step": 4040 }, { "epoch": 2.0568816658202134, "grad_norm": 0.7657766342163086, "learning_rate": 1.3616024681728278e-06, "loss": 0.1751, "step": 4050 }, { "epoch": 2.0619603859827325, "grad_norm": 0.8640854954719543, "learning_rate": 1.3484688082340708e-06, "loss": 0.1824, "step": 4060 }, { "epoch": 2.0670391061452515, "grad_norm": 0.8462369441986084, "learning_rate": 1.3353753647926701e-06, "loss": 0.187, "step": 4070 }, { "epoch": 2.0721178263077706, "grad_norm": 0.7753511667251587, "learning_rate": 1.3223225951288449e-06, "loss": 0.1772, "step": 4080 }, { "epoch": 2.0771965464702893, "grad_norm": 0.936180830001831, "learning_rate": 1.3093109551023058e-06, "loss": 0.182, "step": 4090 }, { "epoch": 2.0822752666328084, "grad_norm": 0.8638271689414978, "learning_rate": 1.2963408991363374e-06, "loss": 0.1814, "step": 4100 }, { "epoch": 2.0873539867953275, "grad_norm": 0.8632501363754272, "learning_rate": 1.283412880201927e-06, "loss": 0.1816, "step": 4110 }, { "epoch": 2.0924327069578466, "grad_norm": 0.8283603191375732, "learning_rate": 1.270527349801946e-06, "loss": 0.1826, "step": 4120 }, { "epoch": 2.0975114271203656, "grad_norm": 0.8857740163803101, "learning_rate": 1.2576847579553826e-06, "loss": 0.1872, "step": 4130 }, { "epoch": 2.1025901472828847, "grad_norm": 0.7950591444969177, "learning_rate": 1.2448855531816184e-06, "loss": 0.1815, "step": 4140 }, { "epoch": 2.107668867445404, "grad_norm": 0.9257445931434631, "learning_rate": 1.232130182484772e-06, "loss": 0.1841, "step": 4150 }, { "epoch": 2.112747587607923, "grad_norm": 0.9456517696380615, "learning_rate": 1.2194190913380858e-06, "loss": 0.1844, "step": 4160 }, { "epoch": 2.117826307770442, "grad_norm": 0.9262843728065491, "learning_rate": 1.206752723668364e-06, "loss": 0.1833, "step": 4170 }, { "epoch": 2.122905027932961, "grad_norm": 0.8877847194671631, "learning_rate": 1.194131521840474e-06, "loss": 0.1845, "step": 4180 }, { "epoch": 2.1279837480954797, "grad_norm": 0.9533064365386963, "learning_rate": 1.181555926641892e-06, "loss": 0.1871, "step": 4190 }, { "epoch": 2.133062468257999, "grad_norm": 0.8590644598007202, "learning_rate": 1.1690263772673158e-06, "loss": 0.1872, "step": 4200 }, { "epoch": 2.138141188420518, "grad_norm": 0.8177663087844849, "learning_rate": 1.1565433113033176e-06, "loss": 0.1821, "step": 4210 }, { "epoch": 2.143219908583037, "grad_norm": 0.8513428568840027, "learning_rate": 1.14410716471307e-06, "loss": 0.1796, "step": 4220 }, { "epoch": 2.148298628745556, "grad_norm": 0.7657913565635681, "learning_rate": 1.131718371821112e-06, "loss": 0.184, "step": 4230 }, { "epoch": 2.153377348908075, "grad_norm": 0.8900587558746338, "learning_rate": 1.119377365298189e-06, "loss": 0.1851, "step": 4240 }, { "epoch": 2.1584560690705943, "grad_norm": 0.8556007742881775, "learning_rate": 1.1070845761461347e-06, "loss": 0.1861, "step": 4250 }, { "epoch": 2.1635347892331134, "grad_norm": 0.8326412439346313, "learning_rate": 1.0948404336828222e-06, "loss": 0.1816, "step": 4260 }, { "epoch": 2.1686135093956325, "grad_norm": 0.8394993543624878, "learning_rate": 1.0826453655271693e-06, "loss": 0.1884, "step": 4270 }, { "epoch": 2.173692229558151, "grad_norm": 0.9060940742492676, "learning_rate": 1.0704997975842075e-06, "loss": 0.1851, "step": 4280 }, { "epoch": 2.17877094972067, "grad_norm": 0.816910445690155, "learning_rate": 1.0584041540302009e-06, "loss": 0.185, "step": 4290 }, { "epoch": 2.1838496698831893, "grad_norm": 0.8141408562660217, "learning_rate": 1.0463588572978399e-06, "loss": 0.1874, "step": 4300 }, { "epoch": 2.1889283900457084, "grad_norm": 0.8770137429237366, "learning_rate": 1.0343643280614798e-06, "loss": 0.1812, "step": 4310 }, { "epoch": 2.1940071102082275, "grad_norm": 0.8542727828025818, "learning_rate": 1.0224209852224573e-06, "loss": 0.1858, "step": 4320 }, { "epoch": 2.1990858303707466, "grad_norm": 0.7976948618888855, "learning_rate": 1.010529245894454e-06, "loss": 0.1782, "step": 4330 }, { "epoch": 2.2041645505332657, "grad_norm": 0.8359585404396057, "learning_rate": 9.986895253889322e-07, "loss": 0.1865, "step": 4340 }, { "epoch": 2.2092432706957847, "grad_norm": 0.8453836441040039, "learning_rate": 9.86902237200629e-07, "loss": 0.1836, "step": 4350 }, { "epoch": 2.214321990858304, "grad_norm": 0.9079688191413879, "learning_rate": 9.751677929931189e-07, "loss": 0.1879, "step": 4360 }, { "epoch": 2.219400711020823, "grad_norm": 0.9282172918319702, "learning_rate": 9.634866025844306e-07, "loss": 0.1838, "step": 4370 }, { "epoch": 2.224479431183342, "grad_norm": 0.9397227168083191, "learning_rate": 9.518590739327382e-07, "loss": 0.1846, "step": 4380 }, { "epoch": 2.2295581513458607, "grad_norm": 0.817206084728241, "learning_rate": 9.402856131221144e-07, "loss": 0.1826, "step": 4390 }, { "epoch": 2.2346368715083798, "grad_norm": 0.8512512445449829, "learning_rate": 9.287666243483473e-07, "loss": 0.1813, "step": 4400 }, { "epoch": 2.239715591670899, "grad_norm": 0.8650760054588318, "learning_rate": 9.17302509904821e-07, "loss": 0.1823, "step": 4410 }, { "epoch": 2.244794311833418, "grad_norm": 0.8736887574195862, "learning_rate": 9.058936701684698e-07, "loss": 0.188, "step": 4420 }, { "epoch": 2.249873031995937, "grad_norm": 0.8562535047531128, "learning_rate": 8.945405035857932e-07, "loss": 0.1864, "step": 4430 }, { "epoch": 2.254951752158456, "grad_norm": 0.9099084734916687, "learning_rate": 8.832434066589432e-07, "loss": 0.189, "step": 4440 }, { "epoch": 2.260030472320975, "grad_norm": 0.9459706544876099, "learning_rate": 8.720027739318724e-07, "loss": 0.1851, "step": 4450 }, { "epoch": 2.2651091924834943, "grad_norm": 0.9423843026161194, "learning_rate": 8.608189979765563e-07, "loss": 0.1854, "step": 4460 }, { "epoch": 2.2701879126460134, "grad_norm": 0.9056732654571533, "learning_rate": 8.496924693792872e-07, "loss": 0.1869, "step": 4470 }, { "epoch": 2.275266632808532, "grad_norm": 0.8627443909645081, "learning_rate": 8.386235767270256e-07, "loss": 0.1873, "step": 4480 }, { "epoch": 2.280345352971051, "grad_norm": 0.9132104516029358, "learning_rate": 8.27612706593837e-07, "loss": 0.1859, "step": 4490 }, { "epoch": 2.28542407313357, "grad_norm": 0.878332257270813, "learning_rate": 8.166602435273832e-07, "loss": 0.1863, "step": 4500 }, { "epoch": 2.28542407313357, "eval_loss": 0.3316808044910431, "eval_runtime": 121.6909, "eval_samples_per_second": 41.088, "eval_steps_per_second": 2.572, "step": 4500 }, { "epoch": 2.2905027932960893, "grad_norm": 0.9403025507926941, "learning_rate": 8.057665700354999e-07, "loss": 0.1839, "step": 4510 }, { "epoch": 2.2955815134586084, "grad_norm": 0.8784217834472656, "learning_rate": 7.949320665728319e-07, "loss": 0.1876, "step": 4520 }, { "epoch": 2.3006602336211275, "grad_norm": 1.0212981700897217, "learning_rate": 7.841571115275487e-07, "loss": 0.1804, "step": 4530 }, { "epoch": 2.3057389537836466, "grad_norm": 0.850695013999939, "learning_rate": 7.734420812081283e-07, "loss": 0.1821, "step": 4540 }, { "epoch": 2.3108176739461657, "grad_norm": 0.9093451499938965, "learning_rate": 7.62787349830218e-07, "loss": 0.1867, "step": 4550 }, { "epoch": 2.3158963941086848, "grad_norm": 1.0111429691314697, "learning_rate": 7.521932895035605e-07, "loss": 0.1847, "step": 4560 }, { "epoch": 2.320975114271204, "grad_norm": 1.0417455434799194, "learning_rate": 7.416602702190004e-07, "loss": 0.1804, "step": 4570 }, { "epoch": 2.326053834433723, "grad_norm": 0.8404303193092346, "learning_rate": 7.311886598355642e-07, "loss": 0.1832, "step": 4580 }, { "epoch": 2.3311325545962416, "grad_norm": 0.8522669672966003, "learning_rate": 7.207788240676108e-07, "loss": 0.1847, "step": 4590 }, { "epoch": 2.3362112747587607, "grad_norm": 0.7680218815803528, "learning_rate": 7.104311264720598e-07, "loss": 0.1843, "step": 4600 }, { "epoch": 2.3412899949212798, "grad_norm": 0.8408559560775757, "learning_rate": 7.001459284356938e-07, "loss": 0.1816, "step": 4610 }, { "epoch": 2.346368715083799, "grad_norm": 0.8629699945449829, "learning_rate": 6.899235891625372e-07, "loss": 0.1845, "step": 4620 }, { "epoch": 2.351447435246318, "grad_norm": 0.8334382176399231, "learning_rate": 6.79764465661315e-07, "loss": 0.1828, "step": 4630 }, { "epoch": 2.356526155408837, "grad_norm": 0.8202574849128723, "learning_rate": 6.696689127329792e-07, "loss": 0.184, "step": 4640 }, { "epoch": 2.361604875571356, "grad_norm": 0.901674747467041, "learning_rate": 6.596372829583184e-07, "loss": 0.1855, "step": 4650 }, { "epoch": 2.366683595733875, "grad_norm": 0.8691708445549011, "learning_rate": 6.496699266856493e-07, "loss": 0.188, "step": 4660 }, { "epoch": 2.3717623158963943, "grad_norm": 1.0007835626602173, "learning_rate": 6.397671920185738e-07, "loss": 0.1808, "step": 4670 }, { "epoch": 2.376841036058913, "grad_norm": 0.8889086842536926, "learning_rate": 6.299294248038281e-07, "loss": 0.1828, "step": 4680 }, { "epoch": 2.381919756221432, "grad_norm": 0.9149643182754517, "learning_rate": 6.201569686191988e-07, "loss": 0.1861, "step": 4690 }, { "epoch": 2.386998476383951, "grad_norm": 0.8830495476722717, "learning_rate": 6.104501647615265e-07, "loss": 0.1919, "step": 4700 }, { "epoch": 2.39207719654647, "grad_norm": 0.886160671710968, "learning_rate": 6.00809352234788e-07, "loss": 0.1847, "step": 4710 }, { "epoch": 2.3971559167089893, "grad_norm": 0.9045748114585876, "learning_rate": 5.912348677382523e-07, "loss": 0.1865, "step": 4720 }, { "epoch": 2.4022346368715084, "grad_norm": 0.9505787491798401, "learning_rate": 5.81727045654725e-07, "loss": 0.1873, "step": 4730 }, { "epoch": 2.4073133570340275, "grad_norm": 0.9555074572563171, "learning_rate": 5.722862180388683e-07, "loss": 0.1823, "step": 4740 }, { "epoch": 2.4123920771965466, "grad_norm": 0.8942949175834656, "learning_rate": 5.629127146056062e-07, "loss": 0.1804, "step": 4750 }, { "epoch": 2.4174707973590657, "grad_norm": 0.883151113986969, "learning_rate": 5.536068627186089e-07, "loss": 0.184, "step": 4760 }, { "epoch": 2.4225495175215848, "grad_norm": 0.8733017444610596, "learning_rate": 5.443689873788572e-07, "loss": 0.1855, "step": 4770 }, { "epoch": 2.427628237684104, "grad_norm": 0.8522325158119202, "learning_rate": 5.351994112132944e-07, "loss": 0.1845, "step": 4780 }, { "epoch": 2.4327069578466225, "grad_norm": 0.8934102058410645, "learning_rate": 5.260984544635603e-07, "loss": 0.1869, "step": 4790 }, { "epoch": 2.4377856780091416, "grad_norm": 0.9026458263397217, "learning_rate": 5.170664349748031e-07, "loss": 0.1836, "step": 4800 }, { "epoch": 2.4428643981716607, "grad_norm": 0.8497732877731323, "learning_rate": 5.081036681845813e-07, "loss": 0.1896, "step": 4810 }, { "epoch": 2.4479431183341798, "grad_norm": 0.9134292006492615, "learning_rate": 4.99210467111847e-07, "loss": 0.1869, "step": 4820 }, { "epoch": 2.453021838496699, "grad_norm": 0.8175578117370605, "learning_rate": 4.903871423460141e-07, "loss": 0.1845, "step": 4830 }, { "epoch": 2.458100558659218, "grad_norm": 0.7790173292160034, "learning_rate": 4.816340020361096e-07, "loss": 0.1766, "step": 4840 }, { "epoch": 2.463179278821737, "grad_norm": 0.869907557964325, "learning_rate": 4.7295135188001465e-07, "loss": 0.1829, "step": 4850 }, { "epoch": 2.468257998984256, "grad_norm": 0.7368974089622498, "learning_rate": 4.6433949511378417e-07, "loss": 0.1861, "step": 4860 }, { "epoch": 2.4733367191467748, "grad_norm": 0.9023917317390442, "learning_rate": 4.557987325010613e-07, "loss": 0.1865, "step": 4870 }, { "epoch": 2.478415439309294, "grad_norm": 0.8636237382888794, "learning_rate": 4.4732936232256855e-07, "loss": 0.1812, "step": 4880 }, { "epoch": 2.483494159471813, "grad_norm": 0.8983198404312134, "learning_rate": 4.389316803656943e-07, "loss": 0.1918, "step": 4890 }, { "epoch": 2.488572879634332, "grad_norm": 0.8668248057365417, "learning_rate": 4.3060597991415987e-07, "loss": 0.178, "step": 4900 }, { "epoch": 2.493651599796851, "grad_norm": 0.9295836091041565, "learning_rate": 4.223525517377805e-07, "loss": 0.1864, "step": 4910 }, { "epoch": 2.4987303199593702, "grad_norm": 0.8381646871566772, "learning_rate": 4.1417168408230596e-07, "loss": 0.1849, "step": 4920 }, { "epoch": 2.5038090401218893, "grad_norm": 0.9747976660728455, "learning_rate": 4.060636626593556e-07, "loss": 0.1816, "step": 4930 }, { "epoch": 2.5088877602844084, "grad_norm": 0.933236837387085, "learning_rate": 3.9802877063644193e-07, "loss": 0.1833, "step": 4940 }, { "epoch": 2.5139664804469275, "grad_norm": 0.8036865592002869, "learning_rate": 3.9006728862707925e-07, "loss": 0.1781, "step": 4950 }, { "epoch": 2.5190452006094466, "grad_norm": 0.8587754368782043, "learning_rate": 3.8217949468098205e-07, "loss": 0.1813, "step": 4960 }, { "epoch": 2.5241239207719657, "grad_norm": 0.958289384841919, "learning_rate": 3.7436566427435675e-07, "loss": 0.1915, "step": 4970 }, { "epoch": 2.5292026409344848, "grad_norm": 1.040694236755371, "learning_rate": 3.6662607030028e-07, "loss": 0.1843, "step": 4980 }, { "epoch": 2.5342813610970034, "grad_norm": 0.8534585237503052, "learning_rate": 3.589609830591692e-07, "loss": 0.1877, "step": 4990 }, { "epoch": 2.5393600812595225, "grad_norm": 0.8737279772758484, "learning_rate": 3.513706702493394e-07, "loss": 0.1825, "step": 5000 }, { "epoch": 2.5393600812595225, "eval_loss": 0.3309638500213623, "eval_runtime": 121.7997, "eval_samples_per_second": 41.051, "eval_steps_per_second": 2.57, "step": 5000 }, { "epoch": 2.5444388014220416, "grad_norm": 0.8709063529968262, "learning_rate": 3.438553969576569e-07, "loss": 0.1831, "step": 5010 }, { "epoch": 2.5495175215845607, "grad_norm": 0.8241310715675354, "learning_rate": 3.364154256502808e-07, "loss": 0.184, "step": 5020 }, { "epoch": 2.5545962417470798, "grad_norm": 0.8842706680297852, "learning_rate": 3.2905101616349497e-07, "loss": 0.1854, "step": 5030 }, { "epoch": 2.559674961909599, "grad_norm": 0.9000037908554077, "learning_rate": 3.217624256946361e-07, "loss": 0.1832, "step": 5040 }, { "epoch": 2.564753682072118, "grad_norm": 0.8701291680335999, "learning_rate": 3.1454990879310866e-07, "loss": 0.1827, "step": 5050 }, { "epoch": 2.5698324022346366, "grad_norm": 0.8461666703224182, "learning_rate": 3.0741371735149544e-07, "loss": 0.1825, "step": 5060 }, { "epoch": 2.5749111223971557, "grad_norm": 0.8629067540168762, "learning_rate": 3.003541005967628e-07, "loss": 0.1785, "step": 5070 }, { "epoch": 2.579989842559675, "grad_norm": 0.9108079075813293, "learning_rate": 2.9337130508155287e-07, "loss": 0.1784, "step": 5080 }, { "epoch": 2.585068562722194, "grad_norm": 0.8795527219772339, "learning_rate": 2.8646557467557514e-07, "loss": 0.1831, "step": 5090 }, { "epoch": 2.590147282884713, "grad_norm": 0.871895432472229, "learning_rate": 2.796371505570888e-07, "loss": 0.1865, "step": 5100 }, { "epoch": 2.595226003047232, "grad_norm": 0.8901582956314087, "learning_rate": 2.728862712044811e-07, "loss": 0.1794, "step": 5110 }, { "epoch": 2.600304723209751, "grad_norm": 0.8318222165107727, "learning_rate": 2.662131723879366e-07, "loss": 0.1859, "step": 5120 }, { "epoch": 2.6053834433722702, "grad_norm": 0.8986064791679382, "learning_rate": 2.5961808716120364e-07, "loss": 0.1797, "step": 5130 }, { "epoch": 2.6104621635347893, "grad_norm": 0.9209254384040833, "learning_rate": 2.531012458534551e-07, "loss": 0.1812, "step": 5140 }, { "epoch": 2.6155408836973084, "grad_norm": 0.9384124279022217, "learning_rate": 2.466628760612463e-07, "loss": 0.1874, "step": 5150 }, { "epoch": 2.6206196038598275, "grad_norm": 0.8080311417579651, "learning_rate": 2.40303202640563e-07, "loss": 0.1774, "step": 5160 }, { "epoch": 2.6256983240223466, "grad_norm": 0.8578788042068481, "learning_rate": 2.3402244769896998e-07, "loss": 0.1843, "step": 5170 }, { "epoch": 2.6307770441848657, "grad_norm": 0.9375886917114258, "learning_rate": 2.2782083058785458e-07, "loss": 0.1829, "step": 5180 }, { "epoch": 2.6358557643473843, "grad_norm": 0.819479763507843, "learning_rate": 2.216985678947664e-07, "loss": 0.1821, "step": 5190 }, { "epoch": 2.6409344845099034, "grad_norm": 0.899229109287262, "learning_rate": 2.156558734358505e-07, "loss": 0.1774, "step": 5200 }, { "epoch": 2.6460132046724225, "grad_norm": 0.9143205881118774, "learning_rate": 2.0969295824838336e-07, "loss": 0.1852, "step": 5210 }, { "epoch": 2.6510919248349416, "grad_norm": 0.8006069660186768, "learning_rate": 2.0381003058339982e-07, "loss": 0.1832, "step": 5220 }, { "epoch": 2.6561706449974607, "grad_norm": 0.9519758820533752, "learning_rate": 1.9800729589842222e-07, "loss": 0.1763, "step": 5230 }, { "epoch": 2.66124936515998, "grad_norm": 0.8859177827835083, "learning_rate": 1.92284956850283e-07, "loss": 0.1815, "step": 5240 }, { "epoch": 2.666328085322499, "grad_norm": 0.9211258888244629, "learning_rate": 1.866432132880483e-07, "loss": 0.1834, "step": 5250 }, { "epoch": 2.6714068054850175, "grad_norm": 0.993823766708374, "learning_rate": 1.8108226224603732e-07, "loss": 0.1804, "step": 5260 }, { "epoch": 2.6764855256475366, "grad_norm": 0.840509295463562, "learning_rate": 1.7560229793694288e-07, "loss": 0.1828, "step": 5270 }, { "epoch": 2.6815642458100557, "grad_norm": 0.9131551384925842, "learning_rate": 1.702035117450468e-07, "loss": 0.1901, "step": 5280 }, { "epoch": 2.686642965972575, "grad_norm": 0.8990755677223206, "learning_rate": 1.6488609221953612e-07, "loss": 0.1797, "step": 5290 }, { "epoch": 2.691721686135094, "grad_norm": 0.8473220467567444, "learning_rate": 1.596502250679194e-07, "loss": 0.1799, "step": 5300 }, { "epoch": 2.696800406297613, "grad_norm": 0.8115700483322144, "learning_rate": 1.5449609314954012e-07, "loss": 0.1849, "step": 5310 }, { "epoch": 2.701879126460132, "grad_norm": 0.9483342170715332, "learning_rate": 1.494238764691902e-07, "loss": 0.181, "step": 5320 }, { "epoch": 2.706957846622651, "grad_norm": 0.8683326840400696, "learning_rate": 1.444337521708236e-07, "loss": 0.1806, "step": 5330 }, { "epoch": 2.7120365667851702, "grad_norm": 0.9478078484535217, "learning_rate": 1.3952589453137017e-07, "loss": 0.1803, "step": 5340 }, { "epoch": 2.7171152869476893, "grad_norm": 0.8961120843887329, "learning_rate": 1.3470047495464905e-07, "loss": 0.1779, "step": 5350 }, { "epoch": 2.7221940071102084, "grad_norm": 0.8967982530593872, "learning_rate": 1.2995766196538194e-07, "loss": 0.179, "step": 5360 }, { "epoch": 2.7272727272727275, "grad_norm": 0.9021346569061279, "learning_rate": 1.252976212033072e-07, "loss": 0.1862, "step": 5370 }, { "epoch": 2.732351447435246, "grad_norm": 0.840735912322998, "learning_rate": 1.2072051541739682e-07, "loss": 0.171, "step": 5380 }, { "epoch": 2.7374301675977653, "grad_norm": 0.9057764410972595, "learning_rate": 1.1622650446017042e-07, "loss": 0.1785, "step": 5390 }, { "epoch": 2.7425088877602843, "grad_norm": 0.8543335795402527, "learning_rate": 1.118157452821142e-07, "loss": 0.1764, "step": 5400 }, { "epoch": 2.7475876079228034, "grad_norm": 0.8573223352432251, "learning_rate": 1.0748839192619764e-07, "loss": 0.1809, "step": 5410 }, { "epoch": 2.7526663280853225, "grad_norm": 0.8379035592079163, "learning_rate": 1.0324459552249505e-07, "loss": 0.1804, "step": 5420 }, { "epoch": 2.7577450482478416, "grad_norm": 0.9201931953430176, "learning_rate": 9.908450428290806e-08, "loss": 0.1837, "step": 5430 }, { "epoch": 2.7628237684103607, "grad_norm": 0.853122889995575, "learning_rate": 9.500826349598729e-08, "loss": 0.1779, "step": 5440 }, { "epoch": 2.76790248857288, "grad_norm": 0.8565370440483093, "learning_rate": 9.101601552185951e-08, "loss": 0.1853, "step": 5450 }, { "epoch": 2.7729812087353984, "grad_norm": 0.9906251430511475, "learning_rate": 8.710789978725653e-08, "loss": 0.1817, "step": 5460 }, { "epoch": 2.7780599288979175, "grad_norm": 0.9657715559005737, "learning_rate": 8.328405278064417e-08, "loss": 0.1843, "step": 5470 }, { "epoch": 2.7831386490604366, "grad_norm": 0.8711429834365845, "learning_rate": 7.954460804745712e-08, "loss": 0.1811, "step": 5480 }, { "epoch": 2.7882173692229557, "grad_norm": 0.8879536986351013, "learning_rate": 7.588969618543357e-08, "loss": 0.1873, "step": 5490 }, { "epoch": 2.793296089385475, "grad_norm": 0.8269737958908081, "learning_rate": 7.231944484005437e-08, "loss": 0.1787, "step": 5500 }, { "epoch": 2.793296089385475, "eval_loss": 0.3309679627418518, "eval_runtime": 121.7053, "eval_samples_per_second": 41.083, "eval_steps_per_second": 2.572, "step": 5500 }, { "epoch": 2.798374809547994, "grad_norm": 0.9365456700325012, "learning_rate": 6.883397870008662e-08, "loss": 0.1831, "step": 5510 }, { "epoch": 2.803453529710513, "grad_norm": 0.8533931374549866, "learning_rate": 6.543341949322657e-08, "loss": 0.1851, "step": 5520 }, { "epoch": 2.808532249873032, "grad_norm": 0.9121244549751282, "learning_rate": 6.211788598185081e-08, "loss": 0.182, "step": 5530 }, { "epoch": 2.813610970035551, "grad_norm": 0.8693413734436035, "learning_rate": 5.8887493958866004e-08, "loss": 0.1809, "step": 5540 }, { "epoch": 2.8186896901980703, "grad_norm": 0.8991943001747131, "learning_rate": 5.574235624366764e-08, "loss": 0.1875, "step": 5550 }, { "epoch": 2.8237684103605893, "grad_norm": 0.9877569079399109, "learning_rate": 5.2682582678197644e-08, "loss": 0.1777, "step": 5560 }, { "epoch": 2.8288471305231084, "grad_norm": 0.9565731883049011, "learning_rate": 4.970828012310969e-08, "loss": 0.1856, "step": 5570 }, { "epoch": 2.833925850685627, "grad_norm": 0.8290345668792725, "learning_rate": 4.681955245403602e-08, "loss": 0.1872, "step": 5580 }, { "epoch": 2.839004570848146, "grad_norm": 0.7747939825057983, "learning_rate": 4.401650055796042e-08, "loss": 0.1811, "step": 5590 }, { "epoch": 2.8440832910106653, "grad_norm": 0.8827708959579468, "learning_rate": 4.1299222329694574e-08, "loss": 0.177, "step": 5600 }, { "epoch": 2.8491620111731844, "grad_norm": 0.8102378249168396, "learning_rate": 3.8667812668459204e-08, "loss": 0.1863, "step": 5610 }, { "epoch": 2.8542407313357034, "grad_norm": 0.8598487377166748, "learning_rate": 3.612236347456943e-08, "loss": 0.1799, "step": 5620 }, { "epoch": 2.8593194514982225, "grad_norm": 0.9280297160148621, "learning_rate": 3.366296364622629e-08, "loss": 0.1836, "step": 5630 }, { "epoch": 2.8643981716607416, "grad_norm": 0.8947204947471619, "learning_rate": 3.128969907641027e-08, "loss": 0.1813, "step": 5640 }, { "epoch": 2.8694768918232603, "grad_norm": 0.9226625561714172, "learning_rate": 2.9002652649882945e-08, "loss": 0.1851, "step": 5650 }, { "epoch": 2.8745556119857794, "grad_norm": 0.8815863728523254, "learning_rate": 2.6801904240292275e-08, "loss": 0.1795, "step": 5660 }, { "epoch": 2.8796343321482984, "grad_norm": 0.9666081070899963, "learning_rate": 2.4687530707381836e-08, "loss": 0.1883, "step": 5670 }, { "epoch": 2.8847130523108175, "grad_norm": 0.8660172820091248, "learning_rate": 2.265960589430821e-08, "loss": 0.1755, "step": 5680 }, { "epoch": 2.8897917724733366, "grad_norm": 0.9310330152511597, "learning_rate": 2.0718200625060302e-08, "loss": 0.1806, "step": 5690 }, { "epoch": 2.8948704926358557, "grad_norm": 0.90314781665802, "learning_rate": 1.8863382701987675e-08, "loss": 0.1838, "step": 5700 }, { "epoch": 2.899949212798375, "grad_norm": 0.8870557546615601, "learning_rate": 1.70952169034308e-08, "loss": 0.1822, "step": 5710 }, { "epoch": 2.905027932960894, "grad_norm": 0.8747410178184509, "learning_rate": 1.5413764981460354e-08, "loss": 0.189, "step": 5720 }, { "epoch": 2.910106653123413, "grad_norm": 0.8954981565475464, "learning_rate": 1.3819085659719233e-08, "loss": 0.1802, "step": 5730 }, { "epoch": 2.915185373285932, "grad_norm": 0.9273566007614136, "learning_rate": 1.2311234631372514e-08, "loss": 0.185, "step": 5740 }, { "epoch": 2.920264093448451, "grad_norm": 0.8598524928092957, "learning_rate": 1.0890264557162356e-08, "loss": 0.1859, "step": 5750 }, { "epoch": 2.9253428136109703, "grad_norm": 0.8601447939872742, "learning_rate": 9.556225063568347e-09, "loss": 0.1822, "step": 5760 }, { "epoch": 2.9304215337734894, "grad_norm": 0.8048866391181946, "learning_rate": 8.309162741074461e-09, "loss": 0.1837, "step": 5770 }, { "epoch": 2.935500253936008, "grad_norm": 0.9375097155570984, "learning_rate": 7.149121142542292e-09, "loss": 0.1833, "step": 5780 }, { "epoch": 2.940578974098527, "grad_norm": 0.8771663904190063, "learning_rate": 6.076140781690054e-09, "loss": 0.1856, "step": 5790 }, { "epoch": 2.945657694261046, "grad_norm": 0.9071695804595947, "learning_rate": 5.090259131676767e-09, "loss": 0.1853, "step": 5800 }, { "epoch": 2.9507364144235653, "grad_norm": 0.9588919878005981, "learning_rate": 4.191510623794414e-09, "loss": 0.1823, "step": 5810 }, { "epoch": 2.9558151345860844, "grad_norm": 0.8780667185783386, "learning_rate": 3.379926646265852e-09, "loss": 0.1791, "step": 5820 }, { "epoch": 2.9608938547486034, "grad_norm": 0.7855400443077087, "learning_rate": 2.6555355431465145e-09, "loss": 0.1831, "step": 5830 }, { "epoch": 2.9659725749111225, "grad_norm": 0.8734691143035889, "learning_rate": 2.0183626133374325e-09, "loss": 0.1836, "step": 5840 }, { "epoch": 2.971051295073641, "grad_norm": 0.8363956212997437, "learning_rate": 1.4684301096992704e-09, "loss": 0.1823, "step": 5850 }, { "epoch": 2.9761300152361603, "grad_norm": 0.8143458962440491, "learning_rate": 1.0057572382765613e-09, "loss": 0.1829, "step": 5860 }, { "epoch": 2.9812087353986794, "grad_norm": 0.8893367648124695, "learning_rate": 6.303601576257423e-10, "loss": 0.1855, "step": 5870 }, { "epoch": 2.9862874555611985, "grad_norm": 0.8227106332778931, "learning_rate": 3.4225197825227264e-10, "loss": 0.1824, "step": 5880 }, { "epoch": 2.9913661757237175, "grad_norm": 0.8717892169952393, "learning_rate": 1.4144276215211085e-10, "loss": 0.187, "step": 5890 }, { "epoch": 2.9964448958862366, "grad_norm": 0.9425582885742188, "learning_rate": 2.793952245921938e-11, "loss": 0.1806, "step": 5900 }, { "epoch": 3.0, "step": 5907, "total_flos": 3.03001910027265e+19, "train_loss": 0.25274969475188996, "train_runtime": 28024.1449, "train_samples_per_second": 6.744, "train_steps_per_second": 0.211 } ], "logging_steps": 10, "max_steps": 5907, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.03001910027265e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }